#!/usr/bin/ruby
# frozen_string_literal: true

#######################################################################
## File:    mkWordCloud.rb
## Author:  Henry Feild (hfeild @ cs umass edu)
## Date:    06-Jul-2011
##
## Reads "count<TAB>word" lines and prints an HTML word cloud on stdout
## in which words with larger counts are rendered in a larger font.
#######################################################################

require 'cgi'

USAGE = <<USAGE_TEXT

Usage: mkWordCloud.rb <file|->

    <file> should be a text file with two tab-delimited columns:

        word-count   word

    The word may contain any characters except tabs. The filename may be
    replaced with a '-' to read from stdin.

    The output is an html file with styling to make more frequent words
    appear larger.

USAGE_TEXT

## Font size (in em) given to the least / most frequent word.
MIN_FONT_SIZE_EM = 0.6
MAX_FONT_SIZE_EM = 5.0

####
## Shuffles an array of things in place.
##
## a:: the Array to reorder; it is mutated.
## Returns the same Array. Uses Array#shuffle!, which is uniform.
####
def shuffle(a)
  a.shuffle!
end

####
## Parses tab-delimited "count<TAB>word" lines into a Hash.
##
## io:: any object responding to each_line (an open File, STDIN, ...).
## Returns a Hash mapping word (String) => count (Integer). If a word
## occurs on several lines the last count wins.
####
def read_word_counts(io)
  counts = {}
  io.each_line do |line|
    count, word = line.chomp.split(/\t/)
    # Blank lines and lines without a tab have no word column; keeping
    # them would drag the minimum count down and distort the scaling.
    next if word.nil? || word.empty?

    # to_i is deliberately lenient: a non-numeric count becomes 0.
    counts[word] = count.to_i
  end
  counts
end

####
## Maps a count linearly onto MIN_FONT_SIZE_EM..MAX_FONT_SIZE_EM.
##
## count:: the count of the word being sized.
## min::   the smallest count in the data set.
## max::   the largest count in the data set.
## Returns the font size in em as a Float.
####
def font_size_em(count, min, max)
  # All counts equal: there is no range to scale over (and dividing by
  # zero would yield NaN), so use the midpoint size for every word.
  return (MIN_FONT_SIZE_EM + MAX_FONT_SIZE_EM) / 2.0 if max == min

  fraction = (count - min) / (max - min).to_f
  MIN_FONT_SIZE_EM + fraction * (MAX_FONT_SIZE_EM - MIN_FONT_SIZE_EM)
end

####
## Builds one HTML fragment per word, sized by its count.
##
## counts:: Hash of word => count, as returned by read_word_counts.
## Returns an Array of newline-terminated Strings, in Hash order. The
## word text is HTML-escaped.
##
## NOTE(review): the exact markup and the linear scaling are a
## reconstruction from the usage text and the min/max font-size
## settings -- confirm they match the intended output.
####
def word_cloud_spans(counts)
  return [] if counts.empty?

  min, max = counts.values.minmax
  counts.map do |word, count|
    format(%(<span style="font-size: %.2fem">%s</span>\n),
           font_size_em(count, min, max), CGI.escapeHTML(word))
  end
end

####
## Renders the complete HTML document with the words in random order.
##
## counts:: Hash of word => count.
## Returns the document as a String.
####
def word_cloud_html(counts)
  spans = word_cloud_spans(counts)
  shuffle(spans)
  "<html>\n<body>\n#{spans.join}</body>\n</html>\n"
end

####
## Script entry point: reads the counts, reports the count range on
## stderr and prints the word cloud on stdout.
##
## path:: name of the input file, or "-" to read from stdin.
####
def run_word_cloud(path)
  counts =
    if path == '-'
      read_word_counts(STDIN)
    else
      # Block form closes the file even if parsing raises.
      File.open(path, 'r') { |fd| read_word_counts(fd) }
    end

  values = counts.values
  STDERR.puts "max: #{values.max.to_f}; min: #{values.min.to_f}"
  print word_cloud_html(counts)
end

if __FILE__ == $PROGRAM_NAME
  if ARGV.empty?
    STDERR.puts USAGE
  else
    run_word_cloud(ARGV.shift)
  end
end