Commits

Anonymous committed eeb5e4e

First public release of cable-word-cloud

Comments (0)

Files changed (2)

-In the name of God
+==============================================================================
+                             In the name of God
+==============================================================================
 
 cable-word-counter is a tool to count unique words in WikiLeaks US Embassy
 Cables HTML files. It takes names of HTML files as arguments and outputs a
 list of count-word pairs.
 
-Example usage:
-$ cd ~/wikileaks
+==============================================================================
+
+Example usage of cable-word-counter:
+$ cd ~/cablegate-201012081854
 $ ls
 articles  cable  classification  date  origin  reldate  static  tag
 $ cd cable
-$ ~/bin/cable-word-counter `find -name '*.html'` >words-count.txt
+$ ~/wikileaks/cable-word-counter `find -name '*.html'` >words-count.txt
 $ head words-count.txt
    4685 iran
    4097 government
    2539 support
    2319 new
 
-The code is public domain.
+==============================================================================
+
+cable-word-cloud is a word cloud generator that uses PyTagCloud to visualize
+the output of cable-word-counter. It takes two file names as arguments:
+1. A text file containing output of cable-word-counter.
+2. A file name for output, which will be a PNG file if the name ends with
+   '.png', and an HTML file otherwise. 
+
+==============================================================================
+
+Example usage of cable-word-cloud:
+$ pwd
+/home/ebrahim/cablegate-201012081854/cable
+$ ~/wikileaks/cable-word-cloud words-count.txt word-cloud.png
+[ long wait with no output on screen, but word-cloud.png is ready. ]
+$ ~/wikileaks/cable-word-cloud words-count.txt word-cloud.html
+[ long wait again with no output on screen, but word-cloud.html is ready. ]
+
+==============================================================================
+
+The source code of all tools is in the public domain.
+Feel free to contact me: ebrahim@mohammadi.ir
+
+==============================================================================

cables/cable-word-cloud

+#!/usr/bin/python
+
+# In the name of God
+# This code is public domain
+
+from sys import argv
+from pytagcloud import create_tag_image, create_html_data, make_tags
+
def defscale(count, mincount, maxcount, minsize, maxsize):
    """Map a word count to a font size along a power curve (exponent 1.5).

    The count is normalised to the range [0, 1] between ``mincount`` and
    ``maxcount``, raised to the power 1.5 and scaled into
    [``minsize``, ``maxsize``]. The function is passed to ``make_tags`` as
    its ``scalef`` argument.

    count    -- occurrence count of the word being sized
    mincount -- smallest count among all words
    maxcount -- largest count among all words
    minsize  -- font size given to a word with ``mincount`` occurrences
    maxsize  -- font size given to a word with ``maxcount`` occurrences

    Returns the font size as a float.
    """
    span = maxcount - mincount
    if span == 0:
        # Every word has the same count (or there is only one word), so there
        # is no range to normalise over; use the midpoint instead of dividing
        # by zero.
        return minsize + (maxsize - minsize) / 2.0
    return minsize + (maxsize - minsize) * ((count - mincount) / float(span)) ** 1.5
+
class Generator:
    """Render a word cloud from a file of "count word" lines.

    The constructor loads the word counts; generate() writes either a PNG
    image or an HTML page, depending on the output file name.
    """

    # Upper bound on the number of entries read from the counts file.
    MAX_WORDS = 150

    def __init__(self, input_filename, output_filename):
        """Load up to MAX_WORDS word counts from input_filename.

        input_filename  -- text file whose lines hold a count and a word
                           separated by whitespace
        output_filename -- file that generate() will write

        Reading stops at end of file, after MAX_WORDS entries, or at the
        first line that does not consist of exactly two fields. If a word
        occurs more than once, the last count read wins.

        Raises ValueError when the first field of a line is not an integer,
        and whatever open() raises when the input file cannot be read.
        """
        self.font = '/usr/share/fonts/truetype/ttf-dejavu/DejaVuSerif.ttf'
        self.fontzoom = 2
        self.size = (800, 600)
        self.words = {}
        self.output_filename = output_filename

        remained = self.MAX_WORDS
        # The with-block closes the input file even if parsing raises.
        with open(input_filename, 'r') as infile:
            for line in infile:
                if remained <= 0:
                    break
                parts = line.split()
                if len(parts) != 2:
                    # A blank or malformed line ends the list.
                    break
                self.words[parts[1]] = int(parts[0])
                remained -= 1

    def generate(self):
        """Build the tags and write the word cloud to self.output_filename.

        A name ending in '.png' produces an image via create_tag_image; any
        other name produces an HTML page built from create_html_data. The
        tags are also kept in self.tags.
        """
        self.tags = make_tags(self.words, minsize=6, maxsize=42, scalef=defscale)

        if self.output_filename.endswith('.png'):
            create_tag_image(self.tags, file=self.output_filename, size=self.size,
                             fontname=self.font, background=(0, 0, 0, 0),
                             fontzoom=self.fontzoom, vertical=False, crop=True,
                             rectangular=False)
            return

        data = create_html_data(self.tags, size=self.size, fontname=self.font,
                                fontzoom=self.fontzoom)
        header = (
            "<html>\n"
            "<head>\n"
            "<title>Word Cloud</title>\n"
            "<style>\n"
            "body{background-color: black;}\n"
            ".tag{position: absolute;}\n"
            "a{text-decoration: none;}"
        )
        link_format = ('<a class="tag %(cls)s" style="top: %(top)dpx; '
                       'left: %(left)dpx; font-size: %(size)dpx;">%(tag)s</a>\n')

        # The with-block flushes and closes the page even if a write fails.
        with open(self.output_filename, 'w') as outfile:
            outfile.write(header)
            for style in data['css']:
                outfile.write(style)
                outfile.write('\n')
            outfile.write("</style>\n</head>\n<body>\n")
            for link in data['links']:
                outfile.write(link_format % link)
            # Close the <html> element opened in the header.
            outfile.write("</body>\n</html>\n")
+
# entry point: cable-word-cloud counts-file output-file
if __name__ == '__main__':
    import sys

    if len(argv) != 3:
        # Usage errors go to stderr and end the script with exit status 1.
        sys.stderr.write(
            "Usage: cable-word-cloud counts-file output-file\n\n"
            "  counts-file must contain lines with count and word separated by whitespace\n"
            "  output-file can end with '.png' to produce an image, or anything else for HTML output\n")
        sys.exit(1)

    # argv[1] is the counts file to read, argv[2] the image or page to write.
    generator = Generator(argv[1], argv[2])
    generator.generate()