Source

IMDBReview_NaiveBayes / imdb.py

import os, collections, urllib2, sys, codecs
import json
import re
import getopt
import math
import NaiveBayes

def analyze_review(review):
    """Extract paragraph pairs that follow each review separator in a page.

    The page is scanned for the centred ``<hr>`` separator. After every
    separator the next two ``<p>...</p>`` elements are taken: the first is
    the review header (``build_html`` looks for the rating in it) and the
    second is the review text.

    Args:
        review: HTML source of one reviews page, as a string.

    Returns:
        A list of ``[header, text]`` lists in page order. Separators that
        are not followed by two complete paragraphs contribute nothing.
    """
    separator = '<hr noshade="1" size="1" width="50%" align="center">'
    results = []

    start = review.find(separator)
    # str.find() signals "not found" with -1; index 0 is a valid match, so
    # every position is compared against -1 rather than tested with "> 0".
    while start != -1:
        # Each lookup only runs when the previous one succeeded, otherwise a
        # -1 would be used as a search offset and match unrelated markup.
        p1 = review.find('<p>', start + len(separator))
        p1end = review.find('</p>', p1) if p1 != -1 else -1
        p2 = review.find('<p>', p1end + 2) if p1end != -1 else -1
        p2end = review.find('</p>', p2 + 2) if p2 != -1 else -1

        if p2end != -1:
            # NOTE(review): '<p>' is three characters, so the +4 offset also
            # drops the character right after the tag -- presumably a newline
            # in the scraped markup; confirm before changing it.
            results.append([review[p1 + 4:p1end], review[p2 + 4:p2end]])

        start = review.find(separator, start + len(separator))

    return results

def _fetch_page(url):
    """Download *url* and return the raw response body."""
    query = urllib2.urlopen(urllib2.quote(url, safe="%/:=&?~#+!$,;'@()*[]"))
    try:
        return query.read()
    finally:
        # Release the connection even when read() raises.
        query.close()


def extract_review(movie_id, review_count):
    """Download the review pages of a title and collect the parsed reviews.

    The first page is always fetched. Further pages are requested with
    ``?start=10``, ``?start=20``, ... and each follow-up URL is printed.

    Args:
        movie_id: Title identifier inserted into the reviews URL.
        review_count: Requested number of reviews. ``review_count // 10``
            pages are fetched in total (at least one), so a count that is
            not a multiple of ten is rounded down.

    Returns:
        The concatenated ``analyze_review`` results of all fetched pages.
    """
    base_url = 'http://www.imdb.com/title/{0}/reviews'.format(movie_id)
    reviews = list(analyze_review(_fetch_page(base_url)))

    # Floor division keeps the page count an integer on every Python
    # version; the range is empty when review_count is 10 or fewer.
    for page in range(1, review_count // 10):
        imdburl = '{0}?start={1}'.format(base_url, page * 10)

        # Single-argument print(...) behaves the same as the print statement.
        print(imdburl)

        reviews.extend(analyze_review(_fetch_page(imdburl)))
    return reviews

def _parse_rating(header):
    """Return the integer found between ``alt="`` and the next ``/``, or None."""
    alt_pos = header.find('alt=')
    if alt_pos < 0:
        return None
    slash_pos = header.find('/', alt_pos)
    if slash_pos < 0:
        return None
    try:
        # +5 skips the four characters of 'alt=' plus the opening quote.
        return int(header[alt_pos + 5:slash_pos])
    except ValueError:
        return None


def _ensure_training_dirs(term):
    """Create the ``pos``, ``neg`` and ``ukn`` directories below *term*."""
    for label in ('pos', 'neg', 'ukn'):
        directory = term + '/' + label
        if not os.path.exists(directory):
            os.makedirs(directory)


def build_html(query, term, comment_num, is_train):
    """Render the result page for one movie and optionally dump its reviews.

    The page starts with the contents of ``imdb_template.html``, followed by
    the movie details and one ``<div id="review_<class>">`` per review, where
    ``<class>`` is whatever ``do_classify`` returns for the review text.

    Args:
        query: JSON text describing the movie (``main`` passes the result of
            ``do_query``). It must provide the keys Title, Year, Rated,
            Released, Director, Genre, Writer, Actors, Plot, Runtime,
            Rating, Votes, ID and Poster.
        term: Movie title; used for the page ``<title>`` and, when training,
            as the name of the output directory.
        comment_num: Number of reviews requested from ``extract_review``.
        is_train: When true, every review text is written to
            ``<term>/pos``, ``<term>/neg`` or ``<term>/ukn`` as ``rN.txt``:
            ratings above 6 are positive, below 5 negative, and everything
            else (including reviews without a readable rating) unknown.

    Returns:
        The complete HTML document as a string.

    Raises:
        KeyError: If *query* lacks one of the required keys.
    """
    dom = json.loads(query)

    # Decode the template up front so that appending the unicode fragments
    # below does not depend on it being pure ASCII; "with" closes the file.
    with codecs.open("imdb_template.html", encoding="utf-8") as template:
        html = template.read()
    html += "<title>{0}</title></head>".format(term)

    html += "<body>"
    html += "<div id=\"movie\">"
    html += "<table border=\"0\"><tr>"
    html += "<td style=\"padding-left: 20px; width: 100%\">"

    # Each of these keys is shown as "<Key>: <value>" in the details cell.
    detail_keys = ('Title', 'Year', 'Rated', 'Released', 'Director', 'Genre',
                   'Writer', 'Actors', 'Plot', 'Runtime', 'Rating', 'Votes')
    for key in detail_keys:
        html += u'<span class="resulttitle">{0}:</span> {1} <br/><br/>'.format(
            key, unicode(dom[key]))
    html += u'<span class=\"resulttitle\">ID:</span> <a href=\"http://www.imdb.com/title/{0}/\">{0}</a> <br/><br/>'.format(unicode(dom['ID']))

    html += "</td>"
    html += u'<td><img src={0} float=\"left\"></img></td>'.format(dom['Poster'])
    html += "</tr></table>"
    html += "</div>"

    html += "<div id=\"reviews\">"

    reviews = extract_review(dom['ID'], comment_num)

    # The directories are only needed for the training dump; classifying a
    # movie no longer leaves empty folders behind.
    if is_train:
        _ensure_training_dirs(term)

    # NOTE(review): unicode(...) without an encoding uses the default codec;
    # review text containing non-ASCII bytes would presumably raise here --
    # confirm what extract_review actually returns.
    for index, review in enumerate(reviews):
        header, text = review[0], review[1]

        html += u"<div id=\"review_{0}\">".format(do_classify(text))
        html += u"{0}".format(unicode(header))
        html += u"<p>{0}</p><br />".format(unicode(text))
        html += u"</div>"

        if is_train:
            rating = _parse_rating(header)
            if rating is None:
                # No usable rating: keep it out of both training classes.
                label = 'ukn'
            elif rating > 6:
                label = 'pos'
            elif rating < 5:
                label = 'neg'
            else:
                label = 'ukn'

            path = term + '/' + label + '/' + 'r{0}.txt'.format(index)
            with codecs.open(path, encoding="utf-8", mode="w+") as output:
                output.write(text)

    # Close the reviews container opened above.
    html += "</div>"
    html += "</body></html>"

    return html

def do_classify(comment):
    """Classify one review text with the model stored in ``trained.raw``.

    A fresh classifier is created and loaded on every call. The comment is
    split on whitespace and, while the stop-word flag is set, passed through
    the classifier's stop-word filter before classification.

    Args:
        comment: Review text to classify.

    Returns:
        Whatever ``NaiveBayes.classify`` returns for the word list.
    """
    classifier = NaiveBayes.NaiveBayes()
    classifier.FILTER_STOP_WORDS = True
    classifier.read_trained('trained.raw')

    tokens = comment.split()
    if classifier.FILTER_STOP_WORDS:
        tokens = classifier.filterStopWords(tokens)

    return classifier.classify(tokens)

def do_train(path):
    """Train on the reviews stored under *path* and update ``trained.raw``.

    Splits are built from ``trained.raw`` and *path*. A classifier is trained
    for every split, but only the classifier and the split of the last
    iteration are evaluated and saved.

    NOTE(review): evaluating only the last split looks like an indentation
    slip -- the evaluation was presumably meant to run once per split, with
    the accuracies averaged. The existing behaviour is kept; confirm the
    intent before restructuring.

    Args:
        path: Directory holding the training reviews.

    Returns:
        1.0 when *path* does not exist; 0.0 when there is no split or the
        last split has no test examples (nothing is saved in that case);
        otherwise the fraction of test examples classified correctly.
    """
    if not os.path.exists(path):
        return 1.0

    nb = NaiveBayes.NaiveBayes()
    nb.FILTER_STOP_WORDS = True
    nb.LOAD_TRAINED_DATA = True

    splits = nb.buildSplits(['trained.raw', path])

    classifier = None
    split = None
    for split in splits:
        classifier = NaiveBayes.NaiveBayes()
        for example in split.train:
            words = example.words
            # Examples are only added while stop-word filtering is enabled,
            # exactly as before; the flag is set to True above.
            if nb.FILTER_STOP_WORDS:
                words = classifier.filterStopWords(words)
                classifier.addExample(example.klass, words)

    # Without a split or without test examples there is nothing to evaluate.
    # Return a number instead of falling off the end with None, because
    # do_list adds the result to a running total.
    if classifier is None or len(split.test) == 0:
        return 0.0

    if nb.LOAD_TRAINED_DATA:
        classifier.positive_examples = nb.positive_examples
        classifier.negative_examples = nb.negative_examples

    correct = 0.0
    for example in split.test:
        words = example.words
        if nb.FILTER_STOP_WORDS:
            words = classifier.filterStopWords(words)
        guess = classifier.classify(words)
        if example.klass == guess:
            correct += 1.0
        # Every test example is folded into the model after being scored.
        classifier.addExample(example.klass, words)

    classifier.save_trained('trained.raw')

    accuracy = correct / len(split.test)
    # Single-argument print(...) behaves the same as the print statement.
    print('[INFO]\tAccuracy: %f' % accuracy)

    # Exactly one fold is evaluated, so the average equals this accuracy.
    return accuracy

def save_html(html, file):
    """Write *html* to the path *file*, encoded as UTF-8.

    Args:
        html: Document text to store.
        file: Path of the output file; an existing file is overwritten.
    """
    # The context manager closes the handle even when write() raises.
    with codecs.open(file, encoding="utf-8", mode="w+") as output:
        output.write(html)

def do_query(movie, year):
    """Look up a movie by title on imdbapi.com and return the raw response.

    Args:
        movie: Movie title, sent as the ``t`` parameter.
        year: Release year, sent as the ``y`` parameter; 0 leaves it out.

    Returns:
        The response body exactly as read from the connection
        (``build_html`` parses it as JSON).
    """
    api_url = u"http://www.imdbapi.com/?t={0}".format(movie)
    if year != 0:
        api_url += "&y={0}".format(year)

    query = urllib2.urlopen(urllib2.quote(api_url, safe="%/:=&?~#+!$,;'@()*[]"))
    try:
        return query.read()
    finally:
        # Release the connection even when read() raises.
        query.close()

def do_list(list, comment_num):
    """Download, dump and train on every movie title in *list*.

    For each title the result page is built in training mode (which writes
    the review files) and ``do_train`` is run on the title's directory.
    Finally the mean of the ``do_train`` results is printed.

    Args:
        list: Iterable of movie titles, e.g. the lines of a text file.
            Surrounding whitespace is removed and blank entries are skipped.
        comment_num: Number of reviews to request per movie.
    """
    total = 0.0
    processed = 0
    for item in list:
        # Strings are immutable, so the cleaned title has to be kept; the
        # trailing newline would otherwise end up in the URL and the
        # directory name.
        title = item.strip()
        if not title:
            continue
        build_html(do_query(title, 0), title, comment_num, True)
        total += do_train(title)
        processed += 1

    # Avoid dividing by zero when there was nothing to process.
    average = total / processed if processed else 0.0
    # Single-argument print(...) behaves the same as the print statement.
    print("**** Total Average: " + str(average))


def _print_usage():
    """Print the command-line help."""
    # Single-argument print(...) behaves the same as the print statement.
    print("Usage:")
    print("Train:\t\timdb.py -t TrainList MaxComment")
    print("Classifiy:\timdb.py -c OutputHtml MovieTitle MaxComment")


def main():
    """Command-line entry point.

    ``-t TrainList MaxComment`` trains on every title listed in the file
    ``TrainList``. ``-c OutputHtml MovieTitle [MaxComment]`` classifies the
    reviews of one movie and writes the page to ``OutputHtml``; MaxComment
    defaults to 10. Anything else, including missing positional arguments,
    prints the usage text. Unknown options print the usage text and exit
    with status 2.
    """
    try:
        (options, args) = getopt.getopt(sys.argv[1:], 'tc')
    except getopt.GetoptError:
        _print_usage()
        sys.exit(2)

    # The checks below count the positional arguments; measuring the length
    # of args[0] (a string) let short argument lists through to an
    # IndexError.
    if ('-t', '') in options and len(args) >= 2:
        with open(args[0], 'r') as listing:
            lines = listing.readlines()
        do_list(lines, int(args[1]))
    elif ('-c', '') in options and len(args) >= 2:
        comment_num = 10
        if len(args) > 2:
            comment_num = int(args[2])
        save_html(build_html(do_query(args[1], 0), args[1], comment_num, False), args[0])
    else:
        _print_usage()
		
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.