Anonymous committed 77902dd

Added new parser for tv.com
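
The new TVComParser plugin reads a show's episode guide from tv.com, and the new -f/-P switches make episoder parse a single local file with a hand-picked parser instead of the configured sources. A minimal sketch of the equivalent plugin API calls, mirroring the new code path in update_data below (the import line, the data file path and guide.html are illustrative assumptions, not part of this changeset):

	from pyepisoder import episoder, plugins

	store = episoder.DataStore('/tmp/episodes.db')	# hypothetical data file
	store.clear()
	parser = plugins.parser_named('TVComParser')	# class name, as listed by -p
	parser.parseFile('guide.html', store)		# hypothetical saved tv.com page

On the command line this roughly corresponds to: episoder -b -f guide.html -P TVComParser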


Files changed (2)

 def show_version():
 	print 'This is episoder ' + version
 
+def show_parsers():
+	parsers = plugins.all()['parsing']
+
+	print 'Available parsers:\n'
+	print 'name                description'
+	print '-----               ------------'
+
+	for parser in parsers:
+		print "%-20s%s" % (parser.__class__.__name__, parser)
+
 def show_help():
-	print """Usage: episoder [options]
+	print """Usage: %s [options]
 
 Global options:
   -h			show this help
   -v			verbose operation
   -w			very verbose (debug) operation
   -V			show version information
+  -p			show available parsers
+  -l <file>		log to file instead of stdout
 
 Options for database update:
   -d <YYYY-MM-DD>	remove episodes prior to this date (default: yesterday)
   -i			ignore date (don't remove old episodes)
+  -f <file>		get data from file, ignore configured sources (needs -P)
+  -P <parser>		force parser to be used (only in combination with -f)
 
 Options for console output:
   -d <YYYY-MM-DD>	only show episodes newer than date (default: yesterday)
 
 Note that -i overrules -d and -n
 
-Report episoder bugs on http://episoder.sf.net/"""
+Report episoder bugs on http://episoder.sf.net/""" % sys.argv[0]
 
 def parse_rc(path):
 	datafile = None
 	return config
 
 def show_data(options, config):
+	if not os.path.exists(config['datafile']):
+		sys.stderr.write('%s not found\n' % config['datafile'])
+		sys.exit(4)
+
 	store = episoder.DataStore(config['datafile'])
 	renderer = plugins.all()['output'][0]
 	renderer.render(store, options, config)
 	store = episoder.DataStore(config['datafile'])
 	store.clear()
 
-	for source in config['sources']:
-		url = source['url']
-		parser = plugins.parser_for(url)
+	if options['inputfile']:
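+		# -f: parse a single local file with the parser forced via -P,
+		# instead of the configured sources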
+		file = options['inputfile']
+		logging.debug('Only parsing %s' % file)
+		parser = plugins.parser_named(options['parser'])
+		parser.parseFile(file, store)
+	else:
+		for source in config['sources']:
+			url = source['url']
+			parser = plugins.parser_for(url)
 
-		if not parser:
-			logging.warning('No parser found for %s' % url)
-			continue
+			if not parser:
+				logging.warning('No parser found for %s' % url)
+				continue
 
-		parser.parse(source, store)
+			parser.parse(source, store)
 
 	if not options['nodate']:
 		basedate = options['date']
 	nodate = False
 	search = ''
 	command = show_data
+	inputfile = None
+	parser = None
+	logfile = None
 
 	try:
-		options, args = getopt.getopt(sys.argv[1:], "c:d:hin:s:vVwb")
+		valid_options = 'c:d:hin:s:vVwbpf:P:l:'
+		options, args = getopt.getopt(sys.argv[1:], valid_options)
 	except getopt.error, msg:
 		print msg
 		print "for help use -h"
 			sys.exit(0)
 		elif option == '-n':
 			daysahead = int(argument)
+		elif option == '-p':
+			show_parsers()
+			sys.exit(0)
 		elif option == '-d':
 			parts = argument.split('-')
 			date = datetime.date(int(parts[0]), int(parts[1]),
 			sys.exit(99)
 		elif option == '-b':
 			command = update_data
+		elif option == '-f':
+			inputfile = argument
+		elif option == '-P':
+			parser = argument
+		elif option == '-l':
+			logfile = argument
 
 	return {
 		'rcfile': rcfile,
 		'date': date,
 		'nodate': nodate,
 		'search': search,
-		'command': command
+		'command': command,
+		'inputfile': inputfile,
+		'parser': parser,
+		'logfile': logfile
 	}
 
 def main():
 	options = get_options()
-	logging.basicConfig(level=options['loglevel'])
+
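+	# -l: send log messages to a file instead of stdout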
+	if options['logfile']:
+		logging.basicConfig(level=options['loglevel'],
+				filename=options['logfile'])
+	else:
+		logging.basicConfig(level=options['loglevel'])
 	config = parse_rc(options['rcfile'])
+
+	if os.path.exists(config['datafile']):
+		file = open(config['datafile'])
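+		# SQLite databases start with the magic string 'SQLite'; anything
+		# else is a data file left over from an old episoder version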
+		if file.read(6) != 'SQLite':
+			sys.stderr.write('episoder found an old data file at ' +
+				'%s. You have to delete ' % config['datafile'] +
+				'that file before episoder can proceed.\n')
+			sys.exit(4)
+
 	options['command'](options, config)
 
 if __name__ == "__main__":

pyepisoder/plugins.py

 import os
+import re
 import yaml
 import logging
 import urllib2
 import tempfile
 import datetime
 
+from BeautifulSoup import BeautifulSoup
+
 def all():
 	return {
-		'parsing': [ EpguidesParser(), DummyParser() ],
+		'parsing': [ EpguidesParser(), TVComParser(),
+			TVComDummyParser() ],
 		'output': [ ConsoleRenderer() ]
 	}
 
 
 	return None
 
+def parser_named(name):
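+	# find a parser plugin by its class name (as listed by -p)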
+	parsers = all()['parsing']
+
+	for parser in parsers:
+		if parser.__class__.__name__ == name:
+			return parser
+
+	raise Exception('Parser %s not found\n' % name)
+
 class DummyParser(object):
 	def __init__(self):
 		self.logger = logging.getLogger('DummyParser')
 		return url.startswith('http://www.epguides.com/')
 
 	def parse(self, source, store):
-		self.store = store
 		url = source['url']
 
 		if 'name' in source:
 			name = None
 
 		webdata = self._fetchPage(url)
-		yamlfile = self._runAwk(webdata)
+		self.parseFile(webdata, store, name)
+		os.unlink(webdata)
+
+	def parseFile(self, file, store, name=None):
+		self.store = store
+		yamlfile = self._runAwk(file)
 		data = self._readYaml(yamlfile, name)
 		self.store.commit()
-		os.unlink(webdata)
 		os.unlink(yamlfile)
 
 	def _fetchPage(self, url):
 			self.logger.debug('Found episode %s' % episode['title'])
 			self.store.addEpisode(show_id, episode)
 
+class TVComDummyParser(object):
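+	# catches old-style tv.com urls that TVComParser does not accept and
+	# tells the user to update them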
+	def __str__(self):
+		return 'dummy tv.com parser to detect old urls (DO NOT USE)'
+
+	def accept(self, url):
+		return url.startswith('http://www.tv.com')
+
+	def parse(self, source, _):
+		logging.error("The url %s needs to be updated" % source['url'])
+
+class TVComParser(object):
+	def __init__(self):
+		self.logger = logging.getLogger('TVComParser')
+
+	def __str__(self):
+		return 'tv.com parser'
+
+	def accept(self, url):
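+		# show overview urls look like http://www.tv.com/<show>/show/<id>/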
+		return re.match('http://(www.)?tv.com/\w+/show/\d+/?', url)
+
+	def parse(self, source, store):
+		url = source['url']
+
+		if 'name' in source:
+			name = source['name']
+		else:
+			name = None
+
+		print url
+
+	def parseFile(self, filename, store, name=None):
+		self.store = store
+		self.episodes = {}
+
+		file = open(filename)
+		data = file.read()
+		soup = BeautifulSoup(data.decode('ISO-8859-1'))
+		file.close()
+
+		elements = soup.findAll('li',
+				{ 'class': re.compile('episode.*')})
+
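+		# a page with a link to switch to the guide view is a list view;
+		# anything else is parsed as a guide view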
+		switch = soup.find('a', { 'class': 'switch_to_guide'})
+
+		if switch:
+			self.logger.debug('This is a list view page')
+			self.parseListViewPage(soup)
+		else:
+			self.logger.debug('This is a guide view page')
+			self.parseGuideViewPage(soup)
+
+	def parseListViewPage(self, soup):
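+		# the list view yields the production number and the overall
+		# episode number, stored under the numeric episode id taken from
+		# the reviews link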
+		elements = soup.findAll('tr', { 'class': 'episode' })
+
+		for element in elements:
+			tr = element.find('td', { 'class': 'number' })
+			totalepnum = int(tr.contents[0].strip())
+
+			tr = element.find('td', { 'class': 'prod_no' })
+			prodnum = tr.contents[0].strip()
+
+			reviews = element.find('td', { 'class': 'reviews' })
+			link = reviews.contents[0]
+			url = link['href']
+			parts = url.split('/')
+			id = int(parts[-2])
+
+			self.logger.debug("Found episode %d (%d)" %
+					(totalepnum, id))
+
+			if not id in self.episodes:
+				self.episodes[id] = {}
+
+			self.episodes[id]['prodnum'] = prodnum
+			self.episodes[id]['totalepnum'] = totalepnum
+
+	def parseGuideViewPage(self, soup):
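+		# the guide view yields season, episode number, air date and
+		# title, keyed by the same numeric episode id as the list view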
+		h1 = soup.find('h1')
+		show_name = h1.contents[0]
+		self.logger.debug('Got show "%s"' % show_name)
+
+		elements = soup.findAll('li',
+				{ 'class': re.compile('episode.*')})
+
+		for element in elements:
+			meta = element.find('div', { 'class': 'meta' })
+			data = meta.contents[0].strip()
+			result = re.search('Season ([0-9]+), Episode ([0-9]+)' +
+				'.* Aired: (.*)$', data)
+
+			season = result.group(1)
+			episode_num = result.group(2)
+			airdate = result.group(3)
+
+			h3 = element.find('h3')
+			link = h3.find('a')
+			title = link.contents[0]
+			url = link['href']
+			parts = url.split('/')
+			id = int(parts[-2])
+
+			if not id in self.episodes:
+				self.episodes[id] = {}
+			self.episodes[id]['season'] = season
+			self.episodes[id]['episode'] = episode_num
+			self.episodes[id]['airdate'] = airdate
+			self.episodes[id]['title'] = title
+
+			self.logger.debug('Found episode %s (%d)' %
+					(title, id))
+
 class ConsoleRenderer(object):
 	DEFAULT='\033[30;0m'
 	RED='\033[31;1m'
 			else:
 				self._renderEpisode(episode,
 						ConsoleRenderer.DEFAULT)
-
-
-
-	"""
-
-	if [ ! -z "$NODATE" ] && [ ! -z "$SEARCH_TEXT" ]; then
-		echo -ne ${color_gray}
-		grep -i "$SEARCH_TEXT" $EPISODER_DATAFILE | while read line; do
-			DATE=${line:0:10}
-			output=`echo $line | sed -r "s/([0-9]{4}-[0-9]{2}-[0-9]{2})/\`date +\"$DATE_FORMAT\" -d $DATE\`/" | sed -e "s/$SEARCH_TEXT/\\\\\E${color_green}\0\\\\\E${color_gray}/ig"`
-			echo -e $output
-		done
-	else
-		YESTERDAY=`get_date_by_offset -1`
-		TODAY=`get_date_by_offset 0`
-		TOMORROW=`get_date_by_offset 1`
-
-		echo -ne ${color_red}
-		grep "^$YESTERDAY" $EPISODER_DATAFILE | grep -i "$SEARCH_TEXT" | sed s/^/\ / | sed -r "s/([0-9]{4}-[0-9]{2}-[0-9]{2})/`date +"$DATE_FORMAT" -d $YESTERDAY`/"
-
-		echo -ne ${color_yellow}
-		grep "^$TODAY" $EPISODER_DATAFILE | grep -i "$SEARCH_TEXT" | sed 's/.*/>\0</' | sed -r "s/([0-9]{4}-[0-9]{2}-[0-9]{2})/`date +"$DATE_FORMAT" -d $TODAY`/"
-
-		echo -ne ${color_green}	
-		grep "^$TOMORROW" $EPISODER_DATAFILE | grep -i "$SEARCH_TEXT" | sed s/^/\ / | sed -r "s/([0-9]{4}-[0-9]{2}-[0-9]{2})/`date +"$DATE_FORMAT" -d $TOMORROW`/"
-
-		echo -ne ${color_lightblue}
-		for ((day=2; day <= $NUM_DAYS; day++)); do
-			DATE=`get_date_by_offset $day`
-			grep "^$DATE" $EPISODER_DATAFILE | grep -i "$SEARCH_TEXT" | sed s/^/\ / | sed -r "s/([0-9]{4}-[0-9]{2}-[0-9]{2})/`date +"$DATE_FORMAT" -d $DATE`/"
-		done
-
-	fi
-	echo -ne ${color_default}"""