Anonymous committed d8a881b

'Finished' parser

Comments (0)

Files changed (6)

 In order to use episoder, you will need python installed on your system. In
-addition to the default distribution, the 'beautifulsoup' and 'yaml' modules
-are required. On debian systems the corresponding packages are called
-python-beautifulsoup and python-yaml.
+addition to the default distribution, the 'beautifulsoup', 'yaml' and
+'pysqlite2' modules are required. On debian systems the corresponding packages
+are called python-beautifulsoup, python-yaml and python-pysqlite2.
-Be warned, the new parser is highly experimental. If you notice any
-issues with it (wrong parsing results, crashes, ...) please report them via
-our bug tracker.
+With the new website design it's finally possible to parse their data
+in a reasonable way again. We still need to fetch two pages per show, but
+that's way better than before when we basically had to fetch one page for each season.
-Also, since does not include season information in the html pages,
-episoder has to download each season individually, leading to a high number of
-downloads per show. To reduce this effect, seasons are downloaded in reverse
-order and downloading stops as soon as an episode older than the requested
-start date is found, but still, at least two downloads per show and run are
-If you want to build a complete list of all episodes from a specific show,
-please use instead since that parser only needs to download one
-page per show.
+You might have to update your source lines to something like this:
+	src=
+If you don't, episoder should complain.
 	print '-----               ------------'
 	for parser in parsers:
-		print "%-20s%s" % (parser.__class__.__name__, parser)
+		description = str(parser)
+		if 'DO NOT USE' not in description:
+			print "%-20s%s" % (parser.__class__.__name__, parser)
 def show_help():
 	print """Usage: %s [options]
 .B data=/path/to/file
 Specifies the file to be used to store the information about upcoming shows.
-.B output_plugin=unquoted_plugin_name
-Specifies the output plugin to be used, defaults to 'plain' for backwards compatibility.
 .B src=http://some.web.add/ress
 Each src entry specifies a URL with episode information. Make sure you have the
-appropriate plugin before adding random new URLs (See \fBPLUGINS\fR).
+appropriate plugin before adding random new URLs (check with -p).
 .B src=http://some.web.add/ress name=some name
 Optionally you can specify a name to use for your show instead of the one extracted from the website.
     echo "40 5 * * * episoder -b" >> crontab
     crontab crontab
-episoder uses a set of plugins to parse the source files and to generate its output. See README.plugins for details.
 .B ~/.episoder
 - default configuration file


 		file = os.fdopen(fd, 'w')
+		self.logger.debug("Stored in %s" % name)
 		return name
 	def _runAwk(self, webdata):
 	def accept(self, url):
 		return re.match('http://(www.)?\w+/show/\d+/?', url)
+	def _fetchPage(self, url):
+'Fetching ' + url)
+		headers = { 'User-Agent': 'foo' }
+		request = urllib2.Request(url, None, headers)
+		result = urllib2.urlopen(request)
+		(fd, name) = tempfile.mkstemp()
+		file = os.fdopen(fd, 'w')
+		file.write(
+		file.close()
+		self.logger.debug("Stored in %s" % name)
+		return name
 	def parse(self, source, store): = store
 		self.episodes = {} = None
-		# need to get episode.html?season=All and
-		# episode.html?season=All&shv=guide
 		url = source['url']
+		guidepage = self._fetchPage(url +
+				'episode.html?season=All&shv=guide')
+		listpage = self._fetchPage(url +
+				'episode.html?season=All&shv=list')
 		if 'name' in source:
 			name = source['name']
 			name = None
-		print url
+		file = open(listpage)
+		self.parseListViewPage(BeautifulSoup(
+			'ISO-8859-1')))
+		file.close()
+		file = open(guidepage)
+		self.parseGuideViewPage(BeautifulSoup(
+			'ISO-8859-1')))
+		file.close()
+		os.unlink(guidepage)
+		os.unlink(listpage)
+		show_id =
+		for key in self.episodes:
+, self.episodes[key])
 	def parseFile(self, filename, store, name=None): = store
 					(totalepnum, id))
 			if not id in self.episodes:
-				#self.episodes[id] = {}
 				self.episodes[id] = Episode(None, None, 0,
 					0,, None, 0)
 		for element in elements:
 			meta = element.find('div', { 'class': 'meta' })
 			data = meta.contents[0].strip()
-			result ='Season ([0-9]+), Episode ([0-9]+)' +
-				'.* Aired: (.*)$', data)
+			result ='Season ([0-9]+).*Episode ([0-9]+)',
+					data)
 			season =
 			episode_num =
-			airdate = datetime.datetime.strptime(,
-					"%m/%d/%Y").date()
+			result ='Aired: (.*)$', data)
+			airdate = datetime.datetime.strptime(
+, "%m/%d/%Y").date()
 			h3 = element.find('h3')
 			link = h3.find('a')
 		scripts			= [ 'episoder' ],
 		long_description	= LONG_DESCRIPTION,
 		data_files		= files,
-		requires		= [ 'beautifulsoup', 'pysqlite2' ]
+		requires		= [ 'beautifulsoup', 'pysqlite2', 'yaml' ]
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.