Commits

Moritz Wilhelmy committed 23d3bcc

rename cron-feed -> cron-feed.py for clarity

  • Participants
  • Parent commits 0e1607e

Comments (0)

Files changed (2)

File cron-feed

-#!/usr/local/bin/python
-# This file is in the public domain.
-#
-# Written 2012 by
-#   Moritz Wilhelmy, mw at furnace wzff de
-#   Dirk Mallunat, s4msung at exigen org
-#
-# TODO:
-# - improve error handling, i.e. if things fail between opening the seen-file
-#   and finishing, have a backup file...
-# - Do something about RSS feeds without GUIDs in entries
-
-"""Poor man's feedreader"""
-
-import os, sys, codecs, time, fcntl, getopt
-import feedparser
-import mailbox
-import email.utils
-from urllib import pathname2url as urlenc
-from email.MIMEText import MIMEText
-from email.utils    import formatdate, mktime_tz
-from email.Header   import Header
-
-try:
-	import cPickle as pickle
-except ImportError:
-	import pickle
-
-######
-home   = os.getenv("HOME") or "."
-base   = home + "/lib/feeds/"
-furls  = base + "feeds" # textfile containing urls.
-fseen  = base + "seen"  # things that have already been sent, just pickle it..
-renderc= "elinks -dump /dev/stdin"
-# should also be possible to send them with sendmail if you need it..
-mbox   = mailbox.Maildir(home + "/.Maildir/.feeds")
-######
-
-class Bug(Exception):
-	pass
-
-class Unreachable(Bug):
-	pass
-
-class FeedCache(dict):
-	def __init__(self, filename):
-		self.filename = filename
-
-	def load(self):
-		self.clear()
-		try:
-			with open(self.filename, "rb") as fd:
-				self.update(pickle.load(fd))
-		except (IOError, EOFError):
-			pass
-
-	def dump(self):
-		try:
-			with open(self.filename, "wb") as fd:
-				fcntl.flock(fd, fcntl.LOCK_EX)
-				pickle.dump(dict(self), fd, -1)
-		except IOError:
-			print >> sys.stderr, "Can't open '%s' for writing. Check permissions, rinse, repeat" % self.filename
-			sys.exit(1)
-
-def render(html):
-	i, o = os.popen2(renderc)
-	i.write(html.encode("utf-8"))
-	i.close()
-	return o.read()
-
-def time_convert(t):
-	"""Workaround for feedparser discarding the timezone information from the
-	input data. (RSS and Atom do have a timezone field in their date/time
-	specification. Unfortunately, feedparser just drops it for no particular
-	reason). This means this reader might be some hours off with the timestamp."""
-	return formatdate(mktime_tz(t[:] + (0,)))
-
-def create_feed_mail(tags, author, title, body, date, id, link):
-	for body_charset in ("US-ASCII", "ISO-8859-1", "UTF-8"):
-		try:
-			body.encode(body_charset)
-		except UnicodeDecodeError:
-			pass
-		else:
-			break
-	#print body_charset
-
-	mail = MIMEText(body, "plain", body_charset)
-
-	# FIXME: Escaping?
-	if type(author) in (str, unicode):
-		mail["From"] = '%s <>' % Header(author.encode("utf-8"), "utf-8").encode("utf-8")
-	elif type(author) == feedparser.FeedParserDict: # FIXME: needs tweaking:
-		mail["From"] = '%s <%s>' %(Header(author.get("name", u"Unknown"), "utf-8"), author.get("email", u"").encode("utf-8"))
-	else:
-		raise Unreachable, "Unknown author type. This shouldn't happen"
-	# FIXME: Theoretically, there might be any mime type possible here:
-
-	if tags:
-		header_title = "[%s] %s" % (tags.encode("utf-8"), title.encode("utf-8"))
-	else: 
-		header_title = title.encode("utf-8")
-
-	mail["Subject"] = Header(header_title, "utf-8")
-	if date: mail["Date"] = date.encode("utf-8")
-	if id:   mail["Message-Id"] = '<%s@localhost>' % urlenc(id) # Helps filter duplicates
-	if link: mail["To"] = ('<%s>' % link).encode("utf-8") # XXX: come up with a better header?
-	return mail
-
-def iter_feed_config(filename):
-	with open(filename, "r") as fd:
-		for line in fd.xreadlines():
-			line = line.strip()
-			if not line or line[0] == "#":
-				continue
-
-			items = line.split()
-			folder = None
-			tags = set()
-			etag_enabled = True
-			for entry in items[1:]:
-				entry = entry.strip()
-				if not entry:
-					continue
-
-				try:
-					setting, val = entry.split("=",1)
-				except AttributeError:
-					print >> sys.stderr, \
-						"Invalid setting format for '%s', skipping feed '%s'." %(entry, items[0])
-					continue
-
-				if setting == "folder":
-					folder = val
-				elif setting == "tag":
-					tags.add(val)
-				elif setting == "etag":
-					etag_enabled = val.lower().startswith(("y", "t", "1"))
-
-			yield items[0], folder, ", ".join(tags), etag_enabled
-
-def process(fcache):
-	for url, folder, tags, etag_enabled in iter_feed_config(furls):
-		try:
-			feed = feedparser.parse(url, etag = etag_enabled and fcache.get("etag\n"+url))
-			if feed.get("status") == 304:
-				# Skip feeds with unmodified etag.
-				continue
-		except KeyboardInterrupt:
-			print >> sys.stderr, "Interrupted."
-			return
-		except:
-			print >> sys.stderr, "Error retrieving feed '%s'" % url
-			continue
-
-		if folder:
-			try:
-				fbox = mbox.get_folder(folder)
-			except:
-				fbox = mbox.add_folder(folder)
-		else:
-			fbox = mbox
-
-		fs = fcache.get(url, set())
-		for ent in feed.entries:
-			if not ent.has_key("id"):
-				print >> sys.stderr, "Feed '%s' currently does not work with cron-feed." % url
-				break
-			if ent["id"] in fs:
-				continue
-			title = ent.title_detail["value"] # XXX: take care of content types..
-			summ = render(ent.summary) # pray that elinks autodetects text/plain...
-			author = ent.get("author_detail") or ent.get("author") or\
-				feed.feed.get("author_detail") or feed.feed.get("author") or "Unknown"
-			link = ent.get("link", None)
-			try:
-				date = time_convert(ent.get("updated_parsed", feed.feed.get("updated_parsed")))
-			except:
-				date = None
-			mail = create_feed_mail(tags, author, title, summ, date, ent["id"], link)
-			fbox.add(mail)
-			fs.add(ent["id"])
-
-		fcache[url] = fs
-		if etag_enabled and feed.get("etag"):
-			fcache["etag\n"+url] = feed.get("etag")
-
-def options(fcache):
-	try:
-		opts, args = getopt.getopt(sys.argv[1:], "e:l", ["expunge=", "list"])
-	except getopt.GetoptError, err:
-		print str(err)
-		print
-		print "Usage: %s [-e/--expunge url] [-l/--list]" % os.path.basename(sys.argv[0])
-		print "  -l: list all entries in seen-file"
-		print "  -e: remove all entries of a certain feed"
-		print "Without any options, update all feeds specified in the feeds-file"
-		sys.exit(2)
-	for o, a in opts:
-		if o in ("--expunge", "-e"):
-			if fcache.has_key(a):
-				del fcache[a]
-				if fcache.has_key("etag\n"+a):
-					del fcache["etag\n"+a]
-				return
-			else:
-				print >> sys.stderr, "Feed-URL '%s' not found in seen-file" % a
-				sys.exit(1)
-		elif o in ("--list", "-l"):
-			for k in fcache.keys():
-				if not k.startswith("etag\n"):
-					print k
-			return
-		else:
-			raise Unreachable, "Unknown getopt result"	
-	# no options given, enter default mode
-	process(fcache)
-
-if __name__ == "__main__":
-	fcache = FeedCache(fseen)
-	fcache.load()
-	
-	options(fcache)
-
-	fcache.dump()

File cron-feed.py

+#!/usr/local/bin/python
+# This file is in the public domain.
+#
+# Written 2012 by
+#   Moritz Wilhelmy, mw at furnace wzff de
+#   Dirk Mallunat, s4msung at exigen org
+#
+# TODO:
+# - improve error handling, i.e. if things fail between opening the seen-file
+#   and finishing, have a backup file...
+# - Do something about RSS feeds without GUIDs in entries
+
+"""Poor man's feedreader"""
+
+import os, sys, codecs, time, fcntl, getopt
+import feedparser
+import mailbox
+import email.utils
+from urllib import pathname2url as urlenc
+from email.MIMEText import MIMEText
+from email.utils    import formatdate, mktime_tz
+from email.Header   import Header
+
+try:
+	import cPickle as pickle
+except ImportError:
+	import pickle
+
+######
+home   = os.getenv("HOME") or "."
+base   = home + "/lib/feeds/"
+furls  = base + "feeds" # textfile containing urls.
+fseen  = base + "seen"  # things that have already been sent, just pickle it..
+renderc= "elinks -dump /dev/stdin"
+# should also be possible to send them with sendmail if you need it..
+mbox   = mailbox.Maildir(home + "/.Maildir/.feeds")
+######
+
+class Bug(Exception):
+	pass
+
+class Unreachable(Bug):
+	pass
+
+class FeedCache(dict):
+	def __init__(self, filename):
+		self.filename = filename
+
+	def load(self):
+		self.clear()
+		try:
+			with open(self.filename, "rb") as fd:
+				self.update(pickle.load(fd))
+		except (IOError, EOFError):
+			pass
+
+	def dump(self):
+		try:
+			with open(self.filename, "wb") as fd:
+				fcntl.flock(fd, fcntl.LOCK_EX)
+				pickle.dump(dict(self), fd, -1)
+		except IOError:
+			print >> sys.stderr, "Can't open '%s' for writing. Check permissions, rinse, repeat" % self.filename
+			sys.exit(1)
+
+def render(html):
+	i, o = os.popen2(renderc)
+	i.write(html.encode("utf-8"))
+	i.close()
+	return o.read()
+
+def time_convert(t):
+	"""Workaround for feedparser discarding the timezone information from the
+	input data. (RSS and Atom do have a timezone field in their date/time
+	specification. Unfortunately, feedparser just drops it for no particular
+	reason). This means this reader might be some hours off with the timestamp."""
+	return formatdate(mktime_tz(t[:] + (0,)))
+
+def create_feed_mail(tags, author, title, body, date, id, link):
+	for body_charset in ("US-ASCII", "ISO-8859-1", "UTF-8"):
+		try:
+			body.encode(body_charset)
+		except UnicodeDecodeError:
+			pass
+		else:
+			break
+	#print body_charset
+
+	mail = MIMEText(body, "plain", body_charset)
+
+	# FIXME: Escaping?
+	if type(author) in (str, unicode):
+		mail["From"] = '%s <>' % Header(author.encode("utf-8"), "utf-8").encode("utf-8")
+	elif type(author) == feedparser.FeedParserDict: # FIXME: needs tweaking:
+		mail["From"] = '%s <%s>' %(Header(author.get("name", u"Unknown"), "utf-8"), author.get("email", u"").encode("utf-8"))
+	else:
+		raise Unreachable, "Unknown author type. This shouldn't happen"
+	# FIXME: Theoretically, there might be any mime type possible here:
+
+	if tags:
+		header_title = "[%s] %s" % (tags.encode("utf-8"), title.encode("utf-8"))
+	else: 
+		header_title = title.encode("utf-8")
+
+	mail["Subject"] = Header(header_title, "utf-8")
+	if date: mail["Date"] = date.encode("utf-8")
+	if id:   mail["Message-Id"] = '<%s@localhost>' % urlenc(id) # Helps filter duplicates
+	if link: mail["To"] = ('<%s>' % link).encode("utf-8") # XXX: come up with a better header?
+	return mail
+
+def iter_feed_config(filename):
+	with open(filename, "r") as fd:
+		for line in fd.xreadlines():
+			line = line.strip()
+			if not line or line[0] == "#":
+				continue
+
+			items = line.split()
+			folder = None
+			tags = set()
+			etag_enabled = True
+			for entry in items[1:]:
+				entry = entry.strip()
+				if not entry:
+					continue
+
+				try:
+					setting, val = entry.split("=",1)
+				except AttributeError:
+					print >> sys.stderr, \
+						"Invalid setting format for '%s', skipping feed '%s'." %(entry, items[0])
+					continue
+
+				if setting == "folder":
+					folder = val
+				elif setting == "tag":
+					tags.add(val)
+				elif setting == "etag":
+					etag_enabled = val.lower().startswith(("y", "t", "1"))
+
+			yield items[0], folder, ", ".join(tags), etag_enabled
+
+def process(fcache):
+	for url, folder, tags, etag_enabled in iter_feed_config(furls):
+		try:
+			feed = feedparser.parse(url, etag = etag_enabled and fcache.get("etag\n"+url))
+			if feed.get("status") == 304:
+				# Skip feeds with unmodified etag.
+				continue
+		except KeyboardInterrupt:
+			print >> sys.stderr, "Interrupted."
+			return
+		except:
+			print >> sys.stderr, "Error retrieving feed '%s'" % url
+			continue
+
+		if folder:
+			try:
+				fbox = mbox.get_folder(folder)
+			except:
+				fbox = mbox.add_folder(folder)
+		else:
+			fbox = mbox
+
+		fs = fcache.get(url, set())
+		for ent in feed.entries:
+			if not ent.has_key("id"):
+				print >> sys.stderr, "Feed '%s' currently does not work with cron-feed." % url
+				break
+			if ent["id"] in fs:
+				continue
+			title = ent.title_detail["value"] # XXX: take care of content types..
+			summ = render(ent.summary) # pray that elinks autodetects text/plain...
+			author = ent.get("author_detail") or ent.get("author") or\
+				feed.feed.get("author_detail") or feed.feed.get("author") or "Unknown"
+			link = ent.get("link", None)
+			try:
+				date = time_convert(ent.get("updated_parsed", feed.feed.get("updated_parsed")))
+			except:
+				date = None
+			mail = create_feed_mail(tags, author, title, summ, date, ent["id"], link)
+			fbox.add(mail)
+			fs.add(ent["id"])
+
+		fcache[url] = fs
+		if etag_enabled and feed.get("etag"):
+			fcache["etag\n"+url] = feed.get("etag")
+
+def options(fcache):
+	try:
+		opts, args = getopt.getopt(sys.argv[1:], "e:l", ["expunge=", "list"])
+	except getopt.GetoptError, err:
+		print str(err)
+		print
+		print "Usage: %s [-e/--expunge url] [-l/--list]" % os.path.basename(sys.argv[0])
+		print "  -l: list all entries in seen-file"
+		print "  -e: remove all entries of a certain feed"
+		print "Without any options, update all feeds specified in the feeds-file"
+		sys.exit(2)
+	for o, a in opts:
+		if o in ("--expunge", "-e"):
+			if fcache.has_key(a):
+				del fcache[a]
+				if fcache.has_key("etag\n"+a):
+					del fcache["etag\n"+a]
+				return
+			else:
+				print >> sys.stderr, "Feed-URL '%s' not found in seen-file" % a
+				sys.exit(1)
+		elif o in ("--list", "-l"):
+			for k in fcache.keys():
+				if not k.startswith("etag\n"):
+					print k
+			return
+		else:
+			raise Unreachable, "Unknown getopt result"	
+	# no options given, enter default mode
+	process(fcache)
+
+if __name__ == "__main__":
+	fcache = FeedCache(fseen)
+	fcache.load()
+	
+	options(fcache)
+
+	fcache.dump()