Grigoriy Petukhov avatar Grigoriy Petukhov committed 0c83d69

Parser's code cleanup. Verbose docstring

Comments (0)

Files changed (1)


-Functions for easy parsing RSS and ATOM feeds.
+This module provides the `parse_feed` function, which
+uses the power of `feedparser` to parse RSS/Atom feeds.
+`feedparser` returns the parsed feed as is. If you need a
+post's content, you have to check the `content`, `summary`, and
+`description` attributes of the feed entry. The same is true
+for the updated time and some other things.
+`parse_feed` tries to normalize these things. You'll always
+get the content in the `content` attribute and the updated time in the `created` attribute.
+`parse_feed` also tries to parse non-English variants of the updated time and
+to extract tags.
 import locale
 import sha
 import clean
+log = logging.getLogger('feedzilla.util.parse')
 def guess_date(dates, feed):
     Try to parse date in non-standard format.
         guessed = guess_date(unparsed, feed)
         if guessed:
             return guessed
-    example = unparsed[0] if unparsed else ''
-    logging.error('Could not parse modified date %s of post %s' % (getattr(entry, 'link', ''), example))
     return None
 def get_tags(entry):
-    """
-    Returns a list of tag objects from an entry.
-    """
+    "Return a list of tag objects of the entry"
     tags = set()
     if 'tags' in entry:
 def parse_feed(url=None, source_data=None, summary_size=1000, etag=None):
     Parse feed from url or source data.
     Returns dict with feed, entries and success flag
         resp['feed'] = feedparser.parse(url and url or source_data)
     except Exception, ex:
         resp['error'] = ex
+        log.error('Feed parsing failed', exc_info=ex)
         return resp
         resp['success'] = True
         resp['feed'].last_checked =
     for entry in resp['feed'].entries:
+        link = getattr(entry, 'link', '')
         # Do not process entries without title
         if not hasattr(entry, 'title'):
+            log.error('Post %s does not has a title' % link)
         title = entry.title
-        link = getattr(entry, 'link', '')
         if hasattr(entry,'content'):
             content = entry.content[0].value
         created = parse_modified_date(entry, resp['feed'])
         if not created:
+            log.error('Post %s does not has modified date' % link)
         tags = get_tags(entry)
         guid ='utf-8')).hexdigest()
-        entry = {'title': title, 'link': link, 'summary': summary,
-                 'content': content, 'created': created,
-                 'guid': guid, 'tags': tags}
+        entry = {
+            'title': title,
+            'link': link,
+            'summary': summary,
+            'content': content,
+            'created': created,
+            'guid': guid,
+            'tags': tags,
+        }
     return resp
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.