Commits

Anonymous committed a9077ee

Skeleton design for scraper.

  • Participants

Comments (0)

Files changed (1)

File citaworker.py

+import gearman
+import BeautifulSoup
+import hashlib
+import datetime
+
# BeautifulSoup 3.1 is broken for real-world HTML; only the 3.0.x series
# is usable here.
# See http://www.crummy.com/software/BeautifulSoup/3.1-problems.html
# Fixed: the original read "if note ...", a typo for "not" that made this
# guard a SyntaxError.
if not BeautifulSoup.__version__.startswith('3.0'):
    raise ImportError('Only BeautifulSoup versions 3.0.x are supported.')

MAX_DOWNLOAD_SIZE = 1024 * 1024 * 10  # 10 MiB cap on a single download
GEARMAN_DAEMON_IP = '127.0.0.1'       # address of the gearman job server
OUR_GEARMAN_FUNCTION = 'scrape'       # queue name this worker serves
PARSER_GEARMAN_FUNCTION = 'parse'     # downstream parser queue name
+
class ScraperError(Exception):
    """Raised by the scraper for bad HTTP responses and unparseable content.

    Fixed: the original subclassed ``exception`` (lowercase), which is a
    NameError at class-creation time; the builtin is ``Exception``.
    """
    pass
+
class CitabilityScraper(object):
    """Skeleton gearman worker that fetches URLs and dispatches parse jobs.

    Every method below is a stub (``pass``); the docstrings record the
    intended contract for the eventual implementation.
    """

    def __call__(self, url):
        """Scrape ``url`` — the worker's entry point.

        Planned flow:
          * avoid infinite loops via a scraper-history lookup
          * download the URL in question
          * if the mime-type can embed referenced content, issue
            sub-jobs for each embedded URL
          * queue parser jobs on the gearman parse queue
        """
        pass

    def url_is_well_formed(self, url):
        """Return True if ``url`` is well formed, False otherwise."""
        pass

    def scraper_history_lookup(self, url):
        """Look up when ``url`` was last scraped and what it hashed to.

        Arguments:
            url -- the URL to look up in the history

        Returns:
            A 3-tuple: datetime of the last scrape, the ETag the server
            gave (or None), and the SHA1 hash of the content fetched.
            (None, None, None) when the URL has never been scraped.
        """
        pass

    def log_scraper_lookup(self, url, etag, sha1hash):
        """Record that ``url`` was fetched and processed.

        Arguments:
            url -- the URL that was processed
            etag -- the ETag the webserver returned, or None
            sha1hash -- SHA1 hash of the content that was obtained
        """
        pass

    def url_get(self, url, redirects_okay=True):
        """Download ``url`` and perform the HTTP-level processing.

        Arguments:
            url -- a well-formed URL
            redirects_okay -- whether 30x redirects are followed

        Returns:
            A 4-tuple: mime-type, SHA1 hash of the content, the ETag the
            webserver returned (or None), and the content itself.

        Raises:
            ScraperError on a bad HTTP response.
        """
        pass

    def handle_embedded_links(self, mimetype, content):
        """Dispatch the scraper over URLs embedded in ``content``.

        Arguments:
            mimetype -- the mime-type of ``content``
            content -- the content, as a unicode string

        Returns:
            None

        Side-effects:
            When the mime-type supports embedded content, the scraper is
            dispatched for every relevant embedded URL.
        """
        pass

    def html_reference_parser(self, html):
        """Extract the relevant referenced URLs from HTML content.

        Arguments:
            html -- the HTML content as a unicode string

        Returns:
            A list of relevant URLs referenced from within the HTML.

        Raises:
            ScraperError when the HTML cannot be understood.
        """
        pass

    def text_reference_parser(self, text):
        """Extract the relevant referenced URLs from plain text.

        Arguments:
            text -- a unicode string of text

        Returns:
            A list of relevant URLs referenced from within the text.

        Raises:
            ScraperError when the text cannot be understood (can it?).
        """
        pass

    # Dispatch table: mime-types whose payloads can embed further links,
    # mapped to the extractor that pulls those links out.
    embedded_link_mimetypes = {
        'text/html': html_reference_parser,
        'text/plain': text_reference_parser,
        'application/xhtml+xml': html_reference_parser
        }