Commits

j00bar  committed 5975e0c

Skeleton of __call__ and added dispatch_parser

  • Participants
  • Parent commits ae8ff87

Comments (0)

Files changed (1)

File citaworker.py

              issue sub-jobs to access referenced content.
            * Issue parser jobs to gearman queues for parsing mime types
         """
-        if not url_is_well_formed(url):
+        if not self.url_is_well_formed(url):
             raise ScraperError('URL format unknown or unaccessible to this scraper.')
         
+        last_datetime, etag, sha1 = self.scraper_history_lookup(url)
+        if self.scrape_is_stale(url, last_datetime, etag, sha1):
+            mimetype, sha1hash, etag, content = self.url_get(url)
+            self.log_scraper_lookup(url, etag, sha1hash)
+            self.handle_embedded_links(mimetype, content)
+            self.dispatch_parser(url, content)
     
     def url_is_well_formed(self, url):
         """CitabilityScraper.url_is_well_formed(url):

         Judging by the caller's error message, this is meant to report
         whether the URL is in a format this scraper knows and can access.

         Not yet implemented: the body is a placeholder `pass`, so the
         method currently returns None. The visible caller raises
         ScraperError whenever the result is falsy, so every URL is
         rejected until this is filled in.

         Arguments:
             url -- the URL to check

         Returns:
             None for now; presumably a boolean once implemented
             (TODO confirm).
         """
         pass
     
+    def dispatch_parser(self, url, content):
+        """CitabilityScraper.dispatch_parser(url, content):
+        
+        Issue work requests to the gearman content parsers.
+
+        Not yet implemented: the body is a placeholder `pass`, so no
+        work requests are issued yet.
+
+        Arguments:
+            url -- the URL of the content to parse
+            content -- the content to parse
+        
+        Returns:
+            None
+        """
+        pass
+
     def handle_embedded_links(self, mimetype, content):
         """CitabilityScraper.handle_embedded_links(mimetype, content):