Commits

Anonymous committed ae8ff87

Added doctest and scrape_is_stale method

Comments (0)

Files changed (1)

 import BeautifulSoup
 import hashlib
 import datetime
+import urllib2
+import re
 
 # BeautifulSoup 3.1 is fsckbroken.
 # See http://www.crummy.com/software/BeautifulSoup/3.1-problems.html
              issue sub-jobs to access referenced content.
            * Issue parser jobs to gearman queues for parsing mime types
         """
-        pass
+        if not url_is_well_formed(url):
+            raise ScraperError('URL format unknown or unaccessible to this scraper.')
+        
     
     def url_is_well_formed(self, url):
         """CitabilityScraper.url_is_well_formed(url):
 
-        Returns True or False based on whether the url is well formed.
+        Returns True or False based on whether the url is well formed and sane
+        for our purposes.
+
+        >>> scraper = CitabilityScraper()
+        >>> scraper.url_is_well_formed('http://google.com')
+        True
+        >>> scraper.url_is_well_formed('http://www.google.com/')
+        True
+        >>> scraper.url_is_well_formed('http://www.google.com/path/')
+        True
+        >>> scraper.url_is_well_formed('http://www.google.com/path/file.html')
+        True
+        >>> scraper.url_is_well_formed('http:www.failsauce.com')
+        False
+        >>> scraper.url_is_well_formed('http:/www.doublefailsauce.com')
+        False
+        >>> scraper.url_is_well_formed('http//www.superfailsauce.com')
+        False
+        >>> scraper.url_is_well_formed('http://failhost/')
+        False
+        >>> scraper.url_is_well_formed('htp://protocolfail.com')
+        False
+        >>> scraper.url_is_well_formed('this really just is ridiculous.')
+        False
         """
         pass
 
         """
         pass
 
+    def scrape_is_stale(self, url, last_datetime, etag, sha1hash):
+        """CitabilityScraper.scrape_is_stale(url, last_datetime, etag, sha1hash):
+        Returns True or False based on whether or not a new scrape ought to
+        take place, comparing the last-modified and etag headers, falling back
+        to content hash as a last resort.
+
+        Arguments:
+            url -- the URL in question
+            last_datetime -- a datetime object for the last time this was
+                scraped or None
+            etag -- the ETag value given during the last scrape or None
+            sha1hash -- the SHA1 hash of the content from the last scrape or 
+                None
+        
+        Returns:
+            True or False
+        """
+        pass
+
     def url_get(self, url, redirects_okay=True):
         """CitabilityScraper.url_get(url):