# citability-scraper / citaworker.py

import gearman
import BeautifulSoup
import hashlib
import datetime
import urllib2
import re
import subprocess

# BeautifulSoup 3.1 is fsckbroken.
# See http://www.crummy.com/software/BeautifulSoup/3.1-problems.html
if not BeautifulSoup.__version__.startswith('3.0'):
    raise ImportError('Only BeautifulSoup versions 3.0.x are supported.')

MAX_DOWNLOAD_SIZE = 1024 * 1024 * 10 # 10MiB
GEARMAN_DAEMON_IP = '127.0.0.1'
OUR_GEARMAN_FUNCTION = 'scrape'
PARSER_GEARMAN_FUNCTION = 'parse'
URL_RE = re.compile(
    r'https?://' # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' #domain...
    r'localhost|' #localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?' # optional port
    r'(?:/\S+|/?)', re.IGNORECASE) # path; longest first so findall keeps it. Thanks, Django!
USER_AGENT = 'CitabilityScraper/1.0'

class ScraperError(Exception):
    """ ScraperError -- raised by errors attempting to scrape a given URL
    Attributes:
        url -- url that was being scraped when error encountered
        msg -- message set by the raising code to describe error encountered
    """
    def __init__(self, url, msg):
        Exception.__init__(self, msg)
        self.url = url
        self.msg = msg

class CitabilityScraper(object):

    def __call__(self, url):
        """CitabilityScraper()(url):

        Scraper plan:
           * Simple infinite loop avoidance through scraper history checking
           * Grab the URL in question
           * If it's a mime-type that lends itself to referenced content,
             issue sub-jobs to access referenced content.
           * Issue parser jobs to gearman queues for parsing mime types
        """
        if not self.url_is_well_formed(url):
            raise ScraperError(url, 'URL format unknown or inaccessible to this scraper.')
        
        last_datetime, etag, sha1 = self.scraper_history_lookup(url)
        if self.scrape_is_stale(url, last_datetime, etag, sha1):
            mimetype, sha1hash, etag, content = self.url_get(url)
            self.log_scraper_lookup(url, etag, sha1hash)
            self.handle_embedded_links(mimetype, content)
            self.dispatch_parser(url, content)
    
    def url_is_well_formed(self, url):
        """CitabilityScraper.url_is_well_formed(url):

        Returns True or False based on whether the url is well formed and sane
        for our purposes.

        >>> scraper = CitabilityScraper()
        >>> scraper.url_is_well_formed('http://google.com')
        True
        >>> scraper.url_is_well_formed('http://www.google.com/')
        True
        >>> scraper.url_is_well_formed('http://www.google.com/path/')
        True
        >>> scraper.url_is_well_formed('http://www.google.com/path/file.html')
        True
        >>> scraper.url_is_well_formed('http:www.failsauce.com')
        False
        >>> scraper.url_is_well_formed('http:/www.doublefailsauce.com')
        False
        >>> scraper.url_is_well_formed('http//www.superfailsauce.com')
        False
        >>> scraper.url_is_well_formed('http://failhost/')
        False
        >>> scraper.url_is_well_formed('htp://protocolfail.com')
        False
        >>> scraper.url_is_well_formed('this really just is ridiculous.')
        False
        """
        return URL_RE.match(url) is not None

    def scraper_history_lookup(self, url):
        """CitabilityScraper.scraper_history_lookup(url):
        
        Performs a lookup to see how long since the last time this content was
        scraped and what its content hash was at the time.
        
        Arguments:
            url -- The url to lookup in the history
        
        Returns:
            A 3-tuple of a datetime for the last time the scraper accessed the
            content, the ETag given or None if none was given, and the SHA1 hash
            of the content.
            If the scraper has never accessed this url, return None, None, None
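
        Example (illustrative; assumes the in-memory sketch below):

        >>> CitabilityScraper().scraper_history_lookup('http://example.com/')
        (None, None, None)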
        """
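        # Illustrative sketch: an in-memory dict stands in for whatever
        # datastore ends up backing the scraper history, so nothing here
        # survives a worker restart.
        record = getattr(self, '_history', {}).get(url)
        if record is None:
            return (None, None, None)
        return record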

    def log_scraper_lookup(self, url, etag, sha1hash):
        """CitabilityScraper.log_scraper_lookup(url, etag, sha1hash):
        
        Logs that the scraper has grabbed and processed a particular URL.
        
        Arguments:
            url -- the URL that has been processed
            etag -- the ETag returned by the webserver or None
            sha1hash -- the SHA1 hash of the content obtained
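
        Example (illustrative; assumes the in-memory sketch below):

        >>> s = CitabilityScraper()
        >>> s.log_scraper_lookup('http://example.com/', '"abc123"', 'deadbeef')
        >>> s.scraper_history_lookup('http://example.com/')[1:]
        ('"abc123"', 'deadbeef')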
        """
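        # Illustrative sketch: the in-memory counterpart to
        # scraper_history_lookup above; a real implementation would write
        # to shared, durable storage.
        if not hasattr(self, '_history'):
            self._history = {}
        self._history[url] = (datetime.datetime.utcnow(), etag, sha1hash)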

    def scrape_is_stale(self, url, last_datetime, etag, sha1hash):
        """CitabilityScraper.scrape_is_stale(url, last_datetime, etag, sha1hash):
        Returns True or False based on whether or not a new scrape ought to
        take place, comparing the last-modified and etag headers, falling back
        to content hash as a last resort.

        Arguments:
            url -- the URL in question
            last_datetime -- a datetime object for the last time this was
                scraped or None
            etag -- the ETag value given during the last scrape or None
            sha1hash -- the SHA1 hash of the content from the last scrape or 
                None
        
        Returns:
            True or False
        """
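        # Illustrative sketch, assuming a conditional GET is acceptable
        # here: the server answers the staleness question, and anything
        # but a 304 means a fresh scrape is due. Comparing sha1hash
        # against the hash of freshly fetched content is left to the
        # caller as the true last resort.
        if last_datetime is None:
            return True # never scraped before
        request = urllib2.Request(url)
        request.add_header('User-Agent', USER_AGENT)
        request.add_header('If-Modified-Since',
                           last_datetime.strftime('%a, %d %b %Y %H:%M:%S GMT'))
        if etag is not None:
            request.add_header('If-None-Match', etag)
        try:
            urllib2.urlopen(request).close()
        except urllib2.HTTPError, e:
            if e.code == 304:
                return False # server says the content is unchanged
            raise ScraperError(url, 'HTTP error %d during staleness check'
                               % e.code)
        return True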

    def url_get(self, url, redirects_okay=True):
        """CitabilityScraper.url_get(url):

        Given a particular URL, download and do HTTP processing of the content.

        Arguments: 
            url -- a well-formed URL
            redirects_okay -- a boolean as to whether 30x redirects will be
                followed
        
        Returns: a 4-tuple of mime-type, SHA1 hash of the content, the ETag 
        returned by the webserver or None, and the downloaded content itself.

        Raises: ScraperError in the event of a bad HTTP response
        """
        request = urllib2.Request(url)
        request.add_header('User-Agent', USER_AGENT)

        if redirects_okay:
            opener = urllib2.build_opener()
        else:
            # refuse redirects: install a handler that raises instead of
            # following them
            class RaiseOnRedirect(urllib2.HTTPRedirectHandler):
                def redirect_request(self, req, fp, code, msg, headers,
                                     newurl):
                    raise ScraperError(req.get_full_url(),
                                       'Redirect (HTTP %d) encountered with '
                                       'redirects disabled' % code)
            opener = urllib2.build_opener(RaiseOnRedirect)

        try:
            f = opener.open(request)
        except urllib2.HTTPError, e:
            raise ScraperError(url, 'Bad HTTP response: %d' % e.code)
        info = f.info()
        data = f.read(MAX_DOWNLOAD_SIZE) # cap the download at 10MiB
        f.close()

        # strip any charset parameter so mime-type dict lookups match
        mime = info['content-type'].split(';')[0].strip()
        etag = info['etag'] if info.has_key('etag') else None
        sha1 = hashlib.sha1(data).hexdigest()

        return (mime, sha1, etag, data)
    
    def dispatch_parser(self, url, content):
        """CitabilityScraper.dispatch_parser(url, content):
        
        Issue work requests to the gearman content parsers.

        Arguments:
            url -- the URL of the content parsed
            content -- the content to parse
        
        Returns:
            The handle ID of the Gearman task
        """
        client = gearman.GearmanClient([GEARMAN_DAEMON_IP])
        handle = client.dispatch_background_task(PARSER_GEARMAN_FUNCTION,
                                                 url, content)
        return handle

    def handle_embedded_links(self, mimetype, content):
        """CitabilityScraper.handle_embedded_links(mimetype, content):
        
        Given a mimetype and some content, handle dispatching of the scraper
        to embedded urls.
        
        Arguments:
            mimetype -- the mimetype of the content
            content -- the content as a unicode string
        
        Returns:
            None
        
        Side-effects:
            If the mimetype is supported for embedded content, the scraper is
            dispatched for all relevant embedded urls.
        """
        if mimetype in self.embedded_link_mimetypes:
            # embedded_link_mimetypes holds the plain functions captured at
            # class-definition time, not bound methods, so hand self over
            # manually as the first argument
            urls = self.embedded_link_mimetypes[mimetype](self, content)
            client = gearman.GearmanClient([GEARMAN_DAEMON_IP])
            for url in urls:
                # skip relative or malformed references; resolving them
                # would need the base URL, which isn't available here
                if self.url_is_well_formed(url):
                    client.dispatch_background_task(OUR_GEARMAN_FUNCTION,
                                                    url)

    def html_reference_parser(self, html):
        """CitabilityScraper.html_reference_parser(html):

        Extracts all relevant referenced content from the html content.
        
        Arguments:
            html -- the HTML content as a unicode string
        
        Returns:
            A list of relevant URLs referenced from within the HTML
        
        Raises:
            ScraperError if the HTML is not comprehensible.
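
        Example (illustrative; BeautifulSoup 3.0.x returns attribute
        values as unicode):

        >>> scraper = CitabilityScraper()
        >>> sorted(scraper.html_reference_parser(
        ...     u'<a href="http://example.com/a">x</a>'
        ...     u'<img src="http://example.com/i.png" />'))
        [u'http://example.com/a', u'http://example.com/i.png']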
        """
        to_return = set()
        try:
            soup = BeautifulSoup.BeautifulSoup(html)
        except Exception, e:
            # no URL is in scope here, so ScraperError gets None for it
            raise ScraperError(None, str(e))
        # we care about anchors, images, iframes, and frames
        for tag, attr in [('a', 'href'),
                          ('img', 'src'),
                          ('frame', 'src'),
                          ('iframe', 'src'),]:
            instances = soup.findAll(tag)
            for instance in instances:
                try:
                    to_return.add(instance[attr])
                except KeyError:
                    pass
        return list(to_return)
    
    def text_reference_parser(self, text):
        """CitabilityScraper.text_reference_parser(text):
        
        Extracts all relevant referenced content from the plain text content.
        
        Arguments:
            text -- a unicode string of text
        
        Returns:
            A list of relevant URLs referenced from within the plain text
        
        Raises:
            ScraperError if the text is not comprehensible (can that happen?)
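
        Example (illustrative):

        >>> CitabilityScraper().text_reference_parser(
        ...     u'see http://example.com/report for details')
        [u'http://example.com/report']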
        """
        return URL_RE.findall(text)

    def pdf_reference_parser(self, pdf):
        """CitabilityScraper.pdf_reference_parser(pdf):
            
        Extracts all relevant referenced content from the PDF.

        Arguments:
            pdf -- filename of a downloaded pdf to parse references from.

        Returns:
            A list of relevant URLs extracted from the PDF

        Raises:
            ScraperError if the pdf is not parseable
        """

    def msword_reference_parser(self, doc):
        """ CitabilityScraper.msword_reference_parser(doc):
            
        Extracts all relevant referenced content from the MS Word file.

        Arguments:
            doc -- filename of a downloaded doc to parse references from.

        Returns:
            A list of relevant URLs extracted from the doc

        Raises:
            ScraperError if the doc is not parseable
        
        """

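        # Illustrative sketch, assuming antiword is installed (catdoc
        # would also work); see the hypothetical helper above.
        return self._external_text_references(doc, ['antiword', doc])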
    
    def msexcel_reference_parser(self, xls):
        """CitabilityScraper.msexcel_reference_parser(xls):
            
        Extracts all relevant referenced content from the MS Excel file.

        Arguments:
            xls -- filename of a downloaded xls to parse references from.

        Returns:
            A list of relevant URLs extracted from the xls

        Raises:
            ScraperError if the xls is not parseable 

        """

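        # Illustrative sketch, assuming xls2csv from the catdoc package
        # is installed; URLs survive the CSV conversion intact enough
        # for URL_RE to find them.
        return self._external_text_references(xls, ['xls2csv', xls])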

    def mspowerpoint_reference_parser(self, ppt):
        """CitabilityScraper.ppt_reference_parser(ppt):
            
        Extracts all relevant referenced content from the MS Powerpoint.

        Arguments:
            ppt -- filename of a downloaded ppt to parse references from.

        Returns:
            A list of relevant URLs extracted from the ppt

        Raises:
            ScraperError if the ppt is not parseable

        """

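        # Illustrative sketch, assuming catppt from the catdoc package
        # is installed.
        return self._external_text_references(ppt, ['catppt', ppt])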

    embedded_link_mimetypes = {
        'text/html': html_reference_parser,
        'text/plain': text_reference_parser,
        'application/xhtml+xml': html_reference_parser,
        'application/pdf': pdf_reference_parser,
        'application/msword': msword_reference_parser,
        'application/vnd.ms-excel': msexcel_reference_parser,
        'application/vnd.ms-powerpoint': mspowerpoint_reference_parser
        }
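
if __name__ == '__main__':
    # Minimal harness for exercising the module. With no arguments it
    # runs the docstring examples; with 'work' it registers the scraper
    # with the local gearman daemon. The worker API here is assumed to
    # match the pre-2.0 gearman library that the dispatch_background_task
    # calls above imply; adjust for your gearman version.
    import sys
    if len(sys.argv) > 1 and sys.argv[1] == 'work':
        scraper = CitabilityScraper()
        worker = gearman.GearmanWorker([GEARMAN_DAEMON_IP])
        worker.register_function(OUR_GEARMAN_FUNCTION,
                                 lambda job: scraper(job.arg))
        worker.work()
    else:
        import doctest
        doctest.testmod()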