1. Joshua Ginsberg
  2. citability-scraper

Commits

j00bar committed 3e9d607

Implemented text and html link reference parsers

  • Participants
  • Parent commits: 5975e0c
  • Branches: default

Comments (0)

Files changed (1)

File citaworker.py

View file
 import datetime
 import urllib2
 import re
+import set
 
 # BeautifulSoup 3.1 is fsckbroken.
 # See http://www.crummy.com/software/BeautifulSoup/3.1-problems.html
 GEARMAN_DAEMON_IP = '127.0.0.1'
 OUR_GEARMAN_FUNCTION = 'scrape'
 PARSER_GEARMAN_FUNCTION = 'parse'
+URL_RE = re.compile(
+    r'https?://' # http:// or https://
+    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' #domain...
+    r'localhost|' #localhost...
+    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
+    r'(?::\d+)?' # optional port
+    r'(?:/?|/\S+)', re.IGNORECASE) # Thanks, Django!
+
 
 class ScraperError(exception):
     pass
         Raises:
             ScraperError if the HTML is not comprehensible.
         """
-        pass
+        to_return = sets.Set()
+        try:
+            soup = BeautifulSoup.BeautifulSoup(html)
+        except Exception, e:
+            raise ScraperError(str(e))
+        # we care about anchors, images, iframes, and frames
+        for tag, attr in [('a', 'href'),
+                          ('img', 'src'),
+                          ('frame', 'src'),
+                          ('iframe', 'src'),]:
+            instances = soup.findAll(tag)
+            for instance in instances:
+                try:
+                    to_return.add(instance[attr])
+                except KeyError:
+                    pass
+        return list(to_return)
     
     def text_reference_parser(self, text):
         """CitabilityScraper.text_reference_parser(text):
         Raises:
             ScraperError if the text is not comprehensible (can that happen?)
         """
-        pass
+        return URL_RE.findall(text)
     
     embedded_link_mimetypes = {
         'text/html': html_reference_parser,