Commits

Frederic De Groef committed 36ee531

[plaintext url extraction] using a new regexp to detect schemeless URLs (e.g. 'bit.ly' or 'bit.ly/foo'). Updated relevant tests.
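
For reference, a minimal standalone sketch of what the new matcher accepts; the pattern is copied verbatim from the utils.py diff below, and the sample strings come from the updated tests:

# coding=utf-8
import re

# regexp from: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
url_regexp = r'''(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
URL_MATCHER = re.compile(url_regexp)

# findall() returns one tuple per match because the pattern contains nested
# capture groups; the full URL is the first element of each tuple.
for sample in ('http://www.foo.com', 'www.foo.com', 'bit.ly/foo'):
    print [m[0] for m in URL_MATCHER.findall(sample)]
    # each sample prints as a one-element list, e.g. ['bit.ly/foo']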

Files changed (4)

csxj/datasources/parser_tools/utils.py

 import urlparse
 import re
 import random
-from BeautifulSoup import BeautifulSoup, Tag, Comment, NavigableString
+from BeautifulSoup import BeautifulSoup, Tag, NavigableString
 from useragents import USER_AGENT_STRINGS
 from datetime import datetime
 import bs4
 
+
 def pick_random_ua_string():
-    index = random.randint(0, len(USER_AGENT_STRINGS)-1)
+    index = random.randint(0, len(USER_AGENT_STRINGS) - 1)
     return USER_AGENT_STRINGS[index]
 
 
         return BeautifulSoup(html_content)
 
 
-URL_MATCHER = re.compile(r'\(?\bhttp://[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]') #comes from http://www.codinghorror.com/blog/2008/10/the-problem-with-urls.html
+# regexp from: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
+url_regexp = r'''(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
+URL_MATCHER = re.compile(url_regexp)
+
 
 def strip_matching_parenthesis(text):
+    """
+    >>> strip_matching_parenthesis("(foo)")
+    'foo'
+
+    >>> strip_matching_parenthesis("foo")
+    'foo'
+
+    >>> strip_matching_parenthesis("foo)")
+    'foo)'
+
+    >>> strip_matching_parenthesis("(foo")
+    '(foo'
+    """
     if text.startswith('(') and text.endswith(')'):
         return text[1:-1]
     return text
 
+
 def extract_plaintext_urls_from_text(some_text):
     """
     """
-    urls = URL_MATCHER.findall(some_text)
+    urls = [c[0] for c in URL_MATCHER.findall(some_text)]
+
     urls = [strip_matching_parenthesis(url) for url in urls]
     return urls
 
 
-TEXT_MARKUP_TAGS = ['a', 'b', 'i', 'u', 'em', 'strong', 'tt', 'h1',  'h2',  'h3',  'h4',  'h5', 'span', 'sub', 'sup', 'p', 'img' ]
+TEXT_MARKUP_TAGS = ['a', 'b', 'i', 'u', 'em', 'strong', 'tt', 'h1', 'h2', 'h3', 'h4', 'h5', 'span', 'sub', 'sup', 'p', 'img']
+
 
 def remove_text_formatting_markup(formatted_text_fragment, strip_chars, remove_links):
     """
         return formatted_text_fragment.strip(strip_chars)
 
 
-
 def remove_text_formatting_markup_from_fragments(fragments, strip_chars=''):
     """
     cleans up the html markup from a collection of fragments
     return u''.join(remove_text_formatting_markup(f, strip_chars, remove_links=True) for f in fragments)
 
 
-
 def setup_locales():
-    import locale, sys
+    import locale
+    import sys
     # for datetime conversions
     if sys.platform in ['linux2', 'cygwin']:
         locale.setlocale(locale.LC_TIME, 'fr_FR.UTF8')
-    elif sys.platform in [ 'darwin']:
+    elif sys.platform in ['darwin']:
         locale.setlocale(locale.LC_TIME, 'fr_FR')
-    elif sys.platform in [ 'win32']:
+    elif sys.platform in ['win32']:
         # locale string from: http://msdn.microsoft.com/en-us/library/cdax410z(v=VS.80).aspx
         locale.setlocale(locale.LC_ALL, 'fra')
 
 
-
 def is_date_in_range(date_string, date_range):
     start_date_string, end_date_string = date_range
 
     return date_to_test >= start_date and date_to_test <= end_date
 
 
-
 def convert_utf8_url_to_ascii(url):
     """
     taken from http://stackoverflow.com/questions/804336/best-way-to-convert-a-unicode-url-to-ascii-utf-8-percent-escaped-in-python
     """
     # turn string into unicode
-    if not isinstance(url,unicode):
+    if not isinstance(url, unicode):
         url = url.decode('utf8')
 
     # parse it
     parsed = urlparse.urlsplit(url)
 
     # divide the netloc further
-    userpass,at,hostport = parsed.netloc.rpartition('@')
-    user,colon1,pass_ = userpass.partition(':')
-    host,colon2,port = hostport.partition(':')
+    userpass, at, hostport = parsed.netloc.rpartition('@')
+    user, colon1, pass_ = userpass.partition(':')
+    host, colon2, port = hostport.partition(':')
 
     # encode each component
     scheme = parsed.scheme.encode('utf8')
     colon2 = colon2.encode('utf8')
     port = port.encode('utf8')
     path = '/'.join(  # could be encoded slashes!
-        urllib2.quote(urllib2.unquote(pce).encode('utf8'),'')
+        urllib2.quote(urllib2.unquote(pce).encode('utf8'), '')
         for pce in parsed.path.split('/')
     )
-    query = urllib2.quote(urllib2.unquote(parsed.query).encode('utf8'),'=&?/')
+    query = urllib2.quote(urllib2.unquote(parsed.query).encode('utf8'), '=&?/')
     fragment = urllib2.quote(urllib2.unquote(parsed.fragment).encode('utf8'))
 
     # put it back together
-    netloc = ''.join((user,colon1,pass_,at,host,colon2,port))
-    return urlparse.urlunsplit((scheme,netloc,path,query,fragment))
+    netloc = ''.join((user, colon1, pass_, at, host, colon2, port))
+    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
 
 
 
+if __name__ == '__main__':
 
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
+    print URL_MATCHER.findall("http://fff.com")
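
A note on the findall() change above: the old URL_MATCHER had no capture groups, so findall() returned plain strings, while the daringfireball pattern nests several groups, which is why extract_plaintext_urls_from_text() now keeps only c[0]. A toy pattern (hypothetical, purely to illustrate re.findall's behaviour with nested groups):

import re

# toy pattern with nested groups, mirroring the structure of the new URL_MATCHER
toy = re.compile(r'((\w+)\.(ly|com))')

print toy.findall('bit.ly and foo.com')
# [('bit.ly', 'bit', 'ly'), ('foo.com', 'foo', 'com')]

# keeping only the first group of each tuple recovers the full matches
print [m[0] for m in toy.findall('bit.ly and foo.com')]
# ['bit.ly', 'foo.com']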

tests/datasources/parser_tools/test_url_extraction.py

+# coding=utf-8
+
 from nose.tools import eq_
 from csxj.datasources.parser_tools.utils import extract_plaintext_urls_from_text
 
 
-class TestPlainTextURLExtractor():
+class TestPlainTextURLExtractor(object):
     def setUp(self):
         self.simple_url = 'http://www.foo.com'
         # fuck yeah gehol
         """
 
     def test_simple_url(self):
-        """
-            Test Plaintext URL extraction with a simple URL
-        """
+        """ extract_plaintext_urls_from_text() can extract a simple URL """
         text_with_url = self.text.format(self.simple_url)
         urls = extract_plaintext_urls_from_text(text_with_url)
         eq_(urls, [self.simple_url])
 
     def test_complex_url(self):
-        """
-            Test Plaintext URL extraction with a complex URL (parameters, port, spaces and semicolons)
-        """
+        """ extract_plaintext_urls_from_text() can extract a complex URL (parameters, port, spaces and semicolons) """
         text_with_url = self.text.format(self.complex_url)
         urls = extract_plaintext_urls_from_text(text_with_url)
         eq_(urls, [self.complex_url])
 
     def test_multiple_urls(self):
-        """
-            Test Plaintext URL extraction from a text with several URLs
-        """
+        """ extract_plaintext_urls_from_text() can extract several URLs from a piece of text"""
         text = 'this {0} has {1} many {2} links {3}'
         text_with_urls = text.format(self.simple_url, self.complex_url, self.complex_url, self.simple_url)
         urls = extract_plaintext_urls_from_text(text_with_urls)
         eq_(urls, [self.simple_url, self.complex_url, self.complex_url, self.simple_url])
 
     def test_text_with_urls(self):
+        """ extract_plaintext_urls_from_text()"""
         urls = extract_plaintext_urls_from_text(self.text_with_urls)
         eq_(urls, ['http://www.example.com', 'http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software)', 'http://msdn.microsoft.com/en-us/library/aa752574(VS.85).aspx', 'http://www.awesomeexample.com'])
 
     def test_no_url(self):
-        """
-            Test Plaintext URL extraction on a text with no URL
-        """
+        """ extract_plaintext_urls_from_text() returns an empty list if the text contains no URL"""
         text = self.text.format('not a url')
         urls = extract_plaintext_urls_from_text(text)
         eq_(urls, [])
 
-#    def test_schemeless_url(self):
-#        url = "foo.com"
-#        extracted_urls = extract_plaintext_urls_from_text(url)
-#        eq_([url], extracted_urls)
+    def test_schemeless_url(self):
+        """ extract_plaintext_urls_from_text() can handle urls with no scheme (e.g. 'www.foo.com') """
+        url = "www.foo.com"
+        extracted_urls = extract_plaintext_urls_from_text(url)
+        eq_([url], extracted_urls)
+
+    def test_tinylinks(self):
+        """extract_plaintext_urls_from_text() correctly guesses that things like “bit.ly/foo” and “is.gd/foo/” """
+        url = "bit.ly/foo"
+        extracted_urls = extract_plaintext_urls_from_text(url)
+        eq_([url], extracted_urls)

tests/datasources/test_dhnet.py

                 make_tagged_url("/infos/belgique/article/417311/sncb-et-infrabel-activent-leur-plan-hiver.html", u"""SNCB et Infrabel activent leur plan hiver""", set(['bottom box', 'internal'])),
                 make_tagged_url("/infos/belgique/article/417423/au-volant-le-calme-est-le-plus-important.html", u"""Au volant, le calme est le plus important""", set(['bottom box', 'internal'])),
                 make_tagged_url("/infos/belgique/article/417478/vingt-centres-de-ski-ouverts.html", u"""Vingt centres de ski ouverts""", set(['bottom box', 'internal'])),
+                make_tagged_url("www.thalys.com", "www.thalys.com", set(["plaintext", "in text", "external"]))
             ]
             expected_links = tagged_urls
             assert_taggedURLs_equals(expected_links, extracted_links)
             ]
 
             expected_intext_links = [
-                make_tagged_url("", u"www.soisbelge.be ou www.compagnievictor.be. ", set(["no target", "in text"]))
+                make_tagged_url("", u"www.soisbelge.be ou www.compagnievictor.be. ", set(["no target", "in text"])),
+                make_tagged_url("www.compagnievictor.be", "www.compagnievictor.be", set(["plaintext", "in text", "external"])),
+                make_tagged_url("www.soisbelge.be", "www.soisbelge.be", set(["plaintext", "in text", "external"]))
             ]
 
             expected_embedded_videos = [

tests/datasources/test_lalibre.py

                 make_tagged_url("http://www.madamemoustache.be/page/Upcoming/101/16122011-JoyeuxBordelpresentFUCKYOUITSMAGIC.html", u""" FUCK YOU IT S XMAS""", set(['external', 'in text'])),
                 make_tagged_url("http://www.netevents.be/fr/soiree/203907/Chez-Maman-fete-ses-17-ans/", u'''"Chez maman"''', set(['external', 'in text'])),
                 make_tagged_url("", u"""“Super Saturday”""", set(['no target', 'in text'])),
+                make_tagged_url("www.forestnational.be", "www.forestnational.be", set(['external', 'in text', 'plaintext'])),
+                make_tagged_url("www.stratos-sphere.com", "www.stratos-sphere.com", set(['external', 'in text', 'plaintext']))
             ]
 
             assert_taggedURLs_equals(expected_intext_links, extracted_links)
             article, raw_html = lalibre.extract_article_data(f)
             extracted_links = article.links
             tagged_urls = [
-                make_tagged_url("http://www.micronutris.com/)", u"""http://www.micronutris.com/)""", set(['plaintext', 'external', 'in text'])),
                 make_tagged_url("/societe/gastronomie/article/785611/belgian-bubbles-un-produit-100-naturel-pour-des-fetes-reussies.html", u"""Belgian Bubbles, un produit 100% naturel pour des fêtes réussies""", set(['internal', 'sidebar box'])),
                 make_tagged_url("/societe/gastronomie/article/787152/edito-crise-de-foie-gras.html", u"""Edito: Crise de foie (gras)""", set(['internal', 'sidebar box'])),
                 make_tagged_url("/economie/entreprise-emploi/article/787084/upignac-a-la-gnaque.html", u"""Upignac a la gnaque""", set(['internal', 'sidebar box'])),
                 make_tagged_url("/societe/gastronomie/article/785611/belgian-bubbles-un-produit-100-naturel-pour-des-fetes-reussies.html", u"""Belgian Bubbles, un produit 100% naturel pour des fêtes réussies""", set(['bottom box', 'internal'])),
                 make_tagged_url("/societe/gastronomie/article/787152/edito-crise-de-foie-gras.html", u"""Edito: Crise de foie (gras)""", set(['bottom box', 'internal'])),
                 make_tagged_url("/economie/entreprise-emploi/article/787084/upignac-a-la-gnaque.html", u"""Upignac a la gnaque""", set(['bottom box', 'internal'])),
+                make_tagged_url("http://www.micronutris.com/", "http://www.micronutris.com/", set(['plaintext', 'external', 'in text']))
             ]
             expected_links = tagged_urls
             assert_taggedURLs_equals(expected_links, extracted_links)