Commits

Frederic De Groef committed 19610ec

[sudinfo] extract_article_data() supports URLs and file-like objects

Comments (0)

Files changed (1)

csxj/datasources/sudinfo.py

 
 
 
-def extract_article_data(source_url):
+def extract_article_data(source):
     """
     """
-    source_url = convert_utf8_url_to_ascii(source_url)
 
-    try:
-        html_content = fetch_html_content(source_url)
-    except urllib2.HTTPError as err:
-        if err.code == 404:
-            return None, "<html><head><title>404</title></head><body></body></html>"
-        else:
-            raise err
+    if hasattr(source, 'read'):
+        html_content = source.read()
+    else:
+        source_url = convert_utf8_url_to_ascii(source)
+        try:
+            html_content = fetch_html_content(source_url)
+        except urllib2.HTTPError as err:
+            if err.code == 404:
+                return None, "<html><head><title>404</title></head><body></body></html>"
+            else:
+                raise err
 
     hxs = HtmlXPathSelector(text=html_content)