1. Frederic De Groef
  2. csxj-crawler


Frederic De Groef committed e2e6eff

removed useless imports

  • Participants
  • Parent commits 6cee522
  • Branches default

Comments (0)

Files changed (3)

File csxj/datasources/lavenir.py

View file
  • Ignore whitespace
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import sys
-import locale
 from datetime import datetime
 import codecs
 from itertools import izip, chain
 from urlparse import urlparse
 from scrapy.selector import HtmlXPathSelector
-from csxj.common.tagging import classify_and_tag, make_tagged_url, update_tagged_urls, update_tagged_urls
+from csxj.common.tagging import classify_and_tag, make_tagged_url, update_tagged_urls
 from csxj.db.article import ArticleData
 from parser_tools.utils import fetch_html_content
 from parser_tools.utils import extract_plaintext_urls_from_text, setup_locales
 from parser_tools.utils import remove_text_formatting_markup_from_fragments, remove_text_formatting_and_links_from_fragments
-from helpers.unittest_generator import generate_test_func, save_sample_data_file

File csxj/datasources/lesoir.py

View file
  • Ignore whitespace
 from BeautifulSoup import BeautifulStoneSoup, Tag
 import urlparse
 import codecs
-from csxj.common.tagging import tag_URL, classify_and_tag, make_tagged_url, TaggedURL, update_tagged_urls
+from csxj.common.tagging import classify_and_tag, make_tagged_url, TaggedURL, update_tagged_urls
 from csxj.db.article import ArticleData
 from parser_tools.utils import fetch_html_content, fetch_rss_content, make_soup_from_html_content
 from parser_tools.utils import remove_text_formatting_markup_from_fragments, extract_plaintext_urls_from_text, remove_text_formatting_and_links_from_fragments

File csxj/datasources/septsursept.py

View file
  • Ignore whitespace
         source = None
     return source
 def extract_intro(soup):
     intro_box = soup.find(attrs = {"class" : "intro"})
     tagged_urls = []
         return []
 def extract_links_from_sidebar_box(soup):
     tagged_urls = list()
     sidebar_box = soup.find(attrs = {"class" : "teas_article_306 mar10 clear clearfix relatedcomponents"})
     return tagged_urls
 def extract_title_and_url_from_bslink(link):
     base_tags = []
     if link.get('href'):
     return title, url, base_tags
 def extract_category(soup):
     category_box = soup.find(attrs = {"class" : "actua_nav"})
     links = category_box.find_all('a')
     return [utils.remove_text_formatting_markup_from_fragments(link.contents[0]) for link in links]
 def find_embedded_media_in_multimedia_box(multimedia_box):
     tagged_urls = list()
     all_sections = multimedia_box.findAll("section")
             else :
                 raise ValueError("There seems to be an embedded video but we could not identify it. Please update parser.")
         elif 'snippet' in section.attrs['class']:
             # it might be a tweet
     return tagged_urls
 def extract_embedded_media(soup):
     tagged_urls = list()
 def detect_page_type(url):
     current_item_count = len(try_extract_frontpage_items(url)[0])
     frontpage_item_count  = len(get_frontpage_toc()[0])
   <!-- MEDUSA -->
 def is_404_page(html_data):
     stripped_html_data = html_data.translate(None, ' \n\t')
     stripped_404 = SEPTSURSEPT_404_PAGE_CONTENT.translate(None, ' \n\t')
         # save_sample_data_file(html_data, source, 'same_owner', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/septsursept')
         return (ArticleData(source, title, pub_date, pub_time, dt.datetime.now(),
-                        updated_tagged_urls,
-                        category, author_name,
-                        intro, text),
-            html_data)
+                updated_tagged_urls,
+                category, author_name,
+                intro, text),
+                html_data)
 # Check that the frontpage URLs do not point back to the frontpage (by applying the frontpage URL extraction function to each of them).
 def show_frontpage():
-    frontpage_items, blogposts = get_frontpage_toc()
+    frontpage_items, blogposts, _ = get_frontpage_toc()
     print "NEWS ({0}):".format(len(frontpage_items))
     for title, url in frontpage_items:
-        x, y = try_extract_frontpage_items(url)
+        x, y, _ = try_extract_frontpage_items(url)
         if len(x) > 0:
             print u"{0} \t\t [{1}]".format(title, url)
             print len(x)
 if __name__ == '__main__':
     url1 = "http://www.7sur7.be/7s7/fr/1504/Insolite/article/detail/1494529/2012/09/02/Ils-passent-devant-une-vitrine-avant-de-disparaitre.dhtml"
     url2 = "http://www.7sur7.be/7s7/fr/1504/Insolite/article/detail/1491869/2012/08/27/Fin-des-recherches-apres-une-alerte-au-lion-pres-de-Londres.dhtml"