csxj-crawler / csxj / datasources /

Full commit
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from datetime import datetime, time
from itertools import chain
import re
import urlparse

from BeautifulSoup import Tag

from csxj.common.tagging import classify_and_tag, make_tagged_url
from csxj.db.article import ArticleData
from parser_tools.utils import fetch_html_content, make_soup_from_html_content, remove_text_formatting_markup_from_fragments
from parser_tools.utils import extract_plaintext_urls_from_text, setup_locales
from parser_tools import constants
from parser_tools import ipm_utils


    '': ['internal blog', 'internal', 'sports'],
    '': ['internal site', 'internal', 'image gallery'],


# Short name of this datasource. It is not referenced by the code visible in
# this module -- presumably read by the crawler framework (TODO confirm).
SOURCE_NAME = u"dhnet"

def is_on_same_domain(url):
    """
    Tell whether `url` lives on the dhnet domain without being a known internal site.

    Until we get all the internal blogs/sites, we can still detect
    if a page is hosted on the same domain.

    Returns False when the url's network location is listed in
    DHNET_INTERNAL_SITES or does not end with the dhnet domain (this includes
    relative urls, which have an empty network location); True otherwise.
    """
    netloc = urlparse.urlparse(url).netloc
    if netloc in DHNET_INTERNAL_SITES:
        return False
    # NOTE(review): the suffix used to be an empty string, which made
    # `endswith` succeed for every url (external ones included). 'dhnet.be'
    # is the presumed site domain -- confirm against DHNET_NETLOC.
    return netloc.endswith('dhnet.be')

def classify_and_make_tagged_url(urls_and_titles, additional_tags=frozenset()):
    """
    Classify (with tags) every element in a list of (url, title) tuples.

    Each url is tagged by `classify_and_tag`; urls hosted on the same domain
    (see `is_on_same_domain`) also get the 'internal site' and 'internal'
    tags. `additional_tags` is an iterable of extra tags added to every url.

    Returns a list of TaggedURLs, in the same order as `urls_and_titles`.
    """
    tagged_urls = []
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        if is_on_same_domain(url):
            tags = tags.union(['internal site', 'internal'])
        all_tags = tags.union(additional_tags)
        tagged_urls.append(make_tagged_url(url, title, all_tags))
    return tagged_urls

def cleanup_text_fragment(text_fragment):
    """
    Recursively cleans up a text fragment (e.g. nested tags).

    A Tag has its contents run through
    `remove_text_formatting_markup_from_fragments`; anything else is assumed
    to already be plain text and is returned unchanged.
    """
    if isinstance(text_fragment, Tag):
        return remove_text_formatting_markup_from_fragments(text_fragment.contents)
    else:
        return text_fragment

def filter_out_useless_fragments(text_fragments):
    """
    Removes all <br /> tags and whitespace-only strings (e.g. lone newlines)
    from a list of text fragments extracted from an article.

    Returns a new list; the relative order of the kept fragments is preserved.
    """
    def is_linebreak(text_fragment):
        # Tags are only dropped when they are <br>; strings are dropped
        # when nothing is left once the surrounding whitespace is removed.
        if isinstance(text_fragment, Tag):
            return text_fragment.name == 'br'
        else:
            return not text_fragment.strip()

    return [fragment for fragment in text_fragments if not is_linebreak(fragment)]

def separate_no_target_links(links):
    """
    Split a list of (target, title) tuples in two groups.

    Returns a pair of lists:
      * the links whose target is falsy, each rewritten as ('', title);
      * every remaining link, deduplicated and in no particular order
        (they go through a set).
    """
    targetless = []
    for target, title in links:
        if not target:
            targetless.append((target, title))

    with_target = list(set(links).difference(targetless))
    blanked = [('', title) for (_, title) in targetless]
    return blanked, with_target

def separate_keyword_links(all_links):
    """
    Split links into keyword links and the rest.

    A keyword link is one whose first item (the url) starts with '/sujet'.
    Returns (keyword_links, other_links); keyword links keep their input
    order, the others are deduplicated and unordered (they go through a set).
    """
    keyword_links = []
    for link in all_links:
        if link[0].startswith('/sujet'):
            keyword_links.append(link)

    remaining = set(all_links).difference(keyword_links)
    return keyword_links, list(remaining)

def extract_and_tag_in_text_links(article_text):
    """
    Finds the links tags in the html text content.
    Detects which links are keyword and which aren't, sets the adequate tags.

    Every link gets the 'in text' tag; keyword links also get 'keyword' and
    links without a target also get 'no target'.

    Returns a list of TaggedURL objects.
    """
    def extract_link_and_title(link):
        return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)

    links = [extract_link_and_title(link)
             for link in article_text.findAll('a', recursive=True)]

    no_target_links, target_links = separate_no_target_links(links)
    keyword_links, other_links = separate_keyword_links(target_links)

    tagged_urls = (
        classify_and_make_tagged_url(keyword_links, additional_tags=set(['keyword', 'in text'])) +
        classify_and_make_tagged_url(other_links, additional_tags=set(['in text'])) +
        classify_and_make_tagged_url(no_target_links, additional_tags=set(['in text', 'no target']))
    )

    return tagged_urls

def extract_text_content_and_links_from_articletext(article_text, has_intro=True):
    """
    Cleans up the text from html tags, extracts and tags all
    links (clickable _and_ plaintext).

    When `has_intro` is true, the first non-linebreak child of `article_text`
    is taken to be the intro paragraph and is left out of the returned text.

    Returns a list of string (one item per paragraph) and a
    list of TaggedURL objects.

    Note: sometimes paragraphs are clearly marked with nice <p> tags. When it's not
    the case, we consider linebreaks to be paragraph separators.
    """
    in_text_tagged_urls = extract_and_tag_in_text_links(article_text)

    children = filter_out_useless_fragments(article_text.contents)
    # first child is the intro paragraph, discard it
    if has_intro:
        children = children[1:]

    # the rest might be a list of paragraphs, but might also just be the text, sometimes with
    # formatting.
    cleaned_up_text_fragments = [
        remove_text_formatting_markup_from_fragments(text_block, '\n\t ')
        for text_block in children
    ]

    # collect the urls that appear as plain text inside the paragraphs
    all_plaintext_urls = []
    for text in cleaned_up_text_fragments:
        all_plaintext_urls.extend(extract_plaintext_urls_from_text(text))

    # plaintext urls are their own title
    urls_and_titles = [(url, url) for url in all_plaintext_urls]
    plaintext_tagged_urls = classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['plaintext', 'in text']))

    return cleaned_up_text_fragments, in_text_tagged_urls + plaintext_tagged_urls

def article_has_intro(article_text):
    """
    Return whatever `article_text.p` evaluates to.

    Callers use the result as a truth value: it is falsy when the article
    text exposes no paragraph, in which case there is no intro.
    """
    intro_candidate = article_text.p
    return intro_candidate

def extract_intro_from_articletext(article_text):
    """
    Finds the introduction paragraph, returns a string with the text.

    Returns an empty unicode string when the article has no intro.
    """
    # intro text seems to always be in the first paragraph.
    if article_has_intro(article_text):
        intro_paragraph = article_text.p
        return remove_text_formatting_markup_from_fragments(intro_paragraph.contents)
    # but sometimes there is no intro. What the hell.
    else:
        return u''

def extract_author_name_from_maincontent(main_content):
    """
    Finds the <p> element with author info (id 'articleSign'), if available.

    Returns the author name with surrounding whitespace removed when the
    element is found, `constants.NO_AUTHOR_NAME` if not.
    """
    signature = main_content.find('p', {'id': 'articleSign'})
    if signature:
        # the actual author name is often lost in a puddle of newlines and
        # tabs; cleaning it up.
        return signature.contents[0].strip()
    else:
        return constants.NO_AUTHOR_NAME

def extract_category_from_maincontent(main_content):
    """
    Finds the breadcrumbs list (the <p> with id 'breadcrumbs').

    Returns a list of strings, one per link directly under the breadcrumbs
    element. The tab/newline soup around each entry is cleaned up.
    """
    breadcrumbs = main_content.find('p', {'id': 'breadcrumbs'})
    links = breadcrumbs.findAll('a', recursive=False)

    return [link.contents[0].strip() for link in links]

# Matches a plain, non-updated publication date: '(dd/mm/yyyy)'.
DATE_MATCHER = re.compile(r'\(\d\d/\d\d/\d\d\d\d\)')

def was_publish_date_updated(date_string):
    """
    Tell whether a publication date string carries an update time.

    In case of live events (soccer, ...), the article gets updated.
    Hour of last update is appended to the publish date.

    Returns True when `date_string` does not start with a plain
    '(dd/mm/yyyy)' date, False otherwise.
    """
    # we try to match a non-updated date, and check that it failed.
    match = DATE_MATCHER.match(date_string)
    return not match

def make_time_from_string(time_string):
    """
    Takes a HH:MM string, returns a time object.

    Raises ValueError when the string does not hold exactly two
    colon-separated integers, or when they are out of range for a time.
    """
    hours, minutes = [int(part) for part in time_string.split(':')]
    return time(hours, minutes)

def extract_date_from_maincontent(main_content):
    """
    Finds the publication date string (the <p> with id 'articleDate').

    Returns a (date, time) tuple. The time is the hour of the last update
    when the article was updated after publication, None otherwise.
    """
    date_string = main_content.find('p', {'id': 'articleDate'}).contents[0]

    if was_publish_date_updated(date_string):
        # extract the update time, make the date look like '(dd/mm/yyyy)'
        date_string, time_string = date_string.split(',', 1)
        date_string = '{0})'.format(date_string)

        # the time string looks like : 'mis à jour le hh:mm)'
        time_string = time_string.split(' ')[-1]
        pub_time = make_time_from_string(time_string.rstrip(')'))
    else:
        pub_time = None

    pub_date = datetime.strptime(date_string, "(%d/%m/%Y)").date()

    return pub_date, pub_time

def extract_links_from_embedded_content(embedded_content):
    """
    Builds the tagged url of a piece of embedded content.

    Two kinds of embedded content are recognized:
      * an iframe: its 'src' is tagged 'embedded';
      * a 'containerKplayer' div: the 'data-src' of its <video> element is
        tagged 'video', 'embedded' and 'kplayer'; the title is read from the
        second direct child <div> of `embedded_content`.

    Returns a list with one tagged url, or an empty list when neither kind
    is found.
    """
    if embedded_content.iframe:
        url = embedded_content.iframe.get('src')
        title = u"Embedded content"
        all_tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        return [make_tagged_url(url, title, all_tags | set(['embedded']))]
    else:
        divs = embedded_content.findAll('div', recursive=False)
        kplayer = embedded_content.find('div', {'class': 'containerKplayer'})
        if kplayer:
            kplayer_infos = kplayer.find('video')
            url = kplayer_infos.get('data-src')
            title = remove_text_formatting_markup_from_fragments(divs[1].contents)
            all_tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
            return [make_tagged_url(url, title, all_tags | set(['video', 'embedded', 'kplayer']))]
        else:
            return []

def extract_links_to_embedded_content(main_content):
    """
    Collect one tagged url per 'embedContents' div found in `main_content`.

    Each div is handed to `ipm_utils.extract_tagged_url_from_embedded_item`;
    the results are returned as a list, in document order.
    """
    tagged_urls = []
    for embedded_item in main_content.findAll('div', {'class': 'embedContents'}):
        tagged_url = ipm_utils.extract_tagged_url_from_embedded_item(embedded_item, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        tagged_urls.append(tagged_url)
    return tagged_urls

def extract_article_data(source):
    """
    Extracts the data of one article.

    `source` is either a file-like object (anything with a `read` method)
    holding the html, or a value `fetch_html_content` can fetch.

    Returns an (ArticleData, html_content) tuple. The first item is None
    when the page has no 'maincontent' div with an <h1> title.
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content)

    main_content = soup.find('div', {'id': 'maincontent'})

    if main_content and main_content.h1:
        title = remove_text_formatting_markup_from_fragments(main_content.h1.contents)
        pub_date, pub_time = extract_date_from_maincontent(main_content)
        category = extract_category_from_maincontent(main_content)
        author_name = extract_author_name_from_maincontent(main_content)

        article_text = main_content.find('div', {'id': 'articleText'})
        if article_has_intro(article_text):
            intro = extract_intro_from_articletext(article_text)
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text)
        else:
            # no intro paragraph: every child of the article text is content
            intro = u""
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False)

        audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        embedded_content_links = extract_links_to_embedded_content(main_content)

        # time at which this extraction ran (local time)
        fetched_datetime = datetime.today()

        new_article = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                                  in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links,
                                  category, author_name, intro, text)
        return new_article, html_content
    else:
        return None, html_content

def extract_title_and_link_from_item_box(item_box):
    """
    Read the headline link found under `item_box.h2.a`.

    Returns a (title, url) tuple: the title is the first content item of the
    link with surrounding whitespace removed, the url is its 'href'.
    """
    headline_link = item_box.h2.a
    raw_title = headline_link.contents[0]
    return raw_title.strip(), headline_link.get('href')

def is_item_box_an_ad_placeholder(item_box):
    """
    Heuristic ad detector: a box holding at least one iframe is
    considered to be an ad placeholder.
    """
    iframes = item_box.findAll('iframe')
    return len(iframes) > 0

def extract_title_and_link_from_anounce_group(announce_group):
    """
    Collect the (title, url) pair of every story box in an announce group.

    The 'box4 odd' boxes come first, then the 'box4 even' ones. Boxes that
    look like ad placeholders are left out.
    """
    boxes_by_parity = [announce_group.findAll('div', {'class': css_class})
                       for css_class in ('box4 odd', 'box4 even')]

    titles_and_links = []
    for boxes in boxes_by_parity:
        for item_box in boxes:
            # item boxes are sometimes used to display ads instead of stories
            if is_item_box_an_ad_placeholder(item_box):
                continue
            titles_and_links.append(extract_title_and_link_from_item_box(item_box))
    return titles_and_links

def get_first_story_title_and_url(main_content):
    """
    Extract the title and url of the main frontpage story.

    The story is looked up in the 'firstAnnounce' div; the title and url are
    the 'title' and 'href' attributes of the link under its <h2>.
    Returns a (title, url) tuple.
    """
    first_announce = main_content.find('div', {'id': 'firstAnnounce'})
    headline_link = first_announce.h2.a

    return headline_link.get('title'), headline_link.get('href')

def get_frontpage_toc():
    """
    Fetch the frontpage and list the stories it links to.

    Returns a pair of lists. The first one holds a (title, absolute url)
    tuple per frontpage story; the second one is always empty. Both are
    empty when the page has no 'maincontent' div.
    """
    # NOTE(review): the site root had been blanked out of this function
    # (empty fetch url, empty format string). 'http://www.dhnet.be' is the
    # presumed value -- confirm before relying on it.
    site_root = 'http://www.dhnet.be'
    html_content = fetch_html_content(site_root)
    soup = make_soup_from_html_content(html_content)

    main_content = soup.find('div', {'id': 'maincontent'})
    if main_content:
        all_titles_and_urls = []

        # so, the list here is a combination of several subcontainer types.
        # processing every type separately
        first_title, first_url = get_first_story_title_and_url(main_content)
        all_titles_and_urls.append((first_title, first_url))

        # this will pick up the 'annouceGroup' containers with same type in the 'regions' div
        # (findAll searches the whole subtree by default)
        first_announce_groups = main_content.findAll('div',
                                                     {'class': 'announceGroupFirst announceGroup'})
        announce_groups = main_content.findAll('div',
                                               {'class': 'announceGroup'})

        # all those containers have two sub stories
        for announce_group in chain(first_announce_groups, announce_groups):
            titles_and_urls = extract_title_and_link_from_anounce_group(announce_group)
            all_titles_and_urls.extend(titles_and_urls)

        # story urls are site-relative paths; make them absolute
        return [(title, '%s%s' % (site_root, url)) for (title, url) in all_titles_and_urls], []
    else:
        return [], []

if __name__ == "__main__":
    urls = [

    for url in urls[-1:]:
        article, html = extract_article_data(url)