
csxj-crawler / csxj / datasources / lalibre.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from datetime import datetime, time
import urlparse

import BeautifulSoup

from csxj.common.tagging import classify_and_tag, make_tagged_url
from csxj.db.article import ArticleData
from parser_tools.utils import fetch_html_content, make_soup_from_html_content, extract_plaintext_urls_from_text
from parser_tools.utils import remove_text_formatting_markup_from_fragments
from parser_tools import constants
from parser_tools import ipm_utils
from parser_tools import twitter_utils

# Blogs/sub-sites associated with La Libre, used for link classification;
# none are registered yet.
LALIBRE_ASSOCIATED_SITES = {}

LALIBRE_NETLOC = 'www.lalibre.be'

SOURCE_TITLE = u"La Libre"
SOURCE_NAME = u"lalibre"


def is_on_same_domain(url):
    """
    Until we get all the internal blogs/sites, we can still detect
    if a page is hosted on the same domain.
    """
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    if netloc not in LALIBRE_ASSOCIATED_SITES:
        return netloc.endswith('lalibre.be')
    return False
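
# Doctest-style sketch of the expected behaviour (made-up URLs):
#
#   >>> is_on_same_domain('http://www.lalibre.be/some/article.html')
#   True
#   >>> is_on_same_domain('http://www.example.org/some/page.html')
#   False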


def classify_and_make_tagged_url(urls_and_titles, additional_tags=frozenset()):
    """
    Classifies (tags) every element in a list of (url, title) tuples.
    Returns a list of TaggedURLs.
    """
    tagged_urls = []
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
        if is_on_same_domain(url):
            tags.update(['internal site'])
        tagged_urls.append(make_tagged_url(url, title, tags | additional_tags))
    return tagged_urls
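
# Usage sketch (the URL, title and extra tag below are made-up examples):
#
#   urls_and_titles = [('http://www.lalibre.be/economie/article/1/foo.html', u"Foo")]
#   tagged = classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['sidebar box']))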


def was_story_updated(date_string):
    return not date_string.startswith('Mis en ligne le')


def extract_date(main_content):
    publication_date = main_content.find('p', {'id': 'publicationDate'}).contents[0]
    publication_date = publication_date.strip()

    if was_story_updated(publication_date):
        fragments = publication_date.split(' ')
        date_string = fragments[4]
        h, m = [int(i) for i in fragments[-1].split(':')]
        pub_time = time(h, m)
    else:
        date_string = publication_date.replace('Mis en ligne le ', '')
        pub_time = None

    pub_date = datetime.strptime(date_string, '%d/%m/%Y')
    return pub_date.date(), pub_time
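
# Sketch of the two publication-date formats handled above. The plain form is
# taken directly from the code ('Mis en ligne le ' + dd/mm/yyyy); the exact
# wording of the "updated" form is an assumption, only the token positions
# matter (token #4 is the date, the last token is the hh:mm update time):
#
#   'Mis en ligne le 08/05/2012'         -> (date(2012, 5, 8), None)
#   'Mis à jour le 08/05/2012 à 14:35'   -> (date(2012, 5, 8), time(14, 35))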


def separate_no_target_links(links):
    no_target_links = [(target, title) for (target, title) in links if not target]
    other_links = list(set(links) - set(no_target_links))
    return [('', title) for (target, title) in no_target_links], other_links


def separate_keyword_links(all_links):
    keyword_links = [l for l in all_links if l[0].startswith('/sujet')]
    other_links = list(set(all_links) - set(keyword_links))
    return keyword_links, other_links
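
# Sketch of the two splitting helpers above (hypothetical links; note that the
# set() difference they use does not preserve the original ordering):
#
#   links = [('/sujet/1234/europe', u"Europe"), ('http://example.org/x', u"X"), (None, u"dead link")]
#   no_target, rest = separate_no_target_links(links)  # [('', u"dead link")] + the two real links
#   keywords, others = separate_keyword_links(rest)    # the '/sujet/...' link vs. the external one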


def extract_and_tag_in_text_links(article_text):
    """
    Finds the links tags in the html text content.
    Detects which links are keyword and which aren't, sets the adequate tags.
    Returns a list of TaggedURL objects.
    """
    def extract_link_and_title(link):
        return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)
    links = [extract_link_and_title(link)
             for link in article_text.findAll('a', recursive=True)]

    no_target_links, target_links = separate_no_target_links(links)
    keyword_links, other_links = separate_keyword_links(target_links)

    tagged_urls = (
        classify_and_make_tagged_url(keyword_links, additional_tags=set(['keyword', 'in text'])) +
        classify_and_make_tagged_url(other_links, additional_tags=set(['in text'])) +
        classify_and_make_tagged_url(no_target_links, additional_tags=set(['in text', 'no target']))
    )

    return tagged_urls


def sanitize_paragraph(paragraph):
    """Returns the paragraph content as plain text, with HTML comments skipped
    and text formatting markup stripped."""
    sanitized_fragments = [remove_text_formatting_markup_from_fragments(fragment)
                           for fragment in paragraph.contents
                           if not isinstance(fragment, BeautifulSoup.Comment)]
    return ''.join(sanitized_fragments)
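
# For instance, a paragraph like <p>Hello <b>world</b><!-- tracking --></p>
# should come back as u'Hello world', assuming
# remove_text_formatting_markup_from_fragments flattens formatting tags to
# their text content.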


def extract_text_content_and_links(main_content):
    article_text = main_content.find('div', {'id': 'articleText'})

    in_text_tagged_urls = []
    all_fragments = []
    all_plaintext_urls = []
    embedded_tweets = []

    paragraphs = article_text.findAll('p', recursive=False)

    for paragraph in paragraphs:
        if not paragraph.find('blockquote', {'class': 'twitter-tweet'}):

            in_text_links = extract_and_tag_in_text_links(paragraph)
            in_text_tagged_urls.extend(in_text_links)

            fragments = sanitize_paragraph(paragraph)
            all_fragments.append(fragments)
            all_fragments.append('\n')
            plaintext_links = extract_plaintext_urls_from_text(fragments)
            urls_and_titles = zip(plaintext_links, plaintext_links)
            all_plaintext_urls.extend(classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['plaintext'])))
        else:
            embedded_tweets.extend(twitter_utils.extract_rendered_tweet(paragraph, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES))

    text_content = all_fragments

    return text_content, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
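
# Usage sketch (main_content being the article's 'mainContent' div, as in
# extract_article_data_from_html below): the first return value is a list of
# plain-text paragraphs interleaved with '\n' entries, the second a flat list
# of TaggedURLs (in-text links, plaintext urls and embedded tweets):
#
#   text_fragments, tagged_urls = extract_text_content_and_links(main_content)
#   full_text = u''.join(text_fragments)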


def extract_category(main_content):
    breadcrumbs = main_content.find('p', {'id': 'breadCrumbs'})
    links = breadcrumbs.findAll('a', recursive=False)

    return [link.contents[0].strip() for link in links]


def extract_embedded_content_links(main_content):
    items = main_content.findAll('div', {'class': 'embedContents'})
    return [ipm_utils.extract_tagged_url_from_embedded_item(item, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES) for item in items]


def extract_author_name(main_content):
    writer = main_content.find('p', {'id': 'writer'})
    if writer:
        return writer.contents[0].strip()
    else:
        return constants.NO_AUTHOR_NAME


def extract_intro(main_content):
    hat = main_content.find('div', {'id': 'articleHat'})

    if hat:
        return hat.contents[0].strip()
    else:
        return ''


def extract_article_data_from_file(source_url, source_file):
    if hasattr(source_file, 'read'):
        html_content = source_file.read()
    else:
        # source_file is a path: open it ourselves and make sure it gets closed.
        with open(source_file) as f:
            html_content = f.read()

    return extract_article_data_from_html(html_content, source_url)


def print_for_test(taggedURLs):
    """Prints the tagged URLs as ready-to-paste make_tagged_url(...) calls."""
    print "---"
    for taggedURL in taggedURLs:
        print u"""make_tagged_url("{0}", u\"\"\"{1}\"\"\", {2}),""".format(taggedURL.URL, taggedURL.title, taggedURL.tags)


def extract_article_data_from_html(html_content, source_url):
    soup = make_soup_from_html_content(html_content)

    main_content = soup.find('div', {'id': 'mainContent'})

    if main_content.h1:
        title = main_content.h1.contents[0].strip()
    else:
        return None, html_content

    category = extract_category(main_content)
    author = extract_author_name(main_content)
    pub_date, pub_time = extract_date(main_content)
    fetched_datetime = datetime.today()

    intro = extract_intro(main_content)
    text_content, in_text_urls = extract_text_content_and_links(main_content)

    embedded_audio_links = ipm_utils.extract_embedded_audio_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    associated_tagged_urls = ipm_utils.extract_and_tag_associated_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    bottom_links = ipm_utils.extract_bottom_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    embedded_content_links = extract_embedded_content_links(main_content)

    all_links = in_text_urls + associated_tagged_urls + bottom_links + embedded_content_links + embedded_audio_links

    new_article = ArticleData(source_url, title,
                              pub_date, pub_time, fetched_datetime,
                              all_links,
                              category, author,
                              intro, text_content)

    return new_article, html_content


def extract_article_data(source):
    """
    Extracts article data from `source`, which is either a file-like object
    holding the html, or a URL to fetch.
    Returns (ArticleData, html_content); ArticleData is None when the page
    does not look like an article.
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    return extract_article_data_from_html(html_content, source)
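
# Usage sketch; it accepts either a URL (here, the first entry of the sample
# list in test_sample_data()) or a file-like object over a locally saved page
# (the local filename below is hypothetical):
#
#   article, raw_html = extract_article_data(
#       "http://www.lalibre.be/economie/actualite/article/704138/troisieme-belgian-day-a-wall-street.html")
#   if article is not None:
#       print article.title
#
#   with open('saved_article.html') as f:
#       article, raw_html = extract_article_data(f)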


def get_frontpage_toc():
    hostname_url = 'http://www.lalibre.be'
    html_content = fetch_html_content(hostname_url)

    soup = make_soup_from_html_content(html_content)

    article_list_container = soup.find('div', {'id': 'mainContent'})
    announces = article_list_container.findAll('div', {'class': 'announce'}, recursive=False)

    def extract_title_and_link(announce):
        title, url = announce.h1.a.contents[0], announce.h1.a.get('href')
        return title, '{0}{1}'.format(hostname_url, url)

    return [extract_title_and_link(announce) for announce in announces], []
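
# Frontpage sketch: the first return value is a list of (title, absolute url)
# pairs built from the frontpage 'announce' boxes, the second is always an
# empty list here:
#
#   frontpage_items, blogposts = get_frontpage_toc()
#   for title, url in frontpage_items:
#       print title, url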


def test_sample_data():
    urls = ["http://www.lalibre.be/economie/actualite/article/704138/troisieme-belgian-day-a-wall-street.html",
            "http://www.lalibre.be/culture/selection-culturelle/article/707244/ou-sortir-ce-week-end.html",
            "http://www.lalibre.be/actu/usa-2012/article/773294/obama-raille-les-chevaux-et-baionnettes-de-romney.html",
            "http://www.lalibre.be/actu/international/article/774524/sandy-le-calme-avant-la-tempete.html",
            "http://www.lalibre.be/sports/football/article/778966/suivez-anderlecht-milan-ac-en-live-des-20h30.html",
            "http://www.lalibre.be/societe/insolite/article/786611/le-tweet-sarcastique-de-johnny-a-gege.html"
            ]

    files = [
"2012-04-19/16.05.08/raw_data/3.html",
"2012-04-19/16.05.08/raw_data/3.html",
"2012-04-25/13.05.06/raw_data/5.html",
"2012-04-25/13.05.06/raw_data/5.html",
"2012-05-08/10.05.06/raw_data/4.html",
"2012-05-08/10.05.06/raw_data/4.html",
"2012-05-08/21.05.06/raw_data/0.html",
"2012-05-08/21.05.06/raw_data/0.html",
"2012-05-16/12.05.06/raw_data/0.html",
"2012-05-17/10.05.05/raw_data/0.html",
"2012-05-21/11.05.05/raw_data/4.html",
"2012-05-23/10.05.06/raw_data/1.html",
"2012-05-23/10.05.06/raw_data/1.html",
"2012-05-23/10.05.06/raw_data/1.html",
"2012-05-23/18.05.06/raw_data/4.html",
"2012-05-23/18.05.06/raw_data/4.html",
"2012-05-23/18.05.06/raw_data/4.html",
"2012-06-12/14.05.06/raw_data/2.html",
"2012-07-07/12.05.05/raw_data/4.html",
"2012-08-02/06.05.06/raw_data/0.html",
"2012-08-02/06.05.06/raw_data/0.html",
"2012-08-13/15.05.05/raw_data/1.html",
"2012-08-13/15.05.05/raw_data/1.html",
"2012-08-13/15.05.05/raw_data/1.html",
"2012-08-13/16.05.06/raw_data/5.html",
"2012-08-13/16.05.06/raw_data/5.html",
"2012-08-13/16.05.06/raw_data/5.html",
"2012-08-14/09.05.05/raw_data/2.html",
"2012-08-14/09.05.05/raw_data/2.html",
"2012-08-14/09.05.05/raw_data/2.html",
"2012-08-14/13.05.06/raw_data/1.html",
"2012-08-14/13.05.06/raw_data/1.html",
"2012-08-21/09.05.05/raw_data/4.html",
"2012-08-31/10.05.05/raw_data/2.html",
"2012-09-06/06.05.06/raw_data/0.html",
"2012-09-18/10.05.06/raw_data/2.html",
"2012-09-18/10.05.06/raw_data/2.html",
"2012-09-18/10.05.06/raw_data/2.html",
"2012-09-18/10.05.06/raw_data/2.html",
"2012-10-03/10.05.05/raw_data/4.html",
"2012-10-16/15.05.04/raw_data/3.html",
"2012-10-16/15.05.04/raw_data/3.html",
"2012-10-16/15.05.04/raw_data/3.html",
"2012-10-18/10.05.04/raw_data/5.html",
"2012-11-20/01.05.34/raw_data/1.html",
"2012-11-20/06.05.34/raw_data/1.html",
"2012-11-20/13.05.36/raw_data/4.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-10/14.05.05/raw_data/0.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/01.05.05/raw_data/4.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-11/07.05.04/raw_data/7.html",
"2012-12-12/12.05.34/raw_data/0.html",
"2012-12-12/14.05.04/raw_data/0.html",
"2012-12-26/11.05.04/raw_data/0.html",
"2012-12-27/18.05.05/raw_data/3.html",
"2013-01-03/08.05.04/raw_data/2.html",
"2013-01-08/11.05.05/raw_data/1.html",
"2013-01-08/14.05.04/raw_data/1.html",
"2013-01-08/15.05.04/raw_data/1.html",


    ]


    root = r"/Volumes/Curst/csxj/tartiflette/json_db_0_5/lalibre"

    import os

    for filename in files:
        try:
            filepath = os.path.join(root, filename)
            with open(filepath) as f:

                article, html = extract_article_data(f)
                
                tweets = [l for l in article.links if 'tweet' in l.tags]
                # print article.title
                # print tweets
                # print len(tweets)
                # print "...................." * 3
                print len(article.content)
                print article.url
                print article.title
                print "...................." * 3
                # if len(tweets) == 0:
                #     print article.title
                #     print article.url
                #     print article.content
        except ValueError:
            print "something went wrong with: ", filename


if __name__ == '__main__':
    test_sample_data()