Source

data-country-languages / mine_languages.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  mine_languages.py
#  lang
#

"""
Mine official languages from Wikipedia.
"""

import codecs
import sys

import BeautifulSoup
import requests

# Wikipedia page that mine_languages() fetches and scrapes.
BASE_URL = 'http://en.wikipedia.org/wiki/List_of_official_languages'

def mine_languages():
    """
    Scrape BASE_URL and print one tab-separated ``language<TAB>country``
    line per pair to stdout, encoded as UTF-8.

    Raises ScrapingError if the page is not served with status 200 or if
    the markup following a language heading is not laid out as expected.
    """
    ostream = codecs.getwriter('utf8')(sys.stdout)
    content = safe_get(BASE_URL).content
    soup = BeautifulSoup.BeautifulSoup(content)
    # Language headings are the redirect links that sit directly inside <b>.
    lang_nodes = [n for n in soup('a', {'class': 'mw-redirect'})
            if n.parent.name == 'b']

    for lang_node in lang_nodes:
        lang = lang_node.text
        # The country list is taken to be two siblings after the element
        # that contains the bold heading (the sibling in between is
        # presumably a whitespace text node -- confirm against the page).
        sibling = lang_node.parent.parent.nextSibling
        country_ul = getattr(sibling, 'nextSibling', None)
        # Explicit check instead of `assert`: asserts vanish under -O, and
        # text nodes / None have no usable `name` attribute.
        if getattr(country_ul, 'name', None) != 'ul':
            raise ScrapingError(
                    'expected a <ul> of countries after language %r'
                    % (lang,))

        # Collect every country first so that nothing is printed for a
        # language whose list turns out to be malformed.
        countries = []
        for item in country_ul.findChildren('li'):
            link = item.findChild('a')
            if link is None:
                raise ScrapingError(
                        'country entry without a link for language %r'
                        % (lang,))
            countries.append(link.text)

        for c in countries:
            print >> ostream, '\t'.join((lang, c))

class ScrapingError(Exception):
    """Raised when a page cannot be scraped, e.g. on a non-200 response."""

def safe_get(*args, **kwargs):
    """
    Call requests.get() with the given arguments and return the response.

    Raises ScrapingError if the response status code is anything other
    than 200.
    """
    reply = requests.get(*args, **kwargs)
    status = reply.status_code
    if status == 200:
        return reply
    raise ScrapingError('got status code %d' % status)

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    mine_languages()