1. Lars Yencken
  2. data-country-languages

Source

data-country-languages / mine_multilingual_books.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  mine_multilingual_books.py
#  lang
#

"""
Mine each country's languages from MultilingualBooks.com.
"""

import codecs
import sys

import bs4
import requests
import globetrotter

# Page fetched by mine_languages(); it holds the country -> languages table.
BASE_URL = 'http://www.multilingualbooks.com/fromcountry2lang.html'

def mine_languages():
    """
    Scrape the country/language table found at BASE_URL and write it to
    stdout as UTF-8 encoded, tab-separated text.

    The first line written is a '#'-prefixed header. Every following line
    holds four fields: language, langcode, country, countrycode. A code that
    globetrotter cannot resolve is written as '__'.

    Raises ScrapingError if the page is not served with status 200 (via
    safe_get) or if the country table cannot be located in the page.
    """
    # The codecs writer emits bytes. On Python 3 sys.stdout is a text stream,
    # so wrap its underlying byte buffer; Python 2 has no such attribute and
    # sys.stdout itself accepts bytes.
    ostream = codecs.getwriter('utf8')(getattr(sys.stdout, 'buffer',
            sys.stdout))
    ostream.write('#' + '\t'.join(('language', 'langcode', 'country',
            'countrycode')) + '\n')

    content = safe_get(BASE_URL).content
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; pin one explicitly if output differs between machines.
    soup = bs4.BeautifulSoup(content)

    # The table carries no usable id, so locate it through a link whose text
    # mentions a country known to be listed in it.
    anchor = soup.find('a', text=lambda t: t and 'Afghanistan' in t)
    table = anchor.find_parent('table') if anchor is not None else None
    if table is None:
        raise ScrapingError('could not locate the country table')

    # The first and last rows are skipped, as they are not country rows.
    for tr in table.find_all('tr')[1:-1]:
        cells = tr.find_all('td')
        if len(cells) != 2:
            # Not a (country, languages) row; skip it instead of crashing on
            # the unpacking below.
            continue
        country_td, lang_td = cells

        # Country names are wrapped in <b>; rows without one carry no country.
        bold = country_td.find('b')
        if bold is None:
            continue
        country = bold.text

        # Handle the same lookup failures as the language lookup below.
        # Presumably a miss either raises KeyError or returns None -- confirm
        # against the globetrotter API.
        try:
            code = globetrotter.find_country(country).alpha2
        except (KeyError, AttributeError):
            code = '__'

        for link in lang_td.find_all('a'):
            language = link.text
            try:
                langcode = globetrotter.find_language(language).alpha2
            except (KeyError, AttributeError):
                langcode = '__'
            ostream.write('\t'.join((language, langcode, country, code))
                    + '\n')

class ScrapingError(Exception):
    """
    Error raised when a page cannot be scraped, e.g. when the server answers
    with a status code other than 200.
    """

def safe_get(*args, **kwargs):
    """
    Fetch a URL with requests.get() and return the response object.

    All positional and keyword arguments are forwarded to requests.get().
    When the caller supplies no 'timeout', a 30 second timeout is applied so
    that an unresponsive server cannot hang the script indefinitely.

    Raises ScrapingError if the response status code is anything but 200.
    """
    # requests waits forever by default; only set a limit if none was given.
    kwargs.setdefault('timeout', 30)
    response = requests.get(*args, **kwargs)
    if response.status_code != 200:
        raise ScrapingError('got status code %d' % response.status_code)
    return response

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    mine_languages()