Source

data-country-languages / mine_multilingual_books.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  mine_multilingual_books.py
#  lang
#

"""
Mine the languages listed for each country on MultilingualBooks.com.
"""

import codecs
import sys

import bs4
import requests

# URL of the page that mine_languages() downloads and scrapes.
BASE_URL = 'http://www.multilingualbooks.com/fromcountry2lang.html'

def mine_languages():
    """
    Scrape BASE_URL and print one ``language<TAB>country`` line per pairing.

    The page is fetched with safe_get(), the table enclosing the
    'Afghanistan' link is located, and for every row (except the first and
    last, which are skipped) each linked language is written to standard
    output as UTF-8, paired with the bold country name of that row.

    Rows that do not consist of exactly two cells, or whose first cell has
    no ``<b>`` element, are skipped.

    Raises:
        ScrapingError: if the page is not served with status 200 (raised by
            safe_get), or if the country table cannot be located.
    """
    # Python 3 exposes the underlying byte stream as sys.stdout.buffer;
    # on Python 2 sys.stdout accepts encoded bytes directly.
    raw_stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    ostream = codecs.getwriter('utf8')(raw_stdout)

    content = safe_get(BASE_URL).content
    soup = bs4.BeautifulSoup(content)

    # The 'Afghanistan' link is used as a landmark to find the right table.
    anchor = soup.find('a', text=lambda t: t and 'Afghanistan' in t)
    table = anchor.findParent('table') if anchor is not None else None
    if table is None:
        raise ScrapingError('could not locate the country/language table')

    for tr in table.findChildren('tr')[1:-1]:
        cells = tr.findChildren('td')
        if len(cells) != 2:
            # Not a (country, languages) row; nothing to extract.
            continue
        country_td, lang_td = cells

        country_tag = country_td.findChild('b')
        if country_tag is None:
            continue
        country = country_tag.text

        for link in lang_td.findChildren('a'):
            # write() works on both Python 2 and 3, unlike `print >>`.
            ostream.write('\t'.join((link.text, country)) + '\n')

class ScrapingError(Exception):
    """Error raised when a page cannot be scraped, e.g. a non-200 response."""

# Default number of seconds to wait on the server before giving up; without
# a timeout requests.get() can block indefinitely on a stalled connection.
_DEFAULT_TIMEOUT = 30

def safe_get(*args, **kwargs):
    """
    Perform ``requests.get`` and insist on a 200 response.

    All positional and keyword arguments are forwarded to ``requests.get``.
    If the caller does not supply ``timeout``, _DEFAULT_TIMEOUT is used;
    pass ``timeout=None`` explicitly to wait without limit.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        ScrapingError: if the response status code is anything other
            than 200.
    """
    kwargs.setdefault('timeout', _DEFAULT_TIMEOUT)
    response = requests.get(*args, **kwargs)
    if response.status_code != 200:
        raise ScrapingError('got status code %d' % response.status_code)
    return response

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    mine_languages()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.