Commits

Lars Yencken committed ddddf4d

Explicitly namespace data sourced from Wikipedia.

  • Participants
  • Parent commits b001d10

Comments (0)

Files changed (4)

File country_language_2012-09-07.csv

-Afrikaans	South Africa
-Berber	Morocco
-Bislama	Vanuatu
-Chinese, Mandarin	China
-Chinese, Mandarin	Taiwan
-Chinese, Mandarin	Singapore
-Dari	Afghanistan
-Dhivehi	Maldives
-Dzongkha	Bhutan
-Guaraní	Paraguay
-Guaraní	Bolivia
-Hindi	India
-Hiri Motu	Papua New Guinea
-Ibo	Nigeria
-Kikongo	Democratic Republic of the Congo
-Kituba	Republic of the Congo
-Latin	Vatican City
-Lingala	Democratic Republic of the Congo
-Lingala	Republic of the Congo
-Māori	New Zealand
-Ossetian	South Ossetia
-Pashtu	Afghanistan
-Quechua	Bolivia
-Quechua	Peru
-Swati	Swaziland
-Swati	South Africa
-Tetum	East Timor
-Tigrinya	Eritrea
-Tshiluba	Democratic Republic of the Congo
-Urdu	Pakistan
-Urdu	Fiji
-Altay	Altay, Republic of
-Azeri	Dagestan
-Balkar	Kabardino-Balkaria
-Cantonese Chinese	Hong Kong
-Cantonese Chinese	Macau
-Guaraní	Argentina
-Guaraní	Corrientes Province
-Gwich'in	Northwest Territories
-Inuinnaqtun	Northwest Territories
-Inuinnaqtun	Nunavut
-Inuvialuktun	Northwest Territories
-Kalaallisut	Greenland
-Kalmyk	Kalmykia
-Karachay	Karachay–Cherkessia
-Malayalam	India
-Malayalam	Kerala
-Malayalam	Pondicherry
-Malayalam	Lakshadweep
-Mayan	Mexico
-Mayan	Guatemala
-Mayan	Belize
-Mayan	Honduras
-Mayan	El Salvador
-Náhuatl	Mexico
-Náhuatl	El Salvador
-Aranese	Aran Valley
-Sanskrit	India
-North and South Slavey	Northwest Territories
-Tibetan	Tibet Autonomous Region
-Tibetan	Aba
-Tibetan	Garzê
-Tibetan	Diqing
-Tibetan	Wenshan
-Tibetan	Gannan
-Tibetan	Haibai
-Tibetan	Hainan
-Tibetan	Huangnan
-Tibetan	Golog
-Tibetan	Gyêgu
-Tibetan	Haixi
-Tibetan	Muli
-Tibetan	Tianzhu
-Urdu	India
-Urdu	Jammu and Kashmir
-Urdu	Delhi
-Urdu	Uttar Pradesh
-Urdu	Bihar
-Urdu	Andhra Pradesh

File mine_languages.py

-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-#  mine_languages.py
-#  lang
-#
-
-"""
-Mine official languages from Wikipedia.
-"""
-
-import codecs
-import sys
-
-import BeautifulSoup
-import requests
-
-BASE_URL = 'http://en.wikipedia.org/wiki/List_of_official_languages'
-
-def mine_languages():
-    ostream = codecs.getwriter('utf8')(sys.stdout)
-    content = safe_get(BASE_URL).content
-    soup = BeautifulSoup.BeautifulSoup(content)
-    lang_nodes = [n for n in soup('a', {'class': 'mw-redirect'})
-            if n.parent.name == 'b']
-
-    for lang_node in lang_nodes:
-        lang = lang_node.text
-        country_ul = lang_node.parent.parent.nextSibling.nextSibling
-        assert country_ul.name == 'ul'
-        countries = [l.findChild('a').text for l in
-                country_ul.findChildren('li')]
-        for c in countries:
-            print >> ostream, '\t'.join((lang, c))
-
-class ScrapingError(Exception):
-    pass
-
-def safe_get(*args, **kwargs):
-    response = requests.get(*args, **kwargs)
-    if response.status_code != 200:
-        raise ScrapingError('got status code %d' % response.status_code)
-    return response
-
-if __name__ == '__main__':
-    mine_languages()

File mine_wikipedia.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#  mine_wikipedia.py
+#  lang
+#
+
+"""
+Mine official languages from Wikipedia.
+"""
+
+import codecs
+import sys
+
+import BeautifulSoup
+import requests
+
+BASE_URL = 'http://en.wikipedia.org/wiki/List_of_official_languages'
+
+def mine_languages():
+    ostream = codecs.getwriter('utf8')(sys.stdout)
+    content = safe_get(BASE_URL).content
+    soup = BeautifulSoup.BeautifulSoup(content)
+    lang_nodes = [n for n in soup('a', {'class': 'mw-redirect'})
+            if n.parent.name == 'b']
+
+    for lang_node in lang_nodes:
+        lang = lang_node.text
+        country_ul = lang_node.parent.parent.nextSibling.nextSibling
+        assert country_ul.name == 'ul'
+        countries = [l.findChild('a').text for l in
+                country_ul.findChildren('li')]
+        for c in countries:
+            print >> ostream, '\t'.join((lang, c))
+
+class ScrapingError(Exception):
+    pass
+
+def safe_get(*args, **kwargs):
+    response = requests.get(*args, **kwargs)
+    if response.status_code != 200:
+        raise ScrapingError('got status code %d' % response.status_code)
+    return response
+
+if __name__ == '__main__':
+    mine_languages()

File wiki_country_language_2012-09-07.csv

+Afrikaans	South Africa
+Berber	Morocco
+Bislama	Vanuatu
+Chinese, Mandarin	China
+Chinese, Mandarin	Taiwan
+Chinese, Mandarin	Singapore
+Dari	Afghanistan
+Dhivehi	Maldives
+Dzongkha	Bhutan
+Guaraní	Paraguay
+Guaraní	Bolivia
+Hindi	India
+Hiri Motu	Papua New Guinea
+Ibo	Nigeria
+Kikongo	Democratic Republic of the Congo
+Kituba	Republic of the Congo
+Latin	Vatican City
+Lingala	Democratic Republic of the Congo
+Lingala	Republic of the Congo
+Māori	New Zealand
+Ossetian	South Ossetia
+Pashtu	Afghanistan
+Quechua	Bolivia
+Quechua	Peru
+Swati	Swaziland
+Swati	South Africa
+Tetum	East Timor
+Tigrinya	Eritrea
+Tshiluba	Democratic Republic of the Congo
+Urdu	Pakistan
+Urdu	Fiji
+Altay	Altay, Republic of
+Azeri	Dagestan
+Balkar	Kabardino-Balkaria
+Cantonese Chinese	Hong Kong
+Cantonese Chinese	Macau
+Guaraní	Argentina
+Guaraní	Corrientes Province
+Gwich'in	Northwest Territories
+Inuinnaqtun	Northwest Territories
+Inuinnaqtun	Nunavut
+Inuvialuktun	Northwest Territories
+Kalaallisut	Greenland
+Kalmyk	Kalmykia
+Karachay	Karachay–Cherkessia
+Malayalam	India
+Malayalam	Kerala
+Malayalam	Pondicherry
+Malayalam	Lakshadweep
+Mayan	Mexico
+Mayan	Guatemala
+Mayan	Belize
+Mayan	Honduras
+Mayan	El Salvador
+Náhuatl	Mexico
+Náhuatl	El Salvador
+Aranese	Aran Valley
+Sanskrit	India
+North and South Slavey	Northwest Territories
+Tibetan	Tibet Autonomous Region
+Tibetan	Aba
+Tibetan	Garzê
+Tibetan	Diqing
+Tibetan	Wenshan
+Tibetan	Gannan
+Tibetan	Haibai
+Tibetan	Hainan
+Tibetan	Huangnan
+Tibetan	Golog
+Tibetan	Gyêgu
+Tibetan	Haixi
+Tibetan	Muli
+Tibetan	Tianzhu
+Urdu	India
+Urdu	Jammu and Kashmir
+Urdu	Delhi
+Urdu	Uttar Pradesh
+Urdu	Bihar
+Urdu	Andhra Pradesh