Commits

spirit committed 5a00eeb

Minor

Files changed (4)

 BLOCKS_URL = "http://unicode.org/Public/UNIDATA/Blocks.txt"
 BLOCKS_FN = os.path.basename(BLOCKS_URL)
 ENCODING = "utf-8"
+MAX_GRAMS = 300
 
 
 def make_data_dir():
                     assert not consecutive_spaces_re.search(value)
                     assert n == int(m.group(2))
                     model[value] = n
-            assert len(model) == 300
+            assert len(model) == MAX_GRAMS
 
         path = os.path.join(MODELS_DIR, model_file.lower() + ".py")
 

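The new MAX_GRAMS constant names the previously hard-coded model size of 300 trigrams that the loader asserts against. A minimal sketch of how a model of that size could be produced, assuming the generator ranks trigrams by frequency; top_trigrams and counts are illustrative names, not code from this commit:

def top_trigrams(counts, limit=MAX_GRAMS):
    # Keep only the `limit` most frequent trigrams, mapping each to its rank.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return {gram: rank for rank, (gram, _count) in enumerate(ranked[:limit])}

The result has the shape the parsing loop above checks for: exactly MAX_GRAMS entries, keyed by trigram, valued by rank.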
generate_trigrams.py

 http://medialab.di.unipi.it/wiki/Wikipedia_Extractor
 """
 
-
 import argparse
 import os
 import re
 import sys
 
 from collections import defaultdict
+from guess_language import WORD_RE, MAX_GRAMS
 
-from guess_language import WORD_RE, MAX_GRAMS
 
 TAG_RE = re.compile(r"<.*?>|\{.*?\}|\\\w+")
 

guess_language/__init__.py

 
 import functools
 import re
-import unicodedata
 import warnings
 
 from collections import defaultdict, OrderedDict
 MODEL_ROOT = __name__ + ".data.models."
 
 BASIC_LATIN = {
-    "en", "ceb", "ha", "so", "tlh", "id", "haw", "la", "sw", "eu",
-    "nr", "nso", "zu", "xh", "ss", "st", "tn", "ts"
+    "ceb", "en", "eu", "ha", "haw", "id", "la", "nr", "nso", "so", "ss", "st",
+    "sw", "tlh", "tn", "ts", "xh", "zu"
 }
 EXTENDED_LATIN = {
-    "cs", "af", "pl", "hr", "ro", "sk", "sl", "tr", "hu", "az",
-    "et", "sq", "ca", "es", "fr", "de", "nl", "it", "da", "is", "nb", "sv",
-    "fi", "lv", "pt", "ve", "lt", "tl", "cy", "vi"
+    "af", "az", "ca", "cs", "cy", "da", "de", "es", "et", "fi", "fr", "hr",
+    "hu", "is", "it", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "sk", "sl",
+    "sq", "sv", "tl", "tr", "ve", "vi"
 }
 ALL_LATIN = BASIC_LATIN.union(EXTENDED_LATIN)
-CYRILLIC = {"ru", "uk", "kk", "uz", "mn", "sr", "mk", "bg", "ky"}
+CYRILLIC = {"bg", "kk", "ky", "mk", "mn", "ru", "sr", "uk", "uz"}
 ARABIC = {"ar", "fa", "ps", "ur"}
 DEVANAGARI = {"hi", "ne"}
 PT = {"pt_BR", "pt_PT"}

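These sets group language codes by the script they are written in. A hypothetical sketch of how they could narrow the candidate languages before trigram scoring; candidates_for_script and the script-name keys are assumptions, not part of this module:

def candidates_for_script(script):
    # Restrict scoring to languages plausible for the detected script.
    groups = {
        "Latin": ALL_LATIN,
        "Cyrillic": CYRILLIC,
        "Arabic": ARABIC,
        "Devanagari": DEVANAGARI,
    }
    return groups.get(script, set())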
guess_language/data/__init__.py

 BLOCKS[0x7c:0x80] = ['NKo'] * 0x4
 BLOCKS[0x80:0x84] = ['Samaritan'] * 0x4
 BLOCKS[0x84:0x86] = ['Mandaic'] * 0x2
+BLOCKS[0x8a:0x90] = ['Arabic Extended-A'] * 0x6
 BLOCKS[0x90:0x98] = ['Devanagari'] * 0x8
 BLOCKS[0x98:0xa0] = ['Bengali'] * 0x8
 BLOCKS[0xa0:0xa8] = ['Gurmukhi'] * 0x8
 BLOCKS[0xaa0:0xaa6] = ['Cham'] * 0x6
 BLOCKS[0xaa6:0xaa8] = ['Myanmar Extended-A'] * 0x2
 BLOCKS[0xaa8:0xaae] = ['Tai Viet'] * 0x6
+BLOCKS[0xaae:0xab0] = ['Meetei Mayek Extensions'] * 0x2
 BLOCKS[0xab0:0xab3] = ['Ethiopic Extended-A'] * 0x3
 BLOCKS[0xabc:0xac0] = ['Meetei Mayek'] * 0x4
 BLOCKS[0xac0:0xd7b] = ['Hangul Syllables'] * 0x2bb
 BLOCKS[0x1084:0x1086] = ['Imperial Aramaic'] * 0x2
 BLOCKS[0x1090:0x1092] = ['Phoenician'] * 0x2
 BLOCKS[0x1092:0x1094] = ['Lydian'] * 0x2
+BLOCKS[0x1098:0x109a] = ['Meroitic Hieroglyphs'] * 0x2
+BLOCKS[0x109a:0x10a0] = ['Meroitic Cursive'] * 0x6
 BLOCKS[0x10a0:0x10a6] = ['Kharoshthi'] * 0x6
 BLOCKS[0x10a6:0x10a8] = ['Old South Arabian'] * 0x2
 BLOCKS[0x10b0:0x10b4] = ['Avestan'] * 0x4
 BLOCKS[0x10c0:0x10c5] = ['Old Turkic'] * 0x5
 BLOCKS[0x1100:0x1108] = ['Brahmi'] * 0x8
 BLOCKS[0x1108:0x110d] = ['Kaithi'] * 0x5
+BLOCKS[0x110d:0x1110] = ['Sora Sompeng'] * 0x3
+BLOCKS[0x1110:0x1115] = ['Chakma'] * 0x5
+BLOCKS[0x1118:0x111e] = ['Sharada'] * 0x6
+BLOCKS[0x1168:0x116d] = ['Takri'] * 0x5
 BLOCKS[0x1200:0x1240] = ['Cuneiform'] * 0x40
 BLOCKS[0x1300:0x1343] = ['Egyptian Hieroglyphs'] * 0x43
 BLOCKS[0x1680:0x16a4] = ['Bamum Supplement'] * 0x24
+BLOCKS[0x16f0:0x16fa] = ['Miao'] * 0xa
 BLOCKS[0x1b00:0x1b10] = ['Kana Supplement'] * 0x10
 BLOCKS[0x1d40:0x1d80] = ['Mathematical Alphanumeric Symbols'] * 0x40
+BLOCKS[0x1ee0:0x1ef0] = ['Arabic Mathematical Alphabetic Symbols'] * 0x10
 BLOCKS[0x2000:0x2a6e] = ['CJK Unified Ideographs Extension B'] * 0xa6e
 BLOCKS[0x2a70:0x2b74] = ['CJK Unified Ideographs Extension C'] * 0x104
 BLOCKS[0x2b74:0x2b82] = ['CJK Unified Ideographs Extension D'] * 0xe
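Each BLOCKS entry covers 16 code points, as the ranges above show (Devanagari, U+0900-U+097F, occupies indexes 0x90-0x97). A minimal lookup sketch built on that layout; block_of is an illustrative name, not necessarily the accessor this package exposes:

def block_of(ch):
    # Index by code point // 16 to recover the Unicode block name.
    return BLOCKS[ord(ch) >> 4]

block_of("\u0915")  # 'Devanagari' (0x915 >> 4 == 0x91, inside BLOCKS[0x90:0x98])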