Source

uchardet-enhanced / langstats / mkpairmodel.py

Full commit
#!/usr/bin/env python

import sys
import os

maxrank = 64


def Usage():
    """Print the command-line synopsis to stderr and exit with status 1.

    The program name is taken from sys.argv[0] so the message names the
    script actually being run.  The message is written to stderr because
    this script emits generated C source on stdout.
    """
    progname = os.path.basename(sys.argv[0]) or "mkpairmodel.py"
    sys.stderr.write(
        "Usage: %s <charstats file> <text reference file>\n" % progname)
    sys.exit(1)
    
# Exactly two positional arguments are required.
if len(sys.argv) != 3:
    Usage()

charstats, reftext = sys.argv[1:]

# The reference text name encodes the language and the encoding as
# <lang>_<encoding>.<ext>, for example french_cp1252.txt
langcode = os.path.splitext(os.path.basename(reftext))[0]
nameparts = langcode.split('_')
if len(nameparts) != 2:
    # Fail with an explicit message rather than an unpacking traceback.
    sys.stderr.write(
        "%s: reference file name must look like <lang>_<encoding>.txt\n"
        % reftext)
    sys.exit(1)
lang, encoding = nameparts

# chartoorder maps each of the 256 byte values to its rank in the charstats
# file (1 for the first line).  255 marks bytes the file does not list.
chartoorder = [255] * 256

# Each charstats line starts with a byte value written in hexadecimal; the
# position of the line in the file gives the rank.
order = 1
with open(charstats, "r") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing one) without consuming
            # a rank.
            continue
        charpos = int(fields[0], 16)
        chartoorder[charpos] = order
        order += 1

def ordertoichar(order):
    """Return the lowest byte value whose chartoorder entry equals order.

    Returns 0 when none of the 256 byte values has that rank.
    """
    candidates = (byte for byte in range(256) if chartoorder[byte] == order)
    return next(candidates, 0)

# Emit the byte -> rank table as a C array, 16 values per output row.
ordermapname = "%s_%sCharToOrderMap" % (lang, encoding)
print("static const unsigned char %s[] = " % ordermapname)
print("{")
for rowstart in range(0, 256, 16):
    rowvalues = chartoorder[rowstart:rowstart + 16]
    sys.stdout.write("".join(["%3d," % rank for rank in rowvalues]) + "\n")
print("};\n")


# Compute the char pair frequency stats

# Read raw bytes: chartoorder is indexed by byte value, so neither newline
# translation nor text decoding should be applied.  The with-statement
# makes sure the file is closed.
with open(reftext, "rb") as f:
    text = f.read()

# pairfreqs is a flattened maxrank x maxrank matrix of counts, indexed by
# (rank of first byte - 1) * maxrank + (rank of second byte - 1).
pairfreqs = [0] * (maxrank * maxrank)

totalpairs = 0
prevorder = 255  # 255: there is no ranked predecessor yet
# bytearray yields integer byte values on both Python 2 and Python 3.
for ichar in bytearray(text):
    order = chartoorder[ichar]
    if order > maxrank:
        # A byte outside the maxrank most frequent ones breaks the
        # sequence: the next byte has no predecessor to pair with.
        prevorder = 255
        continue
    if prevorder <= maxrank:
        pairfreqs[(prevorder - 1) * maxrank + (order - 1)] += 1
        totalpairs += 1
    prevorder = order

# We now have a 4096 entries array indexed by the 64x64 possible
# pairs of frequent characters, listing their count of occurrences.
# We want to transform this array so that the values are just 0, 1, 2, 3:
#   3 is for the 512 most frequent sequences
#   2 is for the 512 next
#   1 is for all others having more than 2 occurrences
#   0 is for negative sequences (0 to 2 occurrences)
# So sort the counts in decreasing order and take the counts found at the
# 512th and 1024th ranks as thresholds.
byocc = sorted(pairfreqs, reverse=True)
# Ranks are 1-based while the list is 0-based: the 512th entry is
# byocc[511].  Indexing with 512 would admit a 513th sequence into class 3
# and disagree with oc512 below, which sums exactly the first 512 counts.
t512 = byocc[511]
t1024 = byocc[1023]
oc512 = sum(byocc[:512])
if totalpairs == 0:
    # Without any counted pair the proportion is undefined; stop with a
    # clear message instead of a ZeroDivisionError traceback.
    sys.exit("No frequent character pair found in the reference text")
# Proportion of all counted pairs covered by the 512 most frequent ones.
prop512 = float(oc512) / float(totalpairs)

# print "/* Threshold 512 is ", t512, "threshold 1024 is ", t1024, "*/"
# Emit the maxrank x maxrank sequence class table as a C array.
modelname = "%sLangModel" % lang

# ordertoichar returns 0 for a rank that no byte maps to (or if byte 0
# itself holds that rank).  Report each such rank once, on stderr, so the
# diagnostics do not end up inside the generated C source on stdout.
for rank in range(1, maxrank + 1):
    if ordertoichar(rank) == 0:
        sys.stderr.write("ordertoichar returned 0 for order %d\n" % rank)

cells = []
# Entry i of pairfreqs is the count for the pair of ranks
# (i // maxrank + 1, i % maxrank + 1).
for i, cntocs in enumerate(pairfreqs):
    if cntocs >= t512:
        fclass = 3
    elif cntocs >= t1024:
        fclass = 2
    elif cntocs <= 2:
        fclass = 0
    else:
        fclass = 1

    cells.append("%d," % fclass)
    # Start a new output line after every 32 values.
    if (i + 1) % 32 == 0:
        cells.append("\n")

# Join once instead of growing the string with repeated concatenation.
ctext = ("static const PRUint8 %s[] = " % modelname + "\n{\n"
         + "".join(cells) + "};\n")
print(ctext)

# Emit the SequenceModel structure referencing the two tables printed
# above, the proportion covered by the top 512 pairs and the encoding name.
modellines = [
    "const SequenceModel %s%sModel = " % (encoding, lang),
    "{",
    "  %s," % ordermapname,
    "  %s," % modelname,
    "  (float)%f," % prop512,
    "  PR_TRUE,",
    "  \"%s\"" % encoding,
    "};",
]
print("\n".join(modellines))