Source

psilib / diffengine.py

#! /usr/bin/env python

from __future__ import generators
from difflib import SequenceMatcher
import htmlutils


class Templatizer(object):
    """Derive a template from pairs of line sequences by diffing them.

    Lines common to both inputs are copied to the template; lines that
    differ are either resolved interactively (or from previously learned
    answers) or replaced by ``<psi:field ...>`` placeholders.

    Attributes:
        template: the most recently built template (list of lines) or None.
        filelist: HTML file names found by the last ``path`` assignment.
        path: property; assigning a directory changes into it and scans it
            for HTML files, reading returns the absolute directory,
            deleting resets ``filelist``, the path and ``template``.
    """

    # Similarity thresholds used by _near: a pair of lines must beat
    # _BEST_RATIO to be considered, and the best pair must reach _CUTOFF
    # to be treated as "the same line, edited".
    _BEST_RATIO = 0.65
    _CUTOFF = 0.7
    # Number of accumulated added/removed lines after which _litesort
    # inserts a field placeholder.
    _FIELD_THRESHOLD = 20
    _FIELD_MARK = '<psi:field'

    def __init__(self, path=None):
        """Create an empty templatizer, optionally scanning *path*."""
        self.template, self._rules, self.filelist = None, None, None
        self._path = None
        if path:
            self.path = path

    def _compare(self, a, b):
        """Yield tagged diff lines for sequences *a* and *b*.

        Each yielded string is a one-character tag, a space, and the item:
        ``' '`` common, ``'-'`` only in *a*, ``'+'`` only in *b*, and
        ``'1'``/``'2'`` for a pair of similar items needing a decision.

        Raises:
            ValueError: if SequenceMatcher reports an unknown opcode.
        """
        matcher = SequenceMatcher(None, a, b)
        for tag, alo, ahi, blo, bhi in matcher.get_opcodes():
            if tag == 'replace':
                lines = self._near(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                lines = self._tagger('-', a, alo, ahi)
            elif tag == 'insert':
                lines = self._tagger('+', b, blo, bhi)
            elif tag == 'equal':
                lines = self._tagger(' ', a, alo, ahi)
            else:
                raise ValueError(' '.join(['unknown tag', repr(tag)]))
            for line in lines:
                yield line

    def _choice(self, aline, bline):
        """Yield the tagged pair for two similar lines.

        If both lines start with ``<`` but their first space-separated
        tokens differ, the pair is emitted as a plain removal/addition
        (``'-'``/``'+'``).  Otherwise it is emitted exactly once as a
        ``'1'``/``'2'`` pair that the sorters resolve later.
        """
        both_tags = aline[:1] == '<' and bline[:1] == '<'
        if both_tags and aline.split(' ')[0] != bline.split(' ')[0]:
            yield ' '.join(['-', aline])
            yield ' '.join(['+', bline])
        else:
            yield ' '.join(['1', aline])
            yield ' '.join(['2', bline])

    def _digest(self, a, alo, ahi, b, blo, bhi):
        """Yield diff lines for sub-ranges where either side may be empty."""
        lines = []
        if alo < ahi:
            if blo < bhi:
                lines = self._near(a, alo, ahi, b, blo, bhi)
            else:
                lines = self._tagger('-', a, alo, ahi)
        elif blo < bhi:
            lines = self._tagger('+', b, blo, bhi)
        for line in lines:
            yield line

    def _near(self, a, alo, ahi, b, blo, bhi):
        """Yield diff lines for a replaced block, pairing up similar lines.

        Searches ``a[alo:ahi]`` x ``b[blo:bhi]`` for the most similar pair
        of non-identical items.  If the best similarity reaches the
        cutoff, the pair is emitted through _choice; failing that, the
        first identical pair (if any) is used as a synchronisation point;
        failing that, the whole block is emitted via _replace.  The ranges
        before and after the chosen pair are processed recursively.
        """
        # Locals rather than instance attributes: this generator recurses
        # (through _digest), so shared state on self would be fragile.
        best_ratio, cutoff = self._BEST_RATIO, self._CUTOFF
        matcher = SequenceMatcher(None)
        eqi, eqj = None, None
        besti, bestj = None, None
        for j in range(blo, bhi):
            bj = b[j]
            matcher.set_seq2(bj)
            for i in range(alo, ahi):
                ai = a[i]
                if ai == bj:
                    # Remember only the first identical pair.
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                matcher.set_seq1(ai)
                # Cheap upper bounds first, exact ratio last.
                if (matcher.real_quick_ratio() > best_ratio and
                        matcher.quick_ratio() > best_ratio and
                        matcher.ratio() > best_ratio):
                    best_ratio, besti, bestj = matcher.ratio(), i, j
        if best_ratio < cutoff:
            if eqi is None:
                # Nothing similar and nothing identical: plain replace.
                for line in self._replace(a, alo, ahi, b, blo, bhi):
                    yield line
                return
            besti, bestj = eqi, eqj
        else:
            # A similar pair was found; forget the identical pair so the
            # chosen pair is routed through _choice below.
            eqi = None
        for line in self._digest(a, alo, besti, b, blo, bestj):
            yield line
        aelt, belt = a[besti], b[bestj]
        if eqi is None:
            for line in self._choice(aelt, belt):
                yield line
        else:
            yield ' '.join([' ', aelt])
        for line in self._digest(a, besti + 1, ahi, b, bestj + 1, bhi):
            yield line

    def _replace(self, a, alo, ahi, b, blo, bhi):
        """Yield a replaced block as removals and additions.

        The shorter side is emitted first (removals first on a tie).
        Both ranges must be non-empty.
        """
        assert alo < ahi and blo < bhi
        removed = self._tagger('-', a, alo, ahi)
        added = self._tagger('+', b, blo, bhi)
        if bhi - blo < ahi - alo:
            ordered = (added, removed)
        else:
            ordered = (removed, added)
        for lines in ordered:
            for line in lines:
                yield line

    def _tagger(self, tag, x, lo, hi):
        """Yield ``x[lo:hi]`` prefixed with *tag*.

        Items that already contain a field placeholder are always tagged
        as common (``' '``) so that existing fields survive the diff.
        """
        for index in range(lo, hi):
            item = x[index]
            if self._FIELD_MARK in item:
                yield ' '.join([' ', item])
            else:
                yield ' '.join([tag, item])

    def _extract(self, a, b):
        """Return the full tagged diff of *a* and *b* as a list."""
        return list(self._compare(a, b))

    def _ask_choice(self, aline, bline, options):
        """Print the decision menu for a line pair and return the answer.

        *options* are the menu lines following the two candidate lines.
        The answer is read as text and converted with int(); the original
        Python 2 ``input()`` evaluated whatever the user typed.  Asks
        again until a number is entered.
        """
        print('Choose line to insert in template')
        print('1. %s' % aline)
        print('2. %s' % bline)
        for option in options:
            print(option)
        try:
            read = raw_input  # Python 2: plain text, no eval
        except NameError:
            read = input      # Python 3
        while True:
            try:
                return int(read('Enter a number: '))
            except ValueError:
                print('Please enter a number.')

    def _litesort(self, diff, learn=None):
        """Build a template from a tagged diff, without resource lists.

        Args:
            diff: tagged lines as produced by _extract.
            learn: optional dict mapping an ``a``-side line to the choice
                previously made for it; matching pairs are not prompted.

        Returns:
            ``(template, learn)`` where *template* is a list of lines with
            unnumbered (``%s``) field placeholders and *learn* includes
            any newly made choices.
        """
        diff = list(diff)
        fieldtag = '<psi:field name="%s" />'
        fieldattr = ' psi:field="%s"'
        template = []
        if learn is None:
            learn = {}

        def classifier(choice, aline, bline):
            if choice == 1:
                template.append(aline)
            elif choice == 2:
                template.append(bline)
            elif choice == 3:
                template.append(fieldtag)
            elif choice == 4:
                # Token-level diff of the two lines: differing tokens
                # collapse into a field attribute, the rest is kept.
                mirror = []
                tokens = self._compare(aline.split(), bline.split())
                for token in tokens:
                    if token[:1] == '1':
                        mirror.append(fieldattr)
                    elif token[:1] != '2':
                        mirror.append(token[1:])
                template.append(''.join(mirror))
            # choice 5 (or anything else): discard both lines

        changed = 0  # added/removed lines seen since the last field
        for index, entry in enumerate(diff):
            tag = entry[:1]
            if tag == ' ':
                if changed > self._FIELD_THRESHOLD:
                    template.append(fieldtag)
                    changed = 0
                template.append(entry[2:])
            elif tag == '-' or tag == '+':
                changed += 1
            elif tag == '1':
                if changed > self._FIELD_THRESHOLD:
                    template.append(fieldtag)
                    changed = 0
                # A '1' line is always followed by its '2' partner.
                aline, bline = entry[2:], diff[index + 1][2:]
                if aline in learn:
                    classifier(learn[aline], aline, bline)
                else:
                    choice = self._ask_choice(aline, bline, (
                        '3. Insert field element',
                        '4. Insert field attribute',
                        '5. Discard both lines'))
                    classifier(choice, aline, bline)
                    learn[aline] = choice
        return template, learn

    def _sorter(self, diff, learn=None):
        """Build a template plus per-input fragment lists from a diff.

        Args:
            diff: tagged lines as produced by _extract.
            learn: optional dict of previously made choices (see
                _litesort).

        Returns:
            ``(template, alist, blist, learn)``.  *template* holds common
            lines and numbered field placeholders; *alist* and *blist*
            hold the differing content of each input wrapped in
            ``<psi:fragment>`` elements carrying the field's number.
        """
        import re

        diff = list(diff)
        fieldtag, fragend = '<psi:field name="%s" />', '</psi:fragment>'
        fragstart = '<psi:fragment name="%s" class="%s">'
        fieldattr = ' psi:field="%s"'
        numbered_field = re.compile(r'<psi:field name="(\d+)"')
        template, alist, blist = [], [], []
        if learn is None:
            learn = {}

        def tag_at(index):
            # Tag of diff[index], or None outside the diff (no wrap-around).
            if 0 <= index < len(diff):
                return diff[index][:1]
            return None

        def last_field_number():
            # Number of the field on the last template line, else None.
            if not template:
                return None
            found = numbered_field.search(template[-1])
            if found is None:
                return None
            return int(found.group(1))

        def autofield(line, glist, label):
            glist.append(fragstart % (label, label))
            glist.append(line)
            glist.append(fragend)

        def manfield(tag, glist, count, index):
            # Handles one line of a '-' or '+' run; opens a fragment at
            # the start of the run and closes it at the end.
            if tag_at(index - 1) != tag:
                label = str(count)
                last = last_field_number()
                if last is None or count > last:
                    template.append(fieldtag % label)
                else:
                    # The opposite run already emitted this field; share
                    # its label and advance the counter.
                    count += 1
                glist.append(fragstart % (label, label))
            glist.append(diff[index][2:])
            if tag_at(index + 1) != tag:
                glist.append(fragend)
            return count

        def classifier(choice, fcount, aline, bline):
            if choice == 1:
                template.append(aline)
            elif choice == 2:
                template.append(bline)
            elif choice == 3:
                label = str(fcount)
                template.append(fieldtag % label)
                autofield(aline, alist, label)
                autofield(bline, blist, label)
                fcount += 1
            elif choice == 4:
                mirror = []
                # Materialised because each '1' token looks ahead to its
                # '2' partner by index.
                tokens = list(self._compare(aline.split(), bline.split()))
                for position, token in enumerate(tokens):
                    if token[:1] == '1':
                        label = str(fcount)
                        partner = tokens[position + 1]
                        mirror.append(fieldattr % label)
                        autofield(token[2:], alist, label)
                        autofield(partner[2:], blist, label)
                        fcount += 1
                    elif token[:1] != '2':
                        mirror.append(token[1:])
                template.append(''.join(mirror))
            elif choice == 5:
                alist.append(aline)
                blist.append(bline)
            # choice 6 (or anything else): discard both lines
            return fcount

        fcount = 1
        for index, entry in enumerate(diff):
            tag = entry[:1]
            if tag == ' ':
                template.append(entry[2:])
            elif tag == '-':
                fcount = manfield('-', alist, fcount, index)
            elif tag == '+':
                fcount = manfield('+', blist, fcount, index)
            elif tag == '1':
                aline, bline = entry[2:], diff[index + 1][2:]
                if aline in learn:
                    fcount = classifier(learn[aline], fcount, aline, bline)
                else:
                    choice = self._ask_choice(aline, bline, (
                        '3. Insert field element',
                        '4. Insert field attribute',
                        '5. Insert in resources without field',
                        '6. Discard both lines'))
                    fcount = classifier(choice, fcount, aline, bline)
                    learn[aline] = choice
        return template, alist, blist, learn

    # The helpers below differ only in where their inputs come from:
    # files are read through htmlutils.htmlines (presumably returning a
    # list of lines -- confirm against htmlutils), lists are used as-is.

    def _filesort(self, file1, file2, learn=None):
        """Run _sorter on the diff of two files."""
        a = self._extract(htmlutils.htmlines(file1), htmlutils.htmlines(file2))
        return self._sorter(a, learn)

    def _listsort(self, la, lb, learn=None):
        """Run _sorter on the diff of two line lists."""
        return self._sorter(self._extract(la, lb), learn)

    def _mixsort(self, file, elist, learn=None):
        """Run _sorter on the diff of a file and a line list."""
        a = htmlutils.htmlines(file)
        return self._sorter(self._extract(a, elist), learn)

    def _filesort2(self, file1, file2, learn=None):
        """Run _litesort on the diff of two files."""
        a = self._extract(htmlutils.htmlines(file1), htmlutils.htmlines(file2))
        return self._litesort(a, learn)

    def _listsort2(self, la, lb, learn=None):
        """Run _litesort on the diff of two line lists."""
        return self._litesort(self._extract(la, lb), learn)

    def _mixsort2(self, file, elist, learn=None):
        """Run _litesort on the diff of a file and a line list."""
        a = htmlutils.htmlines(file)
        return self._litesort(self._extract(a, elist), learn)

    def _htmlfiles(self, path):
        """Change into *path* and return the HTML file names found there.

        A false *path* (None or empty) means the current directory.  The
        process working directory is changed as a side effect and the
        absolute directory is remembered in ``self._path``; the returned
        names are relative to it.
        """
        from mimetypes import guess_type
        import os
        if not path:
            path = os.getcwd()
        if path != os.getcwd():
            os.chdir(path)
        # List the directory we are now in; re-using a relative *path*
        # after chdir would resolve it a second time.
        self._path, htype = os.getcwd(), 'text/html'
        return [name for name in os.listdir(self._path)
                if guess_type(name)[0] == htype]

    def _setpath(self, path):
        self.filelist = self._htmlfiles(path)

    def _getpath(self):
        return self._path

    def _delpath(self):
        self.filelist, self._path, self.template = None, None, None

    path = property(_getpath, _setpath, _delpath)

    def _fieldsort(self, clist):
        """Yield *clist* with runs of consecutive field lines collapsed.

        Of several adjacent lines containing a field placeholder only the
        first is kept.
        """
        previous_was_field = False
        for line in clist:
            is_field = self._FIELD_MARK in line
            if previous_was_field and is_field:
                continue
            previous_was_field = is_field
            yield line

    def _autosort(self, path):
        """Build one template from all HTML files, merging pairwise.

        Uses ``self.filelist`` if set, otherwise scans *path*.  Files are
        shuffled and diffed in pairs (one file is dropped if the count is
        odd); the resulting templates are then shuffled and merged in
        pairs, round after round, until one remains.  Choices learned on
        the way are kept in ``self._rules``.

        Returns:
            The final template as a list of lines, or an empty list if
            fewer than two files were available.
        """
        from random import shuffle

        if self.filelist:
            hfiles = self.filelist[:]
        else:
            hfiles = self._htmlfiles(path)
        if self._rules:
            mlearn = self._rules
        else:
            mlearn = {}
        if len(hfiles) % 2:
            hfiles.pop()
        shuffle(hfiles)

        templates = []
        while hfiles:
            first = hfiles.pop()
            second = hfiles.pop()
            extract = self._filesort2(first, second, mlearn)
            templates.append(extract[0])
            mlearn.update(extract[1])

        while len(templates) > 1:
            shuffle(templates)
            merged = []
            while len(templates) > 1:
                first = templates.pop()
                second = templates.pop()
                extract = self._listsort2(first, second, mlearn)
                merged.append(extract[0])
                mlearn.update(extract[1])
            # An odd template out is carried into the next round.
            merged.extend(templates)
            templates = merged

        self._rules = mlearn
        if not templates:
            return []
        return list(self._fieldsort(templates[0]))

    def preview(self):
        """Pretty-print the current template."""
        from pprint import pprint
        pprint(self.template)

    def create(self, file1, file2):
        """Build the template (and the learned rules) from two files."""
        tmp, self._rules = self._filesort2(file1, file2)
        self.template = list(self._fieldsort(tmp))

    def refine(self, file):
        """Merge another file into the existing template.

        Does nothing if no template has been built yet.
        """
        if not self.template:
            return
        tmp, learned = self._mixsort2(file, self.template, self._rules)
        self.template = list(self._fieldsort(tmp))
        if self._rules is None:
            self._rules = learned
        else:
            self._rules.update(learned)

    def auto(self, path=None):
        """Build the template from all HTML files (see _autosort)."""
        self.template = self._autosort(path)