Source

psilib / diffengine.py

Lynn Rees 94135fc 
Lynn Rees 30a2bee 
Lynn Rees 8ffe729 
Lynn Rees 9ce7bf6 
Lynn Rees ecde0d6 
Lynn Rees 9ce7bf6 

Lynn Rees 138d60a 
Lynn Rees 9ce7bf6 
Lynn Rees 138d60a 



Lynn Rees 9ce7bf6 
Lynn Rees df9d6f3 
Lynn Rees e66b6ff 

Lynn Rees 9ce7bf6 







Lynn Rees 46d7268 















Lynn Rees 9ce7bf6 
Lynn Rees 1cde482 
Lynn Rees 9ce7bf6 






Lynn Rees dc714e5 
Lynn Rees 138d60a 
Lynn Rees 9ce7bf6 


Lynn Rees 1cde482 
Lynn Rees 9ce7bf6 




Lynn Rees 1cde482 




Lynn Rees dc714e5 
Lynn Rees 9ce7bf6 


Lynn Rees dc714e5 
Lynn Rees 9ce7bf6 


















Lynn Rees 46d7268 



Lynn Rees 9ce7bf6 
Lynn Rees df9d6f3 

Lynn Rees 8ffe729 
Lynn Rees 138d60a 






Lynn Rees 46d7268 
Lynn Rees 138d60a 







Lynn Rees f5bfb9f 
Lynn Rees 46d7268 
Lynn Rees 138d60a 

Lynn Rees 46d7268 






Lynn Rees 138d60a 
Lynn Rees 46d7268 


Lynn Rees 138d60a 















Lynn Rees 7e57475 
Lynn Rees df9d6f3 



Lynn Rees dc714e5 
Lynn Rees df9d6f3 
Lynn Rees 138d60a 
Lynn Rees df9d6f3 







Lynn Rees 138d60a 
Lynn Rees df9d6f3 
Lynn Rees dc714e5 
Lynn Rees df9d6f3 










Lynn Rees 138d60a 
Lynn Rees df9d6f3 















Lynn Rees dc714e5 
Lynn Rees 138d60a 
























Lynn Rees 0f3b02e 
Lynn Rees 138d60a 
Lynn Rees df9d6f3 
Lynn Rees 138d60a 
Lynn Rees 8ffe729 
Lynn Rees 138d60a 

Lynn Rees dc714e5 
Lynn Rees 138d60a 


Lynn Rees 9ce7bf6 
Lynn Rees 138d60a 












































Lynn Rees df9d6f3 
Lynn Rees 9ce7bf6 
Lynn Rees df9d6f3 



Lynn Rees 46d7268 
Lynn Rees 138d60a 
Lynn Rees df9d6f3 
Lynn Rees 138d60a 
Lynn Rees df9d6f3 

Lynn Rees 9ce7bf6 
Lynn Rees e432f4b 
Lynn Rees 138d60a 





Lynn Rees 46d7268 
Lynn Rees 138d60a 




Lynn Rees 9ce7bf6 
Lynn Rees 138d60a 


Lynn Rees 7e57475 
Lynn Rees 138d60a 


Lynn Rees a91e0e3 
Lynn Rees 138d60a 




Lynn Rees dc714e5 
Lynn Rees 138d60a 
Lynn Rees e432f4b 
#! /usr/bin/env python

from __future__ import generators
from difflib import SequenceMatcher
import htmlutils


class Templatizer(object):
    """Derive a template from the differences between HTML line sequences.

    Two sequences of lines are compared with difflib.SequenceMatcher and
    every line is emitted with a one-character tag, a space, and the text:

        ' '  line common to both inputs (or already holding '<psi:field')
        '-'  line found only in the first input
        '+'  line found only in the second input
        '1'  first-input half of a near-matching pair
        '2'  second-input half of that pair (always follows its '1' line)

    The tagged diff is then folded into a template, either interactively
    or from a dict of previously learned answers keyed by the '1' line.

    Attributes set by this class:
        template -- list of template lines, or None
        filelist -- HTML file names found by the ``path`` property, or None
    """

    def __init__(self, path=None):
        """Create an empty templatizer; a truthy *path* is assigned to
        the ``path`` property, which scans that directory for HTML files."""
        self.template, self._rules, self.filelist = None, None, None
        self._path = None
        if path: self.path = path

    def _compare(self, a, b):
        """Yield the tagged diff lines for sequences *a* and *b*.

        Raises ValueError if SequenceMatcher reports an unknown opcode.
        """
        sm = SequenceMatcher(None, a, b)
        for tag, alo, ahi, blo, bhi in sm.get_opcodes():
            if tag == 'replace': g = self._near(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete': g = self._tagger('-', a, alo, ahi)
            elif tag == 'insert': g = self._tagger('+', b, blo, bhi)
            elif tag == 'equal': g = self._tagger(' ', a, alo, ahi)
            else: raise ValueError(' '.join(['unknown tag', repr(tag)]))
            for line in g: yield line

    def _choice(self, aline, bline):
        """Yield the tagged pair for two near-matching lines.

        When both lines start with '<' but their first space-delimited
        tokens differ, the pair is emitted as a plain '-' / '+' change.
        In every other case it is emitted exactly once as a '1' / '2'
        pair to be resolved later.
        """
        both_markup = aline[:1] == '<' and bline[:1] == '<'
        if both_markup and aline.split(' ')[0] != bline.split(' ')[0]:
            yield ' '.join(['-', aline])
            yield ' '.join(['+', bline])
        else:
            yield ' '.join(['1', aline])
            yield ' '.join(['2', bline])

    def _digest(self, a, alo, ahi, b, blo, bhi):
        """Yield tagged lines for the sub-ranges a[alo:ahi] and b[blo:bhi],
        either of which may be empty."""
        g = ()
        if alo < ahi:
            if blo < bhi: g = self._near(a, alo, ahi, b, blo, bhi)
            else: g = self._tagger('-', a, alo, ahi)
        elif blo < bhi: g = self._tagger('+', b, blo, bhi)
        for line in g: yield line

    def _near(self, a, alo, ahi, b, blo, bhi):
        """Yield tagged lines for a 'replace' region, pairing similar lines.

        Searches a[alo:ahi] x b[blo:bhi] for the most similar non-identical
        pair. If one scores at least ``self.cutoff`` it becomes a '1'/'2'
        pair (via _choice); otherwise the first identical pair, if any, is
        used as a sync point and emitted as a common line. The ranges on
        either side of the chosen pair are handled through _digest. With
        no usable pair the region is emitted as plain deletes and inserts.
        """
        self.best_ratio, self.cutoff = 0.65, 0.7
        sm = SequenceMatcher(None)
        eqi, eqj = None, None
        besti, bestj = None, None
        for j in range(blo, bhi):
            bj = b[j]
            sm.set_seq2(bj)
            for i in range(alo, ahi):
                ai = a[i]
                if ai == bj:
                    # remember only the first identical pair
                    if eqi is None: eqi, eqj = i, j
                    continue
                sm.set_seq1(ai)
                # cheap upper bounds first; ratio() only when they pass
                if sm.real_quick_ratio() > self.best_ratio and \
                      sm.quick_ratio() > self.best_ratio and \
                      sm.ratio() > self.best_ratio:
                    self.best_ratio, besti, bestj = sm.ratio(), i, j
        if self.best_ratio < self.cutoff:
            if eqi is None:
                for line in self._replace(a, alo, ahi, b, blo, bhi): yield line
                return
            besti, bestj, self.best_ratio = eqi, eqj, 1.0
        else: eqi = None
        for line in self._digest(a, alo, besti, b, blo, bestj): yield line
        aelt, belt = a[besti], b[bestj]
        if eqi is None:
            for line in self._choice(aelt, belt): yield line
        else: yield ' '.join([' ', aelt])
        for line in self._digest(a, besti+1, ahi, b, bestj+1, bhi): yield line

    def _replace(self, a, alo, ahi, b, blo, bhi):
        """Yield a[alo:ahi] as '-' lines and b[blo:bhi] as '+' lines,
        emitting the shorter range first when the b range is shorter."""
        assert alo < ahi and blo < bhi
        if bhi - blo < ahi - alo:
            first  = self._tagger('+', b, blo, bhi)
            second = self._tagger('-', a, alo, ahi)
        else:
            first  = self._tagger('-', a, alo, ahi)
            second = self._tagger('+', b, blo, bhi)
        for g in first, second:
            for line in g: yield line

    def _tagger(self, tag, x, lo, hi):
        """Yield x[lo:hi] prefixed with *tag*; lines that already contain
        '<psi:field' are always tagged ' ' so they are kept as-is."""
        for i in range(lo, hi):
            if x[i].find('<psi:field') != -1: yield ' '.join([' ', x[i]])
            else: yield ' '.join([tag, x[i]])

    def _extract(self, a, b):
        """Return the tagged diff of *a* and *b* as a list."""
        return list(self._compare(a, b))

    def _ask(self, aline, bline, extras):
        """Print the menu for a near-matching pair and return the chosen
        number.

        *extras* are the menu texts for options 3 and up. The answer is
        read as text and converted with int() rather than evaluated, and
        the prompt repeats until a number within the menu is entered.
        """
        try:
            reader = raw_input   # Python 2: reads text without evaluating
        except NameError:
            reader = input       # Python 3: input() already returns text
        print('Choose line to insert in template')
        print('1. %s' % aline)
        print('2. %s' % bline)
        for offset, text in enumerate(extras):
            print('%d. %s' % (offset + 3, text))
        limit = len(extras) + 2
        while True:
            try:
                choice = int(reader('Enter a number: '))
            except ValueError:
                continue
            if 1 <= choice <= limit: return choice

    def _litesort(self, diff, learn=None):
        """Fold a tagged diff into a template without resource lists.

        Common lines are copied. '-' and '+' lines are dropped but
        counted; once more than 20 have accumulated, an unformatted field
        tag (still holding its '%s' placeholder) is inserted before the
        next kept line and the counter restarts. Each '1'/'2' pair is
        resolved from *learn* (keyed by the '1' text) or by asking.

        Returns (template, learn).
        """
        fieldattr = ' psi:field="%s"'
        fieldtag = '<psi:field name="%s" />'
        template, lcount = list(), 0
        if not learn: learn = dict()

        def classifier(choice, aline, bline):
            if choice == 1: template.append(aline)
            elif choice == 2: template.append(bline)
            elif choice == 3: template.append(fieldtag)
            elif choice == 4:
                # word-level diff: differing words become a field attribute
                mirror = list()
                for word in self._compare(aline.split(), bline.split()):
                    if word[:1] == '1': mirror.append(fieldattr)
                    # word[1:] keeps the separating space after the tag
                    elif word[:1] != '2': mirror.append(word[1:])
                template.append(''.join(mirror))
            # any other choice (5): discard both lines

        for index, i in enumerate(diff):
            tag = i[:1]
            if tag == ' ':
                if lcount > 20:
                    template.append(fieldtag)
                    lcount = 0
                template.append(i[2:])
            elif tag == '-' or tag == '+': lcount += 1
            elif tag == '1':
                if lcount > 20:
                    template.append(fieldtag)
                    lcount = 0
                # the matching '2' line always follows its '1' line
                aline, bline = i[2:], diff[index+1][2:]
                if aline in learn: classifier(learn[aline], aline, bline)
                else:
                    choice = self._ask(aline, bline, (
                        'Insert field element',
                        'Insert field attribute',
                        'Discard both lines'))
                    classifier(choice, aline, bline)
                    learn[aline] = choice
        return template, learn

    def _sorter(self, diff, learn=None):
        """Fold a tagged diff into a template plus two resource lists.

        Runs of '-' lines are wrapped in <psi:fragment> elements in the
        first resource list and runs of '+' lines in the second; a
        numbered <psi:field> tag marks their place in the template. Each
        '1'/'2' pair is resolved from *learn* (keyed by the '1' text) or
        by asking.

        Returns (template, alist, blist, learn).
        """
        fieldtag, fragend = '<psi:field name="%s" />', '</psi:fragment>'
        fragstart = '<psi:fragment name="%s" class="%s">'
        fieldattr = ' psi:field="%s"'
        fcount, template, alist, blist = 1, [], [], []
        if not learn: learn = dict()
        total = len(diff)

        def autofield(line, glist, label):
            glist.append(fragstart % (label, label))
            glist.append(line)
            glist.append(fragend)

        def trailing_label():
            # Numeric label of a field tag ending the template, else None.
            if not template or template[-1].find('<psi:field') == -1:
                return None
            try: return int(template[-1].split('"')[-2])
            except (IndexError, ValueError): return None

        def manfield(index, tag, glist, count):
            # neighbours outside the diff count as a different tag
            before = after = None
            if index > 0: before = diff[index-1][:1]
            if index + 1 < total: after = diff[index+1][:1]
            if before != tag:
                # first line of a run: open a fragment
                label = str(count)
                last = trailing_label()
                if last is None or count > last:
                    template.append(fieldtag % label)
                # the other side of this change already placed the field:
                # share its label and move on to the next number
                else: count += 1
                glist.append(fragstart % (label, label))
            glist.append(diff[index][2:])
            if after != tag: glist.append(fragend)
            return count

        def classifier(choice, fcount, aline, bline):
            if choice == 1: template.append(aline)
            elif choice == 2: template.append(bline)
            elif choice == 3:
                label = str(fcount)
                template.append(fieldtag % label)
                autofield(aline, alist, label)
                autofield(bline, blist, label)
                fcount += 1
            elif choice == 4:
                # materialize so the '2' partner can be read by index
                words = list(self._compare(aline.split(), bline.split()))
                mirror = []
                for pos, word in enumerate(words):
                    if word[:1] == '1':
                        label = str(fcount)
                        mirror.append(fieldattr % label)
                        autofield(word[2:], alist, label)
                        autofield(words[pos+1][2:], blist, label)
                        fcount += 1
                    # word[1:] keeps the separating space after the tag
                    elif word[:1] != '2': mirror.append(word[1:])
                template.append(''.join(mirror))
            elif choice == 5:
                alist.append(aline)
                blist.append(bline)
            # any other choice (6): discard both lines
            return fcount

        for index, i in enumerate(diff):
            tag = i[:1]
            if tag == ' ': template.append(i[2:])
            elif tag == '-': fcount = manfield(index, '-', alist, fcount)
            elif tag == '+': fcount = manfield(index, '+', blist, fcount)
            elif tag == '1':
                # the matching '2' line always follows its '1' line
                aline, bline = i[2:], diff[index+1][2:]
                if aline in learn:
                    fcount = classifier(learn[aline], fcount, aline, bline)
                else:
                    choice = self._ask(aline, bline, (
                        'Insert field element',
                        'Insert field attribute',
                        'Insert in resources without field',
                        'Discard both lines'))
                    fcount = classifier(choice, fcount, aline, bline)
                    learn[aline] = choice
        return template, alist, blist, learn

    # The wrappers below feed _sorter / _litesort from files, lists, or a
    # mix of both. htmlutils.htmlines is defined outside this file; it is
    # presumably a file-to-sequence-of-lines loader -- verify there.

    def _filesort(self, file1, file2, learn=None):
        """Run _sorter on the diff of two files."""
        a = self._extract(htmlutils.htmlines(file1), htmlutils.htmlines(file2))
        return self._sorter(a, learn)

    def _listsort(self, la, lb, learn=None):
        """Run _sorter on the diff of two line sequences."""
        return self._sorter(self._extract(la, lb), learn)

    def _mixsort(self, file, elist, learn=None):
        """Run _sorter on the diff of a file against a line sequence."""
        a = htmlutils.htmlines(file)
        return self._sorter(self._extract(a, elist), learn)

    def _filesort2(self, file1, file2, learn=None):
        """Run _litesort on the diff of two files."""
        a = self._extract(htmlutils.htmlines(file1), htmlutils.htmlines(file2))
        return self._litesort(a, learn)

    def _listsort2(self, la, lb, learn=None):
        """Run _litesort on the diff of two line sequences."""
        return self._litesort(self._extract(la, lb), learn)

    def _mixsort2(self, file, elist, learn=None):
        """Run _litesort on the diff of a file against a line sequence."""
        a = htmlutils.htmlines(file)
        return self._litesort(self._extract(a, elist), learn)

    def _htmlfiles(self, path):
        """Change into *path* and return the names of its HTML files.

        A *path* of None means the current working directory. The working
        directory is changed so the returned bare names can be opened
        later, and the resolved directory is stored in ``self._path``.
        """
        from mimetypes import guess_type
        import os
        if path is not None and path != os.getcwd(): os.chdir(path)
        self._path, htype = os.getcwd(), 'text/html'
        # list the resolved directory: a relative *path* is no longer
        # valid once the working directory has changed
        return [i for i in os.listdir(self._path) if guess_type(i)[0] == htype]

    def _setpath(self, path):
        self.filelist = self._htmlfiles(path)

    def _getpath(self):
        # return the value so ``obj.path`` is usable in expressions
        return self._path

    def _delpath(self):
        self.filelist, self._path, self.template = None, None, None

    path = property(_getpath, _setpath, _delpath)

    def _fieldsort(self, clist):
        """Yield the lines of *clist*, dropping any line containing
        '<psi:field' that directly follows another kept such line."""
        field, after_field = '<psi:field', False
        for line in clist:
            is_field = line.find(field) != -1
            if after_field and is_field: continue
            after_field = is_field
            yield line

    def _autosort(self, path):
        """Merge HTML files pairwise, in random order, into one template.

        Uses ``self.filelist`` when set, otherwise the HTML files found
        under *path*. With an odd number of files one is left out. The
        resulting templates are merged pairwise again, round after round,
        until a single one remains; a template without a partner in a
        round is carried over to the next.

        Raises ValueError when fewer than two files are available.
        """
        from random import shuffle
        if self.filelist: hfiles = self.filelist[:]
        else: hfiles = self._htmlfiles(path)
        if self._rules: mlearn = self._rules
        else: mlearn = dict()
        if len(hfiles) % 2: hfiles.pop()
        if not hfiles:
            raise ValueError('at least two HTML files are needed')
        shuffle(hfiles)
        tlist = list()
        while hfiles:
            extract = self._filesort2(hfiles.pop(), hfiles.pop(), mlearn)
            tlist.append(extract[0])
            mlearn.update(extract[1])
        while len(tlist) > 1:
            shuffle(tlist)
            merged = list()
            while len(tlist) > 1:
                extract = self._listsort2(tlist.pop(), tlist.pop(), mlearn)
                merged.append(extract[0])
                mlearn.update(extract[1])
            merged.extend(tlist)   # odd one out waits for the next round
            tlist = merged
        return list(self._fieldsort(tlist[0]))

    def preview(self):
        """Pretty-print the current template."""
        from pprint import pprint
        pprint(self.template)

    def create(self, file1, file2):
        """Build the template and the learned rules from two files."""
        tmp, self._rules = self._filesort2(file1, file2)
        self.template = list(self._fieldsort(tmp))

    def refine(self, file):
        """Merge another file into the existing template; does nothing
        when there is no template yet."""
        if self.template:
            tmp = self._mixsort2(file, self.template, self._rules)
            self.template = list(self._fieldsort(tmp[0]))
            # rules are still None when the template came from auto()
            if self._rules is None: self._rules = tmp[1]
            else: self._rules.update(tmp[1])

    def auto(self, path=None):
        """Build the template from all HTML files (see _autosort)."""
        self.template = self._autosort(path)