Commits

Lynn Rees committed 7e57475

- more fixes

  • Participants
  • Parent commits a2b3c95

Comments (0)

Files changed (3)

 class Templar:
     
     def __init__(self, a, b):
-        self.linejunk, self.charjunk = None, None        
-        self.a, self.b = a, b    
+        self.linejunk, self.charjunk = None, None
+        self.a, self.b = a, b
+        self.smash = SequenceMatcher(self.linejunk, self.a, self.b)
 
-    def _compare(self, a, b):
-        smash = SequenceMatcher(self.linejunk, a, b)
-        for tag, alo, ahi, blo, bhi in smash.get_opcodes():
+    def _compare(self, a, b):        
+        for tag, alo, ahi, blo, bhi in self.smash.get_opcodes():
             if tag == 'replace': g = self._near(a, alo, ahi, b, blo, bhi)
             elif tag == 'delete': g = self._tagger('-', a, alo, ahi)
             elif tag == 'insert': g = self._tagger('+', b, blo, bhi)
 
     def _near(self, a, alo, ahi, b, blo, bhi):
         best_ratio, cutoff = 0.74, 0.75
-        smash = SequenceMatcher(self.charjunk)
+        sh = SequenceMatcher(self.charjunk)
         eqi, eqj = None, None
         for j in xrange(blo, bhi):
             bj = b[j]
-            smash.set_seq2(bj)
+            sh.set_seq2(bj)
             for i in xrange(alo, ahi):
                 ai = a[i]
                 if ai == bj:
                     if eqi is None: eqi, eqj = i, j
                     continue
-                smash.set_seq1(ai)
-                if smash.real_quick_ratio() > best_ratio and \
-                      smash.quick_ratio() > best_ratio and \
-                      smash.ratio() > best_ratio:
-                    best_ratio, besti, bestj = smash.ratio(), i, j
+                sh.set_seq1(ai)
+                if sh.real_quick_ratio() > best_ratio and \
+                      sh.quick_ratio() > best_ratio and \
+                      sh.ratio() > best_ratio:
+                    best_ratio, besti, bestj = sh.ratio(), i, j
         if best_ratio < cutoff:
             if eqi is None:
                 for line in self._replace(a, alo, ahi, b, blo, bhi): yield line
     def extract(self):
         return list(self._compare(self.a, self.b))
 
+    def ratio(self):
+        return self.smash.ratio()
+
 def filetemplar(a, b):
-    return Templar(htmlutils.htmlines(a), htmlutils.htmlines(b)).extract()
+    tmp = Templar(htmlutils.htmlines(a), htmlutils.htmlines(b))
+    if tmp.ratio() > 0.6: return tmp.extract()
 
 def listemplar(a, b):
-    return Templar(a, b).extract()
+    tmp = Templar(a, b)
+    if tmp.ratio() > 0.6: return tmp.extract()
 
-def sorter(rawtemplate):
+def sorter(rawtemplate, learn=None):
 
     def autofield(line, glist, label):
         glist.append(fragstart % (label, label))
             blist.append(bline)
         return fcount
 
-    fullcount, fcount, template, alist, blist, learn = 0, 1, [], [], [], {}
+    fullcount, fcount, template, alist, blist = 0, 1, [], [], []
     fieldtag, fragend = '<psi:field name="%s" />', '</psi:fragment>'
     fragstart = '<psi:fragment name="%s" class="%s">'
     fieldattr = ' psi:field="%s"'
+    if not learn: learn = dict()
     for i in rawtemplate:
         if i[:1] == ' ': template.append(i[2:])
         elif i[:1] == '-': fcount = manfield('-', alist, fcount)
         fullcount += 1
     return template, alist, blist
 
-def tempsort(file1, file2):
-    return sorter(filetemplar(file1, file2))
+def tempsort(file1, file2, learn=None):
+    return sorter(filetemplar(file1, file2), learn)
+
+def listsort(la, lb, learn=None):
+    return sorter(listemplar(la, lb), learn)
 
 def autotemplar():
     from mimetypes import guess_type
     from random import shuffle
     import os
-    htmls = [i for i in os.listdir(os.getcwd())
-             if guess_type(i)[0] == 'text/html']
+
+    #def temper(rlist):
+    #    tlist = list()
+    #    an, bn = htmls.pop(), htmls.pop()
+    #    temp, aniq, bniq = tempsort(an, bn, mlearn)
+    
+    mlearn, htmls, temps, subs = dict(), [i for i in os.listdir(os.getcwd())
+        if guess_type(i)[0] == 'text/html'], list(), dict()
     shuffle(htmls)
-    return htmls
+   # for i in htmls:
+   #     an, bn = htmls.pop(), htmls.pop()
+   #     temp, aniq, bniq = tempsort(an, bn, mlearn)
+   #     temps.append(temp)
+   #     subs[an] = aniq
+   #     subs[bn] = bniq
+   # for i in temps:
+    return htmls
+        
 import os
 from psiutils import attset, attdel
 from Ft.Xml.Domlette import NonvalidatingReader as nr
-from psiutils import toxmlfile, tags, getname, getClass, stripws, psins
+from psiutils import toxmlfile, tags, getname, getclass, stripws, psins
 
 rsrc, fld, fmnt, cls, nm = u'resource', u'field', u'fragment', u'class', u'name'
 
         fields = [f for f in tags(source, psins, fld)]
         fragments = [f for f in tags(target, psins, fmnt)]
         for fragment in fragments:
-            fclass = getClass(fragment)
+            fclass = getclass(fragment)
             for field in fields:
                 if fclass == getname(field):
                     for child in fragment.childNodes:
     def _expand(self, lo, fragment, end):
         for resource in lo:
             for master in self._templates:
-                if getClass(resource) == getname(master):
+                if getclass(resource) == getname(master):
                     child = master.cloneNode(1)
                     attset(child, nm, getname(resource))
                     attdel(child, cls)
         fields = [f for f in tags(source, psins, fld)]
         fragments = [f for f in tags(target, psins, fmnt)]
         for fragment in fragments:
-            fclass = getClass(fragment)
+            fclass = getclass(fragment)
             for field in fields:
                 if fclass == getname(field):
                     pn = field.parentNode
         dwtemplates = [d.cloneNode(1) for d in self._templates]
         for i in dwtemplates:
             for master in self._templates:
-                if getClass(i) == getname(master):
+                if getclass(i) == getname(master):
                     child = master.cloneNode(1)
                     attset(child, nm, getname(i))
                     attdel(child, cls)
         codeOutsideHTMLIsLocked="false"'''
         tee = 'InstanceEnd'
         instance = child.firstChild
-        ttbe = self._doc.createComment(tbe % getClass(resource))
+        ttbe = self._doc.createComment(tbe % getclass(resource))
         ttee = self._doc.createComment(tee)
         instance.insertBefore(ttbe, instance.firstChild)
         instance.appendChild(ttee)
         if file: self._doc = nr.parseUri(file)
         stripws(self._doc)
         names = [getname(i) for i in tags(self._doc, psins, rsrc)]
-        classes = [getClass(i) for i in tags(self._doc, psins, rsrc)]
+        classes = [getclass(i) for i in tags(self._doc, psins, rsrc)]
         self._unexpanded = [i for i in tags(self._doc, psins, rsrc)
-                            if getClass(i) in names]
+                            if getclass(i) in names]
         self._templates = [i for i in tags(self._doc, psins, rsrc)
                            if getname(i) in classes]
         self._expandTemplatesDW()
         if file: self._doc = nr.parseUri(file)
         stripws(self._doc)
         names = [getname(i) for i in tags(self._doc, psins, rsrc)]
-        classes = [getClass(i) for i in tags(self._doc, psins, rsrc)]
+        classes = [getclass(i) for i in tags(self._doc, psins, rsrc)]
         self._unexpanded = [i for i in tags(self._doc, psins, rsrc)
-                            if getClass(i) in names]
+                            if getclass(i) in names]
         self._templates = [i for i in tags(self._doc, psins, rsrc)
                            if getname(i) in classes]
         self._expandTemplates()
 ## See COPYRIGHT file for license terms.
 
 __title__ = 'spider'
-__version__ = '0.42'
+__version__ = '0.43'
 __author__ = 'L.C. Rees (xanimal@users.sf.net)'
 
 '''This module provides FTP and Web spiders and mirroring utilities in one