Commits

Flávio Coelho committed 55cded6

corrigi alguns bugs na aranha

Comments (0)

Files changed (1)

 from BeautifulSoup import SoupStrainer, BeautifulSoup as BS
 from BeautifulSoup import BeautifulStoneSoup as XS
 import sys, os, urllib2, urllib, re
+from urllib2 import HTTPError
 from elixir import  Date, Integer,PickleType, Entity, Field, create_all, setup_all, metadata, session, Unicode, String
 import pylab as P
 
     nome = Field(Unicode(32))
     nlinks =  Field(Integer)
     links = Field(PickleType())
-    ender = Field(String(256,convert_unicode='force'))
     
 class Crawler:
     def __init__(self,palavra,lingua,depth):
         Cria URL da wikipedia a partir de lingua e palavra
         """
         nomeatual = palavra
-        self.baseurl = "http://"+lingua+".wikipedia.org/wiki/"
+        self.baseurl = "http://"+lingua+".wikipedia.org/"
         u = "http://"+lingua+".wikipedia.org/wiki/"+palavra
 
         return u
             self.curlinks = artlist
         else:
             self.curlinks = artlist
-            Ideia(nome=self.termoatual,nlinks = len(artlist), links = artlist,ender = self.urlatual)
+            Ideia(nome=self.termoatual,nlinks = len(artlist), links = artlist)
             self.G.add_edges_from([(self.termoatual,i.split('/')[-1]) for i in self.curlinks])
             if self.curdepth > self.depth:
                 return
-            self.fila.extend([self.baseurl + i for i in artlist])
+            self.fila.extend([self.baseurl[:-1] + i for i in artlist])
             self.curdepth +=1
-        #~ session.commit()
+        
             
     def move(self):
         if not self.fila:
                 self.fila.append(self.SU)
         while self.fila:
             self.started = 1
-            self.urlatual = self.fila.pop(0)
+            self.urlatual = self.fila.pop()
             self.termoatual = self.urlatual.split('/')[-1]
-            if ":" in self.termoatual: continue
-            if self.termoatual in ['Main_page']+self.history:continue
+            
+            if ":" in self.termoatual:
+                print "+++",self.termoatual
+                continue
+            if self.termoatual in ['Main_page']+self.history:
+                continue
             print "buscando ", self.termoatual,
             print "Faltam: ", len(self.fila)
             try:
                 html = self.parsePag(self.urlatual)
-            except:
+            except HTTPError:
+                print "==> não encontrei ",self.urlatual
                 continue
             self.verResp(html)
             self.nlinks +=1
             self.history.append(self.termoatual)
+            session.commit()
 
    
 if __name__=="__main__":
     palavra = u'matemática'
     lingua = 'pt'
     #~ Cr = Crawler(palavra,lingua,1)
-    Cr = Crawler(sys.argv[1],sys.argv[2],1)
+    Cr = Crawler(sys.argv[1],sys.argv[2],2)
     Cr.move()
-    NX.draw(Cr.G)
-    P.show()
+    #~ NX.draw(Cr.G)
+    #~ P.show()