CCP2011 / aranha.py

# -*- coding:utf-8 -*-
import networkx as NX
from BeautifulSoup import SoupStrainer, BeautifulSoup as BS
from BeautifulSoup import BeautifulStoneSoup as XS
import sys, urllib2, urllib, re
from urllib2 import HTTPError
from elixir import Integer, PickleType, Entity, Field, create_all, setup_all, metadata, session, Unicode
import pylab as P

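# SQLite database where the crawled articles are persisted by Elixir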
metadata.bind = 'sqlite:///knowdb.sqlite'
metadata.bind.echo = False


class Ideia(Entity):
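    """A crawled article: its name, number of links and the pickled link list."""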
    nome = Field(Unicode(32))
    nlinks = Field(Integer)
    links = Field(PickleType())
    
class Crawler:
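    """
    Crawls Wikipedia from a seed word, saving each article's links to the
    database and building a graph of article-to-article connections.
    """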
    def __init__(self,palavra,lingua,depth):
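        """
        palavra: seed term; lingua: Wikipedia language code (e.g. 'pt');
        depth: maximum crawl depth.
        """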
        self.cria_banco()
        self.SU = self.urlifica(palavra,lingua)
        self.urlatual = self.SU
        self.termoatual = palavra
        self.depth = depth
        self.lingua = lingua
        self.fila = []
        self.curdepth = 0
        self.started = 0
        self.nlinks = 0
        self.history = []
        self.G = NX.Graph()

    def cria_banco(self):
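        """Sets up the Elixir model and creates the tables in the database."""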
        setup_all()
        create_all()
        
    def urlifica(self,palavra,lingua):
        """
        Builds the Wikipedia article URL from the language and word
        """
        self.baseurl = "http://"+lingua+".wikipedia.org/"
        u = self.baseurl + "wiki/" + palavra
        return u
        
    def parsePag(self,urlend):
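        """
        Fetches the raw HTML of a page, sending a browser-like User-Agent
        """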
        self.urlatual = urlend
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        print "Opening ", urlend
        # plain GET request with the custom User-Agent header
        req = urllib2.Request(urlend, headers=headers)
        fd = urllib2.urlopen(req)
        html = fd.read()
        return html
    
    def verResp(self,html):
        '''
        Checks whether the response is a real article or a disambiguation page
        '''
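        # collect the href of every internal article link (paths starting with /wiki/)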
        lnkart = SoupStrainer('a', href=re.compile('^/wiki/*'))
        artlist =  [tag['href'] for tag in BS(html, parseOnlyThese=lnkart)]
        if artlist[0].endswith('Disambig.svg'):
            # disambiguation page: queue the first candidate article link instead
            self.fila.append('http://'+self.lingua+'.wikipedia.org'+artlist[3])
            self.curlinks = artlist
        else:
            self.curlinks = artlist
            Ideia(nome=self.termoatual,nlinks = len(artlist), links = artlist)
            self.G.add_edges_from([(self.termoatual,i.split('/')[-1]) for i in self.curlinks])
            if self.curdepth > self.depth:
                return
            self.fila.extend([self.baseurl[:-1] + i for i in artlist])
            self.curdepth +=1
        
            
    def move(self):
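        """
        Main crawl loop: seeds the queue with the start URL and then pops,
        fetches and parses pages until the queue is empty.
        """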
        if not self.fila:
            if not self.started:
                self.fila.append(self.SU)
        while self.fila:
            self.started = 1
            self.urlatual = self.fila.pop()
            self.termoatual = self.urlatual.split('/')[-1]
            
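            # skip namespaced pages (e.g. "File:", "Category:")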
            if ":" in self.termoatual:
                #~ print "+++",self.termoatual
                continue
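            # skip the main page and articles already visited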
            if self.termoatual in ['Main_page']+self.history:
                continue
            print "buscando ", self.termoatual,
            print "Faltam: ", len(self.fila)
            try:
                html = self.parsePag(self.urlatual)
            except HTTPError:
                print "==> não encontrei ",self.urlatual
                continue
            self.verResp(html)
            self.nlinks +=1
            self.history.append(self.termoatual)
            session.commit()

   
if __name__=="__main__":
    #~ palavra = u'matemática'
    #~ lingua = 'pt'
    #~ Cr = Crawler(palavra,lingua,1)
    Cr = Crawler(sys.argv[1],sys.argv[2],1)
    Cr.move()
    #~ NX.draw(Cr.G)
    #~ P.show()
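    # A minimal sketch of inspecting the crawl result after the run
    # (Cr.nlinks and Cr.G are the attributes defined in Crawler above):
    #~ print "Pages crawled: ", Cr.nlinks
    #~ print "Graph: ", Cr.G.number_of_nodes(), "nodes,", Cr.G.number_of_edges(), "edges"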