# CCP2011 / nlp1.py
# -*- coding:utf-8 -*-
"""
Este módulo contém uma classe que implementa algumas das análises
descritas em: http://nltk.googlecode.com/svn/trunk/doc/howto/portuguese_en.html

"""
import nltk
from nltk import ingrams, FreqDist
import sys
# Python 2 only: setdefaultencoding is removed from the sys module at
# interpreter start-up, so the module is reloaded to get it back. The call
# below changes the process-wide codec used for implicit str <-> unicode
# conversions to latin-1.
# NOTE(review): ProcessaTexto.fp calls .decode('latin-1') on every token;
# presumably this default is what keeps that working when the tokens are
# already unicode -- confirm before removing.
reload(sys)
sys.setdefaultencoding('latin-1')

# Portuguese stopword list from the NLTK stopwords corpus (used by fp).
stopwords = nltk.corpus.stopwords.words('portuguese')
# Portuguese Punkt sentence tokenizer, loaded once and shared by sent_seg.
sent_tokenizer=nltk.data.load('tokenizers/punkt/portuguese.pickle')

class ProcessaTexto:
    """
    Analyse one text (file id) of an NLTK corpus in several ways.

    Attributes set by the constructor:
      title  -- the file id passed in
      corpus -- the corpus reader the text was taken from
      words  -- ``corpus.words(fid)``
      raw    -- ``corpus.raw(fid)``
      enc    -- ``corpus.encoding(fid)``
      sents  -- ``raw`` split into sentence strings by ``sent_seg``
    """
    def __init__(self, corpus, fid):
        self.title = fid
        self.corpus = corpus  # kept so termos_relevantes can compute idf
        self.words = corpus.words(fid)
        self.raw = corpus.raw(fid)
        self.enc = corpus.encoding(fid)  # encoding reported by the corpus
        self.sents = self.sent_seg(self.raw)
        self._fp = None  # word-frequency cache, filled on first use of fp

    def find_ngrams(self, texto, palavra, n):
        """
        Return a FreqDist of the n-grams of ``texto`` that contain ``palavra``.

        texto   -- sequence of tokens to take the n-grams from
        palavra -- token that must be one of the items of the n-gram
        n       -- size of the n-grams
        """
        return FreqDist(ng for ng in ingrams(texto, n) if palavra in ng)

    @property
    def fp(self):
        """
        Word-frequency distribution of the text, computed once and cached.

        Tokens found in the module-level ``stopwords`` list are skipped; the
        test is made on the token as it appears in the text, before it is
        lower-cased. The remaining tokens are decoded as latin-1 and
        lower-cased before being counted.
        """
        # Compare against None (not truthiness) and store into the same
        # attribute that is tested, so the distribution is built only once.
        if self._fp is None:
            self._fp = nltk.FreqDist(
                w.decode('latin-1').lower()
                for w in self.words if w not in stopwords)
        return self._fp

    def sent_seg(self, texto):
        """
        Split ``texto`` into a list of sentences with the module-level
        ``sent_tokenizer``.
        """
        return sent_tokenizer.tokenize(texto)

    def termos_relevantes(self):
        """
        Return a dict mapping every term of ``fp`` to its tf-idf score.

        The term frequency is taken over this text's tokens, normalised the
        same way as in ``fp`` (latin-1 decoded, lower-cased); the inverse
        document frequency is taken over all the texts of the corpus this
        object was created from.

        NOTE(review): the idf step looks each term up in every text of the
        corpus, so this is expected to be slow on a large corpus.
        """
        # tf_idf is an instance method: it needs a TextCollection that knows
        # the set of documents, it cannot be called on the class itself.
        colecao = nltk.TextCollection(self.corpus)
        tokens = [w.decode('latin-1').lower() for w in self.words]
        rel = {}
        for w in self.fp:
            rel[w] = colecao.tf_idf(w, tokens)
        return rel

    def concordance(self, palavra, contexto=30):
        """
        Print one line for every sentence that contains ``palavra``.

        Each line shows the first occurrence of ``palavra`` in the sentence
        with up to ``contexto`` characters of the sentence on either side.
        The match is a plain substring search, so ``palavra`` is also found
        inside longer words.
        """
        for sent in self.sents:
            pos = sent.find(palavra)
            if pos < 0:
                continue
            # Sentences are strings: slice them directly. Joining the slices
            # with ' ' would put a space between every character.
            left = sent[:pos]
            right = sent[pos + len(palavra):]
            # left[-0:] would be the whole string, so handle 0 explicitly.
            left = left[-contexto:] if contexto > 0 else ''
            right = right[:contexto] if contexto > 0 else ''
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('%s %s %s' % (left, palavra, right))


    
if __name__=="__main__":
    from nltk.corpus import machado

    print "numero de textos disponiveis: ",len(machado.fileids())
    PT = ProcessaTexto(machado,'romance/marm05.txt')
    print "++> Codificação: ",PT.enc
    ngs = PT.find_ngrams(PT.words, 'olho',4)
    for ng,c in ngs.iteritems():
        print ' '.join(ng), c
    frases = PT.sent_seg(PT.raw)
    print frases[10]
    for w,c in PT.fp.items()[10:30]:
        print w,c
    PT.concordance('Lobo',30)
    T = nltk.Text(machado.words('romance/marm05.txt'))
    T.plot(100)
    
# Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
# Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
# Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
# Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
# Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
# Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
# Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.