CCP2011 /

# -*- coding:utf-8 -*-
"""
Este módulo contém uma classe que implementa algumas das análises
descritas em:
"""

import nltk
from nltk import ingrams, FreqDist
import sys

# Portuguese stop-word list, used to filter tokens when counting frequencies.
stopwords = nltk.corpus.stopwords.words('portuguese')
# Punkt sentence tokenizer for Portuguese; ProcessaTexto.sent_seg relies on
# this module-level name.
sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')

class ProcessaTexto:
    """Analyse one text of a corpus in several ways.

    On construction the tokens, raw contents and encoding of the text are
    read from the corpus, and the raw text is split into sentences.
    """

    def __init__(self, corpus, fid):
        """Load the text ``fid`` from ``corpus``.

        Args:
            corpus: object providing ``words(fid)``, ``raw(fid)`` and
                ``encoding(fid)``.
            fid: identifier of the text inside ``corpus``.
        """
        self.title = fid
        self.words = corpus.words(fid)
        self.raw = corpus.raw(fid)
        self.enc = corpus.encoding(fid)  # encoding reported by the corpus
        self.sents = self.sent_seg(self.raw)
        self._fp = None  # word-frequency cache, filled on first fp() call
        self._corpus = corpus  # kept so tf-idf can look at the whole corpus
        self._colecao = None  # TextCollection cache for termos_relevantes()

    def find_ngrams(self, texto, palavra, n):
        """Return a FreqDist of the n-grams of ``texto`` containing ``palavra``.

        Args:
            texto: sequence of tokens to scan.
            palavra: token that must occur in a counted n-gram.
            n: size of the n-grams.
        """
        return FreqDist(ng for ng in ingrams(texto, n) if palavra in ng)

    def _tokens_normalizados(self):
        """Yield the lower-cased tokens of the text, skipping stop words."""
        for w in self.words:
            # Only byte strings need decoding; text tokens are used as-is.
            if isinstance(w, bytes):
                w = w.decode('latin-1')
            w = w.lower()
            # Filter after lower-casing so capitalised stop words are
            # dropped as well.
            if w not in stopwords:
                yield w

    def fp(self):
        """Return the word-frequency distribution of the text.

        The distribution is computed once and then served from the
        ``_fp`` cache.
        """
        if self._fp is None:
            self._fp = nltk.FreqDist(self._tokens_normalizados())
        return self._fp

    def sent_seg(self, texto):
        """Split ``texto`` into sentences using the module-level ``sent_tokenizer``."""
        return sent_tokenizer.tokenize(texto)

    def termos_relevantes(self):
        """Return a dict mapping each term of ``fp()`` to its tf-idf score.

        Scores come from an ``nltk.TextCollection`` built over the corpus
        given to the constructor; the collection is built once per instance.
        """
        if self._colecao is None:
            # tf_idf is an instance method: it needs a collection of texts
            # to derive the inverse document frequency from.
            self._colecao = nltk.TextCollection(self._corpus)
        colecao = self._colecao
        # NOTE(review): the terms are lower-cased while the corpus tokens keep
        # their original case, so a term that only occurs capitalised is
        # presumably scored 0 -- confirm whether that is acceptable.
        return dict((w, colecao.tf_idf(w, self.words)) for w in self.fp())

    def concordance(self, palavra, contexto=30):
        """Print the first occurrence of ``palavra`` in each sentence.

        Each match is printed with up to ``contexto`` characters of the
        sentence on either side.

        Args:
            palavra: substring to look for.
            contexto: maximum number of context characters per side.
        """
        for sent in self.sents:
            pos = sent.find(palavra)
            if pos < 0:
                continue
            # Sentences are strings, so plain slices are the context;
            # joining them with ' ' would put a space between every character.
            left = sent[:pos]
            right = sent[pos + len(palavra):]
            # An explicit start index keeps contexto=0 from selecting the
            # whole left side (left[-0:] is the full string).
            inicio = max(0, len(left) - contexto)
            print('%s %s %s' % (left[inicio:], palavra, right[:contexto]))

if __name__ == "__main__":
    # Demo: run a few analyses on one novel of the Machado de Assis corpus.
    from nltk.corpus import machado

    # Single-argument print(...) calls give the same output on Python 2 and 3.
    print("numero de textos disponiveis:  %s" % len(machado.fileids()))
    PT = ProcessaTexto(machado, 'romance/marm05.txt')
    print("++> Codificação:  %s" % str(PT.enc))

    # 4-grams that contain the token 'olho', with their counts.
    ngs = PT.find_ngrams(PT.words, 'olho', 4)
    for ng, c in ngs.items():
        print('%s %s' % (' '.join(ng), c))

    frases = PT.sent_seg(PT.raw)
    print(frases[10])

    # fp is a method, so it has to be called before its items can be listed.
    for w, c in list(PT.fp().items())[10:30]:
        print('%s %s' % (w, c))

    # Not used below; left available for interactive exploration.
    T = nltk.Text(machado.words('romance/marm05.txt'))
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.