# Commits

committed f9b9eda

corrigido o problema de encoding

• Participants
• Parent commits 0039701

# nlp1.py

"""
import nltk
from nltk import ingrams, FreqDist
+import sys
+sys.setdefaultencoding('latin-1')

stopwords = nltk.corpus.stopwords.words('portuguese')
um texto de várias maneiras
"""
def __init__(self, corpus, fid):
+        self.title = fid
self.words = corpus.words(fid)
self.raw = corpus.raw(fid)
+        self.enc = corpus.encoding(fid) #codificação do texto
self.sents = self.sent_seg(self.raw)
-        self.fd = None
+        self._fp = None #cache da frequencia de palavras

def find_ngrams(self,texto, palavra, n):
"""
fd = FreqDist(ng for ng in ingrams(texto,n) if palavra in ng )
return fd

-    def frequencia_de_palavras(self):
+    @property
+    def fp(self):
"""
Calcula a frequência de palavras no texto
"""
-        if not self.fd:
-            self.fd = nltk.FreqDist(w.lower() for w in self.words() if w not in stopwords)
-        return self.fd
+        if not self._fp:
+            self._fd = nltk.FreqDist(w.decode('latin-1').lower() for w in self.words if w not in stopwords)
+        return self._fd

def sent_seg(self,texto):
return sent_tokenizer.tokenize(texto)

+    def termos_relevantes(self):
+        """
+        Calcula a relevância dos termos do documento baseada no score tf-idf
+        """
+        rel = {}
+        for w in self.fp.iterkeys():
+            rel[w] = nltk.TextCollection.tf_idf(w,self.raw)
+        return rel
+
def concordance(self,palavra,contexto=30):
for sent in self.sents:
if palavra in sent:
right = ' '.join(sent[pos+len(palavra):])
print '%s %s %s'%(left[-contexto:], palavra, right[:contexto])

+
+
if __name__=="__main__":
-    import locale
+
print "numero de textos disponiveis: ",len(machado.fileids())