Commits

Flávio Coelho committed f9b9eda

corrigido o problema de encoding

Comments (0)

Files changed (1)

 """
 import nltk
 from nltk import ingrams, FreqDist
+import sys
+reload(sys)
+sys.setdefaultencoding('latin-1')
 
 stopwords = nltk.corpus.stopwords.words('portuguese')
 sent_tokenizer=nltk.data.load('tokenizers/punkt/portuguese.pickle')
     um texto de várias maneiras
     """
     def __init__(self, corpus, fid):
+        """
+        Load one document from a corpus reader and pre-compute its sentences.
+
+        corpus: object exposing words(fid), raw(fid) and encoding(fid).
+        fid: identifier of the document inside the corpus; also kept as title.
+        """
+        self.title = fid
         self.words = corpus.words(fid)
         self.raw = corpus.raw(fid)
+        self.enc = corpus.encoding(fid) # text encoding reported by the corpus
         self.sents = self.sent_seg(self.raw)
-        self.fd = None
+        self._fp = None # word-frequency cache
         
     def find_ngrams(self,texto, palavra, n):
         """
         fd = FreqDist(ng for ng in ingrams(texto,n) if palavra in ng )
         return fd
 
-    def frequencia_de_palavras(self):
+    @property
+    def fp(self):
         """
-        Calcula frequencia de palavras no texto
+        Word-frequency distribution of the text, computed once and cached.
+
+        Tokens found in the module-level stopword list are skipped (the check
+        happens before lowercasing); the remaining tokens are decoded from
+        latin-1 and lowercased before being counted.
+
+        Returns the nltk.FreqDist stored in self._fp.
         """
-        if not self.fd:
-            self.fd = nltk.FreqDist(w.lower() for w in self.words() if w not in stopwords)
-        return self.fd
+        # Compare against None rather than truthiness: an empty FreqDist is
+        # falsy and would otherwise be rebuilt on every access.
+        if self._fp is None:
+            self._fp = nltk.FreqDist(w.decode('latin-1').lower() for w in self.words if w not in stopwords)
+        return self._fp
 
     def sent_seg(self,texto):
+        """
+        Split texto into sentences using the module-level Punkt tokenizer
+        loaded from 'tokenizers/punkt/portuguese.pickle'.
+
+        Returns the result of sent_tokenizer.tokenize(texto).
+        """
         return sent_tokenizer.tokenize(texto)
 
+    def termos_relevantes(self, colecao=None):
+        """
+        Score the terms of the document by tf-idf.
+
+        colecao: optional nltk.TextCollection that supplies the document
+            frequencies. When omitted, a collection containing only this
+            document is built; the idf factor is then identical for every
+            term, so the scores carry no cross-document information.
+
+        Returns a dict mapping each term of self.fp to its tf-idf score.
+        """
+        # Normalise tokens the same way fp does so its keys can be counted.
+        tokens = [w.decode('latin-1').lower() for w in self.words]
+        if colecao is None:
+            colecao = nltk.TextCollection([tokens])
+        rel = {}
+        for w in self.fp.iterkeys():
+            # tf_idf is an instance method: it needs a collection instance
+            # and the token sequence of the document, not the raw string.
+            rel[w] = colecao.tf_idf(w, tokens)
+        return rel
+    
     def concordance(self,palavra,contexto=30):
         for sent in self.sents:
             if palavra in sent:
                 right = ' '.join(sent[pos+len(palavra):])
                 print '%s %s %s'%(left[-contexto:], palavra, right[:contexto])
 
+
+    
 if __name__=="__main__":
+    # Demo run over one text of NLTK's `machado` corpus.
     from nltk.corpus import machado
-    import locale
+
     print "numero de textos disponiveis: ",len(machado.fileids())
     PT = ProcessaTexto(machado,'romance/marm05.txt')
+    print "++> Codificação: ",PT.enc
+    # Count the 4-grams of the document's tokens that contain 'olho'.
     ngs = PT.find_ngrams(PT.words, 'olho',4)
     for ng,c in ngs.iteritems():
         print ' '.join(ng), c
+    # NOTE(review): frases is not used in the lines visible here.
     frases = PT.sent_seg(PT.raw)
+    # Print items 10 through 29 of the word-frequency distribution.
+    for w,c in PT.fp.items()[10:30]:
+        print w,c
+    # Show 'Lobo' with up to 30 characters of context on each side.
     PT.concordance('Lobo',30)
-    print locale.getlocale()
-    #T = nltk.Text(machado.words('romance/marm05.txt'))
-    #T.plot(100)
+    # NOTE(review): presumably plots the 100 most frequent tokens - confirm
+    # against nltk.Text.plot.
+    T = nltk.Text(machado.words('romance/marm05.txt'))
+    T.plot(100)