Commits

Miki Tebeka committed 91c211e

start train on spam

Comments (0)

Files changed (1)

 import numpy as np
 from sklearn import svm
 import re
+from Stemmer import Stemmer
+
+# Shared English stemmer: stem(word) is the bound stemWord method.
+stem = Stemmer('en').stemWord
 
 def plot(xs, ys, clf, title=None):
     data = np.append(xs, ys, 1)
     plot(raw['X'], raw['y'], clf)
 
 
-def find_best(**kw):
-    raw = loadmat('ex6/ex6data3.mat')
+def find_best(raw, **kw):
     values = [.01, .03, .1, .3, 1, 3, 10, 30]
     best, best_score = None, 0
     best_score = 0
 
 def load_voc():
+    """Read 'ex6/vocab.txt' and return a dict mapping word -> integer index.
+
+    Every line must split into exactly two whitespace-separated fields,
+    the index first and the word second; anything else raises ValueError.
+    """
     with open('ex6/vocab.txt') as fo:
-        return np.array([line.split()[1].strip() for line in fo])
+        word_to_index = {}
+        for line in fo:
+            number, word = line.split()
+            word_to_index[word.strip()] = int(number)
+        return word_to_index
 
 
 def normailze(text):
     return text
 
 
+def tokenize(text):
+    """Return a lazy iterator over the stemmed word tokens of `text`.
+
+    The text is passed through normailze(), split on spaces, line breaks
+    and punctuation, reduced to ASCII letters only, and every piece that
+    is still non-empty is run through stem().
+    """
+    pieces = re.split(r'[ @$/#.\-:&*+=\[\]?!(){},\'">_<;%\n\r]',
+                      normailze(text))
+
+    def stems():
+        for piece in pieces:
+            letters = re.sub('[^a-zA-Z]', '', piece)
+            # Pieces that held no letters at all collapse to '' and are dropped.
+            if letters.strip():
+                yield stem(letters)
+
+    return stems()
+
+
+def vectorize(text, voc=None):
+    """Return a binary bag-of-words vector for `text`.
+
+    Every token produced by tokenize() that is found in the vocabulary sets
+    one slot of the result to 1; tokens missing from the vocabulary are
+    ignored.
+
+    voc -- optional dict mapping word -> integer index.  When omitted it is
+           loaded with load_voc(); passing it in avoids re-reading the
+           vocabulary file for every text.
+    """
+    if voc is None:
+        voc = load_voc()
+    vec = np.zeros(len(voc))
+    # The indices come straight from the vocabulary's number column.  Shift
+    # them by the smallest one so that a listing numbered from 1 (or from 0)
+    # lands on slots 0..len(voc)-1 instead of running past the end of vec.
+    base = min(voc.values()) if voc else 0
+    for token in tokenize(text):
+        i = voc.get(token)
+        if i is not None:
+            vec[i - base] = 1
+
+    return vec
+
+
+def train():
+    """Load the spam training set from 'ex6/spamTrain.mat'.
+
+    Work in progress: the feature matrix and the boolean labels are read
+    but nothing is fitted or returned yet.
+    """
+    mat = loadmat('ex6/spamTrain.mat')
+    features = mat['X']
+    # Flatten the label array and turn it into booleans (value > 0).
+    flat_labels = mat['y'].ravel()
+    labels = flat_labels > 0
+
 
 def main(argv=None):
     import sys