Commits

Timo Sulg committed 97f86b7

Added Chapter 7 example.
Actually, that's quite a pointless example - I mean, this example
is not taken from real life and the data is just taken from a dummy data generator.
But this example works correctly and shows how elegant and powerful
the scikit.learn module is. Even simpler and shorter code than the book's Java example.

Comments (0)

Files changed (5)

Chapter7/data/class.csv

+#logins
+5
+2
+3
+4

Chapter7/data/result.csv

+3.000000000000000000e+00
+2.000000000000000000e+00
+3.000000000000000000e+00
+5.000000000000000000e+00

Chapter7/data/test.csv

+#this file includes new data
+#userid,age,gender
+5,60,2
+6,30,1
+7,40,2
+8,15,1

Chapter7/data/training.csv

+#userid,age,gender,logins
+1,20,2,5
+2,30,1,2
+3,40,2,3
+4,35,1,4

Chapter7/predict_logins.py

+'''
+Chapter 7: Data mining
+
+
+This is a solution for the book example using the scikit-learn module.
+
+This example shows how to build a simple predictive model
+which predicts how many logins a new user will make.
+This is actually a totally pointless example, just to show how data mining works.
+But if you are looking for better examples, see the scikits.learn homepage.
+For more examples:
+    http://scikit-learn.sourceforge.net/auto_examples/index.html
+'''
+import os
+import sys
+import numpy as np
+
+class PredictLogins(object):
+    '''
+    Easy example how to use Scikits.learn to solve textbook demo.
+    input file have to have certain format:
+        * commasepareted
+        * on first line is header
+        * # is comment-line
+
+    usage:
+        >>> predictor = PredictLogins("training.csv", "class.csv", "test.csv")
+        >>> predictor.execute(output) #all steps and print result to output
+    '''
+    def __init__(self, training_file, class_file, test_file, classifier = "svm"):
+        ''' '''
+        self.training_file = training_file
+        self.class_file = class_file
+        self.test_file = test_file
+
+        #some state values
+        self.is_fitted = False #is model trained
+        self.is_tested = False #is model validated and trained
+        #initialize classifier
+        if classifier == "svm":
+            from scikits.learn import svm
+            self.clf = svm.SVC()  #classifier
+
+    def data_from_csv(self, filename):
+        '''reads text file and transforms it to array '''
+        full_path = os.path.abspath(os.getcwd()) + "/" + filename
+        try:
+            nparray = np.genfromtxt(full_path, dtype = None,
+                names = None, delimiter = ",", comments = "#"  )
+        except IOError, e:
+            sys.stderr.write("Cant open:\"%s.\"\n %s"%(fullpath, e.msg))
+
+        return nparray
+
+    def fit(self):
+        #build predictive model
+        training_data = self.data_from_csv(self.training_file)
+        classes = self.data_from_csv(self.class_file)
+        print training_data, '\n', classes
+        self.clf.fit(training_data, classes)
+        self.is_fitted = True
+#TODO: finish it
+    def score(self):
+        #evaluate quality and print stats
+        self.is_tested = True
+        pass
+
+    def predict(self, output):
+        #predict the number of logins for a new user
+        test_data = self.data_from_csv(self.test_file)
+        result = self.clf.predict(test_data)
+        print("Training results:\n", result)
+        np.savetxt(output, result, delimiter = ',')
+
+
+    def execute(self, output):
+        '''runs whole model '''
+        print "train models"
+        self.fit()
+        self.score()
+        print "test new data"
+        self.predict(output)
+        print "job done - look results.txt"
+
+if __name__ == "__main__":
+
+    if len(sys.argv) > 4 :
+        training_file, class_file, test_file, output_file = sys.argv[1:5]
+    else:
+        training_file = "data/training.csv"
+        class_file = "data/class.csv"
+        test_file = "data/test.csv"
+        output_file = "data/result.csv"
+    predictor = PredictLogins(training_file, class_file, test_file)
+    predictor.execute(output_file)
+