Commits

Blaz Zupan committed c22077a

new tutorial


Files changed (29)

docs/tutorial/rst/code/classification-classifier1.py

+import Orange
+
+data = Orange.data.Table("voting")
+classifier = Orange.classification.bayes.NaiveLearner(data)
+for d in data[:5]:
+    c = classifier(d)
+    print "%10s; originally %s" % (classifier(d), d.getclass())

docs/tutorial/rst/code/classification-classifier2.py

+import Orange
+
+data = Orange.data.Table("voting")
+classifier = Orange.classification.bayes.NaiveLearner(data)
+target = 1
+print "Probabilities for %s:" % data.domain.class_var.values[target]
+for d in data[:5]:
+    ps = classifier(d, Orange.classification.Classifier.GetProbabilities)
+    print "%5.3f; originally %s" % (ps[target], d.getclass())

docs/tutorial/rst/code/classification-cv.py

+import Orange
+
+data = Orange.data.Table("voting")
+bayes = Orange.classification.bayes.NaiveLearner()
+res = Orange.evaluation.testing.cross_validation([bayes], data, folds=5)
+print "Accuracy: %.2f" % Orange.evaluation.scoring.CA(res)[0]
+print "AUC:      %.2f" % Orange.evaluation.scoring.AUC(res)[0]

docs/tutorial/rst/code/classification-cv2.py

+import Orange
+
+data = Orange.data.Table("voting")
+
+tree = Orange.classification.tree.TreeLearner(sameMajorityPruning=1, mForPruning=2)
+tree.name = "tree"
+nbc = Orange.classification.bayes.NaiveLearner()
+nbc.name = "nbc"
+lr = Orange.classification.logreg.LogRegLearner()
+lr.name = "lr"
+
+learners = [nbc, tree, lr]
+print " "*9 + " ".join("%-4s" % learner.name for learner in learners)
+res = Orange.evaluation.testing.cross_validation(learners, data, folds=5)
+print "Accuracy %s" % " ".join("%.2f" % s for s in Orange.evaluation.scoring.CA(res))
+print "AUC      %s" % " ".join("%.2f" % s for s in Orange.evaluation.scoring.AUC(res))

docs/tutorial/rst/code/classification-models.py

+import Orange
+
+data = Orange.data.Table("titanic")
+lr = Orange.classification.logreg.LogRegLearner(data)
+print Orange.classification.logreg.dump(lr)
+
+tree = Orange.classification.tree.TreeLearner(data)
+print tree.to_string()

docs/tutorial/rst/code/classification-other.py

+import Orange
+import random
+
+data = Orange.data.Table("housing")
+test = Orange.data.Table(random.sample(data, 5))
+train = Orange.data.Table([d for d in data if d not in test])
+
+tree = Orange.classification.tree.TreeLearner(train, same_majority_pruning=1, m_pruning=2)
+tree.name = "tree"
+knn = Orange.classification.knn.kNNLearner(train, k=21)
+knn.name = "k-NN"
+lr = Orange.classification.logreg.LogRegLearner(train)
+lr.name = "lr"
+
+classifiers = [tree, knn, lr]
+
+target = 0
+print "Probabilities for %s:" % data.domain.class_var.values[target]
+print "original class ",
+print " ".join("%-9s" % l.name for l in classifiers)
+
+return_type = Orange.classification.Classifier.GetProbabilities
+for d in test:
+    print "%-15s" % (d.getclass()),
+    print "     ".join("%5.3f" % c(d, return_type)[target] for c in classifiers)

docs/tutorial/rst/code/data-domain1.py

+import Orange
+
+data = Orange.data.Table("imports-85.tab")
+m = len(data.domain.features)
+m_cont = sum(1 for x in data.domain.features if x.var_type==Orange.feature.Type.Continuous)
+m_disc = sum(1 for x in data.domain.features if x.var_type==Orange.feature.Type.Discrete)
+print "%d features, %d continuous and %d discrete" % (m, m_cont, m_disc)
+
+print "First three features:"
+for i in range(3):
+    print "   ", data.domain.features[i].name
+
+print "First three features (again):"
+for x in data.domain.features[:3]:
+    print "   ", x.name
+
+print "Class:", data.domain.class_var.name

docs/tutorial/rst/code/data-domain2.py

+import Orange
+
+data = Orange.data.Table("imports-85.tab")
+
+print "Name of the first feature:", data.domain[0].name
+name = 'fuel-type'
+print "Values of feature '%s'" % name,
+print data.domain[name].values

docs/tutorial/rst/code/data-featureselection.py

+import Orange
+
+data = Orange.data.Table("iris.tab")
+new_domain = Orange.data.Domain(data.domain.features[:2] + [data.domain.class_var])
+new_data = Orange.data.Table(new_domain, data)
+
+print data[0]
+print new_data[0]

docs/tutorial/rst/code/data-instances1.py

+import Orange
+
+data = Orange.data.Table("iris")
+print "First three data instances:"
+for d in data[:3]:
+    print d
+
+print "25-th data instance:"
+print data[26]
+
+name = "sepal width"
+print "Value of '%s' for the first instance:" % name, data[0][name]
+print "The 3rd value of the 25th data instance:", data[26][2]

docs/tutorial/rst/code/data-instances2.py

+import Orange
+
+average = lambda xs: sum(xs)/float(len(xs))
+
+data = Orange.data.Table("iris")
+print "%-15s %s" % ("Feature", "Mean")
+for x in data.domain.features:
+    print "%-15s %.2f" % (x.name, average([d[x] for d in data]))

docs/tutorial/rst/code/data-instances3.py

+import Orange
+
+average = lambda xs: sum(xs)/float(len(xs))
+
+data = Orange.data.Table("iris")
+targets = data.domain.class_var.values
+print "%-15s %s" % ("Feature", " ".join("%15s" % c for c in targets))
+for x in data.domain.features:
+    dist = ["%15.2f" % average([d[x] for d in data if d.get_class()==c]) for c in targets]
+    print "%-15s" % x.name, " ".join(dist)

docs/tutorial/rst/code/data-instances4.py

+import Orange
+from collections import Counter
+
+data = Orange.data.Table("lenses")
+print Counter(str(d.get_class()) for d in data)

docs/tutorial/rst/code/data-lenses.py

+import Orange
+data = Orange.data.Table("lenses")
+print "Attributes:", ", ".join(x.name for x in data.domain.features)
+print "Class:", data.domain.class_var.name
+print "Data instances", len(data)
+
+target = "soft"
+print "Data instances with %s prescriptions:" % target
+for d in data:
+    if d.get_class() == target:
+        print " ".join(["%-15s" % str(v) for v in d])
+

docs/tutorial/rst/code/data-missing.py

+import Orange
+
+data = Orange.data.Table("voting.tab")
+for x in data.domain.features:
+    n_miss = sum(1 for d in data if d[x].is_special())
+    print "%4.1f%% %s" % (100.*n_miss/len(data), x.name)

docs/tutorial/rst/code/data-save.py

+import Orange
+data = Orange.data.Table("lenses")
+print "N1=%d" % len(data)
+new_data = Orange.data.Table([d for d in data if d["prescription"]=="myope"])
+print "N2=%d" %len(new_data)
+new_data.save("lenses-subset.tab")

docs/tutorial/rst/code/data-subsetting.py

+import Orange
+
+data = Orange.data.Table("iris.tab")
+new_data = Orange.data.Table([d for d in data if d["petal length"]>3.0])
+print "Subsetting from %d to %d instances." % (len(data), len(new_data))

docs/tutorial/rst/code/ensemble-bagging.py

+import Orange
+
+data = Orange.data.Table("promoters")
+
+tree = Orange.classification.tree.TreeLearner(m_pruning=2, name="tree")
+boost = Orange.ensemble.boosting.BoostedLearner(tree, name="boost")
+bagg = Orange.ensemble.bagging.BaggedLearner(tree, name="bagg")
+
+learners = [tree, boost, bagg]
+results = Orange.evaluation.testing.cross_validation(learners, data, folds=10)
+for l, s in zip(learners, Orange.evaluation.scoring.AUC(results)):
+    print "%5s: %.2f" % (l.name, s)

docs/tutorial/rst/code/ensemble-cmd.py

+import Orange
+data = Orange.data.Table("housing")
+tree = Orange.classification.tree.TreeLearner()
+# btree = Orange.ensemble.boosting.BoostedLearner(tree)
+btree = Orange.ensemble.bagging.BaggedLearner(tree)
+model = btree(data)
+print model(data[0])

docs/tutorial/rst/code/ensemble-forest.py

+import Orange
+
+data = Orange.data.Table("promoters")
+
+bayes = Orange.classification.bayes.NaiveLearner(name="bayes")
+knn = Orange.classification.knn.kNNLearner(name="knn")
+forest = Orange.ensemble.forest.RandomForestLearner(name="forest")
+
+learners = [forest, bayes, knn]
+res = Orange.evaluation.testing.cross_validation(learners, data, 5)
+print "\n".join(["%6s: %5.3f" % (l.name, r) for r, l in zip(Orange.evaluation.scoring.AUC(res), learners)])

docs/tutorial/rst/code/ensemble-stacking.py

+import Orange
+
+data = Orange.data.Table("promoters")
+
+bayes = Orange.classification.bayes.NaiveLearner(name="bayes")
+tree = Orange.classification.tree.SimpleTreeLearner(name="tree")
+knn = Orange.classification.knn.kNNLearner(name="knn")
+
+base_learners = [bayes, tree, knn]
+stack = Orange.ensemble.stacking.StackedClassificationLearner(base_learners)
+
+learners = [stack, bayes, tree, knn]
+res = Orange.evaluation.testing.cross_validation(learners, data, 10)
+print "\n".join(["%8s: %5.3f" % (l.name, r) for r, l in zip(Orange.evaluation.scoring.AUC(res), learners)])

docs/tutorial/rst/code/py-score-features.py

+import Orange
+
+data = Orange.data.Table("promoters")
+gain = Orange.feature.scoring.InfoGain()
+best = [f for _, f in sorted((gain(x, data), x) for x in data.domain.features)[-5:]]
+print "Features:", len(data.domain.features)
+print "Best ones:", ", ".join([x.name for x in best])

docs/tutorial/rst/code/py-small.py

+import Orange
+
+class SmallLearner(Orange.classification.PyLearner):
+    def __init__(self, base_learner=Orange.classification.bayes.NaiveLearner,
+                 name='small', m=5):
+        self.name = name
+        self.m   = m
+        self.base_learner = base_learner
+
+    def __call__(self, data, weight=None):
+        gain = Orange.feature.scoring.InfoGain()
+        m = min(self.m, len(data.domain.features))
+        best = [f for _, f in sorted((gain(x, data), x) for x in data.domain.features)[-m:]]
+        domain = Orange.data.Domain(best + [data.domain.class_var])
+
+        model = self.base_learner(Orange.data.Table(domain, data), weight)
+        return Orange.classification.PyClassifier(classifier=model, name=self.name)
+
+class OptimizedSmallLearner(Orange.classification.PyLearner):
+    def __init__(self, name="opt_small", ms=range(1,30,3)):
+        self.ms = ms
+        self.name = name
+
+    def __call__(self, data, weight=None):
+        scores = []
+        for m in self.ms:
+            res = Orange.evaluation.testing.cross_validation([SmallLearner(m=m)], data, folds=5)
+            scores.append((Orange.evaluation.scoring.AUC(res)[0], m))
+        _, best_m = max(scores)
+
+        return SmallLearner(data, m=best_m)
+
+data = Orange.data.Table("promoters")
+s_learner = SmallLearner(m=3)
+classifier = s_learner(data)
+print classifier(data[20])
+print classifier(data[20], Orange.classification.Classifier.GetProbabilities)
+
+nbc = Orange.classification.bayes.NaiveLearner(name="nbc")
+s_learner = SmallLearner(m=3)
+o_learner = OptimizedSmallLearner()
+
+learners = [o_learner, s_learner, nbc]
+res = Orange.evaluation.testing.cross_validation(learners, data, folds=10)
+print ", ".join("%s: %.3f" % (l.name, s) for l, s in zip(learners, Orange.evaluation.scoring.AUC(res)))
+

docs/tutorial/rst/code/regression-cv.py

+import Orange
+
+data = Orange.data.Table("housing.tab")
+
+lin = Orange.regression.linear.LinearRegressionLearner()
+lin.name = "lin"
+earth = Orange.regression.earth.EarthLearner()
+earth.name = "mars"
+tree = Orange.regression.tree.TreeLearner(m_pruning=2)
+tree.name = "tree"
+
+learners = [lin, earth, tree]
+
+res = Orange.evaluation.testing.cross_validation(learners, data, folds=5)
+rmse = Orange.evaluation.scoring.RMSE(res)
+
+print "Learner  RMSE"
+for learner, score in zip(learners, rmse):
+    print "%-7s %5.2f" % (learner.name, score)

docs/tutorial/rst/code/regression-other.py

+import Orange
+import random
+
+data = Orange.data.Table("housing")
+test = Orange.data.Table(random.sample(data, 5))
+train = Orange.data.Table([d for d in data if d not in test])
+
+lin = Orange.regression.linear.LinearRegressionLearner(train)
+lin.name = "lin"
+earth = Orange.regression.earth.EarthLearner(train)
+earth.name = "mars"
+tree = Orange.regression.tree.TreeLearner(train)
+tree.name = "tree"
+
+models = [lin, earth, tree]
+
+print "y    " + " ".join("%-4s" % l.name for l in models)
+for d in test[:3]:
+    print "%.1f" % (d.get_class()),
+    print " ".join("%4.1f" % model(d) for model in models)

docs/tutorial/rst/code/regression-tree.py

+import Orange
+
+data = Orange.data.Table("housing.tab")
+tree = Orange.regression.tree.TreeLearner(data, m_pruning=2., min_instances=20)
+print tree.to_string()

docs/tutorial/rst/code/regression.py

+import Orange
+
+data = Orange.data.Table("housing")
+learner = Orange.regression.linear.LinearRegressionLearner()
+model = learner(data)
+
+print "pred obs"
+for d in data[:3]:
+    print "%.1f %.1f" % (model(d), d.get_class())

docs/tutorial/rst/data.rst

+The Data
+========
+
+.. index: data
+
+This section describes how to load and save the data. We also show how to explore the data, its domain description, how to report on basic data set statistics, and how to sample the data.
+
+Data Input
+----------
+
+.. index:: 
+   single: data; input
+
+Orange can read files in its native format and in several other data formats. In the native format, the first line lists the feature (attribute) names, the second line gives their types (continuous, discrete, or string), and the third line contains meta information that marks the dependent feature (class), irrelevant features (ignore), or meta features (meta). Here are the first few lines from a data set :download:`lenses.tab <code/lenses.tab>` on prescription of eye
+lenses [CJ1987]_::
+
+   age       prescription  astigmatic    tear_rate     lenses
+   discrete  discrete      discrete      discrete      discrete 
+                                                       class
+   young     myope         no            reduced       none
+   young     myope         no            normal        soft
+   young     myope         yes           reduced       none
+   young     myope         yes           normal        hard
+   young     hypermetrope  no            reduced       none
+
+
+Values are tab-delimited. The data set has four attributes (age of the patient, spectacle prescription, presence of astigmatism, and tear production rate) and an associated three-valued dependent variable encoding the lens prescription for the patient (hard contact lenses, soft contact lenses, no lenses). Feature descriptions can be shortened to a single letter, so the header of this data set could also read::
+
+   age       prescription  astigmatic    tear_rate     lenses
+   d         d             d             d             d 
+                                                       c
+
+The rest of the table gives the data. Note that there are 5
+instances in our table above (check the original file to see the
+others). Orange is rather liberal about attribute and value names,
+so they need not all start with a letter as in our example.
+
+You may download :download:`lenses.tab <code/lenses.tab>` to a target directory and open a Python shell there. Alternatively, just execute the code below; this particular data set comes with the Orange installation, and Orange knows where to find it:
+
+    >>> import Orange
+    >>> data = Orange.data.Table("lenses")
+    >>>
+
+Note that no suffix is needed for the file name; Orange checks whether any file of a readable type in the current directory matches it. The call to ``Orange.data.Table`` creates an object called ``data`` that holds your data set and information about the lenses domain:
+
+>>> print data.domain.features
+<Orange.feature.Discrete 'age', Orange.feature.Discrete 'prescription', Orange.feature.Discrete 'astigmatic', Orange.feature.Discrete 'tear_rate'>
+>>> print data.domain.class_var
+Orange.feature.Discrete 'lenses'
+>>> for d in data[:3]:
+...     print d
+...
+['young', 'myope', 'no', 'reduced', 'none']
+['young', 'myope', 'no', 'normal', 'soft']
+['young', 'myope', 'yes', 'reduced', 'none']
+>>>
+
+The following script wraps up everything we have done so far and lists the data instances with a ``soft`` prescription:
+
+.. literalinclude:: code/data-lenses.py
+
+Note that ``data`` is an object that holds both the data and the information on the domain. We showed above how to access attribute and class names, but there is much more information there, including feature types and, for categorical features, the sets of their values.
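+
+For example, a minimal sketch (reusing the lenses ``data`` object loaded above, where all features are categorical) that lists the set of values of each feature::
+
+   for x in data.domain.features:
+       print "%-15s %s" % (x.name, x.values)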
+
+Saving the Data
+---------------
+
+Data objects can be saved to a file:
+
+>>> data.save("new_data.tab")
+>>>
+
+This time, we have to provide the file extension so that Orange knows which data format to use. The extension for Orange's native data format is ".tab". The following code saves only the data items with a myope prescription:
+
+.. literalinclude:: code/data-save.py
+
+Exploration of Data Domain
+--------------------------
+
+.. index::
+   single: data; features
+.. index::
+   single: data; domain
+.. index::
+   single: data; class
+
+A data table object stores information on data instances as well as on the data domain. The domain holds the names of the features and of the optional class, their types and, for categorical variables, their value names.
+
+.. literalinclude:: code/data-domain1.py
+
+Orange's objects often behave like Python lists and dictionaries, and can be indexed or accessed through feature names.
+
+.. literalinclude:: code/data-domain2.py
+    :lines: 5-
+
+Data Instances
+--------------
+
+.. index::
+   single: data; instances
+.. index::
+   single: data; examples
+
+A data table stores data instances (or examples). These can be indexed or traversed like any Python list. Data instances can be considered as vectors, accessed through an element index or through a feature name.
+
+.. literalinclude:: code/data-instances1.py
+
+The script above displays the following output::
+
+   First three data instances:
+   [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']
+   [4.9, 3.0, 1.4, 0.2, 'Iris-setosa']
+   [4.7, 3.2, 1.3, 0.2, 'Iris-setosa']
+   27th data instance:
+   [5.0, 3.4, 1.6, 0.4, 'Iris-setosa']
+   Value of 'sepal width' for the first instance: 3.5
+   The 3rd value of the 27th data instance: 1.6
+
+The iris data set we have used above has four continuous attributes. Here's a script that computes their mean values:
+
+.. literalinclude:: code/data-instances2.py
+   :lines: 3-
+
+The script above also illustrates indexing of data instances with feature objects; in ``d[x]``, the variable ``x`` is an Orange feature object. Here's the output::
+
+   Feature         Mean
+   sepal length    5.84
+   sepal width     3.05
+   petal length    3.76
+   petal width     1.20
+
+
+Slightly more complicated, but more interesting, is the script that computes per-class averages:
+
+.. literalinclude:: code/data-instances3.py
+   :lines: 3-
+
+Of the four features, petal width and length look quite discriminative for the type of iris::
+
+   Feature             Iris-setosa Iris-versicolor  Iris-virginica
+   sepal length               5.01            5.94            6.59
+   sepal width                3.42            2.77            2.97
+   petal length               1.46            4.26            5.55
+   petal width                0.24            1.33            2.03
+
+Finally, here is a short script that computes the class distribution for another data set:
+
+.. literalinclude:: code/data-instances4.py
+
+Missing Values
+--------------
+
+.. index::
+   single: data; missing values
+
+Consider the following exploration of the senate voting data set::
+
+   >>> data = Orange.data.Table("voting.tab")
+   >>> data[2]
+   ['?', 'y', 'y', '?', 'y', 'y', 'n', 'n', 'n', 'n', 'y', 'n', 'y', 'y', 'n', 'n', 'democrat']
+   >>> data[2][0].is_special()
+   1
+   >>> data[2][1].is_special()
+   0
+
+This particular data instance has missing values (shown as '?') for the first and the fourth feature. We can use the method ``is_special()`` to detect which parts of the data are missing. In the original data set file, missing values are, by default, represented by a blank space. Below, we use ``is_special()`` to examine each feature and report the proportion of instances for which that feature is undefined:
+
+.. literalinclude:: code/data-missing.py
+
+First few lines of the output of this script are::
+
+    2.8% handicapped-infants
+   11.0% water-project-cost-sharing
+    2.5% adoption-of-the-budget-resolution
+    2.5% physician-fee-freeze
+    3.4% el-salvador-aid
+
+A one-liner that reports the number of data instances with at least one missing value::
+
+    >>> sum(any(d[x].is_special() for x in data.domain.features) for d in data)
+    203
+
+
+Data Subsetting
+---------------
+
+.. index::
+   single: data; subsetting
+
+``Orange.data.Table`` accepts a list of data items and returns a new data set. This is useful for any data subsetting:
+
+.. literalinclude:: code/data-subsetting.py
+   :lines: 3-
+
+The code outputs::
+
+   Subsetting from 150 to 99 instances.
+
+and inherits the data description (domain) from the original data set. Changing the domain requires setting up a new domain descriptor. This feature is useful for any kind of feature selection:
+
+.. literalinclude:: code/data-featureselection.py
+   :lines: 3-
+
+.. index::
+   single: feature; selection
+
+By default, ``Orange.data.Domain`` assumes that the last feature in the argument list is the class variable. This can be changed with an optional argument::
+
+   >>> nd = Orange.data.Domain(data.domain.features[:2], False)
+   >>> print nd.class_var
+   None
+   >>> nd = Orange.data.Domain(data.domain.features[:2], True)
+   >>> print nd.class_var
+   Orange.feature.Continuous 'sepal width'
+
+The first call to ``Orange.data.Domain`` constructed a class-less domain, while the second used the last feature in the list as the class and constructed a domain with one input feature and a continuous class.
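+
+Either domain can then be passed to ``Orange.data.Table`` to convert the data, just as in the feature selection example above. A minimal sketch, reusing ``data`` and the domain ``nd`` constructed in the last call::
+
+   new_data = Orange.data.Table(nd, data)
+   print new_data[0]  # prints the instance with the reduced domain only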
+
+**References**
+
+.. [CJ1987] Cendrowska J (1987) PRISM: An algorithm for inducing modular rules, International Journal of Man-Machine Studies, 27, 349-370.

docs/tutorial/rst/python-learners.rst

+Learners in Python
+==================
+
+.. index::
+   single: classifiers; in Python
+
+Orange comes with plenty of classification and regression algorithms, but it is also fun to build new ones. You can write them from scratch, or wrap existing learners and add preprocessing to construct new variants. Learners in Orange have to adhere to certain rules; let us observe them on a classification algorithm::
+
+   >>> import Orange
+   >>> data = Orange.data.Table("titanic")
+   >>> learner = Orange.classification.logreg.LogRegLearner()
+   >>> classifier = learner(data)
+   >>> classifier(data[0])
+   <orange.Value 'survived'='no'>
+
+When a learner is given data, it returns a predictor, in our case a classifier. Classifiers are passed data instances and return a class value. They can also return a probability distribution, or the class value and the distribution together::
+
+   >>> classifier(data[0], Orange.classification.Classifier.GetProbabilities)
+   Out[26]: <0.593, 0.407>
+   >>> classifier(data[0], Orange.classification.Classifier.GetBoth)
+   Out[27]: (<orange.Value 'survived'='no'>, <0.593, 0.407>)
+
+Regression is similar, except that the regression model returns only the predicted continuous value.
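+
+As a minimal sketch, the same pattern with the housing regression data set (mirroring the classification calls above)::
+
+   >>> housing = Orange.data.Table("housing")
+   >>> model = Orange.regression.linear.LinearRegressionLearner(housing)
+   >>> print model(housing[0]), housing[0].get_class()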
+
+Notice also that the constructor for the learner can be given the data, and in that case it will construct a classifier (what else could it do?)::
+
+   >>> classifier = Orange.classification.logreg.LogRegLearner(data)
+   >>> classifier(data[42])
+   <orange.Value 'survived'='no'>
+
+Now we are ready to build our own learner. We will do this for a classification problem.
+
+Classifier with Feature Selection
+---------------------------------
+
+Consider naive Bayesian classifiers. They perform well, but can lose accuracy when there are many features, especially when these are correlated. Feature selection can help. We may want to wrap the naive Bayesian classifier with feature subset selection, so that it learns only from the few most informative features. We will assume the data contains only discrete features and will score them with information gain. Here is an example that sets the scorer (``gain``) and uses it to find the best five features of a classification data set:
+
+.. literalinclude:: code/py-score-features.py
+   :lines: 3-
+
+We need to incorporate the feature selection within the learner, at the point where it gets the data. Learners for classification tasks inherit from ``Orange.classification.PyLearner``:
+
+.. literalinclude:: code/py-small.py
+   :lines: 3-17
+
+The initialization part of the learner (``__init__``) simply stores the base learner (in our case a naive Bayesian learner), the name of the learner, and the number of features we would like to use. Invocation of the learner (``__call__``) scores the features, stores the best ones in a list (``best``), constructs a data domain, and then uses the domain to transform the data (``Orange.data.Table(domain, data)``) so that it includes only the best features. Besides the most informative features, we also need to include the class. The learner then returns the classifier, using the generic ``Orange.classification.PyClassifier``, to which the actual prediction model is passed through the ``classifier`` argument.
+
+Note that learners in Orange also accept a weight vector, which records the importance of the training data items. This is useful for several algorithms, like boosting.
+
+Let's check if this works::
+
+   >>> data = Orange.data.Table("promoters")
+   >>> s_learner = SmallLearner(m=3)
+   >>> classifier = s_learner(data)
+   >>> classifier(data[20])
+   <orange.Value 'y'='mm'>
+   >>> classifier(data[20], Orange.classification.Classifier.GetProbabilities)
+   <0.439, 0.561>
+
+It does! We constructed a naive Bayesian classifier that uses only three features. But how do we know what the best number of features is? It's time to construct one more learner.
+
+Estimation of Feature Set Size
+------------------------------
+
+Given training data, what is the best number of features to use with a learning algorithm? We can estimate this through cross-validation: for each candidate feature set size we check how well the classifier behaves on the reduced feature set. When we are done, we take the feature set size with the best performance and build a classifier on the entire training set. This procedure is often referred to as internal cross-validation. We wrap it into a new learner:
+
+.. literalinclude:: code/py-small.py
+   :lines: 19-31
+
+Again, our code stores the arguments at initialization (``__init__``). The learner invocation part selects the best value of parameter ``m``, the size of the feature set, and uses it to construct the final classifier.
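+
+For instance, a minimal sketch that uses the optimized learner on its own (reusing the promoters ``data`` loaded earlier)::
+
+   >>> o_learner = OptimizedSmallLearner()
+   >>> classifier = o_learner(data)
+   >>> classifier(data[20], Orange.classification.Classifier.GetProbabilities)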
+
+We can now compare the three classification algorithms. That is, the base classifier (naive Bayesian), the classifier with a fixed number of selected features, and the classifier that estimates the optimal number of features from the training set:
+
+.. literalinclude:: code/py-small.py
+   :lines: 39-45
+
+And the result? The classifier with the optimized feature set size wins, though not substantially; the differences would be more pronounced on data sets with a larger number of features::
+
+   opt_small: 0.942, small: 0.937, nbc: 0.933
+