orange / source / orange / classify.hpp


#include <string>

#include "examples.hpp"
#include "distvars.hpp"

using namespace std;



// Shorthand name for TOrangeVector<PClassifier>, i.e. the project's vector
// container instantiated for PClassifier (presumably the wrapped-pointer type
// for TClassifier -- confirm). This is a textual macro rather than a typedef,
// so it is expanded wherever the identifier TClassifierList appears.
#define TClassifierList TOrangeVector<PClassifier> 

/* Classifiers have three methods for classification.
   - operator() returns TValue
   - classDistribution returns PDistribution
   - predictionAndDistribution returns both

   At least one of the first two needs to be overloaded. If the method
   can return probabilities (or at least something that closely
   resembles them), it should redefine the second.

   computesProbabilities should be set if classDistribution is overloaded.

   Here are different scenarios:

   1. Derived class overloads only operator() (returning TValue) and
      sets computesProbabilities to false.

      In this case, the inherited classDistribution will call operator()
      and construct a PDistribution in which the returned TValue will
      have a probability of 1.0. Similarly, predictionAndDistribution
      will call operator() and construct a PDistribution.

   2. Derived class overloads classDistribution and sets
      computesProbabilities to true.

      The inherited operator() will call classDistribution and use the
      distribution's method highestProbValue() to select a value to return.
      (Note that it randomly selects one of the values (and not the first
      value) with the highest probability. This can lead to
      different classification of the same example.) Method
      predictionAndDistribution behaves similarly.

   3. operator() and classDistribution are overloaded, while
      predictionAndDistribution is not.

      computesProbabilities decides which of the two methods is used
      by the inherited predictionAndDistribution.

   4. Only predictionAndDistribution is overloaded.

      This is illegal. If the prediction cannot be based on probabilities
      (for example, if there is a way to prefer one of the predictions with
      the same probability), you should redefine all three methods.
      computesProbabilities can then be set to whatever you like.

   Most classifiers will use scenario 2 -- return class distributions
   and leave the rest to the methods inherited from TClassifier.
*/

// Base class for classifiers (derives from TOrange). It declares the three
// classification entry points whose interplay is described in the overview
// comment preceding this class: operator() yields a TValue,
// classDistribution() yields a PDistribution, and predictionAndDistribution()
// delivers both through its two output references.
// NOTE(review): the trailing `//P ...` comments on the data members look like
// machine-readable property annotations -- keep their text intact; confirm.
class ORANGE_API TClassifier : public TOrange {

  PVariable classVar; //P class variable
  bool computesProbabilities; //P set if classifier computes class probabilities (if not, it assigns 1.0 to the predicted)

  // Constructors. `cp` defaults to false where it is the trailing argument;
  // it presumably initializes computesProbabilities, and the PVariable
  // argument presumably initializes classVar -- confirm in the implementation.
  TClassifier(const bool &cp=false);
  TClassifier(const PVariable &, const bool &cp=false);
  // Copy constructor.
  TClassifier(const TClassifier &old);

  // Virtual classification interface; derived classes overload one or more
  // of these as laid out in the scenarios of the overview comment.
  //  - operator():                 returns the predicted TValue for the example
  //  - classDistribution():        returns a PDistribution for the example
  //  - predictionAndDistribution(): writes both results into the TValue and
  //                                 PDistribution reference arguments
  virtual TValue operator ()(const TExample &);
  virtual PDistribution classDistribution(const TExample &);
  virtual void predictionAndDistribution(const TExample &, TValue &, PDistribution &);

  // Overloads that additionally receive a PEFMDataDescription.
  // NOTE(review): presumably these handle examples with missing values with
  // the help of the data description (cf. TExampleForMissing) -- confirm in
  // the implementation; this header does not show it.
  virtual TValue operator()(const TExample &, PEFMDataDescription);
  virtual PDistribution classDistribution(const TExample &, PEFMDataDescription);

// A TClassifier that additionally stores a PDomain.
// NOTE(review): "FD" presumably stands for "from domain" -- confirm.
class ORANGE_API TClassifierFD : public TClassifier {

  PDomain domain; //P domain

  // Constructors mirror TClassifier's: `cp` defaults to false and is
  // presumably forwarded to the base class; the PDomain argument presumably
  // initializes `domain` -- confirm in the implementation.
  TClassifierFD(const bool &cp =false);
  TClassifierFD(PDomain, const bool &cp =false);
  // Copy constructor.
  TClassifierFD(const TClassifierFD &old);

  // Hook that receives the name of an attribute.
  // NOTE(review): presumably invoked after the named property has been
  // assigned, so dependent state can be updated -- confirm against TOrange.
  void afterSet(const char *name);

// Classifier that stores a fixed default prediction (defaultVal) and a fixed
// default distribution (defaultDistribution).
// NOTE(review): presumably both are returned regardless of the example passed
// in -- confirm in the implementation.
class ORANGE_API TDefaultClassifier : public TClassifier {

  TValue defaultVal; //P default prediction
  PDistribution defaultDistribution; //P default distribution

  // Construct from a class variable and a default distribution, optionally
  // with an explicit default value. How defaultVal is chosen when only the
  // distribution is given is not visible in this header.
  TDefaultClassifier(PVariable, PDistribution defDis);
  TDefaultClassifier(PVariable, const TValue &defVal, PDistribution defDis);
  // Copy constructor.
  TDefaultClassifier(const TDefaultClassifier &old);
  // All three classification entry points of TClassifier are overridden here
  // (the signatures match the base-class virtuals).
  virtual TValue operator ()(const TExample &);
  virtual PDistribution classDistribution(const TExample &);
  virtual void predictionAndDistribution(const TExample &, TValue &, PDistribution &);

// Classifier that holds a distribution of prediction probabilities.
// NOTE(review): presumably predictions are drawn at random according to
// `probabilities` -- confirm in the implementation.
class ORANGE_API TRandomClassifier : public TClassifier {

  PDistribution probabilities; //P probabilities of predictions

  // Constructors: from a class variable (defaults to an empty PVariable),
  // from a distribution only, or from both. The distribution may be passed
  // either as a TDistribution reference or as a PDistribution.
  TRandomClassifier(PVariable acv=PVariable());
  TRandomClassifier(const TDistribution &probs);
  TRandomClassifier(PVariable acv, const TDistribution &probs);
  TRandomClassifier(PVariable acv, PDistribution);

  // Declared without `virtual`, but the signatures match the virtual members
  // of TClassifier, so these override all three classification entry points.
  TValue operator()(const TExample &);
  PDistribution classDistribution(const TExample &);
  void predictionAndDistribution(const TExample &, TValue &val, PDistribution &dist);


// MSVC only: explicit instantiation of std::vector<float> carrying ORANGE_API.
// The class declared below has std::vector<float> data members.
// NOTE(review): presumably ORANGE_API expands to a dllexport/dllimport
// attribute and this instantiation lets those members cross the DLL
// boundary -- confirm against the macro's definition.
#ifdef _MSC_VER
  template class ORANGE_API std::vector<float>;

// Data description bundling a domain, per-attribute value distributions and
// two float vectors (averages, matchProbabilities) plus two integer weight
// fields. It is the type behind PEFMDataDescription, which TClassifier's
// two-argument overloads and TExampleForMissing accept.
// NOTE(review): "EFM" presumably abbreviates "example for missing" -- confirm.
class ORANGE_API TEFMDataDescription : public TOrange {

  PDomain domain; //PR domain
  PDomainDistributions domainDistributions; //C distributions of values for attributes
  std::vector<float> averages;
  vector<float> matchProbabilities; // if you intend to really export this class, you'll need to define 'afterSet' for domain distributions
  // NOTE(review): presumably these hold weight identifiers rather than weight
  // values (they are ints, while getExampleWeight returns float) -- confirm.
  int originalWeight, missingWeight;

  // The distributions default to an empty PDomainDistributions; `ow` and `mw`
  // default to 0 and presumably initialize originalWeight and missingWeight.
  TEFMDataDescription(PDomain, PDomainDistributions=PDomainDistributions(), int ow=0, int mw=0);
  // Returns nothing; presumably (re)computes the `averages` member -- confirm.
  void getAverages();

  // Returns a float weight for the given example; const, so it does not
  // modify this description.
  float getExampleWeight(const TExample &) const;
  // Returns a float match score for a pair of examples. Not const.
  // NOTE(review): presumably it updates cached state such as
  // matchProbabilities -- confirm in the implementation.
  float getExampleMatch(const TExample &, const TExample &);

// A TExample extended with a data description and two index vectors.
// NOTE(review): presumably used to step through concrete substitutions for an
// example's missing values via resetExample()/nextExample() -- confirm in the
// implementation; this header only shows the declarations.
class ORANGE_API TExampleForMissing : public TExample {

  PEFMDataDescription dataDescription; //P data description
  // NOTE(review): presumably the attribute indices holding "don't know" (DK)
  // and "don't care" (DC) values respectively -- confirm.
  vector<int> DKs;
  vector<int> DCs;

  // Constructors: from a domain, from an existing example (optionally
  // converted to the domain `dom`), or as a copy. Where the data description
  // has a default, it is an empty PEFMDataDescription.
  TExampleForMissing(PDomain, PEFMDataDescription = PEFMDataDescription());
  TExampleForMissing(const TExample &orig, PEFMDataDescription =PEFMDataDescription());
  TExampleForMissing(const TExampleForMissing &orig);
  TExampleForMissing(PDomain dom, const TExample &orig, PEFMDataDescription);

  // Assignment from another TExampleForMissing and from a plain TExample;
  // both are virtual.
  virtual TExampleForMissing &operator =(const TExampleForMissing &orig);
  virtual TExample &operator =(const TExample &orig);

  // NOTE(review): presumably resetExample() restarts the enumeration,
  // nextExample() advances it and returns false when exhausted, and
  // hasMissing() reports whether any value is missing -- confirm.
  void resetExample();
  bool nextExample();
  bool hasMissing();