Commits

Marko Toplak committed 7f84657

Moved datasets from Orange/doc/datasets to Orange/datasets.

  • Parent commits 74014d3

Files changed (252)

File Orange/datasets/DATA_INFO.py

+import os, glob, sys, orange, time, os.path, string
+
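+# one row of data_info.txt per data set: file name, number of instances,
+# file size in kB, attribute counts (all/discrete/continuous and the
+# percentage of continuous ones), class name, class values, percentage of
+# the majority class, file date, and a free-form description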
+descriptors = ['fname', 'inst', 'size', 'att', 'categ', 'cont', '%cont', 'class', 'values', '%major', 'date', 'description']
+verbose = 1
+
+# build_datasets(): return the list of data set files (*.tab) in the
+# current directory
+def build_datasets():
+  return glob.glob("*.tab")
+
+def dataset_statistics(fname, trace=0):
+  data = orange.ExampleTable(fname)
+  s = [fname]
+
+  # instances and size [kBytes]
+  size = '%5.1f' % (os.path.getsize(fname)/1000.)
+  s = s + [len(data), size]
+
+  # attributes
+  natt = len(data.domain.attributes)
+  ncont=0; ndisc=0
+  for a in data.domain.attributes:
+    if a.varType == orange.VarTypes.Discrete: ndisc = ndisc + 1
+    else: ncont = ncont + 1
+  pcont = '%5.1f' % (100.0 * ncont / natt)
+  s = s + [natt, ndisc, ncont, pcont]
+
+  # class name, values, majority class
+  if data.domain.classVar:
+    cname = data.domain.classVar.name
+    if data.domain.classVar.varType == orange.VarTypes.Discrete:  # categorical class
+      cval = 'discrete/' + str(len(data.domain.classVar.values))
+      c = [0] * len(data.domain.classVar.values)
+      for e in data:
+        c[int(e.getclass())] += 1
+      cmaj = '%5.1f' % (100.0 * max(c) / len(data))
+    else: # continuous data set
+      cval = 'continuous'
+      cmaj = 'n/a'
+  else:
+    cname = 'n/a'; cval = 'n/a'; cmaj = 'n/a'
+  s = s + [cname, cval, cmaj]
+
+  # date
+  rtime = time.gmtime(os.path.getmtime(fname))
+  t = time.strftime("%m/%d/%y", rtime)
+  s = s + [t]
+
+  # description
+  s = s + ['-']
+
+  # wrap up    
+  if trace: print s
+  return s
+
+def compute_statistics(flist, trace=0):
+  global verbose
+  stat = {}
+  for f in flist:
+    if verbose:
+      print "processing %s" % (f)
+    s = dataset_statistics(f, trace)
+    stat[f] = s
+  return stat
+
+# obtain past descriptions (attributes) from info file
+def get_past():
+  past = {}
+  if glob.glob("data_info.txt"):
+    f = open("data_info.txt")
+    for line in f:
+      line = line[:-1]  # strip the trailing newline
+      att = string.split(line, '\t')
+      past[att[0]] = att
+    f.close()
+
+    # back up the old info file under a timestamped name
+    t = time.strftime("%m-%d-%y_%H-%M-%S", time.localtime(time.time()))
+    os.rename('data_info.txt', 'data_info_%s.txt' % t)
+  return past
+
+# obtain only the file name -> description mapping from the info file
+def get_past_desc():
+  past_desc = {}
+  if glob.glob("data_info.txt"):
+    f = open("data_info.txt")
+    for line in f:
+      line = line[:-1]  # strip the trailing newline
+      att = string.split(line, '\t')
+      past_desc[att[0]] = att[-1]
+    f.close()
+
+    # back up the old info file under a timestamped name
+    t = time.strftime("%m-%d-%y_%H-%M-%S", time.localtime(time.time()))
+    os.rename('data_info.txt', 'data_info_%s.txt' % t)
+  return past_desc
+
+def save_info(stat):
+  f = open("data_info.txt", 'w')
+  s = '\t'.join(map(str, descriptors))
+  f.write(s+'\n')
+  keys = stat.keys()
+  keys.sort()
+  if verbose:
+    print keys
+  for k in keys:
+    s = '\t'.join(map(str, stat[k]))
+    f.write(s+"\n")
+  f.close()
+
+def help():
+  print 'usage: DATA_INFO.py [-help|-list|-update|-add]'
+  print '  -help   prints this message'
+  print '  -list   lists statistics for the data files'
+  print '  -update updates statistics in data_info.txt, keeping existing description fields'
+  print '  -add    adds statistics for data files not yet present in data_info.txt'
+
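+# Example invocations (a sketch; run from the directory that holds the
+# .tab files, with Python 2 and the orange module importable):
+#   python DATA_INFO.py -list
+#   python DATA_INFO.py -update
+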
+def main():
+  flist = build_datasets()
+  if '-help' in sys.argv: help()
+  elif '-list' in sys.argv:
+    compute_statistics(flist, trace=1)
+  elif '-add' in sys.argv:
+    past = get_past()
+    known = past.keys()
+    fnew = [f for f in flist if f not in known]
+    print 'new=', fnew
+    stat = compute_statistics(fnew)
+    # append past statistics
+    for k in past.keys():
+      stat[k] = past[k]
+    save_info(stat)
+  elif '-update' in sys.argv or len(sys.argv) == 1:
+    # -update (also the default action when no flag is given): recompute
+    # the statistics for all data files, but carry over existing
+    # description fields (anything not equal to '-') from the previous
+    # data_info.txt; this also makes it easy to change the set of
+    # descriptive fields later
+    past_desc = get_past_desc()
+    past_desc_keys = past_desc.keys()
+    stat = compute_statistics(flist)
+    for k in stat.keys():
+      if k in past_desc_keys:
+        stat[k][-1] = past_desc[k]
+    save_info(stat)
+  else: help()
+
+if __name__ == "__main__":
+  main()
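
DATA_INFO.py writes data_info.txt as a tab-separated table whose header
row is the descriptors list above. A minimal sketch of reading that table
back (the read_info helper is hypothetical, not part of this commit; it
assumes the file was produced by save_info above):

    import csv

    def read_info(path="data_info.txt"):
        # one dict per data set, keyed by the descriptor names
        with open(path) as f:
            return list(csv.DictReader(f, delimiter="\t"))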

File Orange/datasets/adult.htm

+<html>
+<head>
+<title>Adult Data Set</title>
+</head>
+<body>
+<h1>Info on the Adult Data Set</h1>
+<pre>
+This data was extracted from the census bureau database found at
+http://www.census.gov/ftp/pub/DES/www/welcome.html
+
+Donor: Ronny Kohavi and Barry Becker,
+       Data Mining and Visualization
+       Silicon Graphics.
+       e-mail: ronnyk@sgi.com for questions.
+
+Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random).
+48842 instances, mix of continuous and discrete    (train=32561, test=16281)
+45222 if instances with unknown values are removed (train=30162, test=15060)
+
+Duplicate or conflicting instances : 6
+
+Class probabilities for adult.all file
+Probability for the label '>50K'  : 23.93% / 24.78% (without unknowns)
+Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)
+
+Extraction was done by Barry Becker from the 1994 Census database.  A set
+of reasonably clean records was extracted using the following conditions:
+  ((AAGE>16) && (AGI>100) && (AFNLWGT>1) && (HRSWK>0))
+
+The prediction task is to determine whether a person makes over 50K a
+year.
+
+First cited in:
+@inproceedings{kohavi-nbtree,
+   author={Ron Kohavi},
+   title={Scaling Up the Accuracy of Naive-Bayes Classifiers: a
+          Decision-Tree Hybrid},
+   booktitle={Proceedings of the Second International Conference on
+              Knowledge Discovery and Data Mining},
+   year = 1996,
+   pages={to appear}}
+
+Accuracy was reported as follows (after removal of unknowns from the
+train/test sets):
+   C4.5       : 84.46+-0.30
+   Naive-Bayes: 83.88+-0.30
+   NBTree     : 85.90+-0.28
+
+The following algorithms were later run, with the error rates below, all
+after removal of unknowns and using the original train/test split. All
+numbers are straight runs of MLC++ with default values.
+
+   Algorithm               Error
+-- ----------------        -----
+1  C4.5                    15.54
+2  C4.5-auto               14.46
+3  C4.5 rules              14.94
+4  Voted ID3 (0.6)         15.64
+5  Voted ID3 (0.8)         16.47
+6  T2                      16.84
+7  1R                      19.54
+8  NBTree                  14.10
+9  CN2                     16.00
+10 HOODG                   14.82
+11 FSS Naive Bayes         14.05
+12 IDTM (Decision table)   14.46
+13 Naive-Bayes             16.12
+14 Nearest-neighbor (1)    21.42
+15 Nearest-neighbor (3)    20.35
+16 OC1                     15.04
+17 Pebls                   Crashed.  Unknown why (bounds WERE increased)
+
+Conversion of the original data was done as follows:
+1. Discretized gross income (agrossincome) into two ranges with a
+   threshold of 50,000.
+2. Converted U.S. to US to avoid periods.
+3. Converted Unknown to "?".
+4. Ran MLC++ GenCVFiles to generate the data/test files.
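+
+A sketch of the discretization in step 1 (gross_income is an assumed
+variable name, not a field in the file):
+
+  label = '>50K' if gross_income > 50000 else '<=50K'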
+
+Description of fnlwgt (final weight): The weights on the CPS files are
+controlled to independent estimates of the civilian noninstitutional
+population of the US.  These are prepared monthly for us by Population
+Division here at the Census Bureau.  We use 3 sets of controls. These
+are:
+
+  1.  A single cell estimate of the population 16+ for each state.
+  2.  Controls for Hispanic Origin by age and sex.
+  3.  Controls by Race, age and sex.
+
+We use all three sets of controls in our weighting program and "rake"
+through them 6 times so that by the end we come back to all the
+controls we used.
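+
+A minimal sketch of such a raking pass (iterative proportional fitting)
+in Python; the rake function and the 2-D list of float weights are
+illustrative assumptions, not the Census Bureau's actual program:
+
+  def rake(w, row_totals, col_totals, passes=6):
+      # w: 2-D list of float weights; alternately scale rows and columns
+      # to the known control totals for the given number of passes
+      for _ in range(passes):
+          for i, t in enumerate(row_totals):
+              s = sum(w[i])
+              w[i] = [x * t / s for x in w[i]]
+          for j, t in enumerate(col_totals):
+              s = sum(row[j] for row in w)
+              for row in w:
+                  row[j] = row[j] * t / s
+      return w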
+
+The term estimate refers to population totals derived from CPS by
+creating "weighted tallies" of any specified socio-economic
+characteristics of the population.
+
+People with similar demographic characteristics should have similar
+weights.  There is one important caveat to remember about this
+statement.  That is that since the CPS sample is actually a collection
+of 51 state samples, each with its own probability of selection, the
+statement only applies within state.
+</pre>
+</body>
+</html>
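
A minimal sketch of loading this data set with the same Orange 2.x API
that DATA_INFO.py above uses (it assumes an adult.tab file sits in this
directory and that the class values are named '>50K' and '<=50K'):

    import orange

    # the share of '>50K' labels should roughly match the class
    # probabilities quoted above
    data = orange.ExampleTable("adult.tab")
    n_high = len([e for e in data if str(e.getclass()) == '>50K'])
    print "share of '>50K': %.2f%%" % (100.0 * n_high / len(data))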