Georg Brandl committed 8eae2d6

data: refactor data loading and Dataset class a bit to support explicit dy columns
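
In short: a Dataset now carries an explicit error column instead of
deriving dy = sqrt(y) internally. A minimal sketch of the new calling
convention (names and values are illustrative only):

    from numpy import array
    from ufit.data.dataset import Dataset

    x = array([0.0, 0.1, 0.2, 0.3])
    y = array([10., 40., 38., 12.])
    dy = array([2.0, 5.0, 5.0, 2.5])   # explicit errors, not forced to sqrt(y)

    # from_arrays now packs (x, y, dy) into a single (n, 3) data array;
    # Dataset reads dy from column 2 instead of recomputing it
    dset = Dataset.from_arrays('demo', x, y, dy)
    print dset                         # <demo (4 points)>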

Files changed (9)

ufit/data/dataset.py

 
 """Base dataset class."""
 
-from numpy import array, concatenate, ones, sqrt
+from numpy import array, concatenate, ones
 
 from ufit.utils import attrdict
 from ufit.data.merge import rebin
 
 
 class Dataset(object):
-    def __init__(self, colnames, data, meta, xcol, ycol,
-                 ncol=None, nscale=1, name='', sources=None):
-        self.colnames = colnames
-        self.cols = dict((cn, data[:,i]) for (i, cn) in enumerate(colnames))
-        self.data = data
+    def __init__(self, meta, data, xcol, ycol, ncol=None, nscale=1,
+                 name='', sources=None):
         self.meta = attrdict(meta)
-        self.name = name or str(self.meta.get('filenumber', ''))
-        self.full_name = '%s:%s:%s' % (self.meta.get('instrument', ''),
-                                       self.meta.get('experiment', ''),
-                                       self.name)
-        self.sources = sources or [self.full_name]
+        self.name = name or str(self.meta.filenumber)
+        # fall back to the plain name when meta has no filedesc
+        # (e.g. for Datasets built via from_arrays)
+        self.sources = sources or [self.meta.get('filedesc', self.name)]
+        self._data = data
 
-        self.xcol = xcol
-        self.x = self[xcol]
-        self.xaxis = xcol
+        self.xcol = self.xaxis = xcol
+        self.x = self.x_raw = data[:,0]
 
-        self.ycol = ycol
-        self.y_raw = self[ycol]
-        self.yaxis = ycol
+        self.ycol = self.yaxis = ycol
+        self.y_raw = data[:,1]
+        self.dy_raw = data[:,2]
 
         self.ncol = ncol
         self.nscale = nscale
-        if ncol is not None:
-            self.norm_raw = self[ncol]
-            self.norm = self[ncol] / nscale
+        if ncol is not None and data.shape[1] > 3:
+            self.norm_raw = data[:,3]
+            self.norm = self.norm_raw / nscale
             if nscale != 1:
                 self.yaxis += ' / %s %s' % (nscale, ncol)
             else:
                 self.yaxis += ' / %s' % ncol
         else:
             self.norm = ones(len(self.y_raw))
 
+        self.y = self.y_raw / self.norm
+        self.dy = self.dy_raw / self.norm
+
         # points with mask = False are masked out
         self.mask = ones(len(self.x), bool)
         self.fitmin = None
         self.fitmax = None
 
-        self.y = self.y_raw/self.norm
-        self.dy = sqrt(self.y_raw)/self.norm
-        self.dy[self.dy==0] = 0.1
-        ##self.y = self.y_raw
-        ##self.dy = self.norm_raw
-
     @property
     def fit_columns(self):
         mask = self.mask.copy()
         if self.fitmin is not None:
             mask &= self.x >= self.fitmin
         if self.fitmax is not None:
             mask &= self.x <= self.fitmax
         return self.x[mask], self.y[mask], self.dy[mask]
 
-    @property
-    def environment(self):
-        s = []
-        if 'temperature' in self.meta:
-            s.append('T = %.3f K' % self.meta['temperature'])
-        return ', '.join(s)
-
-    @property
-    def data_title(self):
-        return self.meta.get('title', '')
-
     @classmethod
     def from_arrays(cls, name, x, y, dy, meta=None, xcol='x', ycol='y'):
-        arr = array((x, y)).T
-        obj = cls([xcol, ycol], arr, meta or {}, xcol, ycol, name=name)
-        obj.dy = dy
-        return obj
+        return cls(meta or {}, array((x, y, dy)).T, xcol, ycol, name=name)
 
     def __repr__(self):
         return '<%s (%d points)>' % (self.name, len(self.x))

     def __getattr__(self, key):
         if key == '__setstate__':
             # pickling support
             raise AttributeError
-        if key in self.cols:
-            return self.cols[key]
         elif key in self.meta:
             return self.meta[key]
         raise AttributeError('no such data column: %s' % key)
 
     def __getitem__(self, key):
         if isinstance(key, slice):
-            return self.__class__(self.colnames, self.data[key], self.meta,
-                                  self.xcol, self.ycol, self.ncol, name=self.name)
-        elif key in self.cols:
-            return self.cols[key]
-        raise KeyError('no such data column: %s' % key)
+            return self.__class__(self.meta, self._data[key],
+                                  self.xcol, self.ycol, self.ncol, self.nscale,
+                                  name=self.name, sources=self.sources)
+        raise KeyError
 
     def __or__(self, other):
-        return self.__class__(self.colnames,
-                              concatenate((self.data, other.data)),
-                              self.meta,
-                              self.xcol, self.ycol, self.ncol,
-                              name=self.name + '|' + other.name)
+        return self.__class__(self.meta, concatenate((self._data, other._data)),
+                              self.xcol, self.ycol, self.ncol, self.nscale,
+                              name=self.name + '|' + other.name,
+                              sources=self.sources + other.sources)
 
     def merge(self, binsize, *others):
         if not others:
             return self
         allsets = (self,) + others
-        all_x = concatenate([dset.x for dset in allsets])
-        all_y = concatenate([dset.y_raw for dset in allsets])
-        all_n = concatenate([dset.norm_raw for dset in allsets])
-        new_array = rebin(all_x, all_y, all_n, binsize)
+        alldata = concatenate([dset._data for dset in allsets])
+        new_array = rebin(alldata, binsize)
         sources = sum((dset.sources for dset in allsets), [])
         # XXX should we merge meta's?
-        return self.__class__([self.xcol, self.ycol, self.ncol], new_array,
-                               self.meta, self.xcol, self.ycol, self.ncol,
-                               self.nscale,
-                               name='&'.join(d.name for d in allsets),
-                               sources=sources)
+        return self.__class__(self.meta, new_array,
+                              self.xcol, self.ycol, self.ncol, self.nscale,
+                              name='&'.join(d.name for d in allsets),
+                              sources=sources)
 
     def plot(self, _axes=None):
         DataPlotter(_axes).plot_data(self)
         line = fp.readline()
         if not line:
             break
-    if 'TT' in meta:
-        meta['temperature'] = meta['TT']
+    meta['filedesc'] = '%s:%s:%s' % (meta.get('instrument', ''),
+                                     meta.get('experiment', ''),
+                                     meta.get('filenumber'))
     names = fp.readline().split()
+    # XXX make error message style consistent
+    if not names:
+        raise UFitError('No columns in file')
     usecols = range(len(names))
     if names[0] == 'PNT':
         usecols = range(1, len(names))
     arr = loadtxt(fp, ndmin=2, usecols=usecols, comments='F')
     for i, n in enumerate(names):
         meta[n] = arr[:,i].mean()
-        meta[n + '__std'] = arr[:,i].std()
+    meta['environment'] = []
+    if 'TT' in meta:
+        meta['environment'].append('T = %.3f K' % meta['TT'])
     if len(arr) == 0:
-        raise UFitError('No data in %s' % filename)
+        raise UFitError('No data in file')
     return names, arr, meta
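
For reference, the Dataset conveniences above (slicing, '|' concatenation,
merge) all rebuild a new Dataset from the private 4-column _data array.
A small sketch, assuming meta carries the filedesc/filenumber keys the
loaders now set:

    from numpy import column_stack, linspace, ones
    from ufit.data.dataset import Dataset

    x = linspace(0., 1., 5)
    data = column_stack((x, ones(5), 0.1*ones(5), ones(5)))  # x, y, dy, n
    meta = {'filenumber': 1, 'filedesc': 'demo:exp:1'}
    a = Dataset(meta, data, 'x', 'y', ncol='n')
    b = Dataset(dict(meta, filenumber=2), data.copy(), 'x', 'y', ncol='n')

    sub = a[1:4]              # slicing returns a new Dataset over rows 1..3
    both = a | b              # concatenated rows, name '1|2'
    merged = a.merge(0.1, b)  # rebinned onto a 0.1-wide grid, name '1&2'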

ufit/data/loader.py

 
 """Data loader object."""
 
-from numpy import array
+from numpy import array, ones, sqrt
 
 from ufit import UFitError
 from ufit.data.dataset import Dataset, DataList
             for n, m in data_formats.iteritems():
                 if m.check_data(fobj):
                     return m
-            raise UFitError('File %s not recognized')
+            raise UFitError('File %s not recognized' % filename)
         return data_formats[self.format]
 
-    def load(self, n, xcol, ycol, mcol=None, mscale=1):
+    def load(self, n, xcol, ycol, dycol=None, ncol=None, nscale=1):
         filename = self.template % n
         fobj = open(filename, 'rb')
         colnames, coldata, meta = \
             self._get_reader(filename, fobj).read_data(filename, fobj)
-        dset = Dataset(colnames, coldata, meta, xcol, ycol, mcol, mscale)
+        if 'filenumber' not in meta:
+            meta['filenumber'] = n
+        if 'filedesc' not in meta:
+            meta['filedesc'] = str(n)
+        meta['datafilename'] = filename
+        datarr = ones((len(coldata), 4))
+        datarr[:,0] = coldata[:,colnames.index(xcol)]
+        datarr[:,1] = coldata[:,colnames.index(ycol)]
+        if dycol is not None:
+            datarr[:,2] = coldata[:,colnames.index(dycol)]
+        else:
+            datarr[:,2] = sqrt(datarr[:,1])
+        if ncol is not None:
+            datarr[:,3] = coldata[:,colnames.index(ncol)]
+        dset = Dataset(meta, datarr, xcol, ycol, ncol, nscale)
         self.sets[n] = dset
         return dset
 
                     # use average monitor counts for normalization, but
                     # round to 2 significant digits
                     nmon = int(float('%.2g' % coldata[:,i].mean()))
+        if yguess is None and len(colnames) > 1:
+            yguess = colnames[1]
         return colnames, xguess, yguess, mguess, nmon
 
-    def load_numors(self, nstring, binsize, xcol, ycol, mcol=None, mscale=1):
+    def load_numors(self, nstring, binsize, xcol, ycol, dycol=None,
+                    ncol=None, nscale=1):
         """Load a number of data files and merge them according to numor
         list operations:
 
         for part1 in parts1:
             if '-' in part1:
                 a, b = map(toint, part1.split('-'))
-                datasets.extend(self.load(n, xcol, ycol, mcol, mscale)
+                datasets.extend(self.load(n, xcol, ycol, dycol, ncol, nscale)
                                 for n in range(a, b+1))
             else:
                 parts2 = part1.split('+')
                 for part2 in parts2:
                     if '>' in part2:
                         a, b = map(toint, part2.split('>'))
-                        ds = [self.load(n, xcol, ycol, mcol, mscale)
+                        ds = [self.load(n, xcol, ycol, dycol, ncol, nscale)
                               for n in range(a, b+1)]
                         inner.append(ds[0].merge(binsize, *ds[1:]))
                     else:
                         inner.append(
-                            self.load(toint(part2), xcol, ycol, mcol, mscale))
+                            self.load(toint(part2), xcol, ycol, dycol,
+                                      ncol, nscale))
                 datasets.append(inner[0].merge(binsize, *inner[1:]))
         return datasets
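
As far as the branches above show it, the numor grammar is: '-' loads a
range as separate sets, '>' merges a range into one set, and '+' merges the
listed numors (the top-level separator is elided here). A sketch, assuming
'loader' is an already configured Loader and the column names are
placeholders:

    sets = loader.load_numors('100-105', 0.05, 'om', 'cnts')  # 6 separate sets
    sets = loader.load_numors('100>105', 0.05, 'om', 'cnts')  # merged into one
    sets = loader.load_numors('100+103', 0.05, 'om', 'cnts')  # these two merged
    # explicit error column plus monitor normalization (new in this commit):
    sets = loader.load_numors('100>105', 0.05, 'om', 'cnts',
                              dycol='err', ncol='mon1', nscale=1000)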

ufit/data/merge.py

 # data merging
 
-from numpy import arange, ones, zeros
+from numpy import arange, ones, zeros, sqrt
 
 from ufit import UFitError
 
 
-def rebin(x, y, n, binsize):
-    """Simple rebinning of (x, y, n) data."""
+def rebin(data, binsize):
+    """Simple rebinning of (x, y, dy, n) data."""
+
+    x, y, dy, n = data.T
 
     # calculate new x values
     halfbinsize = binsize/2.
                    binsize) + halfbinsize
     nbins = len(stops)
 
-    # newarray will be the new x, y, m array
-    newarray = zeros((nbins, 3))
+    # newarray will be the new x, y, dy, n array
+    newarray = zeros((nbins, 4))
     newarray[:,0] = stops
 
     # this will keep track of which data values we already used
         indices &= data_unused
         if indices.any():
             newarray[i, 1] += y[indices].sum()
-            newarray[i, 2] += n[indices].sum()
+            newarray[i, 2] += sqrt((dy[indices]**2).sum())
+            newarray[i, 3] += n[indices].sum()
             data_unused[indices] = False
         else:
             new_used[i] = False
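
With an explicit dy column, rebinning combines errors in quadrature, which
for pure counting statistics reproduces what the old code obtained from the
square root of the summed counts. A sketch, assuming the elided binning code
puts the first three points into one bin:

    from numpy import array, column_stack, sqrt
    from ufit.data.merge import rebin

    x  = array([0.00, 0.01, 0.02, 1.00])
    y  = array([10., 12., 11., 50.])
    dy = sqrt(y)                       # counting errors
    n  = array([1000.] * 4)            # monitor column
    out = rebin(column_stack((x, y, dy, n)), 0.05)
    # first bin:  y = 10 + 12 + 11 = 33
    #            dy = sqrt(10 + 12 + 11) = sqrt(33), i.e. sqrt(summed counts)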

ufit/data/nicos.py

                 meta['title'] = oval
             elif key == 'number':
                 meta['filenumber'] = int(oval)
+            # 'info' key already has the right name
             meta[key] = val
+    meta['filedesc'] = '%s:%s:%s' % (meta.get('instrument', ''),
+                                     meta.get('experiment', ''),
+                                     meta.get('filenumber'))
     colnames = fp.readline()[1:].split()
     colunits = fp.readline()[1:].split()
     def convert_value(s):
     colunits = [unit for unit in colunits if unit != ';']
     usecols = cvdict.keys()
     coldata = loadtxt(fp, converters=cvdict, usecols=usecols)
-    if 'Ts' in colnames:
-        tindex = colnames.index('Ts')
-        meta['temperature'] = coldata[:,tindex].mean()
-    if 'sT' in colnames:
-        tindex = colnames.index('sT')
-        meta['temperature'] = coldata[:,tindex].mean()
-    if 'B' in colnames:
-        tindex = colnames.index('B')
-        meta['magfield'] = coldata[:,tindex].mean()
+    cols = dict((name, coldata[:,i]) for (i, name) in enumerate(colnames))
+    meta['environment'] = []
+    for col in cols:
+        meta[col] = cols[col].mean()
+    if 'Ts' in cols:
+        meta['environment'].append('T = %.3f K' % meta['Ts'])
+    elif 'sT' in cols:
+        meta['environment'].append('T = %.3f K' % meta['sT'])
+    if 'B' in cols:
+        meta['environment'].append('B = %.3f T' % meta['B'])
     return colnames, coldata, meta
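
Every column's mean now lands in meta under the column name, and
meta['environment'] collects human-readable strings that consumers join
for display (see the GUI change below). Illustrative values only:

    meta = {'Ts': 4.213, 'B': 0.5}             # column means, stored as above
    meta['environment'] = ['T = %.3f K' % meta['Ts'],
                           'B = %.3f T' % meta['B']]
    print ', '.join(meta['environment'])       # T = 4.213 K, B = 0.500 T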

ufit/data/simple.py

 # Licensed under a 2-clause BSD license, see LICENSE.
 # *****************************************************************************
 
-"""Load routine for simple three-column data files."""
+"""Load routine for simple whitespace-separated column data files."""
 
-from numpy import loadtxt, sqrt
-
-from ufit import UFitError
+from numpy import loadtxt
 
 
 def check_data(fp):
     dtline = fp.readline()
     try:
-        x, y, dy = map(float, dtline.split())
-    except (ValueError, TypeError):
+        map(float, dtline.split())
+    except ValueError:
+        fp.seek(0, 0)
         return False
     fp.seek(0, 0)
     return True
 
 
 def read_data(filename, fp):
-    colnames = ['x', 'y', 'dy']
+    dtline = fp.readline()
+    try:
+        if dtline.startswith(('#', '%')):
+            dtline = dtline[1:]
+        map(float, dtline.split())
+    except ValueError:
+        # must be headers...
+        colnames = dtline.split()
+    else:
+        fp.seek(0, 0)
+        # headerless file: keep the old default column names
+        colnames = ['x', 'y', 'dy']
     arr = loadtxt(fp, ndmin=2)
     return colnames, arr, {}
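
The simple reader now accepts an optional header line, optionally prefixed
with '#' or '%'; without one it falls back to the old default names
(restored above). A sketch using in-memory files:

    from StringIO import StringIO      # Python 2, matching the codebase
    from ufit.data.simple import read_data

    # headerless file: default names apply
    names, arr, meta = read_data('a.dat',
                                 StringIO('0.0 10 3.2\n0.1 12 3.5\n'))
    # names == ['x', 'y', 'dy'], arr.shape == (2, 3)

    # '#'-prefixed header line: names come from the file
    names, arr, meta = read_data('b.dat',
                                 StringIO('# h cnts err\n0.0 10 3.2\n'))
    # names == ['h', 'cnts', 'err']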

ufit/gui/dataloader.py

         try:
             cols, xguess, yguess, mguess, nmon = self.loader.guess_cols(numor)
         except Exception, e:
             QMessageBox.information(self, 'Error',
                                     'Could not read column names: %s' % e)
             return
             return
         numors = str(self.numors.text())
         try:
+            # XXX support dycol
             datas = self.loader.load_numors(numors, prec,
-                                            xcol, ycol, mcol, mscale)
+                                            xcol, ycol, None, mcol, mscale)
         except Exception, e:
             QMessageBox.information(self, 'Error', 'Could not read data: %s' % e)
             return
         self.panels.append(
             ('<big><b>%s</b></big> - %s<br>%s<br><small>%s</small>' %
              (len(self.panels) + 1,
-              data.data_title,
-              data.environment,
+              data.meta.get('title', ''),
+              ', '.join(data.environment),
               '<br>'.join(data.sources)), panel))
         self.pristine = False
         if not self._loading:

ufit/models/base.py

     def fit(self, datas, **kw):
 
         # fit a cumulative data set consisting of a concatenation of all data
+        fitcols = [d.fit_columns for d in datas]
 
         cumulative_data = Dataset.from_arrays(
             'cumulative data',
-            concatenate([d.x for d in datas]),
-            concatenate([d.y for d in datas]),
-            concatenate([d.dy for d in datas]),
-            attrdict(('d%d' % i, d.meta) for (i, d) in enumerate(datas)),
+            concatenate([cols[0] for cols in fitcols]),
+            concatenate([cols[1] for cols in fitcols]),
+            concatenate([cols[2] for cols in fitcols]),
+            dict(('d%d' % i, d.meta) for (i, d) in enumerate(datas)),
         )
         overall_res = Model.fit(self, cumulative_data, **kw)
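
The cumulative fit now concatenates each dataset's fit_columns instead of
the raw arrays, so per-dataset masks and fit ranges carry through to the
global fit. A quick check of what fit_columns yields:

    from numpy import linspace, ones
    from ufit.data.dataset import Dataset

    d1 = Dataset.from_arrays('d1', linspace(0., 1., 6), ones(6), 0.1*ones(6))
    d1.fitmin, d1.fitmax = 0.2, 0.8    # only x in [0.2, 0.8] enters the fit
    x, y, dy = d1.fit_columns          # 4 of the 6 points survive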