Commits

Eric Chlebek  committed c338a12

Added experimental HDF5 reader.

  • Participants
  • Parent commits 5b54c38

Comments (0)

Files changed (3)

File Orange/OrangeWidgets/Data/OWHdf5.py

+"""<name>OWHdf5</name>
+<description>Reads data from an HDF5 file.</description>
+<icon>icons/Hdf5.svg</icon>
+<priority>30</priority>
+"""
+import Orange
+from Orange.OrangeWidgets.OWWidget import OWWidget
+from Orange.OrangeWidgets.OWContexts import ContextHandler
+from PyQt4.QtGui import (
+    QComboBox,
+    QSizePolicy,
+    QStyle,
+    qApp,
+    QFileDialog,
+    QApplication,
+    QTreeWidget,
+    QTreeWidgetItem,
+)
+from PyQt4.QtCore import SIGNAL, QTimer
+import OWGUI
+import os
+import numpy as np
+import h5py
+from orange import StringVariable
+import sys
+
+# NumPy unsigned integer scalar types.
+UINT_TYPES = [
+    np.uint8, np.uint16, np.uint32, np.uint64
+]
+
+# Widget metadata.  NOTE(review): presumably consumed by Orange's widget
+# discovery/registry; nothing in this module reads these names -- confirm.
+NAME = "HDF5"
+ID = "orange.widgets.zymeworks.hdf5"
+
+
+DESCRIPTION = """Read data from an HDF5 file."""
+
+LONG_DESCRIPTION = """
+This node uses the h5py library to read an HDF5 dataset into an Orange.data.Table.
+It can handle 1D and 2D datasets.
+"""
+
+# Same icon path and priority as declared in the module docstring.
+ICON = "icons/Hdf5.svg"
+AUTHOR = "Eric Chlebek"
+MAINTAINER_EMAIL = "echlebek(@at@)zymeworks.com"
+PRIORITY = 30
+CATEGORY = "Data"
+
+KEYWORDS = ["data", "file", "load", "read", "hdf5"]
+
+# Declarative description of the single "Data" output channel; the widget
+# class below declares the same channel through ``self.outputs``.
+OUTPUTS = (
+    {
+        "name": "Data",
+        "type": Orange.data.Table,
+        "doc": "Attribute-valued data set read from the input file.",
+    },
+)
+
+
+# Name of the widget class defined in this module.
+WIDGET_CLASS = "OWHdf5"
+
+
+# Placeholder entry shown in the recent-files list when no file is chosen.
+NO_FILE_SELECTED = "(none)"
+
+
+def exists(path):
+    if not os.path.exists(path):
+        dirpath, basename = os.path.split(path)
+        return os.path.exists(os.path.join("./", basename))
+    else:
+        return True
+
+
+def addOrigin(examples, filename):
+    variables = examples.domain.variables + examples.domain.getmetas().values()
+    strings = [var for var in variables if isinstance(var, StringVariable)]
+    dirname, basename = os.path.split(filename)
+    for var in strings:
+        if "type" in var.attributes and "origin" not in var.attributes:
+            var.attributes["origin"] = dirname
+
+
+class Hdf5PathContextHandler(ContextHandler):  # Same as the one in OWFile
+    def match(self, context, imperfect, filename):
+        return context.filename == filename and 2
+
+
+class OWHdf5(OWWidget):
+    settingsList=["recentFiles"]
+    contextHandlers = {"": Hdf5PathContextHandler()}
+
+    def __init__(self, parent=None, signalManager=None):
+        OWWidget.__init__(self, parent, signalManager, "HDF5", wantMainArea=0, resizingEnabled=1)
+
+        self.inputs = []
+        self.outputs = [("Data", Orange.data.Table)]
+
+        self.datasetPath = "/"
+        self.createNewOn = 1
+        self.domain = None
+        self.loadedFile = ""
+        self.showAdvanced = 0
+        self.loadSettings()
+        self.recentFiles=[NO_FILE_SELECTED]
+
+        self._add_file_selector_pane()
+        self._add_dataset_selector_pane()
+        self._add_info_pane()
+
+        OWGUI.rubber(self.controlArea)
+
+        self.recentFiles = filter(exists, self.recentFiles)
+        self.setFileList()
+
+        self.connect(self.filecombo, SIGNAL('activated(int)'), self.selectFile)
+
+    def selectFile(self, n):
+        if n < len(self.recentFiles):
+            name = self.recentFiles[n]
+            self.recentFiles.remove(name)
+            self.recentFiles.insert(0, name)
+        elif n:
+            self.browseFile(1)
+
+        if len(self.recentFiles) > 0:
+            self.setFileList()
+
+    def _add_file_selector_pane(self):
+        box = OWGUI.widgetBox(self.controlArea, "HDF5 File", addSpace=True, orientation=0)
+        self.filecombo = QComboBox(box)
+        self.filecombo.setMinimumWidth(150)
+        box.layout().addWidget(self.filecombo)
+
+        button = OWGUI.button(box, self, '...', callback = self.browseFile, disabled=0)
+        button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
+        button.setSizePolicy(QSizePolicy.Maximum, QSizePolicy.Fixed)
+
+        self.reloadBtn = OWGUI.button(box, self, "Load", callback = self.load, default=True)
+        self.reloadBtn.setIcon(self.style().standardIcon(QStyle.SP_DialogApplyButton))
+        self.reloadBtn.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
+
+    def item_selection_changed(self):
+        selection = self.treewidget.selectedItems()
+        if len(selection) > 0:
+            self.datasetPath = str(selection[0].text(0))
+
+    def _add_dataset_selector_pane(self):
+        box = OWGUI.widgetBox(self.controlArea, "HDF5 Dataset", addSpace=True)
+        self.treewidget = QTreeWidget(box)
+        self.treewidget.setMinimumWidth(150)
+        self.treewidget.setHeaderHidden(True)
+
+        self.connect(self.treewidget, SIGNAL('itemSelectionChanged()'), self.item_selection_changed)
+        box.layout().addWidget(self.treewidget)
+
+    def _add_info_pane(self):
+        box = OWGUI.widgetBox(self.controlArea, "Info", addSpace = True)
+        self.infobox = OWGUI.widgetLabel(box, 'No data loaded.')
+
+        #Set word wrap so long warnings won't expand the widget
+        self.infobox.setWordWrap(True)
+        self.infobox.setSizePolicy(QSizePolicy.Ignored, QSizePolicy.MinimumExpanding)
+
+    def adjustSize0(self):
+        qApp.processEvents()
+        QTimer.singleShot(50, self.adjustSize)
+
+    def setFileList(self):
+        self.filecombo.clear()
+        if not self.recentFiles:
+            self.filecombo.addItem(NO_FILE_SELECTED)
+        for path in self.recentFiles:
+            if path == NO_FILE_SELECTED:
+                self.filecombo.addItem(NO_FILE_SELECTED)
+            else:
+                self.filecombo.addItem(os.path.split(path)[1])
+
+    def load(self):
+        if self.recentFiles:
+            return self.openFile(self.recentFiles[0], self.datasetPath)
+
+    def browseFile(self):
+        if not self.recentFiles or self.recentFiles[0] == NO_FILE_SELECTED:
+            startfile = os.path.expanduser("~/")
+        else:
+            startfile = self.recentFiles[0]
+
+        path = unicode(
+            QFileDialog.getOpenFileName(self, "Open HDF5 File", startfile, "HDF5 Files (*.h5)")
+        )
+
+        if not path:
+            return
+
+        if path in self.recentFiles:
+            self.recentFiles.remove(path)
+
+        self.recentFiles.insert(0, path)
+        self.setFileList()
+        self.treewidget.clear()
+        group_icon = self.style().standardIcon(QStyle.SP_DirIcon)
+        dataset_icon = self.style().standardIcon(QStyle.SP_FileIcon)
+        hdf5_root = _create_qtree_widget_item(h5py.File(path), self.treewidget, group_icon, dataset_icon)
+        self.treewidget.addTopLevelItem(hdf5_root)
+        self.treewidget.expandItem(hdf5_root)
+
+    def openFile(self, path, datasetPath):
+        if self.processingHandler:
+            self.processingHandler(self, 1)    # focus on active widget
+
+        self.error()
+        self.warning()
+        self.information()
+
+        if not os.path.exists(path):
+            dname, basename = os.path.split(path)
+            if os.path.exists(os.path.join("./", basename)):
+                path = os.path.join("./", basename)
+                self.information("Loading {0} from the current directory".format(path))
+
+        self.closeContext()
+        self.loadedFile = ""
+
+        if path == NO_FILE_SELECTED:
+            self.send("Data", None)
+            self.infobox.setText("No data loaded.")
+            return
+
+        self.openContext("", path)
+
+        try:
+            f = h5py.File(path)
+            self.loadedFile = path
+            data = examples_from_dataset(f[datasetPath])
+        except Exception as e:
+            # The above can raise IOError and ValueError, but the handling is the same for both.
+            self.infobox.setText(str(e))
+            return
+
+        self.dataDomain = data.domain
+
+        addOrigin(data, path)
+        data.name = os.path.basename(path)
+
+        self.send("Data", data)
+
+        if self.processingHandler:
+            self.processingHandler(self, 0)  # Remove focus from the widget
+
+        self.infobox.setText(
+            "{0} examples(s), {1} attribute(s)".format(len(data), len(data.domain.attributes))
+        )
+
+    def settingsFromWidgetCallback(self, handler, context):
+        context.filename = self.loadedFile
+
+    def settingsToWidgetCallback(self, handler, context):
+        self.loadedFile = context.filename
+
+
+def dataset_iterator(dataset):
+    """Yield each column from a dataset. Compound columns are decomposed recursively."""
+    names = dataset.dtype.names
+    if names:
+        for name in names:
+            subdset = dataset[name]
+            for d in dataset_iterator(subdset):
+                yield d
+    else:
+        yield dataset[()]
+
+
+def flatten_dtype(dtype, name=""):
+    """Recursively flatten a numpy dtype into its simple component types."""
+    if dtype.type == np.void:  # We are working with a compound dtype in this case
+        new_dtype = []
+
+        for subname in dtype.names:
+            appended_name = ".".join([name, subname] if name else [subname])
+            subtype, offset = dtype.fields[subname]
+            new_dtype.extend(flatten_dtype(subtype, appended_name))
+
+        return new_dtype
+
+    else:  # Otherwise we have a simple dtype
+        return [(name, dtype.type) if name else ("", dtype.type)]
+
+
+def examples_from_dataset(dataset):
+    """Create an Orange.data.Table from an HDF5 dataset."""
+    from itertools import izip
+
+    if not isinstance(dataset, h5py.Dataset):
+        raise ValueError("Groups cannot be loaded.")
+
+    if len(dataset.shape) > 2:
+        raise ValueError("Incompatible dataset shape.")
+
+    flat_dtype = np.dtype(flatten_dtype(dataset.dtype))
+    features = [feature_from_dtype(flat_dtype.fields[d][0], d) for d in flat_dtype.names]
+    domain = Orange.data.Domain(features)
+    data = [list(x) for x in izip(*list(dataset_iterator(dataset)))]
+
+    return Orange.data.Table(domain, data)
+
+
+def feature_from_dtype(dtype, name):
+    """Create an Orange feature from a dtype with a name."""
+    if not name:
+        raise ValueError("Parameter name must be defined.")
+
+    if np.issubdtype(dtype, str):
+        return Orange.feature.String(name)
+
+    elif np.issubdtype(dtype, float):
+        return Orange.feature.Continuous(name)
+
+    elif np.issubdtype(dtype, int) or dtype.type in UINT_TYPES:
+        # Bit of a hack, but discrete features must enumerate all values up front
+        return Orange.feature.Continuous(name, number_of_decimals=0)
+
+    elif np.issubdtype(dtype, bool):
+        return Orange.feature.Discrete(name, values=[True, False])
+
+    else:
+        raise ValueError("{0} values are not supported.".format(dtype))
+
+
+def _create_qtree_widget_item(group, parent, group_icon, dataset_icon):
+    """Walk through the HDF5 group and create QTreeWidgetItems recursively."""
+    child = QTreeWidgetItem(parent)
+    child.setText(0, group.name)
+    child.setIcon(0, group_icon)
+
+    for key in group.keys():
+        obj = group[key]
+        if isinstance(obj, h5py.Group):
+            _create_qtree_widget_item(obj, child, group_icon, dataset_icon)
+        else:
+            grandchild = QTreeWidgetItem(child)
+            grandchild.setText(0, obj.name)
+            grandchild.setIcon(0, dataset_icon)
+
+    return child
+
+
+if __name__ == "__main__":
+    a = QApplication(sys.argv)
+    ow = OWHdf5()
+    ow.show()
+    a.exec_()
+    ow.saveSettings()

File Orange/OrangeWidgets/Data/icons/Hdf5.svg

+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+	 width="48px" height="48px" viewBox="0 0 48 48" enable-background="new 0 0 48 48" xml:space="preserve">
+<path fill="#FFFFFF" d="M25,8H13v32h22V17c0-4.123-8-1-8-1C28,12,27,8,25,8z"/>
+<path fill="#333333" d="M28,6H13h-2v2v32v2h2h22h2v-2V16v-2L28,6z M35,40H13V8h12c2,0,3,4,2,8c0,0,8-3.123,8,1V40z"/>
+<g>
+	<path fill="#7C7C7C" d="M21.817,29.095c0.114,0.623,0.33,1.104,0.651,1.441c0.321,0.339,0.789,0.507,1.403,0.507
+		c0.708,0,1.248-0.248,1.619-0.748c0.37-0.498,0.556-1.126,0.556-1.883c0-0.743-0.174-1.371-0.521-1.884
+		c-0.349-0.511-0.889-0.767-1.624-0.767c-0.348,0-0.647,0.043-0.902,0.13c-0.448,0.161-0.786,0.458-1.012,0.894l-2.566-0.121
+		l1.021-8.044h8.014v2.431h-5.949l-0.521,3.184c0.44-0.288,0.786-0.479,1.033-0.572c0.415-0.154,0.919-0.231,1.516-0.231
+		c1.203,0,2.253,0.405,3.148,1.216c0.896,0.81,1.345,1.988,1.345,3.534c0,1.347-0.433,2.548-1.294,3.606
+		c-0.864,1.058-2.155,1.587-3.874,1.587c-1.384,0-2.521-0.372-3.412-1.115c-0.889-0.743-1.384-1.798-1.485-3.164H21.817z"/>
+</g>
+</svg>

File Orange/testing/unit/tests/test_hdf5.py

+import unittest
+import numpy
+
+
+class HDF5Tests(unittest.TestCase):
+
+    def test_dataset_iterator(self):
+        from Orange.OrangeWidgets.Data.OWHdf5 import dataset_iterator
+
+        dtype = numpy.dtype([("foo", numpy.dtype([("A", int), ("B", int)])), ("bar", float)])
+        data = numpy.array([((1, 2), 3.0)], dtype=dtype)
+
+        it = dataset_iterator(data)
+
+        a = next(it)
+        self.assertEquals(list(a), [1])
+
+        b = next(it)
+        self.assertEquals(list(b), [2])
+
+        c = next(it)
+        self.assertAlmostEquals(list(c)[0], 3.0)
+
+    def test_flatten_dtype(self):
+        from Orange.OrangeWidgets.Data.OWHdf5 import flatten_dtype
+        import numpy
+
+        dtype = numpy.dtype([("foo", numpy.dtype([("A", int), ("B", int)])), ("bar", float)])
+
+        ((d1_name, d1_type), (d2_name, d2_type), (d3_name, d3_type)) = flatten_dtype(dtype)
+
+        self.assertEquals(d1_name, "foo.A")
+        self.assertEquals(d2_name, "foo.B")
+        self.assertEquals(d3_name, "bar")
+
+        self.assertTrue(numpy.issubdtype(d1_type, int))
+        self.assertTrue(numpy.issubdtype(d2_type, int))
+        self.assertTrue(numpy.issubdtype(d3_type, float))
+
+    def test_examples_from_dataset(self):
+        from Orange.OrangeWidgets.Data.OWHdf5 import examples_from_dataset
+
+        dtype = numpy.dtype([("foo", numpy.dtype([("A", int), ("B", int)])), ("bar", int)])
+        data = numpy.array([((1, 2), 3)], dtype=dtype)
+
+        table = examples_from_dataset(data)
+        self.assertEquals(list(list(table)[0]), [1, 2, 3])
+
+
+# Allow running this test module directly as a script.
+if __name__ == "__main__":
+    unittest.main()