Commits

Thomas Kluyver committed d9881c4

Allow creating a new dataset from matched taxa.

  • Participants
  • Parent commits bf555ef

Comments (0)

Files changed (3)

File taxonome/gui/mapnames.py

 from taxonome import tracker
 from .ui.map_names_wizard import Ui_MapNamesWizard
 from .csvdialogs import _preview_csv, _get_auth_field
-from .iothread import Worker
+from .iothread import Worker, makeloader
 
 def getoptions(app, optsdialog):
     """Find what options the user selected from the dialog for matching taxa."""
         mappings_file = optswizard.mappings_file.text()
     if optswizard.log.isChecked():
         log_file = optswizard.log_file.text()
+    matched_ds = optswizard.matched_ds.isChecked()
     
     trackers = []
     files = []
         trackers.append(tracker.CSVTaxaTracker(f, fields))
     
     def _run(progress=None):
+        "Called in a separate thread to run the matching."
         if from_csv:
             csvfile = open(csv_filename, encoding='utf-8', errors='replace')
             files.append(csvfile)
                 trackers.append(tracker.Counter(progress))
             data_in = taxa_dataset
         
-        matching = streaming_match_taxa(data_in, target.ds, upgrade_subsp=upgrade,
+        # Do we want the output as a dataset?
+        matchfunc = match_taxa if matched_ds else run_match_taxa
+        
+        # Run the matching
+        res = matchfunc(data_in, target.ds, upgrade_subsp=upgrade,
                 strict_authority=strict_authority, nameselector=nameselector, tracker=trackers)
         
-        for source_taxon, target_taxon in matching:
-            pass
-        
         for f in files: f.close()
+        return res
     
-    thread = Worker(_run, app)
-    
-    if from_csv:
-        thread.withprogress(app, steps=os.stat(csv_filename).st_size)
+    steps = os.stat(csv_filename).st_size if from_csv else len(taxa_dataset)
+    if matched_ds:
+        fromname = os.path.basename(csv_filename) if from_csv else dataset_item.name
+        ds_name = "{} mapped to {}".format(fromname, target.name)
+        makeloader(app, _run, ds_name, steps)
     else:
-        thread.withprogress(app, steps=len(taxa_dataset))
-        
-    thread.error_raised.connect(app.show_error)
-    thread.start()
+        thread = Worker(_run, app)
+        thread.error_raised.connect(app.show_error)
+        thread.withprogress(app, steps)
+        thread.start()

File taxonome/gui/ui/map_names_wizard.ui

          </item>
         </layout>
        </item>
+       <item>
+        <widget class="QLabel" name="label_4">
+         <property name="text">
+          <string>Reading from a CSV file is recommended for very large datasets, which may not fit into the computer's memory, and for specimen data where each species may occur on several rows.</string>
+         </property>
+         <property name="wordWrap">
+          <bool>true</bool>
+         </property>
+        </widget>
+       </item>
       </layout>
      </widget>
     </item>
    <attribute name="pageId">
     <string notr="true">3</string>
    </attribute>
-   <layout class="QVBoxLayout" name="verticalLayout_12">
+   <layout class="QVBoxLayout" name="verticalLayout_2">
     <item>
      <widget class="QWidget" name="widget" native="true">
       <layout class="QVBoxLayout" name="verticalLayout_10">
       </layout>
      </widget>
     </item>
+    <item>
+     <widget class="QCheckBox" name="matched_ds">
+      <property name="text">
+       <string>Create dataset of source taxa with new names</string>
+      </property>
+     </widget>
+    </item>
+    <item>
+     <widget class="QLabel" name="label_5">
+      <property name="text">
+       <string>This is not recommended when matching big datasets read directly from file.</string>
+      </property>
+      <property name="wordWrap">
+       <bool>true</bool>
+      </property>
+     </widget>
+    </item>
    </layout>
   </widget>
  </widget>

File taxonome/taxa/collection.py

         yield taxon, t2
         
     tracker.reset()
+
+def run_match_taxa(taxa, target, **kwargs):
+    """Run the matching process without collecting the results - suitable
+    for use with big datasets.
+    """
+    for taxonpair in streaming_match_taxa(taxa, target, **kwargs):
+        pass