Commits

Anonymous committed 09f5c6f

Release v1.4a1 -- Documentation update and minor bug fixes

* Considerably expanded the documentation by adding pages for widgets
Extract XML, Intersect, Merge, Recode, Segment, and Select, as well as
an introductory illustration ("Mining Humanist"); minor corrections to
other documentation pages.
* Annotations added by auto-numbering now have an integer value (rather
than string) and thus can be used in scatterplots for instance.
* New predefined codes (__str_index_raw__ and __start_raw__, corresponding
to __str_index__-1 and __start__-1) can be used in widget Display's
Custom formatting field.
* Changed default names of some widgets' output channels.
* Modified storage location of settings in an attempt to avoid issues that
have been arising when updating to newer versions.
* Fixed a bug that caused a zero-division error when measuring the length
of empty segments.
* Fixed a bug that prevented user from disabling annotation copying in
the advanced settings of widget Preprocess.
* Fixed a bug related to progress bar handling in widget Select.

Comments (0)

Files changed (53)

_textable/widgets/LTTL/Processor.py

 #=============================================================================
-# Class LTTL.Processor, v0.14
+# Class LTTL.Processor, v0.16
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the LTTL package v1.4
                         progress_callback()
 
         # Create pivot crosstab...
+        if isinstance(context_types[0], int):
+            header_type = u'continuous'
+        else:
+            header_type = u'string'
         return(PivotCrosstab(
                 context_types,
                 unit_types,
                 },
                 {
                     'id':   u'__context__',
-                    'type': u'string',
+                    'type': header_type,
                 },
                 dict([(u, u'continuous') for u in unit_types]),
                 None,
                     if contexts['merge']:
 
                         # Store average and count...
-                        average_length = (
-                                num_units[context_type]
-                              / num_averaging_units[context_type]
-                        )
-                        if average_length < 1:
-                            average_length = 0.0
-                        values[
-                                (context_type, u'__length_average__')
-                        ] = average_length
-                        values[
-                                (context_type, u'__length_count__')
-                        ] = num_averaging_units[context_type]
-
-                    # Otherwise loop over context types...
-                    else:
-
-                        for context_type in context_types:
-
-                            # Store average and count for this context...
+                        try:
                             average_length = (
                                     num_units[context_type]
                                   / num_averaging_units[context_type]
                             values[
                                     (context_type, u'__length_count__')
                             ] = num_averaging_units[context_type]
+                        except ZeroDivisionError:
+                            pass
+
+                    # Otherwise loop over context types...
+                    else:
+
+                        for context_type in context_types:
+
+                            # Store average and count for this context...
+                            try:
+                                average_length = (
+                                        num_units[context_type]
+                                      / num_averaging_units[context_type]
+                                )
+                                if average_length < 1:
+                                    average_length = 0.0
+                                values[
+                                        (context_type, u'__length_average__')
+                                ] = average_length
+                                values[
+                                        (context_type, u'__length_count__')
+                                ] = num_averaging_units[context_type]
+                            except ZeroDivisionError:
+                                pass
 
                 # Store col ids...
                 if len(values) > 0:
                     num_averaging_units = len(averaging['segmentation'])
 
                     # Store average and count...
-                    average_length = len(units) / num_averaging_units
-                    if average_length < 1:
-                        average_length = 0.0
-                    values[
-                            (context_type, u'__length_average__')
-                    ] = average_length
-                    values[
-                            (context_type, u'__length_count__')
-                    ] = num_averaging_units
+                    try:
+                        average_length = len(units) / num_averaging_units
+                        if average_length < 1:
+                            average_length = 0.0
+                        values[
+                                (context_type, u'__length_average__')
+                        ] = average_length
+                        values[
+                                (context_type, u'__length_count__')
+                        ] = num_averaging_units
+                    except ZeroDivisionError:
+                        pass
 
                 # Store col ids...
                 if len(values) > 0:
             context_types.append(context_type)
 
         # Create Table...
+        if isinstance(context_types[0], int):
+            header_type = u'continuous'
+        else:
+            header_type = u'string'
         return(Table(
                 context_types,
                 col_ids,
                 {},
                 {
                     'id':   u'__context__',
-                    'type': u'string',
+                    'type': header_type,
                 },
                 dict([(c, u'continuous') for c in col_ids]),
                 None,
                 new_col_ids,
                 new_values,
                 {},
-                {
-                    'id':   u'__context__',
-                    'type': u'string',
-                },
+                counts.header_col.copy(),
                 dict([(c, u'continuous') for c in new_col_ids]),
                 None,
                 None,
                 [u'__annotation__'],
                 new_values,
                 {},
-                {
-                    'id':   u'__context__',
-                    'type': u'string',
-                },
+                counts.header_col.copy(),
                 {u'__annotation__': u'discrete'},
                 u'__annotation__',
                 None,

_textable/widgets/LTTL/Segmentation.py

 #=============================================================================
-# Class LTTL.Segmentation, v0.20
+# Class LTTL.Segmentation, v0.21
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the LTTL package v1.4
             if format:
                 segment_dict = default_dict.copy()
                 segment_dict.update(segment.annotations)
-                segment_dict['__num__']       = segment_count
-                segment_dict['__content__']   = segment.get_content()
-                segment_dict['__str_index__'] = str_index
-                segment_dict['__start__']     = start
-                segment_dict['__end__']       = end
+                segment_dict['__num__']             = segment_count
+                segment_dict['__content__']         = segment.get_content()
+                segment_dict['__str_index__']       = str_index
+                segment_dict['__start__']           = start
+                segment_dict['__end__']             = end
+                segment_dict['__str_index_raw__']   = str_index - offset
+                segment_dict['__start_raw__']       = start     - offset
+                segment_dict['__end_raw__']         = end
                 lines.append(format % segment_dict)
             else:
                 lines.extend([

_textable/widgets/LTTL/Segmenter.py

 #=============================================================================
-# Class LTTL.Segmenter, v0.18
+# Class LTTL.Segmenter, v0.19
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the LTTL package v1.4
                                                 previous_end_pos,
                                                 start_pos + m.start(),
                                         ),
-                                        new_segment_annotations
+                                        new_segment_annotations.copy()
                                 )
                         )
                         previous_end_pos = start_pos + m.end()
                                                 previous_end_pos,
                                                 segment_end_pos,
                                         ),
-                                        new_segment_annotations
+                                        new_segment_annotations.copy()
                                 )
                         )
                 # Sort segments...
         """Add annotation with integers from 1 to N to segments in a list"""
         counter = 1
         for segment in segment_list:
-            segment.annotations[annotation_key] = unicode(counter)
+            segment.annotations[annotation_key] = counter
             counter += 1
             if progress_callback:
                 progress_callback()

_textable/widgets/OWTextableAnnotation.py

 #=============================================================================
-# Class OWTextableAnnotation, v0.09
+# Class OWTextableAnnotation, v0.10
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableAnnotation',
+                'TextableAnnotation_0_10',
                 wantMainArea=0,
         )
         

_textable/widgets/OWTextableContext.py

 #=============================================================================
-# Class OWTextableContext, v0.07
+# Class OWTextableContext, v0.08
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableContext',
+                'TextableContext_0_08',
                 wantMainArea=0,
         )
         

_textable/widgets/OWTextableConvert.py

 #=============================================================================
-# Class OWTextableConvert, v0.09
+# Class OWTextableConvert, v0.10
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableConvert',
+                'TextableConvert_0_10',
                 wantMainArea=0,
         )
         

_textable/widgets/OWTextableCount.py

 #=============================================================================
-# Class OWTextableCount, v0.18
+# Class OWTextableCount, v0.19
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableCount',
+                'TextableCount_0_19',
                 wantMainArea=0,
         )
         

_textable/widgets/OWTextableDisplay.py

 #=============================================================================
-# Class OWTextableDisplay, v0.11
+# Class OWTextableDisplay, v0.12
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'Display',
+                'TextableDisplay_0_12',
                 wantMainArea=1
         )
         

_textable/widgets/OWTextableExtractXML.py

 #=============================================================================
-# Class OWTextableExtractXML, v0.09
+# Class OWTextableExtractXML, v0.11
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
 
 """
 <name>Extract XML</name>
-<description>Create a new segmentation based on xml markup</description>
+<description>Create a new segmentation based on XML markup</description>
 <icon>icons/ExtractXML.png</icon>
 <priority>4005</priority>
 """
                 self,
                 parent,
                 signalManager,
-                'TextableExtractXML',
+                'TextableExtractXML_0_11',
                 wantMainArea=0,
         )
 
         # Input and output channels...
         self.inputs  = [('Segmentation', Segmentation, self.inputData, Single)]
-        self.outputs = [('Extracted XML', Segmentation)]
+        self.outputs = [('Extracted data', Segmentation)]
         
         # Settings...
         self.conditions                 = []
         # Check that there's something on input...
         if not self.inputSegmentation:
             self.infoBox.noDataSent(u'No input.')
-            self.send('Extracted XML', None, self)
+            self.send('Extracted Data', None, self)
             return
 
         # Check that element field is not empty...
         if not self.element:
             self.infoBox.noDataSent(u'No XML element was specified.')
-            self.send('Extracted XML', None, self)
+            self.send('Extracted Data', None, self)
             return
 
         # Check that label is not empty...
         if not self.label:
             self.infoBox.noDataSent(u'No label was provided.')
-            self.send('Extracted XML', None, self)
+            self.send('Extracted Data', None, self)
             return
 
         # Check that importElementAs is not empty (if necessary)...
                 self.infoBox.noDataSent(
                         u'No annotation key was provided for element import.'
                 )
-                self.send('Extracted XML', None)
+                self.send('Extracted Data', None)
                 return
         else:
             importElementAs = None
                 self.infoBox.noDataSent(
                         u'No annotation key was provided for auto-numbering.'
                 )
-                self.send('Extracted XML', None, self)
+                self.send('Extracted Data', None, self)
                 return
         else:
             autoNumberKey = None
         message = pluralize(message, len(xml_extracted_data))
         self.infoBox.dataSent(message)
 
-        self.send( 'Extracted XML', xml_extracted_data, self)
+        self.send( 'Extracted Data', xml_extracted_data, self)
         self.sendButton.resetSettingsChangedFlag()
 
 

_textable/widgets/OWTextableIntersect.py

 #=============================================================================
-# Class OWTextableIntersect, v0.11
+# Class OWTextableIntersect, v0.13
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableIntersect',
+                'TextableIntersect_0_13',
                 wantMainArea=0,
         )
 
         # Input and output channels...
         self.inputs  = [('Segmentation', Segmentation, self.inputData, Multiple)]
         self.outputs = [
-            ('Filtered data', Segmentation, Default),
+            ('Selected data', Segmentation, Default),
             ('Discarded data', Segmentation)
         ]
         
         # Check that there's something on input...
         if len(self.segmentations) == 0:
             self.infoBox.noDataSent(u'No input.')
-            self.send('Filtered data', None, self)
+            self.send('Selected data', None, self)
             return
 
         # Check that label is not empty...
         if not self.label:
             self.infoBox.noDataSent(u'No label was provided.')
-            self.send('Filtered data', None, self)
+            self.send('Selected data', None, self)
             return
 
         # Source and filtering parameter...
                 self.infoBox.noDataSent(
                         u'No annotation key was provided for auto-numbering.'
                 )
-                self.send('Filtered data', None, self)
+                self.send('Selected data', None, self)
                 return
         else:
             autoNumberKey = None
         message = pluralize(message, len(filtered_data))
         self.infoBox.dataSent(message)
 
-        self.send( 'Filtered data', filtered_data, self)
+        self.send( 'Selected data', filtered_data, self)
         self.send( 'Discarded data', discarded_data, self)
         self.sendButton.resetSettingsChangedFlag()
 

_textable/widgets/OWTextableLength.py

 #=============================================================================
-# Class OWTextableLength, v0.11
+# Class OWTextableLength, v0.12
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableLength',
+                'TextableLength_0_12',
                 wantMainArea=0,
         )
         

_textable/widgets/OWTextableMerge.py

 #=============================================================================
-# Class OWTextableMerge, v0.16
+# Class OWTextableMerge, v0.17
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableMerge',
+                'TextableMerge_0_17',
                 wantMainArea=0,
         )
         

_textable/widgets/OWTextablePreprocess.py

 #=============================================================================
-# Class OWTextablePreprocess, v0.08
+# Class OWTextablePreprocess, v0.09
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextablePreprocess',
+                'TextablePreprocess_0_09',
                 wantMainArea=0,
         )
         
                 self.segmentation,
                 mode                = 'standard',
                 label               = self.label,
-                copy_annotations    = self.copyAnnotations,
+                copy_annotations    = copyAnnotations,
                 progress_callback   = progressBar.advance,
         )
         progressBar.finish()

_textable/widgets/OWTextableRecode.py

 #=============================================================================
-# Class OWTextableRecode, v0.09
+# Class OWTextableRecode, v0.10
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableRecode',
+                'TextableRecode_0_10',
                 wantMainArea=0,
         )
 

_textable/widgets/OWTextableSegment.py

 #=============================================================================
-# Class OWTextableSegment, v0.16
+# Class OWTextableSegment, v0.17
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableSegment',
+                'TextableSegment_0_17',
                 wantMainArea=0,
         )
 

_textable/widgets/OWTextableSelect.py

 #=============================================================================
-# Class OWTextableSelect, v0.12
+# Class OWTextableSelect, v0.13
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableSelect',
+                'TextableSelect_0_13',
                 wantMainArea=0,
         )
 
                     return
 
                 # Get number of iterations...
-                num_iterations = sampleSize
+                num_iterations = len(self.segmentation)
 
             # Else if mode is Threshold...
             elif self.method == u'Threshold':

_textable/widgets/OWTextableTextField.py

 #=============================================================================
-# Class OWTextableTextField, v0.08
+# Class OWTextableTextField, v0.09
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableTextField',
+                'TextableTextField_0_09',
                 wantMainArea=0,
         )
 
         self.uuid               = getWidgetUuid(self)
 
         # Other attributes...
-        self.fileIndex          = 0
         self.infoBox            = InfoBox(widget=self.controlArea)
         self.sendButton         = SendButton(
                 widget              = self.controlArea,
 
     def sendData(self):
 
-        """Open file, read and normalize content, then send Text object"""
+        """Normalize content, then create and send segmentation"""
 
         # Get, convert and normalize field content...
         textFieldContent = unicode(
         message = pluralize(message, len(textFieldContent))
         self.infoBox.dataSent(message)
 
-        # Store content in the data array and set associated label.
+        # Update segmentation.
         self.segmentation.update(textFieldContent, label=self.label)
 
         # Send token...

_textable/widgets/OWTextableTextFiles.py

 #=============================================================================
-# Class OWTextableTextFiles, v0.12
+# Class OWTextableTextFiles, v0.13
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableTextFiles',
+                'TextableTextFiles_0_13',
                 wantMainArea=0,
         )
 
                     filename = os.path.basename(filePath)
                     annotation[self.importFilenamesKey] = filename
                 if self.autoNumber and self.autoNumberKey:
-                    annotation[self.autoNumberKey] = unicode(counter)
+                    annotation[self.autoNumberKey] = counter
                     counter += 1
             annotations.append(annotation)
             

_textable/widgets/OWTextableURLs.py

 #=============================================================================
-# Class OWTextableURLs, v0.09
+# Class OWTextableURLs, v0.10
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableURLs',
+                'TextableURLs_0_10',
                 wantMainArea=0,
         )
 
                 if self.importURLs and self.importURLsKey:
                     annotation[self.importURLsKey] = URL
                 if self.autoNumber and self.autoNumberKey:
-                    annotation[self.autoNumberKey] = unicode(counter)
+                    annotation[self.autoNumberKey] = counter
                     counter += 1
             annotations.append(annotation)
             

_textable/widgets/OWTextableVariety.py

 #=============================================================================
-# Class OWTextableVariety, v0.10
+# Class OWTextableVariety, v0.11
 # Copyright 2012-2014 LangTech Sarl (info@langtech.ch)
 #=============================================================================
 # This file is part of the Textable (v1.4) extension to Orange Canvas.
                 self,
                 parent,
                 signalManager,
-                'TextableVariety',
+                'TextableVariety_0_11',
                 wantMainArea=0,
         )
         

docs/rst/annotation_based_selection.rst

 Thus, in the case of the XML data example introduced
 :doc:`here <converting_xml_markup_annotations>` (and further developed
 :doc:`there <merging_units_annotations>`), we might insert an instance of
-:ref:`Select` between those of :ref:`Select` and :ref:`Count` (see
+:ref:`Select` between those of :ref:`Extract XML` and :ref:`Count` (see
 :ref:`figure 1 <annotation_based_selection_fig1>` below) in order to include
 only "content words".
 
 
     Figure 1: Inserting an instance of :ref:`Select` to filter a segmentation.
 
-In this simplified example, the :ref:`Intersect` instance could thus be
-parametered as indicated on :ref:`figure 2 <annotation_based_selection_fig1>`
-below), so as to exclude (**Mode: Exclude**) those segments whose annotation
-value for key *type* (**Annotation key**: *type*) is *DET* or *PREP*
-(**Regex:** ``^(DET|PREP)$``).
+In this simplified example, the :ref:`Select` instance could thus be
+parameterized as indicated on :ref:`figure 2
+<annotation_based_selection_fig2>` below, so as to exclude (**Mode:
+Exclude**) those segments whose annotation value for key *type* (**Annotation
+key**: *type*) is *DET* or *PREP* (**Regex:** ``^(DET|PREP)$``).
 
 .. _annotation_based_selection_fig2:
 
     :alt: Inserting an instance of Select to filter a segmentation
     :figclass: align-center
 
-    Figure 1: Excluding segments based on annotation values with :ref:`Select`.
+    Figure 2: Excluding segments based on annotation values with :ref:`Select`.
 

docs/rst/extract_xml.rst

 Extract XML
 ===========
 
-In preparation.
+.. image:: figures/ExtractXML_54.png
+
+Create a new segmentation based on XML markup.
+
+Signals
+-------
+
+Inputs:
+
+* ``Segmentation``
+
+  Segmentation covering XML data based on which a new segmentation will be
+  created
+
+Outputs:
+
+* ``Extracted data``
+
+  Segmentation containing the segments corresponding to extracted XML elements
+
+Description
+-----------
+
+This widget inputs a segmentation, searches its content for portions
+corresponding to a specific XML element type, and creates a segment for each
+occurrence of this element. It should be noted that if a given occurrence is
+distributed among several segments of the input segmentation, it will result
+in the creation of as many segments in the output segmentation.
+
+Every attribute from extracted elements is automatically converted into an
+annotation in the output segmentation. For example, extracting the element
+*<div>* in the following fragment:
+
+::
+
+    <div type="interjection">Cripes!</div>
+
+will result in the creation of a segment whose content is *Cripes!* and whose
+annotation value for key *type* is *interjection*.
+
+This widget offers the easiest and most flexible way to import into Orange
+Textable v1.4 a segmentation and arbitrary annotations specified by the user
+for a given text. Let us however mention the following limitation: the widget
+automatically deletes all segments of zero length in the output segmentation.
+As a consequence, it is impossible to import empty XML elements (be they in
+the form *<element></element>* or *<element/>*).
+
+Basic interface
+~~~~~~~~~~~~~~~
+
+In the basic widget interface (see :ref:`figure 1 <extract_xml_fig1>` below),
+the **XML Extraction** section allows the user to specify the XML element to
+extract (**XML element**). The widget indeed only allows the extraction of a
+single type of element at a time; however, it extracts every occurrence of
+this element, including those embedded in other occurrences of the same type.
+
+.. _extract_xml_fig1:
+
+.. figure:: figures/extract_xml_basic_example.png
+    :align: center
+    :alt: Basic interface of the Extract XML widget
+    :figclass: align-center
+
+    Figure 1: **Extract XML** widget (basic interface).
+    
+The **Remove markup** checkbox triggers the deletion of XML tags embedded
+within the extracted XML elements, if any. An important consequence of the
+use of this option is that the extracted elements will potentially be
+decomposed into several segments corresponding to portions of their content
+which are separated by the deleted XML tags (see `Advanced interface`_ for an
+example of this mechanism [#]_).
+
+The **Options** section only lets the user choose the output segmentation
+label. By default, the input segment annotations are copied to the output
+segments.
+
+The **Info** section indicates the number of segments in the output
+segmentation, or the reasons why no segmentation is emitted (no input data,
+no output segment created, etc.).
+
+The **Send** button triggers the emission of data (here, a segmentation) to
+the output connection(s). When it is selected, the **Send automatically**
+checkbox disables the button and the widget attempts to automatically emit
+a segmentation at every modification of its interface or when its input data
+are modified (by deletion or addition of a connection, or because modified
+data is received through an existing connection).
+
+Advanced interface
+~~~~~~~~~~~~~~~~~~
+
+The **XML Extraction** section of the widget interface (see :ref:`figure 2
+<extract_xml_fig2>` below) allows the user to configure the XML element
+extraction. The field **XML element** allows the user to indicate the XML
+element type which should be sought. The **Import element with key** checkbox
+enables the program to assign to each output segment an annotation whose key
+is the text contained in the field immediately on the right and whose value is
+the name of the XML element extracted by the widget.
+
+.. _extract_xml_fig2:
+
+.. figure:: figures/extract_xml_advanced_example.png
+    :align: center
+    :alt: Advanced interface of the Extract XML widget
+    :figclass: align-center
+
+    Figure 2: **Extract XML** widget (advanced interface).
+
+If the **Remove markup** checkbox is selected, XML tags embedded within the
+extracted XML elements will be excluded from the output segmentation. An
+important consequence of the use of this option is that the extracted elements
+will potentially be decomposed in several segments corresponding to portions
+of their content which are separated by the excluded XML tags. For example,
+given the following fragment:
+
+::
+
+	<text>a <keyword>fragment</keyword> of XML data</text>
+
+the extraction of element *<text>* will lead to the creation of three
+segments:
+
+::
+
+    a
+
+::
+
+    fragment
+
+::
+
+    of XML data
+
+If on the other hand the **Remove markup** option is not selected, a single
+segment will be created:
+
+::
+
+    a <keyword>fragment</keyword> of XML data
+    
+The **Prioritize shallow attributes** checkbox determines the behavior of the
+widget in the very particular case where (a) elements of the extracted type
+are (exactly) embedded in one another, (b) they have different values for the
+same attribute, (c) the **Remove markup** option is selected and (d) the
+**Merge duplicate segments** option (section **Options**) as well. This could
+be the case in the extraction of the *<div>* element in the following fragment
+for example:
+
+::
+
+    <div type="A"><div type="B">
+    two exactly embedded elements
+    </div></div>
+
+In such a case, the widget will first create two segments that have the exact
+same address (since the embedded XML tags are deleted with **Remove markup**),
+then by the effect of **Merge duplicate segments**, it will seek to fuse them
+into one. It will only be able to keep one of the rival annotation values *A*
+and *B* for the annotation key *type*; by default, it will be the value
+associated to the element closest to the root in the XML tree, namely *A*.
+If on the other hand the **Prioritize shallow attributes** option is selected,
+the value of the element closest to the "surface" will be kept, in our example
+*B*.
+
+The **Conditions** subsection included in the **XML Extraction** section
+allows the user to limit the extraction by specifying conditions bearing on
+attributes of the extracted elements. These conditions are expressed in the
+form of regular expressions that the given attribute values must match. In the
+list appearing at the top of this subsection, the columns indicate (a) the
+concerned attribute, (b) the corresponding regular expression, and (c) the
+options associated to this expression. [#]_
+
+In :ref:`figure 2 <extract_xml_fig2>` above, we have thus limited the
+extraction only to the *<div>* elements that have a type attribute whose value
+is *poem*. If several conditions were defined, they would all have to be
+fulfilled for an element to be extracted. The buttons on the right enable the
+user to delete the selected condition (**Remove**) or to empty the list
+completely (**Clear All**).
+
+The remaining part of the **Conditions** subsection allows the user to add new
+conditions to the list. To do so, the attribute in question (**Attribute**)
+and the corresponding regular expression (**Regex**) must be specified. The
+**Ignore case (i)**, **Unicode dependent (u)**, **Multiline (m)** and **Dot
+matches all (s)** checkboxes manage the application of the corresponding
+options to the regular expression. Adding the new condition to the list is
+finally carried out by clicking on the **Add** button.
+
+The **Options** section allows the user to specify the output segmentation
+label. The **Auto-number with key** checkbox enables the program to
+automatically number the segments of the output segmentation and to associate
+the number to the annotation key specified in the text field on the right. The
+**Import annotations** checkbox copies in each output segment every annotation
+associated to the corresponding segment of the input segmentation. The **Merge
+duplicate segments** checkbox enables the program to fuse distinct segments
+whose addresses are the same in a single segment; the annotations associated
+to the fused segments are copied in the single resulting segment. [#]_
+
+The **Info** section indicates the number of segments in the output
+segmentation, or the reasons why no segmentation is emitted (no input data,
+no output segment created, etc.).
+
+The **Send** button triggers data emission, as it happens a segmentation, to
+the output connection(s). When it is selected, the **Send automatically**
+checkbox disables the button and the widget attempts to automatically emit
+a segmentation at every modification of its interface or when its input data
+are modified (by deletion or addition of a connection, or because modified
+data is received through an existing connection).
+
+Examples
+--------
+
+* :doc:`Converting XML markup to annotations <converting_xml_markup_annotations>`
+
+See also:
+
+* :doc:`Merging units with annotations <merging_units_annotations>`
+
+.. [#] In comparison with the advanced interface, it should also be noted that
+       in the basic interface the options **Prioritize shallow attributes**
+       and **Merge duplicate segments** are disabled by default.
+
+.. [#] See `Python documentation <http://docs.python.org/library/re.html>`_.
+
+.. [#] In the case where the fused segments have distinct values for the same
+       annotation key, only the value of the last segment (in the order of the
+       extracted segments before fusion) will be retained.
+
 

docs/rst/figures/Thumbs.db

Binary file modified.

docs/rst/figures/banner.jpg

Added
New image

docs/rst/figures/extract_xml_advanced_example.png

Added
New image

docs/rst/figures/extract_xml_basic_example.png

Added
New image

docs/rst/figures/merge_advanced_example.png

Added
New image

docs/rst/figures/mining_humanist_recode.png

Added
New image

docs/rst/figures/mining_humanist_results.png

Added
New image

docs/rst/figures/mining_humanist_schema.png

Added
New image

docs/rst/figures/recode_advanced_example.png

Added
New image

docs/rst/figures/recode_basic_example.png

Added
New image

docs/rst/figures/segment_advanced_example.png

Added
New image

docs/rst/figures/select_advanced_regex_example.png

Added
New image

docs/rst/figures/select_advanced_sample_example.png

Added
New image

docs/rst/figures/select_advanced_threshold_example.png

Added
New image

docs/rst/figures/urls_advanced_example.png

Old
Old image
New
New image

docs/rst/illustration.rst

+Illustration: mining Humanist
+=============================
+
+The following example is meant to show *what* Orange Textable typically does,
+without considering (for now) every detail of *how* it does it.
+
+In a paper reflecting on terminology in the field of Digital Humanities
+[#]_, Patrik Svensson compares the evolution of the frequency of expressions
+*Humanities Computing* and *Digital Humanities* in 20 years of archives of
+the `Humanist discussion group <http://dhhumanist.org/>`_. He uses these
+figures to show that while the former denomination remains prevalent over
+these two decades, the latter has been quickly gaining ground since the 2000s.
+
+The same experiment can be run with Orange Textable, by building a "visual
+program" like the one shown on :ref:`figure 1 <illustration_fig1>` below:
+
+.. _illustration_fig1:
+
+.. figure:: figures/mining_humanist_schema.png
+    :align: center
+    :alt: Mining Humanist with an Orange Textable schema
+    :figclass: align-center
+    :scale: 80%
+
+    Figure 1: Mining Humanist with an Orange Textable schema.
+
+Such a program is called a *schema*. Its visible part consists of a network
+of interconnected units called *widget instances*. Each instance belongs to a
+type, e.g. :ref:`URLs`, :ref:`Recode`, :ref:`Segment`, and so on. Widgets
+are the basic blocks with which a variety of text analysis applications can be
+built. Each corresponds to a fundamental operation, such as "import data from
+an online source" (:ref:`URLs`) or "replace specific text patterns with
+others" (:ref:`Recode`) for example. Connections between instances determine
+the flow of data in the schema, and thus the order in which operations are
+carried out. Several parallel paths can be constructed, as demonstrated here
+by the :ref:`Recode` instance, which sends data to :ref:`Segment` as well as
+:ref:`Count`.
+
+Widget instances can (and indeed must) be individually parameterized in order
+to "fine-tune" their operation. For example, double-clicking on the
+:ref:`Recode` instance of :ref:`figure 1 <illustration_fig1>` above displays
+the interface shown on :ref:`figure 2 <illustration_fig2>` below. What this
+particular configuration means is that every line beginning with symbol "|" or
+">" (**Regex** field) should be replaced with an empty string (**Replacement
+string**): in other words, remove those lines that are marked as being part
+of a reply to another message. There is a fair amount of variation between
+widget interfaces, but regular expressions play an important role in many of
+them and Orange Textable's flexibility owes a lot to them.
+
+.. _illustration_fig2:
+
+.. figure:: figures/mining_humanist_recode.png
+    :align: center
+    :alt: Interface of Recode widget in the Humanist example
+    :figclass: align-center
+
+    Figure 2: Interface of the :ref:`Recode` widget.
+
+After executing the schema of :ref:`figure 1 <illustration_fig1>` above, the
+resulting frequencies can be viewed by double-clicking on the **Data Table**
+instance, whose interface is shown on :ref:`figure 3 <illustration_fig3>`
+below. On the whole, these figures lend themselves to the same interpretation
+as that of Patrik Svensson, but they differ wildly from the frequencies he
+reports. This might be explained by the fact that, in the present
+illustration, we have used *preprocessed* data `made available on the Humanist
+website <http://dhhumanist.org/text.html>`_, or it might be that we have not
+processed the data exactly like Svensson did. The user can always refer to the
+Orange Textable schema (including the parameters of each instance) to
+understand exactly the operations that it performs. [#]_ In this sense, Orange
+Textable does not only attempt to make the construction of text analysis
+programs easier; it aims to make *communicating* and *understanding* such
+programs easier.
+
+.. _illustration_fig3:
+
+.. figure:: figures/mining_humanist_results.png
+    :align: center
+    :alt: Monitoring the frequency of two expressions over time
+    :figclass: align-center
+
+    Figure 3: Monitoring the frequency of *Humanities Computing* vs. *Digital Humanities*.
+
+.. [#] Svensson, P. (2009). Humanities Computing as Digital Humanities.
+       *Digital Humanities Quarterly 3(3)*. Available `here
+       <http://digitalhumanities.org/dhq/vol/3/3/000065/000065.html>`_.
+
+.. [#] The schema can be downloaded from :download:`here
+       <schemas/humanist_mining_example.ows>`. Note that two decades of
+       Humanist archives weigh dozens of megabytes and that retrieving these
+       data from the Internet can take a few minutes depending on bandwidth.
+       Please be patient if Orange Textable appears to be stalled when the
+       schema is being opened.
+
+
-Orange Textable documentation
+.. image:: figures/banner.jpg
+   
+Orange Textable documentation
 =============================
 
-Textable is an add-on for Orange_ data mining software package. It enables users to build data
-tables on the basis of text data, by means of a flexible and intuitive
-interface. It offers in particular the following features:
+Orange Textable is an add-on for the Orange_ data mining software package. It
+enables users to build data tables on the basis of text data, by means of a
+flexible and intuitive interface. Look at the following :doc:`example
+<illustration>` to see it in action.
+
+Orange Textable offers in particular the following features:
 
 - import text data from various sources
 - apply systematic recoding operations
 
 .. _Orange: http://orange.biolab.si/
 
-Textable was designed and implemented by `LangTech Sàrl <http://langtech.ch>`_
-on behalf of the department of language and information sciences (SLI_) at the
-`University of Lausanne <http://www.unil.ch>`_ (see :doc:`Credits <credits>`).
+Orange Textable was designed and implemented by `LangTech Sàrl 
+<http://langtech.ch>`_ on behalf of the department of language and information 
+sciences (SLI_) at the `University of Lausanne <http://www.unil.ch>`_ (see 
+:doc:`Credits <credits>`).
 
 .. _SLI: http://www.unil.ch/sli
 
 .. toctree::
     :maxdepth: 3
    
+    Illustration: mining Humanist <illustration>
     Installation <installation>
     Getting started <getting_started>
     Widget reference <widget_reference>

docs/rst/intersect.rst

 Intersect
 =========
 
-In preparation.
+.. image:: figures/Intersect_54.png
+
+In-/exclude segments based on another segmentation.
+
+Signals
+-------
+
+Inputs:
+
+* ``Segmentation``
+
+  Segmentation out of which a subset of segments should be selected
+  ("source" segmentation), or containing the segments that will be
+  in-/excluded from the former ("filter" segmentation).
+
+Outputs:
+
+* ``Selected data`` (default)
+
+  Segmentation containing the selected segments
+
+* ``Discarded data``
+
+  Segmentation containing the discarded segments
+
+Description
+-----------
+
+This widget inputs several segmentations and selects the segments of one of
+them ("source" segmentation) on the basis of the segments present in another
+("filter" segmentation). It also emits on an output connection (not selected
+by default) a segmentation containing the segments that were *not* selected.
+
+Basic interface
+~~~~~~~~~~~~~~~
+
+The **Intersect** section of the widget's basic interface (see :ref:`figure 1
+<intersect_fig1>` below) allows the user to specify if the segments of the
+source segmentation that correspond to a type present in the filter
+segmentation should be included (**Mode: Include**) in the output segmentation
+or excluded (**Mode: Exclude**) from it. This section is also designed to
+select the source segmentation (**Source segmentation**) and the filter
+segmentation (**Filter segmentation**) among the input segmentations. [#]_
+
+.. _intersect_fig1:
+
+.. figure:: figures/intersect_example.png
+    :align: center
+    :alt: Basic interface of the Intersect widget
+    :figclass: align-center
+
+    Figure 1: **Intersect** widget (basic interface).
+
+The **Source annotation key** drop-down menu allows the user to select an
+annotation key from the source segmentation; thus the segments whose
+annotation value for this key corresponds to a type present in the filter
+segmentation will be in-/excluded. If the value *(none)* is selected, the
+segment content will be decisive.
+
+Thus in :ref:`figure 1 <intersect_fig1>` above, the widget inputs two
+segmentations. The first (**Source segmentation**), whose label is *words*, is
+the result of the segmentation of a text in words, as performed with the
+:ref:`Segment` widget for instance. The second (**Filter segmentation**),
+whose label is *stopwords*, is the result of the segmentation in words of a
+list of so-called "stopwords" (articles, pronouns, prepositions,
+etc.)--typically deemed irrelevant for information retrieval.
+
+Since the **Source annotation key** drop-down menu is set on *(none)*,
+the content of input segments will determine the next steps (rather than the
+values of some annotation key). Concretely, the source segmentation segments
+(namely the words from the text) whose content matches that of a segment from
+the filter segmentation (namely a stopword) will be excluded (**Mode:
+Exclude**) from the output segmentation. By contrast, choosing the value
+**Include** would result in including as output only the stopwords from the
+text.
+
+The **Options** section limits itself to the output segmentation label choice.
+[#]_ By default, annotations are systematically copied from input to output
+segments.
+
+The **Info** section indicates the number of segments in the output
+segmentation, or the reasons why no segmentation is emitted (no input data,
+no selected input segment, etc.).
+
+The **Send** button triggers data emission, as it happens a segmentation, to
+the output connection(s). When it is selected, the **Send automatically**
+checkbox disables the button and the widget attempts to automatically emit
+a segmentation at every modification of its interface or when its input data
+are modified (by deletion or addition of a connection, or because modified
+data is received through an existing connection).
+
+Advanced interface
+~~~~~~~~~~~~~~~~~~
+
+The main difference between the widget's basic and advanced interfaces is that
+in the latter, section **Intersect** includes a **Filter annotation key**
+drop-down menu. If a given annotation key of the filter segmentation is
+selected, the corresponding annotation value (rather than *content*) types
+will condition the in-/exclusion of the source segmentation segments.
+
+The advanced interface also offers two additional controls in section
+**Options**. The **Auto-number with key** checkbox enables the program to
+automatically number the segments from the output segmentation and to
+associate their number to the annotation key specified in the text field on
+the right. The **Copy annotations** checkbox copies every annotation from the
+input segmentation to the output segmentation.
+
+Examples
+--------
+
+* :doc:`Using a segmentation to filter another <using_segmentation_filter_another>`
+
+.. [#] It should be noted that the interface does not prevent the user from
+       selecting the same segmentation as source and filter, which can only
+       make sense if different values are selected in the **Source annotation
+       key** and **Filter annotation key** menus (the latter being only
+       available when the **Advanced settings** checkbox is selected).
+
+.. [#] Here it concerns the segmentation containing the selected segments and
+       emitted on the default output channel; the segmentation containing the
+       discarded segments receives the same label prepended with *NEG\_*.
+
+
 
 Merge
 =====
 
-In preparation.
+.. image:: figures/Merge_54.png
+
+Merge two or more segmentations.
+
+Signals
+-------
+
+Inputs:
+
+* ``Segmentation``
+
+  Any number of segmentations that should be merged together
+
+Outputs:
+
+* ``Merged data``
+
+  Merged segmentation
+
+Description
+-----------
+
+This widget takes several input segmentations, successively copies each
+segment of each input segmentation to form a new segmentation, and sends this
+segmentation to its output connections.
+
+.. _merge_fig1:
+
+.. figure:: figures/merge_advanced_example.png
+    :align: center
+    :alt: Merge widget (advanced interface)
+    :figclass: align-center
+
+    Figure 1: **Merge** widget (advanced interface).
+
+The **Ordering** section of the widget interface (see :ref:`figure 1
+<merge_fig1>` above) allows the user to select the order in which the input
+segmentations are placed to form the merged output segmentation. The label of
+each input segmentation appears on a line of the list and can be selected then
+moved by clicking on the **Move Up** and **Move Down** buttons.
+
+The **Options** section allows the user to specify the label assigned to the
+output segmentation (**Output segmentation label**). The **Import labels with
+key** checkbox enables the user to create for each input segmentation an
+annotation whose value is the segmentation label (as displayed in the list)
+and whose key is specified by the user in the text field on the right of the
+checkbox. Similarly, the **Auto-number with key** checkbox enables the program
+to automatically number the output segments and to associate the number to the
+annotation key specified in the text field on the right. The **Copy
+annotations** checkbox copies every input segmentation annotation to the
+output segmentation.
+
+The last two elements of the **Options** section influence the ordering of
+segments in the output segmentation as well as their count. The **Sort
+segments** checkbox enables the program to sort the segments on the basis of
+their address (string index first, then initial position, and final position);
+this option is typically useful to rearrange segments that belong to
+different segmentations of a single text in their order of occurrence in the
+text. [#]_ The **Merge duplicate segments** checkbox enables the program to
+fuse into a single segment several distinct segments whose addresses are the
+same; the annotations associated to the fused segments are all copied in the
+single resulting segment. [#]_
+
+When the **Advanced settings** checkbox is not selected, only the **Output
+segmentation label** and **Import labels with key** options are accessible.
+In that case, auto-numbering is disabled, annotations are copied by default,
+and segments are sorted by address but not fused.
+
+The **Info** section indicates the number of segments in the output
+segmentation, or the reasons why no segmentation is emitted (no input data,
+no label specified for the output segmentation, etc.).
+
+The **Send** button triggers data emission, as it happens a segmentation, to
+the output connection(s). When it is selected, the **Send automatically**
+checkbox disables the button and the widget attempts to automatically emit
+a segmentation at every modification of its interface or when its input data
+are modified (by deletion or addition of a connection, or because modified
+data is received through an existing connection).
+
+Examples
+--------
+
+* :doc:`Merging segmentations together <merging_segmentations_together>`
+* :doc:`Annotating by merging <annotating_merging>`
+
+See also:
+
+* :doc:`Tagging table rows with annotations <tagging_table_rows_annotations>`
+* :ref:`Preprocess (section "Caveat") <anchor_to_caveat>`
+
+.. [#] Note that if sorting is enabled, it may well result in segments being
+       ordered in a different way than specified by the user in the
+       **Ordering** section.
+
+.. [#] In the case where the fused segments have distinct values for the same
+       annotation key, only the value of the last segment (in the order of the
+       output segmentation before fusion) will be retained.
+
 

docs/rst/preprocess.rst

 
 * ``Segmentation``
 
-  Segmentation covering the text to be preprocessed
+  Segmentation covering the text that should be preprocessed
 
 Outputs:
 
 
 The **Send** button triggers data emission, as it happens a segmentation, to
 the output connection(s). When it is selected, the **Send automatically**
-checkbox deactivates the button and the widget attempts to automatically emit
-a segmentation at every modification of its interface (editing of the text or
-label modification) or when its input data are modified (by deletion or
-addition of a connection, or because modified data is received through an
-existing connection).
+checkbox disables the button and the widget attempts to automatically emit
+a segmentation at every modification of its interface or when its input data
+are modified (by deletion or addition of a connection, or because modified
+data is received through an existing connection).
+
+.. _anchor_to_caveat:
 
 Caveat
 ------
 Recode
 ======
 
-In preparation.
+.. image:: figures/Recode_54.png
+
+Custom text recoding using regular expressions.
+
+Signals
+-------
+
+Inputs:
+
+* ``Segmentation``
+
+  Segmentation covering the text that should be recoded
+
+Outputs:
+
+* ``Recoded text data``
+
+  Segmentation covering the recoded text
+
+Description
+-----------
+
+This widget inputs a segmentation, creates a modified copy of its content, and
+outputs a new segmentation corresponding to the modified data. The
+modifications applied are defined by *substitutions*, namely pairs
+composed of a regular expression (designed to identify portions of text that
+should be modified) and a replacement string.
+
+It is possible to "capture" text portions using parentheses appearing in the
+regular expressions, in order to insert them in the replacement strings, where
+sequences ``&1``, ``&2``, etc. correspond to the successive pairs of
+parentheses (numbered on the basis of the position of the opening
+parenthesis).
+
+Note that **Recode** creates a copy of each modified segment, which
+increases the program's memory footprint; moreover this widget can only work
+on segmentations without any overlap, which means no part of the text is
+covered by more than one segment.
+
+Basic interface
+~~~~~~~~~~~~~~~
+
+The basic version of the widget is limited to the application of a single
+substitution. Section **Substitution** (see :ref:`figure 1
+<recode_fig1>` below) allows the user to specify the regular expression
+(**Regex**) and the corresponding replacement string (**Replacement string**).
+If the replacement string is left empty, the text parts identified by the
+regular expression will simply be deleted; it is the case in the example of
+:ref:`figure 1 <recode_fig1>`, which leads to the deletion of XML/HTML
+tags. [#]_
+
+.. _recode_fig1:
+
+.. figure:: figures/recode_basic_example.png
+    :align: center
+    :alt: Basic interface of the Recode widget
+    :figclass: align-center
+
+    Figure 1: **Recode** widget (basic interface).
+
+The **Options** section allows the user to define the output segmentation
+label. The annotations of each input segment are systematically copied in the
+corresponding output segments (see `Advanced interface`_, option **Copy
+annotations**).
+
+The **Info** section indicates the number of segments present in the output
+segmentation, or the reasons why no segmentation is emitted (no input data,
+overlaps in the input segmentation, etc.).
+
+The **Send** button triggers data emission, as it happens a segmentation, to
+the output connection(s). When it is selected, the **Send automatically**
+checkbox disables the button and the widget attempts to automatically emit
+a segmentation at every modification of its interface or when its input data
+are modified (by deletion or addition of a connection, or because modified
+data is received through an existing connection).
+
+Advanced interface
+~~~~~~~~~~~~~~~~~~
+
+In its advanced version, the **Recode** widget allows the user to define
+several substitutions and to determine the order in which they should
+successively be applied to each segment of the input segmentation.
+
+.. _recode_fig2:
+
+.. figure:: figures/recode_advanced_example.png
+    :align: center
+    :alt: Advanced interface of the Recode widget
+    :figclass: align-center
+
+    Figure 2: **Recode** widget (advanced interface).
+
+The advanced interface (see :ref:`figure 2 <recode_fig2>` above) presents
+similarities with that of the :ref:`Text Files`, :ref:`URLs`, and
+:ref:`Segment` widgets. The **Substitutions** section allows the user to
+define the substitutions applied to each successive input segment and to
+determine their application order. In the list displayed at the top of the
+window, each line specifies a substitution, and the columns indicate for each
+substitution (a) the corresponding regular expression, (b) the (possibly
+empty) replacement string, and (c) the options associated with the regular
+expression. [#]_
+
+On :ref:`figure 2 <recode_fig2>` above, we can see that three substitutions
+have been specified. The first deletes XML/HTML tags (it replaces them with
+the empty string). The second replaces occurrences of British English forms
+(*behaviour*, *colour*, and *neighbour*, possibly capitalized, since the
+*Ignore case* option is selected) with their American English variants
+(*behavior*, *color*, and *neighbor*), while the last replaces sequences
+like *a X of mine* with *my X*; thus they illustrate the possibility to
+"capture" text portions through parentheses appearing in the regular
+expression.
+
+To take a concrete example, the successive application of these three
+substitutions to string
+
+::
+
+ 	<example>I've just met a neighbour of mine.</example>
+
+will produce in turn the modified versions
+
+::
+
+    I've just met a neighbour of mine.
+
+::
+
+    I've just met a neighbor of mine.
+
+::
+
+    I've just met my neighbor.
+
+The first buttons on the right of the substitution list allow the user to
+modify the order in which substitutions are successively applied to each
+segment of the input segmentation (**Move Up** and **Move Down**), to delete a
+substitution from the list (**Remove**) or to empty it entirely (**Clear
+All**). Except for
+**Clear All**, all of these buttons require the selection of an entry in the
+list beforehand. **Import List** enables the user to import a list of
+substitutions in JSON format (see :doc:`JSON im-/export format <json_format>`)
+and to add them to those already selected. **Export List** enables the
+user on the contrary to export the list of substitutions in a JSON format
+file.
+
+The remaining part of the **Substitutions** section allows the user to add new
+substitutions to the list. To define a new substitution, one must specify the
+regular expression (**Regex**) and the corresponding replacement string
+(**Replacement string**); the latter can be left empty, in which case the text
+portions identified by the regular expression will simply be deleted. The
+**Ignore case (i)**, **Unicode dependent (u)**, **Multiline (m)** and **Dot
+matches all (s)** checkboxes control the application of the corresponding
+options to the regular expression. Adding the new substitution to the list is
+achieved by clicking on the **Add** button.
+
+The **Options** section allows the user to define the output segmentation
+label. The **Copy annotations** checkbox copies every annotation of the input
+segmentation to the output segmentation.
+
+The **Info** section indicates the number of segments present in the output
+segmentation, or the reasons why no segmentation is emitted (no input data,
+overlaps in the input segmentation, etc.).
+
+The **Send** button triggers data emission, as it happens a segmentation, to
+the output connection(s). When it is selected, the **Send automatically**
+checkbox disables the button and the widget attempts to automatically emit
+a segmentation at every modification of its interface or when its input data
+are modified (by deletion or addition of a connection, or because modified
+data is received through an existing connection).
+
+Caveat
+------
+
+As one of the rare widgets of Textable that do create new *strings* and not
+only new *segmentations* (the only other one being :ref:`Preprocess`),
+**Recode** is prone to a very specific and possibly disconcerting type of
+error, which can be best understood by studying the example given in the
+documentation of :ref:`Preprocess` (section :ref:`anchor_to_caveat`), where
+all that is said about :ref:`Preprocess` also applies to **Recode**.
+
+.. [#] For more details concerning the regular expression syntax, see the
+       `Python documentation <http://docs.python.org/library/re.html>`_.
+       Note that option ``u`` (*Unicode dependent*) is activated by default.
+
+.. [#] For more details on the effect of options ``i``, ``u``, ``m``, and
+       ``s``, see the
+       `Python documentation <http://docs.python.org/library/re.html>`_.
+
+
 

docs/rst/schemas/humanist_mining_example.ows

+<?xml version='1.0' encoding='utf-8'?>
+<scheme description="" title="" version="2.0">
+	<nodes>
+		<node id="0" name="URLs" position="(-237.0, 139.0)" project_name="orange-textable" qualified_name="_textable.widgets.OWTextableURLs.OWTextableURLs" title="URLs" version="" />
+		<node id="1" name="Segment" position="(-46.0, 245.0)" project_name="orange-textable" qualified_name="_textable.widgets.OWTextableSegment.OWTextableSegment" title="Segment" version="" />
+		<node id="2" name="Convert" position="(238.0, 140.0)" project_name="orange-textable" qualified_name="_textable.widgets.OWTextableConvert.OWTextableConvert" title="Convert" version="" />
+		<node id="3" name="Data Table" position="(337.0, 140.0)" project_name="Orange" qualified_name="Orange.OrangeWidgets.Data.OWDataTable.OWDataTable" title="Data Table" version="" />
+		<node id="4" name="Recode" position="(-140.0, 139.0)" project_name="orange-textable" qualified_name="_textable.widgets.OWTextableRecode.OWTextableRecode" title="Recode" version="" />
+		<node id="5" name="Segment" position="(51.0, 245.0)" project_name="orange-textable" qualified_name="_textable.widgets.OWTextableSegment.OWTextableSegment" title="Segment (2)" version="" />
+		<node id="6" name="Count" position="(138.0, 140.0)" project_name="orange-textable" qualified_name="_textable.widgets.OWTextableCount.OWTextableCount" title="Count" version="" />
+	</nodes>
+	<links>
+		<link enabled="true" id="0" sink_channel="Data" sink_node_id="3" source_channel="Orange table" source_node_id="2" />
+		<link enabled="true" id="1" sink_channel="Segmentation" sink_node_id="1" source_channel="Recoded data" source_node_id="4" />
+		<link enabled="true" id="2" sink_channel="Textable table" sink_node_id="2" source_channel="Pivot Crosstab" source_node_id="6" />
+		<link enabled="true" id="3" sink_channel="Segmentation" sink_node_id="6" source_channel="Segmented data" source_node_id="5" />
+		<link enabled="true" id="4" sink_channel="Segmentation" sink_node_id="6" source_channel="Recoded data" source_node_id="4" />
+		<link enabled="true" id="5" sink_channel="Segmentation" sink_node_id="4" source_channel="Text data" source_node_id="0" />
+		<link enabled="true" id="6" sink_channel="Segmentation" sink_node_id="5" source_channel="Segmented data" source_node_id="1" />
+	</links>
+	<annotations>
+		<arrow end="(-233.0, 109.00000000000003)" fill="#C1272D" id="0" start="(-207.0, 56.000000000000014)" />
+		<text font-family="Helvetica" font-size="16" id="1" rect="(-233.0, -13.0, 394.0, 69.0)">1. Import annual archives from the Humanist discussion group (http://dhhumanist.org/Archives/Converted_Text/)
+from 1987 to 2008.</text>
+		<arrow end="(-131.0, 108.0)" fill="#C1272D" id="2" start="(-85.0, 77.0)" />
+		<text font-family="Helvetica" font-size="16" id="3" rect="(-84.0, 47.0, 326.0, 69.0)">2. Discard text lines marked as replies to other messages (those that begin with | or &gt;).</text>
+		<text font-family="Helvetica" font-size="16" id="4" rect="(-229.0, 204.0, 132.0, 107.0)">3. Segment archives into messages, discarding subject line.</text>
+		<arrow end="(-88.99999999999999, 256.0)" fill="#C1272D" id="5" start="(-137.0, 256.0)" />
+		<arrow end="(89.00000000000006, 264.0)" fill="#C1272D" id="6" start="(119.00000000000009, 302.0)" />
+		<text font-family="Helvetica" font-size="16" id="7" rect="(-82.0, 303.0, 388.0, 52.0)">4. Identify all occurrences of expressions "Humanities Computing" and "Digital Humanities" in messages.</text>
+		<text font-family="Helvetica" font-size="16" id="8" rect="(195.0, 222.0, 163.0, 74.0)">5. Count occurrences of both expressions across time periods. </text>
+		<arrow end="(169.0, 173.0)" fill="#C1272D" id="9" start="(196.0, 221.0)" />
+		<text font-family="Helvetica" font-size="16" id="10" rect="(226.0, -16.0, 148.0, 88.0)">6. Results appear here (after standard conversion).</text>
+		<arrow end="(335.0, 107.0)" fill="#C1272D" id="11" start="(317.0, 51.000000000000014)" />
+	</annotations>
+	<thumbnail />
+	<node_properties>
+		<properties format="pickle" node_id="0">(dp1
+S'autoNumberKey'
+p2
+Vvol
+p3
+sS'autoSend'
+p4
+I01
+sS'uuid'
+p5
+ccopy_reg
+_reconstructor
+p6
+(cuuid
+UUID
+p7
+c__builtin__
+object
+p8
+NtRp9
+(dp10
+S'int'
+p11
+L129722075554270120612964801262387674556L
+sbsS'encoding'
+p12
+Viso-8859-1
+p13
+sS'URL'
+p14
+Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1988-1989.txt
+p15
+sS'displayAdvancedSettings'
+p16
+I01
+sS'label'
+p17
+Vurl_content
+p18
+sS'widgetShown'
+p19
+I0
+sS'lastLocation'
+p20
+VC:/Users/Aris/Documents/Recherche/U of C/llc14
+p21
+sS'savedWidgetGeometry'
+p22
+S'\x01\xd9\xd0\xcb\x00\x01\x00\x00\x00\x00\x04@\x00\x00\x007\x00\x00\x05\xb9\x00\x00\x02\xfa\x00\x00\x04D\x00\x00\x00N\x00\x00\x05\xb5\x00\x00\x02\xf6\x00\x00\x00\x00\x00\x00'
+p23
+sS'URLs'
+p24
+(lp25
+(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1987-1988.txt
+Viso-8859-1
+Vperiod
+p26
+V1987-1988
+p27
+tp28
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1988-1989.txt
+Viso-8859-1
+Vperiod
+p29
+V1988-1989
+p30
+tp31
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1989-1990.txt
+Viso-8859-1
+Vperiod
+p32
+V1989-1990
+p33
+tp34
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1990-1991.txt
+Viso-8859-1
+Vperiod
+p35
+V1990-1991
+p36
+tp37
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1991-1992.txt
+Viso-8859-1
+Vperiod
+p38
+V1991-1992
+p39
+tp40
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1992-1993.txt
+Viso-8859-1
+Vperiod
+p41
+V1992-1993
+p42
+tp43
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1993-1994.txt
+Viso-8859-1
+Vperiod
+p44
+V1993-1994
+p45
+tp46
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1994-1995.txt
+Viso-8859-1
+Vperiod
+p47
+V1994-1995
+p48
+tp49
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1995-1996.txt
+Viso-8859-1
+Vperiod
+p50
+V1995-1996
+p51
+tp52
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1996-1997.txt
+Viso-8859-1
+Vperiod
+p53
+V1996-1997
+p54
+tp55
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1997-1998.txt
+Viso-8859-1
+Vperiod
+p56
+V1997-1998
+p57
+tp58
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1998-1999.txt
+Viso-8859-1
+Vperiod
+p59
+V1998-1999
+p60
+tp61
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.1999-2000.txt
+Viso-8859-1
+Vperiod
+p62
+V1999-2000
+p63
+tp64
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.2000-2001.txt
+Viso-8859-1
+Vperiod
+p65
+V2000-2001
+p66
+tp67
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.2001-2002.txt
+Viso-8859-1
+Vperiod
+p68
+V2001-2002
+p69
+tp70
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.2002-2003.txt
+Viso-8859-1
+Vperiod
+p71
+V2002-2003
+p72
+tp73
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.2003-2004.txt
+Viso-8859-1
+Vperiod
+p74
+V2003-2004
+p75
+tp76
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.2004-2005.txt
+Viso-8859-1
+Vperiod
+p77
+V2004-2005
+p78
+tp79
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.2005-2006.txt
+Viso-8859-1
+Vperiod
+p80
+V2005-2006
+p81
+tp82
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.2006-2007.txt
+Viso-8859-1
+Vperiod
+p83
+V2006-2007
+p84
+tp85
+a(Vhttp://dhhumanist.org/Archives/Converted_Text/humanist.2007-2008.txt
+Viso-8859-1
+Vperiod
+p86
+V2007-2008
+p87
+tp88
+asS'autoNumber'
+p89
+I01
+sS'importURLsKey'
+p90
+Vurl
+p91
+sS'importURLs'
+p92
+I00
+s.</properties>
+		<properties format="pickle" node_id="1">(dp1
+S'regex'
+p2
+VX-Humanist:.+?(?=From:|$)(?s)
+p3
+sS'autoNumberKey'
+p4
+Vnum
+p5
+sS'autoSend'
+p6
+I01
+sS'uuid'
+p7
+ccopy_reg
+_reconstructor
+p8
+(cuuid
+UUID
+p9
+c__builtin__
+object
+p10
+NtRp11
+(dp12
+S'int'
+p13
+L60112408854348807738788140676809148794L
+sbsS'regexes'
+p14
+(lp15
+(VFrom:.+?X-Humanist: Vol\u005c. \u005cd+ Num\u005c. \u005cd+ \u005c((\u005cd+)\u005c).+?(?=From:|$)
+Vnum
+V&amp;1
+I01
+I01
+I00
+I01