Commits

tiedeman committed 624d5f0

added opus systems for annotation without parsing

Comments (0)

Files changed (19)

uplug-main/share/systems/opus/bg/tag

+{
+  # OPUS tagging system for Bulgarian (annotation without parsing):
+  # a single TreeTagger step that tokenizes and POS-tags the text.
+  'module' => {
+    'name' => 'Bulgarian pre-processing',
+    'submodules' => [
+        'pre/bg/toktag',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenization and POS tagging (TreeTagger)',
+    ],
+    'stdin' => 'text',
+    'stdout' => 'text',
+  },
+  'description' => 'This is the pre-processing pipe-line for Bulgarian that includes the TreeTagger for tokenization and POS tagging.',
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (bg)
+            'stream name' => 'stream(format=text,language=bg)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/ca/tag

+{
+  # OPUS tagging system for Catalan (annotation without parsing):
+  # the tokenizer (pre/tok -l ca) followed by the SVMTool POS tagger.
+  'module' => {
+    'name' => 'Catalan pre-processing',
+    'submodules' => [
+        'pre/tok -l ca',
+        'pre/ca/tagSvmTool',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'Tokenizer',
+        'POS tagger (SVMTool)',
+    ],
+    'stdout' => 'text',
+  },
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (ca)
+            'stream name' => 'stream(format=text,language=ca)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/cs/tag

+{
+  # OPUS tagging system for Czech (annotation without parsing):
+  # the tokenizer (pre/tok -l cs) followed by the hunpos POS tagger.
+  'module' => {
+    'name' => 'Czech pre-processing',
+    'submodules' => [
+        'pre/tok -l cs',
+        'pre/cs/tagHunPos',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenizer',
+        'POS tagger (hunpos)',
+    ],
+    'stdout' => 'text',
+  },
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (cs)
+            'stream name' => 'stream(format=text,language=cs)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/da/tag

+{
+  # OPUS tagging system for Danish (annotation without parsing):
+  # the tokenizer (pre/tok -l da) followed by the hunpos POS tagger.
+  'module' => {
+    'name' => 'Danish pre-processing',
+    'submodules' => [
+        'pre/tok -l da',
+        'pre/da/tagHunPos',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenizer',
+        'POS tagger (hunpos)',
+    ],
+    'stdout' => 'text',
+  },
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (da)
+            'stream name' => 'stream(format=text,language=da)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/de/tag

+{
+    # German OPUS tagging system: a TreeTagger step that tokenizes and
+    # tags, followed by a hunpos tagging step.
+    'module' => {
+        'name'       => 'German pre-processing',
+        'submodules' => [
+            'pre/de/toktag',
+            'pre/de/tagHunPos',
+        ],
+        # Display labels, one per entry of 'submodules'.
+        'submodule names' => [
+            'tokenization and POS tagging (TreeTagger)',
+            'tagging (hunpos)',
+        ],
+        'stdin'  => 'text',
+        'stdout' => 'text',
+    },
+    'description' => 'This is the pre-processing pipe-line for German that includes the TreeTagger for tokenization and POS tagging.',
+    'input' => {
+        'text' => {
+            'format' => 'xml',
+        }
+    },
+    'output' => {
+        'text' => {
+            'format'     => 'xml',
+            'root'       => 's',
+            'write_mode' => 'overwrite',
+            'status'     => 'tag'
+        }
+    },
+    'arguments' => {
+        # Short aliases for the colon-separated configuration paths.
+        'shortcuts' => {
+            'in'  => 'input:text:file',
+            'out' => 'output:text:file',
+            'ci'  => 'input:text:encoding',
+            'co'  => 'output:text:encoding',
+        }
+    },
+    'widgets' => {
+        'input' => {
+            'text' => {
+                'stream name' => 'stream(format=text,language=de)'
+            },
+        },
+    }
+}

uplug-main/share/systems/opus/en/tag

+{
+  # OPUS tagging system for English (annotation without parsing):
+  # the tokenizer (pre/tok -l en), then three POS taggers (TreeTagger,
+  # hunpos, Grok) and the Grok chunker.
+  'module' => {
+    'name' => 'English pre-processing',
+    'submodules' => [
+        'pre/tok -l en',
+#        'pre/en/toktag',
+        'pre/en/tagTree',
+        'pre/en/tagHunPos',
+        'pre/en/tagGrok',
+        'pre/en/chunk',
+    ],
+    # one display label per active entry in 'submodules'
+    'submodule names' => [
+        'tokenization',
+#        'tokenization and POS tagging (TreeTagger)',
+        'POS tagging (TreeTagger)',
+        'English POS tagger (hunpos)',
+        'English POS tagger (Grok)',
+        'English chunker (Grok)',
+    ],
+    'stdin' => 'text',
+    'stdout' => 'text',
+  },
+  # The description lists the active steps: TreeTagger no longer tokenizes
+  # here because pre/en/toktag is commented out above.
+  'description' => 'This is the pre-processing pipe-line for English that includes tokenization, POS tagging (TreeTagger, hunpos, Grok) and chunking (Grok).',
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (en)
+            'stream name' => 'stream(format=text,language=en)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/es/tag

+{
+  # OPUS tagging system for Spanish (annotation without parsing):
+  # a TreeTagger step that tokenizes and tags, followed by SVMTool.
+  'module' => {
+    'name' => 'Spanish pre-processing',
+    'submodules' => [
+        'pre/es/toktag',
+        'pre/es/tagSvmTool',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenization and POS tagging (TreeTagger)',
+        'POS tagger (SVMTool)',
+    ],
+    'stdin' => 'text',
+    'stdout' => 'text',
+  },
+  'description' => 'This is the pre-processing pipe-line for Spanish that includes the TreeTagger for tokenization and POS tagging.',
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (es)
+            'stream name' => 'stream(format=text,language=es)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/et/tag

+{
+  # OPUS tagging system for Estonian (annotation without parsing):
+  # a single TreeTagger step that tokenizes and POS-tags the text.
+  'module' => {
+    'name' => 'Estonian pre-processing',
+    'submodules' => [
+        'pre/et/toktag',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenization and POS tagging (TreeTagger)',
+    ],
+    'stdin' => 'text',
+    'stdout' => 'text',
+  },
+  'description' => 'This is the pre-processing pipe-line for Estonian that includes the TreeTagger for tokenization and POS tagging.',
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (et)
+            'stream name' => 'stream(format=text,language=et)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/fr/tag

+{
+  # OPUS tagging system for French (annotation without parsing):
+  # a TreeTagger step that tokenizes and tags, followed by MElt.
+  'module' => {
+    'name' => 'French pre-processing',
+    'submodules' => [
+        'pre/fr/toktag',
+        'pre/fr/tagMElt',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenization and POS tagging (TreeTagger)',
+        'POS tagger (MElt)',
+    ],
+    'stdin' => 'text',
+    'stdout' => 'text',
+  },
+  'description' => 'This is the pre-processing pipe-line for French that includes the TreeTagger for tokenization and POS tagging.',
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (fr)
+            'stream name' => 'stream(format=text,language=fr)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/hu/tag

+{
+  # OPUS tagging system for Hungarian (annotation without parsing):
+  # the tokenizer (pre/tok -l hu) followed by the hunpos POS tagger.
+  'module' => {
+    'name' => 'Hungarian pre-processing',
+    'submodules' => [
+        'pre/tok -l hu',
+        'pre/hu/tagHunPos',
+    ],
+    # One display label per entry in 'submodules'. The tokenizer label was
+    # missing, which left the hunpos label attached to the tokenizer step.
+    'submodule names' => [
+        'tokenizer',
+        'POS tagger (hunpos)',
+    ],
+    'stdout' => 'text',
+  },
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (hu)
+            'stream name' => 'stream(format=text,language=hu)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/it/tag

+{
+    # Italian OPUS tagging system: one TreeTagger step that both
+    # tokenizes and POS-tags.
+    'module' => {
+        'name'       => 'Italian pre-processing',
+        'submodules' => [
+            'pre/it/toktag',
+        ],
+        # Display labels, one per entry of 'submodules'.
+        'submodule names' => [
+            'tokenization and POS tagging (TreeTagger)',
+        ],
+        'stdin'  => 'text',
+        'stdout' => 'text',
+    },
+    'description' => 'This is the pre-processing pipe-line for Italian that includes the TreeTagger for tokenization and POS tagging.',
+    'input' => {
+        'text' => {
+            'format' => 'xml',
+        }
+    },
+    'output' => {
+        'text' => {
+            'format'     => 'xml',
+            'root'       => 's',
+            'write_mode' => 'overwrite',
+            'status'     => 'tag'
+        }
+    },
+    'arguments' => {
+        # Short aliases for the colon-separated configuration paths.
+        'shortcuts' => {
+            'in'  => 'input:text:file',
+            'out' => 'output:text:file',
+            'ci'  => 'input:text:encoding',
+            'co'  => 'output:text:encoding',
+        }
+    },
+    'widgets' => {
+        'input' => {
+            'text' => {
+                'stream name' => 'stream(format=text,language=it)'
+            },
+        },
+    }
+}

uplug-main/share/systems/opus/la/tag

+{
+  # OPUS tagging system for Latin (annotation without parsing):
+  # a single TreeTagger step that tokenizes and POS-tags the text.
+  'module' => {
+    'name' => 'Latin pre-processing',
+    'submodules' => [
+        'pre/la/toktag',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenization and POS tagging (TreeTagger)',
+    ],
+    'stdin' => 'text',
+    'stdout' => 'text',
+  },
+  'description' => 'This is the pre-processing pipe-line for Latin that includes the TreeTagger for tokenization and POS tagging.',
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (la)
+            'stream name' => 'stream(format=text,language=la)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/nl/tag

+{
+  # OPUS tagging system for Dutch (annotation without parsing):
+  # Alpino tokenizer, the standard tokenizer (pre/tok -l nl), then two
+  # POS taggers (MBT and TreeTagger).
+  'module' => {
+    'name' => 'Dutch pre-processing',
+    'submodules' => [
+        'pre/nl/alptok',  # does nothing if Alpino is not present
+        'pre/tok -l nl',
+        'pre/nl/tagMBT',
+        'pre/nl/tagTree',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'Alpino tokenizer',
+        'standard tokenizer',  # if necessary
+        'POS tagger (MBT)',
+        'POS tagging (TreeTagger)',
+    ],
+    'stdin' => 'text',
+    'stdout' => 'text',
+  },
+  # The description names the steps listed above: TreeTagger only tags here,
+  # tokenization is done by Alpino or the standard tokenizer.
+  'description' => 'This is the pre-processing pipe-line for Dutch that includes tokenization (Alpino or the standard tokenizer) and POS tagging (MBT and TreeTagger).',
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (nl)
+            'stream name' => 'stream(format=text,language=nl)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/pt/tag

+{
+  # OPUS tagging system for Portuguese (annotation without parsing):
+  # the tokenizer (pre/tok -l pt) followed by the hunpos POS tagger.
+  'module' => {
+    'name' => 'Portuguese pre-processing',
+    'submodules' => [
+        'pre/tok -l pt',
+        'pre/pt/tagHunPos',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenizer',
+        'POS tagger (hunpos)',
+    ],
+    'stdout' => 'text',
+  },
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (pt)
+            'stream name' => 'stream(format=text,language=pt)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/ru/tag

+{
+  # OPUS tagging system for Russian (annotation without parsing):
+  # the tokenizer (pre/tok -l ru) followed by the hunpos POS tagger.
+  'module' => {
+    'name' => 'Russian pre-processing',
+    'submodules' => [
+        'pre/tok -l ru',
+        'pre/ru/tagHunPos',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenizer',
+        'POS tagger (hunpos)',
+    ],
+    'stdout' => 'text',
+  },
+  # The previous text described the Italian TreeTagger pipeline (markup,
+  # sentence splitting, lemmatization); none of those steps are listed in
+  # 'submodules' above, so the description now names the actual steps.
+  'description' => 'This is the pre-processing pipe-line for Russian that includes a tokenizer and hunpos for POS tagging.',
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (ru)
+            'stream name' => 'stream(format=text,language=ru)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/sl/tag

+{
+  # OPUS tagging system for Slovene (annotation without parsing):
+  # the tokenizer (pre/tok -l sl) followed by the hunpos POS tagger.
+  'module' => {
+    'name' => 'Slovene pre-processing',
+    'submodules' => [
+        'pre/tok -l sl',
+        'pre/sl/tagHunPos',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenizer',
+        'POS tagger (hunpos)',
+    ],
+    'stdout' => 'text',
+  },
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (sl)
+            'stream name' => 'stream(format=text,language=sl)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/sv/tag

+{
+  # OPUS tagging system for Swedish (annotation without parsing):
+  # the tokenizer (pre/tok -l sv) followed by the hunpos POS tagger.
+  # The TnT tagger and the parser steps are disabled.
+  'module' => {
+    'name' => 'Swedish pre-processing',
+    'submodules' => [
+        'pre/tok -l sv',
+        'pre/sv/tagHunPos',
+#        'pre/sv/tagTnT',
+#        'pre/sv/parse',
+    ],
+    # one display label per active entry in 'submodules'
+    'submodule names' => [
+        'tokenizer',
+        'POS tagger (hunpos)',
+    ],
+    'stdout' => 'text',
+  },
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      # No chunking step is listed in 'submodules', so the output is marked
+      # 'tag' like the other tag systems (was 'chunk').
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (sv)
+            'stream name' => 'stream(format=text,language=sv)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/sw/tag

+{
+  # OPUS tagging system for Swahili (annotation without parsing):
+  # a single TreeTagger step that tokenizes and POS-tags the text.
+  'module' => {
+    'name' => 'Swahili pre-processing',
+    'submodules' => [
+        'pre/sw/toktag',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenization and POS tagging (TreeTagger)',
+    ],
+    'stdin' => 'text',
+    'stdout' => 'text',
+  },
+  'description' => 'This is the pre-processing pipe-line for Swahili that includes the TreeTagger for tokenization and POS tagging.',
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (sw)
+            'stream name' => 'stream(format=text,language=sw)'
+          },
+       },
+  }
+}

uplug-main/share/systems/opus/tr/tag

+{
+  # OPUS tagging system for Turkish (annotation without parsing):
+  # the tokenizer (pre/tok -l tr) followed by the pre/tr/tag POS tagger.
+  'module' => {
+    'name' => 'Turkish pre-processing',
+    'submodules' => [
+        'pre/tok -l tr',
+        'pre/tr/tag',
+    ],
+    # one display label per entry in 'submodules'
+    'submodule names' => [
+        'tokenizer',
+        'POS tagger',
+    ],
+    'stdout' => 'text',
+  },
+  'input' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+    }
+  },
+  'output' => {
+    'text' => {
+      'format' => 'xml',
+      'root' => 's',
+      'write_mode' => 'overwrite',
+      'status' => 'tag'
+    }
+  },
+  'arguments' => {
+    # short aliases for the colon-separated configuration paths on the right
+    'shortcuts' => {
+       'in' => 'input:text:file',
+       'out' => 'output:text:file',
+       'ci' => 'input:text:encoding',
+       'co' => 'output:text:encoding',
+    }
+  },
+  'widgets' => {
+       'input' => {
+          'text' => {
+            # language filter matches this system's language (tr)
+            'stream name' => 'stream(format=text,language=tr)'
+          },
+       },
+  }
+}