Commits

vvcephei committed 50f029a

finishing up new process. Added treesplit, moved documentation to the scalabha wiki.

  • Participants
  • Parent commits 2bbd66e

Comments (0)

Files changed (7)

 	. ~/.sbtconfig
     fi
     java -Dfile.encoding=UTF8 -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=256m ${SBT_OPTS} -jar $SCALABHA_DIR/bin/sbt-launch-*.jar "$@"
+    (( EXIT_CODE += $? ))
 
 elif [[ $CMD = 'tokenize' ]]; then
     tokenize.sh $1
+    (( EXIT_CODE += $? ))
 
 elif [[ $CMD = 'treeseed' ]]; then
     treeseed.sh $1
+    (( EXIT_CODE += $? ))
 
 elif [[ $CMD = 'treemerge' ]]; then
     treemerge.sh $1
+    (( EXIT_CODE += $? ))
 
 elif [ $CMD = 'viz' ]; then
 
     head -n $2 $1 | tail -n 1 | tr \(\) \[\]
+    (( EXIT_CODE += $? ))
 
 else 
 
         for collection in $treesrc/* ; do
             echo $collection
             for tree in $collection/* ; do
-                treemerge.sh $tree
-                (( exit_code += $? ))
+                if [[ $exit_code -ne 0 ]]; then
+                    exit $exit_code
+                else
+                    treemerge.sh $tree
+                    (( exit_code += $? ))
+                fi
             done
         done
         ;;
         echo "merging all"
         treemerge.sh kin
         (( exit_code += $? ))
-        treemerge.sh mlg
-        (( exit_code += $? ))
+        if [[ $exit_code -ne 0 ]]; then
+            exit $exit_code
+        else
+            treemerge.sh mlg
+            (( exit_code += $? ))
+        fi
         ;;
     *)
         fullpath=`readlink -f $1`
                 fi
                 ;;
             *)
+                echo
+                echo "Dont' know what to do with $fullpath"
+                echo
                 usage
                 ;;
         esac
+#!/bin/bash
+
+root=$LDMT_MURI_DIR/data/phase2
+mkdir -p /tmp/treesplit
+rm -rf /tmp/treesplit/*
+pushd /tmp/treesplit
+
+for langDir in $root/*
+do
+    lang=$( basename $langDir )
+    for collDir in $langDir/parsed/*
+    do
+        coll=$( basename $collDir )
+        for filePath in $collDir/*
+        do
+            file=$( basename $filePath )
+            prefix=$( basename $file .tree )
+            split -d -l1 $filePath $prefix.
+            dest=$langDir/tree/src/$coll/$prefix/
+            mkdir -p $dest
+            for i in *
+            do
+                mv $i $dest/$i.tree
+            done
+        done
+    done
+done
+
+popd

doc/pipeline.svg

-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
-
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="930.75531"
-   height="271.31403"
-   id="svg2"
-   version="1.1"
-   inkscape:version="0.48.2 r9819"
-   sodipodi:docname="drawing.svg">
-  <defs
-     id="defs4" />
-  <sodipodi:namedview
-     id="base"
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1.0"
-     inkscape:pageopacity="0.0"
-     inkscape:pageshadow="2"
-     inkscape:zoom="0.87705101"
-     inkscape:cx="453.08639"
-     inkscape:cy="210.13867"
-     inkscape:document-units="px"
-     inkscape:current-layer="layer1"
-     showgrid="false"
-     inkscape:window-width="1366"
-     inkscape:window-height="750"
-     inkscape:window-x="0"
-     inkscape:window-y="0"
-     inkscape:window-maximized="1"
-     fit-margin-top="10"
-     fit-margin-left="10"
-     fit-margin-right="10"
-     fit-margin-bottom="10" />
-  <metadata
-     id="metadata7">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <g
-     inkscape:label="Layer 1"
-     inkscape:groupmode="layer"
-     id="layer1"
-     transform="translate(2.4337173,-91.62751)">
-    <path
-       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-       d="m 245.13967,205.20507 -95.7755,57.00923 95.7755,55.86904"
-       id="path4009"
-       inkscape:connector-curvature="0" />
-    <path
-       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-       d="m 497.40083,174.42009 -111.73809,30.78498 111.73809,26.22425"
-       id="path4011"
-       inkscape:connector-curvature="0" />
-    <path
-       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 6;stroke-dashoffset:0"
-       d="m 386.80293,316.94316 112.87827,0"
-       id="path4013"
-       inkscape:connector-curvature="0" />
-    <path
-       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 6;stroke-dashoffset:0"
-       d="m 665.28832,315.80297 110.5979,0"
-       id="path4015"
-       inkscape:connector-curvature="0" />
-    <path
-       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-       d="M 665.28832,232.5695 774.74604,202.9247 663.00795,174.42009"
-       id="path4017"
-       inkscape:connector-curvature="0" />
-    <g
-       id="g3968">
-      <rect
-         ry="10"
-         rx="10"
-         y="225.46606"
-         x="7.8162827"
-         height="73.739708"
-         width="140.7074"
-         id="rect2987"
-         style="fill:#daebf0;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
-      <text
-         sodipodi:linespacing="125%"
-         id="text2989"
-         y="255.3535"
-         x="22.256895"
-         style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-         xml:space="preserve"><tspan
-           style="font-size:18px"
-           y="255.3535"
-           x="22.256895"
-           id="tspan2991"
-           sodipodi:role="line">filename.xml</tspan><tspan
-           style="font-size:18px"
-           id="tspan2997"
-           y="277.85352"
-           x="22.256895"
-           sodipodi:role="line">- lang α...ω</tspan></text>
-    </g>
-    <text
-       xml:space="preserve"
-       style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-       x="136.45006"
-       y="118.7613"
-       id="text3003"
-       sodipodi:linespacing="125%"><tspan
-         sodipodi:role="line"
-         id="tspan3005"
-         x="136.45006"
-         y="118.7613">tokenize</tspan></text>
-    <g
-       id="g3956"
-       transform="translate(-2,-14.000008)">
-      <rect
-         ry="10"
-         rx="10"
-         y="178.59688"
-         x="246.6176"
-         height="73.739708"
-         width="140.7074"
-         id="rect2987-1"
-         style="fill:#daebf0;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
-      <text
-         sodipodi:linespacing="125%"
-         id="text3007"
-         y="210.42279"
-         x="254.4713"
-         style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-         xml:space="preserve"><tspan
-           y="210.42279"
-           x="254.4713"
-           id="tspan3009"
-           sodipodi:role="line"
-           style="font-size:17.5px">filename.α.tok</tspan><tspan
-           id="tspan3023"
-           y="232.29779"
-           x="254.4713"
-           sodipodi:role="line"
-           style="font-size:17.5px">- lines 1...N</tspan></text>
-    </g>
-    <g
-       id="g3962"
-       transform="translate(-2,8)">
-      <rect
-         ry="10"
-         rx="10"
-         y="270.95181"
-         x="247.75778"
-         height="73.739708"
-         width="140.7074"
-         id="rect2987-8"
-         style="fill:#daebf0;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
-      <text
-         sodipodi:linespacing="125%"
-         id="text3011"
-         y="302.77771"
-         x="253.83414"
-         style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-         xml:space="preserve"><tspan
-           id="tspan3027"
-           y="302.77771"
-           x="253.83414"
-           sodipodi:role="line"
-           style="font-size:17.5px">filename.ω.tok</tspan><tspan
-           id="tspan3025"
-           y="324.65271"
-           x="253.83414"
-           sodipodi:role="line"
-           style="font-size:17.5px">- lines 1...N</tspan></text>
-    </g>
-    <text
-       xml:space="preserve"
-       style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-       x="383.01459"
-       y="118.7613"
-       id="text3015"
-       sodipodi:linespacing="125%"><tspan
-         sodipodi:role="line"
-         id="tspan3017"
-         x="383.01459"
-         y="118.7613">treeseed</tspan></text>
-    <g
-       id="g3999"
-       transform="translate(-4,-14)">
-      <g
-         id="g3974">
-        <rect
-           style="fill:#daebf0;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
-           id="rect2987-2"
-           width="166.93166"
-           height="44.094902"
-           x="499.73859"
-           y="164.91466"
-           rx="10"
-           ry="10" />
-        <text
-           xml:space="preserve"
-           style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-           x="507.2962"
-           y="194.11055"
-           id="text3019"
-           sodipodi:linespacing="125%"><tspan
-             sodipodi:role="line"
-             id="tspan3021"
-             x="507.2962"
-             y="194.11055"
-             style="font-size:17.5px">filename.α.1.tree</tspan></text>
-      </g>
-      <g
-         id="g3979">
-        <rect
-           style="fill:#daebf0;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
-           id="rect2987-2-7"
-           width="166.93166"
-           height="44.094902"
-           x="501.44882"
-           y="221.92389"
-           rx="10"
-           ry="10" />
-        <text
-           xml:space="preserve"
-           style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-           x="507.89316"
-           y="251.11978"
-           id="text3031"
-           sodipodi:linespacing="125%"><tspan
-             sodipodi:role="line"
-             id="tspan3033"
-             x="507.89316"
-             y="251.11978"
-             style="font-size:17.5px">filename.α.N.tree</tspan></text>
-      </g>
-    </g>
-    <text
-       xml:space="preserve"
-       style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-       x="665.91467"
-       y="116.82282"
-       id="text3035"
-       sodipodi:linespacing="125%"><tspan
-         sodipodi:role="line"
-         id="tspan3037"
-         x="665.91467"
-         y="116.82282">treemerge</tspan></text>
-    <g
-       id="g3989"
-       transform="translate(-4,-11.719641)">
-      <rect
-         ry="10"
-         rx="10"
-         y="192.2791"
-         x="777.9436"
-         height="41.814533"
-         width="142.98776"
-         id="rect2987-23"
-         style="fill:#daebf0;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
-      <text
-         sodipodi:linespacing="125%"
-         id="text3039"
-         y="220.06625"
-         x="783.08008"
-         style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-         xml:space="preserve"><tspan
-           y="220.06625"
-           x="783.08008"
-           id="tspan3041"
-           sodipodi:role="line"
-           style="font-size:17px">filename.α.tree</tspan></text>
-    </g>
-    <g
-       id="g3984"
-       transform="translate(-4,11.420548)">
-      <rect
-         ry="10"
-         rx="10"
-         y="282.35367"
-         x="502.58902"
-         height="44.094902"
-         width="166.93166"
-         id="rect2987-2-5"
-         style="fill:#daebf0;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
-      <text
-         sodipodi:linespacing="125%"
-         id="text3077"
-         y="305.64136"
-         x="576.51874"
-         style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-         xml:space="preserve"><tspan
-           y="305.64136"
-           x="576.51874"
-           id="tspan3079"
-           sodipodi:role="line">...</tspan></text>
-    </g>
-    <g
-       id="g3994"
-       transform="translate(-4,17.121462)">
-      <rect
-         ry="10"
-         rx="10"
-         y="277.79294"
-         x="779.0838"
-         height="41.814533"
-         width="142.98776"
-         id="rect2987-23-9"
-         style="fill:#daebf0;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
-      <text
-         sodipodi:linespacing="125%"
-         id="text3081"
-         y="299.94043"
-         x="841.04156"
-         style="font-size:20px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-         xml:space="preserve"><tspan
-           y="299.94043"
-           x="841.04156"
-           id="tspan3083"
-           sodipodi:role="line">...</tspan></text>
-    </g>
-    <text
-       xml:space="preserve"
-       style="font-size:16px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-       x="195.28934"
-       y="261.65356"
-       id="text4019"
-       sodipodi:linespacing="125%"><tspan
-         sodipodi:role="line"
-         id="tspan4021"
-         x="195.28934"
-         y="261.65356">...</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-size:16px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-       x="443.83087"
-       y="204.35461"
-       id="text4023"
-       sodipodi:linespacing="125%"><tspan
-         sodipodi:role="line"
-         id="tspan4025"
-         x="443.83087"
-         y="204.35461">...</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-size:16px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
-       x="700.37238"
-       y="204.35461"
-       id="text4027"
-       sodipodi:linespacing="125%"><tspan
-         sodipodi:role="line"
-         id="tspan4029"
-         x="700.37238"
-         y="204.35461">...</tspan></text>
-  </g>
-</svg>

doc/readme.txt

-=Treebanking Overview=
-
-[[pipeline.svg]]
-
-This overview describes the process of transforming source (XML) files into treebanked files for the [[LDMT-MURI|link]] project. As the following image illustrates, this involves three high-level transformations and four high-level states.
-
-We'll also provide information about the directory layout of the muri project, since the transformation scripts provided expect this structure:
-    lang/                     # There is one tree for each language (currently kin=Kinyarwanda and mlg=Malagasy)
-        orig/                 # The dir for source (XML) files
-            collection1/      # There is one directory for each collection. Each directory contains all the documents in XML format.
-            ...
-            collectionN/
-        tok/                  # The dir for tokenized files
-            collections/      # Again, there is one dir for each collection.
-            ...
-        parsed/               # The dir for treebanked files
-            src/              # This dir contains working files for treebanking
-                collections/
-                    document.lang/     # Each document is represented by a dir containing .tree files, with one tree per file
-            collections/      # This contains the finished product of treebanking
-
-=State 1: Source Files=
-In this state, you create an XML file. We currently have not defined a schema for muri XML, but we do expect the XML to conform to the following structure:
-    <file id="ID" languages="lang1,lang2,...">
-        <data>
-            <unit>
-                <align>
-                    <text langid="lang1">
-                        <s>sentence</s>
-                        ...
-                    </text>
-                    <text langid="lang2">...</text>
-                    ...
-                </align>
-            </unit>
-            ...
-        </data>
-    </file>
-These files should be UTF-8 encoded, and they reside in the {lang}/orig/ directory. Note that each align block must contain exactly one text node for each of the languages specified in the file.languages attribute.
-
-=Transition A: Tokenize=
-This transformation turns a source file (name.xml) into a collection of token files (name.lang.tok). This transformation happens only rarely, when source files change. Also, it is fairly fast, so we don't tokenize single XML files. Instead, we tokenize all XML files for a language. Note that this operation is valid because there is no other way to create valid token files than through this transformation, and it is not permitted to manually edit token files.
-
-Commands:
-scalabha tokenize kin       # Tokenize all Kinyarwanda XML files
-scalabha tokenize mlg       # Tokenize all Malagasy XML files
-scalabha tokenize all       # Tokenize all XML files (just a shortcut for running both of the previous commands)
-
-The output of this command is written to the {lang}/tok/ directory.
-
-=State 2: Token Files=
-During Transition A, each XML file is split into K token files, where K is the number of languages in the XML file (usually 2 or 3), named accordingly. The token files contain 1 line for each of the align blocks, and the sentences within the align blocks are delimited by <EOS> tags. These tags need to be removed before data releases, but they are necessary for this process, so leave them for now.
-
-In this state, you'll want to do some extra validation of the token files. Ulf has provided a Perl script for this purpose: wildebeest.
-
-Commands:
-wildebeest.pl < path/to/filename.lang.tok
-
-Wildebeest will print a set of statistics about the token file you provide. You should pay special attention to the "unsplit punctuation" sections. These are not necessarily problems, but they are meant to bring potential tokenization problems to the fore so that you can decide, for example, that "..." is ok while "said..." is not. The way to fix these problems is typically to correct typos in the source file and re-run tokenization. In the example, you would open the XML file, find "said...", and change it to "said ...".
-
-As noted above, the only valid way to create or modify token files is first to create or modify the XML files and then run Transition A. This ensures that our token files always have the same set of known properties, which is important for machine translation.
-
-=Transition B: TreeSeed=
-This transformation turns a token file into a set of L tree files, where L is the number of lines in the token file. I.e., there is one tree per tree file. Note that the treeseed command checks the output files to make sure there are no modifications to them before overwriting, so if you have made changes but still want to create a new file, rename or delete the original before running treeseed.
-
-Commands:
-scalabha treeseed /path/to/filename.tok     # turn this token file into a set of tree files (the path can be relative)
-
-The output of this command is written to the {lang}/parsed/src/ directory.
-
-=State 3: Treebank Files=
-This state is where the brunt of the work occurs. The Treeseed transition creates dummy treebank files with the correct structure down to the sentence level, but you have to specify the sentence structure. All whitespace will be ignored, so you can format the file any way you like. Here is an example of a valid tree structure:
-    (TOP
-        (S
-            (NP (N John) )
-            (VP (V saw) 
-                (NP (D the) (N saw) )
-            )
-            (.  .)
-        )
-        (S (NP (D The) (N saw) ) (VP (V sawed) (NP (D the) (N log) ) ) (. .) )
-    )
-
-Note that every tree's top level node is TOP, even if there is only one sentence. Also, note that tags must be ASCII characters, but the tokens can be UTF-8. Punctuation is treebanked by making it the last element in its syntactic structure (the sentence in this case) and giving it a tag ascii-equivalent to itself. In other words the tag for directional double quotes is the ascii double quote, etc.
-
-=Transition C: TreeMerge=
-This transition turns a collection of working tree files into a single output tree file. This means that whitespace in the input tree files will be collapsed, and each input tree will become a line in the output tree file. Note that input tree files are sorted alphabetically by name to determine the order of trees in the output file. Just as with Transition A, treemerge will overwrite the output files, so you should not make changes directly to the output tree files, only the ones in the src/ directory.
-
-Commands:
-scalabha treemerge /path/to/document/dir    # the document dir is something like "kin/tree/src/kgmc/kgmc_0026.eng". All the immediate children of this directory should be *.tree files.
-scalabha treemerge kin                      # Since this command is defined to be destructive (i.e., you should not be modifying
-scalabha treemerge mlg                      #  the output tree files directly), and this operation is fairly fast, it might be
-scalabha treemerge all                      #  easier to just process all the src trees at once.
-
-=State 4: Verification=
-In this state, the process is complete, and all of the requisite files (XML, tok, and tree) exist. Here, you can use verification tools like tree-checker.pl to make sure that the output is well formed. You will also want to make sure of a few properties:
-- For each tree file in parsed/*/*.tree, there is exactly one tok file in tok/*/*.tok. It is expected that there will be many token files that are not treebanked, but every treefile must correspond exactly to a token file.
-- Likewise, each tree file must have exactly the same number of lines as its corresponding tok file. In fact, suppose that a source file orig/coll1/file_001.xml contains 2 languages, langA and langB. Then, the following 4 files must exist and must all have the same number of lines:
--- tok/coll1/file_001.langA.tok
--- tok/coll1/file_001.langB.tok
--- parsed/coll1/file_001.langA.tree
--- parsed/coll1/file_001.langB.tree
-
-This is the state from which it is possible to do a data release.
-
-=After: Data Release=
-Before doing a data release, you'll want to re-run any validation tools you have at your disposal, as well as spot-checking the output.
-
-Finally, as a cleanup, you should remove the <EOS> tags in the token files before packaging the data release. Just be sure to re-run the tokenizers (replacing the <EOS> tags) before the next phase of treebanking begins. These tags are necessary for the treeseed transition to create the correct sentence nodes, but they are not valid tokens, so they should not be present in released data.

src/main/scala/opennlp/scalabha/tree/Merge.scala

         log.summary("Suspending output since there were errors.\n")
 
       outputBuffer.close()
+      System.exit(errors)
     }
     catch {
       case e: ArgotUsageException =>

src/main/scala/opennlp/scalabha/tree/MultiLineTreeParser.scala

     then we can report the position in the source file of errors*/
 
     var restToParse = forestString
+    var lastValueToParse = ""
     var resultantTrees: List[TreeNode] = Nil
     var numTreesParsed = 0
-    while (restToParse != "") {
+    while (restToParse != "" && restToParse != lastValueToParse) {
+      // If we get into a state wherein the parser cannot make progress, we will need to break out of the loop.
+      // This is ok to do because we've already logged any errors we have encountered.
+      lastValueToParse = restToParse
+
+
       val index = numTreesParsed + 1
       log.debug("Parsing (file:%s,tree#:%d)\n".format(forestName, index))
       apply(forestName, index, restToParse, "") match {
             case Node(_, _) =>
               resultantTrees = tree :: resultantTrees
             case Value(name) =>
-              // TODO: Do we keep executing on critical errors or let them fly? Maybe we just don't write the ouput file.
+              // TODO: Do we keep executing on critical errors or let them fly? Maybe we just don't write the output file.
               log.err("(file:%s,tree#:%d): Top-level element is not a tree:<<%s>>\n".format(forestName, index, name))
           }
           restToParse = leftover.trim()
         log.summary("Suspending output since there were errors.\n")
 
       log.summary("Warnings,Errors: %s\n".format(log.getStats()))
+      System.exit(errors)
     }
     catch {
       case e: ArgotUsageException =>