Commits

tiedeman  committed 9a66347

more flexible filtering with uplug-readalign

  • Participants
  • Parent commits 8a0f590

Comments (0)

Files changed (1)

File uplug-main/tools/uplug-readalign

 #  -h    : print html
 #
 
+=head1 NAME
+
+uplug-readalign - read sentence alignment in XCES align format
+
+=head1 SYNOPSIS
+
+ # read sentence alignments and print aligned sentences
+ uplug-readalign align-file.xml
+
+ # print alignments with alignment certainty >= 0 (the link threshold)
+ uplug-readalign -c 0 align-file.xml
+
+ # print alignments with max 2 source sentences and 3 target sentences
+ uplug-readalign -S 2 -T 3 align-file.xml
+
+ # print aligned sentences marked as 'de' (source) and 'en' (target)
+ # (this only works if sentences are marked with languages:
+ #  for example, in the German XML file: <s lang="de">...</s>)
+ uplug-readalign -s de -t en align-file.xml
+
+ # wrap aligned sentences in simple HTML
+ uplug-readalign -h align-file.xml
+
+ # print max 10 alignments
+ uplug-readalign -m 10 align-file.xml
+
+ # specify home directory of aligned XML files
+ uplug-readalign -d /path/to/xml/files align-file.xml
+
+ # print XCES align format of all 1:1 sentence alignments
+ uplug-readalign -S 1 -T 1 -l align-file.xml
+
+=head1 USAGE
+
+ uplug-readalign [OPTIONS] align-file.xml
+
+=head1 OPTIONS
+
+ -c <thr> ........... set a link threshold <thr> (skip links with lower certainty)
+ -d <dir> ........... set home directory for aligned XML documents
+ -h ................. print simple HTML
+ -l ................. print links (filter mode)
+ -m <max> ........... print max <max> alignments
+ -s <LangID> ........ require source sentences to match <LangID>
+ -t <LangID> ........ require target sentences to match <LangID>
+ -S <max> ........... maximum number of source sentences in alignments
+ -T <max> ........... maximum number of target sentences in alignments
+
+=head1 DESCRIPTION
+
+C<uplug-readalign> is a simple script that reads sentence alignments stored in XCES align format and prints the aligned sentences to STDOUT. It requires monotonic alignments (ascending order, no crossing links) of sentences in linked XML files. Linked XML files are specified in the C<toDoc> and C<fromDoc> attributes (see below).
+
+ <cesAlign version="1.0">
+  <linkGrp targType="s" toDoc="source1.xml" fromDoc="target1.xml">
+    <link certainty="0.88" xtargets="s1.1 s1.2;s1.1" id="SL1" />
+    ....
+  <linkGrp targType="s" toDoc="source2.xml" fromDoc="target2.xml">
+    <link certainty="0.88" xtargets="s1.1;s1.1" id="SL1" />
+
+Several parameters can be set to filter the alignments and to print only certain types of alignments.
+
+C<uplug-readalign> can also be used to filter the XCES alignment files and to print the remaining links in the same XCES align format. Use the C<-l> flag to enable this mode.
+
+=head1 SEE ALSO
+
+More information on Uplug: Look at L<Uplug::Config>
+
+More downloads:
+L<https://bitbucket.org/tiedemann/uplug>
+
+=cut
+
 
 use strict;
 use FindBin qw($Bin);
 
 my $html=0;
 my $max=0;
+my $SrcID=undef;
+my $TrgID=undef;
+my $MaxSrc=undef;
+my $MaxTrg=undef;
+my $LinkThr=undef;
+my $FilterMode=0;    # filter-mode: print alignment XML
+
 my $dir='xml';                   # extra directory to check for from/toDoc
 while ($ARGV[0]=~/^\-/){
     my $o=shift(@ARGV);
     if ($o=~/^\-h/){$html=1;}
-    if ($o=~/^\-m/){$max=shift @ARGV;}
-    if ($o=~/^\-d/){$dir=shift @ARGV;}
+    elsif ($o=~/^\-m/){$max=shift @ARGV;}
+    elsif ($o=~/^\-d/){$dir=shift @ARGV;}
+    elsif ($o=~/^\-s/){$SrcID=shift @ARGV;}
+    elsif ($o=~/^\-t/){$TrgID=shift @ARGV;}
+    elsif ($o=~/^\-S/){$MaxSrc=shift @ARGV;}
+    elsif ($o=~/^\-T/){$MaxTrg=shift @ARGV;}
+    elsif ($o=~/^\-c/){$LinkThr=shift @ARGV;}
+    elsif ($o=~/^\-l/){$FilterMode=1;}
 }
 
 my $ALIGN=shift(@ARGV);
 		open TRG,"<$trgdoc";
 	    }
 	    $firstTrg=0;
-	    if ($html){print "<p>\n";}
-	    print "\n# ".$srcdoc;
-	    if ($html){print '<br>';}
-	    print "\n# ".$trgdoc."\n\n";
-	    if ($html){print "<p><hr>\n";}
-	    else{print "================================\n";}
+	    unless ($FilterMode){
+		if ($html){print "<p>\n";}
+		print "\n# ".$srcdoc;
+		if ($html){print '<br>';}
+		print "\n# ".$trgdoc."\n\n";
+		if ($html){print "<p><hr>\n";}
+		else{print "================================\n";}
+	    }
 	}
     }
     if (/xtargets=\"([^\"]*)\s*\;\s*([^\"]*)\"/){
+	if (defined $LinkThr){
+	    if (/certainty=\"(.*?)\"/){
+		next if ($1<$LinkThr);
+	    }
+	}
 	my $srceof=1;
 	my $trgeof=1;
 	$count++;
 	my @srcsent=split(/\s/,$src);
 	my @trgsent=split(/\s/,$trg);
 
+	if (defined $MaxSrc){
+	    next if (scalar @srcsent > $MaxSrc);
+	}
+	if (defined $MaxTrg){
+	    next if (scalar @trgsent > $MaxTrg);
+	}
+
+	my $SrcStr='';
+	my $TrgStr='';
+
 	my $oldDel=$/;
 	$/='</s>';
-	foreach (@srcsent){
+SRCSENT: foreach (@srcsent){
 	    while (my $sent=<SRC>){
 		$srceof=0;
 		if ($sent=~/s [^\>]*id="$_"/s){
+		    if ($SrcID && $sent=~/lang=\".*?\"/){
+			next SRCSENT unless ($sent=~/lang=\"$SrcID\"/);
+		    }
 		    $sent=~s/^.*<s [^\>]*id/(src)/s;
 		    $sent=~s/\n/ /gs;
 		    $sent=~s/\<[^\>]*>//gs;
 			$sent=~s/\&lt\;/\</gs;
 			$sent=~s/\&amp\;/\&/gs;
 		    }
-		    print $sent;
-		    if ($html){print "<br>";}
-		    print "\n";
+		    $SrcStr.=$sent;
+		    if ($html){$SrcStr.="<br>";}
+		    $SrcStr.="\n";
 		    last;
 		}
 		$srceof=1;
 	    }
 	}
 
-	foreach (@trgsent){
+TRGSENT: foreach (@trgsent){
 	    while (my $sent=<TRG>){
 		$trgeof=0;
 		if ($sent=~/s [^\>]*id="$_"/s){
+		    if ($TrgID && $sent=~/lang=\".*?\"/){
+			next TRGSENT unless ($sent=~/lang=\"$TrgID\"/);
+		    }
 		    $sent=~s/^.*<s [^\>]*id/(trg)/s;
 		    $sent=~s/\n/ /gs;
 		    $sent=~s/\<[^\>]*>//gs;
 			$sent=~s/\&lt\;/\</gs;
 			$sent=~s/\&amp\;/\&/gs;
 		    }
-		    print $sent;
-		    if ($html){print "<br>";}
-		    print "\n";
+		    $TrgStr.=$sent;
+		    if ($html){$TrgStr.="<br>";}
+		    $TrgStr.="\n";
 		    last;
 		}
 		$trgeof=1;
             else{open SRC,"<$srcdoc";}
         }
 	$/=$oldDel;
-	if ($html){print "<hr>\n";}
-	else{print "================================\n";}
+	if ($SrcStr && $TrgStr){
+	    unless ($FilterMode){
+		print $SrcStr;
+		print $TrgStr;
+		if ($html){print "<hr>\n";}
+		else{print "================================\n";}
+	    }
+	}
+	else{ next; }
     }
+    print $_ if ($FilterMode);
 }