tiedeman  committed 23fa8d6

new options -p and -P in xces2moses to write files with document names and sentence IDs

  • Parent commits 96a4685

File uplug-main/tools/xces2moses

 my $trglang='trg';
 my $dir='xml';
 my $max=undef;
+my $opt_p=undef;
+my $opt_P=undef;
 my $max=0;
 while ($ARGV[0]=~/^\-/){
     if ($o=~/^\-t/){$trglang=shift @ARGV;}
     if ($o=~/^\-d/){$dir=shift @ARGV;}
     if ($o=~/^\-m/){$max=shift @ARGV;}
+    if ($o=~/^\-p/){$opt_p=shift @ARGV;}   # -p FILE: write "## srcdoc<TAB>trgdoc" header lines, each followed by "src<TAB>trg" lines (presumably sentence IDs)
+    if ($o=~/^\-P/){$opt_P=shift @ARGV;}   # -P FILE: same information, but every line is self-contained: srcdoc<TAB>trgdoc<TAB>src<TAB>trg
 my $ALIGN   = shift(@ARGV);
     open F,"<$ALIGN";
+if ($opt_p){
+    # Output file for the -p option (document-pair headers plus ID lines).
+    # Low-precedence "or" is required here: with "||" the warn was attached
+    # to the (always true) file-name string and a failed open went unnoticed.
+    open P,">$opt_p" or warn "cannot open $opt_p ...\n";
+if ($opt_P){
+    # Output file for the -P option (one fully qualified line per pair);
+    # same precedence fix as above so that open failures are reported.
+    open P2,">$opt_P" or warn "cannot open $opt_P ...\n";
+my ($org_srcdoc,$org_trgdoc);
 my $count=0;
 while (<F>){
     if (/fromDoc=\"([^\"]+)\"/){
 	if ($srcdoc ne $1){
+	    $org_srcdoc=$srcdoc;
 	    if ((not -e $srcdoc) and (-e "$srcdoc.gz")){
     if (/toDoc=\"([^\"]+)\"/){
 	if ($trgdoc ne $1){
+	    $org_trgdoc=$trgdoc;
 	    if ((not -e $trgdoc) and (-e "$trgdoc.gz")){
 		open TRG,"<$trgdoc";
 	    binmode(TRG, ":utf8");
+	    if ($opt_p){
+		print P "## $org_srcdoc\t$org_trgdoc\n";
+	    }
 		print S "\n";
 		print T $trgtxt;
 		print T "\n";
+		print P "$src\t$trg\n" if ($opt_p);
+		print P2 "$org_srcdoc\t$org_trgdoc\t$src\t$trg\n" if ($opt_P);
 close S;
 close T;
+close P if ($opt_p);
+close P2 if ($opt_P);
 # simplistic conversion from XML to plain text