Commits

tiedeman committed 2b4af96

srt2xml now in utf8

  • Participants
  • Parent commits 732f470

Comments (0)

Files changed (2)

 Some scripts for processing movie subtitles
 
 
-srt2xml.pl .... convert subtitles in srt-format to simple OPUS-style XML 
+srt2xml    .... convert subtitles in srt-format to simple OPUS-style XML 
                 format (does sentence splitting and tokenization)
                 (uses nonbreaking_prefix.* files for tokenization
                  which are just copies from the files distributed with 
                  the Europarl corpus version 3)
 
 		Note that subtitle files are usually DOS files and 
-		srt2xml.pl expects UNIX-style text files! 
+		srt2xml expects UNIX-style text files! 
 		--> use dos2unix before piping the text into srt2xml
 
-		for Dutch: if you want to use Alpino for tokenization:
-		srt2xml.pl looks by default for the tokenizer in
-		$ENV{ALPINO_HOME}/Tokenization/tokenize.sh
-		(adjust if you need to)
 
-srtalign.pl ... align srt-files which have been converted to XML using 
-		srt2xml.pl (requires time-stamps!)
+srtalign ...... align srt-files which have been converted to XML using 
+		srt2xml (requires time-stamps!)
 		For more information on using this script and its options:
 		Look at the header of the script!
 
-dic/ .......... This directory contains word alignment dictionaries
+share/dic ..... This directory contains word alignment dictionaries
 		obtained by aligning the OpenSubtitles corpus from OPUS
 		These dictionaries can be used to improve sentence 
 		alignment by synchronizing time stamps with the help of
 
 
 use strict;
-
-use open IN => ':bytes';
+use utf8;
 
 use Getopt::Std;
 use IPC::Open3;
 ##
 ## these RE's are not used at all ...
 ##
-#my $s_start = '([\"\']?[\�\�\p{Lu}])';
-#my $s_start_maybe = '(\-?\s*[\"\'\�\�]?[\p{N}\p{Ps}])';
+#my $s_start = '([\"\']?[\¿\¡\p{Lu}])';
+#my $s_start_maybe = '(\-?\s*[\"\'\¿\¡]?[\p{N}\p{Ps}])';
 #my $s_end = "([^\.]\.[\"\']?|[\.\!\?\:][\"\']?)";
 #my $s_end_maybe = "([^\.]\.[\"\'\]\}\)]?\-?\s*|[\.\!\?\:][\"\'\]\}\)]?\-?\s*)";
 
 		$wid=0;
 	    }
 
-	    elsif ($plain=~/^\s*([\"\'\[]?|[\*\#\']*\s*)[\�\�\p{Lu}l]/){
+	    elsif ($plain=~/^\s*([\"\'\[]?|[\*\#\']*\s*)[\¿\¡\p{Lu}l]/){
 		closetags();
 		print "  </s>\n";
 		print "  <s id=\"$sid\">\n";
 	    my $sentence_boundary = 0;
 	    if ($plain_before=~/([^.]\.|[!?:])[\'\"]?\s*$/){
 #		if ($plain_after=~/^\s+\-?\s*[\"\']?[\p{N}\p{Ps}\p{Lu}]/){
-		if ($plain_after=~/^\s+[\-\*\#]*\s*[\�\�\"\'\[]?[\p{N}\p{Ps}\p{Lu}]/){
+		if ($plain_after=~/^\s+[\-\*\#]*\s*[\¿\¡\"\'\[]?[\p{N}\p{Ps}\p{Lu}]/){
 		    $sentence_boundary = 1;
 		}
 	    }
 	    elsif ($plain_before=~/([.!?:])[\"\'\]\}\)]?\-?\s*$/){
-		if ($plain_after=~/^\s+[\"\']?[\�\�\p{Lu}]/){
+		if ($plain_after=~/^\s+[\"\']?[\¿\¡\p{Lu}]/){
 		    $sentence_boundary = 1;
 		}
 	    }
     # a little hack to treat contractions later:
     # replace various apostrophes with one that does not count as punctuation
     # (to keep it together for later splitting - see below)
-    $string=~s/(\P{P})'(\P{P})/$1$2/gs;
+    $string=~s/(\P{P})'(\P{P})/$1´$2/gs;
 
     # \p{P} ==> punctuations
     # \P{P} ==> non-punctuations
     # continue treating contractions:
     # - for English: contraction is always the second part (are there exceptions?)
     if ($opt_l eq 'eng'){
-      $string=~s/(\P{P})(['`])(\P{P})/$1 '$3/gs;
+      $string=~s/(\P{P})(['`´])(\P{P})/$1 '$3/gs;
     }
     # - for French: contraction is always the first part (are there exceptions?)
     elsif ($opt_l eq 'fre'){
-      $string=~s/(\P{P})(['`])(\P{P})/$1' $3/gs;
+      $string=~s/(\P{P})(['`´])(\P{P})/$1' $3/gs;
     }
     # - for other languages: contraction is the shorter part (is that a good heuristic?)
     else{
-      $string=~s/(\A|\s|\p{P})(\p{L}+)(['`])(\p{L}+)(\Z|\s|\p{P})/(length($2) >= length($4))?"$1$2 '$4$5":"$1$2' $4$5"/egs;
+      $string=~s/(\A|\s|\p{P})(\p{L}+)(['`´])(\p{L}+)(\Z|\s|\p{P})/(length($2) >= length($4))?"$1$2 '$4$5":"$1$2' $4$5"/egs;
     }
 
 
 
 =head1 AUTHOR
 
-J�rg Tiedemann, L<https://bitbucket.org/tiedemann>
+Jörg Tiedemann, L<https://bitbucket.org/tiedemann>
 
 =head1 BUGS AND SUPPORT
 
 
 =head1 LICENSE AND COPYRIGHT
 
-Copyright 2013 Jrg Tiedemann.
+Copyright 2013 Jörg Tiedemann.
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Lesser General Public License as published