1. Jörg Tiedemann
  2. pdf2xml

Commits

tiedeman  committed d925ffc

post-processing of pdfxtk output

  • Participants
  • Parent commits 4eedb6f
  • Branches master

Comments (0)

Files changed (1)

File pdf2xml

View file
  • Ignore whitespace
 
 =head1 OPTIONS
 
+ -c ............. split strings into character sequences before finding words
  -h ............. skip de-hyphenation (keep hyphenated words)
  -l lexicon ..... provide a list of words or a text in the target language
  -L ............. skip lowercasing (which is switched on by default)
 use File::Temp qw /tempfile/;
 
 
-use vars qw($opt_h $opt_L $opt_l $opt_m $opt_r $opt_x $opt_v $opt_X);
+use vars qw($opt_c $opt_h $opt_L $opt_l $opt_m $opt_r $opt_x $opt_v $opt_X);
 use Getopt::Std;
-getopts('hLl:mrxXv');
+getopts('chLl:mrxXv');
 
 # home of shared data (where Apache Tika should be)
 
 my $TIKA     = $SHARED_HOME.'/lib/tika-app-1.3.jar';
 my $PDF2TEXT = `which pdftotext`;chomp($PDF2TEXT);
 
-my $LONGEST_WORD = undef;
+# some global variables used for finding words in strings
+# LONGEST_WORD = length of the longest word in the vocabulary
+# SPLIT_CHAR_IF_NECESSARY = split strings into character sequences
+#                           (if they do not contain any single whitespace)
+#                           (this is only used with pdfxtk output)
+# SPLIT_CHAR = always split strings into character sequences before finding words
 
+my $LONGEST_WORD = undef;
 my $SPLIT_CHAR_IF_NECESSARY = 0;
-my $SPLIT_CHAR = 0;
+my $SPLIT_CHAR              = $opt_c;
 
 # we require recent versions of pdftotext developed by 
 # The Poppler Developers - http://poppler.freedesktop.org
     return split(/\s+/,$string) if ($opt_m);          # skip merging ...
 
     my @tokens1;
-    if ($split_char_when_necessary){
+    if ($split_char){
+	@tokens1 = split(//,$string);
+    }
+    elsif ($split_char_when_necessary){
 	unless ($string=~/\s/){
 	    @tokens1 = split(//,$string);
 	}
     }
-    elsif ($split_char){
-	@tokens1 = split(//,$string);
-    }
     unless (@tokens1){
 	@tokens1 = split(/\s+/,$string);
     }