Commits

tiedeman committed 93bd9ff

better pdfxtk conversion

  • Participants
  • Parent commits 5d3e519

Comments (0)

Files changed (2)

 
 * v0.2.4 Fri Mar 15 10:34:42 CET 2013
 
-- dehyphenation and word-finding in pdfxtk-mode (-X)
-- now also splits strings into known words
+- dehyphenation and other heuristics in pdfxtk-mode (-X)
+- now also splits strings into characters to find known words
   (solves a problem with pdfxtk conversions)
 
 * v0.2.3 Wed Mar  6 23:12:22 CET 2013
 #  $split_char_when_necessary = 1 ---> split into character sequences if string has no whitespaces
 #  $split_char = 1 ---> always split into character sequences
 
-sub find_words{
-    my ($string, $split_char_when_necessary, $split_char) = @_;
+sub find_longest_words{
+    my @tokens1 = @_;
 
+    return @tokens1 if ($opt_m);          # skip merging ...
     my @words = ();
-    $string=~s/^\s*//;
-
-    return split(/\s+/,$string) if ($opt_m);          # skip merging ...
-
-    my @tokens1;
-    if ($split_char){
-	@tokens1 = split(//,$string);
-    }
-    elsif ($split_char_when_necessary){
-	unless ($string=~/\s/){
-	    @tokens1 = split(//,$string);
-	}
-    }
-    unless (@tokens1){
-	@tokens1 = split(/\s+/,$string);
-    }
 
     my @tokens2   = ();
     my $remaining = \@tokens1;
 
 
 
+
+# Tokenize $string with the strategy selected by the two flags:
+#   $charsplit  true => find_words_charlevel() (checked first)
+#   $pdfxtk     true => find_words_pdfxtk()
+#   otherwise        => find_words_standard()
+# Returns whatever list the selected tokenizer returns.
+sub find_words{
+    my ($string,$pdfxtk,$charsplit) = @_;
+
+    return find_words_charlevel($string) if ($charsplit);
+    return find_words_pdfxtk($string)    if ($pdfxtk);
+
+    return find_words_standard($string);
+}
+
+
+# Split a string on whitespace and pass the tokens to find_longest_words().
+# Leading whitespace is removed first so that split() does not yield an
+# empty first token.
+#
+# The string is copied before it is edited: the elements of @_ alias the
+# caller's variables, so changing $_[0] directly would alter the caller's
+# string and die on read-only arguments such as string literals.
+sub find_words_standard{
+    my ($string) = @_;
+    $string=~s/^\s*//;
+    return find_longest_words( split(/\s+/,$string) );
+}
+
+# Split a string into single characters and pass them to
+# find_longest_words(). Only leading whitespace is removed beforehand.
+#
+# The string is copied before it is edited: the elements of @_ alias the
+# caller's variables, so changing $_[0] directly would alter the caller's
+# string and die on read-only arguments such as string literals.
+sub find_words_charlevel{
+    my ($string) = @_;
+    $string=~s/^\s*//;
+    return find_longest_words( split(//,$string) );
+}
+
+
+# Tokenize a string in pdfxtk mode, applying a few repair heuristics:
+#
+#  - a string without any whitespace is handed to find_words_charlevel()
+#  - otherwise the string is split on whitespace; tokens longer than
+#    $LONGEST_WORD, or containing a lower-case letter directly followed
+#    by an upper-case one, are re-analysed with find_words_charlevel()
+#  - finally, hyphens are removed from a word if its hyphen-free form
+#    is a key of %voc
+#
+# Returns the resulting list of words.
+sub find_words_pdfxtk{
+    my $string = shift;
+
+    unless ($string=~/\s/){
+        return find_words_charlevel($string);
+    }
+
+    $string=~s/^\s*//;
+    my @words = ();
+    my @tokens = split(/\s+/,$string);
+
+    # Named loop variables are used instead of the global $_, which would
+    # otherwise stay aliased to the current token while other subs run.
+    foreach my $token (@tokens){
+
+        # suspiciously long words, or
+        # upper-case letters following a lower-cased one ...
+        if ( length($token) > $LONGEST_WORD || $token=~/\p{Ll}\p{Lu}/ ){
+            push(@words, find_words_charlevel($token) );
+        }
+        else{
+            push(@words, $token);
+        }
+    }
+
+    # dehyphenation: keep the hyphen-free form only if it is a known word
+    # ($word aliases the array element, so the assignment updates @words)
+    foreach my $word (@words){
+        next unless ($word=~/\-/);
+        my $str = $word;
+        $str=~s/\-//g;
+        $word = $str if (exists $voc{$str});
+    }
+
+    return @words;
+}
+
+
+
+
+
 sub read_vocabulary{
     my ($voc,$file) = @_;
     if ($file=~/\.gz$/){