Commits

tiedeman committed 3eb608d

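Fixed a problem with the sentence splitter: renamed the `split` subroutine in uplug-sent to `split_data`, and added whitespace normalization (collapsing repeated spaces, trimming spaces at line starts and ends) to `_preprocess` in SentDetect.pm.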

  • Participants
  • Parent commits 8a0f590

Comments (0)

Files changed (2)

uplug-main/bin/uplug-sent

 	    print STDERR '.';
 	}
     }
-    &split($data);
+    &split_data($data);
     $output->write($data);
 }
 
 my $parId;
 my $id;
 my $idhead;
-sub split{
+sub split_data{
     my $data=shift;
     my %subst=();
 

uplug-main/lib/Uplug/PreProcess/SentDetect.pm

 sub _preprocess {
     my ( $self, $text ) = @_;
 
+    # clean up spaces at head and tail of each line as well as any double-spacing
+    $text =~ s/ +/ /g;
+    $text =~ s/\n /\n/g;
+    $text =~ s/ \n/\n/g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
     ##### add sentence breaks as needed #####
 
     #non-period end of sentence markers (?!) followed by sentence starters.