Commits

tiedeman committed 2d55444

integrated Lingua::Identify::CLD

  • Parent commits 55a3383


Files changed (5)

File Lingua-Identify-Blacklists/Changes

 0.02	Sat Oct 13 09:57:20 CEST 2012
 	- now with documentation
 
-0.03	Sat Oct 13 21:41:54 CEST 2012
+0.03	Sun Oct 14 18:50:20 CEST 2012
 	- read from files with line-length-limit
+	- integrated general-purpose language identifier (Lingua::Identify::CLD)
+
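In practice the new fallback means identify() can be called without a list of candidate languages. A minimal usage sketch (the sample text and the file name are placeholders; function and option names are taken from the code in this commit):

    use Lingua::Identify::Blacklists qw(identify identify_file);

    my $text = 'Ovo je kratak primjer teksta.';   # placeholder input

    # as before: decide among an explicit set of candidate languages
    # (runs the blacklist classifier directly)
    my $lang = identify( $text, langs => ['bs', 'hr', 'sr'] );

    # new: without 'langs' or 'assumed', CLD guesses the language first;
    # blacklists are only consulted if that guess has known confusables
    my $guess = identify( $text );

    # files work the same way, optionally classifying every line separately
    my @per_line = identify_file( 'input.txt', every_line => 1 );

Passing 'langs' keeps the old blacklist-only behaviour, so existing callers should not be affected.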

File Lingua-Identify-Blacklists/Makefile.PL

 requires 'File::ShareDir'           => 0;
 requires 'File::Basename'           => 0;
 requires 'File::GetLineMaxLength'   => 0;
+requires 'Lingua::Identify'         => 0;
+requires 'Lingua::Identify::CLD'    => '0.05';
+
 
 WriteAll;

File Lingua-Identify-Blacklists/lib/Lingua/Identify/Blacklists.pm

 use File::Basename qw/dirname/;
 use File::GetLineMaxLength;
 
+use Lingua::Identify qw(:language_identification);
+use Lingua::Identify::CLD;
+
 use Exporter 'import';
 our @EXPORT_OK = qw( identify identify_file identify_stdin 
                      train train_blacklist run_experiment 
 our $TOKENIZE        = 1;
 our $ALPHA_ONLY      = 1;
 our $MAX_LINE_LENGTH = 2**16;    # limit the length of one line to be read
+our $CLD_TEXT_SIZE   = 2**16;    # text size used for detecting lang with CLD
+our $VERBOSE         = 0;
+
+my %blacklists = ();  # hash of blacklists (langpair => {blacklist}, ...)
+my %confusable = ();  # hash of confusable languages (lang => [other_langs])
+
+## the compact language identifier from Google Chrome
+my $CLD = Lingua::Identify::CLD->new();
 
-my %blacklists = ();
 
-our $VERBOSE = 0;
 
+&load_blacklists( $BLACKLISTDIR );
 
 
 
-sub initialize{ %blacklists = (); }
+
+sub initialize{ %blacklists = (); %confusable = (); }
 
 =head1 Exported Functions
 
   my %dic = ();
   my $total = 0;
 
-  &process_string( $text, \%dic, $total );
-  return &classify( \%dic, %options );
+  # run the blacklist classifier if 'langs' are specified
+  if (exists $options{langs}){
+      &process_string( $text, \%dic, $total );
+      return &classify( \%dic, %options );
+  }
+
+  # otherwise: check if there is an 'assumed' language
+  # if not: classify with CLD
+  $options{assumed} = &identify_language( $text ) 
+      unless (exists $options{assumed});
+
+  # if there is an 'assumed' language:
+  # check if it can be confused with others (i.e. blacklists exist)
+  if (exists $confusable{$options{assumed}}){
+      $options{langs} = $confusable{$options{assumed}};
+      # finally: process the text and classify
+      &process_string( $text, \%dic, $total );
+      return &classify( \%dic, %options );
+  }
+  return $options{assumed};
 }
 
 
+sub identify_language{
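+    # guess the most likely language with CLD
+    # (used as the 'assumed' language when no candidate languages are given)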
+    my ($lang, $id, $conf) = $CLD->identify( $_[0] );
+
+    # strangely enough CLD is not really reliable for English
+    # (all kinds of garbage input are recognized as English)
+    # --> check with Lingua::Identify
+    if ($id eq 'en'){
+	$id = langof( $_[0] ) ? $id : 'unknown';
+    }
+    return $id;
+}
+
+
+
 sub identify_stdin{
     return identify_file( undef, @_ );
 }
     my $total = 0;
     my @predictions = ();
     
-    my $fh = *STDIN;
-    if (defined $file){
-	if (-e $file){
-	    open $fh,"<$file" || die "cannot read from '$file'\n";
-	    binmode($fh,":encoding(UTF-8)");
+    my $fh     = defined $file ? open_file($file) : *STDIN;
+    my $reader = File::GetLineMaxLength->new($fh);
+
+    # mode 1: classify every line separately
+    if ($options{every_line}){
+	my @predictions = ();
+	while (my $line = $reader->getline($MAX_LINE_LENGTH)) {
+	    chomp $line;
+            push( @predictions, &identify( $line, %options ) );
 	}
-	else{ print STDERR "Cannot find file '$file'! Read from STDIN\n"; }
+	return @predictions;
     }
-    
-    while (<$fh>){
-	chomp;
-	&process_string($_,\%dic,$total);
-	if ($options{every_line}){                        # classify every line separately
-            push( @predictions, &classify( \%dic, %options ) );
-            %dic=();
+
+    # mode 2: classify all text together (optional: size limit)
+    my $text = '';
+    while (my $line = $reader->getline($MAX_LINE_LENGTH)) {
+
+	# save text if no languages are given (for blacklists)
+	unless (exists $options{langs} || exists $options{assumed}){
+	    if ( length($text) < $CLD_TEXT_SIZE ){
+		$text .= $line;
+	    }
 	}
-	elsif ($options{text_size}){                     # use only a certain number of words
+
+	# prepare the data for blacklist classification
+	# (TODO: we do not run blacklists all the time - 
+	#        should we process the text later when needed?)
+	chomp $line;
+	&process_string($line,\%dic,$total);
+	if ($options{text_size}){        # use only a certain number of words
 	    if ($total > $options{text_size}){
-		print STDERR "use $total tokens for classification\n" if ($VERBOSE);
+		print STDERR "use $total tokens for classification\n" 
+		    if ($VERBOSE);
 		last;
 	    }
 	}
     }
-    unless ($options{every_line}){
-	push( @predictions, &classify( \%dic, %options ) );
+
+    # no languages selected?
+    unless (exists $options{langs}){
+	# no assumed language set
+	unless (exists $options{assumed}){
+	    # try to identify with the text we have saved above
+	    $options{assumed} = &identify_language( $text ) 
+		unless (exists $options{assumed});
+	}
+	if (exists $confusable{$options{assumed}}){
+	    $options{langs} = $confusable{$options{assumed}};
+	}
+    }
+
+    # finally: classify with blacklists
+    if (exists $options{langs}){
+	# finally: process the text and classify
+	&process_string( $text, \%dic, $total );
+	return &classify( \%dic, %options );
     }
-    return wantarray ? @predictions : $predictions[0];
+
+    # no blacklists in this case ...
+    return $options{assumed};
 }
 
 
 
 sub available_languages{
     unless (keys %blacklists){
-	&load_all_blacklists( $BLACKLISTDIR );
+	&load_blacklists( $BLACKLISTDIR );
     }
     my %langs = ();
     foreach (keys %blacklists){
 
 sub available_blacklists{
     unless (keys %blacklists){
-	&load_all_blacklists( $BLACKLISTDIR );
+	&load_blacklists( $BLACKLISTDIR );
     }
     my %pairs = ();
     foreach (keys %blacklists){
 
  classify_with_margin(\%dic,$margin,@langs) 
 
- load_all_blacklists()            # load all blacklists available in BLACKLISTDIR
+ load_blacklists($dir)                # load all blacklists available in $dir
  load_blacklist(\%list,$dir,      # load a lang-pair specific blacklist
                 $lang1,$lang2)  
  read_file($file,\%dic,$max)      # read a file and count token frequencies
 }
 
 
+# load_all_blacklists = alias for load_blacklists
 
+sub load_all_blacklists{ return load_blacklists(@_); }
 
-
-
-
-sub load_all_blacklists{
-    my $dir = shift;
+sub load_blacklists{
+    my $dir = shift || $BLACKLISTDIR;
 
     opendir(my $dh, $dir) || die "cannot read directory '$dir'\n";
     while(readdir $dh) {
 	}
     }
     closedir $dh;
+
+    # update list of confusable languages
+    my %lists = &available_blacklists();
+    foreach my $lang (keys %lists){
+	@{$confusable{$lang}} = keys %{$lists{$lang}};
+	unshift( @{$confusable{$lang}}, $lang );
+    }
 }
 
 
 
 
 
-
-sub read_file{
-    my ($file,$dic,$max)=@_;
-    my $total = 0;
-
+sub open_file{
+    my $file = shift;
     # allow gzipped input
     my $fh;
     if ($file=~/\.gz$/){
     else{
 	open $fh,"<:encoding(UTF-8)",$file || die "cannot open file '$file'";
     }
+    return $fh;
+}
+
+
+sub read_file{
+    my ($file,$dic,$max)=@_;
 
     # use File::GetLineMaxLength to avoid filling the memory
     # when reading from files without new lines
+    my $fh     = open_file( $file );
     my $reader = File::GetLineMaxLength->new($fh);
+
+    my $total = 0;
     while (my $line = $reader->getline($MAX_LINE_LENGTH)) {
 	chomp $line;
         &process_string($line,$dic,$total);

File Lingua-Identify-Blacklists/t/10_identify.t

 use Lingua::Identify::Blacklists ':all';
 
 
-my %texts = ( 'bs' => 'U Sudu BiH danas je saslušanjem svjedoka optužbe nastavljeno suđenje zločinačkoj organizaciji na čelu sa Zijadom Turkovićem koja se tereti za više monstruoznih likvidacija, međunarodnu trgovinu drogom, pljačku 2,5 miliona maraka iz sarajevskog aerodroma, pranje novca te otimanje dionica fabrike čarapa “Ključ”. Svjedočio je Drago Neimarović zvani Sandokan koji je bio blizak prijatelj ubijenog Marija Tolića. On je kazao da je Turkovića upoznao prije četiri ili pet godina u diskoteci “Party” u Busovači.',
+my %texts = ( 'unknown' => '<B8><A4>^A<BA>^R@^P^\^L^',   # CLD thinks this is English!
+	      'en' => 'This is a very short English text',
+	      'bs' => 'U Sudu BiH danas je saslušanjem svjedoka optužbe nastavljeno suđenje zločinačkoj organizaciji na čelu sa Zijadom Turkovićem koja se tereti za više monstruoznih likvidacija, međunarodnu trgovinu drogom, pljačku 2,5 miliona maraka iz sarajevskog aerodroma, pranje novca te otimanje dionica fabrike čarapa “Ključ”. Svjedočio je Drago Neimarović zvani Sandokan koji je bio blizak prijatelj ubijenog Marija Tolića. On je kazao da je Turkovića upoznao prije četiri ili pet godina u diskoteci “Party” u Busovači.',
 	      'hr' => 'VOĆIN – Unatoč kiši tijekom vikenda u voćinskim šumama i na ratnim ruševinama bivšeg odmarališta Zvečevo na Papuku, Komisija za potrage i lavine GSS-a Hrvatske uspješno je provela trodnevnu vježbu i licenciranje potražnih ekipa u kojoj je sudjelovalo dvadesetak spasilaca i njihovih potražnih pasa iz cijele Hrvatske. Domaćin vježbe bili su podružnica GSS-a Požega i spasilac Stjepan Gal iz Slatine, a u organizaciji vježbe pomogla je Općina Voćin. Malo je poznato da GSS Hrvatske danas ima 550 volontera, spasilaca, od kojih je 350 položilo zahtjevne ispite i dobilo licenciju spašavatelja. Kad god postoji potreba, volonteri velikog srca ostavljaju poslove, sjedaju u automobile i ponekad prelaze više od 800 kilometara samo da bi, bez ikakve naknade, pomogli ljudima u nevolji.',
 	      'sr' => 'Održavanje predsedničkih i parlamentarnih izbora na Kosmetu, a pogotovo najava predsednika Opštine Kosovska Mitrovica Krstimira Pantića da će u opštinama na severu Kosova, Zvečanu i Zubinom Potoku, uprkos protivljenju zvaničnog Beograda biti održani i lokalni izbori, alarmirali su NATO. Tako će ova vojna alijansa u srpsku pokrajinu do kraja ove nedelje poslati još 700 vojnika koji će pojačati nemački i austrijski kontingent. Oni će biti stacionirani na punktovima u Kosovskoj Mitrovici i u albanskim selima u opštinama Zvečan i Zubin Potok. Procena komande NATO-a jeste da u slučaju bilo kakvih etnički motivisanih sukoba Euleks ne bi imao dovoljno snage i kapaciteta da zaustavi nasilje.');
 

File Lingua-Identify-Blacklists/t/20_identify_file.t

 
 use Lingua::Identify::Blacklists ':all';
 
+# evaluation files
 my %files = ( bs => "$Bin/data/eval/dnevniavaz.ba.200.check",
 	      hr => "$Bin/data/eval/vecernji.hr.200.check",
 	      sr => "$Bin/data/eval/politika.rs.200.check" );
 
+# expected prediction counts per language (line-by-line classification with every_line)
+my %eval = ( 'bs' => { bs => 188, hr => 11, sr => 1 },
+	     'hr' => { hr => 196, bs => 4 },
+	     'sr' => { sr => 200 } );
+
+
 foreach my $lang (keys %files){
-    is( identify_file($files{$lang}), $lang) ;
-    is (my @pred = identify_file($files{$lang}, every_line => 1), 200);
+
+    # classify the whole file
+    is( identify_file($files{$lang}), $lang);
+
+    # classify every line separately
+    my @pred = identify_file($files{$lang}, every_line => 1);
+    foreach my $l (keys %{$eval{$lang}}){
+	is( my $count = grep ($_ eq $l,@pred), $eval{$lang}{$l} );
+    }
 }
 
 done_testing;