tiedeman avatar tiedeman committed 1ca0db9

version 0.03

Comments (0)

Files changed (1)

Lingua-Identify-Blacklists/lib/Lingua/Identify/Blacklists.pm

   my $lang = identify( ".... text to be classified ...", 
                        langs => ['bs','hr','sr']);
 
-  # use all languages available
+  # check if the assumed language ('hr') is confused with another one
+  my $lang = identify( ".... text to be classified ...", assumed => 'hr' );
+
+  # use a general-purpose identifier and check confusable languages if necessary
   my $lang = identify( ".... text to be classified ...");
 
   # detect language in the given file (Unicode UTF-8 is assumed)
+  my $lang = identify_file( $filename, langs => [...] );
+  my $lang = identify_file( $filename, assumed => '..' );
   my $lang = identify_file( $filename );
 
   # detect language for every line separately from the given file
   # (return a list of lang-IDs)
+  my @langs = identify_file( $filename, every_line => 1, langs => [...] );
+  my @langs = identify_file( $filename, every_line => 1, assumed => '..' );
   my @langs = identify_file( $filename, every_line => 1 );
 
 
 
   train( { cs => $file_with_cs_text, sk => $file_with_sk_text }, %para );
 
+=head1 Description
+
+This module adds a blacklist classifier to a general purpose language identification tool. Related languages can easily be confused with each other and standard language detection tools do not work very well for distinguishing them. With this module one can train so-called blacklists of words for language pairs containing words that should not (or very rarely) occur in one language while being quite common in the other. These blacklists are then used to discriminate between those "confusable" related languages.
+
+Since version 0.03 it also integrates a standard language identifier (Lingua::Identify::CLD) and can now be used for general language identification. It calls the blacklist classifier only for those languages that can be confused and for which appropriate blacklists are trained.
 
 
 =head1 Settings
 
 Module-internal variables that can be modified:
 
- $BLACKLISTDIR     directory with all blacklists (default: module-share-dir)
- $LOWERCASE        lowercase all data, yes/no (1/0), default: 1
- $TOKENIZE         tokenize all data, yes/no (1/0), default: 1
- $ALPHA_ONLY       don't use tokens with non-alphabetic characters, default: 1
- $MAX_LINE_LENGTH  max line length when reading from files (default=1048576)
- $VERBOSE          verbose output (default=0)
+ $BLACKLISTDIR     # directory with all blacklists (default: module-share-dir)
+ $LOWERCASE        # lowercase all data, yes/no (1/0), default: 1
+ $TOKENIZE         # tokenize all data, yes/no (1/0), default: 1
+ $ALPHA_ONLY       # don't use tokens with non-alphabetic characters, default: 1
+ $MAX_LINE_LENGTH  # max line length when reading from files (default=2**16)
+ $CLD_TEXT_SIZE    # text size in characters used for language ident. with CLD
+ $VERBOSE          # verbose output (default=0)
 
 Tokenization is very simple and replaces all non-alphabetic characters with a white-space character.
 
 my $CLD = new Lingua::Identify::CLD;
 
 
-
+# load all blacklists in the general BLACKLISTDIR
 &load_blacklists( $BLACKLISTDIR );
 
 
 
 
-sub initialize{ %blacklists = (); %confusable = (); }
-
 =head1 Exported Functions
 
 =head2 C<$langID = identify( $text [,%options] )>
 
 Analyses a given text and returns a language ID as the result of the classification. C<%options> can be used to change the behaviour of the classifier. Possible options are
 
-  langs => \@list_of_possible_langs
+  assumed    => $assumed_lang
+  langs      => \@list_of_possible_langs
   use_margin => $score
 
+If C<langs> are specified, it runs the classifier with blacklists for those languages (in a cascaded way, i.e. best1 = lang1 vs lang2, best2 = best1 vs lang3, ...). If C<use_margin> is specified, it runs all versus all and returns the language that wins the most (with margin=$score).
+
+If the C<assumed> language is given, it runs the blacklist classifier for all languages that can be confused with $assumed_lang (if blacklist models exist for them).
+
+If neither C<langs> nor C<assumed> are specified, it first runs a general-purpose language identification (using Lingua::Identify::CLD and Lingua::Identify) and then checks with the blacklist classifier whether the detected language can be confused with another one. For example, CLD frequently classifies Serbian and Bosnian texts as Croatian but the blacklist classifier will detect that (and hopefully correct the decision).
+
 =cut
 
 sub identify{
 }
 
 
-sub identify_language{
-    my ($lang, $id, $conf) = $CLD->identify( $_[0] );
-
-    # strangely enough CLD is not really reliable for English
-    # (all kinds of garbish input is recognized as English)
-    # --> check with Lingua::Identify
-    if ($id eq 'en'){
-	$id = $id = langof( $_[0] ) ? $id : 'unknown';
-    }
-    return $id;
-}
-
-
-
-sub identify_stdin{
-    return identify_file( undef, @_ );
-}
-
 
 =head2 C<$langID = identify_file( $filename [,%options] )>
 
 
 
 
+=head2 C<$langID = identify_stdin( [%options] )>
+
+The same as C<identify_file> but reads from STDIN.
+
+=cut
+
+
+# Thin wrapper around identify_file(): passes undef in place of the
+# filename and forwards all remaining arguments (the options) unchanged.
+# Returns whatever identify_file() returns.
+sub identify_stdin{
+    return identify_file( undef, @_ );
+}
+
+
 
 
 =head2 C<train( \%traindata [,%options] )>
     my $evalfiles = shift;
     my $options = ref($_[0]) eq 'HASH' ? shift : {};
 
-    my @traindata = ref($trainfiles) eq 'ARRAY' ? @{$trainfiles} : split(/\s+/,$trainfiles);
-    my @evaldata = ref($evalfiles) eq 'ARRAY' ? @{$evalfiles} : split(/\s+/,$evalfiles);
+    my @traindata = 
+	ref($trainfiles) eq 'ARRAY' ? @{$trainfiles} : split(/\s+/,$trainfiles);
+    my @evaldata = 
+	ref($evalfiles) eq 'ARRAY' ? @{$evalfiles} : split(/\s+/,$evalfiles);
     my @langs = @_;
 
     die "no languages given!\n" unless (@langs);
 
 The following functions are not exported and are mainly used for internal purposes (but may be used from the outside if needed).
 
+ initialize()                     # reset the repository of blacklists
+ identify_language($text)         # return lang-ID for $text (using CLD)
  classify(\%dic,%options)         # run the classifier
  classify_cascaded(\%dic,@langs)  # run a cascade of binary classifications
 
 =cut
 
 
+# Reset the repository of blacklists: empties both %blacklists and
+# %confusable. Takes no arguments.
+sub initialize{ %blacklists = (); %confusable = (); }
 
+# Return the language ID that CLD reports for the text given in $_[0].
+#
+# A result of 'en' is double-checked with langof() (presumably from
+# Lingua::Identify, see the note below): 'en' is kept only if langof()
+# returns a true value for the same text, otherwise 'unknown' is returned.
+# Any other ID is returned as reported by CLD.
+sub identify_language{
+    # only the second value returned by CLD's identify (the ID) is needed
+    my (undef, $id) = $CLD->identify( $_[0] );
+
+    # strangely enough CLD is not really reliable for English
+    # (all kinds of garbage input is recognized as English)
+    # --> check with Lingua::Identify
+    if ($id eq 'en'){
+	$id = langof( $_[0] ) ? $id : 'unknown';
+    }
+    return $id;
+}
 
 sub classify{
     my $dic         = shift;
     return &classify_cascaded( $dic, @langs );
 }
 
-
-
-
-
 sub classify_cascaded{
     my $dic = shift;
     my @langs = @_;
     close F;
 }
 
-
-
 sub open_file{
     my $file = shift;
     # allow gzipped input
     return $fh;
 }
 
-
 sub read_file{
     my ($file,$dic,$max)=@_;
 
     foreach my $w (@words){${$_[1]}{$w}++;$_[2]++;}
 }
 
-
-
-
-
-
 1;
 
-
 __END__
 
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.