1. Shlomi Fish
  2. perl-begin

Commits

Shlomi Fish  committed c05d79f

Started spell checking.

  • Participants
  • Parent commits def685b
  • Branches spell_checking

Comments (0)

Files changed (5)

File Tests/spell-check.t

View file
  • Ignore whitespace
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use Test::More tests => 1;
+
+{
+    # Run the spell-checker front end.  The path is relative, so the test
+    # presumably has to be started from the repository root.
+    my $report = qx{./bin/spell-checker-iface.sh};
+    chomp $report;
+
+    # Any output at all is a list of suspected misspellings.
+    # TEST
+    is( $report, '', "No spelling errors." );
+}

File bin/html-check-spelling-xmlp.pl

View file
  • Ignore whitespace
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use autodie;
+
+use utf8;
+
+use HTML::Parser 3.00 ();
+use Text::Hunspell;
+use List::MoreUtils qw(any);
+
+# Hunspell handle used for every dictionary lookup below.  The en_GB affix and
+# dictionary files are loaded from fixed system paths.
+my @hunspell_files = (
+    '/usr/share/hunspell/en_GB.aff',
+    '/usr/share/hunspell/en_GB.dic',
+);
+
+# Fail early, and with a useful message, when a dictionary file is missing
+# instead of reporting spelling results based on an unusable dictionary.
+foreach my $hunspell_file (@hunspell_files)
+{
+    die "Hunspell file '$hunspell_file' is missing or unreadable\n"
+        unless -r $hunspell_file;
+}
+
+my $speller = Text::Hunspell->new(@hunspell_files);
+
+die "Could not initialise Text::Hunspell from: @hunspell_files\n"
+    unless $speller;
+
+binmode STDOUT, ":encoding(utf8)";
+
+my %general_whitelist;
+my %per_filename_whitelists;
+
+# Load the whitelist file.  Entries that appear before any "====" header go
+# into %general_whitelist; a "====file1,file2" header switches the target to
+# the per-file whitelists of the listed files until the next header.
+{
+    my @current_whitelists_list = (\%general_whitelist);
+    open my $fh, '<:encoding(utf8)', 'lib/hunspell/whitelist1.txt';
+
+    LINE:
+    while (my $line = <$fh>)
+    {
+        chomp($line);
+
+        # Words are looked up without any whitespace, so an entry with a
+        # trailing blank or a carriage return could never match - trim it.
+        $line =~ s/\A\s+//;
+        $line =~ s/\s+\z//;
+
+        # Blank line or comment - skip.
+        next LINE if $line eq '' or $line =~ /\A#/;
+
+        if ($line =~ /\A====(.*)/)
+        {
+            # Header: comma-separated list of file names.  Trim each name
+            # and ignore empty ones (e.g. from a doubled comma).
+            my @filenames = split /,/, $1;
+            foreach my $name (@filenames)
+            {
+                $name =~ s/\A\s+//;
+                $name =~ s/\s+\z//;
+            }
+
+            @current_whitelists_list =
+            (
+                map { $per_filename_whitelists{$_} ||= +{} }
+                grep { length($_) }
+                @filenames
+            );
+        }
+        else
+        {
+            # Plain entry: add it to every whitelist currently selected.
+            foreach my $whitelist (@current_whitelists_list)
+            {
+                $whitelist->{$line} = 1;
+            }
+        }
+    }
+    close ($fh);
+}
+
+my %inside;
+
+# HTML::Parser start/end handler: keeps a per-tag nesting depth in %inside.
+# $tag is the tag name; $num is '+1' for a start tag and '-1' for an end tag.
+#
+# Nothing is printed here: anything written to STDOUT would end up in front
+# of the next "file:line:text" report record.
+sub tag
+{
+    my ($tag, $num) = @_;
+
+    $inside{$tag} += $num;
+
+    # A stray end tag must not leave a negative (and therefore true) depth
+    # behind, because the text handler treats any true depth for <script> or
+    # <style> as "skip this text".
+    $inside{$tag} = 0 if $inside{$tag} < 0;
+
+    return;
+}
+
+
+# Spell-check each HTML file named on the command line.  For every text line
+# that contains at least one rejected word, print "filename:line:text" with
+# the rejected words wrapped in «».
+foreach my $filename (@ARGV)
+{
+    # Tag nesting counters must not leak from one (possibly unbalanced)
+    # document into the next.
+    %inside = ();
+
+    # Look the per-file whitelist up once; falling back to an empty hash
+    # avoids autovivifying an entry for every file that is checked.
+    my $file_whitelist = $per_filename_whitelists{$filename} || +{};
+
+    # Text handler for HTML::Parser.  Receives the entity-decoded text of a
+    # text event and the line number on which that event starts.
+    my $process_text = sub
+    {
+        my ($text, $start_line) = @_;
+
+        # Skip the contents of <script> and <style> elements.
+        return if $inside{script} || $inside{style};
+
+        $start_line = 1 if !defined($start_line);
+
+        # Split into physical lines; the index of each piece is its offset
+        # from $start_line.
+        my @lines = split /\n/, $text, -1;
+
+        foreach my $idx (0 .. $#lines)
+        {
+            my $line = $lines[$idx];
+
+            my $misspelling_found = 0;
+
+            # Returns the (normalised) word unchanged if it is accepted, or
+            # wrapped in «» if it is rejected; also records the rejection.
+            my $mark_word = sub {
+                my ($word) = @_;
+
+                # Turn a typographic apostrophe in common English
+                # contractions/possessives into a plain one, then drop a
+                # trailing apostrophe.
+                $word =~ s{’(ve|s|m|d|t|ll|re)\z}{'$1};
+                $word =~ s{[’']\z}{};
+                if ($word =~ /[A-Za-z]/)
+                {
+                    # The word contains Latin letters: strip the Hebrew
+                    # prefix and suffix patterns before looking it up.
+                    $word =~ s{\A(?:(?:ֹו?(?:ש|ל|מ|ב|כש|לכש|מה|שה|לכשה|ב-))|ו)-?}{};
+                    $word =~ s{'?ים\z}{};
+                }
+
+                # Rejected only if it is in neither whitelist, is not made
+                # up purely of Hebrew letters/hyphens/apostrophes, and the
+                # dictionary does not know it.
+                my $verdict =
+                (
+                    (!exists($general_whitelist{$word}))
+                        &&
+                    (!exists($file_whitelist->{$word}))
+                        &&
+                    ($word !~ m#\A[\p{Hebrew}\-'’]+\z#)
+                        &&
+                    (!($speller->check($word)))
+                );
+
+                $misspelling_found ||= $verdict;
+
+                return $verdict ? "«$word»" : $word;
+            };
+
+            $line =~ s/
+            # Not sure this regex to match a word is fully
+            # idiot-proof, but we can amend it later.
+            ([\w'’-]+)
+            /$mark_word->($1)/egx;
+
+            if ($misspelling_found)
+            {
+                printf {*STDOUT}
+                (
+                    "%s:%d:%s\n",
+                    $filename,
+                    $start_line + $idx,
+                    $line
+                );
+            }
+        }
+    };
+
+    # Decode strictly as UTF-8 rather than merely flagging the bytes.
+    open(my $fh, "<:encoding(UTF-8)", $filename);
+
+    HTML::Parser->new(api_version => 3,
+        handlers    => [start => [\&tag, "tagname, '+1'"],
+            end   => [\&tag, "tagname, '-1'"],
+            # "line" asks the parser for the line on which the text starts.
+            text  => [$process_text, "dtext, line"],
+        ],
+        marked_sections => 1,
+    )->parse_file($fh);
+
+    close ($fh);
+}
+
+# Finish the output with a newline.
+print "\n";

File bin/spell-checker-iface.sh

View file
  • Ignore whitespace
+#!/bin/bash
+# Spell-check every generated *.html / *.xhtml page under dest/ and print one
+# "file:line:text" record per line that has a suspected misspelling.
+#
+# File names travel NUL-delimited through the pipeline so that paths
+# containing whitespace or quotes reach the checker intact.
+find dest \( -name '*.html' -o -name '*.xhtml' \) -print0 |
+    ( LC_ALL=C sort -z ) |
+    xargs -0 perl bin/html-check-spelling-xmlp.pl |
+    grep ':'
+    # Disabled filter, apparently kept for restricting a run to some files:
+    # perl -lne 'print if /MathVentures\/3d.*\.xhtml/' |
+
+# grep exits non-zero when nothing matches (i.e. no misspellings were
+# reported), so report success unconditionally.
+exit 0

File lib/hunspell/whitelist1.txt

View file
  • Ignore whitespace
+W3C
+Wikis
+wikibooks
+Wikipedia's
+Hexten
+CSS
+XHTML
+GoFlexiblePro
+shlomif
+shlomifish
+org
+Shlomi
+Webmaster
+Perl
+perl
+22-Jul-2011
+Unported
+9-Jul-2012
+Wiki
+OSDC
+wiki
+QA
+Win32
+Wikibooks
+Wikipedia
+Perl-related
+wikipedia
+MediaWiki-based
+wikis
+CPAN
+XML
+CGI
+Sys
+Admin
+IRC
+XMPP
+Telnet
+Bio-Info
+Blogs
+IRC
+Freenode's
+IDEs
+Online
+PERL
+Moertel
+
+====dest/wikis/index.html
+
+ajt

File src/wikis/index.html.wml

View file
  • Ignore whitespace
 <li>
 <a href="http://en.wikipedia.org/wiki/Perl">Perl at the
 English Wikipedia</a> - The wikipedia is a multi-lingual world-editable
-encyclopedia. The Perl Page contains links to other pages inside the wikipedia
+encyclopaedia. The Perl Page contains links to other pages inside the wikipedia
 about Perl.<br />
 
 <ul>