Commits

Anonymous committed 9572a8a

Add some scripts to spell-check the pages.

They have many false positives.

Comments (0)

Files changed (2)

bin/html-check-spelling-xmlp.pl

+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use utf8;
+
+use XML::Parser;
+use Text::Hunspell;
+use List::MoreUtils qw(any);
+
+# Affix (.aff) and dictionary (.dic) files handed to Text::Hunspell.
+# NOTE(review): the "en_GH" locale is kept exactly as it was; confirm that
+# it is the intended dictionary.
+my @dictionary_files = (
+    '/usr/share/hunspell/en_GH.aff',
+    '/usr/share/hunspell/en_GH.dic',
+);
+
+# Report an unusable dictionary up front, naming the offending file, rather
+# than failing later without an explanation.
+foreach my $dictionary_file (@dictionary_files)
+{
+    die "Cannot read hunspell file '$dictionary_file'"
+        unless -r $dictionary_file;
+}
+
+# Spell-checking engine used for every word examined below.
+my $speller = Text::Hunspell->new(@dictionary_files);
+
+die "Could not create a Text::Hunspell object from: @dictionary_files"
+    unless $speller;
+
+# The report contains the non-ASCII «» markers, so encode STDOUT as UTF-8.
+binmode STDOUT, ":encoding(utf8)";
+
+# Spell-check the character data of each file named on the command line.
+# Every line of text holding at least one word rejected by $speller is
+# printed as "filename:line:text", with the rejected words wrapped in «».
+for my $filename (@ARGV)
+{
+    # Char handler: receives the expat object and one chunk of text.
+    my $check_text = sub {
+        my ($expat, $string) = @_;
+
+        # The -1 limit keeps trailing empty fields, so the index of each
+        # element is its line offset within this chunk.
+        my @text_lines = split /\n/, $string, -1;
+
+        my $offset = -1;
+
+        TEXT_LINE:
+        for my $text (@text_lines)
+        {
+            $offset++;
+
+            my $found_misspelling = 0;
+
+            # Wrap every rejected word in «» and remember that this line
+            # has to be reported.
+            $text =~ s{
+                # Not sure this regex to match a word is fully
+                # idiot-proof, but we can amend it later.
+                ([\w'’-]+)
+            }{
+                my $word = $1;
+                $speller->check($word)
+                    ? $word
+                    : do { $found_misspelling = 1; "«$word»" };
+            }egx;
+
+            next TEXT_LINE unless $found_misspelling;
+
+            # The reported line is the parser's current line plus the
+            # offset of this line inside the chunk.
+            printf {*STDOUT} "%s:%d:%s\n",
+                $filename, $offset + $expat->current_line(), $text;
+        }
+
+        return;
+    };
+
+    my $parser = XML::Parser->new(Handlers => { Char => $check_text });
+
+    # Trap parse failures so that the message can name the offending file.
+    eval { $parser->parsefile($filename); };
+    my $err = $@;
+
+    die "Error '$err' at filename '$filename'" if $err;
+}

bin/t2_html_spell_check.bash

+#!/bin/bash
+
+# This is a temporary filter until I find out how to get rid of them there
+# exactly.
+#
+# Reads lines on stdin and writes to stdout only those lines that match
+# none of the patterns below.
+temp_filter()
+{
+    grep -vP '(humour/human-hacking/hebrew-v2|humour/humanity/buy-the-fish-in-hebrew|humour/humanity/ongoing-text-hebrew\.html|humour/Pope/The-Pope-Died-on-Sunday--Hebrew-Text)' |
+    grep -vP '^(dest/t2-homepage/index\.html)' |
+    grep -vP '^(dest/t2-homepage/old-news\.html)' |
+    grep -vP '^(dest/t2-homepage/lecture/)' |
+    grep -vP '^(dest/t2-homepage/philosophy/politics/define-zionism/heb/index\.html)' |
+    grep -vP '^(dest/t2-homepage/philosophy/politics/drug-legalisation/hebrew\.html)' |
+    cat # Ends the pipeline, so every grep line above can finish with "|".
+}
+
+# Copies stdin to stdout starting at the first line that contains
+# "t2-homepage/prog-evolution" and continuing to the end of the input.
+temp_only_from_reached()
+{
+    # $reached latches once the pattern has matched.  A latch is used rather
+    # than a "PATTERN..1" flip-flop: there the constant 1 is compared with
+    # the input line number, so the range would close immediately whenever
+    # the very first line matched.
+    perl -lne 'print if ($reached ||= m{t2-homepage/prog-evolution})'
+}
+
+# Reads lines on stdin and writes to stdout only those lines that match
+# none of the exclusion patterns listed below.
+old_find_quotes_filter()
+{
+    # One Perl-compatible regex per entry; a line is dropped as soon as any
+    # of them matches.
+    local -a excluded_patterns=(
+        '(catb-heb|WebMetaLecture/slides/examples|t2-homepage/rewrite\.html|humour/by-others/|humour/bits/COBOL-the-New-Age|humour/bits/Mastering-Cat|humour/fortunes/nyh-sigs|humour/fortunes/sharp-perl|humour/fortunes/sharp-programming|humour/fortunes/|humour/human-hacking/arabic-v2|humour/human-hacking/human-hacking-field-guide/|humour/human-hacking/human-hacking-field-guide-v2-arabic/|humour/TheEnemy/TheEnemy_eng\.html|humour/TheEnemy/The-Enemy-English-rev4\.html|humour/TheEnemy/The-Enemy-English-rev5\.html|humour/TheEnemy/The-Enemy-English-rev6\.html|humour/TheEnemy/The-Enemy-English-v7/|humour/TheEnemy/The-Enemy-Hebrew-v7\.html|humour/TheEnemy/The-Enemy-English-v7\.html|humour/TheEnemy/TheEnemy\.html|humour/TheEnemy/The-Enemy-rev[456]\.html|me/resumes/Shlomi-Fish-Heb-Resume\.html)'
+        'meta/copyrights/index\.html' # Contains rel="nofollow"
+        'open-source/anti/php/index\.html' # Contains code
+        'open-source/bits-and-bobs/greasemonkey/grease\.html' # Contains HTML markup
+        'open-source/projects/Module-Format/index\.html' # contains code
+        'open-source/projects/XML-Grammar/Fiction/index\.html' # contains markup
+        'open-source/projects/Spark/mission/' # contains code
+        'philosophy/by-others/mashhoor--10-reasons--hebrew\.html' # contains code
+        'philosophy/computers/high-quality-software/index\.html$' # old essay
+        'philosophy/computers/high-quality-software/rev2/index\.html$' # contains output
+        'philosophy/computers/high-quality-software/rev2/what-makes-software-high-quality-rev2/freecell-solvers-quality\.html$' # contains output
+        'philosophy/computers/high-quality-software/rev2/what-makes-software-high-quality-rev2/parameters-of-quality\.html$' # contains output
+        'philosophy/computers/high-quality-software/what-makes-software-high-quality/' # old
+        'philosophy/computers/optimizing-code-for-speed/index\.html$' # old
+        'philosophy/computers/perl/joy-of-perl/joy-of-perl\.html$' # contains some code
+        'philosophy/computers/software-management/perfect-workplace/perfect-it-workplace(/|\.xhtml$)' # old
+        'philosophy/computers/web/create-a-great-personal-homesite/index\.html' # in code
+        'philosophy/computers/web/create-a-great-personal-homesite/rev2\.html' # in code
+        'philosophy/computers/web/online-communities/index\.html' # in code
+        'philosophy/foss-other-beasts/revision-2/' # in code
+        'philosophy/obj-oss/objectivism-and-open-source/' # old
+        'philosophy/politics/drug-legalisation/case-for-drug-legalisation/' # old
+        'rindolf/rindolf-spec/' # old and contains code
+    )
+
+    # With IFS set to "|", "${excluded_patterns[*]}" joins the entries into
+    # a single alternation, so one grep does the work of the whole chain.
+    local IFS='|'
+
+    grep -vP "${excluded_patterns[*]}"
+}
+
+# List every path under dest/t2-homepage/ that ends in ".html" or ".xhtml",
+# drop the ones known not to be XHTML, and pass the rest, sorted, to the
+# spell-checking script.
+#
+# The regex has to match the whole path.  The literal dot is required:
+# without it the pattern also accepts any path that merely ends in "html".
+find dest/t2-homepage/ -regextype posix-extended -regex '.*\.x?html' -print |
+    grep -vP '/catb-heb\.html$' | # HTML - not XHTML file.
+    grep -vP 'WebMetaLecture/slides/examples' | # HTML - not XHTML files.
+    grep -vP '/rewrite\.html$' | # HTML - not XHTML files.
+    sort |
+    xargs -d '\n' perl bin/html-check-spelling-xmlp.pl