# xemacsweb / checkURLfragment.pl

# Adrian Aichner (APA), aichner@ecf.teradyne.com, Teradyne GmbH, 2000-10-23.

# Make URL fragment identifier conform with XHTML/XML.
#
# See "C.8 Fragment Identifiers"
# in
# http://www.w3.org/TR/2000/REC-xhtml1-20000126/#guidelines


use strict;
use English;
use Getopt::Long;

# Command-line options understood by this script.  The keys are the option
# names handed to Getopt::Long's GetOptions(); the values are short
# human-readable descriptions of what each option does.
my %options = (
               "help"
               => "Display this help message",
               "fix"
               => "Transliterate (fix) characters disallowed in XHTML/XML fragment identifiers",
              );

# This is the place to map illegal characters to legal ones.
# See http://www.w3.org/TR/2000/REC-xhtml1-20000126/#guidelines
# C.8 Fragment Identifiers
# In XML, URIs [RFC2396] that end with fragment identifiers of the form
# "#foo" do not refer to elements with an attribute name="foo"; rather,
# they refer to elements with an attribute defined to be of type ID,
# e.g., the id attribute in HTML 4.  Many existing HTML clients don't
# support the use of ID-type attributes in this way, so identical values
# may be supplied for both of these attributes to ensure maximum forward
# and backward compatibility (e.g., <a id="foo" name="foo">...</a>).
# Further, since the set of legal values for attributes of type ID is
# much smaller than for those of type CDATA, the type of the name
# attribute has been changed to NMTOKEN. This attribute is constrained
# such that it can only have the same values as type ID, or as the Name
# production in XML 1.0 Section 2.5, production 5. Unfortunately, this
# constraint cannot be expressed in the XHTML 1.0 DTDs. Because of this
# change, care must be taken when converting existing HTML
# documents. The values of these attributes must be unique within the
# document, valid, and any references to these fragment identifiers
# (both internal and external) must be updated should the values be
# changed during conversion.
# Finally, note that XHTML 1.0 has deprecated the name attribute of the
# a, applet, form, frame, iframe, img, and map elements, and it will be
# removed from XHTML in subsequent versions.
# Translation table: each key is a character that is replaced wherever it
# occurs in a fragment identifier, each value is the character that takes
# its place.
my %map = (
    q{ } => q{_},    # space
    "\t" => q{_},    # horizontal tab
    q{/} => q{:},    # slash
);

# Package variables that Getopt::Long's GetOptions() sets for the -help
# and -fix options.  They must be package (not lexical) variables because
# GetOptions() is called without explicit destinations.
our (
     $opt_help,
     $opt_fix,
    );

# Matches a name attribute together with its quoted value; the attribute
# name, "=" and the value may be separated by spaces, tabs or newlines.
#   group 1: the whole attribute, e.g.  name = "foo bar"
#   group 2: the value including its surrounding quotes
# The leading \b keeps attributes whose name merely ends in "name"
# (classname=, hostname=, ...) from being treated as name attributes.
my $anchor_name_regex = qr{
                           (
                            \bname[ \t\n]*=[ \t\n]*
                            (
                             \"[^\"]*\"
                             |
                             \'[^\']*\'
                            )
                           )
                          }x;

# Matches an href attribute whose quoted value contains a "#", i.e. a URL
# with a fragment identifier.
#   group 1: the whole attribute, e.g.  href="page.html#some id"
#   group 2: the value including its surrounding quotes
#   group 3: the fragment identifier when the value is double-quoted
#   group 4: the fragment identifier when the value is single-quoted
# Exactly one of groups 3 and 4 is defined after a match.  The leading \b
# keeps attributes whose name merely ends in "href" from matching.
# The comments inside the pattern deliberately avoid variable-like text,
# because qr{} interpolates variables even inside /x comments.
my $anchor_href_regex = qr{
                           (
                            \bhref[ \t\n]*=[ \t\n]*
                            (
                             \"[^\"\#]*\#
                             ( # group 3: double-quoted fragment identifier
                              [^\"]*
                             )
                             \"
                             |
                             \'[^\'\#]*\#
                             ( # group 4: single-quoted fragment identifier
                              [^\']*
                             )
                             \'
                            )
                           )
                          }x;

# Entry point of the script.
#
# Parses the command line, then reads each input file (or standard input)
# in one piece and examines
#   - the value of every name="..." attribute, and
#   - the fragment identifier (the text after "#") of every href="..."
#     attribute
# for characters that appear as keys in %map.
#
# Without -fix, each offending attribute is reported on STDERR and the
# input is left alone.  With -fix, the offending characters are replaced
# as %map directs and the resulting document is printed on STDOUT.
#
# Returns 1 when the usage message was shown (invalid option or -help),
# 0 otherwise.  The return value is used as the exit status.
sub main {
    # Swallow the file in one piece to cope with anchors spanning multiple
    # lines, like:
    # 	    <li><a
    #  href
    # =
    # "#lib-src_ChangeLog"
    #  shape="rect">lib-src/ChangeLog</a></li>
    undef $INPUT_RECORD_SEPARATOR;
    if (! GetOptions(keys(%options)) or $opt_help) {
        # print rather than printf: $0 is data, not a format string.
        print(STDERR "Usage: $0 [-fix] [file ...]\n");
        for my $option (sort(keys(%options))) {
            printf(STDERR "  -%-5s %s\n", $option, $options{$option});
        }
        return 1;
    }

    # Character class of everything that has to be replaced.  quotemeta
    # keeps characters that are special inside [...] literal.
    my $bad_chars = quotemeta(join("", keys(%map)));
    my $bad_regex = qr{([$bad_chars])};

    # Returns a copy of its argument with every key of %map replaced by
    # the corresponding value.
    my $translate = sub {
        my ($text) = @_;
        $text =~ s{$bad_regex}{$map{$1}}g;
        return $text;
    };

    # Checks the $length characters of $$content_ref starting at $start.
    # When they need translating, either rewrites exactly that span in
    # place (-fix) or reports $attribute on STDERR.  Working on offsets
    # instead of searching for the text again guarantees that only the
    # matched value is touched, never an identical string elsewhere
    # (e.g. the path part of the same URL).
    my $fix_or_report = sub {
        my ($content_ref, $start, $length, $attribute) = @_;
        my $original = substr($$content_ref, $start, $length);
        my $fixed = $translate->($original);
        return if $fixed eq $original;
        if ($opt_fix) {
            # Modifying the string may reset its m//g search position, so
            # remember it and restore it, shifted by any change in length.
            my $resume = pos($$content_ref);
            substr($$content_ref, $start, $length, $fixed);
            pos($$content_ref) = $resume + length($fixed) - $length;
        } else {
            printf(STDERR "%s needs fixing.\n", $attribute);
        }
        return;
    };

    while (my $content = <>) {
        while ($content =~ m{$anchor_name_regex}g) {
            my $attribute = $1;
            # Group 2 is the value including its quotes; step inside the
            # quotes so that whitespace around "=" is never translated.
            my $start  = $LAST_MATCH_START[2] + 1;
            my $length = $LAST_MATCH_END[2] - 1 - $start;
            $fix_or_report->(\$content, $start, $length, $attribute);
        }
        while ($content =~ m{$anchor_href_regex}g) {
            my $attribute = $1;
            # The fragment is captured by group 3 (double-quoted value) or
            # group 4 (single-quoted value).  Test with defined(), not for
            # truth: a fragment of "" or "0" is false but still belongs to
            # the group that matched.
            my $group  = defined($3) ? 3 : 4;
            my $start  = $LAST_MATCH_START[$group];
            my $length = $LAST_MATCH_END[$group] - $start;
            $fix_or_report->(\$content, $start, $length, $attribute);
        }
        if ($opt_fix) {
            print($content);
        }
        if (eof(ARGV)) {
            printf(STDERR "Done processing %s ...\n", $ARGV);
        }
    }
    return 0;
}

# Propagate main()'s status to the calling process.
exit(main());
# Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
# Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
# Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
# Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
# Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
# Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
# Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.