xemacsweb / cgi-bin / wilma_striphtml

#!/usr/bin/perl -p0777
#
#########################################################
# striphtml ("striff tummel")
# tchrist@perl.com 
# version 1.0: Thu 01 Feb 1996 1:53:31pm MST 
# version 1.1: Sat Feb  3 06:23:50 MST 1996
# 		(fix up comments in annoying places)
# version 1.2j<: Strip everything outside of MHonArc
#               message body delims.
#                Compress blank lines.
# version 1.3dw: Fix some warnings and optimize a bit.
#########################################################
#
# how to strip out html comments and tags and transform
# entities in just three -- count 'em three -- substitutions;
# sed and awk eat your heart out.  :-)
#
# as always, translations from this nacré rendition into 
# more characteristically marine, herpetoid, titillative, 
# or indonesian idioms are welcome for the furthering of 
# comparitive cyberlinguistic studies.
#
#########################################################
#
# $Revision$

require 5.001;      # for nifty embedded regexp comments

#########################################################
# nuke everything outside of MHonArc message delims - j<
# But index/substr is 1/3 faster than s/// for constants - dw
#########################################################

my $begin = index($_, "<!--X-MsgBody-->");
$begin    = ($begin < 0) ? 0 : ($begin + 16); # 16 is length of comment
my $end   = index($_, "<!--X-MsgBody-End-->", $begin);
$end      = length($_) if $end < $begin;
$_        = substr($_, $begin, $end - $begin);

#########################################################
# first we'll shoot all the <!-- comments -->
#########################################################

s{ <!                   # comments begin with a `<!'
                        # followed by 0 or more comments;

    (.*?)		# this is actually to eat up comments in non 
			# random places

     (                  # not suppose to have any white space here

                        # just a quick start; 
      --                # each comment starts with a `--'
        .*?             # and includes all text up to and including
      --                # the *next* occurrence of `--'
        \s*             # and may have trailing while space
                        #   (albeit not leading white space XXX)
     )+                 # repetire ad libitum  XXX should be * not +
    (.*?)		# trailing non comment text
   >                    # up to a `>'
}{
    if ($1 || $3) {	# this silliness for embedded comments in tags
	"<!$1 $3>";
    } 
}gesx;                 # mutate into nada, nothing, and niente

#########################################################
# next we'll remove all the <tags>
#########################################################

s{ <                    # opening angle bracket

    (?:                 # Non-backreffing grouping paren
         [^>'"] +       # 1 or more things that are neither > nor ' nor "
            |           #    or else
         ".*?"          # a section between double quotes (stingy match)
            |           #    or else
         '.*?'          # a section between single quotes (stingy match)
    ) *                 # repetire ad libitum
                        #  hm.... are null tags <> legal? XXX
   >                    # closing angle bracket
}{}gsx;                 # mutate into nada, nothing, and niente

#########################################################
# finally we'll translate all &valid; HTML 2.0 entities
#########################################################

s{ (
        &              # an entity starts with a semicolon
        ( 
	    \x23\d+    # and is either a pound (#) and numbers
	     |	       #   or else
	    \w+        # has alphanumunders up to a semi
	)         
        ;?             # a semi terminates AS DOES ANYTHING ELSE (XXX)
    )
} {

    $entity{$2}        # if it's a known entity use that
        ||             #   but otherwise
        $1             # leave what we'd found; NO WARNINGS (XXX)

}gex;                  # execute replacement -- that's code not a string

#########################################################
# but wait! load up the %entity mappings enwrapped in 
# a BEGIN that the last might be first, and only execute
# once, since we're in a -p "loop"; awk is kinda nice after all.
#########################################################

BEGIN {

    %entity = (

        'lt'     => '<',     #a less-than
        'gt'     => '>',     #a greater-than
        'amp'    => '&',     #a nampersand
        'quot'   => '"',     #a (verticle) double-quote

        'nbsp'   => chr 160, #no-break space
        'iexcl'  => chr 161, #inverted exclamation mark
        'cent'   => chr 162, #cent sign
        'pound'  => chr 163, #pound sterling sign CURRENCY NOT WEIGHT
        'curren' => chr 164, #general currency sign
        'yen'    => chr 165, #yen sign
        'brvbar' => chr 166, #broken (vertical) bar
        'sect'   => chr 167, #section sign
        'uml'    => chr 168, #umlaut (dieresis)
        'copy'   => chr 169, #copyright sign
        'ordf'   => chr 170, #ordinal indicator, feminine
        'laquo'  => chr 171, #angle quotation mark, left
        'not'    => chr 172, #not sign
        'shy'    => chr 173, #soft hyphen
        'reg'    => chr 174, #registered sign
        'macr'   => chr 175, #macron
        'deg'    => chr 176, #degree sign
        'plusmn' => chr 177, #plus-or-minus sign
        'sup2'   => chr 178, #superscript two
        'sup3'   => chr 179, #superscript three
        'acute'  => chr 180, #acute accent
        'micro'  => chr 181, #micro sign
        'para'   => chr 182, #pilcrow (paragraph sign)
        'middot' => chr 183, #middle dot
        'cedil'  => chr 184, #cedilla
        'sup1'   => chr 185, #superscript one
        'ordm'   => chr 186, #ordinal indicator, masculine
        'raquo'  => chr 187, #angle quotation mark, right
        'frac14' => chr 188, #fraction one-quarter
        'frac12' => chr 189, #fraction one-half
        'frac34' => chr 190, #fraction three-quarters
        'iquest' => chr 191, #inverted question mark
        'Agrave' => chr 192, #capital A, grave accent
        'Aacute' => chr 193, #capital A, acute accent
        'Acirc'  => chr 194, #capital A, circumflex accent
        'Atilde' => chr 195, #capital A, tilde
        'Auml'   => chr 196, #capital A, dieresis or umlaut mark
        'Aring'  => chr 197, #capital A, ring
        'AElig'  => chr 198, #capital AE diphthong (ligature)
        'Ccedil' => chr 199, #capital C, cedilla
        'Egrave' => chr 200, #capital E, grave accent
        'Eacute' => chr 201, #capital E, acute accent
        'Ecirc'  => chr 202, #capital E, circumflex accent
        'Euml'   => chr 203, #capital E, dieresis or umlaut mark
        'Igrave' => chr 204, #capital I, grave accent
        'Iacute' => chr 205, #capital I, acute accent
        'Icirc'  => chr 206, #capital I, circumflex accent
        'Iuml'   => chr 207, #capital I, dieresis or umlaut mark
        'ETH'    => chr 208, #capital Eth, Icelandic
        'Ntilde' => chr 209, #capital N, tilde
        'Ograve' => chr 210, #capital O, grave accent
        'Oacute' => chr 211, #capital O, acute accent
        'Ocirc'  => chr 212, #capital O, circumflex accent
        'Otilde' => chr 213, #capital O, tilde
        'Ouml'   => chr 214, #capital O, dieresis or umlaut mark
        'times'  => chr 215, #multiply sign
        'Oslash' => chr 216, #capital O, slash
        'Ugrave' => chr 217, #capital U, grave accent
        'Uacute' => chr 218, #capital U, acute accent
        'Ucirc'  => chr 219, #capital U, circumflex accent
        'Uuml'   => chr 220, #capital U, dieresis or umlaut mark
        'Yacute' => chr 221, #capital Y, acute accent
        'THORN'  => chr 222, #capital THORN, Icelandic
        'szlig'  => chr 223, #small sharp s, German (sz ligature)
        'agrave' => chr 224, #small a, grave accent
        'aacute' => chr 225, #small a, acute accent
        'acirc'  => chr 226, #small a, circumflex accent
        'atilde' => chr 227, #small a, tilde
        'auml'   => chr 228, #small a, dieresis or umlaut mark
        'aring'  => chr 229, #small a, ring
        'aelig'  => chr 230, #small ae diphthong (ligature)
        'ccedil' => chr 231, #small c, cedilla
        'egrave' => chr 232, #small e, grave accent
        'eacute' => chr 233, #small e, acute accent
        'ecirc'  => chr 234, #small e, circumflex accent
        'euml'   => chr 235, #small e, dieresis or umlaut mark
        'igrave' => chr 236, #small i, grave accent
        'iacute' => chr 237, #small i, acute accent
        'icirc'  => chr 238, #small i, circumflex accent
        'iuml'   => chr 239, #small i, dieresis or umlaut mark
        'eth'    => chr 240, #small eth, Icelandic
        'ntilde' => chr 241, #small n, tilde
        'ograve' => chr 242, #small o, grave accent
        'oacute' => chr 243, #small o, acute accent
        'ocirc'  => chr 244, #small o, circumflex accent
        'otilde' => chr 245, #small o, tilde
        'ouml'   => chr 246, #small o, dieresis or umlaut mark
        'divide' => chr 247, #divide sign
        'oslash' => chr 248, #small o, slash
        'ugrave' => chr 249, #small u, grave accent
        'uacute' => chr 250, #small u, acute accent
        'ucirc'  => chr 251, #small u, circumflex accent
        'uuml'   => chr 252, #small u, dieresis or umlaut mark
        'yacute' => chr 253, #small y, acute accent
        'thorn'  => chr 254, #small thorn, Icelandic
        'yuml'   => chr 255, #small y, dieresis or umlaut mark
    );

    ####################################################
    # now fill in all the numbers to match themselves
    ####################################################
    for $chr ( 0 .. 255 ) { 
        $entity{ '#' . $chr } = chr $chr;
    }
}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.