Source

xemacsweb / cgi-bin / wilma_glimpse

Full commit
#!/usr/bin/perl -w

# Web Interface to List Mail Archives
#
# Copyright 1997, 1998 by Jason L. Tibbitts III <tibbs@hpc.uh.edu> and
# David Wolfe <dave_wolfe@computer.org>. May be freely used, modified,
# and redistributed under the same terms as Perl. Absolutely no warranty
# of any kind. Authors are not responsible for any damage or loss caused
# by use of this program.
#
# Searches for the specified strings in the specified mail list HTML
# archives. Invoked by the form generated by the entry wilma CGI program.
#
# List configuration files are in the ".wilma" subdirectory of the
# directory where this CGI program is installed (but see CFG_ROOT
# below). They reside in files named in the form listname.cf (but see
# CFG_SUFFIX below), e.g. ".wilma/testlist.cf". The other supporting
# programs are assumed to be in the same directory as this one.
#
# Revision: 1.12

use CGI qw/ :cgi :html2 escape /;
use AppCfg 1.2;
use strict;
use integer;
use subs qw/
    build_glimpse_args
    build_hilite_regex
    error_badquery
    get_message_details
    error_exit
/;
use vars qw/
    $CFG_ROOT
    $CFG_SUFFIX
    $DEBUG
    $HILITER
    $HTML_REGEX
/;

*CFG_ROOT   = \'.wilma';	# Configuration file root directory
*CFG_SUFFIX = \'.cf';		# Configuration file name suffix
*DEBUG      = \0;
*HILITER    = \"wilma_hiliter";
*HTML_REGEX = \'\\.html?$';
$| = 1; # Autoflush output

# Import arguments into the PARAMS namespace.
import_names('PARAMS');

# Determine the name of the list from the path info string.

my ($list) = (path_info() =~ m{^/+([^/]+)}) if path_info();
error_exit("No List Specified",
    "The list archive name was not found in the path info")
    unless defined $list && $list ne "";

# Get the list configuration parameters.
eval {
    my $cfg = new AppCfg "$CFG_ROOT/$list$CFG_SUFFIX";
    $cfg->import_variables();
};
error_exit("Configuration File Error", $@) if $@;

$CFG::index_arc_dir ||= $CFG::arc_dir;

# Can we run Glimpse?
-x $CFG::glimpse || error_exit("Glimpse Not Found",
    "This archive relies on the", code("Glimpse"), "search tool.",
    "If it is installed, please set the correct path in the list",
    "configuration file. Otherwise obtain the latest version from",
    a({'-href' => "ftp://ftp.cs.arizona.edu/glimpse"}, "ftp.cs.arizona.edu"));

# Can we get to the index? If Glimpse is rebuilding it, the web user
# probably can't read it.
-r "$CFG::index_dir/.glimpse_index" || error_exit("Glimpse Index Not Found",
    "The index was not found or is not complete.",
    "This is most likely caused by the index being generated",
    "while you are attempting to use it.",
    "Please try again in a few minutes.");

# Run glimpse.
my @glimpse_args = build_glimpse_args();
my $match_hilite_regex = build_hilite_regex();
(my $base = script_name()) =~ s([^/]+$)();
my $glimpse_pid = safe_popen(\*GLIMPSEOUT, $CFG::glimpse, @glimpse_args);

# Prepare a few things to display the search output.
my %hdr_args = ('-title' => qq(Result for query "$PARAMS::query"));
if (defined $CFG::search_background && $CFG::search_background ne "" &&
    -f $CFG::search_background) {
    $hdr_args{'-background'} = $CFG::search_background;
}
elsif (defined $CFG::search_bg_color && $CFG::search_bg_color ne "") {
    $hdr_args{'-bgcolor'} = $CFG::search_bg_color;
}

print header(),
    start_html(%hdr_args),
    h1(sprintf qq(Result for query "%s"),
       escapeHTML ($PARAMS::query)),
    hr(), "\n";

my $prevfile = "";
my $icount   = 0;
my $lcount   = 0;
my $fcount   = 0;
my $case_sw  = (defined $PARAMS::case && lc($PARAMS::case) eq "on")
		? "" : "(?i)";
my ($end, $file, @hits, $line, $linecount, $start, $string, $xfile);

# Autoload in CGI.pm (through 2.36) commits variable suicide if the
# argument is $1. Force it to load before its first use with a dummy
# call.
$string = b();

while (<GLIMPSEOUT>) {
    print pre("GLIMPSEOUT:$_") if /\bglimpse:/;
    # Pass errors through
    if (/^content-type:/i) {
	print <GLIMPSEOUT>;
	exit 1;
    }

    chomp;
    next unless ($file, $line, $string) =
	($PARAMS::maxlines == 0) ? ($_, 0, "") : /^([^:]*)(?::\s*(\d+):(.*))?/;
    next unless $file =~ s|^$CFG::index_arc_dir/||o;

    if ($string ne "") {
	if ($HTML_REGEX && $file =~ /$HTML_REGEX/o) {
	    $string =~ s|</?[a-zA-Z][^>\n]*>?||g;
	    $string =~ s|<!--[^>]*-->||g;
	    unless ($string =~ /\S/) {
		$icount++;
		next;
	    }
	} else {
	    $string = escape($string);
	}
    }

    if ($file ne $prevfile) {
	print ul(@hits), "\n" if @hits;
	@hits = ();
	$linecount = 0;
	if ($fcount > $PARAMS::maxfiles) {
	    print h3("Limit of $PARAMS::maxfiles files exceeded...");
	    $file = "";
	    $fcount = "at least $fcount";
	    $lcount = "at least $lcount";
	    last;
	}
	($xfile = $prevfile = $file) =~ s|\.html$||;
	(my $subj, $start, $end) = get_message_details("$CFG::arc_dir/$file");
	my $anchor = a({'-href' => "$CFG::arc_url/$file"}, "$xfile:") . $subj;
	print((($string eq "") ? ($anchor . br()) : h3($anchor)), "\n");
	$fcount++ ;
    }

    $lcount++ ;
    $linecount++;
    if ($linecount >= $PARAMS::maxlines) {
	push(@hits,
	    br("Limit of $PARAMS::maxlines matched lines per file exceeded..."))
		if $linecount == $PARAMS::maxlines;
	next;
    }

    if ($line < $start || ($end && $line > $end)) {
	$icount++;
    }
    else {
	$string =~ s/$case_sw($match_hilite_regex)/b($1)/ego;
	$line -= $start;
	push(@hits,
	    li(a({'-href' => "$base$HILITER/$list/$file?line=$line#hilite"},
		code("$line:")), $string));
    }
}

close(GLIMPSEOUT);
unlink "/tmp/.glimpse_tmp.$glimpse_pid"; # Glimpse trash if it crashes

print ul(@hits), "\n" if @hits;

print hr(),
    h2("Summary for query ", code(sprintf qq(Result for query "%s":),
				  escapeHTML ($PARAMS::query))),
    "Found $lcount matches in $fcount files.\n";

print br(), "$icount matches in HTML tags ignored.\n" if $icount;

if ($DEBUG) {
    print p(), "Glimpse called with args:",
	br(), code(@glimpse_args);

    foreach (keys %PARAMS::) {
	no strict 'refs';
	print br(), "$_ = '",
	    (defined ${"PARAMS::$_"} ? ${"PARAMS::$_"} : "undef"), "'\n";
    }
}

print end_html;

exit 0;

# Appease -w
$CFG::arc_dir       ||= $CFG::arc_dir;
$CFG::arc_url       ||= $CFG::arc_url;
$CFG::index_arc_dir ||= $CFG::index_arc_dir;

########################################################################

# Error check arguments and build an array of Glimpse args.

sub build_glimpse_args {
    my @glimpse_args;

    error_badquery() unless defined $PARAMS::query;

    # Make sure numerical args contain only digits.
    $PARAMS::maxlines = 20
	unless defined $PARAMS::maxlines && $PARAMS::maxlines !~ /\D/;

    $PARAMS::maxfiles = 100
	unless defined $PARAMS::maxfiles && $PARAMS::maxfiles !~ /\D/;

    # Now build the argument string. (Use filters, don't prompt, give
    # line numbers.)
    push @glimpse_args, qw/ -y -n /;

    # If we only care about the filenames matched.
    push @glimpse_args, "-l" if $PARAMS::maxlines == 0;

    push @glimpse_args, "-i"
	unless defined $PARAMS::case && lc($PARAMS::case) eq "on";

    push @glimpse_args, "-w"
	unless defined $PARAMS::partial && lc($PARAMS::partial) eq "on";

    push @glimpse_args, "-B"
	if defined $PARAMS::errors && lc($PARAMS::errors) eq "best";

    push @glimpse_args, "-$PARAMS::errors"
	if defined $PARAMS::errors && $PARAMS::errors =~ /^[0-8]$/;

    push @glimpse_args, "-W" 
	if defined $PARAMS::lineonly && lc($PARAMS::lineonly) eq "on";

    # Where to find the index files.
    push @glimpse_args, "-H", $CFG::index_dir;

    push @glimpse_args, "-F", "(" . join("|", @PARAMS::filelist) . ")"
	if defined $PARAMS::restricttofiles
	&& lc($PARAMS::restricttofiles) eq "on"
	&& defined @PARAMS::filelist;

    push @glimpse_args, $PARAMS::query;

    @glimpse_args;
}


# Take the query string and turn it into a regex that will match the
# piece of the line that glimpse found. It's really hard to do this
# correctly; agrep regexes and Perl regexes do not overlap perfectly.
# We cheat by breaking apart on word boundaries; it might be better to
# split on ";" and ",", since spaces are significant.

sub build_hilite_regex {
    my $regex = $PARAMS::query;

    # Trim leading junk.
    $regex =~ s/^\W+//;

    # Turn "blah bluh;bufar,urgh" into "blah|bluh|bufar|urgh".
    $regex = join("|", split(/\W+/, $regex));

    # Check if the query contains any words.
    error_badquery() if $regex eq "";

    # Require word boundaries unless partial matches are turned on.
    $regex = "\\b($regex)\\b"
	unless defined $PARAMS::partial && lc($PARAMS::partial) eq "on";

    $regex;
}


sub error_badquery {
    error_exit("Query is too broad",
	"The query '$PARAMS::query' doesn't contain any words and",
	"thus will take too much time. Please refine your query.")
}


# Extract the subject string and line offsets of the beginning and
# end of the message body from the HTML comments inserted by MHonArc
# and return them. This subroutine make certain assumptions about the
# comment ordering of MHonArc HTML output.

sub get_message_details {
    my ($file) = @_;
    my $line   = 0;
    my $start  = 0;
    my $end    = 0;
    my $subj;

    if (open(MSG, $file)) {
	while (<MSG>) {
	    $line++;
	    last if ($subj) = /^<!--X-Subject:\s*(.*) -->/;
	}
	while (<MSG>) {
	    last if /^<!--X-MsgBody-->/;
	    $line++;
	}
	$start = $line unless eof;
	while (<MSG>) {
	    last if /^<!--X-MsgBody-End-->/;
	    $line++;
	}
	$end = $line unless eof;
	close(MSG);
    }
    ((defined($subj) ? $subj : "(No subject)"), $start, $end);
}


# Runs another program with its standard output and standard error piped
# to the specified file handle. Returns the child process ID and exits
# (via error_exit()) on errors.

sub safe_popen {
    my $fh = shift;
    my $pid;

    defined($pid = open($fh, "-|")) || error_exit("Pipe Open Failed",
	"An error ($!) occurred while trying to run the Glimpse search",
	"tool. This might be a transient condition due to the current",
	"server load, in which case you should try again later.");

    if ($pid == 0) {
	# We're the child: redirect stdout and stderr.
	open(STDERR, ">&STDOUT") || error_exit("Stderr Redirection Failed",
	    "An	error ($!) occurred while trying to redirect error",
	    "output to a pipe for communicating	with the Glimpse",
	    "search tool. If this problem persists, notify your	system",
	    "administrator.");

	select STDERR; $| = 1; # Unbuffer stderr
	select STDOUT; $| = 1; # Unbuffer stdout
	open(STDIN, "</dev/null"); # Don't really care if this fails

	exec(@_) || error_exit("Exec Failed",
	    "An	error ($!) occurred while trying to run	the Glimpse",
	    "search tool. If it	is installed, please set the correct",
	    "path in the list configuration file. Otherwise obtain the",
	    "latest version from",
	    a({'-href' => "ftp://ftp.cs.arizona.edu/glimpse"},
		"ftp.cs.arizona.edu") );
    }

    # We're the parent: return the child's process ID.
    $pid;
}


# Generic error display of HTML error message and exit.

sub error_exit {
    my $text = shift || "Unspecified Error Exit";
    my $desc = join(" ", @_);
    unless (defined $desc && $desc =~ /\S/) {
	my ($pkg, $file, $line) = caller();
	$desc = "In package '$pkg' at line $line of '$file'.";
    }

    my $address = "";
    if (defined $CFG::mail_addr && $CFG::mail_addr ne "") {
	$CFG::mail_name ||= $CFG::mail_addr;
	$address          = address(a({'-href' => "mailto:$CFG::mail_addr"},
			    $CFG::mail_name));
    }

    print header(),
	start_html('-title' => "ERROR"),
	h1($text),
	$desc,
	p(),
	$address,
	end_html();

    exit 1;
}