perl-MediaWiki-CleanupHTML / MediaWiki-CleanupHTML / lib / MediaWiki / CleanupHTML.pm

package MediaWiki::CleanupHTML;

use 5.008;

use strict;
use warnings;

use HTML::TreeBuilder::XPath;
use Scalar::Util (qw(blessed));

=head1 NAME

MediaWiki::CleanupHTML - cleanup the MediaWiki-generated HTML from MediaWiki
embellishments.

=head1 VERSION

Version 0.0.1

=cut

our $VERSION = '0.0.1';


=head1 SYNOPSIS

Quick summary of what the module does.

Perhaps a little code snippet.

    use MediaWiki::CleanupHTML;

    open my $fh, '<:encoding(UTF-8)', $filename
        or die "Cannot open '$filename' - $!";

    my $cleaner = MediaWiki::CleanupHTML->new({ fh => $fh });

    open my $out_fh, '>:encoding(UTF-8)', $processed_filename
        or die "Cannot open '$processed_filename' for output - $!";

    $cleaner->print_into_fh($out_fh);

    $cleaner->destroy_resources();

=head1 SUBROUTINES/METHODS

=head2 MediaWiki::CleanupHTML->new({fh => $fh})

The constructor - accepts the filehandle from which to read the XHTML.

=cut

sub new
{
    my $class = shift;

    my $self = bless {}, $class;

    $self->_init(@_);

    return $self;
}

sub _is_processed
{
    my $self = shift;

    if (@_)
    {
        $self->{_is_processed} = shift;
    }

    return $self->{_is_processed};
}

sub _fh
{
    my $self = shift;

    if (@_)
    {
        $self->{_fh} = shift;
    }

    return $self->{_fh};
}

sub _tree
{
    my $self = shift;

    if (@_)
    {
        $self->{_tree} = shift;
    }

    return $self->{_tree};
}

sub _init
{
    my ($self, $args) = @_;

    if (!exists($args->{'fh'}))
    {
        Carp::confess("MediaWiki::CleanupHTML->new was not passed a filehandle.");
    }

    $self->_fh($args->{fh});

    my $tree = HTML::TreeBuilder::XPath->new;

    $self->_tree($tree);

    $self->_tree->parse_file($self->_fh);

    $self->_is_processed(0);

    return;
}


sub _process
{
    my $self = shift;

    if ($self->_is_processed())
    {
        return;
    }

    my $tree = $self->_tree;

    {
        my @nodes = $tree->findnodes( '//div[@class="editsection"]' );

        foreach my $n (@nodes)
        {
            $n->detach();
            $n->delete();
        }
    }

    {
        my @nodes = map { $tree->findnodes( '//h' . $_ ) } (2 .. 4);

        foreach my $h2 (@nodes)
        {
            my $a_tag = $h2->left();
            if (blessed($a_tag) && $a_tag->tag() eq "a" && $a_tag->attr('name'))
            {
                my $id = $a_tag->attr('name');
                $h2->attr('id', $id);
                $a_tag->detach();
                $a_tag->delete();
            }
        }
    }

    my (@divs_to_delete) = 
    (
        $tree->findnodes('//div[@class="printfooter"]'),
        $tree->findnodes('//div[@id="catlinks"]'),
        $tree->findnodes('//div[@class="visualClear"]'),
        $tree->findnodes('//div[@id="column-one"]'),
        $tree->findnodes('//div[@id="footer"]'),
        $tree->findnodes('//head//style'),
        $tree->findnodes('//script'),
    );

    foreach my $div (@divs_to_delete)
    {
        $div->detach();
        $div->delete();
    }

    $self->_is_processed(1);

    return;
}

=head2 $cleaner->print_into_fh($fh)

Output to a filehandle. The filehandle should be able to process UTF-8 output.

=cut

sub print_into_fh
{
    my ($self, $fh) = @_;

    $self->_process();

    print {$fh} $self->_tree->as_XML();
}

=head2 $cleaner->destroy_resources()

Destroy the allocated resources (of the L<HTML::TreeBuilder> tree, etc.). Must
be called before destruction.

=cut

sub destroy_resources
{
    my $self = shift;

    $self->_tree->delete();
    $self->_tree(undef());

    return;
}

sub DESTROY
{
    my $self = shift;

    $self->destroy_resources();

    return;
}

=head1 AUTHOR

Shlomi Fish, L<http://www.shlomifish.org/> .

=head1 BUGS

Please report any bugs or feature requests to C<bug-mediawiki-cleanuphtml at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=MediaWiki-CleanupHTML>.  I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.




=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc MediaWiki::CleanupHTML


You can also look for information at:

=over 4

=item * RT: CPAN's request tracker (report bugs here)

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=MediaWiki-CleanupHTML>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/MediaWiki-CleanupHTML>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/MediaWiki-CleanupHTML>

=item * Search CPAN

L<http://search.cpan.org/dist/MediaWiki-CleanupHTML/>

=back


=head1 ACKNOWLEDGEMENTS


=head1 LICENSE AND COPYRIGHT

Copyright 2012 Shlomi Fish.

This program is distributed under the MIT (X11) License:
L<http://www.opensource.org/licenses/mit-license.php>

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.


=cut

1; # End of MediaWiki::CleanupHTML
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.