120 lines
3.7 KiB
Perl
Executable File
120 lines
3.7 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
#
|
|
|
|
package HTMLCleaner;
|
|
|
|
use strict;
|
|
use base 'HTML::Parser';
|
|
|
|
sub new {
|
|
my ($class, %opts) = @_;
|
|
|
|
my $p = new HTML::Parser('api_version' => 3);
|
|
$p->handler('start' => \&start, 'self, tagname, attr, attrseq, text' );
|
|
$p->handler('end' => \&end, 'self, tagname' );
|
|
$p->handler('text' => \&text, 'self, text' );
|
|
$p->handler('declaration' => \&decl, 'self, tokens' );
|
|
|
|
$p->{'output'} = $opts{'output'} || sub {};
|
|
bless $p, $class;
|
|
}
|
|
|
|
my %bad_attr = (map { $_ => 1 }
|
|
qw(onabort onactivate onafterprint onafterupdate
|
|
onbeforeactivate onbeforecopy onbeforecut
|
|
onbeforedeactivate onbeforeeditfocus
|
|
onbeforepaste onbeforeprint onbeforeunload
|
|
onbeforeupdate onblur onbounce oncellchange
|
|
onchange onclick oncontextmenu oncontrolselect
|
|
oncopy oncut ondataavailable ondatasetchanged
|
|
ondatasetcomplete ondblclick ondeactivate
|
|
ondrag ondragend ondragenter ondragleave
|
|
ondragover ondragstart ondrop onerror
|
|
onerrorupdate onfilterchange onfinish onfocus
|
|
onfocusin onfocusout onhelp onkeydown
|
|
onkeypress onkeyup onlayoutcomplete onload
|
|
onlosecapture onmousedown onmouseenter
|
|
onmouseleave onmousemove onmouseout
|
|
onmouseover onmouseup onmousewheel onmove
|
|
onmoveend onmovestart onpaste onpropertychange
|
|
onreadystatechange onreset onresize
|
|
onresizeend onresizestart onrowenter onrowexit
|
|
onrowsdelete onrowsinserted onscroll onselect
|
|
onselectionchange onselectstart onstart onstop
|
|
onsubmit onunload datasrc datafld));
|
|
|
|
my %eat_tag = (map { $_ => 1 }
|
|
qw(script iframe object applet embed));
|
|
|
|
my @eating; # push tagname whenever we start eating a tag
|
|
|
|
sub start {
|
|
my ($self, $tagname, $attr, $seq, $text) = @_;
|
|
my $slashclose = 0; # xml-style
|
|
if ($tagname =~ s!/(.*)!!) {
|
|
if (length($1)) { push @eating, "$tagname/$1"; } # basically halt parsing
|
|
else { $slashclose = 1; }
|
|
}
|
|
push @eating, $tagname if
|
|
$eat_tag{$tagname};
|
|
return if @eating;
|
|
my $ret = "<$tagname";
|
|
foreach (@$seq) {
|
|
if ($_ eq "/") { $slashclose = 1; next; }
|
|
next if $bad_attr{lc($_)};
|
|
next if /(?:^=)|[\x0b\x0d]/;
|
|
|
|
# IE is brain-dead and lets javascript:, vbscript:, and about: have spaces mixed in
|
|
if ($attr->{$_} =~ /((?:(?:v\s*b)|(?:j\s*a\s*v\s*a))\s*s\s*c\s*r\s*i\s*p\s*t|
|
|
a\s*b\s*o\s*u\s*t)\s*:/ix) {
|
|
delete $attr->{$_};
|
|
}
|
|
$ret .= " $_=\"" . ehtml($attr->{$_}) . "\"";
|
|
}
|
|
$ret .= " /" if $slashclose;
|
|
$ret .= ">";
|
|
$self->{'output'}->($ret);
|
|
}
|
|
|
|
sub end {
|
|
my ($self, $tagname) = @_;
|
|
if (@eating) {
|
|
pop @eating if $eating[-1] eq $tagname;
|
|
return;
|
|
}
|
|
$self->{'output'}->("</$tagname>");
|
|
}
|
|
|
|
sub text {
|
|
my ($self, $text) = @_;
|
|
return if @eating;
|
|
# the parser gives us back text whenever it's confused
|
|
# on really broken input. sadly, IE parses really broken
|
|
# input, so let's escape anything going out this way.
|
|
$self->{'output'}->(eangles($text));
|
|
}
|
|
|
|
sub decl {
|
|
my ($self, $tokens) = @_;
|
|
$self->{'output'}->("<!" . join(" ", map { eangles($_) } @$tokens) . ">");
|
|
}
|
|
|
|
sub eangles {
|
|
my $a = shift;
|
|
$a =~ s/</</g;
|
|
$a =~ s/>/>/g;
|
|
return $a;
|
|
}
|
|
|
|
sub ehtml {
|
|
my $a = shift;
|
|
$a =~ s/\&/&/g;
|
|
$a =~ s/\"/"/g;
|
|
$a =~ s/\'/&\#39;/g;
|
|
$a =~ s/</</g;
|
|
$a =~ s/>/>/g;
|
|
return $a;
|
|
}
|
|
|
|
1;
|