#!/usr/bin/perl
#
#
after newlines where appropriate
# 'tablecheck' => 1, # make sure they aren't closing that weren't opened.
# 'eat' => [qw(head title style layer iframe)],
# 'mode' => 'allow',
# 'deny' => [qw(marquee)],
# 'remove' => [qw()],
# 'maximgwidth' => 100,
# 'maximgheight' => 100,
# 'keepcomments' => 1,
# 'cuturl' => 'http://www.domain.com/full_item_view.ext',
# 'ljcut_disable' => 1, # stops the cleaner from using the lj-cut tag
# 'cleancss' => 1,
# 'extractlinks' => 1, # remove a hrefs; implies noautolinks
# 'noautolinks' => 1, # do not auto linkify
# 'extractimages' => 1, # placeholder images
# });
package LJ::CleanHTML;
sub helper_preload
{
my $p = HTML::TokeParser->new("");
eval {$p->DESTROY(); };
}
# this treats normal characters and &entities; as single characters
# also treats UTF-8 chars as single characters if $LJ::UNICODE
my $onechar;
{
my $utf_longchar = '[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]';
my $match;
if (not $LJ::UNICODE) {
$match = '[^&\s]|(&\#?\w{1,7};)';
} else {
$match = $utf_longchar . '|[^&\s\x80-\xff]|(?:&\#?\w{1,7};)';
}
$onechar = qr/$match/o;
}
# Some browsers, such as Internet Explorer, have decided to alllow
# certain HTML tags to be an alias of another. This has manifested
# itself into a problem, as these aliases act in the browser in the
# same manner as the original tag, but are not treated the same by
# the HTML cleaner.
# 'alias' => 'real'
my %tag_substitute = (
'image' => 'img',
);
#
/g if $addbreaks;
$$data = "[Error: Irreparable invalid markup ('<$tag>') in entry. ".
"Owner must fix manually. Raw contents below.]
" .
'
' onmouseover="javascript:alert(document/**/.cookie)" >
# is returned by HTML::Parser as P_tag("='" => "='") Text( onmouseover...)
# which leads to reconstruction of valid HTML. Clever!
# detect this, and fail.
return $total_fail->("$tag $attr");
}
# ignore attributes that do not fit this strict scheme
return $total_fail->("$tag " . (%$hash > 1 ? "[...] " : "") . "$attr")
unless $attr =~ /^[\w_:-]+$/;
$hash->{$attr} =~ s/[\t\n]//g;
# IE sucks:
if ($hash->{$attr} =~ /(j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t|
v\s*b\s*s\s*c\s*r\s*i\s*p\s*t|
a\s*b\s*o\s*u\s*t)\s*:/ix) {
delete $hash->{$attr};
}
if ($attr eq 'style' && $opts->{'cleancss'}) {
# css2 spec, section 4.1.3
# position === p\osition :(
# strip all slashes no matter what.
$hash->{$attr} =~ s/\\//g;
# and catch the obvious ones.
foreach my $css (qw(absolute fixed)) {
if ($hash->{$attr} =~ /$css/i) {
delete $hash->{$attr};
last;
}
}
}
# reserve ljs_* ids for divs, etc so users can't override them to replace content
if ($attr eq 'id' && $hash->{$attr} =~ /^ljs_/i) {
delete $hash->{$attr};
}
if ($s1var) {
if ($attr =~ /%%/) {
delete $hash->{$attr};
next;
}
my $props = $LJ::S1::PROPS->{$s1var};
if ($hash->{$attr} =~ /^%%([\w:]+:)?(\S+?)%%$/ && $props->{$2} =~ /[aud]/) {
# don't change it.
} elsif ($hash->{$attr} =~ /^%%cons:\w+%%[^\%]*$/) {
# a site constant with something appended is also fine.
} elsif ($hash->{$attr} =~ /%%/) {
my $clean_var = sub {
my ($mods, $prop) = @_;
# HTML escape and kill line breaks
$mods = "attr:$mods" unless
$mods =~ /^(color|cons|siteroot|sitename|img):/ ||
$props->{$prop} =~ /[ud]/;
return '%%' . $mods . $prop . '%%';
};
$hash->{$attr} =~ s/[\n\r]//g;
$hash->{$attr} =~ s/%%([\w:]+:)?(\S+?)%%/$clean_var->(lc($1), $2)/eg;
if ($attr =~ /^(href|src|lowsrc|style)$/) {
$hash->{$attr} = "\%\%[attr[$hash->{$attr}]]\%\%";
}
}
}
}
if (exists $hash->{href}) {
unless ($hash->{href} =~ s/^lj:(?:\/\/)?(.*)$/ExpandLJURL($1)/ei) {
$hash->{href} = canonical_url($hash->{href}, 1);
}
}
if ($hash->{'style'} =~ /expression/i) {
delete $hash->{'style'};
}
if ($tag eq "img")
{
my $img_bad = 0;
if (defined $opts->{'maximgwidth'} &&
(! defined $hash->{'width'} ||
$hash->{'width'} > $opts->{'maximgwidth'})) { $img_bad = 1; }
if (defined $opts->{'maximgheight'} &&
(! defined $hash->{'height'} ||
$hash->{'height'} > $opts->{'maximgheight'})) { $img_bad = 1; }
if ($opts->{'extractimages'}) { $img_bad = 1; }
$hash->{src} = canonical_url($hash->{src}, 1);
if ($img_bad) {
$newdata .= "{'src'}) . "\">" .
LJ::img('placeholder') . '';
$alt_output = 1;
}
}
if ($tag eq "a" && $extractlinks)
{
push @canonical_urls, canonical_url($token->[2]->{href}, 1);
$newdata .= "";
next;
}
unless ($alt_output)
{
my $allow;
if ($mode eq "allow") {
$allow = 1;
if ($action{$tag} eq "deny") { $allow = 0; }
} else {
$allow = 0;
if ($action{$tag} eq "allow") { $allow = 1; }
}
if ($allow && ! $remove{$tag})
{
if ($opts->{'tablecheck'}) {
$allow = 0 if
# can't open table elements from outside a table
($tag =~ /^(?:tbody|thead|tfoot|tr|td|th)$/ && ! @tablescope) ||
# can't open td or th if not inside tr
($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'}) ||
# can't open a table unless inside a td or th
($tag eq 'table' && @tablescope && ! grep { $tablescope[-1]->{$_} } qw(td th));
}
if ($allow) { $newdata .= "<$tag"; }
else { $newdata .= "<$tag"; }
# output attributes in original order, but only those
# that are allowed (by still being in %$hash after cleaning)
foreach (@$attrs) {
if ($hash->{$_} =~ /[^\x01-\x7f]/) {
# FIXME: this is so ghetto. make faster. make generic.
# HTML::Parser decodes entities for us (which is good)
# but in Perl 5.8 also includes the "poison" SvUTF8
# flag on the scalar it returns, thus poisoning the
# rest of the content this scalar is appended with.
# we need to remove that poison at this point. *sigh*
$hash->{$_} = pack('C*', unpack('C*', $hash->{$_}));
}
$newdata .= " $_=\"" . LJ::ehtml($hash->{$_}) . "\""
if exists $hash->{$_};
}
if ($slashclose) {
$newdata .= " /";
$opencount{$tag}--;
$tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope;
}
if ($allow) {
$newdata .= ">";
$opencount{$tag}++;
# maintain current table scope
if ($opts->{'tablecheck'}) {
# open table
if ($tag eq 'table') {
push @tablescope, {};
# new tag within current table
} elsif (@tablescope) {
$tablescope[-1]->{$tag}++;
}
}
}
else { $newdata .= ">"; }
}
}
}
}
# end tag
elsif ($type eq "E")
{
my $tag = $token->[1];
my $allow;
if ($tag eq "lj-raw") {
$opencount{$tag}--;
$tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope;
}
elsif ($tag eq "lj-cut") {
if ($opts->{'cutpreview'}) {
$newdata .= "</lj-cut>";
}
} else {
if ($mode eq "allow") {
$allow = 1;
if ($action{$tag} eq "deny") { $allow = 0; }
} else {
$allow = 0;
if ($action{$tag} eq "allow") { $allow = 1; }
}
if ($extractlinks && $tag eq "a") {
if (@canonical_urls) {
my $url = LJ::ehtml(pop @canonical_urls);
$newdata .= " ($url)";
}
}
if ($allow && ! $remove{$tag})
{
if ($opts->{'tablecheck'}) {
$allow = 0 if
# can't close table elements from outside a table
($tag =~ /^(?:table|tbody|thead|tfoot|tr|td|th)$/ && ! @tablescope) ||
# can't close td or th unless open tr
($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'});
}
if ($allow && ! ($opts->{'noearlyclose'} && ! $opencount{$tag})) {
# maintain current table scope
if ($opts->{'tablecheck'}) {
# open table
if ($tag eq 'table') {
pop @tablescope;
# closing tag within current table
} elsif (@tablescope) {
$tablescope[-1]->{$tag}--;
}
}
$newdata .= "$tag>";
$opencount{$tag}--;
} else { $newdata .= "</$tag>"; }
}
}
}
elsif ($type eq "D") {
# remove everything past first closing tag
$token->[1] =~ s/>.+/>/s;
# kill any opening tag except the starting one
$token->[1] =~ s/./sg;
$newdata .= $token->[1];
}
elsif ($type eq "T") {
my %url = ();
my $urlcount = 0;
if ($opencount{'style'}) {
# remove anything that might run javascript/vbscript code
# (Note: Ghetto. Need to use CSS.pm to build full tree, analyze, redump)
my $reduced = lc($token->[1]);
$reduced =~ s/\s+//g;
$reduced =~ s/\\//g;
$reduced =~ s/&\#(\d+);?/chr($1)/eg;
$reduced =~ s/&\#x(\w+);?/chr(hex($1))/eg;
$token->[1] = "/* potential scripting */"
if $reduced =~ /javascript|vbscript|expression/;
$token->[1] =~ s//[COME]/g;
}
my $auto_format = $addbreaks &&
($opencount{'table'} <= ($opencount{'td'} + $opencount{'th'})) &&
! $opencount{'pre'} &&
! $opencount{'lj-raw'};
if ($auto_format && ! $noautolinks && ! $opencount{'a'} && ! $opencount{'textarea'}) {
my $match = sub {
my $str = shift;
if ($str =~ /^(.*?)(&(#39|quot|lt|gt)(;.*)?)$/) {
$url{++$urlcount} = $1;
return "&url$urlcount;$1&urlend;$2";
} else {
$url{++$urlcount} = $str;
return "&url$urlcount;$str&urlend;";
}
};
$token->[1] =~ s!https?://[^\s\'\"\<\>]+[a-zA-Z0-9_/&=\-]! $match->($&); !ge;
}
# escape tags in text tokens. shouldn't belong here!
# especially because the parser returns things it's
# confused about (broken, ill-formed HTML) as text.
$token->[1] =~ s/</g;
unless ($opencount{'style'}) {
# don't escape this, because that breaks a CSS construct
$token->[1] =~ s/>/>/g;
}
if ($opencount{'style'}) {
$token->[1] =~ s/\[COMS\]//g;
}
# put and