#!/usr/bin/perl # # # lib: HTML::TokeParser, cgi-bin/ljconfig.pl, cgi-bin/ljlib.pl # link: htdocs/userinfo.bml, htdocs/users # require "$ENV{'LJHOME'}/cgi-bin/ljconfig.pl"; use strict; use HTML::TokeParser (); use URI (); # LJ::CleanHTML::clean(\$u->{'bio'}, { # 'wordlength' => 100, # maximum length of an unbroken "word" # 'addbreaks' => 1, # insert
after newlines where appropriate # 'tablecheck' => 1, # make sure they aren't closing that weren't opened. # 'eat' => [qw(head title style layer iframe)], # 'mode' => 'allow', # 'deny' => [qw(marquee)], # 'remove' => [qw()], # 'maximgwidth' => 100, # 'maximgheight' => 100, # 'keepcomments' => 1, # 'cuturl' => 'http://www.domain.com/full_item_view.ext', # 'ljcut_disable' => 1, # stops the cleaner from using the lj-cut tag # 'cleancss' => 1, # 'extractlinks' => 1, # remove a hrefs; implies noautolinks # 'noautolinks' => 1, # do not auto linkify # 'extractimages' => 1, # placeholder images # }); package LJ::CleanHTML; sub helper_preload { my $p = HTML::TokeParser->new(""); eval {$p->DESTROY(); }; } # this treats normal characters and &entities; as single characters # also treats UTF-8 chars as single characters if $LJ::UNICODE my $onechar; { my $utf_longchar = '[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]'; my $match; if (not $LJ::UNICODE) { $match = '[^&\s]|(&\#?\w{1,7};)'; } else { $match = $utf_longchar . '|[^&\s\x80-\xff]|(?:&\#?\w{1,7};)'; } $onechar = qr/$match/o; } # Some browsers, such as Internet Explorer, have decided to alllow # certain HTML tags to be an alias of another. This has manifested # itself into a problem, as these aliases act in the browser in the # same manner as the original tag, but are not treated the same by # the HTML cleaner. # 'alias' => 'real' my %tag_substitute = ( 'image' => 'img', ); # # name: LJ::CleanHTML::clean # class: text # des: Multifaceted HTML parse function # info: # args: data, opts # des-data: A reference to html to parse to output, or HTML if modified in-place. # des-opts: An hash of options to pass to the parser. # returns: Nothing. # sub clean { my $data = shift; my $opts = shift; my $newdata; # remove the auth portion of any see_request.bml links $$data =~ s/(see_request\.bml.+?)auth=\w+/$1/ig; my $p = HTML::TokeParser->new($data); my $wordlength = $opts->{'wordlength'}; my $addbreaks = $opts->{'addbreaks'}; my $keepcomments = $opts->{'keepcomments'}; my $mode = $opts->{'mode'}; my $cut = $opts->{'cuturl'} || $opts->{'cutpreview'}; my $ljcut_disable = $opts->{'ljcut_disable'}; my $s1var = $opts->{'s1var'}; my $extractlinks = 0 || $opts->{'extractlinks'}; my $noautolinks = $extractlinks || $opts->{'noautolinks'}; my @canonical_urls; # extracted links my %action = (); my %remove = (); if (ref $opts->{'eat'} eq "ARRAY") { foreach (@{$opts->{'eat'}}) { $action{$_} = "eat"; } } if (ref $opts->{'allow'} eq "ARRAY") { foreach (@{$opts->{'allow'}}) { $action{$_} = "allow"; } } if (ref $opts->{'deny'} eq "ARRAY") { foreach (@{$opts->{'deny'}}) { $action{$_} = "deny"; } } if (ref $opts->{'remove'} eq "ARRAY") { foreach (@{$opts->{'remove'}}) { $action{$_} = "deny"; $remove{$_} = 1; } } $action{'script'} = "eat"; my @attrstrip = qw(); if ($opts->{'cleancss'}) { push @attrstrip, 'id'; } if (ref $opts->{'attrstrip'} eq "ARRAY") { foreach (@{$opts->{'attrstrip'}}) { push @attrstrip, $_; } } my %opencount = (); my @tablescope = (); my $cutcount = 0; my $total_fail = sub { my $tag = LJ::ehtml(@_); $$data = LJ::ehtml($$data); $$data =~ s/\r?\n/
/g if $addbreaks; $$data = "[Error: Irreparable invalid markup ('<$tag>') in entry. ". "Owner must fix manually. Raw contents below.]

" . '
' . $$data . '
'; return undef; }; TOKEN: while (my $token = $p->get_token) { my $type = $token->[0]; # See if this tag should be treated as an alias $token->[1] = $tag_substitute{$token->[1]} if defined $tag_substitute{$token->[1]} && ($type eq 'S' || $type eq 'E'); if ($type eq "S") # start tag { my $tag = $token->[1]; # check for exploit using TIME namespace to set data and introduce javascript that # would affect IE users. if ($tag =~ m/:set$/ && $token->[2]->{attributename} =~ /innerHTML/i) { next; } # do some quick checking to see if this is an email address/URL, and if so, just # escape it and ignore it if ($tag =~ m!(?:\@|://)!) { $newdata .= LJ::ehtml("<$tag>"); next; } my $slashclose = 0; # If set to 1, use XML-style empty tag marker # for tags like , pretend it's and reinsert the slash later $slashclose = 1 if ($tag =~ s!/$!!); return $total_fail->($tag) unless $tag =~ /^\w([\w\-:_]*\w)?$/; # for incorrect tags like (note the lack of a space) # delete everything after 'name' to prevent a security loophole which happens # because IE understands them. $tag =~ s!/.+$!!; if ($action{$tag} eq "eat") { $p->unget_token($token); $p->get_tag("/$tag"); } elsif ($tag eq "lj-cut" && !$ljcut_disable) { my $attr = $token->[2]; $cutcount++; if ($cut) { my $text = "Read more..."; if ($attr->{'text'}) { $text = $attr->{'text'}; if ($text =~ /[^\x01-\x7f]/) { $text = pack('C*', unpack('C*', $text)); } $text =~ s//>/g; } my $url = LJ::ehtml($cut); $newdata .= "$text )"; $p->get_tag("/lj-cut") unless $opts->{'cutpreview'} } else { $newdata .= ""; next; } } elsif ($tag eq "lj") { my $attr = $token->[2]; # keep working for backwards compatibility, but pretend # it was so we don't have to account for it below. my $user = $attr->{'user'} = exists $attr->{'user'} ? $attr->{'user'} : exists $attr->{'comm'} ? $attr->{'comm'} : undef; if (length $user) { $user = LJ::canonical_username($user); if ($s1var) { $newdata .= "%%ljuser:$1%%" if $attr->{'user'} =~ /^\%\%([\w\-\']+)\%\%$/; } elsif (length $user) { if ($opts->{'textonly'}) { $newdata .= $user; } else { $newdata .= LJ::ljuser($user); } } else { $newdata .= "[Bad username in LJ tag]"; } } else { $newdata .= "[Unknown LJ tag]"; } } elsif ($tag eq "lj-raw") { # Strip it out, but still register it as being open $opencount{$tag}++; } else { my $alt_output = 0; my $hash = $token->[2]; my $attrs = $token->[3]; # attribute names, in original order $slashclose = 1 if delete $hash->{'/'}; foreach (@attrstrip) { delete $hash->{$_}; } if ($tag eq "form") { my $action = lc($hash->{'action'}); my $deny = 0; if ($action =~ m!^https?://?([^/]+)!) { my $host = $1; $deny = 1 if $host =~ /[%\@\s]/ || $LJ::FORM_DOMAIN_BANNED{$host}; } else { $deny = 1; } delete $hash->{'action'} if $deny; } foreach my $attr (keys %$hash) { delete $hash->{$attr} if $attr =~ /^(?:on|dynsrc|data)/; if ($attr =~ /(?:^=)|[\x0b\x0d]/) { # Cleaner attack:

' onmouseover="javascript:alert(document/**/.cookie)" > # is returned by HTML::Parser as P_tag("='" => "='") Text( onmouseover...) # which leads to reconstruction of valid HTML. Clever! # detect this, and fail. return $total_fail->("$tag $attr"); } # ignore attributes that do not fit this strict scheme return $total_fail->("$tag " . (%$hash > 1 ? "[...] " : "") . "$attr") unless $attr =~ /^[\w_:-]+$/; $hash->{$attr} =~ s/[\t\n]//g; # IE sucks: if ($hash->{$attr} =~ /(j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t| v\s*b\s*s\s*c\s*r\s*i\s*p\s*t| a\s*b\s*o\s*u\s*t)\s*:/ix) { delete $hash->{$attr}; } if ($attr eq 'style' && $opts->{'cleancss'}) { # css2 spec, section 4.1.3 # position === p\osition :( # strip all slashes no matter what. $hash->{$attr} =~ s/\\//g; # and catch the obvious ones. foreach my $css (qw(absolute fixed)) { if ($hash->{$attr} =~ /$css/i) { delete $hash->{$attr}; last; } } } # reserve ljs_* ids for divs, etc so users can't override them to replace content if ($attr eq 'id' && $hash->{$attr} =~ /^ljs_/i) { delete $hash->{$attr}; } if ($s1var) { if ($attr =~ /%%/) { delete $hash->{$attr}; next; } my $props = $LJ::S1::PROPS->{$s1var}; if ($hash->{$attr} =~ /^%%([\w:]+:)?(\S+?)%%$/ && $props->{$2} =~ /[aud]/) { # don't change it. } elsif ($hash->{$attr} =~ /^%%cons:\w+%%[^\%]*$/) { # a site constant with something appended is also fine. } elsif ($hash->{$attr} =~ /%%/) { my $clean_var = sub { my ($mods, $prop) = @_; # HTML escape and kill line breaks $mods = "attr:$mods" unless $mods =~ /^(color|cons|siteroot|sitename|img):/ || $props->{$prop} =~ /[ud]/; return '%%' . $mods . $prop . '%%'; }; $hash->{$attr} =~ s/[\n\r]//g; $hash->{$attr} =~ s/%%([\w:]+:)?(\S+?)%%/$clean_var->(lc($1), $2)/eg; if ($attr =~ /^(href|src|lowsrc|style)$/) { $hash->{$attr} = "\%\%[attr[$hash->{$attr}]]\%\%"; } } } } if (exists $hash->{href}) { unless ($hash->{href} =~ s/^lj:(?:\/\/)?(.*)$/ExpandLJURL($1)/ei) { $hash->{href} = canonical_url($hash->{href}, 1); } } if ($hash->{'style'} =~ /expression/i) { delete $hash->{'style'}; } if ($tag eq "img") { my $img_bad = 0; if (defined $opts->{'maximgwidth'} && (! defined $hash->{'width'} || $hash->{'width'} > $opts->{'maximgwidth'})) { $img_bad = 1; } if (defined $opts->{'maximgheight'} && (! defined $hash->{'height'} || $hash->{'height'} > $opts->{'maximgheight'})) { $img_bad = 1; } if ($opts->{'extractimages'}) { $img_bad = 1; } $hash->{src} = canonical_url($hash->{src}, 1); if ($img_bad) { $newdata .= "{'src'}) . "\">" . LJ::img('placeholder') . ''; $alt_output = 1; } } if ($tag eq "a" && $extractlinks) { push @canonical_urls, canonical_url($token->[2]->{href}, 1); $newdata .= ""; next; } unless ($alt_output) { my $allow; if ($mode eq "allow") { $allow = 1; if ($action{$tag} eq "deny") { $allow = 0; } } else { $allow = 0; if ($action{$tag} eq "allow") { $allow = 1; } } if ($allow && ! $remove{$tag}) { if ($opts->{'tablecheck'}) { $allow = 0 if # can't open table elements from outside a table ($tag =~ /^(?:tbody|thead|tfoot|tr|td|th)$/ && ! @tablescope) || # can't open td or th if not inside tr ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'}) || # can't open a table unless inside a td or th ($tag eq 'table' && @tablescope && ! grep { $tablescope[-1]->{$_} } qw(td th)); } if ($allow) { $newdata .= "<$tag"; } else { $newdata .= "<$tag"; } # output attributes in original order, but only those # that are allowed (by still being in %$hash after cleaning) foreach (@$attrs) { if ($hash->{$_} =~ /[^\x01-\x7f]/) { # FIXME: this is so ghetto. make faster. make generic. # HTML::Parser decodes entities for us (which is good) # but in Perl 5.8 also includes the "poison" SvUTF8 # flag on the scalar it returns, thus poisoning the # rest of the content this scalar is appended with. # we need to remove that poison at this point. *sigh* $hash->{$_} = pack('C*', unpack('C*', $hash->{$_})); } $newdata .= " $_=\"" . LJ::ehtml($hash->{$_}) . "\"" if exists $hash->{$_}; } if ($slashclose) { $newdata .= " /"; $opencount{$tag}--; $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope; } if ($allow) { $newdata .= ">"; $opencount{$tag}++; # maintain current table scope if ($opts->{'tablecheck'}) { # open table if ($tag eq 'table') { push @tablescope, {}; # new tag within current table } elsif (@tablescope) { $tablescope[-1]->{$tag}++; } } } else { $newdata .= ">"; } } } } } # end tag elsif ($type eq "E") { my $tag = $token->[1]; my $allow; if ($tag eq "lj-raw") { $opencount{$tag}--; $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope; } elsif ($tag eq "lj-cut") { if ($opts->{'cutpreview'}) { $newdata .= "</lj-cut>"; } } else { if ($mode eq "allow") { $allow = 1; if ($action{$tag} eq "deny") { $allow = 0; } } else { $allow = 0; if ($action{$tag} eq "allow") { $allow = 1; } } if ($extractlinks && $tag eq "a") { if (@canonical_urls) { my $url = LJ::ehtml(pop @canonical_urls); $newdata .= " ($url)"; } } if ($allow && ! $remove{$tag}) { if ($opts->{'tablecheck'}) { $allow = 0 if # can't close table elements from outside a table ($tag =~ /^(?:table|tbody|thead|tfoot|tr|td|th)$/ && ! @tablescope) || # can't close td or th unless open tr ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'}); } if ($allow && ! ($opts->{'noearlyclose'} && ! $opencount{$tag})) { # maintain current table scope if ($opts->{'tablecheck'}) { # open table if ($tag eq 'table') { pop @tablescope; # closing tag within current table } elsif (@tablescope) { $tablescope[-1]->{$tag}--; } } $newdata .= ""; $opencount{$tag}--; } else { $newdata .= "</$tag>"; } } } } elsif ($type eq "D") { # remove everything past first closing tag $token->[1] =~ s/>.+/>/s; # kill any opening tag except the starting one $token->[1] =~ s/.[1]; } elsif ($type eq "T") { my %url = (); my $urlcount = 0; if ($opencount{'style'}) { # remove anything that might run javascript/vbscript code # (Note: Ghetto. Need to use CSS.pm to build full tree, analyze, redump) my $reduced = lc($token->[1]); $reduced =~ s/\s+//g; $reduced =~ s/\\//g; $reduced =~ s/&\#(\d+);?/chr($1)/eg; $reduced =~ s/&\#x(\w+);?/chr(hex($1))/eg; $token->[1] = "/* potential scripting */" if $reduced =~ /javascript|vbscript|expression/; $token->[1] =~ s//[COME]/g; } my $auto_format = $addbreaks && ($opencount{'table'} <= ($opencount{'td'} + $opencount{'th'})) && ! $opencount{'pre'} && ! $opencount{'lj-raw'}; if ($auto_format && ! $noautolinks && ! $opencount{'a'} && ! $opencount{'textarea'}) { my $match = sub { my $str = shift; if ($str =~ /^(.*?)(&(#39|quot|lt|gt)(;.*)?)$/) { $url{++$urlcount} = $1; return "&url$urlcount;$1&urlend;$2"; } else { $url{++$urlcount} = $str; return "&url$urlcount;$str&urlend;"; } }; $token->[1] =~ s!https?://[^\s\'\"\<\>]+[a-zA-Z0-9_/&=\-]! $match->($&); !ge; } # escape tags in text tokens. shouldn't belong here! # especially because the parser returns things it's # confused about (broken, ill-formed HTML) as text. $token->[1] =~ s/[1] =~ s/>/>/g; } if ($opencount{'style'}) { $token->[1] =~ s/\[COMS\]//g; } # put tags into long words, except inside

 and