#!/usr/bin/perl # # # lib: HTML::TokeParser, cgi-bin/ljconfig.pl, cgi-bin/ljlib.pl # link: htdocs/userinfo.bml, htdocs/users # require "$ENV{'LJHOME'}/cgi-bin/ljconfig.pl"; use strict; use Class::Autouse qw( URI HTML::TokeParser HTMLCleaner LJ::CSS::Cleaner LJ::EmbedModule ); use LJR::Viewuser; use LJR::Distributed; package LJ; # # name: LJ::strip_bad_code # class: security # des: Removes malicious/annoying HTML. # info: This is just a wrapper function around [func[LJ::CleanHTML::clean]]. # args: textref # des-textref: Scalar reference to text to be cleaned. # returns: Nothing. # sub strip_bad_code { my $data = shift; LJ::CleanHTML::clean($data, { 'eat' => [qw[layer iframe script object embed]], 'mode' => 'allow', 'keepcomments' => 1, # Allows CSS to work }); } # LJ::CleanHTML::clean(\$u->{'bio'}, { # 'wordlength' => 100, # maximum length of an unbroken "word" # 'addbreaks' => 1, # insert
after newlines where appropriate # 'tablecheck' => 1, # make sure they aren't closing that weren't opened. # 'eat' => [qw(head title style layer iframe)], # 'mode' => 'allow', # 'deny' => [qw(marquee)], # 'remove' => [qw()], # 'maximgwidth' => 100, # 'maximgheight' => 100, # 'keepcomments' => 1, # 'cuturl' => 'http://www.domain.com/full_item_view.ext', # 'ljcut_disable' => 1, # stops the cleaner from using the lj-cut tag # 'cleancss' => 1, # 'extractlinks' => 1, # remove a hrefs; implies noautolinks # 'noautolinks' => 1, # do not auto linkify # 'extractimages' => 1, # placeholder images # 'transform_embed_nocheck' => 1, # do not do checks on object/embed tag transforming # 'transform_embed_wmode' => , # define a wmode value for videos (usually 'transparent' is the value you want) # 'blocked_links' => [ qr/evil\.com/, qw/spammer\.com/ ], # list of sites which URL's will be blocked # 'blocked_link_substitute' => 'http://domain.com/error.html' # blocked links will be replaced by this URL # }); package LJ::CleanHTML; sub helper_preload { my $p = HTML::TokeParser->new(""); eval {$p->DESTROY(); }; } # this treats normal characters and &entities; as single characters # also treats UTF-8 chars as single characters if $LJ::UNICODE my $onechar; { my $utf_longchar = '[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]'; my $match; if (not $LJ::UNICODE) { $match = '[^&\s]|(&\#?\w{1,7};)'; } else { $match = $utf_longchar . '|[^&\s\x80-\xff]|(?:&\#?\w{1,7};)'; } $onechar = qr/$match/o; } # Some browsers, such as Internet Explorer, have decided to alllow # certain HTML tags to be an alias of another. This has manifested # itself into a problem, as these aliases act in the browser in the # same manner as the original tag, but are not treated the same by # the HTML cleaner. # 'alias' => 'real' my %tag_substitute = ( 'image' => 'img', ); # In XHTML you can close a tag in the same opening tag like
, # but some browsers still will interpret it as an opening only tag. # This is a list of tags which you can actually close with a trailing # slash and get the proper behavior from a browser. my $slashclose_tags = qr/^(?:area|base|basefont|br|col|embed|frame|hr|img|input|isindex|link|meta|param|lj-embed)$/i; # # name: LJ::CleanHTML::clean # class: text # des: Multi-faceted HTML parse function # info: # args: data, opts # des-data: A reference to HTML to parse to output, or HTML if modified in-place. # des-opts: An hash of options to pass to the parser. # returns: Nothing. # # (NB!) very slow on a large text, called for every item within view stage. # example: http://lj.rossia.org/users/lll22021918_01/2009/10/07/ # sub clean { my $data = shift; my $opts = shift; my $newdata; # remove the auth portion of any see_request.bml links $$data =~ s/(see_request\.bml\S+?)auth=\w+/$1/ig; my $p = HTML::TokeParser->new($data); my $wordlength = $opts->{'wordlength'}; my $addbreaks = $opts->{'addbreaks'}; my $keepcomments = $opts->{'keepcomments'}; my $mode = $opts->{'mode'}; my $cut = $opts->{'cuturl'} || $opts->{'cutpreview'}; my $ljcut_disable = $opts->{'ljcut_disable'}; my $s1var = $opts->{'s1var'}; my $extractlinks = 0 || $opts->{'extractlinks'}; my $noautolinks = $extractlinks || $opts->{'noautolinks'}; my $noexpand_embedded = $opts->{'noexpandembedded'} || $opts->{'textonly'} || 0; my $transform_embed_nocheck = $opts->{'transform_embed_nocheck'} || 0; my $transform_embed_wmode = $opts->{'transform_embed_wmode'}; my $remove_colors = $opts->{'remove_colors'} || 0; my $remove_sizes = $opts->{'remove_sizes'} || 0; my $remove_fonts = $opts->{'remove_fonts'} || 0; my $blocked_links = (exists $opts->{'blocked_links'}) ? $opts->{'blocked_links'} : \@LJ::BLOCKED_LINKS; my $blocked_link_substitute = (exists $opts->{'blocked_link_substitute'}) ? $opts->{'blocked_link_substitute'} : ($LJ::BLOCKED_LINK_SUBSTITUTE) ? $LJ::BLOCKED_LINK_SUBSTITUTE : '#'; my @canonical_urls; # extracted links my %action = (); my %remove = (); if (ref $opts->{'eat'} eq "ARRAY") { foreach (@{$opts->{'eat'}}) { $action{$_} = "eat"; } } if (ref $opts->{'allow'} eq "ARRAY") { foreach (@{$opts->{'allow'}}) { $action{$_} = "allow"; } } if (ref $opts->{'deny'} eq "ARRAY") { foreach (@{$opts->{'deny'}}) { $action{$_} = "deny"; } } if (ref $opts->{'remove'} eq "ARRAY") { foreach (@{$opts->{'remove'}}) { $action{$_} = "deny"; $remove{$_} = 1; } } $action{'script'} = "eat"; # if removing sizes, remove heading tags if ($remove_sizes) { foreach my $tag (qw( h1 h2 h3 h4 h5 h6 )) { $action{$tag} = "deny"; $remove{$tag} = 1; } } # foreach my $tag (qw( marquee )) { # $action{$tag} = "deny"; # $remove{$tag} = 1; # } # Marquee is abused by makaka, I disabled it here, no idea what is a proper place # to disable tags - please change - MV, 2014 # A few days later: the proper place is probably when we strip tags from # several HTML tags, such as hr, pre, textarea etc # antimakaka measure. Remove certain tags for anon - Nov 2014, MV if ($opts->{'anonhtml'}) { foreach my $tag (qw( h1 h2 big font pre )) { $action{$tag} = "deny"; $remove{$tag} = 1; } } my @attrstrip = qw(); # cleancss means clean annoying css # clean_js_css means clean javascript from css if ($opts->{'cleancss'}) { push @attrstrip, 'id'; $opts->{'clean_js_css'} = 1; } if ($opts->{'nocss'}) { push @attrstrip, 'style'; } if (ref $opts->{'attrstrip'} eq "ARRAY") { foreach (@{$opts->{'attrstrip'}}) { push @attrstrip, $_; } } my %opencount = (); my @tablescope = (); my $cutcount = 0; my $imagecount = 0; # bytes known good. set this BEFORE we start parsing any new # start tag, where most evil is (because where attributes can be) # then, if we have to totally fail, we can cut stuff off after this. my $good_until = 0; # then, if we decide that part of an entry has invalid content, we'll # escape that part and stuff it in here. this lets us finish cleaning # the "good" part of the entry (since some tags might not get closed # till after $good_until bytes into the text). my $extra_text; my $total_fail = sub { my $tag = LJ::ehtml(@_); my $edata = LJ::ehtml($$data); $edata =~ s/\r?\n/
/g if $addbreaks; $extra_text = "
[Error: Irreparable invalid markup ('<$tag>') in entry. ". "Owner must fix manually. Raw contents below.]

" . '
' . $edata . '
'; }; my $htmlcleaner = HTMLCleaner->new(valid_stylesheet => \&LJ::valid_stylesheet_url); my $eating_ljuser_span = 0; # bool, if we're eating an ljuser span my $ljuser_text_node = ""; # the last text node we saw while eating ljuser tags my @eatuntil = (); # if non-empty, we're eating everything. thing at end is thing # we're looking to open again or close again. my $capturing_during_eat; # if we save all tokens that happen inside the eating. my @capture = (); # if so, they go here my $form_tag = { input => 1, select => 1, option => 1, }; my $start_capture = sub { next if $capturing_during_eat; my ($tag, $first_token, $cb) = @_; push @eatuntil, $tag; @capture = ($first_token); $capturing_during_eat = $cb || sub {}; }; my $finish_capture = sub { @capture = (); $capturing_during_eat = undef; }; TOKEN: while (my $token = $p->get_token) { my $type = $token->[0]; # See if this tag should be treated as an alias $token->[1] = $tag_substitute{$token->[1]} if defined $tag_substitute{$token->[1]} && ($type eq 'S' || $type eq 'E'); if ($type eq "S") # start tag { my $tag = $token->[1]; my $attr = $token->[2]; # hashref $good_until = length $newdata; if (@eatuntil) { push @capture, $token if $capturing_during_eat; if ($tag eq $eatuntil[-1]) { push @eatuntil, $tag; } next TOKEN; } if ($tag eq "lj-template" && ! $noexpand_embedded) { my $name = $attr->{name} || ""; $name =~ s/-/_/g; my $run_template_hook = sub { # can pass in tokens to override passing the hook the @capture array my ($token, $override_capture) = @_; my $capture = $override_capture ? [$token] : \@capture; my $expanded = ($name =~ /^\w+$/) ? LJ::run_hook("expand_template_$name", $capture) : ""; $newdata .= $expanded || "[Error: unknown template '" . LJ::ehtml($name) . "']"; }; if ($attr->{'/'}) { # template is self-closing, no need to do capture $run_template_hook->($token, 1); } else { # capture and send content to hook $start_capture->("lj-template", $token, $run_template_hook); } next TOKEN; } if ($tag eq "lj-replace") { my $name = $attr->{name} || ""; my $replace = ($name =~ /^\w+$/) ? LJ::lj_replace($name, $attr) : undef; $newdata .= defined $replace ? $replace : "[Error: unknown lj-replace key '" . LJ::ehtml($name) . "']"; next TOKEN; } # Capture object and embed tags to possibly transform them into something else. if ($tag eq "object" || $tag eq "embed") { if (LJ::are_hooks("transform_embed") && !$noexpand_embedded) { # XHTML style open/close tags done as a singleton shouldn't actually # start a capture loop, because there won't be a close tag. if ($attr->{'/'}) { $newdata .= LJ::run_hook("transform_embed", [$token], nocheck => $transform_embed_nocheck, wmode => $transform_embed_wmode) || ""; next TOKEN; } $start_capture->($tag, $token, sub { my $expanded = LJ::run_hook("transform_embed", \@capture, nocheck => $transform_embed_nocheck, wmode => $transform_embed_wmode); $newdata .= $expanded || ""; }); next TOKEN; } } if ($tag eq "span" && lc $attr->{class} eq "ljruser" && ! $noexpand_embedded) { $eating_ljuser_span = 1; $ljuser_text_node = ""; } if ($eating_ljuser_span) { next TOKEN; } if (($tag eq "div" || $tag eq "span") && lc $attr->{class} eq "ljvideo") { $start_capture->($tag, $token, sub { my $expanded = LJ::run_hook("expand_template_video", \@capture); $newdata .= $expanded || "[Error: unknown template 'video']"; }); next TOKEN; } # do some quick checking to see if this is an email address/URL, and if so, just # escape it and ignore it if ($tag =~ m!(?:\@|://)!) { $newdata .= LJ::ehtml("<$tag>"); next; } if ($form_tag->{$tag}) { if (! $opencount{form}) { $newdata .= "<$tag ... >"; next; } if ($tag eq "input") { if ($attr->{type} !~ /^\w+$/ || lc $attr->{type} eq "password") { delete $attr->{type}; } } } my $slashclose = 0; # If set to 1, use XML-style empty tag marker # for tags like , pretend it's and reinsert the slash later $slashclose = 1 if ($tag =~ s!/$!!); unless ($tag =~ /^\w([\w\-:_]*\w)?$/) { $total_fail->($tag); last TOKEN; } # for incorrect tags like (note the lack of a space) # delete everything after 'name' to prevent a security loophole which happens # because IE understands them. $tag =~ s!/.+$!!; if ($action{$tag} eq "eat") { $p->unget_token($token); $p->get_tag("/$tag"); next; } # try to call HTMLCleaner's element-specific cleaner on this open tag my $clean_res = eval { my $cleantag = $tag; $cleantag =~ s/^.*://s; $cleantag =~ s/[^\w]//g; no strict 'subs'; my $meth = "CLEAN_$cleantag"; my $seq = $token->[3]; # attribute names, listref my $code = $htmlcleaner->can($meth) or return 1; return $code->($htmlcleaner, $seq, $attr); }; next if !$@ && !$clean_res; # this is so the rte converts its source to the standard ljuser html my $ljuser_div = $tag eq "div" && $attr->{class} eq "ljruser"; if ($ljuser_div) { my $ljuser_text = $p->get_text("/b"); $p->get_tag("/div"); $ljuser_text =~ s/\[info\]//; $tag = "lj"; $attr->{'user'} = $ljuser_text; } # stupid hack to remove the class='ljcut' from divs when we're # disabling them, so we account for the open div normally later. my $ljcut_div = $tag eq "div" && lc $attr->{class} eq "ljcut"; if ($ljcut_div && $ljcut_disable) { $ljcut_div = 0; } # no cut URL, record the anchor, but then fall through if (0 && $ljcut_div && !$cut) { $cutcount++; $newdata .= ""; $ljcut_div = 0; } if (($tag eq "lj-cut" || $ljcut_div) && !$ljcut_disable) { next TOKEN if $ljcut_disable; $cutcount++; my $link_text = sub { my $text = "Read more..."; if ($attr->{'text'}) { $text = $attr->{'text'}; if ($text =~ /[^\x01-\x7f]/) { $text = pack('C*', unpack('C*', $text)); } $text =~ s//>/g; } return $text; }; if ($cut) { my $etext = $link_text->(); my $url = LJ::ehtml($cut); $newdata .= "
" if $tag eq "div"; $newdata .= "$etext )"; $newdata .= "
" if $tag eq "div"; unless ($opts->{'cutpreview'}) { push @eatuntil, $tag; next TOKEN; } } else { $newdata .= "" unless $opts->{'textonly'}; if ($tag eq "div" && !$opts->{'textonly'}) { $opencount{"div"}++; my $etext = $link_text->(); $newdata .= "
"; } next; } } elsif ($tag eq "style") { my $style = $p->get_text("/style"); $p->get_tag("/style"); unless ($LJ::DISABLED{'css_cleaner'}) { my $cleaner = LJ::CSS::Cleaner->new; $style = $cleaner->clean($style); LJ::run_hook('css_cleaner_transform', \$style); if ($LJ::IS_DEV_SERVER) { $style = "/* cleaned */\n" . $style; } } $newdata .= "\n\n"; next; } elsif ($tag eq "lj") { # keep working for backwards compatibility, but pretend # it was so we don't have to account for it below. my $user = $attr->{'user'} = exists $attr->{'user'} ? $attr->{'user'} : exists $attr->{'comm'} ? $attr->{'comm'} : undef; if (length $user) { my $orig_user = $user; # save for later, in case # $user = LJ::canonical_username($user); if ($s1var) { $newdata .= "%%ljuser:$1%%" if $attr->{'user'} =~ /^\%\%([\w\-\']+)\%\%$/; } elsif (length $user) { if ($opts->{'textonly'}) { $newdata .= $user; } else { $opts->{'site'} = LJR::Viewuser::canonical_sitenum( exists $attr->{'site'} ? $attr->{'site'} : "LJ" ); if (exists $attr->{'comm'}) { $opts->{'type'} = "C"; } else { delete $opts->{'type'}; } $newdata .= LJR::Viewuser::ljuser($user, $opts); # $newdata .= LJ::ljuser($user); } } else { $orig_user = LJ::no_utf8_flag($orig_user); $newdata .= "[Bad username: " . LJ::ehtml($orig_user) . "]"; } } else { $newdata .= "[Unknown LJ tag]"; } } elsif ($tag eq "ljr") { my $optss=(); my $user = $attr->{'user'} = exists $attr->{'user'} ? $attr->{'user'} : exists $attr->{'comm'} ? $attr->{'comm'} : undef; if (length $user) { if (exists $attr->{'comm'}) {$optss->{'type'}='C';} $optss->{'site'}=0; $newdata .= LJR::Viewuser::ljuser($user, $optss); } else { $newdata .= "[Bad username in LJ tag]"; } } elsif ($tag eq "ljr-href") { my $attr = $token->[2]; my $ljr_rhref = exists $attr->{'url'} ? $attr->{'url'} : undef; my $ljr_rsite = exists $attr->{'site'} ? $attr->{'site'} : undef; if ($ljr_rhref && $ljr_rsite) { my $furl = $ljr_rsite . $ljr_rhref; my $ftxt = $ljr_rsite . $ljr_rhref; my $have_local_copy = 1; my $ru; my $ljr_rusername; my $ljr_ritemid; my $ljr_rthread; my $ljr_rreplyto; my $r; my $c; $ru = LJR::Distributed::get_remote_server($ljr_rsite); $have_local_copy = 0 if $ru->{"err"}; # we know remote server, proceed identifying link if ($have_local_copy) { #TODO: extract username according to remote server type # (currently we support only LJ-based servers) if ($ljr_rhref =~ /users\/(.+?)\/(\d+?)\.html(\?((thread\=(\d*))|(replyto\=(\d*))).*)*/) { $ljr_rusername = $1; $ljr_rusername =~ s/\-/\_/; $ljr_ritemid = int($2 / 256); $ljr_rthread = int($6 / 256) if $6; $ljr_rreplyto = int($8 / 256) if $8; } else { $have_local_copy = 0; } } # we've got remote username and event htmlid, proceed identifying link if ($have_local_copy) { $ru->{username} = $ljr_rusername; $ru = LJR::Distributed::get_cached_user($ru); # populates $ru->{ru_id} $have_local_copy = 0 if $ru->{"err"}; } # we know remote user, proceed identifying link if ($have_local_copy) { $r = LJR::Distributed::get_local_itemid (0, $ru->{ru_id}, $ljr_ritemid); $have_local_copy = 0 if $r->{"err"} || $r->{"itemid"} == 0; } if ($have_local_copy && ($ljr_rthread || $ljr_rreplyto)) { my $tempid; $tempid = $ljr_rthread if $ljr_rthread; $tempid = $ljr_rreplyto if $ljr_rreplyto; $c = LJR::Distributed::get_local_commentid (0, $ru->{ru_id}, $tempid); $have_local_copy = 0 if $c->{"err"} || $c->{"talkid"} == 0; } if ($have_local_copy) { $furl = $LJ::SITEROOT . "/users/" . $r->{"journalname"} . "/" . ($r->{"item"}->{"jitemid"} * 256 + $r->{"item"}->{"anum"}) . ".html"; if ($c->{"talkid"}) { my $thread_id = $c->{"talkid"} * 256 + $r->{"item"}->{"anum"}; if ($ljr_rthread) { $furl .= "?thread=" . $thread_id . "#t" . $thread_id; } else { $furl .= "?replyto=" . $thread_id; } } $ftxt = $furl; } $newdata .= ""; $opencount{'a'}++; } else { $newdata .= "[Malformed ljr-user tag]"; } } elsif ($tag eq "lj-raw") { # Strip it out, but still register it as being open $opencount{$tag}++; } # Don't allow any tag with the "set" attribute elsif ($tag =~ m/:set$/) { next; } else { my $alt_output = 0; my $hash = $token->[2]; my $attrs = $token->[3]; # attribute names, in original order $slashclose = 1 if delete $hash->{'/'}; foreach (@attrstrip) { # maybe there's a better place for this? next if (lc $tag eq 'lj-embed' && lc $_ eq 'id'); delete $hash->{$_}; } if ($tag eq "form") { my $action = lc($hash->{'action'}); my $deny = 0; if ($action =~ m!^https?://?([^/]+)!) { my $host = $1; $deny = 1 if $host =~ /[%\@\s]/ || $LJ::FORM_DOMAIN_BANNED{$host}; } else { $deny = 1; } delete $hash->{'action'} if $deny; } ATTR: foreach my $attr (keys %$hash) { if ($attr =~ /^(?:on|dynsrc)/) { delete $hash->{$attr}; next; } # added in Apr 2014 to prevent an exploit by 1px guy - MV if ($tag eq "table") { delete $hash->{$attr}; next; } #more anti-makaka measures - Oct 2014, MV if ($tag eq "pre" || $tag eq "hr" || $tag eq "marquee" || $tag eq "textarea") { delete $hash->{$attr}; next; } if ($attr eq "data") { delete $hash->{$attr} unless $tag eq "object"; next; } if ($attr eq "href" && $hash->{$attr} =~ /^data/) { delete $hash->{$attr}; next; } if ($attr =~ /(?:^=)|[\x0b\x0d]/) { # Cleaner attack:

' onmouseover="javascript:alert(document/**/.cookie)" > # is returned by HTML::Parser as P_tag("='" => "='") Text( onmouseover...) # which leads to reconstruction of valid HTML. Clever! # detect this, and fail. $total_fail->("$tag $attr"); last TOKEN; } # ignore attributes that do not fit this strict scheme unless ($attr =~ /^[\w_:-]+$/) { $total_fail->("$tag " . (%$hash > 1 ? "[...] " : "") . "$attr"); last TOKEN; } $hash->{$attr} =~ s/[\t\n]//g; # IE ignores the null character, so strip it out $hash->{$attr} =~ s/\x0//g; # IE sucks: my $nowhite = $hash->{$attr}; $nowhite =~ s/[\s\x0b]+//g; if ($nowhite =~ /(?:jscript|livescript|javascript|vbscript|about):/ix) { delete $hash->{$attr}; next; } if ($attr eq 'style') { if ($opts->{'cleancss'}) { # css2 spec, section 4.1.3 # position === p\osition :( # strip all slashes no matter what. $hash->{style} =~ s/\\//g; # and catch the obvious ones ("[" is for things like document["coo"+"kie"] foreach my $css ("/*", "[", qw(margin absolute fixed expression eval behavior cookie document window javascript -moz-binding)) { if ($hash->{style} =~ /\Q$css\E/i) { delete $hash->{style}; next ATTR; } } # remove specific CSS definitions if ($remove_colors) { $hash->{style} =~ s/(?:background-)?color:.*?(?:;|$)//gi; } if ($remove_sizes) { $hash->{style} =~ s/font-size:.*?(?:;|$)//gi; } if ($remove_fonts) { $hash->{style} =~ s/font-family:.*?(?:;|$)//gi; } # Added to prevent the new div exploit (August 2008) - M. V. $hash->{style} =~s/(content|background-image|background|position|top|left|width|height):.*?(?:;|$)//gi; #modified March 2014 to remove backgrounds and content # and in July 2014 to remove background-image } if ($opts->{'clean_js_css'} && ! $LJ::DISABLED{'css_cleaner'}) { # and then run it through a harder CSS cleaner that does a full parse my $css = LJ::CSS::Cleaner->new; $hash->{style} = $css->clean_property($hash->{style}); } } # reserve ljs_* ids for divs, etc so users can't override them to replace content if ($attr eq 'id' && $hash->{$attr} =~ /^ljs_/i) { delete $hash->{$attr}; next; } if ($s1var) { if ($attr =~ /%%/) { delete $hash->{$attr}; next ATTR; } my $props = $LJ::S1::PROPS->{$s1var}; if ($hash->{$attr} =~ /^%%([\w:]+:)?(\S+?)%%$/ && $props->{$2} =~ /[aud]/) { # don't change it. } elsif ($hash->{$attr} =~ /^%%cons:\w+%%[^\%]*$/) { # a site constant with something appended is also fine. } elsif ($hash->{$attr} =~ /%%/) { my $clean_var = sub { my ($mods, $prop) = @_; # HTML escape and kill line breaks $mods = "attr:$mods" unless $mods =~ /^(color|cons|siteroot|sitename|img):/ || $props->{$prop} =~ /[ud]/; return '%%' . $mods . $prop . '%%'; }; $hash->{$attr} =~ s/[\n\r]//g; $hash->{$attr} =~ s/%%([\w:]+:)?(\S+?)%%/$clean_var->(lc($1), $2)/eg; if ($attr =~ /^(href|src|lowsrc|style)$/) { $hash->{$attr} = "\%\%[attr[$hash->{$attr}]]\%\%"; } } } # remove specific attributes if (($remove_colors && ($attr eq "color" || $attr eq "bgcolor" || $attr eq "fgcolor" || $attr eq "text")) || ($remove_sizes && $attr eq "size") || ($remove_fonts && $attr eq "face")) { delete $hash->{$attr}; next ATTR; } } if (exists $hash->{href}) { ## links to some resources will be completely blocked ## and replaced by value of 'blocked_link_substitute' param if ($blocked_links) { foreach my $re (@$blocked_links) { if ($hash->{href} =~ $re) { $hash->{href} = sprintf($blocked_link_substitute, LJ::eurl($hash->{href})); last; } } } unless ($hash->{href} =~ s/^lj:(?:\/\/)?(.*)$/ExpandLJURL($1)/ei) { $hash->{href} = canonical_url($hash->{href}, 1); } } if ($tag eq "img") { $imagecount++; my $img_bad = 0; if ((defined $opts->{'maximgwidth'} && (! defined $hash->{'width'} || $hash->{'width'} > $opts->{'maximgwidth'})) # I replaced 1 to 33 temporarily # as an anti-macaque measure. Really stupid, in fact. - MV, Sept 2014 || (defined $hash->{'width'} && $hash->{'width'} <= 33)) # to avoid bombing with billion 1px images { $img_bad = 1; } if ((defined $opts->{'maximgheight'} && (! defined $hash->{'height'} || $hash->{'height'} > $opts->{'maximgheight'})) || (defined $hash->{'height'} && $hash->{'height'} <= 33)) { $img_bad = 1; } if ($opts->{'extractimages'}) { $img_bad = 1; } # anti-makaka: prohibit putting more than $MAXIMAGES images to comments my $MAXIMAGES = 5; # maximal number of images in comments # we should put this to ljconfig.pl! if (($imagecount > $MAXIMAGES) && $opts->{'maximages'}) { $img_bad = 1; } # remove img src="data:image/..." images $hash->{src} = canonical_url($hash->{src}, 1); if ("$hash->{src}" =~ "^data:") { $img_bad=1; $hash->{src} = "data:image is not allowed"; } # Anon and OpenID commenters are not allowed to post images if ($img_bad) { $newdata .= "{'src'}) . "\">" . LJ::img('placeholder') . ''; $alt_output = 1; $opencount{"img"}++; } } if ($tag eq "a" && $extractlinks) { push @canonical_urls, canonical_url($token->[2]->{href}, 1); $newdata .= ""; next; } # Through the xsl namespace in XML, it is possible to embed scripting lanaguages # as elements which will then be executed by the browser. Combining this with # customview.cgi makes it very easy for someone to replace their entire journal # in S1 with a page that embeds scripting as well. An example being an AJAX # six degrees tool, while cool it should not be allowed. # # Example syntax: # # text/javascript if ($tag eq 'xsl:attribute') { $alt_output = 1; # We'll always deal with output for this token my $orig_value = $p->get_text; # Get the value of this element my $value = $orig_value; # Make a copy if this turns out to be alright $value =~ s/\s+//g; # Remove any whitespace # See if they are trying to output scripting, if so eat the xsl:attribute # container and its value if ($value =~ /(javascript|vbscript)/i) { # Remove the closing tag from the tree $p->get_token; # Remove the value itself from the tree $p->get_text; # No harm, no foul...Write back out the original } else { $newdata .= "$token->[4]$orig_value"; } } unless ($alt_output) { my $allow; if ($mode eq "allow") { $allow = 1; if ($action{$tag} eq "deny") { $allow = 0; } } else { $allow = 0; if ($action{$tag} eq "allow") { $allow = 1; } } if ($allow && ! $remove{$tag}) { if ($opts->{'tablecheck'}) { $allow = 0 if # can't open table elements from outside a table ($tag =~ /^(?:tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/ && ! @tablescope) || # can't open td or th if not inside tr ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'}) || # can't open a table unless inside a td or th ($tag eq 'table' && @tablescope && ! grep { $tablescope[-1]->{$_} } qw(td th)); } if ($allow) { $newdata .= "<$tag"; } else { $newdata .= "<$tag"; } # output attributes in original order, but only those # that are allowed (by still being in %$hash after cleaning) foreach (@$attrs) { unless (LJ::is_ascii($hash->{$_})) { # FIXME: this is so ghetto. make faster. make generic. # HTML::Parser decodes entities for us (which is good) # but in Perl 5.8 also includes the "poison" SvUTF8 # flag on the scalar it returns, thus poisoning the # rest of the content this scalar is appended with. # we need to remove that poison at this point. *sigh* $hash->{$_} = LJ::no_utf8_flag($hash->{$_}); } $newdata .= " $_=\"" . LJ::ehtml($hash->{$_}) . "\"" if exists $hash->{$_}; } # ignore the effects of slashclose unless we're dealing with a tag that can # actually close itself. Otherwise, a tag like can pass through as valid # even though some browsers just render it as an opening tag if ($slashclose && $tag =~ $slashclose_tags) { $newdata .= " /"; $opencount{$tag}--; $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope; } if ($allow) { $newdata .= ">"; $opencount{$tag}++; # maintain current table scope if ($opts->{'tablecheck'}) { # open table if ($tag eq 'table') { push @tablescope, {}; # new tag within current table } elsif (@tablescope) { $tablescope[-1]->{$tag}++; } } } else { $newdata .= ">"; } } } } } # end tag elsif ($type eq "E") { my $tag = $token->[1]; next TOKEN if $tag =~ /[^\w\-:]/; if (@eatuntil) { push @capture, $token if $capturing_during_eat; if ($eatuntil[-1] eq $tag) { pop @eatuntil; if (my $cb = $capturing_during_eat) { $cb->(); $finish_capture->(); } next TOKEN; } next TOKEN if @eatuntil; } if ($eating_ljuser_span && $tag eq "span") { $eating_ljuser_span = 0; $newdata .= $opts->{'textonly'} ? $ljuser_text_node : LJ::ljuser($ljuser_text_node); next TOKEN; } my $allow; if ($tag eq "lj-raw") { $opencount{$tag}--; $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope; } elsif ($tag eq "lj-cut") { if ($opts->{'cutpreview'}) { $newdata .= "</lj-cut>"; } } elsif ($tag eq "ljr-href") { $newdata .= ""; $opencount{'a'}--; } else { if ($mode eq "allow") { $allow = 1; if ($action{$tag} eq "deny") { $allow = 0; } } else { $allow = 0; if ($action{$tag} eq "allow") { $allow = 1; } } if ($extractlinks && $tag eq "a") { if (@canonical_urls) { my $url = LJ::ehtml(pop @canonical_urls); $newdata .= " ($url)"; next; } } if ($allow && ! $remove{$tag}) { if ($opts->{'tablecheck'}) { $allow = 0 if # can't close table elements from outside a table ($tag =~ /^(?:table|tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/ && ! @tablescope) || # can't close td or th unless open tr ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'}); } if ($allow && ! ($opts->{'noearlyclose'} && ! $opencount{$tag})) { # maintain current table scope if ($opts->{'tablecheck'}) { # open table if ($tag eq 'table') { pop @tablescope; # closing tag within current table } elsif (@tablescope) { $tablescope[-1]->{$tag}--; } } $newdata .= ""; $opencount{$tag}--; } else { $newdata .= "</$tag>"; } } } } elsif ($type eq "D") { # remove everything past first closing tag $token->[1] =~ s/>.+/>/s; # kill any opening tag except the starting one $token->[1] =~ s/.[1]; } elsif ($type eq "T") { my %url = (); my $urlcount = 0; if (@eatuntil) { push @capture, $token if $capturing_during_eat; next TOKEN; } if ($eating_ljuser_span) { $ljuser_text_node = $token->[1]; next TOKEN; } if ($opencount{'style'} && $LJ::DEBUG{'s1_style_textnode'}) { my $r = Apache->request; my $uri = $r->uri; my $host = $r->header_in("Host"); warn "Got text node while style elements open. Shouldn't happen anymore. ($host$uri)\n"; } my $auto_format = $addbreaks && ($opencount{'table'} <= ($opencount{'td'} + $opencount{'th'})) && ! $opencount{'pre'} && ! $opencount{'lj-raw'}; if ($auto_format && ! $noautolinks && ! $opencount{'a'} && ! $opencount{'textarea'}) { my $match = sub { my $str = shift; if ($str =~ /^(.*?)(&(#39|quot|lt|gt)(;.*)?)$/) { $url{++$urlcount} = $1; return "&url$urlcount;$1&urlend;$2"; } else { $url{++$urlcount} = $str; return "&url$urlcount;$str&urlend;"; } }; $token->[1] =~ s!https?://[^\s\'\"\<\>]+[a-zA-Z0-9_/&=\-]! $match->($&); !ge; } # escape tags in text tokens. shouldn't belong here! # especially because the parser returns things it's # confused about (broken, ill-formed HTML) as text. $token->[1] =~ s/[1] =~ s/>/>/g; # put tags into long words, except inside

 and