"; } next; } } elsif ($tag eq "style") { my $style = $p->get_text("/style"); $p->get_tag("/style"); unless ($LJ::DISABLED{'css_cleaner'}) { my $cleaner = LJ::CSS::Cleaner->new; $style = $cleaner->clean($style); LJ::run_hook('css_cleaner_transform', \$style); if ($LJ::IS_DEV_SERVER) { $style = "/* cleaned */\n" . $style; } } $newdata .= "\n\n"; next; } elsif ($tag eq "lj") { # keep working for backwards compatibility, but pretend # it was so we don't have to account for it below. my $user = $attr->{'user'} = exists $attr->{'user'} ? $attr->{'user'} : exists $attr->{'comm'} ? $attr->{'comm'} : undef; if (length $user) { my $orig_user = $user; # save for later, in case # $user = LJ::canonical_username($user); if ($s1var) { $newdata .= "%%ljuser:$1%%" if $attr->{'user'} =~ /^\%\%([\w\-\']+)\%\%$/; } elsif (length $user) { if ($opts->{'textonly'}) { $newdata .= $user; } else { $opts->{'site'} = LJR::Viewuser::canonical_sitenum( exists $attr->{'site'} ? $attr->{'site'} : "LJ" ); if (exists $attr->{'comm'}) { $opts->{'type'} = "C"; } else { delete $opts->{'type'}; } $newdata .= LJR::Viewuser::ljuser($user, $opts); # $newdata .= LJ::ljuser($user); } } else { $orig_user = LJ::no_utf8_flag($orig_user); $newdata .= "[Bad username: " . LJ::ehtml($orig_user) . "]"; } } else { $newdata .= "[Unknown LJ tag]"; } } elsif ($tag eq "ljr") { my $optss=(); my $user = $attr->{'user'} = exists $attr->{'user'} ? $attr->{'user'} : exists $attr->{'comm'} ? $attr->{'comm'} : undef; if (length $user) { if (exists $attr->{'comm'}) {$optss->{'type'}='C';} $optss->{'site'}=0; $newdata .= LJR::Viewuser::ljuser($user, $optss); } else { $newdata .= "[Bad username in LJ tag]"; } } elsif ($tag eq "ljr-href") { my $attr = $token->[2]; my $ljr_rhref = exists $attr->{'url'} ? $attr->{'url'} : undef; my $ljr_rsite = exists $attr->{'site'} ? $attr->{'site'} : undef; if ($ljr_rhref && $ljr_rsite) { my $furl = $ljr_rsite . $ljr_rhref; my $ftxt = $ljr_rsite . 
$ljr_rhref; my $have_local_copy = 1; my $ru; my $ljr_rusername; my $ljr_ritemid; my $ljr_rthread; my $ljr_rreplyto; my $r; my $c; $ru = LJR::Distributed::get_remote_server($ljr_rsite); $have_local_copy = 0 if $ru->{"err"}; # we know remote server, proceed identifying link if ($have_local_copy) { #TODO: extract username according to remote server type # (currently we support only LJ-based servers) if ($ljr_rhref =~ /users\/(.+?)\/(\d+?)\.html(\?((thread\=(\d*))|(replyto\=(\d*))).*)*/) { $ljr_rusername = $1; $ljr_rusername =~ s/\-/\_/; $ljr_ritemid = int($2 / 256); $ljr_rthread = int($6 / 256) if $6; $ljr_rreplyto = int($8 / 256) if $8; } else { $have_local_copy = 0; } } # we've got remote username and event htmlid, proceed identifying link if ($have_local_copy) { $ru->{username} = $ljr_rusername; $ru = LJR::Distributed::get_cached_user($ru); # populates $ru->{ru_id} $have_local_copy = 0 if $ru->{"err"}; } # we know remote user, proceed identifying link if ($have_local_copy) { $r = LJR::Distributed::get_local_itemid (0, $ru->{ru_id}, $ljr_ritemid); $have_local_copy = 0 if $r->{"err"} || $r->{"itemid"} == 0; } if ($have_local_copy && ($ljr_rthread || $ljr_rreplyto)) { my $tempid; $tempid = $ljr_rthread if $ljr_rthread; $tempid = $ljr_rreplyto if $ljr_rreplyto; $c = LJR::Distributed::get_local_commentid (0, $ru->{ru_id}, $tempid); $have_local_copy = 0 if $c->{"err"} || $c->{"talkid"} == 0; } if ($have_local_copy) { $furl = $LJ::SITEROOT . "/users/" . $r->{"journalname"} . "/" . ($r->{"item"}->{"jitemid"} * 256 + $r->{"item"}->{"anum"}) . ".html"; if ($c->{"talkid"}) { my $thread_id = $c->{"talkid"} * 256 + $r->{"item"}->{"anum"}; if ($ljr_rthread) { $furl .= "?thread=" . $thread_id . "#t" . $thread_id; } else { $furl .= "?replyto=" . 
$thread_id; } } $ftxt = $furl; } $newdata .= ""; $opencount{'a'}++; } else { $newdata .= "[Malformed ljr-user tag]"; } } elsif ($tag eq "lj-raw") { # Strip it out, but still register it as being open $opencount{$tag}++; } # Don't allow any tag with the "set" attribute elsif ($tag =~ m/:set$/) { next; } else { my $alt_output = 0; my $hash = $token->[2]; my $attrs = $token->[3]; # attribute names, in original order $slashclose = 1 if delete $hash->{'/'}; foreach (@attrstrip) { # maybe there's a better place for this? next if (lc $tag eq 'lj-embed' && lc $_ eq 'id'); delete $hash->{$_}; } if ($tag eq "form") { my $action = lc($hash->{'action'}); my $deny = 0; if ($action =~ m!^https?://?([^/]+)!) { my $host = $1; $deny = 1 if $host =~ /[%\@\s]/ || $LJ::FORM_DOMAIN_BANNED{$host}; } else { $deny = 1; } delete $hash->{'action'} if $deny; } ATTR: foreach my $attr (keys %$hash) { if ($attr =~ /^(?:on|dynsrc)/) { delete $hash->{$attr}; next; } # added in Apr 2014 to prevent an exploit by 1px guy - MV if ($tag eq "table") { delete $hash->{$attr}; next; } #more anti-makaka measures - Oct 2014, MV if ($tag eq "pre" || $tag eq "hr" || $tag eq "marquee" || $tag eq "textarea") { delete $hash->{$attr}; next; } if ($attr eq "data") { delete $hash->{$attr} unless $tag eq "object"; next; } if ($attr eq "href" && $hash->{$attr} =~ /^data/) { delete $hash->{$attr}; next; } if ($attr =~ /(?:^=)|[\x0b\x0d]/) { # Cleaner attack:

' onmouseover="javascript:alert(document/**/.cookie)" > # is returned by HTML::Parser as P_tag("='" => "='") Text( onmouseover...) # which leads to reconstruction of valid HTML. Clever! # detect this, and fail. $total_fail->("$tag $attr"); last TOKEN; } # ignore attributes that do not fit this strict scheme unless ($attr =~ /^[\w_:-]+$/) { $total_fail->("$tag " . (%$hash > 1 ? "[...] " : "") . "$attr"); last TOKEN; } $hash->{$attr} =~ s/[\t\n]//g; # IE ignores the null character, so strip it out $hash->{$attr} =~ s/\x0//g; # IE sucks: my $nowhite = $hash->{$attr}; $nowhite =~ s/[\s\x0b]+//g; if ($nowhite =~ /(?:jscript|livescript|javascript|vbscript|about):/ix) { delete $hash->{$attr}; next; } if ($attr eq 'style') { if ($opts->{'cleancss'}) { # css2 spec, section 4.1.3 # position === p\osition :( # strip all slashes no matter what. $hash->{style} =~ s/\\//g; # and catch the obvious ones ("[" is for things like document["coo"+"kie"] foreach my $css ("/*", "[", qw(margin absolute fixed expression eval behavior cookie document window javascript -moz-binding)) { if ($hash->{style} =~ /\Q$css\E/i) { delete $hash->{style}; next ATTR; } } # remove specific CSS definitions if ($remove_colors) { $hash->{style} =~ s/(?:background-)?color:.*?(?:;|$)//gi; } if ($remove_sizes) { $hash->{style} =~ s/font-size:.*?(?:;|$)//gi; } if ($remove_fonts) { $hash->{style} =~ s/font-family:.*?(?:;|$)//gi; } # Added to prevent the new div exploit (August 2008) - M. V. $hash->{style} =~s/(content|background-image|background|position|top|left|width|height):.*?(?:;|$)//gi; #modified March 2014 to remove backgrounds and content # and in July 2014 to remove background-image } if ($opts->{'clean_js_css'} && ! 
$LJ::DISABLED{'css_cleaner'}) { # and then run it through a harder CSS cleaner that does a full parse my $css = LJ::CSS::Cleaner->new; $hash->{style} = $css->clean_property($hash->{style}); } } # reserve ljs_* ids for divs, etc so users can't override them to replace content if ($attr eq 'id' && $hash->{$attr} =~ /^ljs_/i) { delete $hash->{$attr}; next; } if ($s1var) { if ($attr =~ /%%/) { delete $hash->{$attr}; next ATTR; } my $props = $LJ::S1::PROPS->{$s1var}; if ($hash->{$attr} =~ /^%%([\w:]+:)?(\S+?)%%$/ && $props->{$2} =~ /[aud]/) { # don't change it. } elsif ($hash->{$attr} =~ /^%%cons:\w+%%[^\%]*$/) { # a site constant with something appended is also fine. } elsif ($hash->{$attr} =~ /%%/) { my $clean_var = sub { my ($mods, $prop) = @_; # HTML escape and kill line breaks $mods = "attr:$mods" unless $mods =~ /^(color|cons|siteroot|sitename|img):/ || $props->{$prop} =~ /[ud]/; return '%%' . $mods . $prop . '%%'; }; $hash->{$attr} =~ s/[\n\r]//g; $hash->{$attr} =~ s/%%([\w:]+:)?(\S+?)%%/$clean_var->(lc($1), $2)/eg; if ($attr =~ /^(href|src|lowsrc|style)$/) { $hash->{$attr} = "\%\%[attr[$hash->{$attr}]]\%\%"; } } } # remove specific attributes if (($remove_colors && ($attr eq "color" || $attr eq "bgcolor" || $attr eq "fgcolor" || $attr eq "text")) || ($remove_sizes && $attr eq "size") || ($remove_fonts && $attr eq "face")) { delete $hash->{$attr}; next ATTR; } } if (exists $hash->{href}) { ## links to some resources will be completely blocked ## and replaced by value of 'blocked_link_substitute' param if ($blocked_links) { foreach my $re (@$blocked_links) { if ($hash->{href} =~ $re) { $hash->{href} = sprintf($blocked_link_substitute, LJ::eurl($hash->{href})); last; } } } unless ($hash->{href} =~ s/^lj:(?:\/\/)?(.*)$/ExpandLJURL($1)/ei) { $hash->{href} = canonical_url($hash->{href}, 1); } } if ($tag eq "img") { $imagecount++; my $img_bad = 0; if ((defined $opts->{'maximgwidth'} && (! 
defined $hash->{'width'} || $hash->{'width'} > $opts->{'maximgwidth'})) # I replaced 1 to 33 temporarily # as an anti-macaque measure. Really stupid, in fact. - MV, Sept 2014 || (defined $hash->{'width'} && $hash->{'width'} <= 33)) # to avoid bombing with billion 1px images { $img_bad = 1; } if ((defined $opts->{'maximgheight'} && (! defined $hash->{'height'} || $hash->{'height'} > $opts->{'maximgheight'})) || (defined $hash->{'height'} && $hash->{'height'} <= 33)) { $img_bad = 1; } if ($opts->{'extractimages'}) { $img_bad = 1; } # anti-makaka: prohibit putting more than $MAXIMAGES images to comments my $MAXIMAGES = 5; # maximal number of images in comments # we should put this to ljconfig.pl! if (($imagecount > $MAXIMAGES) && $opts->{'maximages'}) { $img_bad = 1; } # remove img src="data:image/..." images $hash->{src} = canonical_url($hash->{src}, 1); if ("$hash->{src}" =~ "^data:") { $img_bad=1; $hash->{src} = "data:image is not allowed"; } # Anon and OpenID commenters are not allowed to post images if ($img_bad) { $newdata .= "{'src'}) . "\">" . LJ::img('placeholder') . ''; $alt_output = 1; $opencount{"img"}++; } } if ($tag eq "a" && $extractlinks) { push @canonical_urls, canonical_url($token->[2]->{href}, 1); $newdata .= ""; next; } # Through the xsl namespace in XML, it is possible to embed scripting lanaguages # as elements which will then be executed by the browser. Combining this with # customview.cgi makes it very easy for someone to replace their entire journal # in S1 with a page that embeds scripting as well. An example being an AJAX # six degrees tool, while cool it should not be allowed. 
# # Example syntax: # # text/javascript if ($tag eq 'xsl:attribute') { $alt_output = 1; # We'll always deal with output for this token my $orig_value = $p->get_text; # Get the value of this element my $value = $orig_value; # Make a copy if this turns out to be alright $value =~ s/\s+//g; # Remove any whitespace # See if they are trying to output scripting, if so eat the xsl:attribute # container and its value if ($value =~ /(javascript|vbscript)/i) { # Remove the closing tag from the tree $p->get_token; # Remove the value itself from the tree $p->get_text; # No harm, no foul...Write back out the original } else { $newdata .= "$token->[4]$orig_value"; } } unless ($alt_output) { my $allow; if ($mode eq "allow") { $allow = 1; if ($action{$tag} eq "deny") { $allow = 0; } } else { $allow = 0; if ($action{$tag} eq "allow") { $allow = 1; } } if ($allow && ! $remove{$tag}) { if ($opts->{'tablecheck'}) { $allow = 0 if # can't open table elements from outside a table ($tag =~ /^(?:tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/ && ! @tablescope) || # can't open td or th if not inside tr ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'}) || # can't open a table unless inside a td or th ($tag eq 'table' && @tablescope && ! grep { $tablescope[-1]->{$_} } qw(td th)); } if ($allow) { $newdata .= "<$tag"; } else { $newdata .= "<$tag"; } # output attributes in original order, but only those # that are allowed (by still being in %$hash after cleaning) foreach (@$attrs) { unless (LJ::is_ascii($hash->{$_})) { # FIXME: this is so ghetto. make faster. make generic. # HTML::Parser decodes entities for us (which is good) # but in Perl 5.8 also includes the "poison" SvUTF8 # flag on the scalar it returns, thus poisoning the # rest of the content this scalar is appended with. # we need to remove that poison at this point. *sigh* $hash->{$_} = LJ::no_utf8_flag($hash->{$_}); } $newdata .= " $_=\"" . LJ::ehtml($hash->{$_}) . 
"\"" if exists $hash->{$_}; } # ignore the effects of slashclose unless we're dealing with a tag that can # actually close itself. Otherwise, a tag like can pass through as valid # even though some browsers just render it as an opening tag if ($slashclose && $tag =~ $slashclose_tags) { $newdata .= " /"; $opencount{$tag}--; $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope; } if ($allow) { $newdata .= ">"; $opencount{$tag}++; # maintain current table scope if ($opts->{'tablecheck'}) { # open table if ($tag eq 'table') { push @tablescope, {}; # new tag within current table } elsif (@tablescope) { $tablescope[-1]->{$tag}++; } } } else { $newdata .= ">"; } } } } } # end tag elsif ($type eq "E") { my $tag = $token->[1]; next TOKEN if $tag =~ /[^\w\-:]/; if (@eatuntil) { push @capture, $token if $capturing_during_eat; if ($eatuntil[-1] eq $tag) { pop @eatuntil; if (my $cb = $capturing_during_eat) { $cb->(); $finish_capture->(); } next TOKEN; } next TOKEN if @eatuntil; } if ($eating_ljuser_span && $tag eq "span") { $eating_ljuser_span = 0; $newdata .= $opts->{'textonly'} ? $ljuser_text_node : LJ::ljuser($ljuser_text_node); next TOKEN; } my $allow; if ($tag eq "lj-raw") { $opencount{$tag}--; $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope; } elsif ($tag eq "lj-cut") { if ($opts->{'cutpreview'}) { $newdata .= "</lj-cut>"; } } elsif ($tag eq "ljr-href") { $newdata .= ""; $opencount{'a'}--; } else { if ($mode eq "allow") { $allow = 1; if ($action{$tag} eq "deny") { $allow = 0; } } else { $allow = 0; if ($action{$tag} eq "allow") { $allow = 1; } } if ($extractlinks && $tag eq "a") { if (@canonical_urls) { my $url = LJ::ehtml(pop @canonical_urls); $newdata .= " ($url)"; next; } } if ($allow && ! $remove{$tag}) { if ($opts->{'tablecheck'}) { $allow = 0 if # can't close table elements from outside a table ($tag =~ /^(?:table|tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/ && ! 
@tablescope) || # can't close td or th unless open tr ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'}); } if ($allow && ! ($opts->{'noearlyclose'} && ! $opencount{$tag})) { # maintain current table scope if ($opts->{'tablecheck'}) { # open table if ($tag eq 'table') { pop @tablescope; # closing tag within current table } elsif (@tablescope) { $tablescope[-1]->{$tag}--; } } $newdata .= ""; $opencount{$tag}--; } else { $newdata .= "</$tag>"; } } } } elsif ($type eq "D") { # remove everything past first closing tag $token->[1] =~ s/>.+/>/s; # kill any opening tag except the starting one $token->[1] =~ s/.[1]; } elsif ($type eq "T") { my %url = (); my $urlcount = 0; if (@eatuntil) { push @capture, $token if $capturing_during_eat; next TOKEN; } if ($eating_ljuser_span) { $ljuser_text_node = $token->[1]; next TOKEN; } if ($opencount{'style'} && $LJ::DEBUG{'s1_style_textnode'}) { my $r = Apache->request; my $uri = $r->uri; my $host = $r->header_in("Host"); warn "Got text node while style elements open. Shouldn't happen anymore. ($host$uri)\n"; } my $auto_format = $addbreaks && ($opencount{'table'} <= ($opencount{'td'} + $opencount{'th'})) && ! $opencount{'pre'} && ! $opencount{'lj-raw'}; if ($auto_format && ! $noautolinks && ! $opencount{'a'} && ! $opencount{'textarea'}) { my $match = sub { my $str = shift; if ($str =~ /^(.*?)(&(#39|quot|lt|gt)(;.*)?)$/) { $url{++$urlcount} = $1; return "&url$urlcount;$1&urlend;$2"; } else { $url{++$urlcount} = $str; return "&url$urlcount;$str&urlend;"; } }; $token->[1] =~ s!https?://[^\s\'\"\<\>]+[a-zA-Z0-9_/&=\-]! $match->($&); !ge; } # escape tags in text tokens. shouldn't belong here! # especially because the parser returns things it's # confused about (broken, ill-formed HTML) as text. $token->[1] =~ s/[1] =~ s/>/>/g; # put tags into long words, except inside

 and .
            if ($wordlength && !$opencount{'pre'} && !$opencount{'textarea'}) {
                $token->[1] =~ s/\S{$wordlength,}/break_word($&,$wordlength)/eg;                
            } 

            # auto-format things, unless we're in a textarea, when it doesn't make sense
            if ($auto_format && !$opencount{'textarea'}) {
                $token->[1] =~ s/\r?\n/<br \/>/g;
                if (! $opencount{'a'}) {
                    $token->[1] =~ s/&url(\d+);(.*?)&urlend;/<a href=\"$url{$1}\">$2<\/a>/g;
                }
            }

            $newdata .= $token->[1];
        } 
        elsif ($type eq "C") {

            # probably a malformed tag rather than a comment, so escape it
            # -- ehtml things like "<3", "<--->", "<>", etc
            # -- comments must start with <! to be eaten
            if ($token->[1] =~ /^<[^!]/) {
                $newdata .= LJ::ehtml($token->[1]);

            # by default, ditch comments
            } elsif ($keepcomments) {
                my $com = $token->[1];
                $com =~ s/^<!--\s*//;
                $com =~ s/\s*--!>$//;
                $com =~ s/<!--//;
                $com =~ s/-->//;
                $newdata .= "<!-- $com -->";
            }
        }
        elsif ($type eq "PI") {
            my $tok = $token->[1];
            $tok =~ s/</</g;
            $tok =~ s/>/>/g;
            $newdata .= "<?$tok>";
        }
        else {
            $newdata .= "<!-- OTHER: " . $type . "-->\n";
        }
    } # end while

    # finish up open links if we're extracting them
    if ($extractlinks && @canonical_urls) {
        while (my $url = LJ::ehtml(pop @canonical_urls)) {
            $newdata .= "</b> ($url)";
            $opencount{'a'}--;
        }
    }

    # close any tags that were opened and not closed
    # don't close tags that don't need a closing tag -- otherwise,
    # we output the closing tags in the wrong place (eg, a </td>
    # after the <table> was closed) causing unnecessary problems
    if (ref $opts->{'autoclose'} eq "ARRAY") {
        foreach my $tag (@{$opts->{'autoclose'}}) {
            next if $tag =~ /^(?:tr|td|th|tbody|thead|tfoot|li)$/;
            if ($opencount{$tag}) {
                $newdata .= "</$tag>" x $opencount{$tag};
            }
        }
    }
    
    # extra-paranoid check
    1 while $newdata =~ s/<script\b//ig;

    $$data = $newdata;
    $$data .= $extra_text if $extra_text; # invalid markup error

    return 0;
}


# Rewrites relative URLs inside an HTML fragment as absolute URLs.
#
#   $data - scalarref holding the HTML; it is overwritten only when at
#           least one URL was actually rewritten
#   $base - base URL that relative references are resolved against
#
# Only the href of <a> and the src of <img> are examined.  A value is
# treated as relative when it does not start with a lowercase "scheme:"
# prefix.  Start tags that needed no change are copied through verbatim;
# changed tags are rebuilt from their parsed attributes.  Always returns
# undef.
sub resolve_relative_urls
{
    my ($data, $base) = @_;
    my $parser = HTML::TokeParser->new($data);

    # tag name => attributes of that tag that may carry a relative URL
    my %url_attrs_of = (
        'a'   => [ 'href' ],
        'img' => [ 'src' ],
    );

    my $changed_any = 0;
    my $base_uri;         # URI object for $base, created on first use
    my $out = "";

    while (my $token = $parser->get_token) {
        my $type = $token->[0];

        # end tags and processing instructions: original text is in slot 2
        if ($type eq "E" || $type eq "PI") {
            $out .= $token->[2];
            next;
        }

        # declarations, text and comments: original text is in slot 1
        if ($type eq "D" || $type eq "T" || $type eq "C") {
            $out .= $token->[1];
            next;
        }

        # anything that is not a start tag at this point is dropped
        next unless $type eq "S";

        # start tag: name, attribute hashref, attribute names in source
        # order, and the tag's original text
        my ($tag, $attr_val, $attr_order, $raw) = @{$token}[1 .. 4];

        my $touched = 0;
        foreach my $name (@{ $url_attrs_of{$tag} || [] }) {
            my $value = $attr_val->{$name};
            next if !defined $value || $value =~ /^[a-z]+:/;

            $changed_any = $touched = 1;
            $base_uri ||= URI->new($base);
            $attr_val->{$name} = URI->new_abs($value, $base_uri)->as_string;
        }

        # untouched tags go back out exactly as they came in
        if (!$touched) {
            $out .= $raw;
            next;
        }

        # for tags like <name/>, treat it as <name> and put the slash back
        # when the tag is closed below; both checks must run, since the
        # second one also removes the pseudo-attribute from the hash
        my $slash_in_name = ($tag =~ s!/$!!);
        my $slash_as_attr = delete $attr_val->{'/'};
        my $self_closing  = ($slash_in_name || $slash_as_attr) ? 1 : 0;

        # rebuild the tag, keeping attributes in their original order
        my $rebuilt = "<$tag";
        foreach my $name (@$attr_order) {
            next unless exists $attr_val->{$name};
            $rebuilt .= " $name=\"" . LJ::ehtml($attr_val->{$name}) . "\"";
        }
        $rebuilt .= " /" if $self_closing;
        $out .= $rebuilt . ">";
    }

    $$data = $out if $changed_any;
    return undef;
}

# Expands the path part of an "lj:" pseudo-URL into a full site URL.
#
# The argument is a slash-separated path.  Empty (or otherwise false)
# segments are discarded; the first remaining segment selects the kind of
# page and the rest are its arguments, e.g. "faq/12" or
# "user/bob/calendar".  The result is always "$LJ::SITEROOT/" followed by
# the site-relative path; an unrecognized kind gives
# "$LJ::SITEROOT/error:bogus-lj-url".
sub ExpandLJURL
{
    my ($mode, @args) = grep { $_ } split(/\//, $_[0]);

    # kinds taking an optional numeric id:
    #   [ prefix the id is appended to, page used when there is no id ]
    my %id_page_for = (
        'faq'     => [ "support/faqbrowse.bml?faqid=", "support/faq.bml" ],
        'support' => [ "support/see_request.bml?id=",  "support/" ],
    );

    # kinds taking an optional username: the page, with "?user=" added
    # when LJ::canonical_username returns a true value
    my %user_page_for = (
        'memories' => "memories.bml",
        'pubkey'   => "pubkey.bml",
        'todo'     => "todo/",
        'userinfo' => "userinfo.bml",
        'userpics' => "allpics.bml",
    );

    my $uri;
    if (exists $id_page_for{$mode}) {
        my ($with_id, $without_id) = @{ $id_page_for{$mode} };
        my $id = shift(@args) + 0;
        $uri = $id ? $with_id . $id : $without_id;
    }
    elsif (exists $user_page_for{$mode}) {
        my $page = $user_page_for{$mode};
        my $user = LJ::canonical_username(shift @args);
        $uri = $user ? $page . "?user=" . $user : $page;
    }
    elsif ($mode eq 'user') {
        my $user = LJ::canonical_username(shift @args);

        if (grep { /[\"\'\<\>\n\&]/ } @args) {
            # markup-significant characters in the trailing path:
            # link to the bare site root instead
            $uri = "";
        }
        elsif ($args[0] eq 'profile') {
            $uri = "userinfo.bml?user=$user";
        }
        else {
            $uri = "users/$user/" . join("", map { "$_/" } @args);
        }
    }
    else {
        $uri = "error:bogus-lj-url";
    }

    return "$LJ::SITEROOT/$uri";
}

# Tag lists handed to clean() when sanitizing subject lines
# ('eat', 'allow' and 'remove' options respectively).
my $subject_eat    = [ qw(head title style layer iframe applet object param) ];
my $subject_allow  = [ qw(a b i u em strong cite) ];
my $subject_remove = [ qw(bgsound embed object caption link font noscript) ];

# Sanitizes a subject line in place.
#
#   $ref - scalarref to the subject text
#
# Subjects containing neither "<" nor ">" are left untouched and the sub
# returns nothing.  Otherwise the text is run through clean() in 'deny'
# mode with the subject tag lists above, and clean()'s result is returned.
sub clean_subject
{
    my ($ref) = @_;
    return if $$ref !~ /[\<\>]/;

    my %cleaner_opts = (
        'mode'         => 'deny',
        'eat'          => $subject_eat,
        'allow'        => $subject_allow,
        'remove'       => $subject_remove,
        'autoclose'    => $subject_allow,
        'noearlyclose' => 1,
        'wordlength'   => 40,
        'addbreaks'    => 0,
    );
    return clean($ref, \%cleaner_opts);
}

## Reduces a subject to pure text (needed in links, email headers, etc...).
##
##   $ref - scalarref to the subject text, cleaned in place
##
## Like clean_subject, but no 'allow' list is passed and 'textonly' is set.
## Subjects containing neither "<" nor ">" are left untouched and the sub
## returns nothing; otherwise clean()'s result is returned.
my $subjectall_eat = [ qw(head title style layer iframe applet object) ];
sub clean_subject_all
{
    my ($ref) = @_;
    return if $$ref !~ /[\<\>]/;

    my %cleaner_opts = (
        'mode'         => 'deny',
        'textonly'     => 1,
        'eat'          => $subjectall_eat,
        'autoclose'    => $subject_allow,
        'noearlyclose' => 1,
        'wordlength'   => 40,
        'addbreaks'    => 0,
    );
    return clean($ref, \%cleaner_opts);
}

# Cleans a subject down to text via clean_subject_all, keeps only its
# first line, and trims it with LJ::text_trim.
#
#   $ref    - scalarref to the subject, modified in place
#   $length - limit passed to LJ::text_trim as its third argument;
#             defaults to 40 when omitted or false
#
# Returns the trimmed subject.
sub clean_and_trim_subject {
    my ($ref, $length) = @_;
    $length ||= 40;

    LJ::CleanHTML::clean_subject_all($ref);

    # drop everything from the first newline onwards
    $$ref =~ s/\n.*//s;

    return $$ref = LJ::text_trim($$ref, 0, $length);
}

# Tag lists shared by the clean_* wrappers below; each is handed to clean()
# under the option name noted.

# 'eat' and 'remove' lists used by clean_event.
# NOTE(review): presumably 'eat' discards a tag together with its content
# while 'remove' discards only the tag itself -- confirm against clean().
my $event_eat = [qw[head title style layer iframe applet object xml param]];
my $event_remove = [qw[bgsound embed object link body meta noscript plaintext noframes]];

# 'autoclose' list used by clean_event and clean_comment: tags that get a
# closing tag appended if they are still open at the end of the text.
my @comment_close = qw(
    a sub sup xmp bdo q span
    b i u tt s strike big small font
    abbr acronym cite code dfn em kbd samp strong var del ins
    h1 h2 h3 h4 h5 h6 div blockquote address pre center
    ul ol li dl dt dd
    table tr td th tbody tfoot thead colgroup caption
    marquee area map form textarea blink
);
# 'allow' list used by clean_comment (also returned by
# get_okay_comment_tags): everything above plus a few tags that are not
# in the autoclose list.
my @comment_all = (@comment_close, "img", "br", "hr", "p", "col");

# clean_userbio uses the same lists as events/comments.  The two scalars
# point at the very same arrays as the $event_* ones; @userbio_close is a copy.
my $userbio_eat = $event_eat;
my $userbio_remove = $event_remove;
my @userbio_close = @comment_close;

# Cleans the body of a journal entry in place.
#
#   $ref  - scalarref to the entry text
#   $opts - hashref of options, or (old calling convention) a plain
#           scalar that is taken as the 'preformatted' flag
#
# Options read here: preformatted, wordlength (default 40), cuturl,
# cutpreview, maximgwidth, maximgheight, ljcut_disable, extractimages,
# noexpandembedded, textonly, remove_colors, remove_sizes, remove_fonts,
# transform_embed_nocheck, transform_embed_wmode.
#
# Text with no "<", ">" or "http" in it (and not preformatted) takes a
# fast path that only breaks long words and converts newlines to <br />,
# returning nothing.  Everything else is handed to clean(), whose result
# is returned.
sub clean_event
{
    my ($ref, $opts) = @_;

    # old prototype passed the preformatted flag directly; wrap it so the
    # rest of the sub can treat $opts as a hashref
    $opts = { 'preformatted' => $opts } unless ref $opts eq "HASH";

    my $wordlength = defined $opts->{'wordlength'} ? $opts->{'wordlength'} : 40;

    # fast path:  no markup or URLs to linkify
    my $needs_cleaner = $opts->{preformatted} || $$ref =~ /\<|\>|http/;
    unless ($needs_cleaner) {
        if ($wordlength) {
            $$ref =~ s/\S{$wordlength,}/break_word($&,$wordlength)/eg;
        }
        $$ref =~ s/\r?\n/<br \/>/g;
        return;
    }

    # slow path: assemble the option set for the full cleaner
    my %cleaner_opts = (
        'linkify'      => 1,
        'wordlength'   => $wordlength,
        'addbreaks'    => $opts->{'preformatted'} ? 0 : 1,
        'eat'          => $event_eat,
        'mode'         => 'allow',
        'remove'       => $event_remove,
        'autoclose'    => \@comment_close,
        'cleancss'     => 1,
        'noearlyclose' => 1,
        'tablecheck'   => 1,
    );

    # options handed through unchanged
    foreach my $name (qw(cuturl cutpreview maximgwidth maximgheight
                         ljcut_disable transform_embed_wmode)) {
        $cleaner_opts{$name} = $opts->{$name};
    }

    # options normalized to a strict 1/0 flag
    foreach my $name (qw(extractimages noexpandembedded textonly
                         remove_colors remove_sizes remove_fonts
                         transform_embed_nocheck)) {
        $cleaner_opts{$name} = $opts->{$name} ? 1 : 0;
    }

    return clean($ref, \%cleaner_opts);
}

# Returns the tags that clean_comment passes to clean() as its 'allow'
# list (in scalar context: how many there are).
sub get_okay_comment_tags
{
    my @okay_tags = @comment_all;
    return @okay_tags;
}


# Cleans the text of a comment in place.
#
# ref:   scalarref of text to clean
# opts:  either a hashref of options:
#         - preformatted:  if true, don't insert breaks and auto-linkify
#         - anon_comment:  passed on as the 'extractlinks', 'extractimages'
#                          and 'anonhtml' options of clean()
#         - nocss, textonly:  passed on to clean()
#       or a plain (non-reference) scalar, which is taken as the
#       preformatted flag
#
# Returns 0 from the fast path, otherwise whatever clean() returns.
sub clean_comment
{
    my ($ref, $opts) = @_;

    # old-style call: second argument is just the preformatted flag
    $opts = { 'preformatted' => $opts } unless ref $opts;

    # fast path:  no markup or URLs to linkify
    my $needs_cleaner = $opts->{preformatted} || $$ref =~ /\<|\>|http/;
    unless ($needs_cleaner) {
        $$ref =~ s/\S{40,}/break_word($&,40)/eg;
        $$ref =~ s/\r?\n/<br \/>/g;
        return 0;
    }

    # slow path: run it through the full cleaner
    my $anon = $opts->{'anon_comment'};
    my %cleaner_opts = (
        'linkify'       => 1,
        'wordlength'    => 40,
        'addbreaks'     => $opts->{preformatted} ? 0 : 1,
        'eat'           => [qw[head title style layer iframe applet object]],
        'mode'          => 'deny',
        'allow'         => \@comment_all,
        'autoclose'     => \@comment_close,
        'cleancss'      => 1,
        'extractlinks'  => $anon,
        'extractimages' => $anon,
        'maximages'     => 1,      # added in Aug 2014, antimakaka measure - MV
        'anonhtml'      => $anon,  # added Nov 2014, antimakaka - MV
        'noearlyclose'  => 1,
        'tablecheck'    => 1,
        'nocss'         => $opts->{'nocss'},
        'textonly'      => $opts->{'textonly'} ? 1 : 0,
    );
    return clean($ref, \%cleaner_opts);
}

# Cleans a user's bio text in place.
#
#   $ref - scalarref to the bio text
#
# Returns undef when $ref is not a reference; otherwise returns whatever
# clean() returns.  Uses the userbio tag lists and asks clean() to strip
# the 'style' attribute.
sub clean_userbio {
    my ($ref) = @_;
    ref $ref or return undef;

    my %cleaner_opts = (
        'mode'         => 'allow',
        'eat'          => $userbio_eat,
        'remove'       => $userbio_remove,
        'autoclose'    => \@userbio_close,
        'attrstrip'    => [qw[style]],
        'cleancss'     => 1,
        'wordlength'   => 100,
        'addbreaks'    => 1,
        'noearlyclose' => 1,
        'tablecheck'   => 1,
    );
    return clean($ref, \%cleaner_opts);
}

# Cleans every variable of an S1 style and returns the result frozen.
#
#   $s1 - the S1 style source text
#
# The text is split into variables with LJ::parse_vars, each variable's
# value is run through clean() (with its name passed as 's1var'), and the
# resulting name => value hash is returned serialized by
# Storable::nfreeze.
sub clean_s1_style
{
    my ($s1) = @_;

    my %value_of;
    LJ::parse_vars(\$s1, \%value_of);

    foreach my $name (keys %value_of) {
        my %cleaner_opts = (
            'eat'          => [qw[layer iframe script object embed applet]],
            'mode'         => 'allow',
            'keepcomments' => 1,      # allows CSS to work
            'clean_js_css' => 1,
            's1var'        => $name,
        );
        clean(\$value_of{$name}, \%cleaner_opts);
    }

    return Storable::nfreeze(\%value_of);
}

# Makes a value safe to place inside an HTML attribute of an S1 style.
#
#   $_[0] - the raw attribute value (the caller's variable is not modified)
#
# Tabs and newlines are removed, and the characters " ' < > are replaced
# by the entities &quot; &#39; &lt; &gt;.  If the result contains a
# "javascript:", "vbscript:" or "about:" scheme -- even with whitespace
# scattered between the letters -- the empty string is returned instead.
sub s1_attribute_clean {
    my $value = $_[0];

    $value =~ s/[\t\n]//g;

    # escape the characters that could close the attribute or open a tag
    $value =~ s/\"/&quot;/g;
    $value =~ s/\'/&\#39;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;

    # IE sucks: it accepts script schemes with whitespace between letters
    if ($value =~ /((?:(?:v\s*b)|(?:j\s*a\s*v\s*a))\s*s\s*c\s*r\s*i\s*p\s*t|
                a\s*b\s*o\s*u\s*t)\s*:/ix) { return ""; }
    return $value;
}

# Normalizes a URL.
#
#   $url       - the URL text
#   $allow_all - if true, keep whatever scheme the URL has; if false, the
#                URL is rebuilt with one of http, https, ftp or webcal
#                (http when it has none of those)
#
# Surrounding whitespace is always trimmed.  Returns '' for an empty (or
# otherwise false) URL, and for one that has nothing left after its
# scheme is stripped.
sub canonical_url {
    my ($url, $allow_all) = @_;

    # strip leading and trailing spaces
    $url =~ s/^\s+//;
    $url =~ s/\s+$//;

    return '' unless $url;

    if (! $allow_all) {
        # keep a whitelisted protocol, default to http
        my $scheme = ($url =~ /^(https?|ftp|webcal):/) ? $1 : "http";

        # whatever follows the first "scheme:" (and any slashes after it)
        (my $rest = $url) =~ s!^.*?:/*!!;
        return '' unless $rest;

        # rebuild safe url
        $url = "$scheme://$rest";
    }

    # aol blocks http referred from lj, but ftp has no referer header.
    if ($LJ::DEBUG{'aol_http_to_ftp'}
        && $url =~ m!^http://(?:www\.)?(?:members|hometown|users)\.aol\.com/!) {
        $url =~ s!^http!ftp!;
    }

    return $url;
}

# Inserts "<wbr />" break opportunities into a long word.
#
#   $word - the text to break up
#   $at   - number of $onechar units allowed between breaks; if false,
#           the word is returned unchanged
#
# A break is only inserted where the run of $at units is not followed by
# a word boundary (\B).  Returns the resulting string.
sub break_word {
    my ($word, $at) = @_;

    if ($at) {
        $word =~ s/((?:$onechar){$at})\B/$1<wbr \/>/g;
    }

    return $word;
}

1;