package LJ;

use strict;
no warnings 'uninitialized';

use Class::Autouse qw( LJ::ConvUTF8 HTML::TokeParser );

# <LJFUNC>
# name: LJ::trim
# class: text
# des: Removes whitespace from left and right side of a string.
# args: string
# des-string: string to be trimmed
# returns: trimmed string
# </LJFUNC>
sub trim {
    my $a = $_[0];
    $a =~ s/^\s+//;
    $a =~ s/\s+$//;
    return $a;
}

# <LJFUNC>
# name: LJ::decode_url_string
# class: web
# des: Parse URL-style arg/value pairs into a hash.
# args: buffer, hashref
# des-buffer: Scalar or scalarref of buffer to parse.
# des-hashref: Hashref to populate.
# returns: boolean; true.
# </LJFUNC>
sub decode_url_string {
    my $a = shift;
    my $buffer = ref $a ? $a : \$a;
    my $hashref = shift;        # output hash
    my $keyref  = shift || [];  # array of keys as they were found

    my $pair;
    my @pairs = split(/&/, $$buffer);
    @$keyref = @pairs;
    my ($name, $value);
    foreach $pair (@pairs) {
        ($name, $value) = split(/=/, $pair);
        $value =~ tr/+/ /;
        $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        $name =~ tr/+/ /;
        $name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        $hashref->{$name} .= $hashref->{$name} ? "\0$value" : $value;
    }
    return 1;
}

# args: hashref of key/values
#       arrayref of keys in order (optional)
# returns: urlencoded string
sub encode_url_string {
    my ($hashref, $keyref) = @_;
    return join('&',
                map { LJ::eurl($_) . '=' . LJ::eurl($hashref->{$_}) }
                (ref $keyref ? @$keyref : keys %$hashref));
}

# <LJFUNC>
# name: LJ::eurl
# class: text
# des: Escapes a value before it can be put in a URL. See also [func[LJ::durl]].
# args: string
# des-string: string to be escaped
# returns: string escaped
# </LJFUNC>
sub eurl {
    my $a = $_[0];
    return '' unless $a;
    $a =~ s/([^a-zA-Z0-9_\,\-.\/\\\: ])/uc sprintf("%%%02x",ord($1))/eg;
    $a =~ tr/ /+/;
    return $a;
}

# <LJFUNC>
# name: LJ::durl
# class: text
# des: Decodes a value that's URL-escaped. See also [func[LJ::eurl]].
# args: string
# des-string: string to be decoded
# returns: string decoded
# </LJFUNC>
sub durl {
    my ($a) = @_;
    $a =~ tr/+/ /;
    $a =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
    return $a;
}

# <LJFUNC>
# name: LJ::exml
# class: text
# des: Escapes a value before it can be put in XML.
# args: string
# des-string: string to be escaped
# returns: string escaped.
# </LJFUNC>
sub exml {
    # fast path for the common case:
    return $_[0] unless $_[0] =~ /[&\"\'<>\x00-\x08\x0B\x0C\x0E-\x1F]/;

    # what are those character ranges?  XML 1.0 allows:
    # #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]

    my $a = shift;
    $a =~ s/\&/&amp;/g;
    $a =~ s/\"/&quot;/g;
    $a =~ s/\'/&apos;/g;
    $a =~ s/</&lt;/g;
    $a =~ s/>/&gt;/g;
    $a =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g;
    return $a;
}

# <LJFUNC>
# name: LJ::ehtml
# class: text
# des: Escapes a value before it can be put in HTML.
# args: string
# des-string: string to be escaped
# returns: string escaped.
# </LJFUNC>
sub ehtml {
    # fast path for the common case:
    return $_[0] unless $_[0] =~ /[&\"\'<>]/;

    # this is faster than doing one substitution with a map:
    my $a = $_[0];
    $a =~ s/\&/&amp;/g;
    $a =~ s/\"/&quot;/g;
    $a =~ s/\'/&\#39;/g;
    $a =~ s/</&lt;/g;
    $a =~ s/>/&gt;/g;
    return $a;
}
*eall = \&ehtml;  # old BML syntax required eall to also escape BML.  not anymore.

# <LJFUNC>
# name: LJ::etags
# class: text
# des: Escapes < and > from a string.
# args: string
# des-string: string to be escaped
# returns: string escaped.
# </LJFUNC>
sub etags {
    # fast path for the common case:
    return $_[0] unless $_[0] =~ /[<>]/;

    my $a = $_[0];
    $a =~ s/</&lt;/g;
    $a =~ s/>/&gt;/g;
    return $a;
}
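# Usage sketch for the arg/value and escaping helpers above (illustrative
# values only, not part of the library):
#
#   my %args;
#   LJ::decode_url_string("mode=edit&tag=cats&tag=dogs", \%args);
#   # $args{mode} eq "edit"; repeated keys are joined with "\0": $args{tag} eq "cats\0dogs"
#
#   LJ::eurl("a b&c")        # returns "a+b%26c"; LJ::durl() reverses it
#   LJ::ehtml('<b>"hi"</b>') # returns "&lt;b&gt;&quot;hi&quot;&lt;/b&gt;"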
# <LJFUNC>
# name: LJ::ejs
# class: text
# des: Escapes a string value before it can be put in JavaScript.
# args: string
# des-string: string to be escaped
# returns: string escaped.
# </LJFUNC>
sub ejs {
    my $a = $_[0];
    $a =~ s/[\"\'\\]/\\$&/g;
    $a =~ s/&quot;/\\&quot;/g;
    $a =~ s/\r?\n/\\n/gs;
    $a =~ s/\r//gs;
    return $a;
}

# given a string, makes it into a string you can put into javascript,
# including protecting against closing </script> tags in the entry.
# does the double quotes for ya.
sub ejs_string {
    my $str = ejs($_[0]);
    $str =~ s!</!<\\/!g;          # keep a literal "</script>" from closing the script block
    $str =~ s/\<([^\<])+\>//g;    # strip any remaining tags
    return '"' . $str . '"';
}

# <LJFUNC>
# name: LJ::is_ascii
# des: checks if text is pure ASCII.
# args: text
# des-text: text to check for being pure 7-bit ASCII text.
# returns: 1 if text is indeed pure 7-bit, 0 otherwise.
# </LJFUNC>
sub is_ascii {
    my $text = shift;
    return ($text !~ m/[^\x01-\x7f]/);
}

# <LJFUNC>
# name: LJ::is_utf8
# des: check text for UTF-8 validity.
# args: text
# des-text: text to check for UTF-8 validity
# returns: 1 if text is a valid UTF-8 stream, 0 otherwise.
# </LJFUNC>
sub is_utf8 {
    my $text = shift;

    # some callers pass a reference instead of a plain scalar;
    # tolerate that by checking the values it points to
    if (ref($text) eq "HASH") {
        return ! (grep { !LJ::is_utf8($_) } values %{$text});
    } elsif (ref($text) eq "ARRAY") {
        return ! (grep { !LJ::is_utf8($_) } @{$text});
    } elsif (ref($text)) {
        return 0;
    }

    if (LJ::are_hooks("is_utf8")) {
        return LJ::run_hook("is_utf8", $text);
    }

    # for a discussion of the different utf8 validity checking methods,
    # see: http://zilla.livejournal.org/657
    # in summary, this isn't the fastest, but it's pretty fast, it doesn't make
    # perl segfault, and it doesn't add new crazy dependencies.  if you want
    # speed, check out ljcom's is_utf8 version in C, using Inline.pm

    return 1 unless defined($text);

    my $u = Unicode::String::utf8($text);
    my $text2 = $u->utf8;

    # return $text eq $text2;
    return LJ::is_ascii($text) || utf8::is_utf8($text) || $text eq $text2;
}

# <LJFUNC>
# name: LJ::text_out
# des: force outgoing text into valid UTF-8.
# args: text
# des-text: reference to text to pass to output.  Text is modified in-place.
# returns: nothing.
# </LJFUNC>
sub text_out {
    my $rtext = shift;

    # if we're not Unicode, do nothing
    return unless $LJ::UNICODE;

    # is this valid UTF-8 already?
    return if LJ::is_utf8($$rtext);

    # no. Blot out all non-ASCII chars
    $$rtext =~ s/[\x00\x80-\xff]/\?/g;

    return;
}

# <LJFUNC>
# name: LJ::text_in
# des: do appropriate checks on input text.  Should be called on all
#      user-generated text.
# args: text
# des-text: text to check
# returns: 1 if the text is valid, 0 if not.
# </LJFUNC>
sub text_in {
    my $text = shift;
    return 1 unless $LJ::UNICODE;
    if (ref($text) eq "HASH") {
        return ! (grep { !LJ::is_utf8($_) } values %{$text});
    }
    if (ref($text) eq "ARRAY") {
        return ! (grep { !LJ::is_utf8($_) } @{$text});
    }
    return LJ::is_utf8($text);
}
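# Usage sketch for the UTF-8 helpers above (illustrative; assumes $LJ::UNICODE
# is enabled in the site config):
#
#   my $ok = LJ::text_in($form_value);   # 1 if the submitted text is valid UTF-8
#   LJ::text_out(\$stored_text);         # scrubs invalid bytes in place, e.g. a
#                                        # latin-1 "caf\xe9" becomes "caf?"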
# <LJFUNC>
# name: LJ::text_convert
# des: convert old entries/comments to UTF-8 using user's default encoding.
# args: dbs?, text, u, error
# des-dbs: optional.  Deprecated; a master/slave set of database handles.
# des-text: old possibly non-ASCII text to convert
# des-u: user hashref of the journal's owner
# des-error: ref to a scalar variable which is set to 1 on error
#            (when user has no default encoding defined, but
#            text needs to be translated).
# returns: converted text or undef on error
# </LJFUNC>
sub text_convert {
    &nodb;
    my ($text, $u, $error) = @_;

    # maybe it's pure ASCII?
    return $text if LJ::is_ascii($text);

    # load encoding id->name mapping if it's not loaded yet
    LJ::load_codes({ "encoding" => \%LJ::CACHE_ENCODINGS })
        unless %LJ::CACHE_ENCODINGS;

    if ($u->{'oldenc'} == 0 ||
        not defined $LJ::CACHE_ENCODINGS{$u->{'oldenc'}}) {
        $$error = 1;
        return undef;
    }

    # convert!
    my $name = $LJ::CACHE_ENCODINGS{$u->{'oldenc'}};
    unless (LJ::ConvUTF8->supported_charset($name)) {
        $$error = 1;
        return undef;
    }

    return LJ::ConvUTF8->to_utf8($name, $text);
}

# <LJFUNC>
# name: LJ::text_length
# des: returns both byte length and character length of a string.  In a non-Unicode
#      environment, this means byte length twice.  In a Unicode environment,
#      the function assumes that its argument is a valid UTF-8 string.
# args: text
# des-text: the string to measure
# returns: a list of two values, (byte_length, char_length).
# </LJFUNC>
sub text_length {
    my $text = shift;
    my $bl = length($text);
    unless ($LJ::UNICODE) {
        return ($bl, $bl);
    }
    my $cl = 0;
    my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";

    while ($text =~ m/$utf_char/go) { $cl++; }
    return ($bl, $cl);
}

# <LJFUNC>
# name: LJ::text_trim
# des: truncate string according to requirements on byte length, char
#      length, or both. "char length" means number of UTF-8 characters if
#      [ljconfig[unicode]] is set, or the same thing as byte length otherwise.
# args: text, byte_max, char_max
# des-text: the string to trim
# des-byte_max: maximum allowed length in bytes; if 0, there's no restriction
# des-char_max: maximum allowed length in chars; if 0, there's no restriction
# returns: the truncated string.
# </LJFUNC>
sub text_trim {
    my ($text, $byte_max, $char_max) = @_;
    return $text unless $byte_max or $char_max;

    if (!$LJ::UNICODE) {
        $byte_max = $char_max if $char_max and $char_max < $byte_max;
        $byte_max = $char_max unless $byte_max;
        return substr($text, 0, $byte_max);
    }

    # after upgrade to perl 5.12 we can use perl subroutines to handle unicode
    # (need to convert all the input to unicode as soon as it enters the system)
    return substr($text, 0, $char_max);

    # my $cur = 0;
    # my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
    #
    # # if we don't have a character limit, assume it's the same as the byte limit.
    # # we will never have more characters than bytes, but we might have more bytes
    # # than characters, so we can't inherit the other way.
    # $char_max ||= $byte_max;
    #
    # while ($text =~ m/$utf_char/gco) {
    #     last unless $char_max;
    #     last if $cur + length($1) > $byte_max and $byte_max;
    #     $cur += length($1);
    #     $char_max--;
    # }
    # return substr($text, 0, $cur);
}

# <LJFUNC>
# name: LJ::text_compress
# des: Compresses a chunk of text, to gzip, if configured for site.  Can compress
#      a scalarref in place, or return a compressed copy.  Won't compress if
#      value is too small, already compressed, or size would grow by compressing.
# args: text
# des-text: either a scalar or scalarref
# returns: nothing if given a scalarref (to compress in-place), or original/compressed value,
#          depending on site config.
# </LJFUNC>
sub text_compress {
    my $text = shift;
    my $ref = ref $text;
    return $ref ? undef : $text unless $LJ::COMPRESS_TEXT;
    die "Invalid reference" if $ref && $ref ne "SCALAR";

    my $tref = $ref ? $text : \$text;
    my $pre_len = length($$tref);
    unless (substr($$tref, 0, 2) eq "\037\213" || $pre_len < 100) {
        my $gz = Compress::Zlib::memGzip($$tref);
        if (length($gz) < $pre_len) {
            $$tref = $gz;
        }
    }

    return $ref ? undef : $$tref;
}
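# Usage sketch for text_trim/text_compress (illustrative; assumes $LJ::UNICODE
# and $LJ::COMPRESS_TEXT are enabled in the site config):
#
#   # trim a subject to column limits (byte_max, char_max):
#   my $subject = LJ::text_trim($raw_subject, 255, 100);
#
#   my $event = "x" x 5000;
#   LJ::text_compress(\$event);                 # scalarref: gzipped in place, returns nothing
#   my $copy = LJ::text_compress("x" x 5000);   # plain scalar: returns the compressed copy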
# <LJFUNC>
# name: LJ::text_uncompress
# des: Uncompresses a chunk of text, from gzip, if configured for site.  Can uncompress
#      a scalarref in place, or return an uncompressed copy.  Won't uncompress unless
#      it finds the gzip magic number at the beginning of the text.
# args: text
# des-text: either a scalar or scalarref.
# returns: nothing if given a scalarref (to uncompress in-place), or original/uncompressed value,
#          depending on whether the text was compressed or not.
# </LJFUNC>
sub text_uncompress {
    my $text = shift;
    my $ref = ref $text;
    die "Invalid reference" if $ref && $ref ne "SCALAR";
    my $tref = $ref ? $text : \$text;

    # check for gzip's magic number
    if (substr($$tref, 0, 2) eq "\037\213") {
        $$tref = Compress::Zlib::memGunzip($$tref);
    }

    return $ref ? undef : $$tref;
}

# function to trim a string containing HTML.  this will auto-close any
# html tags that were still open when the string was truncated
sub html_trim {
    my ($text, $char_max) = @_;
    return $text unless $char_max;

    my $p = HTML::TokeParser->new(\$text);
    my @open_tags;  # keep track of what tags are open
    my $out = '';
    my $content_len = 0;

  TOKEN:
    while (my $token = $p->get_token) {
        my $type = $token->[0];
        my $tag  = $token->[1];
        my $attr = $token->[2];  # hashref

        if ($type eq "S") {
            my $selfclose;

            # start tag
            $out .= "<$tag";

            # assume tags are properly self-closed
            $selfclose = 1 if lc $tag eq 'input' || lc $tag eq 'br' || lc $tag eq 'img';

            # preserve order of attributes.  the original order is
            # in element 4 of $token
            foreach my $attrname (@{$token->[3]}) {
                if ($attrname eq '/') {
                    $selfclose = 1;
                    next;
                }

                # FIXME: ultra ghetto.
                $attr->{$attrname} = LJ::no_utf8_flag($attr->{$attrname});

                $out .= " $attrname=\"" . LJ::ehtml($attr->{$attrname}) . "\"";
            }
            $out .= $selfclose ? " />" : ">";

            push @open_tags, $tag unless $selfclose;

        } elsif ($type eq 'T' || $type eq 'D') {
            my $content = $token->[1];
            if (length($content) + $content_len > $char_max) {
                # truncate and stop parsing
                $content = LJ::text_trim($content, undef, ($char_max - $content_len));
                $out .= $content;
                last;
            }

            $content_len += length $content;
            $out .= $content;

        } elsif ($type eq 'C') {
            # comment, don't care
            $out .= $token->[1];

        } elsif ($type eq 'E') {
            # end tag
            pop @open_tags;
            $out .= "</$tag>";
        }
    }

    # close anything still open when we stopped
    $out .= join("\n", map { "</$_>" } reverse @open_tags);

    return $out;
}

# takes a number, inserts commas where needed
sub commafy {
    my $number = shift;
    return $number unless $number =~ /^\d+$/;

    my $punc = LJ::Lang::ml('number.punctuation') || ",";
    $number =~ s/(?<=\d)(?=(\d\d\d)+(?!\d))/$punc/g;
    return $number;
}
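# Usage sketch for html_trim/commafy (illustrative values only):
#
#   LJ::html_trim("<b>hello world</b>", 5);   # returns "<b>hello</b>" -- the open <b>
#                                             # is auto-closed after truncation
#   LJ::commafy("1234567");                   # returns "1,234,567" (or uses the
#                                             # "number.punctuation" ML string if defined)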
# <LJFUNC>
# name: LJ::html_newlines
# des: Replace newlines with HTML break tags.
# args: text
# returns: text, possibly including HTML break tags.
# </LJFUNC>
sub html_newlines {
    my $text = shift;
    $text =~ s/\n/<br \/>/gm;
    return $text;
}

# given HTML, returns an arrayref of URLs to images that are in the HTML
sub html_get_img_urls {
    my $htmlref = shift;
    my %opts = @_;
    my $exclude_site_imgs = $opts{exclude_site_imgs} || 0;

    my @image_urls;
    my $p = HTML::TokeParser->new($htmlref);

    while (my $token = $p->get_token) {
        if ($token->[0] eq "S" && $token->[1] eq "img") {
            my $attrs = $token->[2];
            foreach my $attr (keys %$attrs) {
                push @image_urls, $attrs->{$attr}
                    if $attr eq "src" &&
                       ($exclude_site_imgs ? $attrs->{$attr} !~ /^$LJ::IMGPREFIX/ : 1);
            }
        }
    }

    return \@image_urls;
}

# given HTML, returns an arrayref of link URLs that are in the HTML
sub html_get_link_urls {
    my $htmlref = shift;
    my %opts = @_;

    my @link_urls;
    my $p = HTML::TokeParser->new($htmlref);

    while (my $token = $p->get_token) {
        if ($token->[0] eq "S" && $token->[1] eq "a") {
            my $attrs = $token->[2];
            foreach my $attr (keys %$attrs) {
                push @link_urls, $attrs->{$attr} if $attr eq "href";
            }
        }
    }

    return \@link_urls;
}

1;
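# Usage sketch for the HTML scanners above (illustrative; $LJ::IMGPREFIX comes
# from the site config):
#
#   my $html  = '<a href="http://example.com/"><img src="http://example.com/a.png" /></a>';
#   my $imgs  = LJ::html_get_img_urls(\$html);    # ["http://example.com/a.png"]
#   my $links = LJ::html_get_link_urls(\$html);   # ["http://example.com/"]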