617 lines
16 KiB
Perl
617 lines
16 KiB
Perl
package LJ;
|
|
use strict;
|
|
no warnings 'uninitialized';
|
|
|
|
use Class::Autouse qw(
|
|
LJ::ConvUTF8
|
|
HTML::TokeParser
|
|
);
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::trim
|
|
# class: text
|
|
# des: Removes whitespace from left and right side of a string.
|
|
# args: string
|
|
# des-string: string to be trimmed
|
|
# returns: trimmed string
|
|
# </LJFUNC>
|
|
sub trim
|
|
{
|
|
my $a = $_[0];
|
|
$a =~ s/^\s+//;
|
|
$a =~ s/\s+$//;
|
|
return $a;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::decode_url_string
|
|
# class: web
|
|
# des: Parse URL-style arg/value pairs into a hash.
|
|
# args: buffer, hashref
|
|
# des-buffer: Scalar or scalarref of buffer to parse.
|
|
# des-hashref: Hashref to populate.
|
|
# returns: boolean; true.
|
|
# </LJFUNC>
|
|
sub decode_url_string
|
|
{
|
|
my $a = shift;
|
|
my $buffer = ref $a ? $a : \$a;
|
|
my $hashref = shift; # output hash
|
|
my $keyref = shift; # array of keys as they were found
|
|
|
|
my $pair;
|
|
my @pairs = split(/&/, $$buffer);
|
|
@$keyref = @pairs;
|
|
my ($name, $value);
|
|
foreach $pair (@pairs)
|
|
{
|
|
($name, $value) = split(/=/, $pair);
|
|
$value =~ tr/+/ /;
|
|
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
|
|
$name =~ tr/+/ /;
|
|
$name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
|
|
$hashref->{$name} .= $hashref->{$name} ? "\0$value" : $value;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
# args: hashref of key/values
|
|
# arrayref of keys in order (optional)
|
|
# returns: urlencoded string
|
|
sub encode_url_string {
|
|
my ($hashref, $keyref) = @_;
|
|
|
|
return join('&', map { LJ::eurl($_) . '=' . LJ::eurl($hashref->{$_}) }
|
|
(ref $keyref ? @$keyref : keys %$hashref));
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::eurl
|
|
# class: text
|
|
# des: Escapes a value before it can be put in a URL. See also [func[LJ::durl]].
|
|
# args: string
|
|
# des-string: string to be escaped
|
|
# returns: string escaped
|
|
# </LJFUNC>
|
|
sub eurl
|
|
{
|
|
my $a = $_[0];
|
|
return '' unless $a;
|
|
$a =~ s/([^a-zA-Z0-9_\,\-.\/\\\: ])/uc sprintf("%%%02x",ord($1))/eg;
|
|
$a =~ tr/ /+/;
|
|
return $a;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::durl
|
|
# class: text
|
|
# des: Decodes a value that's URL-escaped. See also [func[LJ::eurl]].
|
|
# args: string
|
|
# des-string: string to be decoded
|
|
# returns: string decoded
|
|
# </LJFUNC>
|
|
sub durl
|
|
{
|
|
my ($a) = @_;
|
|
$a =~ tr/+/ /;
|
|
$a =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
|
|
return $a;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::exml
|
|
# class: text
|
|
# des: Escapes a value before it can be put in XML.
|
|
# args: string
|
|
# des-string: string to be escaped
|
|
# returns: string escaped.
|
|
# </LJFUNC>
|
|
sub exml
|
|
{
|
|
# fast path for the commmon case:
|
|
return $_[0] unless $_[0] =~ /[&\"\'<>\x00-\x08\x0B\x0C\x0E-\x1F]/;
|
|
# what are those character ranges? XML 1.0 allows:
|
|
# #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
|
|
|
my $a = shift;
|
|
$a =~ s/\&/&/g;
|
|
$a =~ s/\"/"/g;
|
|
$a =~ s/\'/'/g;
|
|
$a =~ s/</</g;
|
|
$a =~ s/>/>/g;
|
|
$a =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g;
|
|
return $a;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::ehtml
|
|
# class: text
|
|
# des: Escapes a value before it can be put in HTML.
|
|
# args: string
|
|
# des-string: string to be escaped
|
|
# returns: string escaped.
|
|
# </LJFUNC>
|
|
sub ehtml
|
|
{
|
|
# fast path for the commmon case:
|
|
return $_[0] unless $_[0] =~ /[&\"\'<>]/;
|
|
|
|
# this is faster than doing one substitution with a map:
|
|
my $a = $_[0];
|
|
$a =~ s/\&/&/g;
|
|
$a =~ s/\"/"/g;
|
|
$a =~ s/\'/&\#39;/g;
|
|
$a =~ s/</</g;
|
|
$a =~ s/>/>/g;
|
|
return $a;
|
|
}
|
|
*eall = \&ehtml; # old BML syntax required eall to also escape BML. not anymore.
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::etags
|
|
# class: text
|
|
# des: Escapes < and > from a string
|
|
# args: string
|
|
# des-string: string to be escaped
|
|
# returns: string escaped.
|
|
# </LJFUNC>
|
|
sub etags
|
|
{
|
|
# fast path for the commmon case:
|
|
return $_[0] unless $_[0] =~ /[<>]/;
|
|
|
|
my $a = $_[0];
|
|
$a =~ s/</</g;
|
|
$a =~ s/>/>/g;
|
|
return $a;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::ejs
|
|
# class: text
|
|
# des: Escapes a string value before it can be put in JavaScript.
|
|
# args: string
|
|
# des-string: string to be escaped
|
|
# returns: string escaped.
|
|
# </LJFUNC>
|
|
sub ejs
|
|
{
|
|
my $a = $_[0];
|
|
$a =~ s/[\"\'\\]/\\$&/g;
|
|
$a =~ s/"/\\"/g;
|
|
$a =~ s/\r?\n/\\n/gs;
|
|
$a =~ s/\r//gs;
|
|
return $a;
|
|
}
|
|
|
|
# given a string, makes it into a string you can put into javascript,
|
|
# including protecting against closing </script> tags in the entry.
|
|
# does the double quotes for ya.
|
|
sub ejs_string {
|
|
my $str = ejs($_[0]);
|
|
$str =~ s!</script!</scri\" + \"pt!g;
|
|
return "\"" . $str . "\"";
|
|
}
|
|
|
|
# changes every char in a string to %XX where XX is the hex value
|
|
# this is useful for passing strings to javascript through HTML, because
|
|
# javascript's "unescape" function expects strings in this format
|
|
sub ejs_all
|
|
{
|
|
my $a = $_[0];
|
|
$a =~ s/(.)/uc sprintf("%%%02x",ord($1))/eg;
|
|
return $a;
|
|
}
|
|
|
|
|
|
# strip all HTML tags from a string
|
|
sub strip_html {
|
|
my $str = shift;
|
|
$str =~ s/\<lj user\=['"]?([\w-]+)['"]?\>/$1/g; # "
|
|
$str =~ s/\<([^\<])+\>//g;
|
|
return $str;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::is_ascii
|
|
# des: checks if text is pure ASCII.
|
|
# args: text
|
|
# des-text: text to check for being pure 7-bit ASCII text.
|
|
# returns: 1 if text is indeed pure 7-bit, 0 otherwise.
|
|
# </LJFUNC>
|
|
sub is_ascii {
|
|
my $text = shift;
|
|
return ($text !~ m/[^\x01-\x7f]/);
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::is_utf8
|
|
# des: check text for UTF-8 validity.
|
|
# args: text
|
|
# des-text: text to check for UTF-8 validity
|
|
# returns: 1 if text is a valid UTF-8 stream, 0 otherwise.
|
|
# </LJFUNC>
|
|
sub is_utf8 {
|
|
my $text = shift;
|
|
|
|
# it seems there is a dumbass who calls this function
|
|
# with $text being reference to scalar; now we can tolerate this
|
|
if (ref ($text) eq "HASH") {
|
|
return ! (grep { !LJ::is_utf8($_) } values %{$text});
|
|
}
|
|
elsif (ref ($text) eq "ARRAY") {
|
|
return ! (grep { !LJ::is_utf8($_) } @{$text});
|
|
}
|
|
elsif (ref ($text)) {
|
|
return 0;
|
|
}
|
|
|
|
if (LJ::are_hooks("is_utf8")) {
|
|
return LJ::run_hook("is_utf8", $text);
|
|
}
|
|
|
|
# for a discussion of the different utf8 validity checking methods,
|
|
# see: http://zilla.livejournal.org/657
|
|
# in summary, this isn't the fastest, but it's pretty fast, it doesn't make
|
|
# perl segfault, and it doesn't add new crazy dependencies. if you want
|
|
# speed, check out ljcom's is_utf8 version in C, using Inline.pm
|
|
|
|
return 1 unless defined($text);
|
|
|
|
my $u = Unicode::String::utf8($text);
|
|
my $text2 = $u->utf8;
|
|
|
|
# return $text eq $text2;
|
|
|
|
return LJ::is_ascii($text) || utf8::is_utf8($text) || $text eq $text2;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::text_out
|
|
# des: force outgoing text into valid UTF-8.
|
|
# args: text
|
|
# des-text: reference to text to pass to output. Text if modified in-place.
|
|
# returns: nothing.
|
|
# </LJFUNC>
|
|
sub text_out
|
|
{
|
|
my $rtext = shift;
|
|
|
|
# if we're not Unicode, do nothing
|
|
return unless $LJ::UNICODE;
|
|
|
|
# is this valid UTF-8 already?
|
|
return if LJ::is_utf8($$rtext);
|
|
|
|
# no. Blot out all non-ASCII chars
|
|
$$rtext =~ s/[\x00\x80-\xff]/\?/g;
|
|
return;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::text_in
|
|
# des: do appropriate checks on input text. Should be called on all
|
|
# user-generated text.
|
|
# args: text
|
|
# des-text: text to check
|
|
# returns: 1 if the text is valid, 0 if not.
|
|
# </LJFUNC>
|
|
sub text_in
|
|
{
|
|
my $text = shift;
|
|
return 1 unless $LJ::UNICODE;
|
|
if (ref ($text) eq "HASH") {
|
|
return ! (grep { !LJ::is_utf8($_) } values %{$text});
|
|
}
|
|
if (ref ($text) eq "ARRAY") {
|
|
return ! (grep { !LJ::is_utf8($_) } @{$text});
|
|
}
|
|
return LJ::is_utf8($text);
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::text_convert
|
|
# des: convert old entries/comments to UTF-8 using user's default encoding.
|
|
# args: dbs?, text, u, error
|
|
# des-dbs: optional. Deprecated; a master/slave set of database handles.
|
|
# des-text: old possibly non-ASCII text to convert
|
|
# des-u: user hashref of the journal's owner
|
|
# des-error: ref to a scalar variable which is set to 1 on error
|
|
# (when user has no default encoding defined, but
|
|
# text needs to be translated).
|
|
# returns: converted text or undef on error
|
|
# </LJFUNC>
|
|
sub text_convert
|
|
{
|
|
&nodb;
|
|
my ($text, $u, $error) = @_;
|
|
|
|
# maybe it's pure ASCII?
|
|
return $text if LJ::is_ascii($text);
|
|
|
|
# load encoding id->name mapping if it's not loaded yet
|
|
LJ::load_codes({ "encoding" => \%LJ::CACHE_ENCODINGS } )
|
|
unless %LJ::CACHE_ENCODINGS;
|
|
|
|
if ($u->{'oldenc'} == 0 ||
|
|
not defined $LJ::CACHE_ENCODINGS{$u->{'oldenc'}}) {
|
|
$$error = 1;
|
|
return undef;
|
|
};
|
|
|
|
# convert!
|
|
my $name = $LJ::CACHE_ENCODINGS{$u->{'oldenc'}};
|
|
unless (LJ::ConvUTF8->supported_charset($name)) {
|
|
$$error = 1;
|
|
return undef;
|
|
}
|
|
|
|
return LJ::ConvUTF8->to_utf8($name, $text);
|
|
}
|
|
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::text_length
|
|
# des: returns both byte length and character length of a string. In a non-Unicode
|
|
# environment, this means byte length twice. In a Unicode environment,
|
|
# the function assumes that its argument is a valid UTF-8 string.
|
|
# args: text
|
|
# des-text: the string to measure
|
|
# returns: a list of two values, (byte_length, char_length).
|
|
# </LJFUNC>
|
|
|
|
sub text_length
|
|
{
|
|
my $text = shift;
|
|
my $bl = length($text);
|
|
unless ($LJ::UNICODE) {
|
|
return ($bl, $bl);
|
|
}
|
|
my $cl = 0;
|
|
my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
|
|
|
|
while ($text =~ m/$utf_char/go) { $cl++; }
|
|
return ($bl, $cl);
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::text_trim
|
|
# des: truncate string according to requirements on byte length, char
|
|
# length, or both. "char length" means number of UTF-8 characters if
|
|
# [ljconfig[unicode]] is set, or the same thing as byte length otherwise.
|
|
# args: text, byte_max, char_max
|
|
# des-text: the string to trim
|
|
# des-byte_max: maximum allowed length in bytes; if 0, there's no restriction
|
|
# des-char_max: maximum allowed length in chars; if 0, there's no restriction
|
|
# returns: the truncated string.
|
|
# </LJFUNC>
|
|
sub text_trim
|
|
{
|
|
my ($text, $byte_max, $char_max) = @_;
|
|
return $text unless $byte_max or $char_max;
|
|
if (!$LJ::UNICODE) {
|
|
$byte_max = $char_max if $char_max and $char_max < $byte_max;
|
|
$byte_max = $char_max unless $byte_max;
|
|
return substr($text, 0, $byte_max);
|
|
}
|
|
|
|
# after upgrade to perl 5.12 we can use perl subroutines to handle unicode
|
|
# (need to convert all the input to unicode as soon as it enters the system)
|
|
return substr($text,0,$char_max);
|
|
|
|
# my $cur = 0;
|
|
# my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
|
|
#
|
|
# # if we don't have a character limit, assume it's the same as the byte limit.
|
|
# # we will never have more characters than bytes, but we might have more bytes
|
|
# # than characters, so we can't inherit the other way.
|
|
# $char_max ||= $byte_max;
|
|
#
|
|
# while ($text =~ m/$utf_char/gco) {
|
|
# last unless $char_max;
|
|
# last if $cur + length($1) > $byte_max and $byte_max;
|
|
# $cur += length($1);
|
|
# $char_max--;
|
|
# }
|
|
# return substr($text,0,$cur);
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::text_compress
|
|
# des: Compresses a chunk of text, to gzip, if configured for site. Can compress
|
|
# a scalarref in place, or return a compressed copy. Won't compress if
|
|
# value is too small, already compressed, or size would grow by compressing.
|
|
# args: text
|
|
# des-text: either a scalar or scalarref
|
|
# returns: nothing if given a scalarref (to compress in-place), or original/compressed value,
|
|
# depending on site config.
|
|
# </LJFUNC>
|
|
sub text_compress
|
|
{
|
|
my $text = shift;
|
|
my $ref = ref $text;
|
|
return $ref ? undef : $text unless $LJ::COMPRESS_TEXT;
|
|
die "Invalid reference" if $ref && $ref ne "SCALAR";
|
|
|
|
my $tref = $ref ? $text : \$text;
|
|
my $pre_len = length($$tref);
|
|
unless (substr($$tref,0,2) eq "\037\213" || $pre_len < 100) {
|
|
my $gz = Compress::Zlib::memGzip($$tref);
|
|
if (length($gz) < $pre_len) {
|
|
$$tref = $gz;
|
|
}
|
|
}
|
|
|
|
return $ref ? undef : $$tref;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::text_uncompress
|
|
# des: Uncompresses a chunk of text, from gzip, if configured for site. Can uncompress
|
|
# a scalarref in place, or return a compressed copy. Won't uncompress unless
|
|
# it finds the gzip magic number at the beginning of the text.
|
|
# args: text
|
|
# des-text: either a scalar or scalarref.
|
|
# returns: nothing if given a scalarref (to uncompress in-place), or original/uncompressed value,
|
|
# depending on if test was compressed or not
|
|
# </LJFUNC>
|
|
sub text_uncompress
|
|
{
|
|
my $text = shift;
|
|
my $ref = ref $text;
|
|
die "Invalid reference" if $ref && $ref ne "SCALAR";
|
|
my $tref = $ref ? $text : \$text;
|
|
|
|
# check for gzip's magic number
|
|
if (substr($$tref,0,2) eq "\037\213") {
|
|
$$tref = Compress::Zlib::memGunzip($$tref);
|
|
}
|
|
|
|
return $ref ? undef : $$tref;
|
|
}
|
|
|
|
# function to trim a string containing HTML. this will auto-close any
|
|
# html tags that were still open when the string was truncated
|
|
sub html_trim {
|
|
my ($text, $char_max) = @_;
|
|
|
|
return $text unless $char_max;
|
|
|
|
my $p = HTML::TokeParser->new(\$text);
|
|
my @open_tags; # keep track of what tags are open
|
|
my $out = '';
|
|
my $content_len = 0;
|
|
|
|
TOKEN:
|
|
while (my $token = $p->get_token) {
|
|
my $type = $token->[0];
|
|
my $tag = $token->[1];
|
|
my $attr = $token->[2]; # hashref
|
|
|
|
if ($type eq "S") {
|
|
my $selfclose;
|
|
|
|
# start tag
|
|
$out .= "<$tag";
|
|
|
|
# assume tags are properly self-closed
|
|
$selfclose = 1 if lc $tag eq 'input' || lc $tag eq 'br' || lc $tag eq 'img';
|
|
|
|
# preserve order of attributes. the original order is
|
|
# in element 4 of $token
|
|
foreach my $attrname (@{$token->[3]}) {
|
|
if ($attrname eq '/') {
|
|
$selfclose = 1;
|
|
next;
|
|
}
|
|
|
|
# FIXME: ultra ghetto.
|
|
$attr->{$attrname} = LJ::no_utf8_flag($attr->{$attrname});
|
|
$out .= " $attrname=\"" . LJ::ehtml($attr->{$attrname}) . "\"";
|
|
}
|
|
|
|
$out .= $selfclose ? " />" : ">";
|
|
|
|
push @open_tags, $tag unless $selfclose;
|
|
|
|
} elsif ($type eq 'T' || $type eq 'D') {
|
|
my $content = $token->[1];
|
|
|
|
if (length($content) + $content_len > $char_max) {
|
|
|
|
# truncate and stop parsing
|
|
$content = LJ::text_trim($content, undef, ($char_max - $content_len));
|
|
$out .= $content;
|
|
last;
|
|
}
|
|
|
|
$content_len += length $content;
|
|
|
|
$out .= $content;
|
|
|
|
} elsif ($type eq 'C') {
|
|
# comment, don't care
|
|
$out .= $token->[1];
|
|
|
|
} elsif ($type eq 'E') {
|
|
# end tag
|
|
pop @open_tags;
|
|
$out .= "</$tag>";
|
|
}
|
|
}
|
|
|
|
$out .= join("\n", map { "</$_>" } reverse @open_tags);
|
|
|
|
return $out;
|
|
}
|
|
|
|
# takes a number, inserts commas where needed
|
|
sub commafy {
|
|
my $number = shift;
|
|
return $number unless $number =~ /^\d+$/;
|
|
|
|
my $punc = LJ::Lang::ml('number.punctuation') || ",";
|
|
$number =~ s/(?<=\d)(?=(\d\d\d)+(?!\d))/$punc/g;
|
|
return $number;
|
|
}
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::html_newlines
|
|
# des: Replace newlines with HTML break tags.
|
|
# args: text
|
|
# returns: text, possibly including HTML break tags.
|
|
# </LJFUNC>
|
|
sub html_newlines
|
|
{
|
|
my $text = shift;
|
|
$text =~ s/\n/<br \/>/gm;
|
|
|
|
return $text;
|
|
}
|
|
|
|
# given HTML, returns an arrayref of URLs to images that are in the HTML
|
|
sub html_get_img_urls {
|
|
my $htmlref = shift;
|
|
my %opts = @_;
|
|
|
|
my $exclude_site_imgs = $opts{exclude_site_imgs} || 0;
|
|
|
|
my @image_urls;
|
|
my $p = HTML::TokeParser->new($htmlref);
|
|
|
|
while (my $token = $p->get_token) {
|
|
if ($token->[1] eq "img") {
|
|
my $attrs = $token->[2];
|
|
foreach my $attr (keys %$attrs) {
|
|
push @image_urls, $attrs->{$attr} if
|
|
$attr eq "src" &&
|
|
($exclude_site_imgs ? $attrs->{$attr} !~ /^$LJ::IMGPREFIX/ : 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
return \@image_urls;
|
|
}
|
|
|
|
# given HTML, returns an arrayref of link URLs that are in the HTML
|
|
sub html_get_link_urls {
|
|
my $htmlref = shift;
|
|
my %opts = @_;
|
|
|
|
my @link_urls;
|
|
my $p = HTML::TokeParser->new($htmlref);
|
|
|
|
while (my $token = $p->get_token) {
|
|
if ($token->[0] eq "S" && $token->[1] eq "a") {
|
|
my $attrs = $token->[2];
|
|
foreach my $attr (keys %$attrs) {
|
|
push @link_urls, $attrs->{$attr} if $attr eq "href";
|
|
}
|
|
}
|
|
}
|
|
|
|
return \@link_urls;
|
|
}
|
|
|
|
1;
|