617 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Perl
		
	
	
	
			
		
		
	
	
			617 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Perl
		
	
	
	
| package LJ;
 | |
| use strict;
 | |
| no warnings 'uninitialized';
 | |
| 
 | |
| use Class::Autouse qw(
 | |
|                       LJ::ConvUTF8
 | |
|                       HTML::TokeParser
 | |
|                       );
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::trim
 | |
| # class: text
 | |
| # des: Removes whitespace from left and right side of a string.
 | |
| # args: string
 | |
| # des-string: string to be trimmed
 | |
| # returns: trimmed string
 | |
| # </LJFUNC>
 | |
| sub trim
 | |
| {
 | |
|     my $a = $_[0];
 | |
|     $a =~ s/^\s+//;
 | |
|     $a =~ s/\s+$//;
 | |
|     return $a;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::decode_url_string
 | |
| # class: web
 | |
| # des: Parse URL-style arg/value pairs into a hash.
 | |
| # args: buffer, hashref
 | |
| # des-buffer: Scalar or scalarref of buffer to parse.
 | |
| # des-hashref: Hashref to populate.
 | |
| # returns: boolean; true.
 | |
| # </LJFUNC>
 | |
| sub decode_url_string
 | |
| {
 | |
|     my $a = shift;
 | |
|     my $buffer = ref $a ? $a : \$a;
 | |
|     my $hashref = shift;  # output hash
 | |
|     my $keyref  = shift;  # array of keys as they were found
 | |
| 
 | |
|     my $pair;
 | |
|     my @pairs = split(/&/, $$buffer);
 | |
|     @$keyref = @pairs;
 | |
|     my ($name, $value);
 | |
|     foreach $pair (@pairs)
 | |
|     {
 | |
|         ($name, $value) = split(/=/, $pair);
 | |
|         $value =~ tr/+/ /;
 | |
|         $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
 | |
|         $name =~ tr/+/ /;
 | |
|         $name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
 | |
|         $hashref->{$name} .= $hashref->{$name} ? "\0$value" : $value;
 | |
|     }
 | |
|     return 1;
 | |
| }
 | |
| 
 | |
| # args: hashref of key/values
 | |
| #       arrayref of keys in order (optional)
 | |
| # returns: urlencoded string
 | |
| sub encode_url_string {
 | |
|     my ($hashref, $keyref) = @_;
 | |
| 
 | |
|     return join('&', map { LJ::eurl($_) . '=' . LJ::eurl($hashref->{$_}) }
 | |
|                 (ref $keyref ? @$keyref : keys %$hashref));
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::eurl
 | |
| # class: text
 | |
| # des: Escapes a value before it can be put in a URL.  See also [func[LJ::durl]].
 | |
| # args: string
 | |
| # des-string: string to be escaped
 | |
| # returns: string escaped
 | |
| # </LJFUNC>
 | |
| sub eurl
 | |
| {
 | |
|     my $a = $_[0];
 | |
|     return '' unless $a;
 | |
|     $a =~ s/([^a-zA-Z0-9_\,\-.\/\\\: ])/uc sprintf("%%%02x",ord($1))/eg;
 | |
|     $a =~ tr/ /+/;
 | |
|     return $a;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::durl
 | |
| # class: text
 | |
| # des: Decodes a value that's URL-escaped.  See also [func[LJ::eurl]].
 | |
| # args: string
 | |
| # des-string: string to be decoded
 | |
| # returns: string decoded
 | |
| # </LJFUNC>
 | |
| sub durl
 | |
| {
 | |
|     my ($a) = @_;
 | |
|     $a =~ tr/+/ /;
 | |
|     $a =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
 | |
|     return $a;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::exml
 | |
| # class: text
 | |
| # des: Escapes a value before it can be put in XML.
 | |
| # args: string
 | |
| # des-string: string to be escaped
 | |
| # returns: string escaped.
 | |
| # </LJFUNC>
 | |
| sub exml
 | |
| {
 | |
|     # fast path for the commmon case:
 | |
|     return $_[0] unless $_[0] =~ /[&\"\'<>\x00-\x08\x0B\x0C\x0E-\x1F]/;
 | |
|     # what are those character ranges? XML 1.0 allows:
 | |
|     # #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 | |
| 
 | |
|     my $a = shift;
 | |
|     $a =~ s/\&/&/g;
 | |
|     $a =~ s/\"/"/g;
 | |
|     $a =~ s/\'/'/g;
 | |
|     $a =~ s/</</g;
 | |
|     $a =~ s/>/>/g;
 | |
|     $a =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g;
 | |
|     return $a;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::ehtml
 | |
| # class: text
 | |
| # des: Escapes a value before it can be put in HTML.
 | |
| # args: string
 | |
| # des-string: string to be escaped
 | |
| # returns: string escaped.
 | |
| # </LJFUNC>
 | |
| sub ehtml
 | |
| {
 | |
|     # fast path for the commmon case:
 | |
|     return $_[0] unless $_[0] =~ /[&\"\'<>]/;
 | |
| 
 | |
|     # this is faster than doing one substitution with a map:
 | |
|     my $a = $_[0];
 | |
|     $a =~ s/\&/&/g;
 | |
|     $a =~ s/\"/"/g;
 | |
|     $a =~ s/\'/&\#39;/g;
 | |
|     $a =~ s/</</g;
 | |
|     $a =~ s/>/>/g;
 | |
|     return $a;
 | |
| }
 | |
| *eall = \&ehtml;  # old BML syntax required eall to also escape BML.  not anymore.
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::etags
 | |
| # class: text
 | |
| # des: Escapes < and > from a string
 | |
| # args: string
 | |
| # des-string: string to be escaped
 | |
| # returns: string escaped.
 | |
| # </LJFUNC>
 | |
| sub etags
 | |
| {
 | |
|     # fast path for the commmon case:
 | |
|     return $_[0] unless $_[0] =~ /[<>]/;
 | |
| 
 | |
|     my $a = $_[0];
 | |
|     $a =~ s/</</g;
 | |
|     $a =~ s/>/>/g;
 | |
|     return $a;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::ejs
 | |
| # class: text
 | |
| # des: Escapes a string value before it can be put in JavaScript.
 | |
| # args: string
 | |
| # des-string: string to be escaped
 | |
| # returns: string escaped.
 | |
| # </LJFUNC>
 | |
| sub ejs
 | |
| {
 | |
|     my $a = $_[0];
 | |
|     $a =~ s/[\"\'\\]/\\$&/g;
 | |
|     $a =~ s/"/\\"/g;
 | |
|     $a =~ s/\r?\n/\\n/gs;
 | |
|     $a =~ s/\r//gs;
 | |
|     return $a;
 | |
| }
 | |
| 
 | |
| # given a string, makes it into a string you can put into javascript,
 | |
| # including protecting against closing </script> tags in the entry.
 | |
| # does the double quotes for ya.
 | |
| sub ejs_string {
 | |
|     my $str = ejs($_[0]);
 | |
|     $str =~ s!</script!</scri\" + \"pt!g;
 | |
|     return "\"" . $str . "\"";
 | |
| }
 | |
| 
 | |
| # changes every char in a string to %XX where XX is the hex value
 | |
| # this is useful for passing strings to javascript through HTML, because
 | |
| # javascript's "unescape" function expects strings in this format
 | |
| sub ejs_all
 | |
| {
 | |
|     my $a = $_[0];
 | |
|     $a =~ s/(.)/uc sprintf("%%%02x",ord($1))/eg;
 | |
|     return $a;
 | |
| }
 | |
| 
 | |
| 
 | |
| # strip all HTML tags from a string
 | |
| sub strip_html {
 | |
|     my $str = shift;
 | |
|     $str =~ s/\<lj user\=['"]?([\w-]+)['"]?\>/$1/g;   # "
 | |
|     $str =~ s/\<([^\<])+\>//g;
 | |
|     return $str;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::is_ascii
 | |
| # des: checks if text is pure ASCII.
 | |
| # args: text
 | |
| # des-text: text to check for being pure 7-bit ASCII text.
 | |
| # returns: 1 if text is indeed pure 7-bit, 0 otherwise.
 | |
| # </LJFUNC>
 | |
| sub is_ascii {
 | |
|     my $text = shift;
 | |
|     return ($text !~ m/[^\x01-\x7f]/);
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::is_utf8
 | |
| # des: check text for UTF-8 validity.
 | |
| # args: text
 | |
| # des-text: text to check for UTF-8 validity
 | |
| # returns: 1 if text is a valid UTF-8 stream, 0 otherwise.
 | |
| # </LJFUNC>
 | |
| sub is_utf8 {
 | |
|     my $text = shift;
 | |
| 
 | |
|     # it seems there is a dumbass who calls this function
 | |
|     # with $text being reference to scalar; now we can tolerate this
 | |
|     if (ref ($text) eq "HASH") {
 | |
|       return ! (grep { !LJ::is_utf8($_) } values %{$text});
 | |
|     }
 | |
|     elsif (ref ($text) eq "ARRAY") {
 | |
|       return ! (grep { !LJ::is_utf8($_) } @{$text});
 | |
|     }
 | |
|     elsif (ref ($text)) {
 | |
|       return 0;
 | |
|     }
 | |
|     
 | |
|     if (LJ::are_hooks("is_utf8")) {
 | |
|         return LJ::run_hook("is_utf8", $text);
 | |
|     }
 | |
| 
 | |
|     # for a discussion of the different utf8 validity checking methods,
 | |
|     # see:  http://zilla.livejournal.org/657
 | |
|     # in summary, this isn't the fastest, but it's pretty fast, it doesn't make
 | |
|     # perl segfault, and it doesn't add new crazy dependencies.  if you want
 | |
|     # speed, check out ljcom's is_utf8 version in C, using Inline.pm
 | |
| 
 | |
|     return 1 unless defined($text);
 | |
| 
 | |
|     my $u = Unicode::String::utf8($text);
 | |
|     my $text2 = $u->utf8;
 | |
| 
 | |
| #    return $text eq $text2;
 | |
| 
 | |
|     return LJ::is_ascii($text) || utf8::is_utf8($text) || $text eq $text2;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::text_out
 | |
| # des: force outgoing text into valid UTF-8.
 | |
| # args: text
 | |
| # des-text: reference to text to pass to output. Text if modified in-place.
 | |
| # returns: nothing.
 | |
| # </LJFUNC>
 | |
| sub text_out
 | |
| {
 | |
|     my $rtext = shift;
 | |
| 
 | |
|     # if we're not Unicode, do nothing
 | |
|     return unless $LJ::UNICODE;
 | |
| 
 | |
|     # is this valid UTF-8 already?
 | |
|     return if LJ::is_utf8($$rtext);
 | |
| 
 | |
|     # no. Blot out all non-ASCII chars
 | |
|     $$rtext =~ s/[\x00\x80-\xff]/\?/g;
 | |
|     return;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::text_in
 | |
| # des: do appropriate checks on input text. Should be called on all
 | |
| #      user-generated text.
 | |
| # args: text
 | |
| # des-text: text to check
 | |
| # returns: 1 if the text is valid, 0 if not.
 | |
| # </LJFUNC>
 | |
| sub text_in
 | |
| {
 | |
|     my $text = shift;
 | |
|     return 1 unless $LJ::UNICODE;
 | |
|     if (ref ($text) eq "HASH") {
 | |
|         return ! (grep { !LJ::is_utf8($_) } values %{$text});
 | |
|     }
 | |
|     if (ref ($text) eq "ARRAY") {
 | |
|         return ! (grep { !LJ::is_utf8($_) } @{$text});
 | |
|     }
 | |
|     return LJ::is_utf8($text);
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::text_convert
 | |
| # des: convert old entries/comments to UTF-8 using user's default encoding.
 | |
| # args: dbs?, text, u, error
 | |
| # des-dbs: optional. Deprecated; a master/slave set of database handles.
 | |
| # des-text: old possibly non-ASCII text to convert
 | |
| # des-u: user hashref of the journal's owner
 | |
| # des-error: ref to a scalar variable which is set to 1 on error
 | |
| #            (when user has no default encoding defined, but
 | |
| #            text needs to be translated).
 | |
| # returns: converted text or undef on error
 | |
| # </LJFUNC>
 | |
| sub text_convert
 | |
| {
 | |
|     &nodb;
 | |
|     my ($text, $u, $error) = @_;
 | |
| 
 | |
|     # maybe it's pure ASCII?
 | |
|     return $text if LJ::is_ascii($text);
 | |
| 
 | |
|     # load encoding id->name mapping if it's not loaded yet
 | |
|     LJ::load_codes({ "encoding" => \%LJ::CACHE_ENCODINGS } )
 | |
|         unless %LJ::CACHE_ENCODINGS;
 | |
| 
 | |
|     if ($u->{'oldenc'} == 0 ||
 | |
|         not defined $LJ::CACHE_ENCODINGS{$u->{'oldenc'}}) {
 | |
|         $$error = 1;
 | |
|         return undef;
 | |
|     };
 | |
| 
 | |
|     # convert!
 | |
|     my $name = $LJ::CACHE_ENCODINGS{$u->{'oldenc'}};
 | |
|     unless (LJ::ConvUTF8->supported_charset($name)) {
 | |
|         $$error = 1;
 | |
|         return undef;
 | |
|     }
 | |
| 
 | |
|     return LJ::ConvUTF8->to_utf8($name, $text);
 | |
| }
 | |
| 
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::text_length
 | |
| # des: returns both byte length and character length of a string. In a non-Unicode
 | |
| #      environment, this means byte length twice. In a Unicode environment,
 | |
| #      the function assumes that its argument is a valid UTF-8 string.
 | |
| # args: text
 | |
| # des-text: the string to measure
 | |
| # returns: a list of two values, (byte_length, char_length).
 | |
| # </LJFUNC>
 | |
| 
 | |
| sub text_length
 | |
| {
 | |
|     my $text = shift;
 | |
|     my $bl = length($text);
 | |
|     unless ($LJ::UNICODE) {
 | |
|         return ($bl, $bl);
 | |
|     }
 | |
|     my $cl = 0;
 | |
|     my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
 | |
| 
 | |
|     while ($text =~ m/$utf_char/go) { $cl++; }
 | |
|     return ($bl, $cl);
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::text_trim
 | |
| # des: truncate string according to requirements on byte length, char
 | |
| #      length, or both. "char length" means number of UTF-8 characters if
 | |
| #      [ljconfig[unicode]] is set, or the same thing as byte length otherwise.
 | |
| # args: text, byte_max, char_max
 | |
| # des-text: the string to trim
 | |
| # des-byte_max: maximum allowed length in bytes; if 0, there's no restriction
 | |
| # des-char_max: maximum allowed length in chars; if 0, there's no restriction
 | |
| # returns: the truncated string.
 | |
| # </LJFUNC>
 | |
| sub text_trim
 | |
| {
 | |
|     my ($text, $byte_max, $char_max) = @_;
 | |
|     return $text unless $byte_max or $char_max;
 | |
|     if (!$LJ::UNICODE) {
 | |
|         $byte_max = $char_max if $char_max and $char_max < $byte_max;
 | |
|         $byte_max = $char_max unless $byte_max;
 | |
|         return substr($text, 0, $byte_max);
 | |
|     }
 | |
| 
 | |
|     # after upgrade to perl 5.12 we can use perl subroutines to handle unicode
 | |
|     # (need to convert all the input to unicode as soon as it enters the system)
 | |
|     return substr($text,0,$char_max);
 | |
| 
 | |
| #    my $cur = 0;
 | |
| #    my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
 | |
| #
 | |
| #    # if we don't have a character limit, assume it's the same as the byte limit.
 | |
| #    # we will never have more characters than bytes, but we might have more bytes
 | |
| #    # than characters, so we can't inherit the other way.
 | |
| #    $char_max ||= $byte_max;
 | |
| #
 | |
| #    while ($text =~ m/$utf_char/gco) {
 | |
| #        last unless $char_max;
 | |
| #        last if $cur + length($1) > $byte_max and $byte_max;
 | |
| #        $cur += length($1);
 | |
| #        $char_max--;
 | |
| #    }
 | |
| #    return substr($text,0,$cur);
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::text_compress
 | |
| # des: Compresses a chunk of text, to gzip, if configured for site.  Can compress
 | |
| #      a scalarref in place, or return a compressed copy.  Won't compress if
 | |
| #      value is too small, already compressed, or size would grow by compressing.
 | |
| # args: text
 | |
| # des-text: either a scalar or scalarref
 | |
| # returns: nothing if given a scalarref (to compress in-place), or original/compressed value,
 | |
| #          depending on site config.
 | |
| # </LJFUNC>
 | |
| sub text_compress
 | |
| {
 | |
|     my $text = shift;
 | |
|     my $ref = ref $text;
 | |
|     return $ref ? undef : $text unless $LJ::COMPRESS_TEXT;
 | |
|     die "Invalid reference" if $ref && $ref ne "SCALAR";
 | |
| 
 | |
|     my $tref = $ref ? $text : \$text;
 | |
|     my $pre_len = length($$tref);
 | |
|     unless (substr($$tref,0,2) eq "\037\213" || $pre_len < 100) {
 | |
|         my $gz = Compress::Zlib::memGzip($$tref);
 | |
|         if (length($gz) < $pre_len) {
 | |
|             $$tref = $gz;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return $ref ? undef : $$tref;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::text_uncompress
 | |
| # des: Uncompresses a chunk of text, from gzip, if configured for site.  Can uncompress
 | |
| #      a scalarref in place, or return a compressed copy.  Won't uncompress unless
 | |
| #      it finds the gzip magic number at the beginning of the text.
 | |
| # args: text
 | |
| # des-text: either a scalar or scalarref.
 | |
| # returns: nothing if given a scalarref (to uncompress in-place), or original/uncompressed value,
 | |
| #          depending on if test was compressed or not
 | |
| # </LJFUNC>
 | |
| sub text_uncompress
 | |
| {
 | |
|     my $text = shift;
 | |
|     my $ref = ref $text;
 | |
|     die "Invalid reference" if $ref && $ref ne "SCALAR";
 | |
|     my $tref = $ref ? $text : \$text;
 | |
| 
 | |
|     # check for gzip's magic number
 | |
|     if (substr($$tref,0,2) eq "\037\213") {
 | |
|         $$tref = Compress::Zlib::memGunzip($$tref);
 | |
|     }
 | |
| 
 | |
|     return $ref ? undef : $$tref;
 | |
| }
 | |
| 
 | |
| # function to trim a string containing HTML.  this will auto-close any
 | |
| # html tags that were still open when the string was truncated
 | |
| sub html_trim {
 | |
|     my ($text, $char_max) = @_;
 | |
| 
 | |
|     return $text unless $char_max;
 | |
| 
 | |
|     my $p = HTML::TokeParser->new(\$text);
 | |
|     my @open_tags; # keep track of what tags are open
 | |
|     my $out = '';
 | |
|     my $content_len = 0;
 | |
| 
 | |
|   TOKEN:
 | |
|     while (my $token = $p->get_token) {
 | |
|         my $type = $token->[0];
 | |
|         my $tag  = $token->[1];
 | |
|         my $attr = $token->[2];  # hashref
 | |
| 
 | |
|         if ($type eq "S") {
 | |
|             my $selfclose;
 | |
| 
 | |
|             # start tag
 | |
|             $out .= "<$tag";
 | |
| 
 | |
|             # assume tags are properly self-closed
 | |
|             $selfclose = 1 if lc $tag eq 'input' || lc $tag eq 'br' || lc $tag eq 'img';
 | |
| 
 | |
|             # preserve order of attributes. the original order is
 | |
|             # in element 4 of $token
 | |
|             foreach my $attrname (@{$token->[3]}) {
 | |
|                 if ($attrname eq '/') {
 | |
|                     $selfclose = 1;
 | |
|                     next;
 | |
|                 }
 | |
| 
 | |
|                 # FIXME: ultra ghetto.
 | |
|                 $attr->{$attrname} = LJ::no_utf8_flag($attr->{$attrname});
 | |
|                 $out .= " $attrname=\"" . LJ::ehtml($attr->{$attrname}) . "\"";
 | |
|             }
 | |
| 
 | |
|             $out .= $selfclose ? " />" : ">";
 | |
| 
 | |
|             push @open_tags, $tag unless $selfclose;
 | |
| 
 | |
|         } elsif ($type eq 'T' || $type eq 'D') {
 | |
|             my $content = $token->[1];
 | |
| 
 | |
|             if (length($content) + $content_len > $char_max) {
 | |
| 
 | |
|                 # truncate and stop parsing
 | |
|                 $content = LJ::text_trim($content, undef, ($char_max - $content_len));
 | |
|                 $out .= $content;
 | |
|                 last;
 | |
|             }
 | |
| 
 | |
|             $content_len += length $content;
 | |
| 
 | |
|             $out .= $content;
 | |
| 
 | |
|         } elsif ($type eq 'C') {
 | |
|             # comment, don't care
 | |
|             $out .= $token->[1];
 | |
| 
 | |
|         } elsif ($type eq 'E') {
 | |
|             # end tag
 | |
|             pop @open_tags;
 | |
|             $out .= "</$tag>";
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     $out .= join("\n", map { "</$_>" } reverse @open_tags);
 | |
| 
 | |
|     return $out;
 | |
| }
 | |
| 
 | |
| # takes a number, inserts commas where needed
 | |
| sub commafy {
 | |
|     my $number = shift;
 | |
|     return $number unless $number =~ /^\d+$/;
 | |
| 
 | |
|     my $punc = LJ::Lang::ml('number.punctuation') || ",";
 | |
|     $number =~ s/(?<=\d)(?=(\d\d\d)+(?!\d))/$punc/g;
 | |
|     return $number;
 | |
| }
 | |
| 
 | |
| # <LJFUNC>
 | |
| # name: LJ::html_newlines
 | |
| # des: Replace newlines with HTML break tags.
 | |
| # args: text
 | |
| # returns: text, possibly including HTML break tags.
 | |
| # </LJFUNC>
 | |
| sub html_newlines
 | |
| {
 | |
|     my $text = shift;
 | |
|     $text =~ s/\n/<br \/>/gm;
 | |
| 
 | |
|     return $text;
 | |
| }
 | |
| 
 | |
| # given HTML, returns an arrayref of URLs to images that are in the HTML
 | |
| sub html_get_img_urls {
 | |
|     my $htmlref = shift;
 | |
|     my %opts = @_;
 | |
| 
 | |
|     my $exclude_site_imgs = $opts{exclude_site_imgs} || 0;
 | |
| 
 | |
|     my @image_urls;
 | |
|     my $p = HTML::TokeParser->new($htmlref);
 | |
| 
 | |
|     while (my $token = $p->get_token) {
 | |
|         if ($token->[1] eq "img") {
 | |
|             my $attrs = $token->[2];
 | |
|             foreach my $attr (keys %$attrs) {
 | |
|                 push @image_urls, $attrs->{$attr} if
 | |
|                     $attr eq "src" &&
 | |
|                     ($exclude_site_imgs ? $attrs->{$attr} !~ /^$LJ::IMGPREFIX/ : 1);
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return \@image_urls;
 | |
| }
 | |
| 
 | |
| # given HTML, returns an arrayref of link URLs that are in the HTML
 | |
| sub html_get_link_urls {
 | |
|     my $htmlref = shift;
 | |
|     my %opts = @_;
 | |
| 
 | |
|     my @link_urls;
 | |
|     my $p = HTML::TokeParser->new($htmlref);
 | |
| 
 | |
|     while (my $token = $p->get_token) {
 | |
|         if ($token->[0] eq "S" && $token->[1] eq "a") {
 | |
|             my $attrs = $token->[2];
 | |
|             foreach my $attr (keys %$attrs) {
 | |
|                 push @link_urls, $attrs->{$attr} if $attr eq "href";
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return \@link_urls;
 | |
| }
 | |
| 
 | |
| 1;
 |