#!/usr/bin/perl use strict; package LJ::ParseFeed; use XML::RSS; use XML::Parser; # parse_feed parses an RSS/Atom feed # arguments: content and, optionally, type, specifying "atom" or # "rss". If type isn't supplied, the function will try to guess it # based on contents. # It returns $feed, which is a hash # with the following keys: # type - 'atom' or 'rss' # version - version of the feed in its standard # link - URL of the feed # title - title of the feed # description - description of the feed # # TODO: more kinds of info? # # items - arrayref of item hashes, in the same order they were in the feed # each item contains: # link - URL of the item # id - unique identifier (optional) # text - text of the item # subject - subject # time - in format 'yyyy-mm-dd hh:mm' (optional) # the second argument returned is $error, which, if defined, is a human-readable # error string. the third argument is arrayref of items, same as # $feed->{'items'}. sub parse_feed { my ($content, $type) = @_; my ($feed, $items, $error); my $parser; # is it RSS or Atom? # Atom feeds are rare for now, so prefer to err in favor of RSS # simple heuristic: Atom feeds will have ''Stream', Pkg=>'LJ::ParseFeed::Atom'); return ("", "failed to create XML parser") unless $parser; eval { $parser->parse($content); }; if ($@) { $error = "XML parser error: $@"; } else { ($feed, $items, $error) = LJ::ParseFeed::Atom::results(); }; if ($feed || $type eq 'atom') { # there was a top-level there, or we're forced to treat # as an Atom feed, so even if $error is set, # don't try RSS $feed->{'type'} = 'atom'; return ($feed, $error, $items); } } # try parsing it as RSS $parser = new XML::RSS; return ("", "failed to create RSS parser") unless $parser; eval { $parser->parse($content); }; if ($@) { $error = "RSS parser error: $@"; return ("", $error); } $feed = {}; $feed->{'type'} = 'rss'; $feed->{'version'} = $parser->{'version'}; foreach (qw (link title description lastBuildDate)) { $feed->{$_} = $parser->{'channel'}->{$_} if $parser->{'channel'}->{$_}; } if ($parser->{'image'}->{'url'}) { $feed->{'image'} = $parser->{'image'}->{'url'}; } $feed->{'lastmod'} = undef; $feed->{'items'} = []; foreach(@{$parser->{'items'}}) { my $item = {}; $item->{'subject'} = $_->{'title'}; $item->{'text'} = $_->{'description'}; $item->{'link'} = $_->{'link'} if $_->{'link'}; $item->{'id'} = $_->{'guid'} if $_->{'guid'}; my $nsdc = 'http://purl.org/dc/elements/1.1/'; my $nsenc = 'http://purl.org/rss/1.0/modules/content/'; if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") { # prefer content:encoded if present $item->{'text'} = $_->{$nsenc}->{'encoded'} if defined $_->{$nsenc}->{'encoded'}; } if ($_->{'pubDate'}) { my $time = time822_to_time($_->{'pubDate'}); $item->{'time'} = $time if $time; } if ($_->{$nsdc} && ref($_->{$nsdc}) eq "HASH") { if ($_->{$nsdc}->{date}) { my $time = w3cdtf_to_time($_->{$nsdc}->{date}); $item->{'time'} = $time if $time; } } if ($_->{'pubDate'} && (! $feed->{'lastmod'} || (LJ::http_to_time($feed->{'lastmod'}) < LJ::http_to_time($_->{'pubDate'})))) { $feed->{'lastmod'} = $_->{'pubDate'}; } push @{$feed->{'items'}}, $item; } return ($feed, undef, $feed->{'items'}); } # convert rfc822-time in RSS's to our time # see http://www.faqs.org/rfcs/rfc822.html # RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822, # but real RSS2.0 feeds apparently use 4 digits. sub time822_to_time { my $t822 = shift; # remove day name if present $t822 =~ s/^\s*\w+\s*,//; # remove whitespace $t822 =~ s/^\s*//; # break it up if ($t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)!) { my ($day, $mon, $year, $hour, $min) = ($1,$2,$3,$4,$5); $day = "0" . $day if length($day) == 1; $hour = "0" . $hour if length($hour) == 1; $mon = {'Jan'=>'01', 'Feb'=>'02', 'Mar'=>'03', 'Apr'=>'04', 'May'=>'05', 'Jun'=>'06', 'Jul'=>'07', 'Aug'=>'08', 'Sep'=>'09', 'Oct'=>'10', 'Nov'=>'11', 'Dec'=>'12'}->{$mon}; return undef unless $mon; return "$year-$mon-$day $hour:$min"; } else { return undef; } } # convert W3C-DTF to our internal format # see http://www.w3.org/TR/NOTE-datetime # Based very loosely on code from DateTime::Format::W3CDTF, # which isn't stable yet so we can't use it directly. sub w3cdtf_to_time { my $tw3 = shift; # TODO: Should somehow return the timezone offset # so that it can stored... but we don't do timezones # yet anyway. For now, just strip the timezone # portion if it is present, along with the decimal # fractions of a second. $tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)$//; $tw3 =~ s/^\s*//; $tw3 =~ s/\s*$//; # Eat any superflous whitespace # We can only use complete times, so anything which # doesn't feature the time part is considered invalid. # This is working around clients that don't implement W3C-DTF # correctly, and only send single digit values in the dates. # 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z # If it's more messed up than that, reject it outright. $tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/ or return undef; my %pd; # parsed date $pd{Y} = $1; $pd{M} = $2; $pd{D} = $3; $pd{h} = $4; $pd{m} = $5; $pd{s} = $6; # force double digits foreach (qw/ M D h m s /) { next unless defined $pd{$_}; $pd{$_} = sprintf "%02d", $pd{$_}; } return $pd{s} ? "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}" : "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}"; } package LJ::ParseFeed::Atom; our ($feed, $item, $data); our ($ddepth, $dholder); # for accumulating; our @items; our $error; sub err { $error = shift unless $error; } sub results { return ($feed, \@items, $error); } # $name under which we'll store accumulated data may be different # from $tag which causes us to store it # $name may be a scalarref pointing to where we should store # swallowing is achieved by calling startaccum(''); sub startaccum { my $name = shift; return err("Tag found under neither nor ") unless $feed || $item; $data = ""; # defining $data triggers accumulation $ddepth = 1; $dholder = undef unless $name; # if $name is a scalarref, it's actually our $dholder if (ref($name) eq 'SCALAR') { $dholder = $name; } else { $dholder = ($item ? \$item->{$name} : \$feed->{$name}) if $name; } return; } sub swallow { return startaccum(''); } sub StartDocument { ($feed, $item, $data) = (undef, undef, undef); @items = (); undef $error; } sub StartTag { # $_ carries the unparsed tag my ($p, $tag) = @_; my $holder; # do nothing if there has been an error return if $error; # are we just accumulating data? if (defined $data) { $data .= $_; $ddepth++; return; } # where we'll usually store info $holder = $item ? $item : $feed; TAGS: { if ($tag eq 'feed') { return err("Nested tags") if $feed; $feed = {}; $feed->{'standard'} = 'atom'; $feed->{'version'} = $_{'version'}; # return err("No version specified in ") # unless $feed->{'version'}; # commented out as it's done in # http://code.livejournal.org/trac/livejournal/browser/trunk/cgi-bin/parsefeed.pl # (too many sites return this error now) return err("Incompatible version specified in ") if $feed->{'version'} && $feed->{'version'} < 0.3; last TAGS; } if ($tag eq 'entry') { return err("Nested tags") if $item; $item = {}; last TAGS; } # at this point, we must have a top-level or # to write into return err("Tag found under neither nor ") unless $holder; if ($tag eq 'link') { # ignore links with rel= anything but alternate unless ($_{'rel'} eq 'alternate') { swallow(); last TAGS; } $holder->{'link'} = $_{'href'}; return err("No href attribute in ") unless $holder->{'link'}; last TAGS; } if ($tag eq 'content') { return err(" outside ") unless $item; # if type is multipart/alternative, we continue recursing # otherwise we accumulate my $type = $_{'type'} || "text/plain"; unless ($type eq "multipart/alternative") { push @{$item->{'contents'}}, [$type, ""]; startaccum(\$item->{'contents'}->[-1]->[1]); last TAGS; } # it's multipart/alternative, so recurse, but don't swallow last TAGS; } # store tags which should require no further # processing as they are, and others under _atom_*, to be processed # in EndTag under if ($tag eq 'title') { if ($item) { # entry's subject startaccum("subject"); } else { # feed's title startaccum($tag); } last TAGS; } if ($tag eq 'id') { unless ($item) { swallow(); # we don't need feed-level } else { startaccum($tag); } last TAGS; } if ($tag eq 'tagline' && !$item) { # feed's tagline, our "description" startaccum("description"); last TAGS; } # accumulate and store startaccum("_atom_" . $tag); last TAGS; } return; } sub EndTag { # $_ carries the unparsed tag my ($p, $tag) = @_; # do nothing if there has been an error return if $error; # are we accumulating data? if (defined $data) { $ddepth--; if ($ddepth == 0) { # stop accumulating $$dholder = $data if $dholder; undef $data; return; } $data .= $_; return; } TAGS: { if ($tag eq 'entry') { # finalize item... # generate suitable text from $item->{'contents'} my $content; $item->{'contents'} ||= []; unless (scalar(@{$item->{'contents'}}) >= 1) { # this item had no # maybe it has ? if so, use # TODO: type= or encoding issues here? perhaps unite # handling of with that of ? if ($item->{'_atom_summary'}) { $item->{'text'} = $item->{'_atom_summary'}; delete $item->{'contents'}; } else { # nothing to display, so ignore this entry undef $item; last TAGS; } } unless ($item->{'text'}) { # unless we already have text if (scalar(@{$item->{'contents'}}) == 1) { # only one section $content = $item->{'contents'}->[0]; } else { # several section, must choose the best one foreach (@{$item->{'contents'}}) { if ($_->[0] eq "application/xhtml+xml") { # best match $content = $_; last; # don't bother to look at others } if ($_->[0] =~ m!html!) { # some kind of html/xhtml/html+xml, etc. # choose this unless we've already chosen some html $content = $_ unless $content->[0] =~ m!html!; next; } if ($_->[0] eq "text/plain") { # choose this unless we have some html already $content = $_ unless $content->[0] =~ m!html!; next; } } # if we didn't choose anything, pick the first one $content = $item->{'contents'}->[0] unless $content; } # we ignore the 'mode' attribute of . If it's "xml", we've # stringified it by accumulation; if it's "escaped", our parser # unescaped it # TODO: handle mode=base64? $item->{'text'} = $content->[1]; delete $item->{'contents'}; } # generate time my $w3time = $item->{'_atom_modified'} || $item->{'_atom_created'}; my $time; if ($w3time) { # see http://www.w3.org/TR/NOTE-datetime for format # we insist on having granularity up to a minute, # and ignore finer data as well as the timezone, for now if ($w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)!) { $time = "$1-$2-$3 $4:$5"; } } if ($time) { $item->{'time'} = $time; } # get rid of all other tags we don't need anymore foreach (keys %$item) { delete $item->{$_} if substr($_, 0, 6) eq '_atom_'; } push @items, $item; undef $item; last TAGS; } if ($tag eq 'feed') { # finalize feed # get rid of all other tags we don't need anymore foreach (keys %$feed) { delete $feed->{$_} if substr($_, 0, 6) eq '_atom_'; } # link the feed with its itms $feed->{'items'} = \@items if $feed; last TAGS; } } return; } sub Text { my $p = shift; # do nothing if there has been an error return if $error; $data .= $_ if defined $data; } sub PI { # ignore processing instructions return; } sub EndDocument { # if we parsed a feed, link items to it $feed->{'items'} = \@items if $feed; return; } 1;