ljr/local/cgi-bin/parsefeed.pl

496 lines
15 KiB
Perl
Executable File

#!/usr/bin/perl
use strict;
package LJ::ParseFeed;
use XML::RSS;
use XML::Parser;
# parse_feed parses an RSS/Atom feed
# arguments: content and, optionally, type, specifying "atom" or
# "rss". If type isn't supplied, the function will try to guess it
# based on contents.
# It returns $feed, which is a hash
# with the following keys:
# type - 'atom' or 'rss'
# version - version of the feed in its standard
# link - URL of the feed
# title - title of the feed
# description - description of the feed
# # TODO: more kinds of info?
#
# items - arrayref of item hashes, in the same order they were in the feed
# each item contains:
# link - URL of the item
# id - unique identifier (optional)
# text - text of the item
# subject - subject
# time - in format 'yyyy-mm-dd hh:mm' (optional)
# the second argument returned is $error, which, if defined, is a human-readable
# error string. the third argument is arrayref of items, same as
# $feed->{'items'}.
sub parse_feed
{
my ($content, $type) = @_;
my ($feed, $items, $error);
my $parser;
# is it RSS or Atom?
# Atom feeds are rare for now, so prefer to err in favor of RSS
# simple heuristic: Atom feeds will have '<feed' somewhere at the beginning
# TODO: maybe store the feed's type on creation in a userprop and not guess here
my $cut = substr($content, 0, 255);
if ($type eq 'atom' || $cut =~ m!\<feed!) {
# try treating it as an atom feed
$parser = new XML::Parser(Style=>'Stream', Pkg=>'LJ::ParseFeed::Atom');
return ("", "failed to create XML parser") unless $parser;
eval {
$parser->parse($content);
};
if ($@) {
$error = "XML parser error: $@";
} else {
($feed, $items, $error) = LJ::ParseFeed::Atom::results();
};
if ($feed || $type eq 'atom') {
# there was a top-level <feed> there, or we're forced to treat
# as an Atom feed, so even if $error is set,
# don't try RSS
$feed->{'type'} = 'atom';
return ($feed, $error, $items);
}
}
# try parsing it as RSS
$parser = new XML::RSS;
return ("", "failed to create RSS parser") unless $parser;
eval {
$parser->parse($content);
};
if ($@) {
$error = "RSS parser error: $@";
return ("", $error);
}
$feed = {};
$feed->{'type'} = 'rss';
$feed->{'version'} = $parser->{'version'};
foreach (qw (link title description lastBuildDate)) {
$feed->{$_} = $parser->{'channel'}->{$_}
if $parser->{'channel'}->{$_};
}
if ($parser->{'image'}->{'url'}) {
$feed->{'image'} = $parser->{'image'}->{'url'};
}
$feed->{'lastmod'} = undef;
$feed->{'items'} = [];
foreach(@{$parser->{'items'}}) {
my $item = {};
$item->{'subject'} = $_->{'title'};
$item->{'text'} = $_->{'description'};
$item->{'link'} = $_->{'link'} if $_->{'link'};
$item->{'id'} = $_->{'guid'} if $_->{'guid'};
my $nsdc = 'http://purl.org/dc/elements/1.1/';
my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") {
# prefer content:encoded if present
$item->{'text'} = $_->{$nsenc}->{'encoded'}
if defined $_->{$nsenc}->{'encoded'};
}
if ($_->{'pubDate'}) {
my $time = time822_to_time($_->{'pubDate'});
$item->{'time'} = $time if $time;
}
if ($_->{$nsdc} && ref($_->{$nsdc}) eq "HASH") {
if ($_->{$nsdc}->{date}) {
my $time = w3cdtf_to_time($_->{$nsdc}->{date});
$item->{'time'} = $time if $time;
}
}
if ($_->{'pubDate'} && (! $feed->{'lastmod'} ||
(LJ::http_to_time($feed->{'lastmod'}) < LJ::http_to_time($_->{'pubDate'})))) {
$feed->{'lastmod'} = $_->{'pubDate'};
}
push @{$feed->{'items'}}, $item;
}
return ($feed, undef, $feed->{'items'});
}
# convert rfc822-time in RSS's <pubDate> to our time
# see http://www.faqs.org/rfcs/rfc822.html
# RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,
# but real RSS2.0 feeds apparently use 4 digits.
sub time822_to_time {
my $t822 = shift;
# remove day name if present
$t822 =~ s/^\s*\w+\s*,//;
# remove whitespace
$t822 =~ s/^\s*//;
# break it up
if ($t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)!) {
my ($day, $mon, $year, $hour, $min) = ($1,$2,$3,$4,$5);
$day = "0" . $day if length($day) == 1;
$hour = "0" . $hour if length($hour) == 1;
$mon = {'Jan'=>'01', 'Feb'=>'02', 'Mar'=>'03', 'Apr'=>'04',
'May'=>'05', 'Jun'=>'06', 'Jul'=>'07', 'Aug'=>'08',
'Sep'=>'09', 'Oct'=>'10', 'Nov'=>'11', 'Dec'=>'12'}->{$mon};
return undef unless $mon;
return "$year-$mon-$day $hour:$min";
} else {
return undef;
}
}
# convert W3C-DTF to our internal format
# see http://www.w3.org/TR/NOTE-datetime
# Based very loosely on code from DateTime::Format::W3CDTF,
# which isn't stable yet so we can't use it directly.
sub w3cdtf_to_time {
my $tw3 = shift;
# TODO: Should somehow return the timezone offset
# so that it can stored... but we don't do timezones
# yet anyway. For now, just strip the timezone
# portion if it is present, along with the decimal
# fractions of a second.
$tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)$//;
$tw3 =~ s/^\s*//; $tw3 =~ s/\s*$//; # Eat any superflous whitespace
# We can only use complete times, so anything which
# doesn't feature the time part is considered invalid.
# This is working around clients that don't implement W3C-DTF
# correctly, and only send single digit values in the dates.
# 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z
# If it's more messed up than that, reject it outright.
$tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/
or return undef;
my %pd; # parsed date
$pd{Y} = $1; $pd{M} = $2; $pd{D} = $3;
$pd{h} = $4; $pd{m} = $5; $pd{s} = $6;
# force double digits
foreach (qw/ M D h m s /) {
next unless defined $pd{$_};
$pd{$_} = sprintf "%02d", $pd{$_};
}
return $pd{s} ? "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}" :
"$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}";
}
package LJ::ParseFeed::Atom;
our ($feed, $item, $data);
our ($ddepth, $dholder); # for accumulating;
our @items;
our $error;
sub err {
$error = shift unless $error;
}
sub results {
return ($feed, \@items, $error);
}
# $name under which we'll store accumulated data may be different
# from $tag which causes us to store it
# $name may be a scalarref pointing to where we should store
# swallowing is achieved by calling startaccum('');
sub startaccum {
my $name = shift;
return err("Tag found under neither <feed> nor <entry>")
unless $feed || $item;
$data = ""; # defining $data triggers accumulation
$ddepth = 1;
$dholder = undef
unless $name;
# if $name is a scalarref, it's actually our $dholder
if (ref($name) eq 'SCALAR') {
$dholder = $name;
} else {
$dholder = ($item ? \$item->{$name} : \$feed->{$name})
if $name;
}
return;
}
sub swallow {
return startaccum('');
}
sub StartDocument {
($feed, $item, $data) = (undef, undef, undef);
@items = ();
undef $error;
}
sub StartTag {
# $_ carries the unparsed tag
my ($p, $tag) = @_;
my $holder;
# do nothing if there has been an error
return if $error;
# are we just accumulating data?
if (defined $data) {
$data .= $_;
$ddepth++;
return;
}
# where we'll usually store info
$holder = $item ? $item : $feed;
TAGS: {
if ($tag eq 'feed') {
return err("Nested <feed> tags")
if $feed;
$feed = {};
$feed->{'standard'} = 'atom';
$feed->{'version'} = $_{'version'};
# return err("No version specified in <feed>")
# unless $feed->{'version'};
# commented out as it's done in
# http://code.livejournal.org/trac/livejournal/browser/trunk/cgi-bin/parsefeed.pl
# (too many sites return this error now)
return err("Incompatible version specified in <feed>")
if $feed->{'version'} && $feed->{'version'} < 0.3;
last TAGS;
}
if ($tag eq 'entry') {
return err("Nested <entry> tags")
if $item;
$item = {};
last TAGS;
}
# at this point, we must have a top-level <feed> or <entry>
# to write into
return err("Tag found under neither <feed> nor <entry>")
unless $holder;
if ($tag eq 'link') {
# ignore links with rel= anything but alternate
unless ($_{'rel'} eq 'alternate') {
swallow();
last TAGS;
}
$holder->{'link'} = $_{'href'};
return err("No href attribute in <link>")
unless $holder->{'link'};
last TAGS;
}
if ($tag eq 'content') {
return err("<content> outside <entry>")
unless $item;
# if type is multipart/alternative, we continue recursing
# otherwise we accumulate
my $type = $_{'type'} || "text/plain";
unless ($type eq "multipart/alternative") {
push @{$item->{'contents'}}, [$type, ""];
startaccum(\$item->{'contents'}->[-1]->[1]);
last TAGS;
}
# it's multipart/alternative, so recurse, but don't swallow
last TAGS;
}
# store tags which should require no further
# processing as they are, and others under _atom_*, to be processed
# in EndTag under </entry>
if ($tag eq 'title') {
if ($item) { # entry's subject
startaccum("subject");
} else { # feed's title
startaccum($tag);
}
last TAGS;
}
if ($tag eq 'id') {
unless ($item) {
swallow(); # we don't need feed-level <id>
} else {
startaccum($tag);
}
last TAGS;
}
if ($tag eq 'tagline' && !$item) { # feed's tagline, our "description"
startaccum("description");
last TAGS;
}
# accumulate and store
startaccum("_atom_" . $tag);
last TAGS;
}
return;
}
sub EndTag {
# $_ carries the unparsed tag
my ($p, $tag) = @_;
# do nothing if there has been an error
return if $error;
# are we accumulating data?
if (defined $data) {
$ddepth--;
if ($ddepth == 0) { # stop accumulating
$$dholder = $data
if $dholder;
undef $data;
return;
}
$data .= $_;
return;
}
TAGS: {
if ($tag eq 'entry') {
# finalize item...
# generate suitable text from $item->{'contents'}
my $content;
$item->{'contents'} ||= [];
unless (scalar(@{$item->{'contents'}}) >= 1) {
# this item had no <content>
# maybe it has <summary>? if so, use <summary>
# TODO: type= or encoding issues here? perhaps unite
# handling of <summary> with that of <content>?
if ($item->{'_atom_summary'}) {
$item->{'text'} = $item->{'_atom_summary'};
delete $item->{'contents'};
} else {
# nothing to display, so ignore this entry
undef $item;
last TAGS;
}
}
unless ($item->{'text'}) { # unless we already have text
if (scalar(@{$item->{'contents'}}) == 1) {
# only one <content> section
$content = $item->{'contents'}->[0];
} else {
# several <content> section, must choose the best one
foreach (@{$item->{'contents'}}) {
if ($_->[0] eq "application/xhtml+xml") { # best match
$content = $_;
last; # don't bother to look at others
}
if ($_->[0] =~ m!html!) { # some kind of html/xhtml/html+xml, etc.
# choose this unless we've already chosen some html
$content = $_
unless $content->[0] =~ m!html!;
next;
}
if ($_->[0] eq "text/plain") {
# choose this unless we have some html already
$content = $_
unless $content->[0] =~ m!html!;
next;
}
}
# if we didn't choose anything, pick the first one
$content = $item->{'contents'}->[0]
unless $content;
}
# we ignore the 'mode' attribute of <content>. If it's "xml", we've
# stringified it by accumulation; if it's "escaped", our parser
# unescaped it
# TODO: handle mode=base64?
$item->{'text'} = $content->[1];
delete $item->{'contents'};
}
# generate time
my $w3time = $item->{'_atom_modified'} || $item->{'_atom_created'};
my $time;
if ($w3time) {
# see http://www.w3.org/TR/NOTE-datetime for format
# we insist on having granularity up to a minute,
# and ignore finer data as well as the timezone, for now
if ($w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)!) {
$time = "$1-$2-$3 $4:$5";
}
}
if ($time) {
$item->{'time'} = $time;
}
# get rid of all other tags we don't need anymore
foreach (keys %$item) {
delete $item->{$_} if substr($_, 0, 6) eq '_atom_';
}
push @items, $item;
undef $item;
last TAGS;
}
if ($tag eq 'feed') {
# finalize feed
# get rid of all other tags we don't need anymore
foreach (keys %$feed) {
delete $feed->{$_} if substr($_, 0, 6) eq '_atom_';
}
# link the feed with its itms
$feed->{'items'} = \@items
if $feed;
last TAGS;
}
}
return;
}
sub Text {
my $p = shift;
# do nothing if there has been an error
return if $error;
$data .= $_ if defined $data;
}
sub PI {
# ignore processing instructions
return;
}
sub EndDocument {
# if we parsed a feed, link items to it
$feed->{'items'} = \@items
if $feed;
return;
}
1;