496 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
			
		
		
	
	
			496 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
| #!/usr/bin/perl
 | |
| 
 | |
| use strict;
 | |
| 
 | |
| package LJ::ParseFeed;
 | |
| 
 | |
| use XML::RSS;
 | |
| use XML::Parser;
 | |
| 
 | |
| 
 | |
| # parse_feed parses an RSS/Atom feed 
 | |
| # arguments: content and, optionally, type, specifying "atom" or
 | |
| # "rss". If type isn't supplied, the function will try to guess it
 | |
| # based on contents.
 | |
| # It returns $feed, which is a hash
 | |
| # with the following keys:
 | |
| #  type - 'atom' or 'rss'
 | |
| #  version - version of the feed in its standard
 | |
| #  link - URL of the feed
 | |
| #  title - title of the feed
 | |
| #  description - description of the feed
 | |
| #  # TODO: more kinds of info?
 | |
| #
 | |
| #  items - arrayref of item hashes, in the same order they were in the feed
 | |
| #    each item contains:
 | |
| #    link - URL of the item
 | |
| #    id - unique identifier (optional)
 | |
| #    text - text of the item
 | |
| #    subject - subject
 | |
| #    time - in format 'yyyy-mm-dd hh:mm' (optional)
 | |
| # the second argument returned is $error, which, if defined, is a human-readable
 | |
| # error string. the third argument is arrayref of items, same as 
 | |
| # $feed->{'items'}.
 | |
| 
 | |
| sub parse_feed
 | |
| {
 | |
|     my ($content, $type) = @_;
 | |
|     my ($feed, $items, $error);
 | |
|     my $parser;
 | |
| 
 | |
|     # is it RSS or Atom?
 | |
|     # Atom feeds are rare for now, so prefer to err in favor of RSS
 | |
|     # simple heuristic: Atom feeds will have '<feed' somewhere at the beginning
 | |
|     # TODO: maybe store the feed's type on creation in a userprop and not guess here
 | |
|     
 | |
|     my $cut = substr($content, 0, 255);
 | |
|     if ($type eq 'atom' || $cut =~ m!\<feed!) {
 | |
|         # try treating it as an atom feed
 | |
|         $parser = new XML::Parser(Style=>'Stream', Pkg=>'LJ::ParseFeed::Atom');
 | |
|         return ("", "failed to create XML parser") unless $parser;
 | |
|         eval {
 | |
|             $parser->parse($content);
 | |
|         };
 | |
|         if ($@) {
 | |
|             $error = "XML parser error: $@";
 | |
|         } else {
 | |
|             ($feed, $items, $error) = LJ::ParseFeed::Atom::results();
 | |
|         };
 | |
|     
 | |
|         if ($feed || $type eq 'atom') {
 | |
|             # there was a top-level <feed> there, or we're forced to treat
 | |
|             # as an Atom feed, so even if $error is set,
 | |
|             # don't try RSS
 | |
|             $feed->{'type'} = 'atom';
 | |
|             return ($feed, $error, $items);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     # try parsing it as RSS
 | |
|     $parser = new XML::RSS;
 | |
|     return ("", "failed to create RSS parser") unless $parser;
 | |
|     eval {
 | |
|         $parser->parse($content);
 | |
|     };
 | |
|     if ($@) {
 | |
|         $error = "RSS parser error: $@";
 | |
|         return ("", $error);
 | |
|     }
 | |
| 
 | |
|     $feed = {};
 | |
|     $feed->{'type'} = 'rss';
 | |
|     $feed->{'version'} = $parser->{'version'};
 | |
| 
 | |
|     foreach (qw (link title description lastBuildDate)) {
 | |
|         $feed->{$_} = $parser->{'channel'}->{$_}
 | |
|             if $parser->{'channel'}->{$_};
 | |
|     }
 | |
|     
 | |
|     if ($parser->{'image'}->{'url'}) {
 | |
|         $feed->{'image'} = $parser->{'image'}->{'url'};
 | |
|     }
 | |
|    
 | |
| 
 | |
|     $feed->{'lastmod'} = undef;
 | |
|     $feed->{'items'} = [];
 | |
| 
 | |
|     foreach(@{$parser->{'items'}}) {
 | |
|         my $item = {};
 | |
|         $item->{'subject'} = $_->{'title'};
 | |
|         $item->{'text'} = $_->{'description'};
 | |
|         $item->{'link'} = $_->{'link'} if $_->{'link'};
 | |
|         $item->{'id'} = $_->{'guid'} if $_->{'guid'};
 | |
| 
 | |
|         my $nsdc = 'http://purl.org/dc/elements/1.1/';
 | |
|         my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
 | |
|         if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") {
 | |
|             # prefer content:encoded if present
 | |
|             $item->{'text'} = $_->{$nsenc}->{'encoded'}
 | |
|                 if defined $_->{$nsenc}->{'encoded'};
 | |
|         }
 | |
| 
 | |
|         if ($_->{'pubDate'}) {
 | |
|             my $time = time822_to_time($_->{'pubDate'});
 | |
|             $item->{'time'} = $time if $time;
 | |
|         }
 | |
|         if ($_->{$nsdc} && ref($_->{$nsdc}) eq "HASH") {
 | |
|             if ($_->{$nsdc}->{date}) {
 | |
|                 my $time = w3cdtf_to_time($_->{$nsdc}->{date});
 | |
|                 $item->{'time'} = $time if $time;
 | |
|             }
 | |
|         }
 | |
|         if ($_->{'pubDate'} && (! $feed->{'lastmod'} ||
 | |
|             	(LJ::http_to_time($feed->{'lastmod'}) < LJ::http_to_time($_->{'pubDate'})))) {
 | |
|             $feed->{'lastmod'} = $_->{'pubDate'};
 | |
|         }
 | |
| 
 | |
|         push @{$feed->{'items'}}, $item;
 | |
|     }
 | |
| 
 | |
|     return ($feed, undef, $feed->{'items'});
 | |
| }
 | |
| 
 | |
| # convert rfc822-time in RSS's <pubDate> to our time
 | |
| # see http://www.faqs.org/rfcs/rfc822.html
 | |
| # RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,
 | |
| # but real RSS2.0 feeds apparently use 4 digits. 
 | |
| sub time822_to_time {
 | |
|     my $t822 = shift;
 | |
|     # remove day name if present
 | |
|     $t822 =~ s/^\s*\w+\s*,//;
 | |
|     # remove whitespace
 | |
|     $t822 =~ s/^\s*//;
 | |
|     # break it up
 | |
|     if ($t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)!) {
 | |
|         my ($day, $mon, $year, $hour, $min) = ($1,$2,$3,$4,$5);
 | |
|         $day = "0" . $day if length($day) == 1;
 | |
|         $hour = "0" . $hour if length($hour) == 1;
 | |
|         $mon = {'Jan'=>'01', 'Feb'=>'02', 'Mar'=>'03', 'Apr'=>'04',
 | |
|                 'May'=>'05', 'Jun'=>'06', 'Jul'=>'07', 'Aug'=>'08',
 | |
|                 'Sep'=>'09', 'Oct'=>'10', 'Nov'=>'11', 'Dec'=>'12'}->{$mon};
 | |
|         return undef unless $mon;
 | |
|         return "$year-$mon-$day $hour:$min";
 | |
|     } else {
 | |
|         return undef;
 | |
|     }
 | |
| }
 | |
| 
 | |
| # convert W3C-DTF to our internal format
 | |
| # see http://www.w3.org/TR/NOTE-datetime
 | |
| # Based very loosely on code from DateTime::Format::W3CDTF,
 | |
| # which isn't stable yet so we can't use it directly.
 | |
| sub w3cdtf_to_time {
 | |
|     my $tw3 = shift;
 | |
| 
 | |
|     # TODO: Should somehow return the timezone offset
 | |
|     #   so that it can stored... but we don't do timezones
 | |
|     #   yet anyway. For now, just strip the timezone
 | |
|     #   portion if it is present, along with the decimal
 | |
|     #   fractions of a second.
 | |
|     
 | |
|     $tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)$//;
 | |
|     $tw3 =~ s/^\s*//; $tw3 =~ s/\s*$//; # Eat any superflous whitespace
 | |
| 
 | |
|     # We can only use complete times, so anything which
 | |
|     # doesn't feature the time part is considered invalid.
 | |
|     
 | |
|     # This is working around clients that don't implement W3C-DTF
 | |
|     # correctly, and only send single digit values in the dates.
 | |
|     # 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z
 | |
|     # If it's more messed up than that, reject it outright.
 | |
|     $tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/
 | |
|         or return undef;
 | |
| 
 | |
|     my %pd; # parsed date
 | |
|     $pd{Y} = $1; $pd{M} = $2; $pd{D} = $3;
 | |
|     $pd{h} = $4; $pd{m} = $5; $pd{s} = $6;
 | |
| 
 | |
|     # force double digits
 | |
|     foreach (qw/ M D h m s /) {
 | |
|         next unless defined $pd{$_};
 | |
|         $pd{$_} = sprintf "%02d", $pd{$_};
 | |
|     }
 | |
| 
 | |
|     return $pd{s} ? "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}" :
 | |
|                     "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}";
 | |
| }
 | |
| 
 | |
| package LJ::ParseFeed::Atom;
 | |
| 
 | |
| our ($feed, $item, $data);
 | |
| our ($ddepth, $dholder); # for accumulating;
 | |
| our @items;
 | |
| our $error;
 | |
| 
 | |
| sub err {
 | |
|     $error = shift unless $error;
 | |
| }
 | |
| 
 | |
| sub results {
 | |
|     return ($feed, \@items, $error);
 | |
| }
 | |
| 
 | |
| # $name under which we'll store accumulated data may be different
 | |
| # from $tag which causes us to store it
 | |
| # $name may be a scalarref pointing to where we should store
 | |
| # swallowing is achieved by calling startaccum('');
 | |
| 
 | |
| sub startaccum {
 | |
|     my $name = shift;
 | |
| 
 | |
|     return err("Tag found under neither <feed> nor <entry>")
 | |
|         unless $feed || $item;
 | |
|     $data = ""; # defining $data triggers accumulation
 | |
|     $ddepth = 1;
 | |
| 
 | |
|     $dholder = undef 
 | |
|         unless $name;
 | |
|     # if $name is a scalarref, it's actually our $dholder
 | |
|     if (ref($name) eq 'SCALAR') {
 | |
|         $dholder = $name;
 | |
|     } else {
 | |
|         $dholder = ($item ? \$item->{$name} : \$feed->{$name})
 | |
|             if $name;
 | |
|     }
 | |
|     return;
 | |
| }
 | |
| 
 | |
| sub swallow {
 | |
|     return startaccum('');
 | |
| }
 | |
| 
 | |
| sub StartDocument {
 | |
|     ($feed, $item, $data) = (undef, undef, undef);
 | |
|     @items = ();
 | |
|     undef $error;
 | |
| }
 | |
| 
 | |
| sub StartTag {
 | |
|     # $_ carries the unparsed tag
 | |
|     my ($p, $tag) = @_;
 | |
|     my $holder;
 | |
| 
 | |
|     # do nothing if there has been an error
 | |
|     return if $error;
 | |
| 
 | |
|     # are we just accumulating data?
 | |
|     if (defined $data) {
 | |
|         $data .= $_;
 | |
|         $ddepth++;
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     # where we'll usually store info
 | |
|     $holder = $item ? $item : $feed;
 | |
| 
 | |
|     TAGS: {
 | |
|         if ($tag eq 'feed') {
 | |
|             return err("Nested <feed> tags") 
 | |
|                 if $feed;
 | |
|             $feed = {};
 | |
|             $feed->{'standard'} = 'atom';
 | |
|             $feed->{'version'} = $_{'version'};
 | |
| #           return err("No version specified in <feed>")
 | |
| #                unless $feed->{'version'};
 | |
| # commented out as it's done in 
 | |
| # http://code.livejournal.org/trac/livejournal/browser/trunk/cgi-bin/parsefeed.pl
 | |
| # (too many sites return this error now)
 | |
|             return err("Incompatible version specified in <feed>")
 | |
|                if $feed->{'version'} && $feed->{'version'} < 0.3;
 | |
|             last TAGS;
 | |
|         }
 | |
|         if ($tag eq 'entry') {
 | |
|             return err("Nested <entry> tags") 
 | |
|                 if $item;
 | |
|             $item = {};
 | |
|             last TAGS;
 | |
|         }
 | |
|         
 | |
|         # at this point, we must have a top-level <feed> or <entry>
 | |
|         # to write into
 | |
|         return err("Tag found under neither <feed> nor <entry>")
 | |
|             unless $holder;
 | |
| 
 | |
|         if ($tag eq 'link') {
 | |
|             # ignore links with rel= anything but alternate
 | |
|             unless ($_{'rel'} eq 'alternate') {
 | |
|                 swallow();
 | |
|                 last TAGS;
 | |
|             }
 | |
|             $holder->{'link'} = $_{'href'};
 | |
|             return err("No href attribute in <link>")
 | |
|                 unless $holder->{'link'};
 | |
|             last TAGS;
 | |
|         }
 | |
| 
 | |
|         if ($tag eq 'content') {
 | |
|             return err("<content> outside <entry>")
 | |
|                 unless $item;
 | |
|             # if type is multipart/alternative, we continue recursing
 | |
|             # otherwise we accumulate
 | |
|             my $type = $_{'type'} || "text/plain";
 | |
|             unless ($type eq "multipart/alternative") {
 | |
|                 push @{$item->{'contents'}}, [$type, ""];
 | |
|                 startaccum(\$item->{'contents'}->[-1]->[1]);
 | |
|                 last TAGS;
 | |
|             }
 | |
|             # it's multipart/alternative, so recurse, but don't swallow
 | |
|             last TAGS;
 | |
|         }
 | |
| 
 | |
|         # store tags which should require no further
 | |
|         # processing as they are, and others under _atom_*, to be processed
 | |
|         # in EndTag under </entry>
 | |
|         if ($tag eq 'title') {
 | |
|             if ($item) { # entry's subject
 | |
|                 startaccum("subject");
 | |
|             } else { # feed's title
 | |
|                 startaccum($tag);
 | |
|             }
 | |
|             last TAGS;
 | |
|         }
 | |
|         if ($tag eq 'id') {
 | |
|             unless ($item) {
 | |
|                 swallow(); # we don't need feed-level <id>
 | |
|             } else {
 | |
|                 startaccum($tag);
 | |
|             }
 | |
|             last TAGS;
 | |
|         }
 | |
| 
 | |
|         if ($tag eq 'tagline' && !$item) { # feed's tagline, our "description"
 | |
|             startaccum("description");
 | |
|             last TAGS;
 | |
|         }
 | |
| 
 | |
|         # accumulate and store
 | |
|         startaccum("_atom_" . $tag);
 | |
|         last TAGS;
 | |
|     }
 | |
|             
 | |
|     return;
 | |
| }
 | |
| 
 | |
| sub EndTag {
 | |
|     # $_ carries the unparsed tag
 | |
|     my ($p, $tag) = @_;
 | |
| 
 | |
|     # do nothing if there has been an error
 | |
|     return if $error;
 | |
| 
 | |
|     # are we accumulating data?
 | |
|     if (defined $data) {
 | |
|         $ddepth--;
 | |
|         if ($ddepth == 0) { # stop accumulating
 | |
|             $$dholder = $data
 | |
|                 if $dholder;
 | |
|             undef $data;
 | |
|             return;
 | |
|         }
 | |
|         $data .= $_;
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     TAGS: {
 | |
|         if ($tag eq 'entry') {
 | |
|             # finalize item...
 | |
|             # generate suitable text from $item->{'contents'}            
 | |
|             my $content;
 | |
|             $item->{'contents'} ||= [];
 | |
|             unless (scalar(@{$item->{'contents'}}) >= 1) {
 | |
|                 # this item had no <content>
 | |
|                 # maybe it has <summary>? if so, use <summary>
 | |
|                 # TODO: type= or encoding issues here? perhaps unite
 | |
|                 # handling of <summary> with that of <content>?
 | |
|                 if ($item->{'_atom_summary'}) {
 | |
|                     $item->{'text'} = $item->{'_atom_summary'};
 | |
|                     delete $item->{'contents'};
 | |
|                 } else {
 | |
|                     # nothing to display, so ignore this entry
 | |
|                     undef $item;
 | |
|                     last TAGS;
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             unless ($item->{'text'}) { # unless we already have text
 | |
|                 if (scalar(@{$item->{'contents'}}) == 1) {
 | |
|                     # only one <content> section
 | |
|                     $content = $item->{'contents'}->[0]; 
 | |
|                 } else {
 | |
|                     # several <content> section, must choose the best one
 | |
|                     foreach (@{$item->{'contents'}}) {
 | |
|                         if ($_->[0] eq "application/xhtml+xml") { # best match
 | |
|                             $content = $_;
 | |
|                             last; # don't bother to look at others
 | |
|                         }
 | |
|                         if ($_->[0] =~ m!html!) { # some kind of html/xhtml/html+xml, etc.
 | |
|                             # choose this unless we've already chosen some html
 | |
|                             $content = $_
 | |
|                                 unless $content->[0] =~ m!html!;
 | |
|                             next;
 | |
|                         }
 | |
|                         if ($_->[0] eq "text/plain") {
 | |
|                             # choose this unless we have some html already
 | |
|                             $content = $_
 | |
|                                 unless $content->[0] =~ m!html!;
 | |
|                             next;
 | |
|                         }
 | |
|                     }
 | |
|                     # if we didn't choose anything, pick the first one
 | |
|                     $content =  $item->{'contents'}->[0]
 | |
|                         unless $content;
 | |
|                 }
 | |
| 
 | |
|                 # we ignore the 'mode' attribute of <content>. If it's "xml", we've
 | |
|                 # stringified it by accumulation; if it's "escaped", our parser
 | |
|                 # unescaped it
 | |
|                 # TODO: handle mode=base64?
 | |
| 
 | |
|                 $item->{'text'} = $content->[1];
 | |
|                 delete $item->{'contents'};
 | |
|             }
 | |
| 
 | |
|             # generate time
 | |
|             my $w3time = $item->{'_atom_modified'} || $item->{'_atom_created'};
 | |
|             my $time;
 | |
|             if ($w3time) {
 | |
|                 # see http://www.w3.org/TR/NOTE-datetime for format
 | |
|                 # we insist on having granularity up to a minute,
 | |
|                 # and ignore finer data as well as the timezone, for now
 | |
|                 if ($w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)!) {
 | |
|                     $time = "$1-$2-$3 $4:$5";
 | |
|                 }
 | |
|             }
 | |
|             if ($time) {
 | |
|                 $item->{'time'} = $time;
 | |
|             }
 | |
|             
 | |
|             # get rid of all other tags we don't need anymore
 | |
|             foreach (keys  %$item) {
 | |
|                 delete $item->{$_} if substr($_, 0, 6) eq '_atom_';
 | |
|             }
 | |
|             
 | |
|             push @items, $item;
 | |
|             undef $item;
 | |
|             last TAGS;
 | |
|         }
 | |
|         if ($tag eq 'feed') {
 | |
|             # finalize feed
 | |
|             # get rid of all other tags we don't need anymore
 | |
|             foreach (keys  %$feed) {
 | |
|                 delete $feed->{$_} if substr($_, 0, 6) eq '_atom_';
 | |
|             }
 | |
|             
 | |
|             # link the feed with its itms
 | |
|             $feed->{'items'} = \@items 
 | |
|                 if $feed;
 | |
|             last TAGS;
 | |
|         }
 | |
|     }
 | |
|     return;
 | |
| }
 | |
| 
 | |
| sub Text {
 | |
|     my $p = shift;
 | |
| 
 | |
|     # do nothing if there has been an error
 | |
|     return if $error;
 | |
| 
 | |
|     $data .= $_ if defined $data;
 | |
| }
 | |
| 
 | |
| sub PI {
 | |
|     # ignore processing instructions
 | |
|     return;
 | |
| }
 | |
| 
 | |
| sub EndDocument {
 | |
|     # if we parsed a feed, link items to it
 | |
|     $feed->{'items'} = \@items 
 | |
|         if $feed;
 | |
|     return;
 | |
| }
 | |
| 
 | |
| 
 | |
| 1;
 |