ljr/local/cgi-bin/parsefeed.pl

#!/usr/bin/perl

use strict;

package LJ::ParseFeed;

use XML::RSS;
use XML::Parser;


# parse_feed parses an RSS/Atom feed 
# arguments: content and, optionally, type, specifying "atom" or
# "rss". If type isn't supplied, the function will try to guess it
# based on contents.
# It returns $feed, which is a hash
# with the following keys:
#  type - 'atom' or 'rss'
#  version - version of the feed in its standard
#  link - URL of the feed
#  title - title of the feed
#  description - description of the feed
#  # TODO: more kinds of info?
#
#  items - arrayref of item hashes, in the same order they were in the feed
#    each item contains:
#    link - URL of the item
#    id - unique identifier (optional)
#    text - text of the item
#    subject - subject
#    time - in format 'yyyy-mm-dd hh:mm' (optional)
# the second argument returned is $error, which, if defined, is a human-readable
# error string. the third argument is arrayref of items, same as 
# $feed->{'items'}.

sub parse_feed
{
    my ($content, $type) = @_;
    my ($feed, $items, $error);
    my $parser;

    # is it RSS or Atom?
    # Atom feeds are rare for now, so prefer to err in favor of RSS
    # simple heuristic: Atom feeds will have '<feed' somewhere at the beginning
    # TODO: maybe store the feed's type on creation in a userprop and not guess here
    
    my $cut = substr($content, 0, 255);
    if ($type eq 'atom' || $cut =~ m!\<feed!) {
        # try treating it as an atom feed
        $parser = new XML::Parser(Style=>'Stream', Pkg=>'LJ::ParseFeed::Atom');
        return ("", "failed to create XML parser") unless $parser;
        eval {
            $parser->parse($content);
        };
        if ($@) {
            $error = "XML parser error: $@";
        } else {
            ($feed, $items, $error) = LJ::ParseFeed::Atom::results();
        };
    
        if ($feed || $type eq 'atom') {
            # there was a top-level <feed> there, or we're forced to treat
            # as an Atom feed, so even if $error is set,
            # don't try RSS
            $feed->{'type'} = 'atom';
            return ($feed, $error, $items);
        }
    }

    # try parsing it as RSS
    $parser = new XML::RSS;
    return ("", "failed to create RSS parser") unless $parser;
    eval {
        $parser->parse($content);
    };
    if ($@) {
        $error = "RSS parser error: $@";
        return ("", $error);
    }

    $feed = {};
    $feed->{'type'} = 'rss';
    $feed->{'version'} = $parser->{'version'};

    foreach (qw (link title description lastBuildDate)) {
        $feed->{$_} = $parser->{'channel'}->{$_}
            if $parser->{'channel'}->{$_};
    }
    
    if ($parser->{'image'}->{'url'}) {
        $feed->{'image'} = $parser->{'image'}->{'url'};
    }
   

    $feed->{'lastmod'} = undef;
    $feed->{'items'} = [];

    foreach(@{$parser->{'items'}}) {
        my $item = {};
        $item->{'subject'} = $_->{'title'};
        $item->{'text'} = $_->{'description'};
        $item->{'link'} = $_->{'link'} if $_->{'link'};
        $item->{'id'} = $_->{'guid'} if $_->{'guid'};

        my $nsdc = 'http://purl.org/dc/elements/1.1/';
        my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
        if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") {
            # prefer content:encoded if present
            $item->{'text'} = $_->{$nsenc}->{'encoded'}
                if defined $_->{$nsenc}->{'encoded'};
        }

        if ($_->{'pubDate'}) {
            my $time = time822_to_time($_->{'pubDate'});
            $item->{'time'} = $time if $time;
        }
        if ($_->{$nsdc} && ref($_->{$nsdc}) eq "HASH") {
            if ($_->{$nsdc}->{date}) {
                my $time = w3cdtf_to_time($_->{$nsdc}->{date});
                $item->{'time'} = $time if $time;
            }
        }
        if ($_->{'pubDate'} && (! $feed->{'lastmod'} ||
            	(LJ::http_to_time($feed->{'lastmod'}) < LJ::http_to_time($_->{'pubDate'})))) {
            $feed->{'lastmod'} = $_->{'pubDate'};
        }

        push @{$feed->{'items'}}, $item;
    }

    return ($feed, undef, $feed->{'items'});
}

# convert rfc822-time in RSS's <pubDate> to our time
# see http://www.faqs.org/rfcs/rfc822.html
# RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,
# but real RSS2.0 feeds apparently use 4 digits. 
sub time822_to_time {
    my $t822 = shift;
    # remove day name if present
    $t822 =~ s/^\s*\w+\s*,//;
    # remove whitespace
    $t822 =~ s/^\s*//;
    # break it up
    if ($t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)!) {
        my ($day, $mon, $year, $hour, $min) = ($1,$2,$3,$4,$5);
        $day = "0" . $day if length($day) == 1;
        $hour = "0" . $hour if length($hour) == 1;
        $mon = {'Jan'=>'01', 'Feb'=>'02', 'Mar'=>'03', 'Apr'=>'04',
                'May'=>'05', 'Jun'=>'06', 'Jul'=>'07', 'Aug'=>'08',
                'Sep'=>'09', 'Oct'=>'10', 'Nov'=>'11', 'Dec'=>'12'}->{$mon};
        return undef unless $mon;
        return "$year-$mon-$day $hour:$min";
    } else {
        return undef;
    }
}

# convert W3C-DTF to our internal format
# see http://www.w3.org/TR/NOTE-datetime
# Based very loosely on code from DateTime::Format::W3CDTF,
# which isn't stable yet so we can't use it directly.
sub w3cdtf_to_time {
    my $tw3 = shift;

    # TODO: Should somehow return the timezone offset
    #   so that it can stored... but we don't do timezones
    #   yet anyway. For now, just strip the timezone
    #   portion if it is present, along with the decimal
    #   fractions of a second.
    
    $tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)$//;
    $tw3 =~ s/^\s*//; $tw3 =~ s/\s*$//; # Eat any superflous whitespace

    # We can only use complete times, so anything which
    # doesn't feature the time part is considered invalid.
    
    # This is working around clients that don't implement W3C-DTF
    # correctly, and only send single digit values in the dates.
    # 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z
    # If it's more messed up than that, reject it outright.
    $tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/
        or return undef;

    my %pd; # parsed date
    $pd{Y} = $1; $pd{M} = $2; $pd{D} = $3;
    $pd{h} = $4; $pd{m} = $5; $pd{s} = $6;

    # force double digits
    foreach (qw/ M D h m s /) {
        next unless defined $pd{$_};
        $pd{$_} = sprintf "%02d", $pd{$_};
    }

    return $pd{s} ? "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}" :
                    "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}";
}

package LJ::ParseFeed::Atom;

our ($feed, $item, $data);
our ($ddepth, $dholder); # for accumulating;
our @items;
our $error;

sub err {
    $error = shift unless $error;
}

sub results {
    return ($feed, \@items, $error);
}

# $name under which we'll store accumulated data may be different
# from $tag which causes us to store it
# $name may be a scalarref pointing to where we should store
# swallowing is achieved by calling startaccum('');

sub startaccum {
    my $name = shift;

    return err("Tag found under neither <feed> nor <entry>")
        unless $feed || $item;
    $data = ""; # defining $data triggers accumulation
    $ddepth = 1;

    $dholder = undef 
        unless $name;
    # if $name is a scalarref, it's actually our $dholder
    if (ref($name) eq 'SCALAR') {
        $dholder = $name;
    } else {
        $dholder = ($item ? \$item->{$name} : \$feed->{$name})
            if $name;
    }
    return;
}

sub swallow {
    return startaccum('');
}

sub StartDocument {
    ($feed, $item, $data) = (undef, undef, undef);
    @items = ();
    undef $error;
}

sub StartTag {
    # $_ carries the unparsed tag
    my ($p, $tag) = @_;
    my $holder;

    # do nothing if there has been an error
    return if $error;

    # are we just accumulating data?
    if (defined $data) {
        $data .= $_;
        $ddepth++;
        return;
    }

    # where we'll usually store info
    $holder = $item ? $item : $feed;

    TAGS: {
        if ($tag eq 'feed') {
            return err("Nested <feed> tags") 
                if $feed;
            $feed = {};
            $feed->{'standard'} = 'atom';
            $feed->{'version'} = $_{'version'};
#           return err("No version specified in <feed>")
#                unless $feed->{'version'};
# commented out as it's done in 
# http://code.livejournal.org/trac/livejournal/browser/trunk/cgi-bin/parsefeed.pl
# (too many sites return this error now)
            return err("Incompatible version specified in <feed>")
               if $feed->{'version'} && $feed->{'version'} < 0.3;
            last TAGS;
        }
        if ($tag eq 'entry') {
            return err("Nested <entry> tags") 
                if $item;
            $item = {};
            last TAGS;
        }
        
        # at this point, we must have a top-level <feed> or <entry>
        # to write into
        return err("Tag found under neither <feed> nor <entry>")
            unless $holder;

        if ($tag eq 'link') {
            # ignore links with rel= anything but alternate
            unless ($_{'rel'} eq 'alternate') {
                swallow();
                last TAGS;
            }
            $holder->{'link'} = $_{'href'};
            return err("No href attribute in <link>")
                unless $holder->{'link'};
            last TAGS;
        }

        if ($tag eq 'content') {
            return err("<content> outside <entry>")
                unless $item;
            # if type is multipart/alternative, we continue recursing
            # otherwise we accumulate
            my $type = $_{'type'} || "text/plain";
            unless ($type eq "multipart/alternative") {
                push @{$item->{'contents'}}, [$type, ""];
                startaccum(\$item->{'contents'}->[-1]->[1]);
                last TAGS;
            }
            # it's multipart/alternative, so recurse, but don't swallow
            last TAGS;
        }

        # store tags which should require no further
        # processing as they are, and others under _atom_*, to be processed
        # in EndTag under </entry>
        if ($tag eq 'title') {
            if ($item) { # entry's subject
                startaccum("subject");
            } else { # feed's title
                startaccum($tag);
            }
            last TAGS;
        }
        if ($tag eq 'id') {
            unless ($item) {
                swallow(); # we don't need feed-level <id>
            } else {
                startaccum($tag);
            }
            last TAGS;
        }

        if ($tag eq 'tagline' && !$item) { # feed's tagline, our "description"
            startaccum("description");
            last TAGS;
        }

        # accumulate and store
        startaccum("_atom_" . $tag);
        last TAGS;
    }
            
    return;
}

sub EndTag {
    # $_ carries the unparsed tag
    my ($p, $tag) = @_;

    # do nothing if there has been an error
    return if $error;

    # are we accumulating data?
    if (defined $data) {
        $ddepth--;
        if ($ddepth == 0) { # stop accumulating
            $$dholder = $data
                if $dholder;
            undef $data;
            return;
        }
        $data .= $_;
        return;
    }

    TAGS: {
        if ($tag eq 'entry') {
            # finalize item...
            # generate suitable text from $item->{'contents'}            
            my $content;
            $item->{'contents'} ||= [];
            unless (scalar(@{$item->{'contents'}}) >= 1) {
                # this item had no <content>
                # maybe it has <summary>? if so, use <summary>
                # TODO: type= or encoding issues here? perhaps unite
                # handling of <summary> with that of <content>?
                if ($item->{'_atom_summary'}) {
                    $item->{'text'} = $item->{'_atom_summary'};
                    delete $item->{'contents'};
                } else {
                    # nothing to display, so ignore this entry
                    undef $item;
                    last TAGS;
                }
            }

            unless ($item->{'text'}) { # unless we already have text
                if (scalar(@{$item->{'contents'}}) == 1) {
                    # only one <content> section
                    $content = $item->{'contents'}->[0]; 
                } else {
                    # several <content> section, must choose the best one
                    foreach (@{$item->{'contents'}}) {
                        if ($_->[0] eq "application/xhtml+xml") { # best match
                            $content = $_;
                            last; # don't bother to look at others
                        }
                        if ($_->[0] =~ m!html!) { # some kind of html/xhtml/html+xml, etc.
                            # choose this unless we've already chosen some html
                            $content = $_
                                unless $content->[0] =~ m!html!;
                            next;
                        }
                        if ($_->[0] eq "text/plain") {
                            # choose this unless we have some html already
                            $content = $_
                                unless $content->[0] =~ m!html!;
                            next;
                        }
                    }
                    # if we didn't choose anything, pick the first one
                    $content =  $item->{'contents'}->[0]
                        unless $content;
                }

                # we ignore the 'mode' attribute of <content>. If it's "xml", we've
                # stringified it by accumulation; if it's "escaped", our parser
                # unescaped it
                # TODO: handle mode=base64?

                $item->{'text'} = $content->[1];
                delete $item->{'contents'};
            }

            # generate time
            my $w3time = $item->{'_atom_modified'} || $item->{'_atom_created'};
            my $time;
            if ($w3time) {
                # see http://www.w3.org/TR/NOTE-datetime for format
                # we insist on having granularity up to a minute,
                # and ignore finer data as well as the timezone, for now
                if ($w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)!) {
                    $time = "$1-$2-$3 $4:$5";
                }
            }
            if ($time) {
                $item->{'time'} = $time;
            }
            
            # get rid of all other tags we don't need anymore
            foreach (keys  %$item) {
                delete $item->{$_} if substr($_, 0, 6) eq '_atom_';
            }
            
            push @items, $item;
            undef $item;
            last TAGS;
        }
        if ($tag eq 'feed') {
            # finalize feed
            # get rid of all other tags we don't need anymore
            foreach (keys  %$feed) {
                delete $feed->{$_} if substr($_, 0, 6) eq '_atom_';
            }
            
            # link the feed with its itms
            $feed->{'items'} = \@items 
                if $feed;
            last TAGS;
        }
    }
    return;
}

sub Text {
    my $p = shift;

    # do nothing if there has been an error
    return if $error;

    $data .= $_ if defined $data;
}

sub PI {
    # ignore processing instructions
    return;
}

sub EndDocument {
    # if we parsed a feed, link items to it
    $feed->{'items'} = \@items 
        if $feed;
    return;
}


1;
init 2019-02-05 21:49:12 +00:00			`#!/usr/bin/perl`

			`use strict;`

			`package LJ::ParseFeed;`

			`use XML::RSS;`
			`use XML::Parser;`


			`# parse_feed parses an RSS/Atom feed`
			`# arguments: content and, optionally, type, specifying "atom" or`
			`# "rss". If type isn't supplied, the function will try to guess it`
			`# based on contents.`
			`# It returns $feed, which is a hash`
			`# with the following keys:`
			`# type - 'atom' or 'rss'`
			`# version - version of the feed in its standard`
			`# link - URL of the feed`
			`# title - title of the feed`
			`# description - description of the feed`
			`# # TODO: more kinds of info?`
			`#`
			`# items - arrayref of item hashes, in the same order they were in the feed`
			`# each item contains:`
			`# link - URL of the item`
			`# id - unique identifier (optional)`
			`# text - text of the item`
			`# subject - subject`
			`# time - in format 'yyyy-mm-dd hh:mm' (optional)`
			`# the second argument returned is $error, which, if defined, is a human-readable`
			`# error string. the third argument is arrayref of items, same as`
			`# $feed->{'items'}.`

			`sub parse_feed`
			`{`
			`my ($content, $type) = @_;`
			`my ($feed, $items, $error);`
			`my $parser;`

			`# is it RSS or Atom?`
			`# Atom feeds are rare for now, so prefer to err in favor of RSS`
			`# simple heuristic: Atom feeds will have '<feed' somewhere at the beginning`
			`# TODO: maybe store the feed's type on creation in a userprop and not guess here`

			`my $cut = substr($content, 0, 255);`
			`if ($type eq 'atom' \|\| $cut =~ m!\<feed!) {`
			`# try treating it as an atom feed`
			`$parser = new XML::Parser(Style=>'Stream', Pkg=>'LJ::ParseFeed::Atom');`
			`return ("", "failed to create XML parser") unless $parser;`
			`eval {`
			`$parser->parse($content);`
			`};`
			`if ($@) {`
			`$error = "XML parser error: $@";`
			`} else {`
			`($feed, $items, $error) = LJ::ParseFeed::Atom::results();`
			`};`

			`if ($feed \|\| $type eq 'atom') {`
			`# there was a top-level <feed> there, or we're forced to treat`
			`# as an Atom feed, so even if $error is set,`
			`# don't try RSS`
			`$feed->{'type'} = 'atom';`
			`return ($feed, $error, $items);`
			`}`
			`}`

			`# try parsing it as RSS`
			`$parser = new XML::RSS;`
			`return ("", "failed to create RSS parser") unless $parser;`
			`eval {`
			`$parser->parse($content);`
			`};`
			`if ($@) {`
			`$error = "RSS parser error: $@";`
			`return ("", $error);`
			`}`

			`$feed = {};`
			`$feed->{'type'} = 'rss';`
			`$feed->{'version'} = $parser->{'version'};`

			`foreach (qw (link title description lastBuildDate)) {`
			`$feed->{$_} = $parser->{'channel'}->{$_}`
			`if $parser->{'channel'}->{$_};`
			`}`

			`if ($parser->{'image'}->{'url'}) {`
			`$feed->{'image'} = $parser->{'image'}->{'url'};`
			`}`


			`$feed->{'lastmod'} = undef;`
			`$feed->{'items'} = [];`

			`foreach(@{$parser->{'items'}}) {`
			`my $item = {};`
			`$item->{'subject'} = $_->{'title'};`
			`$item->{'text'} = $_->{'description'};`
			`$item->{'link'} = $_->{'link'} if $_->{'link'};`
			`$item->{'id'} = $_->{'guid'} if $_->{'guid'};`

			`my $nsdc = 'http://purl.org/dc/elements/1.1/';`
			`my $nsenc = 'http://purl.org/rss/1.0/modules/content/';`
			`if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") {`
			`# prefer content:encoded if present`
			`$item->{'text'} = $_->{$nsenc}->{'encoded'}`
			`if defined $_->{$nsenc}->{'encoded'};`
			`}`

			`if ($_->{'pubDate'}) {`
			`my $time = time822_to_time($_->{'pubDate'});`
			`$item->{'time'} = $time if $time;`
			`}`
			`if ($_->{$nsdc} && ref($_->{$nsdc}) eq "HASH") {`
			`if ($_->{$nsdc}->{date}) {`
			`my $time = w3cdtf_to_time($_->{$nsdc}->{date});`
			`$item->{'time'} = $time if $time;`
			`}`
			`}`
			`if ($_->{'pubDate'} && (! $feed->{'lastmod'} \|\|`
			`(LJ::http_to_time($feed->{'lastmod'}) < LJ::http_to_time($_->{'pubDate'})))) {`
			`$feed->{'lastmod'} = $_->{'pubDate'};`
			`}`

			`push @{$feed->{'items'}}, $item;`
			`}`

			`return ($feed, undef, $feed->{'items'});`
			`}`

			`# convert rfc822-time in RSS's <pubDate> to our time`
			`# see http://www.faqs.org/rfcs/rfc822.html`
			`# RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,`
			`# but real RSS2.0 feeds apparently use 4 digits.`
			`sub time822_to_time {`
			`my $t822 = shift;`
			`# remove day name if present`
			`$t822 =~ s/^\s\w+\s,//;`
			`# remove whitespace`
			`$t822 =~ s/^\s*//;`
			`# break it up`
			`if ($t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)!) {`
			`my ($day, $mon, $year, $hour, $min) = ($1,$2,$3,$4,$5);`
			`$day = "0" . $day if length($day) == 1;`
			`$hour = "0" . $hour if length($hour) == 1;`
			`$mon = {'Jan'=>'01', 'Feb'=>'02', 'Mar'=>'03', 'Apr'=>'04',`
			`'May'=>'05', 'Jun'=>'06', 'Jul'=>'07', 'Aug'=>'08',`
			`'Sep'=>'09', 'Oct'=>'10', 'Nov'=>'11', 'Dec'=>'12'}->{$mon};`
			`return undef unless $mon;`
			`return "$year-$mon-$day $hour:$min";`
			`} else {`
			`return undef;`
			`}`
			`}`

			`# convert W3C-DTF to our internal format`
			`# see http://www.w3.org/TR/NOTE-datetime`
			`# Based very loosely on code from DateTime::Format::W3CDTF,`
			`# which isn't stable yet so we can't use it directly.`
			`sub w3cdtf_to_time {`
			`my $tw3 = shift;`

			`# TODO: Should somehow return the timezone offset`
			`# so that it can stored... but we don't do timezones`
			`# yet anyway. For now, just strip the timezone`
			`# portion if it is present, along with the decimal`
			`# fractions of a second.`

			`$tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}\|Z)$//;`
			`$tw3 =~ s/^\s//; $tw3 =~ s/\s$//; # Eat any superflous whitespace`

			`# We can only use complete times, so anything which`
			`# doesn't feature the time part is considered invalid.`

			`# This is working around clients that don't implement W3C-DTF`
			`# correctly, and only send single digit values in the dates.`
			`# 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z`
			`# If it's more messed up than that, reject it outright.`
			`$tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/`
			`or return undef;`

			`my %pd; # parsed date`
			`$pd{Y} = $1; $pd{M} = $2; $pd{D} = $3;`
			`$pd{h} = $4; $pd{m} = $5; $pd{s} = $6;`

			`# force double digits`
			`foreach (qw/ M D h m s /) {`
			`next unless defined $pd{$_};`
			`$pd{$_} = sprintf "%02d", $pd{$_};`
			`}`

			`return $pd{s} ? "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}" :`
			`"$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}";`
			`}`

			`package LJ::ParseFeed::Atom;`

			`our ($feed, $item, $data);`
			`our ($ddepth, $dholder); # for accumulating;`
			`our @items;`
			`our $error;`

			`sub err {`
			`$error = shift unless $error;`
			`}`

			`sub results {`
			`return ($feed, \@items, $error);`
			`}`

			`# $name under which we'll store accumulated data may be different`
			`# from $tag which causes us to store it`
			`# $name may be a scalarref pointing to where we should store`
			`# swallowing is achieved by calling startaccum('');`

			`sub startaccum {`
			`my $name = shift;`

			`return err("Tag found under neither <feed> nor <entry>")`
			`unless $feed \|\| $item;`
			`$data = ""; # defining $data triggers accumulation`
			`$ddepth = 1;`

			`$dholder = undef`
			`unless $name;`
			`# if $name is a scalarref, it's actually our $dholder`
			`if (ref($name) eq 'SCALAR') {`
			`$dholder = $name;`
			`} else {`
			`$dholder = ($item ? \$item->{$name} : \$feed->{$name})`
			`if $name;`
			`}`
			`return;`
			`}`

			`sub swallow {`
			`return startaccum('');`
			`}`

			`sub StartDocument {`
			`($feed, $item, $data) = (undef, undef, undef);`
			`@items = ();`
			`undef $error;`
			`}`

			`sub StartTag {`
			`# $_ carries the unparsed tag`
			`my ($p, $tag) = @_;`
			`my $holder;`

			`# do nothing if there has been an error`
			`return if $error;`

			`# are we just accumulating data?`
			`if (defined $data) {`
			`$data .= $_;`
			`$ddepth++;`
			`return;`
			`}`

			`# where we'll usually store info`
			`$holder = $item ? $item : $feed;`

			`TAGS: {`
			`if ($tag eq 'feed') {`
			`return err("Nested <feed> tags")`
			`if $feed;`
			`$feed = {};`
			`$feed->{'standard'} = 'atom';`
			`$feed->{'version'} = $_{'version'};`
			`# return err("No version specified in <feed>")`
			`# unless $feed->{'version'};`
			`# commented out as it's done in`
			`# http://code.livejournal.org/trac/livejournal/browser/trunk/cgi-bin/parsefeed.pl`
			`# (too many sites return this error now)`
			`return err("Incompatible version specified in <feed>")`
			`if $feed->{'version'} && $feed->{'version'} < 0.3;`
			`last TAGS;`
			`}`
			`if ($tag eq 'entry') {`
			`return err("Nested <entry> tags")`
			`if $item;`
			`$item = {};`
			`last TAGS;`
			`}`

			`# at this point, we must have a top-level <feed> or <entry>`
			`# to write into`
			`return err("Tag found under neither <feed> nor <entry>")`
			`unless $holder;`

			`if ($tag eq 'link') {`
			`# ignore links with rel= anything but alternate`
			`unless ($_{'rel'} eq 'alternate') {`
			`swallow();`
			`last TAGS;`
			`}`
			`$holder->{'link'} = $_{'href'};`
			`return err("No href attribute in <link>")`
			`unless $holder->{'link'};`
			`last TAGS;`
			`}`

			`if ($tag eq 'content') {`
			`return err("<content> outside <entry>")`
			`unless $item;`
			`# if type is multipart/alternative, we continue recursing`
			`# otherwise we accumulate`
			`my $type = $_{'type'} \|\| "text/plain";`
			`unless ($type eq "multipart/alternative") {`
			`push @{$item->{'contents'}}, [$type, ""];`
			`startaccum(\$item->{'contents'}->[-1]->[1]);`
			`last TAGS;`
			`}`
			`# it's multipart/alternative, so recurse, but don't swallow`
			`last TAGS;`
			`}`

			`# store tags which should require no further`
			`# processing as they are, and others under _atom_*, to be processed`
			`# in EndTag under </entry>`
			`if ($tag eq 'title') {`
			`if ($item) { # entry's subject`
			`startaccum("subject");`
			`} else { # feed's title`
			`startaccum($tag);`
			`}`
			`last TAGS;`
			`}`
			`if ($tag eq 'id') {`
			`unless ($item) {`
			`swallow(); # we don't need feed-level <id>`
			`} else {`
			`startaccum($tag);`
			`}`
			`last TAGS;`
			`}`

			`if ($tag eq 'tagline' && !$item) { # feed's tagline, our "description"`
			`startaccum("description");`
			`last TAGS;`
			`}`

			`# accumulate and store`
			`startaccum("_atom_" . $tag);`
			`last TAGS;`
			`}`

			`return;`
			`}`

			`sub EndTag {`
			`# $_ carries the unparsed tag`
			`my ($p, $tag) = @_;`

			`# do nothing if there has been an error`
			`return if $error;`

			`# are we accumulating data?`
			`if (defined $data) {`
			`$ddepth--;`
			`if ($ddepth == 0) { # stop accumulating`
			`$$dholder = $data`
			`if $dholder;`
			`undef $data;`
			`return;`
			`}`
			`$data .= $_;`
			`return;`
			`}`

			`TAGS: {`
			`if ($tag eq 'entry') {`
			`# finalize item...`
			`# generate suitable text from $item->{'contents'}`
			`my $content;`
			`$item->{'contents'} \|\|= [];`
			`unless (scalar(@{$item->{'contents'}}) >= 1) {`
			`# this item had no <content>`
			`# maybe it has <summary>? if so, use <summary>`
			`# TODO: type= or encoding issues here? perhaps unite`
			`# handling of <summary> with that of <content>?`
			`if ($item->{'_atom_summary'}) {`
			`$item->{'text'} = $item->{'_atom_summary'};`
			`delete $item->{'contents'};`
			`} else {`
			`# nothing to display, so ignore this entry`
			`undef $item;`
			`last TAGS;`
			`}`
			`}`

			`unless ($item->{'text'}) { # unless we already have text`
			`if (scalar(@{$item->{'contents'}}) == 1) {`
			`# only one <content> section`
			`$content = $item->{'contents'}->[0];`
			`} else {`
			`# several <content> section, must choose the best one`
			`foreach (@{$item->{'contents'}}) {`
			`if ($_->[0] eq "application/xhtml+xml") { # best match`
			`$content = $_;`
			`last; # don't bother to look at others`
			`}`
			`if ($_->[0] =~ m!html!) { # some kind of html/xhtml/html+xml, etc.`
			`# choose this unless we've already chosen some html`
			`$content = $_`
			`unless $content->[0] =~ m!html!;`
			`next;`
			`}`
			`if ($_->[0] eq "text/plain") {`
			`# choose this unless we have some html already`
			`$content = $_`
			`unless $content->[0] =~ m!html!;`
			`next;`
			`}`
			`}`
			`# if we didn't choose anything, pick the first one`
			`$content = $item->{'contents'}->[0]`
			`unless $content;`
			`}`

			`# we ignore the 'mode' attribute of <content>. If it's "xml", we've`
			`# stringified it by accumulation; if it's "escaped", our parser`
			`# unescaped it`
			`# TODO: handle mode=base64?`

			`$item->{'text'} = $content->[1];`
			`delete $item->{'contents'};`
			`}`

			`# generate time`
			`my $w3time = $item->{'_atom_modified'} \|\| $item->{'_atom_created'};`
			`my $time;`
			`if ($w3time) {`
			`# see http://www.w3.org/TR/NOTE-datetime for format`
			`# we insist on having granularity up to a minute,`
			`# and ignore finer data as well as the timezone, for now`
			`if ($w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)!) {`
			`$time = "$1-$2-$3 $4:$5";`
			`}`
			`}`
			`if ($time) {`
			`$item->{'time'} = $time;`
			`}`

			`# get rid of all other tags we don't need anymore`
			`foreach (keys %$item) {`
			`delete $item->{$_} if substr($_, 0, 6) eq '_atom_';`
			`}`

			`push @items, $item;`
			`undef $item;`
			`last TAGS;`
			`}`
			`if ($tag eq 'feed') {`
			`# finalize feed`
			`# get rid of all other tags we don't need anymore`
			`foreach (keys %$feed) {`
			`delete $feed->{$_} if substr($_, 0, 6) eq '_atom_';`
			`}`

			`# link the feed with its itms`
			`$feed->{'items'} = \@items`
			`if $feed;`
			`last TAGS;`
			`}`
			`}`
			`return;`
			`}`

			`sub Text {`
			`my $p = shift;`

			`# do nothing if there has been an error`
			`return if $error;`

			`$data .= $_ if defined $data;`
			`}`

			`sub PI {`
			`# ignore processing instructions`
			`return;`
			`}`

			`sub EndDocument {`
			`# if we parsed a feed, link items to it`
			`$feed->{'items'} = \@items`
			`if $feed;`
			`return;`
			`}`


			`1;`