#!/usr/bin/perl
#

use strict;
use vars qw(%maint %maintinfo);
use lib "$ENV{'LJHOME'}/cgi-bin";  # extra XML::Encoding files in cgi-bin/XML/*

use LWP::UserAgent;
use XML::RSS;
use HTTP::Status;

require "ljprotocol.pl";
require "parsefeed.pl";
require "cleanhtml.pl";

# The 'synsuck' maintenance task polls the source feed of every syndication
# account and posts any new items it finds into that account's journal.
$maintinfo{'synsuck'}{opts}{locking} = "per_host";
$maint{'synsuck'} = sub {
    my $maxcount = shift || 0;
    my $verbose = $LJ::LJMAINT_VERBOSE;

    my %child_jobs;  # child pid => [ userid, lock ]

    my $process_user = sub {
        my $urow = shift;
        return unless $urow;

        my ($user, $userid, $synurl, $lastmod, $etag, $readers) =
            map { $urow->{$_} } qw(user userid synurl lastmod etag numreaders);

        # we're a child process now, need to invalidate caches and
        # get a new database handle
        LJ::start_request();

        my $dbh = LJ::get_db_writer();

        # see if things have changed since we last looked and acquired the lock.
        # otherwise we could 1) check work, 2) get lock, and between 1 and 2 another
        # process could do both steps.  we don't want to duplicate work already done.
        my $now_checknext = $dbh->selectrow_array("SELECT checknext FROM syndicated ".
                                                  "WHERE userid=?", undef, $userid);
        return if $now_checknext ne $urow->{checknext};

        my $ua = LWP::UserAgent->new("timeout" => 10);
        my $reader_info = $readers ? "; $readers readers" : "";
        $ua->agent("$LJ::SITENAME ($LJ::ADMIN_EMAIL; for $LJ::SITEROOT/users/$user/" .
                   $reader_info . ")");

        # reschedule this feed's next check $minutes from now (plus a little
        # random backoff) and record why
        my $delay = sub {
            my $minutes = shift;
            my $status = shift;

            # add some random backoff to avoid waves building up
            $minutes += int(rand(5));

            $dbh->do("UPDATE syndicated SET lastcheck=NOW(), checknext=DATE_ADD(NOW(), ".
                     "INTERVAL ? MINUTE), laststatus=? WHERE userid=?",
                     undef, $minutes, $status, $userid);
        };

        print "[$$] Synsuck: $user ($synurl)\n" if $verbose;

        # conditional GET: only fetch the body if the feed changed since last time
        my $req = HTTP::Request->new("GET", $synurl);
        $req->header('If-Modified-Since', LJ::time_to_http($lastmod)) if $lastmod;
        $req->header('If-None-Match', $etag) if $etag;

        my ($content, $too_big);
        my $res = $ua->request($req, sub {
            if (length($content) > 1024*150) { $too_big = 1; return; }
            $content .= $_[0];
        }, 4096);

        if ($too_big) { $delay->(60, "toobig"); return; }

        if ($res->is_error()) {
            # http error
            print "HTTP error!\n" if $verbose;

            # overload parseerror here because it's already there -- we'll
            # never have both an http error and a parse error on the
            # same request
            $delay->(3*60, "parseerror");

            LJ::set_userprop($userid, "rssparseerror", $res->status_line());
            return;
        }

        # check if not modified
        if ($res->code() == RC_NOT_MODIFIED) {
            print " not modified.\n" if $verbose;
            $delay->($readers ? 60 : 24*60, "notmodified");
            return;
        }

        # WARNING: blatant XML spec violation ahead...
        #
        # Blogger doesn't produce valid XML, since they don't handle encodings
        # correctly.  So if we see they have no encoding (which implicitly means
        # UTF-8) but it's not valid UTF-8, say it's Windows-1252, which won't
        # cause XML::Parser to barf... but there will probably be some bogus
        # characters.  better than nothing I guess.  (personally, I'd prefer to
        # leave it broken and have people bitch at Blogger, but jwz wouldn't
        # stop bugging me)
        # XML::Parser doesn't include Windows-1252, but we put it in
        # cgi-bin/XML/* for it to find.
        my $encoding;
        if ($content =~ /<\?xml.+?>/ && $& =~ /encoding=([\"\'])(.+?)\1/) {
            $encoding = lc($2);
        }
        if (! $encoding && ! LJ::is_utf8($content)) {
            $content =~ s/\?>/ encoding='windows-1252' \?>/;
        }

        # WARNING: another hack...
        # People produce what they think is iso-8859-1, but they include
        # Windows-style smart quotes.  Check for invalid iso-8859-1 and correct.
        if ($encoding =~ /^iso-8859-1$/i && $content =~ /[\x80-\x9F]/) {
            # They claimed they were iso-8859-1, but they are lying.
            # Assume it was Windows-1252.
            print "Invalid ISO-8859-1; assuming Windows-1252...\n" if $verbose;
            $content =~ s/encoding=([\"\'])(.+?)\1/encoding='windows-1252'/;
        }

        # parsing time...
        my ($feed, $error) = LJ::ParseFeed::parse_feed($content);
        if ($error) {
            # parse error!
            print "Parse error! $error\n" if $verbose;
            $delay->(3*60, "parseerror");
            $error =~ s! at /.*!!;
            $error =~ s/^\n//;   # clean up the newline at the beginning of the line
            LJ::set_userprop($userid, "rssparseerror", $error);
            return;
        }

        # another sanity check
        unless (ref $feed->{'items'} eq "ARRAY") {
            $delay->(3*60, "noitems");
            return;
        }

        my @items = reverse @{$feed->{'items'}};

        # take most recent 20
        splice(@items, 0, @items-20) if @items > 20;

        # delete existing items older than the age which can show on a
        # friends view.
        my $su = LJ::load_userid($userid);
        my $udbh = LJ::get_cluster_master($su);
        unless ($udbh) {
            $delay->(15, "nodb");
            return;
        }

        # TAG:LOG2:synsuck_delete_olderitems
        my $secs = ($LJ::MAX_FRIENDS_VIEW_AGE || 3600*24*14)+0;  # 2 week default.
        my $sth = $udbh->prepare("SELECT jitemid, anum FROM log2 WHERE journalid=? AND ".
                                 "logtime < DATE_SUB(NOW(), INTERVAL $secs SECOND)");
        $sth->execute($userid);
        die $udbh->errstr if $udbh->err;
        while (my ($jitemid, $anum) = $sth->fetchrow_array) {
            print "DELETE itemid: $jitemid, anum: $anum... \n" if $verbose;
            if (LJ::delete_entry($su, $jitemid, 0, $anum)) {
                print "success.\n" if $verbose;
            } else {
                print "fail.\n" if $verbose;
            }
        }

        # determine if link tags are good or not, where good means
        # "likely to be unique per item".  some feeds have the same
        # <link> element for each item, which isn't good.
        # if we have unique ids, we don't compare link tags
        my ($compare_links, $have_ids) = (0, 0);
        {
            my %link_seen;
            foreach my $it (@items) {
                $have_ids = 1 if $it->{'id'};
                next unless $it->{'link'};
                $link_seen{$it->{'link'}} = 1;
            }
            $compare_links = 1 if !$have_ids and $feed->{'type'} eq 'rss'
                and scalar(keys %link_seen) == scalar(@items);
        }

        # if we have unique links/ids, load them for syndicated
        # items we already have on the server.  then, if we have one
        # already later and see it's changed, we'll do an editevent
        # instead of a new post.
        my %existing_item = ();
        if ($have_ids || $compare_links) {
            my $p = $have_ids ? LJ::get_prop("log", "syn_id") :
                LJ::get_prop("log", "syn_link");
            my $sth = $udbh->prepare("SELECT jitemid, value FROM logprop2 WHERE ".
                                     "journalid=? AND propid=? LIMIT 1000");
            $sth->execute($su->{'userid'}, $p->{'id'});
            while (my ($itemid, $id) = $sth->fetchrow_array) {
                $existing_item{$id} = $itemid;
            }
        }

        # post these items
        my $newcount = 0;
        my $errorflag = 0;
        my $mindate;  # "yyyy-mm-dd hh:mm:ss"
        my $notedate = sub {
            my $date = shift;
            $mindate = $date if ! $mindate || $date lt $mindate;
        };

        foreach my $it (@items) {

            # remove the SvUTF8 flag.  it's still UTF-8, but
            # we don't want perl knowing that and fucking stuff up
            # for us behind our back in random places all over
            # http://zilla.livejournal.org/show_bug.cgi?id=1037
            foreach my $attr (qw(subject text link)) {
                $it->{$attr} = pack('C*', unpack('C*', $it->{$attr}));
            }

            my $dig = LJ::md5_struct($it)->b64digest;
AND item=?", undef, $userid, $dig); if ($prevadd) { $notedate->($prevadd); next; } my $now_dateadd = $dbh->selectrow_array("SELECT NOW()"); die "unexpected format" unless $now_dateadd =~ /^\d\d\d\d\-\d\d\-\d\d \d\d:\d\d:\d\d$/; $dbh->do("INSERT INTO synitem (userid, item, dateadd) VALUES (?,?,?)", undef, $userid, $dig, $now_dateadd); $notedate->($now_dateadd); $newcount++; print "[$$] $dig - $it->{'subject'}\n" if $verbose; $it->{'text'} =~ s/^\s+//; $it->{'text'} =~ s/\s+$//; my $htmllink; if (defined $it->{'link'}) { $htmllink = "
"; } # Show the