#!/usr/bin/perl # use strict; use vars qw(%maint %maintinfo); use lib "$ENV{'LJHOME'}/cgi-bin"; # extra XML::Encoding files in cgi-bin/XML/* use LWP::UserAgent; use XML::RSS; use HTTP::Status; use Image::Size; require "ljprotocol.pl"; require "parsefeed.pl"; require "cleanhtml.pl"; require "talklib.pl"; require LWPx::ParanoidAgent; require LJR::unicode; use utf8; binmode STDOUT, ":utf8"; my $dumpxml = sub { my ($xdata, $username) = @_; open(my $outfile, ">$ENV{'LJHOME'}/logs/syn_err_" . $username . ".xml"); print $outfile "$xdata"; close($outfile); }; my $err = sub { my ($msg) = @_; print $msg . "\n"; return; }; my $get_picture = sub { my ($userid, $url) = @_; my $MAX_UPLOAD = 102400; my $ua = LWPx::ParanoidAgent->new(timeout => 30, max_size => $MAX_UPLOAD + 1024); $ua->agent("Synsuck::Userpic; $LJ::SITENAME; $LJ::ADMIN_EMAIL"); my $res = $ua->get($url); my $picdata = $res->content; return $err->("some error while getting userpic") if !($res && $res->is_success); return $err->("404 while getting userpic") if ($res && $res->{"_rc"} eq 404); return $err->("userpic size is bigger than we want") if length($picdata) > $MAX_UPLOAD; my ($sx, $sy, $filetype) = Image::Size::imgsize(\$picdata); return $err->("can't ge userpic size") unless defined $sx; return $err->("unknown userpic filetype") unless $filetype eq "GIF" || $filetype eq "JPG" || $filetype eq "PNG"; return $err->("userpic is bigger than we want") if $sx > 150 || $sy > 150; my $contenttype; if ($filetype eq "GIF") { $contenttype = 'G'; } elsif ($filetype eq "PNG") { $contenttype = 'P'; } elsif ($filetype eq "JPG") { $contenttype = 'J'; } my $base64 = Digest::MD5::md5_base64($picdata); my $picid = LJ::alloc_global_counter('P') or return; my $dbh = LJ::get_db_writer(); $dbh->do( "INSERT INTO userpic2" . "(picid, userid, fmt, width, height, picdate, md5base64, location, state, url) " . "VALUES (?, ?, ?, ?, ?, NOW(), ?, ?, 'N', ?)", undef, $picid, $userid, $contenttype, $sx, $sy, $base64, undef, $url); $dbh->do( "INSERT INTO userpicblob2 (userid, picid, imagedata) VALUES (?,?,?)", undef, $userid, $picid, $picdata); my $su = LJ::load_userid($userid); LJ::update_user($su, { defaultpicid => $picid }); }; $maintinfo{'synsuck'}{opts}{locking} = "per_host"; $maint{'synsuck'} = sub { my $maxcount = shift || 0; my $verbose = $LJ::LJMAINT_VERBOSE; my %child_jobs; # child pid => [ userid, lock ] my $process_user = sub { my $urow = shift; return unless $urow; my ($user, $userid, $synurl, $lastmod, $etag, $lastmod_feed, $readers) = map { $urow->{$_} } qw(user userid synurl lastmod etag lastmod_feed numreaders); # we're a child process now, need to invalidate caches and # get a new database handle LJ::start_request(); my $dbh = LJ::get_db_writer(); # see if things have changed since we last looked and acquired the lock. # otherwise we could 1) check work, 2) get lock, and between 1 and 2 another # process could do both steps. we don't want to duplicate work already done. my $now_checknext = $dbh->selectrow_array("SELECT checknext FROM syndicated ". "WHERE userid=?", undef, $userid); return if $now_checknext ne $urow->{checknext}; my $ua = LWP::UserAgent->new("timeout" => 30); my $reader_info = $readers ? "; $readers readers" : ""; $ua->agent("$LJ::SITENAME ($LJ::ADMIN_EMAIL; for $LJ::SITEROOT/users/$user/" . $reader_info . ")"); my $delay = sub { my $minutes = shift; my $status = shift; # add some random backoff to avoid waves building up $minutes += int(rand(5)); $dbh->do("UPDATE syndicated SET lastcheck=NOW(), checknext=DATE_ADD(NOW(), ". "INTERVAL ? MINUTE), laststatus=? WHERE userid=?", undef, $minutes, $status, $userid); }; print "[$$] Synsuck: fetching $user ($synurl)\n" if $verbose; my $req = HTTP::Request->new("GET", $synurl); $req->header('If-Modified-Since', $lastmod) if $lastmod; $req->header('If-None-Match', $etag) if $etag; my ($content, $too_big); my $max_size = $LJ::SYNSUCK_MAX_SIZE || 500; # in kb my $res = eval { $ua->request($req, sub { if (length($content) > 1024*$max_size) { $too_big = 1; return; } $content .= $_[0]; }, 4096); }; if ($@) { $delay->(120, "lwp_death"); return; } if ($too_big) { $delay->(60, "toobig"); return; } if ($res->is_error()) { # http error print " HTTP error! " . $res->status_line() . "\n" if $verbose; $delay->(3*60, "httperror"); # overload rssparseerror here because it's already there -- we'll # never have both an http error and a parse error on the # same request LJ::set_userprop($userid, "rssparseerror", $res->status_line()); return; } # check if not modified if ($res->code() == RC_NOT_MODIFIED) { print " not modified.\n" if $verbose; $delay->($readers ? 30 : 12*60, "notmodified"); return; } my $r_lastmod = $res->header('Last-Modified'); my $r_etag = $res->header('ETag'); # check again (feedburner.com, blogspot.com, etc.) if (($etag && $etag eq $r_etag) || ($lastmod && $lastmod eq $r_lastmod)) { print " not modified.\n" if $verbose; $delay->($readers ? 30 : 12*60, "notmodified"); return; } # force utf8; this helps from time to time ??? LJR::unicode::force_utf8(\$content); # parsing time... my ($feed, $error) = LJ::ParseFeed::parse_feed($content); if ($error) { # parse error! print "Parse error! $error\n" if $verbose; $delay->(3*60, "parseerror"); $error =~ s! at /.*!!; $error =~ s/^\n//; # cleanup of newline at the beggining of the line LJ::set_userprop($userid, "rssparseerror", $error); $dumpxml->($content, $user); return; } my $r_lastmod_feed = $feed->{'lastmod'}; my $r_lastmod_lastBuildDate = $feed->{'lastBuildDate'}; print " $lastmod \n $r_lastmod \n $etag \n $r_etag \n $lastmod_feed \n $r_lastmod_feed \n $r_lastmod_lastBuildDate \n "; # check last-modified for bogus web-servers if ($lastmod_feed && $lastmod_feed eq $r_lastmod_feed) { print " not modified..\n" if $verbose; $delay->($readers ? 30 : 12*60, "notmodified"); return; } # print " $lastmod \n $r_lastmod \n $etag \n $r_etag \n $lastmod_feed \n $r_lastmod_feed \n $r_lastmod_lastBuildDate \n "; # another sanity check unless (ref $feed->{'items'} eq "ARRAY") { $delay->(3*60, "noitems"); return; } # update userpic my $cur_pic = $dbh->selectrow_array( "SELECT url FROM userpic2, user WHERE user.userid=? and userpic2.userid=user.userid and userpic2.picid=user.defaultpicid", undef, $userid); if ( ($feed->{'image'} && $cur_pic && $cur_pic ne $feed->{'image'}) || ($feed->{'image'} && !$cur_pic) ) { $dbh->do("delete from userpic2 WHERE userid=?", undef, $userid); $dbh->do("delete from userpicblob2 WHERE userid=?", undef, $userid); $dbh->do("update user set user.defaultpicid=NULL where userid=?", undef, $userid); print "[$$] Synsuck: $user -- trying to fetch userpic from: " . $feed->{'image'} . " \n" if $verbose; $get_picture->($userid, $feed->{'image'}); } my @items = reverse @{$feed->{'items'}}; # delete existing items older than the age which can show on a # friends view. my $su = LJ::load_userid($userid); my $udbh = LJ::get_cluster_master($su); unless ($udbh) { $delay->(15, "nodb"); return; } # TAG:LOG2:synsuck_delete_olderitems # my $secs = ($LJ::MAX_FRIENDS_VIEW_AGE || 3600*24*14)+0; # 2 week default. # my $sth = $udbh->prepare("SELECT jitemid, anum FROM log2 WHERE journalid=? AND ". # "logtime < DATE_SUB(NOW(), INTERVAL $secs SECOND)"); # $sth->execute($userid); # die $udbh->errstr if $udbh->err; # while (my ($jitemid, $anum) = $sth->fetchrow_array) { # print "DELETE itemid: $jitemid, anum: $anum... \n" if $verbose; # if (LJ::delete_entry($su, $jitemid, 0, $anum)) { # print "success.\n" if $verbose; # } else { # print "fail.\n" if $verbose; # } # } my $count = $udbh->selectrow_array("SELECT COUNT(*) FROM log2 WHERE journalid=$userid"); print "count = $count \n"; my $extra = $count - $LJ::MAX_SCROLLBACK_FRIENDS_SINGLE_USER_ACTIVITY; if ($extra > 0) { my $sth = $udbh->prepare("SELECT jitemid FROM logprop2 WHERE journalid=$userid". " ORDER BY jitemid ASC LIMIT $extra"); $sth->execute(); while (my ($jitemid) = $sth->fetchrow_array) { print "DELETE itemid: $jitemid... \n" if $verbose; if (LJ::delete_entry($su, $jitemid)) { print "success.\n" if $verbose; } else { print "fail.\n" if $verbose; } } } # determine if link tags are good or not, where good means # "likely to be a unique per item". some feeds have the same # element for each item, which isn't good. # if we have unique ids, we don't compare link tags my ($compare_links, $have_ids) = 0; { my %link_seen; foreach my $it (@items) { $have_ids = 1 if $it->{'id'}; next unless $it->{'link'}; $link_seen{$it->{'link'}} = 1; } $compare_links = 1 if !$have_ids and $feed->{'type'} eq 'rss' and scalar(keys %link_seen) == scalar(@items); } # if we have unique links/ids, load them for syndicated # items we already have on the server. then, if we have one # already later and see it's changed, we'll do an editevent # instead of a new post. my %existing_item = (); if ($have_ids || $compare_links) { my $p = $have_ids ? LJ::get_prop("log", "syn_id") : LJ::get_prop("log", "syn_link"); my $sth = $udbh->prepare("SELECT jitemid, value FROM logprop2 WHERE ". "journalid=? AND propid=? ORDER BY jitemid DESC LIMIT 1000"); # need last 100 $sth->execute($su->{'userid'}, $p->{'id'}); while (my ($jitemid, $id) = $sth->fetchrow_array) { if (!defined $existing_item{$id}) { $existing_item{$id} = $jitemid; # print "Got it: $jitemid, $id\n" if $verbose; } else { # remove duplicates - if any: print "DELETE duplicated itemid: $jitemid, $id ...\n" if $verbose; if (LJ::delete_entry($su, $jitemid)) { print "success.\n" if $verbose; } else { print "fail.\n" if $verbose; } } } } # post these items my $newcount = 0; my $errorflag = 0; my $mindate; # "yyyy-mm-dd hh:mm:ss"; my $notedate = sub { my $date = shift; $mindate = $date if ! $mindate || $date lt $mindate; }; LJ::load_user_props($su, { use_master => 1 }, "newesteventtime"); ### if ($su->{'newesteventtime'} eq $oldevent->{'eventtime'}) { ### LJ::set_userprop($su, "newesteventtime", undef); ### } foreach my $it (@items) { if ($it->{'time'} && $it->{'time'} lt $su->{'newesteventtime'}) { ## print "---- " . $it->{'subject'} . "\n" if $verbose; next; } elsif ($it->{'time'} && $it->{'time'} eq $su->{'newesteventtime'}) { print "==== " . $it->{'subject'} . "\n" if $verbose; } else { print "++++ " . $it->{'subject'} . "\n" if $verbose; } my $dig = LJ::md5_struct($it)->b64digest; my $prevadd = $dbh->selectrow_array("SELECT MAX(dateadd) FROM synitem WHERE ". "userid=? AND item=?", undef, $userid, $dig); if ($prevadd) { $notedate->($prevadd); next; } my $now_dateadd = $dbh->selectrow_array("SELECT NOW()"); die "unexpected format" unless $now_dateadd =~ /^\d\d\d\d\-\d\d\-\d\d \d\d:\d\d:\d\d$/; $dbh->do("INSERT INTO synitem (userid, item, dateadd) VALUES (?,?,?)", undef, $userid, $dig, $now_dateadd); $notedate->($now_dateadd); $newcount++; print "[$$] $dig - $it->{'subject'}\n" if $verbose; $it->{'text'} =~ s/^\s+//; $it->{'text'} =~ s/\s+$//; # process lj-cuts while ($it->{'text'} =~ /\G(.*?)\<\/a\>(.*)/sg) { my $before = $1; my $cutid = $2; my $rcut = $3; $rcut =~ s/\<\/a\>/\<\/lj\-cut\>/; $it->{'text'} = $before . "" . $rcut; } my $htmllink; if (defined $it->{'link'}) { $htmllink = ""; } # Show the link if it's present and different than the # . # [zilla: 267] Patch: Chaz Meyers if ( defined $it->{'id'} && $it->{'id'} ne $it->{'link'} && $it->{'id'} =~ m!^http://! ) { $htmllink .= ""; } # rewrite relative URLs to absolute URLs, but only invoke the HTML parser # if we see there's some image or link tag, to save us some work if it's # unnecessary (the common case) if ($it->{'text'} =~ /<(?:img|a)\b/i) { # TODO: support XML Base? http://www.w3.org/TR/xmlbase/ my $base_href = $it->{'link'} || $synurl; LJ::CleanHTML::resolve_relative_urls(\$it->{'text'}, $base_href); } # $own_time==1 means we took the time from the feed rather than localtime my ($own_time, $year, $mon, $day, $hour, $min); if ($it->{'time'} && $it->{'time'} =~ m!^(\d\d\d\d)-(\d\d)-(\d\d) (\d\d):(\d\d)!) { $own_time = 1; ($year, $mon, $day, $hour, $min) = ($1,$2,$3,$4,$5); } else { $own_time = 0; my @now = localtime(); ($year, $mon, $day, $hour, $min) = ($now[5]+1900, $now[4]+1, $now[3], $now[2], $now[1]); } my $command = "postevent"; my $req = { 'username' => $user, 'ver' => 1, 'subject' => $it->{'subject'}, 'event' => "$it->{'text'}", 'year' => $year, 'mon' => $mon, 'day' => $day, 'hour' => $hour, 'min' => $min, 'props' => { 'syn_link' => $it->{'link'}, 'opt_nocomments' => 1, }, }; $req->{'props'}->{'syn_id'} = $it->{'id'} if $it->{'id'}; my $flags = { 'nopassword' => 1, }; # if the post contains html linebreaks, assume it's preformatted. if ($it->{'text'} =~ /<(?:p|br)\b/i) { $req->{'props'}->{'opt_preformatted'} = 1; } # do an editevent if we've seen this item before my $id = $have_ids ? $it->{'id'} : $it->{'link'}; my $old_itemid = $existing_item{$id}; if ($id && $old_itemid) { $newcount--; # cancel increment above $command = "editevent"; $req->{'itemid'} = $old_itemid; # the editevent requires us to resend the date info, which # we have to go fetch first, in case the feed doesn't have it # TAG:LOG2:synsuck_fetch_itemdates unless($own_time) { my $origtime = $udbh->selectrow_array("SELECT eventtime FROM log2 WHERE ". "journalid=? AND jitemid=?", undef, $su->{'userid'}, $old_itemid); $origtime =~ /(\d\d\d\d)-(\d\d)-(\d\d) (\d\d):(\d\d)/; $req->{'year'} = $1; $req->{'mon'} = $2; $req->{'day'} = $3; $req->{'hour'} = $4; $req->{'min'} = $5; } } my $err; my $res = LJ::Protocol::do_request($command, $req, \$err, $flags); unless ($res && ! $err) { print " Error: $err\n" if $verbose; $errorflag = 1; } } # delete some unneeded synitems. the limit 1000 is because # historically we never deleted and there are accounts with # 222,000 items on a myisam table, and that'd be quite the # delete hit. # the 14 day interval is because if a remote site deleted an # entry, it's possible for the oldest item that was previously # gone to reappear, and we want to protect against that a # little. if ($LJ::SYNITEM_CLEAN) { $dbh->do("DELETE FROM synitem WHERE userid=? AND ". "dateadd < ? - INTERVAL 14 DAY LIMIT 1000", undef, $userid, $mindate); } $dbh->do("UPDATE syndicated SET oldest_ourdate=? WHERE userid=?", undef, $mindate, $userid); # bail out if errors, and try again shortly if ($errorflag) { $delay->(30, "posterror"); return; } # update syndicated account's userinfo if necessary LJ::load_user_props($su, "url", "urlname"); { my $title = $feed->{'title'}; $title = $su->{'user'} unless LJ::is_utf8($title); $title =~ s/[\n\r]//g; if ($title && $title ne $su->{'name'}) { LJ::update_user($su, { name => $title }); } if ($title) { LJ::set_userprop($su, "urlname", $title); } else { LJ::set_userprop($su, "urlname", $su->{'url'}); } my $link = $feed->{'link'}; if ($link && $link ne $su->{'url'}) { LJ::set_userprop($su, "url", $link); } my $des = $feed->{'description'}; if ($des) { my $bio; if ($su->{'has_bio'} eq "Y") { $bio = $udbh->selectrow_array("SELECT bio FROM userbio WHERE userid=?", undef, $su->{'userid'}); } if ($bio ne $des && $bio !~ /\[LJ:KEEP\]/) { if ($des) { $su->do("REPLACE INTO userbio (userid, bio) VALUES (?,?)", undef, $su->{'userid'}, $des); } else { $su->do("DELETE FROM userbio WHERE userid=?", undef, $su->{'userid'}); } LJ::update_user($su, { has_bio => ($des ? "Y" : "N") }); LJ::MemCache::delete([$su->{'userid'}, "bio:$su->{'userid'}"]); } } } # decide when to poll next (in minutes). # FIXME: this is super lame. (use hints in RSS file!) my $int = $newcount ? 15 : 30; my $status = $newcount ? "ok" : "nonew"; my $updatenew = $newcount ? ", lastnew=NOW()" : ""; # update reader count while we're changing things, but not # if feed is stale (minimize DB work for inactive things) if ($newcount || ! defined $readers) { $readers = $dbh->selectrow_array("SELECT COUNT(*) FROM friends WHERE ". "friendid=?", undef, $userid); } # if readers are gone, don't check for a whole day $int = 60*24 if $readers && $readers < 2 || !$readers; $dbh->do("UPDATE syndicated SET checknext=DATE_ADD(NOW(), INTERVAL $int MINUTE), ". "lastcheck=NOW(), lastmod=?, etag=?, lastmod_feed=? , laststatus=?, numreaders=? $updatenew ". "WHERE userid=$userid", undef, $r_lastmod, $r_etag, $r_lastmod_feed, $status, $readers); }; ### ### child process management ### # get the next user to be processed my @all_users; my $get_next_user = sub { return shift @all_users if @all_users; # need to get some more rows my $dbh = LJ::get_db_writer(); my $current_jobs = join(",", map { $dbh->quote($_->[0]) } values %child_jobs); my $in_sql = " AND u.userid NOT IN ($current_jobs)" if $current_jobs; my $sth = $dbh->prepare("SELECT u.user, s.userid, s.synurl, s.lastmod, " . " s.etag, s.lastmod_feed, s.numreaders, s.checknext " . "FROM user u, syndicated s " . "WHERE u.userid=s.userid AND u.statusvis='V' " . "AND s.checknext < NOW()$in_sql " . "ORDER BY RAND() LIMIT 500"); $sth->execute; while (my $urow = $sth->fetchrow_hashref) { push @all_users, $urow; } return undef unless @all_users; return shift @all_users; }; # fork and manage child processes my $max_threads = $LJ::SYNSUCK_MAX_THREADS || 1; print "[$$] PARENT -- using $max_threads workers\n" if $verbose; my $threads = 0; my $userct = 0; my $keep_forking = 1; while ( $maxcount == 0 || $userct < $maxcount ) { if ($threads < $max_threads && $keep_forking) { my $urow = $get_next_user->(); unless ($urow) { $keep_forking = 0; next; } my $lockname = "synsuck-user-" . $urow->{user}; my $lock = LJ::locker()->trylock($lockname); next unless $lock; print "Got lock on '$lockname'. Running\n" if $verbose; # spawn a new process if (my $pid = fork) { # we are a parent, nothing to do? $child_jobs{$pid} = [$urow->{'userid'}, $lock]; $threads++; $userct++; } else { # handles won't survive the fork LJ::disconnect_dbs(); $process_user->($urow); exit 0; } # wait for child(ren) to die } else { my $child = wait(); last if $child == -1; delete $child_jobs{$child}; $threads--; } } # Now wait on any remaining children so we don't leave zombies behind. while ( %child_jobs ) { my $child = wait(); last if $child == -1; delete $child_jobs{ $child }; $threads--; } print "[$$] $userct users processed\n" if $verbose; return; }; 1; # Local Variables: # mode: perl # c-basic-indent: 4 # indent-tabs-mode: nil # End: