ljr/livejournal/bin/upgrading/migrate-userpics.pl

319 lines
11 KiB
Prolog
Executable File

#!/usr/bin/perl
use strict;
use lib "$ENV{LJHOME}/cgi-bin";
require 'ljlib.pl';
use LJ::Blob;
use LJ::User;
use Getopt::Long;
use IPC::Open3;
use Digest::MD5;
# this script is a migrator that moves userpics from an old storage method
# into mogilefs.
# the basic theory is that we iterate over all clusters, find all userpics that
# aren't in mogile right now, and put them there
# determine what to do from the command line options
# command line switches; these lexicals are also read by the helper subs
# further down in this file
my ($one, $besteffort, $dryrun, $user, $verify, $verbose, $clusters, $purge);

# option name => variable that receives its value
my @option_spec = (
    "best-effort" => \$besteffort,
    "one"         => \$one,
    "dry-run"     => \$dryrun,
    "user=s"      => \$user,
    "verify"      => \$verify,
    "verbose"     => \$verbose,
    "purge-old"   => \$purge,
    "clusters=s"  => \$clusters,
);

# help text, only shown when the command line could not be parsed
my $usage = <<ERRMSG;
This script supports the following command line arguments:
--clusters=X[-Y]
Only handle clusters in this range. You can specify a
single number, or a range of two numbers with a dash.
--user=username
Only move this particular user.
--one
Only move one user. (But it moves all their pictures.)
This is used for testing.
--verify
If specified, this option will reload the userpic from
MogileFS and make sure it's been stored successfully.
--dry-run
If on, do not update the database. This mode will put the
userpic in MogileFS and give you paths to examine the picture
and make sure everything is okay. It will not update the
userpic2 table, though.
--best-effort
Normally, if a problem is encountered (null userpic, md5
mismatch, connection failure, etc) the script will die to
make sure everything goes well. With this flag, we don't
die and instead just print to standard error.
--purge-old
Sometimes we run into data that is for users that have since
moved to a different cluster. Normally we ignore it, but
with this option, we'll clean that data up as we find it.
--verbose
Be very chatty.
ERRMSG

GetOptions(@option_spec)
    or die $usage;
# refuse to run unless the site config gives us somewhere to put userpics:
# a 'userpics' MogileFS class and a live MogileFS client object
unless (defined $LJ::MOGILEFS_CONFIG{classes}->{userpics}) {
    die "Please define a 'userpics' class in your \%LJ::MOGILEFS_CONFIG\n";
}
unless ($LJ::MogileFS) {
    die "Unable to find MogileFS object (\%LJ::MOGILEFS_CONFIG not setup?)\n";
}

# best-effort mode reports its problems on STDERR, so turn on autoflush for
# that handle (select it, set $|, then restore the previously selected handle)
if ($besteffort) {
    select((select(STDERR), $| = 1)[0]);
}
# operation modes: either migrate one named user (--user), or sweep the
# requested clusters and migrate every user that still has userpics outside
# of mogile
if ($user) {
    # move a single user
    my $u = LJ::load_user($user);
    die "No such user: $user\n" unless $u;
    handle_userid($u->{userid}, $u->{clusterid});
} else {
    # work out which clusters to visit.  test with defined() rather than for
    # truth: "--clusters=0" (or an empty value) is false in Perl and would
    # otherwise be mistaken for "no --clusters given" and sweep everything.
    my @clusters;
    if (defined $clusters) {
        if ($clusters =~ /\A(\d+)(?:-(\d+))?\z/) {
            my $min = $1 + 0;
            # a single number is a one-cluster range; defined() so that an
            # upper bound of 0 is not silently replaced by the lower bound
            my $max = defined $2 ? $2 + 0 : $min;
            # a reversed range expands to nothing and the script would then
            # "succeed" without touching a single cluster
            die "Error: --clusters range is reversed ($min-$max).\n"
                if $min > $max;
            @clusters = ($min .. $max);
        } else {
            die "Error: --clusters argument not of right format.\n";
        }
    } else {
        @clusters = @LJ::CLUSTERS;
    }

    # now iterate over the clusters to pick
    my $ctotal = scalar(@clusters);
    my $ccount = 0;
    foreach my $cid (sort { $a <=> $b } @clusters) {
        # status report
        $ccount++;
        print "\nChecking cluster $cid...\n\n";

        # get a handle
        my $dbcm = get_db_handle($cid);

        # get all userids that still have a userpic stored outside mogile;
        # with --one we only ask for a single user
        print "Getting userids...\n";
        my $limit = $one ? 'LIMIT 1' : '';
        my $userids = $dbcm->selectcol_arrayref
            ("SELECT DISTINCT userid FROM userpic2 WHERE location <> 'mogile' OR location IS NULL $limit");
        my $total = scalar(@$userids);

        # iterate over userids ($total cannot be zero inside the loop, so
        # the percentage below never divides by zero)
        my $count = 0;
        print "Beginning iteration over userids...\n";
        foreach my $userid (@$userids) {
            # progress prefix: percent of this cluster's users, cluster N of M
            my $extra = sprintf("[%6.2f%%, $ccount of $ctotal] ", (++$count/$total*100));
            handle_userid($userid, $cid, $extra);
        }

        # don't hit up more clusters
        last if $one;
    }
}
print "\n";
print "Updater terminating.\n";
#############################################################################
### helper subs down here
# take a userid and move their pictures. the return value is not meaningful
# and callers ignore it: a problem either makes the script die or, with
# --best-effort, is printed to standard error and that picture is skipped
# handle_userid($userid, $cid, $extra)
#
# Copy every userpic of one user that is not yet in MogileFS from the blob
# store into MogileFS, and (unless --dry-run) flag the userpic2 rows as
# location = 'mogile'.
#
#   $userid - id of the user to migrate
#   $cid    - cluster on which the userpic2 rows were found
#   $extra  - optional progress prefix printed in front of the user name
#
# If the user now lives on another cluster, or is expunged, the rows found
# on $cid are stale: they are ignored, or deleted when --purge-old is given.
# The return value carries no information.  Dies if the user cannot be
# loaded; per-picture problems die unless --best-effort is on.
sub handle_userid {
    my ($userid, $cid, $extra) = @_;

    # --user mode calls us without a progress prefix
    $extra = '' unless defined $extra;

    # load user to move and do some sanity checks; if the first load fails,
    # restart the LJ request and try once more before giving up
    my $u = LJ::load_userid($userid);
    unless ($u) {
        LJ::end_request();
        LJ::start_request();
        $u = LJ::load_userid($userid);
    }
    die "ERROR: Unable to load userid $userid\n"
        unless $u;

    # if they're expunged, they might have data somewhere if they were
    # copy-moved from A to B, then expunged on B.  now we're on A and
    # need to delete it ourselves (if purge-old is on)
    if ($u->{clusterid} == 0 && $u->{statusvis} eq "X") {
        _purge_old_userpic_rows($cid, $u->{userid}) if $purge;
        return;
    }

    # get a handle
    my $dbcm = get_db_handle($u->{clusterid});

    # print that we're doing this user
    print "$extra$u->{user}($u->{userid})\n";

    # if a user has been moved to another cluster, but the source data from
    # userpic2 wasn't deleted, we need to ignore the user or purge their data
    if ($u->{clusterid} != $cid) {
        return unless $purge;

        # only purge the old side if they have some rows on the new side
        my $count = $dbcm->selectrow_array
            ("SELECT COUNT(*) FROM userpic2 WHERE userid = ?",
             undef, $u->{userid});
        return unless $count;

        _purge_old_userpic_rows($cid, $u->{userid});
        return;
    }

    # get all their photos that aren't in mogile already
    my $picids = $dbcm->selectall_arrayref
        ("SELECT picid, md5base64, fmt FROM userpic2 WHERE userid = ? AND (location <> 'mogile' OR location IS NULL)",
         undef, $u->{userid});
    return unless @$picids;

    # now we have a userid and picids, get the photos from the blob server
    foreach my $row (@$picids) {
        my ($picid, $md5, $fmt) = @$row;
        print "\tstarting move for picid $picid\n"
            if $verbose;

        # userpic2.fmt is a one-letter code; the blob store wants an extension
        my $format = { G => 'gif', J => 'jpg', P => 'png' }->{$fmt};
        my $data = LJ::Blob::get($u, "userpic", $format, $picid);

        # an empty (or missing) blob cannot be migrated
        my $len = length($data);
        unless ($len) {
            _skip_or_die(
                "empty_userpic userid=$u->{userid} picid=$picid\n",
                "\twarning: empty userpic.\n\n",
                "Error: data from blob empty ($u->{user}, 'userpic', $format, $picid)\n",
            );
            next;
        }

        # verify the md5 of this picture with what's in the database
        my $blobmd5 = Digest::MD5::md5_base64($data);
        if ($md5 ne $blobmd5) {
            _skip_or_die(
                "md5_mismatch userid=$u->{userid} picid=$picid dbmd5=$md5 blobmd5=$blobmd5\n",
                "\twarning: md5 mismatch; database=$md5, blobserver=$blobmd5\n\n",
                "\tError: data from blobserver md5 mismatch: database=$md5, blobserver=$blobmd5\n",
            );
            next;
        }
        print "\tverified md5; database=$md5, blobserver=$blobmd5\n"
            if $verbose;

        # get filehandle to Mogile and put the file there
        print "\tdata length = $len bytes, uploading to MogileFS...\n"
            if $verbose;
        my $key = $u->mogfs_userpic_key($picid);
        my $fh = $LJ::MogileFS->new_file($key, 'userpics');
        unless ($fh) {
            _skip_or_die(
                "new_file_failed userid=$u->{userid} picid=$picid\n",
                "\twarning: failed in call to new_file\n\n",
                "Unable to get filehandle to save file to MogileFS\n",
            );
            next;
        }

        # now save the file and close the handle; a failed store is reported
        # through the return value of close
        $fh->print($data);
        my $rv = $fh->close;
        unless ($rv) {
            _skip_or_die(
                "close_failed userid=$u->{userid} picid=$picid reason=$@\n",
                "\twarning: failed in call to close: $@\n\n",
                "Unable to save file to MogileFS: $@\n",
            );
            next;
        }

        # extra verification: read the picture back and compare the bytes
        if ($verify) {
            my $data2 = $LJ::MogileFS->get_file_data($key);
            my $eq = ($data2 && $$data2 eq $data) ? 1 : 0;
            unless ($eq) {
                _skip_or_die(
                    "verify_failed userid=$u->{userid} picid=$picid\n",
                    "\twarning: verify failed; picture not updated\n\n",
                    "\tERROR: picture NOT stored successfully, content mismatch\n",
                );
                next;
            }
            print "\tverified length = " . length($$data2) . " bytes...\n"
                if $verbose;
        }

        # done moving this picture; only now mark the row as migrated
        unless ($dryrun) {
            print "\tupdating database for this picture...\n"
                if $verbose;
            $dbcm->do("UPDATE userpic2 SET location = 'mogile' WHERE userid = ? AND picid = ?",
                      undef, $u->{userid}, $picid);
        }

        # get the paths so the user can verify if they want
        if ($verbose) {
            my @paths = $LJ::MogileFS->get_paths($key, 1);
            print "\tverify mogile path: $_\n" foreach @paths;
            print "\tverify site url: $LJ::SITEROOT/userpic/$picid/$u->{userid}\n";
            print "\tpicture update complete.\n\n";
        }
    }
}

# delete the stale userpic2 rows of $userid on cluster $cid.  under --dry-run
# nothing is deleted and we only say (when verbose) that a delete is needed.
sub _purge_old_userpic_rows {
    my ($cid, $userid) = @_;

    # fetch the handle first so that an unreachable cluster is still
    # reported during a dry run
    my $to_purge_dbcm = get_db_handle($cid);

    if ($dryrun) {
        print "\tnotice: need to delete userpic2 rows.\n\n"
            if $verbose;
        return;
    }

    my $ct = $to_purge_dbcm->do("DELETE FROM userpic2 WHERE userid = ?", undef, $userid);
    # DBI reports zero affected rows as the string "0E0"; numify for display
    $ct += 0;
    print "\tnotice: purged $ct old rows.\n\n"
        if $verbose;
    return;
}

# report a problem with one picture.  without --best-effort this dies with
# $fatal; with it, $logline goes to STDERR, $warning to STDOUT (when verbose)
# and we return so the caller can skip to the next picture.
sub _skip_or_die {
    my ($logline, $warning, $fatal) = @_;

    die $fatal unless $besteffort;

    print STDERR $logline;
    print $warning if $verbose;
    return;
}
# get_db_handle($clusterid)
#
# Return a raw master database handle for the given cluster, prepared for a
# long-running job: the session wait_timeout is set to 28800 (presumably
# seconds, i.e. 8 hours - confirm against the database in use) and
# RaiseError is switched on, so later failed statements die instead of
# returning an error.  Dies when no handle can be obtained or the timeout
# cannot be set.
sub get_db_handle {
    my ($clusterid) = @_;

    my $handle = LJ::get_cluster_master({ raw => 1 }, $clusterid);
    if (!$handle) {
        print STDERR "handle_unavailable clusterid=$clusterid\n";
        die "ERROR: unable to get raw handle to cluster $clusterid\n";
    }

    # RaiseError is not on yet, so check the handle's error state by hand
    eval {
        $handle->do("SET wait_timeout = 28800");
        if ($handle->err) {
            die $handle->errstr;
        }
    };
    if ($@) {
        die "Couldn't set wait_timeout on $clusterid: $@\n";
    }

    $handle->{'RaiseError'} = 1;
    return $handle;
}