ljr/local/bin/maint/stats.pl

549 lines
21 KiB
Perl
Executable File

#!/usr/bin/perl
#
use strict;
use vars qw(%maint);
require "$ENV{'LJHOME'}/cgi-bin/statslib.pl";
# filled in by ljmaint.pl, 0=quiet, 1=normal, 2=verbose
$LJ::Stats::VERBOSE = $LJ::LJMAINT_VERBOSE >= 2 ? 1 : 0;
$maint{'genstats'} = sub
{
my @which = @_ || qw(users countries
states gender clients
pop_interests meme pop_faq);
# popular faq items
LJ::Stats::register_stat
({ 'type' => "global",
'jobname' => "popfaq",
'statname' => "pop_faq",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
return undef unless $db;
my $sth = $db->prepare("SELECT faqid, COUNT(*) FROM faquses WHERE " .
"faqid<>0 GROUP BY 1 ORDER BY 2 DESC LIMIT 50");
$sth->execute;
die $db->errstr if $db->err;
my %ret;
while (my ($id, $count) = $sth->fetchrow_array) {
$ret{$id} = $count;
}
return \%ret;
},
});
# popular interests
LJ::Stats::register_stat
({ 'type' => "global",
'jobname' => "pop_interests",
'statname' => "pop_interests",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
return undef unless $db;
return {} if $LJ::DISABLED{'interests-popular'};
# see what the previous min was, then subtract 20% of max from it
my ($prev_min, $prev_max) = $db->selectrow_array("SELECT MIN(statval), MAX(statval) " .
"FROM stats WHERE statcat='pop_interests'");
my $stat_min = int($prev_min - (0.2*$prev_max));
$stat_min = 1 if $stat_min < 1;
my $sth = $db->prepare("SELECT interest, intcount FROM interests WHERE intcount>? " .
"ORDER BY intcount DESC, interest ASC LIMIT 400");
$sth->execute($stat_min);
die $db->errstr if $db->err;
my %ret;
while (my ($int, $count) = $sth->fetchrow_array) {
$ret{$int} = $count;
}
return \%ret;
},
});
# popular memes
LJ::Stats::register_stat
({ 'type' => "global",
'jobname' => "meme",
'statname' => "popmeme",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
return undef unless $db;
return {} if $LJ::DISABLED{'meme'};
my $sth = $db->prepare("SELECT url, count(*) FROM meme " .
"GROUP BY 1 ORDER BY 2 DESC LIMIT 100");
$sth->execute;
die $db->errstr if $db->err;
my %ret;
while (my ($url, $count) = $sth->fetchrow_array) {
$ret{$url} = $count;
}
return \%ret;
},
});
# clients
LJ::Stats::register_stat
({ 'type' => "global",
'jobname' => "clients",
'statname' => "client",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
return undef unless $db;
return {} if $LJ::DISABLED{'clientversionlog'};
my $tsql = "SELECT MAX(userid) FROM user";
$tsql .= " where userid < " . $LJ::LJR_IMPORTED_USERIDS if $LJ::LJR_IMPORTED_USERIDS;
my $usertotal = $db->selectrow_array($tsql);
my $blocks = LJ::Stats::num_blocks($usertotal);
my %ret;
foreach my $block (1..$blocks) {
my ($low, $high) = LJ::Stats::get_block_bounds($block);
$db = $db_getter->(); # revalidate connection
my $sth = $db->prepare("SELECT c.client, COUNT(*) AS 'count' FROM clients c, clientusage cu " .
"WHERE c.clientid=cu.clientid AND cu.userid BETWEEN $low AND $high " .
"AND cu.lastlogin > DATE_SUB(NOW(), INTERVAL 30 DAY) GROUP BY 1 ORDER BY 2");
$sth->execute;
die $db->errstr if $db->err;
while ($_ = $sth->fetchrow_hashref) {
$ret{$_->{'client'}} += $_->{'count'};
}
print LJ::Stats::block_status_line($block, $blocks);
}
return \%ret;
},
});
# user table analysis
LJ::Stats::register_stat
({ 'type' => "global",
'jobname' => "users",
'statname' => ["account", "newbyday", "age", "userinfo"],
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
return undef unless $db;
my $tsql = "SELECT MAX(userid) FROM user";
$tsql .= " where userid < " . $LJ::LJR_IMPORTED_USERIDS if $LJ::LJR_IMPORTED_USERIDS;
my $usertotal = $db->selectrow_array($tsql);
my $blocks = LJ::Stats::num_blocks($usertotal);
my %ret; # return hash, (statname => { arg => val } since 'statname' is arrayref above
# iterate over user table in batches
foreach my $block (1..$blocks) {
my ($low, $high) = LJ::Stats::get_block_bounds($block);
# user query: gets user,caps,age,status,allow_getljnews
$db = $db_getter->(); # revalidate connection
my $sth = $db->prepare
("SELECT user, caps, " .
"FLOOR((TO_DAYS(NOW())-TO_DAYS(bdate))/365.25) AS 'age', " .
"status, allow_getljnews " .
"FROM user WHERE userid BETWEEN $low AND $high");
$sth->execute;
die $db->errstr if $db->err;
while (my $rec = $sth->fetchrow_hashref) {
# account types
my $capnameshort = LJ::name_caps_short($rec->{'caps'});
$ret{'account'}->{$capnameshort}++;
# ages
$ret{'age'}->{$rec->{'age'}}++
if $rec->{'age'} > 4 && $rec->{'age'} < 110;
# users receiving news emails
$ret{'userinfo'}->{'allow_getljnews'}++
if $rec->{'status'} eq "A" && $rec->{'allow_getljnews'} eq "Y";
}
# userusage query: gets timeupdate,datereg,nowdate
my $sth = $db->prepare
("SELECT DATE_FORMAT(timecreate, '%Y-%m-%d') AS 'datereg', " .
"DATE_FORMAT(NOW(), '%Y-%m-%d') AS 'nowdate', " .
"UNIX_TIMESTAMP(timeupdate) AS 'timeupdate' " .
"FROM userusage WHERE userid BETWEEN $low AND $high");
$sth->execute;
die $db->errstr if $db->err;
while (my $rec = $sth->fetchrow_hashref) {
# date registered
$ret{'newbyday'}->{$rec->{'datereg'}}++
unless $rec->{'datereg'} eq $rec->{'nowdate'};
# total user/activity counts
$ret{'userinfo'}->{'total'}++;
if (my $time = $rec->{'timeupdate'}) {
my $now = time();
$ret{'userinfo'}->{'updated'}++;
$ret{'userinfo'}->{'updated_last30'}++ if $time > $now-60*60*24*30;
$ret{'userinfo'}->{'updated_last7'}++ if $time > $now-60*60*24*7;
$ret{'userinfo'}->{'updated_last1'}++ if $time > $now-60*60*24*1;
}
}
print LJ::Stats::block_status_line($block, $blocks);
}
return \%ret;
},
});
LJ::Stats::register_stat
({ 'type' => "clustered",
'jobname' => "countries",
'statname' => "country",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
my $cid = shift;
return undef unless $db && $cid;
my $upc = LJ::get_prop("user", "country");
die "Can't find country userprop. Database populated?\n" unless $upc;
my $tsql = "SELECT MAX(userid) FROM userproplite2";
$tsql .= " where userid < " . $LJ::LJR_IMPORTED_USERIDS if $LJ::LJR_IMPORTED_USERIDS;
my $usertotal = $db->selectrow_array($tsql);
my $blocks = LJ::Stats::num_blocks($usertotal);
my %ret;
foreach my $block (1..$blocks) {
my ($low, $high) = LJ::Stats::get_block_bounds($block);
$db = $db_getter->(); # revalidate connection
my $sth = $db->prepare("SELECT u.value, COUNT(*) AS 'count' FROM userproplite2 u " .
"LEFT JOIN clustertrack2 c ON u.userid=c.userid " .
"WHERE u.upropid=? AND u.value<>'' AND u.userid=c.userid " .
"AND u.userid BETWEEN $low AND $high " .
"AND (c.clusterid IS NULL OR c.clusterid=?)" .
"GROUP BY 1 ORDER BY 2");
$sth->execute($upc->{'id'}, $cid);
die "clusterid: $cid, " . $db->errstr if $db->err;
while ($_ = $sth->fetchrow_hashref) {
$ret{$_->{'value'}} += $_->{'count'};
}
print LJ::Stats::block_status_line($block, $blocks);
}
return \%ret;
},
});
LJ::Stats::register_stat
({ 'type' => "clustered",
'jobname' => "states",
'statname' => "stateus",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
my $cid = shift;
return undef unless $db && $cid;
my $upc = LJ::get_prop("user", "country");
die "Can't find country userprop. Database populated?\n" unless $upc;
my $ups = LJ::get_prop("user", "state");
die "Can't find state userprop. Database populated?\n" unless $ups;
my $tsql = "SELECT MAX(userid) FROM userproplite2";
$tsql .= " where userid < " . $LJ::LJR_IMPORTED_USERIDS if $LJ::LJR_IMPORTED_USERIDS;
my $usertotal = $db->selectrow_array($tsql);
my $blocks = LJ::Stats::num_blocks($usertotal);
my %ret;
foreach my $block (1..$blocks) {
my ($low, $high) = LJ::Stats::get_block_bounds($block);
$db = $db_getter->(); # revalidate connection
my $sth = $db->prepare("SELECT ua.value, COUNT(*) AS 'count' " .
"FROM userproplite2 ua, userproplite2 ub " .
"WHERE ua.userid=ub.userid AND ua.upropid=? AND " .
"ub.upropid=? and ub.value='US' AND ub.value<>'' " .
"AND ua.userid BETWEEN $low AND $high " .
"GROUP BY 1 ORDER BY 2");
$sth->execute($ups->{'id'}, $upc->{'id'});
die $db->errstr if $db->err;
while ($_ = $sth->fetchrow_hashref) {
$ret{$_->{'value'}} += $_->{'count'};
}
print LJ::Stats::block_status_line($block, $blocks);
}
return \%ret;
},
});
LJ::Stats::register_stat
({ 'type' => "clustered",
'jobname' => "gender",
'statname' => "gender",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
my $cid = shift;
return undef unless $db && $cid;
my $upg = LJ::get_prop("user", "gender");
die "Can't find gender userprop. Database populated?\n" unless $upg;
my $tsql = "SELECT MAX(userid) FROM userproplite2";
$tsql .= " where userid < " . $LJ::LJR_IMPORTED_USERIDS if $LJ::LJR_IMPORTED_USERIDS;
my $usertotal = $db->selectrow_array($tsql);
my $blocks = LJ::Stats::num_blocks($usertotal);
my %ret;
foreach my $block (1..$blocks) {
my ($low, $high) = LJ::Stats::get_block_bounds($block);
$db = $db_getter->(); # revalidate connection
my $sth = $db->prepare("SELECT value, COUNT(*) AS 'count' FROM userproplite2 up " .
"LEFT JOIN clustertrack2 c ON up.userid=c.userid " .
"WHERE up.upropid=? AND up.userid BETWEEN $low AND $high " .
"AND (c.clusterid IS NULL OR c.clusterid=?) GROUP BY 1");
$sth->execute($upg->{'id'}, $cid);
die "clusterid: $cid, " . $db->errstr if $db->err;
while ($_ = $sth->fetchrow_hashref) {
$ret{$_->{'value'}} += $_->{'count'};
}
print LJ::Stats::block_status_line($block, $blocks);
}
return \%ret;
},
});
# run stats
LJ::Stats::run_stats(@which);
#### dump to text file
print "-I- Dumping to a text file.\n";
{
my $dbh = LJ::Stats::get_db("dbh");
my $sth = $dbh->prepare("SELECT statcat, statkey, statval FROM stats ORDER BY 1, 2");
$sth->execute;
die $dbh->errstr if $dbh->err;
open (OUT, ">$LJ::HTDOCS/stats/stats.txt");
while (my @row = $sth->fetchrow_array) {
next if grep { $row[0] eq $_ } @LJ::PRIVATE_STATS;
print OUT join("\t", @row), "\n";
}
close OUT;
}
print "-I- Done.\n";
};
$maint{'genstats_size'} = sub {
LJ::Stats::register_stat
({ 'type' => "global",
'jobname' => "size-accounts",
'statname' => "size",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
return undef unless $db;
my $tsql = "SELECT MAX(userid) FROM user";
$tsql .= " where userid < " . $LJ::LJR_IMPORTED_USERIDS if $LJ::LJR_IMPORTED_USERIDS;
# not that this isn't a total of current accounts (some rows may have
# been deleted), but rather a total of accounts ever created
my $size = $db->selectrow_array($tsql);
return { 'accounts' => $size };
},
});
LJ::Stats::register_stat
({ 'type' => "clustered",
'jobname' => "size-accounts_active",
'statname' => "size",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
return undef unless $db;
my $period = 30; # one month is considered active
my $active = $db->selectrow_array
("SELECT COUNT(*) FROM clustertrack2 WHERE ".
"timeactive > UNIX_TIMESTAMP()-86400*$period");
return { 'accounts_active' => $active };
},
});
print "-I- Generating account size stats.\n";
LJ::Stats::run_stats("size-accounts", "size-accounts_active");
print "-I- Done.\n";
};
$maint{'genstats_weekly'} = sub
{
LJ::Stats::register_stat
({ 'type' => "global",
'jobname' => "supportrank",
'statname' => "supportrank",
'handler' =>
sub {
my $db_getter = shift;
return undef unless ref $db_getter eq 'CODE';
my $db = $db_getter->();
return undef unless $db;
my %supportrank;
my $rank = 0;
my $lastpoints = 0;
my $buildup = 0;
my $sth = $db->prepare
("SELECT u.userid, SUM(sp.points) AS 'points' " .
"FROM user u, supportpoints sp " .
"WHERE u.userid=sp.userid GROUP BY 1 ORDER BY 2 DESC");
$sth->execute;
die $db->errstr if $db->err;
while ($_ = $sth->fetchrow_hashref) {
if ($lastpoints != $_->{'points'}) {
$lastpoints = $_->{'points'};
$rank += (1 + $buildup);
$buildup = 0;
} else {
$buildup++;
}
$supportrank{$_->{'userid'}} = $rank;
}
# move old 'supportrank' stat to supportrank_prev
# no API for this :-/
{
my $dbh = LJ::Stats::get_db("dbh");
$dbh->do("DELETE FROM stats WHERE statcat='supportrank_prev'");
$dbh->do("UPDATE stats SET statcat='supportrank_prev' WHERE statcat='supportrank'");
}
return \%supportrank;
}
});
print "-I- Generating weekly stats.\n";
LJ::Stats::run_stats('supportrank');
print "-I- Done.\n";
};
$maint{'build_randomuserset'} = sub
{
## this sets up the randomuserset table daily (or whenever) that htdocs/random.bml uses to
## find a random user that is both 1) publicly listed in the directory, and 2) updated
## within the past 24 hours.
## note that if a user changes their privacy setting to not be in the database, it'll take
## up to 24 hours for them to be removed from the random.bml listing, but that's acceptable.
my $dbh = LJ::get_db_writer();
print "-I- Building randomuserset.\n";
$dbh->do("TRUNCATE TABLE randomuserset");
$dbh->do("REPLACE INTO randomuserset (userid) " .
"SELECT uu.userid FROM userusage uu, user u " .
"WHERE u.userid=uu.userid AND u.allow_infoshow='Y' " .
"AND uu.timeupdate > DATE_SUB(NOW(), INTERVAL 1 DAY) ORDER BY RAND() LIMIT 5000");
my $num = $dbh->selectrow_array("SELECT MAX(rid) FROM randomuserset");
$dbh->do("REPLACE INTO stats (statcat, statkey, statval) " .
"VALUES ('userinfo', 'randomcount', $num)");
print "-I- Done.\n";
};
$maint{'memeclean'} = sub
{
my $dbh = LJ::get_db_writer();
print "-I- Cleaning memes.\n";
my $sth = $dbh->prepare("SELECT statkey FROM stats WHERE statcat='popmeme'");
$sth->execute;
die $dbh->errstr if $dbh->err;
while (my $url = $sth->fetchrow_array) {
my $copy = $url;
LJ::run_hooks("canonicalize_url", \$copy);
unless ($copy) {
my $d = $dbh->quote($url);
$dbh->do("DELETE FROM stats WHERE statcat='popmeme' AND statkey=$d");
print " deleting: $url\n";
}
}
print "-I- Done.\n";
};
1;