This commit is contained in:
2019-02-06 00:49:12 +03:00
commit 8dbb1bb605
4796 changed files with 506072 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
BINARY attributes to mysql columns

View File

@@ -0,0 +1,14 @@
#!/usr/bin/perl
#
# Placeholder script: it contains no executable Perl statements.
# Perl stops parsing at the __END__ marker, so the shell commands below are
# never run by this file; they are kept only as reference notes. They create
# one mount point per NFS server under /mnt/mogilefs and mount each server's
# /var/mogdata export there (soft mount, short timeout, a single retransmit).
#
__END__
mkdir /mnt/mogilefs
mkdir /mnt/mogilefs/brad
mkdir /mnt/mogilefs/kenny
mkdir /mnt/mogilefs/cartman
mount -t nfs -o defaults,noatime,timeo=2,retrans=1,soft 10.1.0.10:/var/mogdata /mnt/mogilefs/brad
mount -t nfs -o defaults,noatime,timeo=2,retrans=1,soft 10.1.0.2:/var/mogdata /mnt/mogilefs/kenny
mount -t nfs -o defaults,noatime,timeo=2,retrans=1,soft 10.1.0.1:/var/mogdata /mnt/mogilefs/cartman

View File

@@ -0,0 +1,46 @@
DATABASE
mysql -umogile mogilefs
CLIENT
10.0.0.80:/usr/share/commons /mnt/tally/commons nfs defaults,noatime,timeo=1,retrans=1,soft 0 0
0.3 seconds for open to fail w/ above.
find /mnt/mogilefs/ -type f -exec rm {} \;
SERVER
apfel:/var# cat /etc/exports
# /etc/exports: the access control list for filesystems which may be exported
# to NFS clients. See exports(5).
/var/mogilefs 10.0.0.81(rw,sync,anonuid=1000,anongid=1000)
MogileA
/mnt/foo/disk1
/mnt/foo/disk2
MogileB
/mnt/bar/disk1
/mnt/bar/disk2
ClientA
/mnt/mogilefs/MA/disk1
/mnt/mogilefs/MA/disk2
/mnt/mogilefs/MB/disk1
/mnt/mogilefs/MB/disk2
ClientA's fstab:
10.0.0.A:/mnt/foo /mnt/mogilefs/MA
10.0.0.B:/mnt/bar /mnt/mogilefs/MB

View File

@@ -0,0 +1,102 @@
FB will want:
Give me new $fh for a given key. (which is for a domain)
Give me a path to a file to sendfile. (from a key)
Perl <-> mogilefs interface:
1) request new fid/path for a given key
< CREATE_OPEN domain=test&key=foo%27+sdf&class=<class_ident>
> ERR <errcode> <errstring>
nodb
foo
> OK <arg_length> fid=234&devid=342&path=/blah/foo/sdff
> ERR invalid_domain "Provided domain is not registered"
'key' may be optional, in which case an anonymous file (not in the
namespace) is returned. essentially a temp/spool file.
optional request key: 'multi_dest=1' signals that server should
reply not with keys "devid" and "path" but instead with:
dev_count=3
devid_1=10
devid_2=17
devid_3=26
path_1=http://host1:7500/dev10/...
path_2=http://host2:7500/dev17/...
path_3=http://host3:7500/dev26/...
.. that way if the server unknowingly gave the client a down storage node,
the client can immediately try somewhere else.
2) close fid (and the devid, path that was written to)
< CREATE_CLOSE domain=test&key=foo&fid=343&devid=34&path=/sdf/sf/sdf
-- fid and devid are totally opaque strings. server just uses
them to verify the path and key.
-- it's possible two clients did a CREATE_OPEN on same key, got
diff fids (123 and 124), then 123 closes first and loses, since
key is now owned by 124, not 123.
> ERR expiredfid URL_encode("Expired file: another connection created the same file between your open and close")
> ERR bogus_devid "Devid doesn't correspond to path"
> ERR invalid_domain "Provided domain is not registered"
3) give string path to a given domainid+key
< GET_PATHS domain=test&key=FOOBAR
> OK <arg_length> paths=<n>&path1=.....&path2=.....
(client should prefer them in order given)
4) delete a key
< DELETE domain=test&key=sdfsdfsdfsdf
> OK 0
5) get a list of domains and classes for them
< GET_DOMAINS
> OK domains=1&domain1=test&domain1classes=1&domain1class1name=alt&domain1class1mindevcount=2
-- this is a two level hash; there are N domains (taken from the
key 'domains') and their names are stored in 'domainX' where X is
in the range 1..N.
-- each domain has N classes (taken from 'domainXclasses');
each class has two keys in the form of 'domainXclassYname' and
'domainXclassYmindevcount' where Y is in the range of 1..N.
6) create a new domain on the fly
< CREATE_DOMAIN domain=mynewdomain
> OK domain=mynewdomain
-- returns the domain you just created as the only response value
7) create a new class
< CREATE_CLASS domain=mynewdomain&class=theclass&mindevcount=2
> OK domain=mynewdomain&class=theclass&mindevcount=2
-- creates a new class under the domain you specify named 'class' with
the minimum device replication count of 'mindevcount'
8) updates a class's minimum device replica count
< UPDATE_CLASS domain=mynewdomain&class=theclass&mindevcount=3
> OK domain=mynewdomain&class=theclass&mindevcount=3
-- same as create_class except it overwrites the mindevcount of the class
you are specifying. useful if you want to change the replica count
for a class.

View File

@@ -0,0 +1,13 @@
# Seed data: three hosts, five devices spread across them, and a single
# 'test' domain with one class. One statement per row, so a failure on
# one row does not affect the rows before it.

# hosts
INSERT INTO host (hostid, status, hostname, hostip, remoteroot)
    VALUES (1, 'alive', 'sammy', '10.0.0.81', '/var/mogilefs');
INSERT INTO host (hostid, status, hostname, hostip, remoteroot)
    VALUES (2, 'alive', 'tally', '10.0.0.80', '/var/mogilefs');
INSERT INTO host (hostid, status, hostname, hostip, remoteroot)
    VALUES (3, 'alive', 'tibook', '10.0.0.13', '/var/mogilefs');

# devices: two on host 1, two on host 2, one on host 3
INSERT INTO device (devid, hostid, status) VALUES (1, 1, 'alive');
INSERT INTO device (devid, hostid, status) VALUES (2, 1, 'alive');
INSERT INTO device (devid, hostid, status) VALUES (3, 2, 'alive');
INSERT INTO device (devid, hostid, status) VALUES (4, 2, 'alive');
INSERT INTO device (devid, hostid, status) VALUES (5, 3, 'alive');

# one domain with one class that asks for two copies of each file
INSERT INTO domain (dmid, namespace) VALUES (1, 'test');
INSERT INTO class (dmid, classid, classname, mindevcount)
    VALUES (1, 1, 'normal', 2);

View File

@@ -0,0 +1,12 @@
# Seed data: three hosts, four devices, and a single 'test' domain with
# one class. One statement per row, so a failure on one row does not
# affect the rows before it.

# hosts
INSERT INTO host (hostid, status, hostname, hostip, remoteroot)
    VALUES (1, 'alive', 'kenny', '10.1.0.2', '/var/mogdata');
INSERT INTO host (hostid, status, hostname, hostip, remoteroot)
    VALUES (2, 'alive', 'cartman', '10.1.0.1', '/var/mogdata');
INSERT INTO host (hostid, status, hostname, hostip, remoteroot)
    VALUES (3, 'alive', 'brad', '10.1.0.10', '/var/mogdata');

# devices: two on host 1, one each on hosts 2 and 3
INSERT INTO device (devid, hostid, status) VALUES (1, 1, 'alive');
INSERT INTO device (devid, hostid, status) VALUES (2, 1, 'alive');
INSERT INTO device (devid, hostid, status) VALUES (3, 2, 'alive');
INSERT INTO device (devid, hostid, status) VALUES (4, 3, 'alive');

# one domain with one class that asks for two copies of each file
INSERT INTO domain (dmid, namespace) VALUES (1, 'test');
INSERT INTO class (dmid, classid, classname, mindevcount)
    VALUES (1, 1, 'normal', 2);

View File

@@ -0,0 +1,8 @@
# Seed data for a single-machine setup: one host (localhost) carrying
# three devices, plus a 'test' domain with one class.

# host
INSERT INTO host (hostid, status, hostname, hostip, remoteroot)
    VALUES (1, 'alive', 'localhost', '127.0.0.1', '/var/mogilefs');

# devices, all on host 1
INSERT INTO device (devid, hostid, status) VALUES (1, 1, 'alive');
INSERT INTO device (devid, hostid, status) VALUES (2, 1, 'alive');
INSERT INTO device (devid, hostid, status) VALUES (3, 1, 'alive');

# one domain with one class that asks for two copies of each file
INSERT INTO domain (dmid, namespace) VALUES (1, 'test');
INSERT INTO class (dmid, classid, classname, mindevcount)
    VALUES (1, 1, 'normal', 2);

View File

@@ -0,0 +1,154 @@
# <www.danga.com>LiveJournal
# domain: one row per namespace. Any existing table is dropped first, so
# re-running this file discards the current rows.
DROP TABLE IF EXISTS domain;
CREATE TABLE domain (
    # columns
    dmid       SMALLINT UNSIGNED NOT NULL,   # numeric id of the domain
    namespace  VARCHAR(255),                 # LiveJournal:<www.livejournal.com>, FotoBilder:<www.picpix.com>

    # keys
    PRIMARY KEY (dmid),
    UNIQUE (namespace)                       # a namespace may be registered only once
);
# classes are tied to domains. domains can have classes of items
# with different mindevcounts.
#
# a minimum devcount is the number of copies the system tries to
# maintain for files in that class
#
# unspecified classname means classid=0 (implicit class), and that
# implies mindevcount=2
#
# class: per-domain file classes, each with its own minimum copy count.
# Any existing table is dropped first.
DROP TABLE IF EXISTS class;
CREATE TABLE class (
    # columns
    dmid         SMALLINT UNSIGNED NOT NULL,  # domain the class belongs to
    classid      TINYINT UNSIGNED NOT NULL,   # id of the class within its domain
    classname    VARCHAR(50),
    mindevcount  TINYINT UNSIGNED NOT NULL,   # minimum number of copies to maintain

    # keys
    PRIMARY KEY (dmid,classid),
    UNIQUE (dmid,classname)                   # class names are unique per domain
);
# the length field is only here for easy verifications of content
# integrity when copying around. no sums or content types or other
# metadata here. application can handle that.
#
# classid is what class of file this belongs to. for instance, on fotobilder
# there will be a class for original pictures (the ones the user uploaded)
# and a class for derived images (scaled down versions, thumbnails, greyscale, etc)
# each domain can setup classes and assign the minimum redundancy level for
# each class. fotobilder will use a 2 or 3 minimum copy redundancy for original
# photos and a 1 minimum for derived images (which means the sole device
# for a derived image can die, bringing devcount to 0 for that file, but
# the application can recreate it from its original)
# file: one row per stored file, keyed by fid and unique on (dmid, dkey).
# Any existing table is dropped first.
DROP TABLE IF EXISTS file;
CREATE TABLE file (
    # columns
    fid       INT UNSIGNED NOT NULL,
    dmid      SMALLINT UNSIGNED NOT NULL,
    dkey      VARCHAR(255),                # domain-defined. LJ: "<userid>-<blobtype>-<blobid>"
    length    INT UNSIGNED,                # 4GB limit
    classid   TINYINT UNSIGNED NOT NULL,
    devcount  TINYINT UNSIGNED NOT NULL,

    # keys
    PRIMARY KEY (fid),
    UNIQUE dkey (dmid, dkey),              # a key may exist only once per domain
    INDEX devcount (dmid,classid,devcount) # lookup by domain, class and copy count
);
# tempfile: fid is AUTO_INCREMENT here, so inserting a row hands out a
# new fid. Any existing table is dropped first.
DROP TABLE IF EXISTS tempfile;
CREATE TABLE tempfile (
    # columns
    fid         INT UNSIGNED NOT NULL AUTO_INCREMENT,
    createtime  INT UNSIGNED NOT NULL,      # presumably a unix timestamp -- TODO confirm
    classid     TINYINT UNSIGNED NOT NULL,
    dmid        SMALLINT UNSIGNED NOT NULL,
    dkey        VARCHAR(255),
    devids      VARCHAR(60),                # NOTE(review): looks like a list of device ids -- confirm format

    # keys
    PRIMARY KEY (fid)
);
# files marked for death when their key is overwritten. then they get a new
# fid, but since the old row (with the old fid) had to be deleted immediately,
# we need a place to store the fid so an async job can delete the file from
# all devices.
# file_to_delete: a bare list of fids, one row per fid.
# Any existing table is dropped first.
DROP TABLE IF EXISTS file_to_delete;
CREATE TABLE file_to_delete (
    # columns
    fid  INT UNSIGNED NOT NULL,

    # keys
    PRIMARY KEY (fid)
);
# if the replicator notices that a fid has no sources, that file gets inserted
# into the unreachable_fids table. it is up to the application to actually
# handle fids stored in this table.
# unreachable_fids: one row per fid, with an index on lastupdate so rows
# can be scanned by that column. Any existing table is dropped first.
DROP TABLE IF EXISTS unreachable_fids;
CREATE TABLE unreachable_fids (
    # columns
    fid         INT UNSIGNED NOT NULL,
    lastupdate  INT UNSIGNED NOT NULL,   # presumably a unix timestamp -- TODO confirm

    # keys
    PRIMARY KEY (fid),
    INDEX (lastupdate)
);
# what files are on what devices? (most likely physical devices,
# as logical devices of RAID arrays would be costly, and mogilefs
# already handles redundancy)
#
# the devid index lets us answer "What files were on this now-dead disk?"
#
# file_on: (fid, devid) pairs, at most one row per pair.
# Any existing table is dropped first.
DROP TABLE IF EXISTS file_on;
CREATE TABLE file_on (
    # columns
    fid    INT UNSIGNED NOT NULL,
    devid  MEDIUMINT UNSIGNED NOT NULL,

    # keys
    PRIMARY KEY (fid, devid),
    INDEX (devid)                        # find every fid on a given device
);
# if application or framework detects an error in one of the duplicate files
# for whatever reason, it can register its complaint and the framework
# will do some verifications and fix things up w/ an async job
# MAYBE: let application tell us the SHA1/MD5 of the file for us to check
# on the other devices?
# file_on_corrupt: (fid, devid) pairs, at most one row per pair.
# Same columns as file_on, but without the extra index on devid.
# Any existing table is dropped first.
DROP TABLE IF EXISTS file_on_corrupt;
CREATE TABLE file_on_corrupt (
    # columns
    fid    INT UNSIGNED NOT NULL,
    devid  MEDIUMINT UNSIGNED NOT NULL,

    # keys
    PRIMARY KEY (fid, devid)
);
# device: one row per storage device, each attached to a host via hostid.
# Any existing table is dropped first.
DROP TABLE IF EXISTS device;
CREATE TABLE device (
    # columns
    devid     MEDIUMINT UNSIGNED NOT NULL,
    hostid    MEDIUMINT UNSIGNED NOT NULL,
    status    ENUM('alive','dead','down'),  # nullable: no NOT NULL is declared
    mb_total  MEDIUMINT UNSIGNED,           # presumably capacity in megabytes -- TODO confirm
    mb_used   MEDIUMINT UNSIGNED,           # presumably usage in megabytes -- TODO confirm
    mb_asof   INT UNSIGNED,                 # presumably when the mb_* values were last refreshed -- TODO confirm

    # keys
    PRIMARY KEY (devid),
    INDEX (status)
);
# host: one row per storage host. hostname, hostip and altip must each be
# unique across hosts. Any existing table is dropped first.
DROP TABLE IF EXISTS host;
CREATE TABLE host (
    # columns
    hostid         MEDIUMINT UNSIGNED NOT NULL,
    status         ENUM('alive','dead','down'),    # nullable: no NOT NULL is declared
    http_port      MEDIUMINT UNSIGNED DEFAULT 7500,
    http_get_port  MEDIUMINT UNSIGNED,             # no default; NULL unless set
    hostname       VARCHAR(40),
    hostip         VARCHAR(15),                    # 15 chars fits a dotted-quad IPv4 address
    altip          VARCHAR(15),
    altmask        VARCHAR(18),
    remoteroot     VARCHAR(60),

    # keys
    PRIMARY KEY (hostid),
    UNIQUE (hostname),
    UNIQUE (hostip),
    UNIQUE (altip)
);

View File

@@ -0,0 +1,59 @@
mogile database:
* add UUID to device table
mogilefsd:
* when storing files, also store 00003443.meta with info about file
- this is so we can restore the database if it totally crashes
- can also store checksumming info in the .meta file?
* other jobs (replicator & delete, etc) can connect to frontend and do spud-like variable setting
* fix request pipelining problem (make it an option, though?)
* host dead cache:
- when any job notices a mogstored state change, send to parent
- parent rebroadcasts to kids, so they share info
* UDP between peers for jobs to cross-communicate
- in conf file, "peers: 10.x.y.z:6001, 10.a.b.c:6001"
* running fs verifier that makes sure UUIDs from mogstored match the database
- if a host fails verification, mark it as down
- configurable "verify time" (default 300 seconds)
- over 300 seconds, verify another device every N seconds (300/X = N, X = number of devices)
* rebalancer job (future)
- moves files around to devices with fewer files
* free space weighting
- every N seconds, using usage file, update database free space columns
- jobs update their in memory space cache every 60 seconds or so
- new files go to devices depending on free space (weighted algorithm)
- new job to update mb_* columns
- new column, mb_asof for date last updated
- email alerts when mb_asof is old
* checksum files (future)
* FUSE interface (future)
* require InnoDB backend
mogstored:
* support configuration files
* allow OPTIONS request
- include "Date: <gmt date>" header
* run usage file creator in different thread
* smartd dump just like usage file
* fsinfo dump
- include UUID of fs
- run in new thread (with usage?)
- mount -P to get fs types
mogadm:
* example commands:
- check
- <host> add disk <disk>
- <host> down disk <disk>
- <host> kill disk <disk>
- <host> down
- <host> up
moginit:
* create new mogilefs database
* perform database upgrades