osd/Filer.o\
osd/OSDMap.o\
osd/PG.o\
+ osd/ObjectStore.o\
ebofs.o\
common/Logger.o\
common/Clock.o\
- stability
- ebofs table.remove() thing
+ - fakestore crapping out.. missing timer events?
- mds
- vary log stripe size, count (on ebofs and fakestore)
? 4k for ebofs, 64k for fakestore
? scount=4... or 2?
+ -> striping mostly useless
- makedirs vs ebo/fake.
- streaming small writes (mds log)
-
-- mds scaling
- - shared /lib, /include .. verify fixed open() behavior works!
+ -> ebofs needs 200 clients per MDS to get close to fakestore (compensate for latency?)
- quick retest of crush vs linear
-- workloads
+- scaling (scale mds+osd+clients together)
+ - metadata
- makedirs
+** 100 c per mds is good on googoo ... testing on alc as mds_log.1
+
+ - creates in same dir
+ --syn makefiles 10000 10 0
+ - repeat creat+stat+unlink
+ - creates in private dir
+ - repeat creat+stat+unlink
+ - zillion opens of the same file
+ --syn only 0 --syn createshared 10 --syn openshared 10 1000
- - kernel compile?
+ - local compile
+ - shared compile (/lib, /include)
+ - data scaling
+ - writes to local files
+ - strided write to shared file
+ - strided write with O_SYNC
-- MDS mkdir needs EDirUpdate event
- ...or else we can get unlogged leftover dirty dirs
- obfs?
- osd write tests
- - get obfs working
+ - ebofs vs fakestore:
+ plot "log/osd/write_sizes.swap1.block/c" u 1:2 t "fs=ebofs", "log/osd/write_sizes.sdb2.ext3.fake/c" u 1:2 t "fs=fakestore.sdb2", "log/osd/write_sizes.sdb2.ext3.fake2/c" u 1:2 t "fs=fakestore.sdb2.again", "log/osd/write_sizes.sdb2.ebo/c" u 1:2 t "fs=ebofs.again"
+ - get obfs working?
- client buffer cache!
Dentry *dn = NULL;
if (dir->dentries.count(dname))
dn = dir->dentries[dname];
- dout(12) << "insert_inode_info " << dname << " ino " << hex << in_info->inode.ino << dec << " size " << in_info->inode.size << endl;
+ dout(12) << "insert_inode_info " << dname << " ino " << hex << in_info->inode.ino << dec << " size " << in_info->inode.size << " hashed " << in_info->hashed << endl;
if (dn) {
if (dn->inode->inode.ino == in_info->inode.ino) {
*(dn->inode->symlink) = in_info->symlink;
}
- // take note of latest distribution on mds's
+ // dir info
+ dn->inode->dir_auth = in_info->dir_auth;
+ dn->inode->dir_hashed = in_info->hashed;
+ dn->inode->dir_replicated = in_info->replicated;
+
+ // dir replication
if (in_info->spec_defined) {
- if (in_info->dist.empty() && !dn->inode->mds_contacts.empty()) {
- dout(9) << "lost dist spec for " << hex << dn->inode->inode.ino << dec << " " << in_info->dist << endl;
- }
- if (!in_info->dist.empty() && dn->inode->mds_contacts.empty()) {
- dout(9) << "got dist spec for " << hex << dn->inode->inode.ino << dec << " " << in_info->dist << endl;
- }
- dn->inode->mds_contacts = in_info->dist;
- dn->inode->mds_dir_auth = in_info->dir_auth;
+ if (in_info->dist.empty() && !dn->inode->dir_contacts.empty())
+ dout(9) << "lost dist spec for " << hex << dn->inode->inode.ino << dec
+ << " " << in_info->dist << endl;
+ if (!in_info->dist.empty() && dn->inode->dir_contacts.empty())
+ dout(9) << "got dist spec for " << hex << dn->inode->inode.ino << dec
+ << " " << in_info->dist << endl;
+ dn->inode->dir_contacts = in_info->dist;
}
return dn->inode;
Inode *cur = root;
time_t now = time(NULL);
- if (trace.empty()) return;
+ if (trace.empty()) {
+ return;
+ }
for (unsigned i=0; i<trace.size(); i++) {
if (i == 0) {
+ c_inode_info *in_info = trace[0];
+
if (!root) {
cur = root = new Inode();
- root->inode = trace[i]->inode;
- inode_map[root->inode.ino] = root;
+ root->inode = in_info->inode;
+ inode_map[root->inode.ino] = root;
}
- if (trace[i]->spec_defined) {
- root->mds_contacts = trace[i]->dist;
- root->mds_dir_auth = trace[i]->dir_auth;
- }
+
root->last_updated = now;
- dout(12) << "insert_trace trace " << i << " root" << endl;
+
+ root->dir_auth = in_info->dir_auth;
+ root->dir_hashed = in_info->hashed;
+ root->dir_replicated = in_info->replicated;
+ if (in_info->spec_defined)
+ root->dir_contacts = in_info->dist;
+
+ dout(12) << "insert_trace trace " << i << " root .. rep=" << root->dir_replicated << endl;
} else {
dout(12) << "insert_trace trace " << i << endl;
Dir *dir = cur->open_dir();
bool auth_best,
int use_mds) // this param is icky, debug weirdness!
{
- // send to what MDS? find deepest known prefix
- Inode *cur = root;
- for (unsigned i=0; i<req->get_filepath().depth(); i++) {
- if (cur && cur->inode.mode & INODE_MODE_DIR && cur->dir) {
- Dir *dir = cur->dir;
- if (dir->dentries.count( req->get_filepath()[i] ) == 0)
+ // find deepest known prefix
+ Inode *diri = root; // the deepest known containing dir
+ Inode *item = 0; // the actual item... if we know it
+ int missing_dn = -1; // which dn we miss on (if we miss)
+
+ unsigned depth = req->get_filepath().depth();
+ for (unsigned i=0; i<depth; i++) {
+ // dir?
+ if (diri && diri->inode.mode & INODE_MODE_DIR && diri->dir) {
+ Dir *dir = diri->dir;
+
+ // do we have the next dentry?
+ if (dir->dentries.count( req->get_filepath()[i] ) == 0) {
+ missing_dn = i; // no.
break;
+ }
- dout(7) << " have path seg " << i << " on " << cur->mds_dir_auth << " ino " << hex << cur->inode.ino << dec << " " << req->get_filepath()[i] << endl;
- cur = dir->dentries[ req->get_filepath()[i] ]->inode;
- assert(cur);
- } else
+ dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << hex << diri->inode.ino << dec << " " << req->get_filepath()[i] << endl;
+
+ if (i == depth-1) { // last one!
+ item = dir->dentries[ req->get_filepath()[i] ]->inode;
+ break;
+ }
+
+ // continue..
+ diri = dir->dentries[ req->get_filepath()[i] ]->inode;
+ assert(diri);
+ } else {
+ missing_dn = i;
break;
+ }
}
-
+
+ // choose an mds
int mds = 0;
- if (cur) {
- if (!auth_best && cur->get_replicas().size()) {
- // try replica(s)
- dout(9) << "contacting replica from deepest inode " << hex << cur->inode.ino << dec << " " << req->get_filepath() << ": " << cur->get_replicas() << endl;
- set<int>::iterator it = cur->get_replicas().begin();
- if (cur->get_replicas().size() == 1)
- mds = *it;
- else {
- int r = rand() % cur->get_replicas().size();
- while (r--) it++;
- mds = *it;
+ if (diri) {
+ if (auth_best) {
+ // pick the actual auth (as best we can)
+ if (item) {
+ mds = item->authority(mdcluster);
+ } else if (diri->dir_hashed) {
+ mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(),
+ mdcluster);
+ } else {
+ mds = diri->authority(mdcluster);
}
} else {
- // try auth
- mds = cur->authority();
- //if (!auth_best && req->get_filepath().get_path()[0] == '/')
- dout(9) << "contacting auth mds " << mds << " auth_best " << auth_best << " for " << req->get_filepath() << endl;
+ // balance our traffic!
+ if (diri->dir_hashed)
+ mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(),
+ mdcluster);
+ else
+ mds = diri->pick_replica(mdcluster);
}
- } else {
- dout(9) << "i have no idea where " << req->get_filepath() << " is" << endl;
}
// force use of a particular mds?
for (map<int,InodeCap>::iterator p = in->caps.begin();
p != in->caps.end();
p++)
- dout(0) << " cap " << p->first << " "
+ dout(20) << " left cap " << p->first << " "
<< cap_string(p->second.caps) << " "
<< p->second.seq << endl;
for (map<int,InodeCap>::iterator p = in->stale_caps.begin();
p != in->stale_caps.end();
p++)
- dout(0) << " stale cap " << p->first << " "
+ dout(20) << " left stale cap " << p->first << " "
<< cap_string(p->second.caps) << " "
<< p->second.seq << endl;
client_lock.Lock();
assert(reply);
+ // mdcluster!
+ mdcluster = new MDCluster(g_conf.num_mds, g_conf.num_osd); // FIXME this is stoopid
+
// we got osdmap!
osdmap->decode(reply->get_osd_map_state());
#ifndef __CLIENT_H
#define __CLIENT_H
-//#include "mds/MDCluster.h"
+#include "mds/MDCluster.h"
#include "osd/OSDMap.h"
#include "msg/Message.h"
void put() { assert(ref == 1); ref--; lru_unpin(); }
Dentry() : dir(0), inode(0), ref(0) { }
+
/*Dentry() : name(0), dir(0), inode(0), ref(0) { }
Dentry(string& n) : name(0), dir(0), inode(0), ref(0) {
name = new char[n.length()+1];
class Inode {
public:
inode_t inode; // the actual inode
- int mds_dir_auth;
- set<int> mds_contacts;
time_t last_updated;
+ // about the dir (if this is one!)
+ int dir_auth;
+ set<int> dir_contacts;
+ bool dir_hashed, dir_replicated;
+
// per-mds caps
map<int,InodeCap> caps; // mds -> InodeCap
map<int,InodeCap> stale_caps; // mds -> cap .. stale
//cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl;
}
- Inode() : mds_dir_auth(-1), last_updated(0),
+ Inode() :
+ last_updated(0),
+ dir_auth(-1), dir_hashed(false), dir_replicated(false),
file_wr_mtime(0), file_wr_size(0), num_rd(0), num_wr(0),
ref(0), dir(0), dn(0), symlink(0) { }
~Inode() {
return w;
}
- int authority() {
- // my info valid?
- if (mds_dir_auth >= 0)
- return mds_dir_auth;
-
- // otherwise try parent
- if (dn && dn->dir && dn->dir->parent_inode)
- return dn->dir->parent_inode->authority();
+ int authority(MDCluster *mdcluster) {
+ // parent?
+ if (dn && dn->dir && dn->dir->parent_inode) {
+ // parent hashed?
+ if (dn->dir->parent_inode->dir_hashed) {
+ // hashed
+ return mdcluster->hash_dentry( dn->dir->parent_inode->ino(),
+ dn->name );
+ }
+
+ if (dir_auth >= 0)
+ return dir_auth;
+ else
+ return dn->dir->parent_inode->authority(mdcluster);
+ }
+
+ if (dir_auth >= 0)
+ return dir_auth;
- return 0; // who knows!
+ assert(0); // !!!
+ return 0;
}
- set<int>& get_replicas() {
- if (mds_contacts.size())
- return mds_contacts;
- if (is_dir()) {
- return mds_contacts;
- }
- if (dn && dn->dir && dn->dir->parent_inode) {
- return dn->dir->parent_inode->get_replicas();
+ int dentry_authority(const char *dn,
+ MDCluster *mdcluster) {
+ return mdcluster->hash_dentry( ino(),
+ dn );
+ }
+ int pick_replica(MDCluster *mdcluster) {
+ // replicas?
+ if (dir_contacts.size()) {
+ set<int>::iterator it = dir_contacts.begin();
+ if (dir_contacts.size() == 1)
+ return *it;
+ else {
+ int r = rand() % dir_contacts.size();
+ while (r--) it++;
+ return *it;
+ }
}
- return mds_contacts;
+
+ if (dir_replicated)
+ return rand() % mdcluster->get_num_mds(); // huh.. pick a random mds!
+ else
+ return authority(mdcluster);
}
-
+
// open Dir for an inode. if it's not open, allocated it (and pin dentry in memory).
Dir *open_dir() {
int whoami;
// cluster descriptors
- //MDCluster *mdcluster;
+ MDCluster *mdcluster;
OSDMap *osdmap;
bool mounted;
for (unsigned i=0; i<args.size(); i++) {
if (strcmp(args[i],"--syn") == 0) {
++i;
+
if (strcmp(args[i],"writefile") == 0) {
syn_modes.push_back( SYNCLIENT_MODE_WRITEFILE );
syn_iargs.push_back( atoi(args[++i]) );
syn_modes.push_back( SYNCLIENT_MODE_READFILE );
syn_iargs.push_back( a );
syn_iargs.push_back( b );
+
} else if (strcmp(args[i],"makedirs") == 0) {
syn_modes.push_back( SYNCLIENT_MODE_MAKEDIRS );
syn_iargs.push_back( atoi(args[++i]) );
syn_iargs.push_back( atoi(args[++i]) );
syn_iargs.push_back( atoi(args[++i]) );
+ } else if (strcmp(args[i],"makefiles") == 0) {
+ syn_modes.push_back( SYNCLIENT_MODE_MAKEFILES );
+ syn_iargs.push_back( atoi(args[++i]) );
+ syn_iargs.push_back( atoi(args[++i]) );
+ syn_iargs.push_back( atoi(args[++i]) );
+ } else if (strcmp(args[i],"createshared") == 0) {
+ syn_modes.push_back( SYNCLIENT_MODE_CREATESHARED );
+ syn_iargs.push_back( atoi(args[++i]) );
+ } else if (strcmp(args[i],"openshared") == 0) {
+ syn_modes.push_back( SYNCLIENT_MODE_OPENSHARED );
+ syn_iargs.push_back( atoi(args[++i]) );
+ syn_iargs.push_back( atoi(args[++i]) );
+
} else if (strcmp(args[i],"fullwalk") == 0) {
syn_modes.push_back( SYNCLIENT_MODE_FULLWALK );
//syn_sargs.push_back( atoi(args[++i]) );
} else if (strcmp(args[i],"randomwalk") == 0) {
syn_modes.push_back( SYNCLIENT_MODE_RANDOMWALK );
syn_iargs.push_back( atoi(args[++i]) );
+
} else if (strcmp(args[i],"trace") == 0) {
syn_modes.push_back( SYNCLIENT_MODE_TRACE );
syn_sargs.push_back( args[++i] );
} else if (strcmp(args[i],"opentest") == 0) {
syn_modes.push_back( SYNCLIENT_MODE_OPENTEST );
syn_iargs.push_back( atoi(args[++i]) );
+
} else {
cerr << "unknown syn mode " << args[i] << endl;
assert(0);
}
}
break;
+ case SYNCLIENT_MODE_MAKEFILES:
+ {
+ int num = iargs.front(); iargs.pop_front();
+ int count = iargs.front(); iargs.pop_front();
+ int priv = iargs.front(); iargs.pop_front();
+ if (run_me()) {
+ dout(2) << "makefiles " << num << " " << count << " " << priv << endl;
+ make_files(num, count, priv, false);
+ }
+ }
+ break;
+ case SYNCLIENT_MODE_MAKEFILES2:
+ {
+ int num = iargs.front(); iargs.pop_front();
+ int count = iargs.front(); iargs.pop_front();
+ int priv = iargs.front(); iargs.pop_front();
+ if (run_me()) {
+ dout(2) << "makefiles2 " << num << " " << count << " " << priv << endl;
+ make_files(num, count, priv, true);
+ }
+ }
+ break;
+ case SYNCLIENT_MODE_CREATESHARED:
+ {
+ string sarg1 = get_sarg(0);
+ int num = iargs.front(); iargs.pop_front();
+ if (run_me()) {
+ dout(2) << "createshared " << num << endl;
+ create_shared(num);
+ }
+ }
+ break;
+ case SYNCLIENT_MODE_OPENSHARED:
+ {
+ string sarg1 = get_sarg(0);
+ int num = iargs.front(); iargs.pop_front();
+ int count = iargs.front(); iargs.pop_front();
+ if (run_me()) {
+ dout(2) << "openshared " << num << endl;
+ open_shared(num, count);
+ }
+ }
+ break;
case SYNCLIENT_MODE_FULLWALK:
{
}
+int SyntheticClient::make_files(int num, int count, int priv, bool more)
+{
+ int whoami = client->get_nodeid();
+ char d[255];
+
+ if (priv) {
+ for (int c=0; c<count; c++) {
+ sprintf(d,"dir.%d.run%d", whoami, c);
+ client->mkdir(d, 0755);
+ }
+ } else {
+ // shared
+ if (whoami == 0) {
+ for (int c=0; c<count; c++) {
+ sprintf(d,"dir.%d.run%d", 0, c);
+ client->mkdir(d, 0755);
+ }
+ } else {
+ sleep(5);
+ }
+ }
+
+ // files
+ struct stat st;
+ for (int c=0; c<count; c++) {
+ for (int n=0; n<num; n++) {
+ sprintf(d,"dir.%d.run%d/file.client%d.%d", priv ? whoami:0, c, whoami, n);
+
+ client->mknod(d, 0644);
+
+ if (more) {
+ client->lstat(d, &st);
+ int fd = client->open(d, O_RDONLY);
+ client->unlink(d);
+ client->close(fd);
+ }
+
+ if (time_to_stop()) return 0;
+ }
+ }
+
+ return 0;
+}
+
+
+int SyntheticClient::create_shared(int num)
+{
+ // files
+ char d[255];
+ for (int n=0; n<num; n++) {
+ sprintf(d,"file.%d", n);
+ client->mknod(d, 0644);
+ }
+
+ return 0;
+}
+
+int SyntheticClient::open_shared(int num, int count)
+{
+ // files
+ char d[255];
+ for (int c=0; c<count; c++) {
+ // open
+ list<int> fds;
+ for (int n=0; n<num; n++) {
+ sprintf(d,"file.%d", n);
+ int fd = client->open(d,O_RDONLY);
+ fds.push_back(fd);
+ }
+
+ while (!fds.empty()) {
+ int fd = fds.front();
+ fds.pop_front();
+ client->close(fd);
+ }
+ }
+
+ return 0;
+}
+
int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is in MB, wrsize in bytes
{
#define SYNCLIENT_MODE_RANDOMWALK 1
#define SYNCLIENT_MODE_FULLWALK 2
-#define SYNCLIENT_MODE_MAKEDIRS 3
-#define SYNCLIENT_MODE_WRITEFILE 4
-#define SYNCLIENT_MODE_READFILE 5
#define SYNCLIENT_MODE_REPEATWALK 7
-#define SYNCLIENT_MODE_WRITEBATCH 8
-#define SYNCLIENT_MODE_TRACE 20
+#define SYNCLIENT_MODE_MAKEDIRS 10 // dirs files depth
+#define SYNCLIENT_MODE_MAKEFILES 11 // num count private
+#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private
+#define SYNCLIENT_MODE_CREATESHARED 13 // num
+#define SYNCLIENT_MODE_OPENSHARED 14 // num count
-#define SYNCLIENT_MODE_OPENTEST 30
+#define SYNCLIENT_MODE_WRITEFILE 20
+#define SYNCLIENT_MODE_READFILE 21
+#define SYNCLIENT_MODE_WRITEBATCH 22
+
+#define SYNCLIENT_MODE_TRACE 30
+
+#define SYNCLIENT_MODE_OPENTEST 40
#define SYNCLIENT_MODE_ONLY 50
#define SYNCLIENT_MODE_UNTIL 51
int full_walk(string& fromdir);
int random_walk(int n);
+
int make_dirs(const char *basedir, int dirs, int files, int depth);
+ int make_files(int num, int count, int priv, bool more);
+
+ int create_shared(int num);
+ int open_shared(int num, int count);
+
int write_file(string& fn, int mb, int chunk);
int write_batch(int nfile, int mb, int chunk);
int read_file(string& fn, int mb, int chunk);
mds_bal_replicate_threshold: 8000,
mds_bal_unreplicate_threshold: 1000,
- //mds_bal_hash_threshold: 20000,
- //mds_bal_unhash_threshold: 2000,
+ mds_bal_hash_threshold: 2000,
+ mds_bal_unhash_threshold: 250,
mds_bal_interval: 30, // seconds
+ mds_bal_hash_interval: 5, // seconds
mds_bal_idle_threshold: .1,
mds_bal_max: -1,
mds_bal_max_until: -1,
osd_max_opq: 10,
osd_mkfs: false,
osd_fake_sync: false,
+ osd_age: 0.0,
// --- fakestore ---
fakestore_fake_sync: 2, // 2 seconds
else if (strcmp(args[i], "--osd_mkfs") == 0)
g_conf.osd_mkfs = atoi(args[++i]);
+ else if (strcmp(args[i], "--osd_age") == 0)
+ g_conf.osd_age = atof(args[++i]);
else if (strcmp(args[i], "--osd_pg_bits") == 0)
g_conf.osd_pg_bits = atoi(args[++i]);
else if (strcmp(args[i], "--osd_max_rep") == 0)
float mds_bal_replicate_threshold;
float mds_bal_unreplicate_threshold;
+ float mds_bal_hash_threshold;
+ float mds_bal_unhash_threshold;
int mds_bal_interval;
+ int mds_bal_hash_interval;
float mds_bal_idle_threshold;
int mds_bal_max;
int mds_bal_max_until;
int osd_max_opq;
bool osd_mkfs;
bool osd_fake_sync;
+ float osd_age;
int fakestore_fake_sync;
{
ebofs_lock.Lock();
if (!dirty) {
- dout(3) << "sync in " << super_epoch << ", not dirty" << endl;
+ dout(0) << "sync in " << super_epoch << ", not dirty" << endl;
} else {
- dout(3) << "sync in " << super_epoch << endl;
+ dout(0) << "sync in " << super_epoch << endl;
if (!commit_thread_started) {
- dout(10) << "sync waiting for commit thread to start" << endl;
+ dout(0) << "sync waiting for commit thread to start" << endl;
sync_cond.Wait(ebofs_lock);
}
if (mid_commit) {
- dout(10) << "sync waiting for commit in progress" << endl;
+ dout(0) << "sync waiting for commit in progress" << endl;
sync_cond.Wait(ebofs_lock);
}
sync_cond.Wait(ebofs_lock); // wait
- dout(3) << "sync finish in " << super_epoch << endl;
+ dout(0) << "sync finish in " << super_epoch << endl;
}
ebofs_lock.Unlock();
}
buf->f_type = EBOFS_MAGIC; /* type of filesystem */
buf->f_bsize = 4096; /* optimal transfer block size */
buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */
- buf->f_bfree = free_blocks; /* free blocks in fs */
- buf->f_bavail = free_blocks; /* free blocks avail to non-superuser */
+ buf->f_bfree = free_blocks + limbo_blocks; /* free blocks in fs */
+ buf->f_bavail = free_blocks + limbo_blocks; /* free blocks avail to non-superuser */
buf->f_files = nodepool.num_total(); /* total file nodes in file system */
buf->f_ffree = nodepool.num_free(); /* free file nodes in fs */
//buf->f_fsid = 0; /* file system id */
#define MDS_POP_NESTED 1 // me + children, auth or not
#define MDS_POP_CURDOM 2 // me + children in current domain
#define MDS_POP_ANYDOM 3 // me + children in any (nested) domain
-#define MDS_NPOP 4
+#define MDS_POP_DIRMOD 4 // just this dir, modifications only
+#define MDS_NPOP 5
class mds_load_t {
public:
if (dir.is_dirty()) out << " dirty";
if (dir.is_import()) out << " import";
if (dir.is_export()) out << " export";
- if (dir.is_hashed()) out << " hashed";
+ if (dir.is_hashed()) out << " hashed"; //=" << (int)dir.get_inode()->inode.hash_seed;
if (dir.is_auth()) {
out << " auth";
if (dir.is_open_by_anyone())
assert(in->is_dir());
if (auth)
state |= CDIR_STATE_AUTH;
+ /*
if (in->dir_is_hashed()) {
assert(0); // when does this happen?
state |= CDIR_STATE_HASHED;
}
-
+ */
+
auth_pins = 0;
nested_auth_pins = 0;
request_pins = 0;
if (( popularity[MDS_POP_CURDOM].get() > g_conf.mds_bal_replicate_threshold)) {
//if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl;
ls = open_by;
+ if (!ls.empty()) ls.insert(auth);
}
}
CDir *get_or_open_dir(MDS *mds);
CDir *set_dir(CDir *newdir);
- bool dir_is_hashed() {
- if (inode.hash_seed) return true;
- return false;
- }
bool dir_is_auth();
+void MDBalancer::do_hashing()
+{
+ if (hash_queue.empty()) {
+ dout(20) << "do_hashing has nothing to do" << endl;
+ return;
+ }
+
+ dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl;
+
+ for (set<inodeno_t>::iterator i = hash_queue.begin();
+ i != hash_queue.end();
+ i++) {
+ inodeno_t dirino = *i;
+ CInode *in = mds->mdcache->get_inode(dirino);
+ if (!in) continue;
+ CDir *dir = in->dir;
+ if (!dir) continue;
+ if (!dir->is_auth()) continue;
+
+ dout(0) << "do_hashing hashing " << *dir << endl;
+ mds->mdcache->hash_dir(dir);
+ }
+ hash_queue.clear();
+}
+
+
+
void MDBalancer::do_rebalance(int beat)
{
int cluster_size = mds->get_cluster()->get_num_mds();
}
-void MDBalancer::hit_dir(CDir *dir)
+void MDBalancer::hit_dir(CDir *dir, bool modify)
{
// hit me
dir->popularity[MDS_POP_JUSTME].hit();
-
- hit_recursive(dir);
+ // hit modify counter, if this was a modify
+ if (modify) {
+ float p = dir->popularity[MDS_POP_DIRMOD].hit();
+
+ if (dir->is_auth()) {
+ // hash this dir? (later?)
+ if (p > g_conf.mds_bal_hash_threshold &&
+ !(dir->is_hashed() || dir->is_hashing()) &&
+ hash_queue.count(dir->ino()) == 0) {
+ dout(0) << "hit_dir DIRMOD pop is " << p << ", putting in hash_queue: " << *dir << endl;
+ hash_queue.insert(dir->ino());
+ }
+ }
+ }
+
+ hit_recursive(dir);
}
int beat_epoch;
+ // todo
+ set<inodeno_t> hash_queue;
+
// per-epoch scatter/gathered info
hash_map<int, mds_load_t> mds_load;
map<int, map<int, float> > mds_import_map;
void send_heartbeat();
void handle_heartbeat(MHeartbeat *m);
+ void do_hashing();
+
void export_empties();
void do_rebalance(int beat);
void find_exports(CDir *dir,
void add_import(class CDir *im);
void hit_inode(class CInode *in);
- void hit_dir(class CDir *dir);
+ void hit_dir(class CDir *dir, bool modify=false);
void hit_recursive(class CDir *dir);
if (!in->is_dir()) continue;
if (!in->dir) continue;
- int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first );
+ int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first );
if (dentryhashcode == mds->get_nodeid()) continue;
// msg?
}
// pick a hash seed.
- dir->inode->inode.hash_seed = dir->ino();
+ dir->inode->inode.hash_seed = 1;//dir->ino();
// suck up all waiters
C_Contexts *fin = new C_Contexts;
CDentry *dn = it->second;
CInode *in = dn->inode;
- int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first );
+ int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first );
if (dentryhashcode == mds->get_nodeid()) {
continue; // still mine!
}
dout(7) << "got notifies too, all done" << endl;
hash_dir_finish(dir);
} else {
- dout(7) << "waiting on notifies" << endl;
+ dout(7) << "waiting on notifies " << endl;
}
} else {
if (!in) continue;
if (!in->dir) continue;
- int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first );
+ int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first );
if (dentryhashcode != from) continue; // we'll import these in a minute
if (in->dir->authority() != dentryhashcode)
assert(dir->hashed_subset.empty());
// inode state
- dir->inode->inode.hash_seed = dir->ino();
+ dir->inode->inode.hash_seed = 1;// dir->ino();
if (dir->inode->is_auth())
dir->inode->mark_dirty();
if (!in->is_dir()) continue;
if (!in->dir) continue;
- int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first );
+ int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first );
if (dentryhashcode != mds->get_nodeid()) continue;
// msg?
CDir *dir = in->dir;
assert(dir);
- dout(7) << "handle_unhash_dir " << *dir << endl;
+ dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl;
assert(dir->is_hashed());
assert(dir->is_unhashing());
assert(!dir->is_auth());
CDentry *dn = it->second;
CInode *in = dn->inode;
- int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first );
+ int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first );
if (dentryhashcode != mds->get_nodeid()) {
// not mine!
// twiddle dir_auth?
r %= num_mds;
- dout(22) << "hash_dentry(" << dirino << ", " << dn << ") -> " << r << endl;
+ dout(30) << "hash_dentry(" << dirino << ", " << dn << ") -> " << r << endl;
return r;
}
mds_paused = false;
stat_ops = 0;
- last_balancer_heartbeat = g_clock.recent_now();
+ last_balancer_hash = last_balancer_heartbeat = g_clock.recent_now();
// log
string name;
balancer->send_heartbeat();
num_bal_times--;
}
+
+ // hash?
+ if (true &&
+ now.sec() - last_balancer_hash.sec() > g_conf.mds_bal_hash_interval) {
+ last_balancer_hash = now;
+ balancer->do_hashing();
+ }
+
// HACK to test hashing stuff
// hashed?
if (dir->is_hashed() &&
- whoami != get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first ))
+ whoami != get_cluster()->hash_dentry( dir->ino(), it->first ))
continue;
// is dentry readable?
else
dir->link_inode(dn, newi);
+ // bump modify pop
+ balancer->hit_dir(dir, true);
+
// mark dirty
dn->mark_dirty();
newi->mark_dirty();
protected:
__uint64_t stat_ops;
- utime_t last_balancer_heartbeat;
+ utime_t last_balancer_heartbeat, last_balancer_hash;
public:
MDS(MDCluster *mdc, int whoami, Messenger *m);
{
if (hashcode >= 0) {
assert(dir->is_hashed());
- dout(11) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl;
+ dout(10) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl;
} else {
assert(dir->is_auth());
- dout(11) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl;
+ dout(10) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl;
}
// get continuation ready
CDir *dir = in->dir;
if (!dir) return true;
- dout(0) << "EDirUpdate v " << version << " on dir " << *dir << endl;
+ dout(10) << "EDirUpdate v " << version << " on dir " << *dir << endl;
if (!dir->is_auth()) return true; // not mine!
if (dir->is_frozen()) return true; // frozen -> exporting -> obsolete? FIXME
CDir *dir = in->dir;
assert(dir);
- dout(0) << "EDirUpdate committing dir " << *dir << endl;
+ dout(10) << "EDirUpdate committing dir " << *dir << endl;
mds->mdstore->commit_dir(dir, c);
}
bool inode_file_valid; // true if inode info is valid (ie was readable on mds at the time)
bool inode_hard_valid; // true if inode info is valid (ie was readable on mds at the time)
- bool spec_defined;
int dir_auth;
+ bool hashed, replicated;
+
+ bool spec_defined;
set<int> dist; // where am i replicated?
this->ref_dn = ref_dn;
// replicated where?
- spec_defined = in->dir && in->dir->is_auth();
- if (spec_defined) {
- dir_auth = in->dir->get_dir_auth();
+ if (in->dir && in->dir->is_auth()) {
+ spec_defined = true;
in->dir->get_dist_spec(this->dist, whoami);
- }
+ } else
+ spec_defined = false;
+
+ // dir info
+ dir_auth = (in->dir && in->dir->get_dir_auth());
+ hashed = (in->dir && in->dir->is_hashed()); // FIXME not quite right.
+ replicated = (in->dir && in->dir->is_rep());
}
void _encode(bufferlist &bl) {
bl.append((char*)&inode_hard_valid, sizeof(inode_hard_valid));
bl.append((char*)&spec_defined, sizeof(spec_defined));
bl.append((char*)&dir_auth, sizeof(dir_auth));
+ bl.append((char*)&hashed, sizeof(hashed));
+ bl.append((char*)&replicated, sizeof(replicated));
::_encode(ref_dn, bl);
::_encode(symlink, bl);
off += sizeof(spec_defined);
bl.copy(off, sizeof(dir_auth), (char*)&dir_auth);
off += sizeof(dir_auth);
+ bl.copy(off, sizeof(hashed), (char*)&hashed);
+ off += sizeof(hashed);
+ bl.copy(off, sizeof(replicated), (char*)&replicated);
+ off += sizeof(replicated);
::_decode(ref_dn, bl, off);
::_decode(symlink, bl, off);
dout(2) << "mkfs" << endl;
store->mkfs();
+
}
int r = store->mount();
+ if (g_conf.osd_age > 0.0)
+ store->age(g_conf.osd_age, g_conf.osd_age / 2.0, 2, g_conf.osd_age);
+
+
monitor->init();
osd_lock.Unlock();
--- /dev/null
+
+#include "ObjectStore.h"
+
+#include "config.h"
+
+
+object_t ObjectStore::age_get_oid() {
+ if (!age_free_oids.empty()) {
+ object_t o = age_free_oids.front();
+ age_free_oids.pop_front();
+ return o;
+ }
+ return age_cur_oid++;
+ }
+
+ ssize_t ObjectStore::age_pick_size() {
+ ssize_t max = file_size_distn.sample() * 1024;
+ return max/2 + (rand() % 100) * max/200 + 1;
+ }
+
+ void ObjectStore::age_fill(float pc) {
+ static char buf[1024*1024];
+ bufferlist bl;
+ bl.push_back(new buffer(buf, 1024*1024));
+ while (1) {
+ struct statfs st;
+ statfs(&st);
+ float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;
+ if (a >= pc) {
+ dout(10) << "age_fill at " << a << " / " << pc << " stopping" << endl;
+ break;
+ }
+
+ object_t oid = age_get_oid();
+
+ int b = rand() % 10;
+ age_objects[b].push_back(oid);
+
+ ssize_t s = age_pick_size();
+
+ dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl;
+
+ off_t off = 0;
+ while (s) {
+ ssize_t t = MIN(s, 1024*1024);
+ write(oid, t, off, bl, false);
+ off += t;
+ s -= t;
+ }
+ oid++;
+ }
+ }
+
+ void ObjectStore::age_empty(float pc) {
+ int nper = 20;
+ int n = nper;
+ while (1) {
+ struct statfs st;
+ statfs(&st);
+ float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;
+ if (a <= pc) {
+ dout(10) << "age_empty at " << a << " / " << pc << " stopping" << endl;
+ break;
+ }
+
+ int b = rand() % 10;
+ n--;
+ if (n == 0 || age_objects[b].empty()) {
+ dout(10) << "age_empty sync" << endl;
+ //sync();
+ sync();
+ n = nper;
+ continue;
+ }
+ object_t oid = age_objects[b].front();
+ age_objects[b].pop_front();
+
+ dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << endl;
+
+ remove(oid);
+ age_free_oids.push_back(oid);
+ }
+ }
+
+
+ void ObjectStore::age(float high_water, // fill to this %
+ float low_water, // then empty to this %
+ int count, // this many times
+ float final_water, // and end here ( <= low_water)
+ int fake_size_mb) {
+ while (age_objects.size() < 10) age_objects.push_back( list<object_t>() );
+
+ if (fake_size_mb) {
+ int fake_bl = fake_size_mb * 256;
+ struct statfs st;
+ statfs(&st);
+ float f = (float)fake_bl / (float)st.f_blocks;
+ high_water = (float)high_water * f;
+ low_water = (float)low_water * f;
+ final_water = (float)final_water * f;
+ dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl;
+ }
+
+ // init size distn (once)
+ if (!did_distn) {
+ did_distn = true;
+ age_cur_oid = 1;
+ file_size_distn.add(1, 19.0758125+0.65434375);
+ file_size_distn.add(512, 35.6566);
+ file_size_distn.add(1024, 27.7271875);
+ file_size_distn.add(2*1024, 16.63503125);
+ //file_size_distn.add(4*1024, 106.82384375);
+ //file_size_distn.add(8*1024, 81.493375);
+ //file_size_distn.add(16*1024, 14.13553125);
+ //file_size_distn.add(32*1024, 2.176);
+ //file_size_distn.add(256*1024, 0.655938);
+ //file_size_distn.add(512*1024, 0.1480625);
+ //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit
+ file_size_distn.normalize();
+ }
+
+ // clear
+ for (int i=0; i<10; i++)
+ age_objects[i].clear();
+
+ for (int c=1; c<=count; c++) {
+ dout(1) << "age " << c << "/" << count << " filling to " << high_water << endl;
+ age_fill(high_water);
+ if (c == count) {
+ dout(1) << "age final empty to " << final_water << endl;
+ age_empty(final_water);
+ } else {
+ dout(1) << "age " << c << "/" << count << " emptying to " << low_water << endl;
+ age_empty(low_water);
+ }
+ }
+ dout(1) << "age finished" << endl;
+ }
+
#include "include/Context.h"
#include "include/bufferlist.h"
+#include "include/Distribution.h"
+
#include <sys/stat.h>
+#include <sys/vfs.h> /* or <sys/statfs.h> */
#include <list>
using namespace std;
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a):(b))
+#endif
+
/*
* low-level interface to the local OSD file system
*/
class ObjectStore {
+private:
+ list<object_t> age_free_oids;
+ object_t age_cur_oid;
+ vector< list<object_t> > age_objects;
+ Distribution file_size_distn; //kb
+ bool did_distn;
+
public:
+ ObjectStore() : did_distn(false) {}
virtual ~ObjectStore() {}
// mgmt
+ // age store
+private:
+ void age_empty(float pc);
+ void age_fill(float pc);
+ ssize_t age_pick_size();
+ object_t age_get_oid();
+
+public:
+ void age(float high_water, // fill to this %
+ float low_water, // then empty to this %
+ int count, // this many times
+ float final_water, // and end here ( <= low_water)
+ int fake_size_mb=0);
+
};
#endif