From 6965c53c7de14029c9887298a3344fd3e60859fa Mon Sep 17 00:00:00 2001 From: sage Date: Fri, 24 Feb 2006 06:30:01 +0000 Subject: [PATCH] better traffic control; objectstore aging git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@672 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/Makefile | 1 + ceph/TODO | 31 ++++++-- ceph/client/Client.cc | 129 +++++++++++++++++++----------- ceph/client/Client.h | 77 ++++++++++++------ ceph/client/SyntheticClient.cc | 140 +++++++++++++++++++++++++++++++++ ceph/client/SyntheticClient.h | 24 ++++-- ceph/config.cc | 8 +- ceph/config.h | 4 + ceph/ebofs/Ebofs.cc | 14 ++-- ceph/include/types.h | 3 +- ceph/mds/CDir.cc | 6 +- ceph/mds/CDir.h | 1 + ceph/mds/CInode.h | 4 - ceph/mds/MDBalancer.cc | 47 ++++++++++- ceph/mds/MDBalancer.h | 7 +- ceph/mds/MDCache.cc | 18 ++--- ceph/mds/MDCluster.cc | 2 +- ceph/mds/MDS.cc | 15 +++- ceph/mds/MDS.h | 2 +- ceph/mds/MDStore.cc | 4 +- ceph/mds/events/EDirUpdate.h | 4 +- ceph/messages/MClientReply.h | 23 ++++-- ceph/osd/OSD.cc | 5 ++ ceph/osd/ObjectStore.cc | 139 ++++++++++++++++++++++++++++++++ ceph/osd/ObjectStore.h | 29 +++++++ 25 files changed, 609 insertions(+), 128 deletions(-) create mode 100644 ceph/osd/ObjectStore.cc diff --git a/ceph/Makefile b/ceph/Makefile index 688fc37ece75a..8f55f9af22961 100644 --- a/ceph/Makefile +++ b/ceph/Makefile @@ -51,6 +51,7 @@ COMMON_OBJS= \ osd/Filer.o\ osd/OSDMap.o\ osd/PG.o\ + osd/ObjectStore.o\ ebofs.o\ common/Logger.o\ common/Clock.o\ diff --git a/ceph/TODO b/ceph/TODO index defaa76ef16a0..f85d7269f7d59 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -14,32 +14,45 @@ MAR 1 - stability - ebofs table.remove() thing + - fakestore crapping out.. missing timer events? - mds - vary log stripe size, count (on ebofs and fakestore) ? 4k for ebofs, 64k for fakestore ? scount=4... or 2? + -> striping mostly useless - makedirs vs ebo/fake. - streaming small writes (mds log) - -- mds scaling - - shared /lib, /include .. verify fixed open() behavior works! + -> ebofs needs 200 clients per MDS to get close to fakestore (compensate for latency?) - quick retest of crush vs linear -- workloads +- scaling (scale mds+osd+clients together) + - metadata - makedirs +** 100 c per mds is good on googoo ... testing on alc as mds_log.1 + + - creates in same dir + --syn makefiles 10000 10 0 + - repeat creat+stat+unlink + - creates in private dir + - repeat creat+stat+unlink + - zillion opens of the same file + --syn only 0 --syn createshared 10 --syn openshared 10 1000 - - kernel compile? + - local compile + - shared compile (/lib, /include) + - data scaling + - writes to local files + - strided write to shared file + - strided write with O_SYNC -- MDS mkdir needs EDirUpdate event - ...or else we can get unlogged leftover dirty dirs @@ -55,7 +68,9 @@ OSDI - obfs? - osd write tests - - get obfs working + - ebofs vs fakestore: + plot "log/osd/write_sizes.swap1.block/c" u 1:2 t "fs=ebofs", "log/osd/write_sizes.sdb2.ext3.fake/c" u 1:2 t "fs=fakestore.sdb2", "log/osd/write_sizes.sdb2.ext3.fake2/c" u 1:2 t "fs=fakestore.sdb2.again", "log/osd/write_sizes.sdb2.ebo/c" u 1:2 t "fs=ebofs.again" + - get obfs working? - client buffer cache! diff --git a/ceph/client/Client.cc b/ceph/client/Client.cc index bda4f98ec40ee..3fbb46aa0fa71 100644 --- a/ceph/client/Client.cc +++ b/ceph/client/Client.cc @@ -225,7 +225,7 @@ Inode* Client::insert_inode_info(Dir *dir, c_inode_info *in_info) Dentry *dn = NULL; if (dir->dentries.count(dname)) dn = dir->dentries[dname]; - dout(12) << "insert_inode_info " << dname << " ino " << hex << in_info->inode.ino << dec << " size " << in_info->inode.size << endl; + dout(12) << "insert_inode_info " << dname << " ino " << hex << in_info->inode.ino << dec << " size " << in_info->inode.size << " hashed " << in_info->hashed << endl; if (dn) { if (dn->inode->inode.ino == in_info->inode.ino) { @@ -284,16 +284,20 @@ Inode* Client::insert_inode_info(Dir *dir, c_inode_info *in_info) *(dn->inode->symlink) = in_info->symlink; } - // take note of latest distribution on mds's + // dir info + dn->inode->dir_auth = in_info->dir_auth; + dn->inode->dir_hashed = in_info->hashed; + dn->inode->dir_replicated = in_info->replicated; + + // dir replication if (in_info->spec_defined) { - if (in_info->dist.empty() && !dn->inode->mds_contacts.empty()) { - dout(9) << "lost dist spec for " << hex << dn->inode->inode.ino << dec << " " << in_info->dist << endl; - } - if (!in_info->dist.empty() && dn->inode->mds_contacts.empty()) { - dout(9) << "got dist spec for " << hex << dn->inode->inode.ino << dec << " " << in_info->dist << endl; - } - dn->inode->mds_contacts = in_info->dist; - dn->inode->mds_dir_auth = in_info->dir_auth; + if (in_info->dist.empty() && !dn->inode->dir_contacts.empty()) + dout(9) << "lost dist spec for " << hex << dn->inode->inode.ino << dec + << " " << in_info->dist << endl; + if (!in_info->dist.empty() && dn->inode->dir_contacts.empty()) + dout(9) << "got dist spec for " << hex << dn->inode->inode.ino << dec + << " " << in_info->dist << endl; + dn->inode->dir_contacts = in_info->dist; } return dn->inode; @@ -307,21 +311,29 @@ void Client::insert_trace(const vector& trace) Inode *cur = root; time_t now = time(NULL); - if (trace.empty()) return; + if (trace.empty()) { + return; + } for (unsigned i=0; iinode = trace[i]->inode; - inode_map[root->inode.ino] = root; + root->inode = in_info->inode; + inode_map[root->inode.ino] = root; } - if (trace[i]->spec_defined) { - root->mds_contacts = trace[i]->dist; - root->mds_dir_auth = trace[i]->dir_auth; - } + root->last_updated = now; - dout(12) << "insert_trace trace " << i << " root" << endl; + + root->dir_auth = in_info->dir_auth; + root->dir_hashed = in_info->hashed; + root->dir_replicated = in_info->replicated; + if (in_info->spec_defined) + root->dir_contacts = in_info->dist; + + dout(12) << "insert_trace trace " << i << " root .. rep=" << root->dir_replicated << endl; } else { dout(12) << "insert_trace trace " << i << endl; Dir *dir = cur->open_dir(); @@ -379,42 +391,60 @@ MClientReply *Client::make_request(MClientRequest *req, bool auth_best, int use_mds) // this param is icky, debug weirdness! { - // send to what MDS? find deepest known prefix - Inode *cur = root; - for (unsigned i=0; iget_filepath().depth(); i++) { - if (cur && cur->inode.mode & INODE_MODE_DIR && cur->dir) { - Dir *dir = cur->dir; - if (dir->dentries.count( req->get_filepath()[i] ) == 0) + // find deepest known prefix + Inode *diri = root; // the deepest known containing dir + Inode *item = 0; // the actual item... if we know it + int missing_dn = -1; // which dn we miss on (if we miss) + + unsigned depth = req->get_filepath().depth(); + for (unsigned i=0; iinode.mode & INODE_MODE_DIR && diri->dir) { + Dir *dir = diri->dir; + + // do we have the next dentry? + if (dir->dentries.count( req->get_filepath()[i] ) == 0) { + missing_dn = i; // no. break; + } - dout(7) << " have path seg " << i << " on " << cur->mds_dir_auth << " ino " << hex << cur->inode.ino << dec << " " << req->get_filepath()[i] << endl; - cur = dir->dentries[ req->get_filepath()[i] ]->inode; - assert(cur); - } else + dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << hex << diri->inode.ino << dec << " " << req->get_filepath()[i] << endl; + + if (i == depth-1) { // last one! + item = dir->dentries[ req->get_filepath()[i] ]->inode; + break; + } + + // continue.. + diri = dir->dentries[ req->get_filepath()[i] ]->inode; + assert(diri); + } else { + missing_dn = i; break; + } } - + + // choose an mds int mds = 0; - if (cur) { - if (!auth_best && cur->get_replicas().size()) { - // try replica(s) - dout(9) << "contacting replica from deepest inode " << hex << cur->inode.ino << dec << " " << req->get_filepath() << ": " << cur->get_replicas() << endl; - set::iterator it = cur->get_replicas().begin(); - if (cur->get_replicas().size() == 1) - mds = *it; - else { - int r = rand() % cur->get_replicas().size(); - while (r--) it++; - mds = *it; + if (diri) { + if (auth_best) { + // pick the actual auth (as best we can) + if (item) { + mds = item->authority(mdcluster); + } else if (diri->dir_hashed) { + mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), + mdcluster); + } else { + mds = diri->authority(mdcluster); } } else { - // try auth - mds = cur->authority(); - //if (!auth_best && req->get_filepath().get_path()[0] == '/') - dout(9) << "contacting auth mds " << mds << " auth_best " << auth_best << " for " << req->get_filepath() << endl; + // balance our traffic! + if (diri->dir_hashed) + mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), + mdcluster); + else + mds = diri->pick_replica(mdcluster); } - } else { - dout(9) << "i have no idea where " << req->get_filepath() << " is" << endl; } // force use of a particular mds? @@ -762,13 +792,13 @@ void Client::handle_file_caps(MClientFileCaps *m) for (map::iterator p = in->caps.begin(); p != in->caps.end(); p++) - dout(0) << " cap " << p->first << " " + dout(20) << " left cap " << p->first << " " << cap_string(p->second.caps) << " " << p->second.seq << endl; for (map::iterator p = in->stale_caps.begin(); p != in->stale_caps.end(); p++) - dout(0) << " stale cap " << p->first << " " + dout(20) << " left stale cap " << p->first << " " << cap_string(p->second.caps) << " " << p->second.seq << endl; @@ -964,6 +994,9 @@ int Client::mount(int mkfs) client_lock.Lock(); assert(reply); + // mdcluster! + mdcluster = new MDCluster(g_conf.num_mds, g_conf.num_osd); // FIXME this is stoopid + // we got osdmap! osdmap->decode(reply->get_osd_map_state()); diff --git a/ceph/client/Client.h b/ceph/client/Client.h index 40eecd39c915a..207a49c5c136e 100644 --- a/ceph/client/Client.h +++ b/ceph/client/Client.h @@ -15,7 +15,7 @@ #ifndef __CLIENT_H #define __CLIENT_H -//#include "mds/MDCluster.h" +#include "mds/MDCluster.h" #include "osd/OSDMap.h" #include "msg/Message.h" @@ -80,6 +80,7 @@ class Dentry : public LRUObject { void put() { assert(ref == 1); ref--; lru_unpin(); } Dentry() : dir(0), inode(0), ref(0) { } + /*Dentry() : name(0), dir(0), inode(0), ref(0) { } Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { name = new char[n.length()+1]; @@ -113,10 +114,13 @@ class InodeCap { class Inode { public: inode_t inode; // the actual inode - int mds_dir_auth; - set mds_contacts; time_t last_updated; + // about the dir (if this is one!) + int dir_auth; + set dir_contacts; + bool dir_hashed, dir_replicated; + // per-mds caps map caps; // mds -> InodeCap map stale_caps; // mds -> cap .. stale @@ -144,7 +148,9 @@ class Inode { //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl; } - Inode() : mds_dir_auth(-1), last_updated(0), + Inode() : + last_updated(0), + dir_auth(-1), dir_hashed(false), dir_replicated(false), file_wr_mtime(0), file_wr_size(0), num_rd(0), num_wr(0), ref(0), dir(0), dn(0), symlink(0) { } ~Inode() { @@ -177,29 +183,52 @@ class Inode { return w; } - int authority() { - // my info valid? - if (mds_dir_auth >= 0) - return mds_dir_auth; - - // otherwise try parent - if (dn && dn->dir && dn->dir->parent_inode) - return dn->dir->parent_inode->authority(); + int authority(MDCluster *mdcluster) { + // parent? + if (dn && dn->dir && dn->dir->parent_inode) { + // parent hashed? + if (dn->dir->parent_inode->dir_hashed) { + // hashed + return mdcluster->hash_dentry( dn->dir->parent_inode->ino(), + dn->name ); + } + + if (dir_auth >= 0) + return dir_auth; + else + return dn->dir->parent_inode->authority(mdcluster); + } + + if (dir_auth >= 0) + return dir_auth; - return 0; // who knows! + assert(0); // !!! + return 0; } - set& get_replicas() { - if (mds_contacts.size()) - return mds_contacts; - if (is_dir()) { - return mds_contacts; - } - if (dn && dn->dir && dn->dir->parent_inode) { - return dn->dir->parent_inode->get_replicas(); + int dentry_authority(const char *dn, + MDCluster *mdcluster) { + return mdcluster->hash_dentry( ino(), + dn ); + } + int pick_replica(MDCluster *mdcluster) { + // replicas? + if (dir_contacts.size()) { + set::iterator it = dir_contacts.begin(); + if (dir_contacts.size() == 1) + return *it; + else { + int r = rand() % dir_contacts.size(); + while (r--) it++; + return *it; + } } - return mds_contacts; + + if (dir_replicated) + return rand() % mdcluster->get_num_mds(); // huh.. pick a random mds! + else + return authority(mdcluster); } - + // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). Dir *open_dir() { @@ -239,7 +268,7 @@ class Client : public Dispatcher { int whoami; // cluster descriptors - //MDCluster *mdcluster; + MDCluster *mdcluster; OSDMap *osdmap; bool mounted; diff --git a/ceph/client/SyntheticClient.cc b/ceph/client/SyntheticClient.cc index d1ab86d3315a1..3c02982ca99fc 100644 --- a/ceph/client/SyntheticClient.cc +++ b/ceph/client/SyntheticClient.cc @@ -46,6 +46,7 @@ void parse_syn_options(vector& args) for (unsigned i=0; i& args) syn_modes.push_back( SYNCLIENT_MODE_READFILE ); syn_iargs.push_back( a ); syn_iargs.push_back( b ); + } else if (strcmp(args[i],"makedirs") == 0) { syn_modes.push_back( SYNCLIENT_MODE_MAKEDIRS ); syn_iargs.push_back( atoi(args[++i]) ); syn_iargs.push_back( atoi(args[++i]) ); syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"makefiles") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_MAKEFILES ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"createshared") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_CREATESHARED ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"openshared") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_OPENSHARED ); + syn_iargs.push_back( atoi(args[++i]) ); + syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"fullwalk") == 0) { syn_modes.push_back( SYNCLIENT_MODE_FULLWALK ); //syn_sargs.push_back( atoi(args[++i]) ); } else if (strcmp(args[i],"randomwalk") == 0) { syn_modes.push_back( SYNCLIENT_MODE_RANDOMWALK ); syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"trace") == 0) { syn_modes.push_back( SYNCLIENT_MODE_TRACE ); syn_sargs.push_back( args[++i] ); @@ -101,6 +117,7 @@ void parse_syn_options(vector& args) } else if (strcmp(args[i],"opentest") == 0) { syn_modes.push_back( SYNCLIENT_MODE_OPENTEST ); syn_iargs.push_back( atoi(args[++i]) ); + } else { cerr << "unknown syn mode " << args[i] << endl; assert(0); @@ -239,6 +256,49 @@ int SyntheticClient::run() } } break; + case SYNCLIENT_MODE_MAKEFILES: + { + int num = iargs.front(); iargs.pop_front(); + int count = iargs.front(); iargs.pop_front(); + int priv = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makefiles " << num << " " << count << " " << priv << endl; + make_files(num, count, priv, false); + } + } + break; + case SYNCLIENT_MODE_MAKEFILES2: + { + int num = iargs.front(); iargs.pop_front(); + int count = iargs.front(); iargs.pop_front(); + int priv = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makefiles2 " << num << " " << count << " " << priv << endl; + make_files(num, count, priv, true); + } + } + break; + case SYNCLIENT_MODE_CREATESHARED: + { + string sarg1 = get_sarg(0); + int num = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "createshared " << num << endl; + create_shared(num); + } + } + break; + case SYNCLIENT_MODE_OPENSHARED: + { + string sarg1 = get_sarg(0); + int num = iargs.front(); iargs.pop_front(); + int count = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "openshared " << num << endl; + open_shared(num, count); + } + } + break; case SYNCLIENT_MODE_FULLWALK: { @@ -630,6 +690,86 @@ int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int dep } +int SyntheticClient::make_files(int num, int count, int priv, bool more) +{ + int whoami = client->get_nodeid(); + char d[255]; + + if (priv) { + for (int c=0; cmkdir(d, 0755); + } + } else { + // shared + if (whoami == 0) { + for (int c=0; cmkdir(d, 0755); + } + } else { + sleep(5); + } + } + + // files + struct stat st; + for (int c=0; cmknod(d, 0644); + + if (more) { + client->lstat(d, &st); + int fd = client->open(d, O_RDONLY); + client->unlink(d); + client->close(fd); + } + + if (time_to_stop()) return 0; + } + } + + return 0; +} + + +int SyntheticClient::create_shared(int num) +{ + // files + char d[255]; + for (int n=0; nmknod(d, 0644); + } + + return 0; +} + +int SyntheticClient::open_shared(int num, int count) +{ + // files + char d[255]; + for (int c=0; c fds; + for (int n=0; nopen(d,O_RDONLY); + fds.push_back(fd); + } + + while (!fds.empty()) { + int fd = fds.front(); + fds.pop_front(); + client->close(fd); + } + } + + return 0; +} + int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is in MB, wrsize in bytes { diff --git a/ceph/client/SyntheticClient.h b/ceph/client/SyntheticClient.h index c6b838bf7b051..e5fd54943f6d9 100644 --- a/ceph/client/SyntheticClient.h +++ b/ceph/client/SyntheticClient.h @@ -24,15 +24,21 @@ #define SYNCLIENT_MODE_RANDOMWALK 1 #define SYNCLIENT_MODE_FULLWALK 2 -#define SYNCLIENT_MODE_MAKEDIRS 3 -#define SYNCLIENT_MODE_WRITEFILE 4 -#define SYNCLIENT_MODE_READFILE 5 #define SYNCLIENT_MODE_REPEATWALK 7 -#define SYNCLIENT_MODE_WRITEBATCH 8 -#define SYNCLIENT_MODE_TRACE 20 +#define SYNCLIENT_MODE_MAKEDIRS 10 // dirs files depth +#define SYNCLIENT_MODE_MAKEFILES 11 // num count private +#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private +#define SYNCLIENT_MODE_CREATESHARED 13 // num +#define SYNCLIENT_MODE_OPENSHARED 14 // num count -#define SYNCLIENT_MODE_OPENTEST 30 +#define SYNCLIENT_MODE_WRITEFILE 20 +#define SYNCLIENT_MODE_READFILE 21 +#define SYNCLIENT_MODE_WRITEBATCH 22 + +#define SYNCLIENT_MODE_TRACE 30 + +#define SYNCLIENT_MODE_OPENTEST 40 #define SYNCLIENT_MODE_ONLY 50 #define SYNCLIENT_MODE_UNTIL 51 @@ -159,7 +165,13 @@ class SyntheticClient { int full_walk(string& fromdir); int random_walk(int n); + int make_dirs(const char *basedir, int dirs, int files, int depth); + int make_files(int num, int count, int priv, bool more); + + int create_shared(int num); + int open_shared(int num, int count); + int write_file(string& fn, int mb, int chunk); int write_batch(int nfile, int mb, int chunk); int read_file(string& fn, int mb, int chunk); diff --git a/ceph/config.cc b/ceph/config.cc index 73d5c658b27b7..ff066042b37ee 100644 --- a/ceph/config.cc +++ b/ceph/config.cc @@ -113,9 +113,10 @@ md_config_t g_conf = { mds_bal_replicate_threshold: 8000, mds_bal_unreplicate_threshold: 1000, - //mds_bal_hash_threshold: 20000, - //mds_bal_unhash_threshold: 2000, + mds_bal_hash_threshold: 2000, + mds_bal_unhash_threshold: 250, mds_bal_interval: 30, // seconds + mds_bal_hash_interval: 5, // seconds mds_bal_idle_threshold: .1, mds_bal_max: -1, mds_bal_max_until: -1, @@ -135,6 +136,7 @@ md_config_t g_conf = { osd_max_opq: 10, osd_mkfs: false, osd_fake_sync: false, + osd_age: 0.0, // --- fakestore --- fakestore_fake_sync: 2, // 2 seconds @@ -414,6 +416,8 @@ void parse_config_options(vector& args) else if (strcmp(args[i], "--osd_mkfs") == 0) g_conf.osd_mkfs = atoi(args[++i]); + else if (strcmp(args[i], "--osd_age") == 0) + g_conf.osd_age = atof(args[++i]); else if (strcmp(args[i], "--osd_pg_bits") == 0) g_conf.osd_pg_bits = atoi(args[++i]); else if (strcmp(args[i], "--osd_max_rep") == 0) diff --git a/ceph/config.h b/ceph/config.h index 319e0e4a4a930..a1e86468aa913 100644 --- a/ceph/config.h +++ b/ceph/config.h @@ -83,7 +83,10 @@ struct md_config_t { float mds_bal_replicate_threshold; float mds_bal_unreplicate_threshold; + float mds_bal_hash_threshold; + float mds_bal_unhash_threshold; int mds_bal_interval; + int mds_bal_hash_interval; float mds_bal_idle_threshold; int mds_bal_max; int mds_bal_max_until; @@ -102,6 +105,7 @@ struct md_config_t { int osd_max_opq; bool osd_mkfs; bool osd_fake_sync; + float osd_age; int fakestore_fake_sync; diff --git a/ceph/ebofs/Ebofs.cc b/ceph/ebofs/Ebofs.cc index 95f280ba5027c..a9ac581f453a0 100644 --- a/ceph/ebofs/Ebofs.cc +++ b/ceph/ebofs/Ebofs.cc @@ -1168,17 +1168,17 @@ void Ebofs::sync() { ebofs_lock.Lock(); if (!dirty) { - dout(3) << "sync in " << super_epoch << ", not dirty" << endl; + dout(0) << "sync in " << super_epoch << ", not dirty" << endl; } else { - dout(3) << "sync in " << super_epoch << endl; + dout(0) << "sync in " << super_epoch << endl; if (!commit_thread_started) { - dout(10) << "sync waiting for commit thread to start" << endl; + dout(0) << "sync waiting for commit thread to start" << endl; sync_cond.Wait(ebofs_lock); } if (mid_commit) { - dout(10) << "sync waiting for commit in progress" << endl; + dout(0) << "sync waiting for commit in progress" << endl; sync_cond.Wait(ebofs_lock); } @@ -1186,7 +1186,7 @@ void Ebofs::sync() sync_cond.Wait(ebofs_lock); // wait - dout(3) << "sync finish in " << super_epoch << endl; + dout(0) << "sync finish in " << super_epoch << endl; } ebofs_lock.Unlock(); } @@ -1213,8 +1213,8 @@ int Ebofs::statfs(struct statfs *buf) buf->f_type = EBOFS_MAGIC; /* type of filesystem */ buf->f_bsize = 4096; /* optimal transfer block size */ buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */ - buf->f_bfree = free_blocks; /* free blocks in fs */ - buf->f_bavail = free_blocks; /* free blocks avail to non-superuser */ + buf->f_bfree = free_blocks + limbo_blocks; /* free blocks in fs */ + buf->f_bavail = free_blocks + limbo_blocks; /* free blocks avail to non-superuser */ buf->f_files = nodepool.num_total(); /* total file nodes in file system */ buf->f_ffree = nodepool.num_free(); /* free file nodes in fs */ //buf->f_fsid = 0; /* file system id */ diff --git a/ceph/include/types.h b/ceph/include/types.h index 64f5e573e1b18..dc503fa1cbc6f 100644 --- a/ceph/include/types.h +++ b/ceph/include/types.h @@ -267,7 +267,8 @@ typedef int fh_t; // file handle #define MDS_POP_NESTED 1 // me + children, auth or not #define MDS_POP_CURDOM 2 // me + children in current domain #define MDS_POP_ANYDOM 3 // me + children in any (nested) domain -#define MDS_NPOP 4 +#define MDS_POP_DIRMOD 4 // just this dir, modifications only +#define MDS_NPOP 5 class mds_load_t { public: diff --git a/ceph/mds/CDir.cc b/ceph/mds/CDir.cc index 112f460ad0ff8..a148626794fe3 100644 --- a/ceph/mds/CDir.cc +++ b/ceph/mds/CDir.cc @@ -58,7 +58,7 @@ ostream& operator<<(ostream& out, CDir& dir) if (dir.is_dirty()) out << " dirty"; if (dir.is_import()) out << " import"; if (dir.is_export()) out << " export"; - if (dir.is_hashed()) out << " hashed"; + if (dir.is_hashed()) out << " hashed"; //=" << (int)dir.get_inode()->inode.hash_seed; if (dir.is_auth()) { out << " auth"; if (dir.is_open_by_anyone()) @@ -118,11 +118,13 @@ CDir::CDir(CInode *in, MDS *mds, bool auth) assert(in->is_dir()); if (auth) state |= CDIR_STATE_AUTH; + /* if (in->dir_is_hashed()) { assert(0); // when does this happen? state |= CDIR_STATE_HASHED; } - + */ + auth_pins = 0; nested_auth_pins = 0; request_pins = 0; diff --git a/ceph/mds/CDir.h b/ceph/mds/CDir.h index 6aa6cb917ce2d..2e566656ef34f 100644 --- a/ceph/mds/CDir.h +++ b/ceph/mds/CDir.h @@ -369,6 +369,7 @@ class CDir { if (( popularity[MDS_POP_CURDOM].get() > g_conf.mds_bal_replicate_threshold)) { //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; ls = open_by; + if (!ls.empty()) ls.insert(auth); } } diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h index 024d64a89351d..55842885a0b7c 100644 --- a/ceph/mds/CInode.h +++ b/ceph/mds/CInode.h @@ -261,10 +261,6 @@ class CInode : LRUObject { CDir *get_or_open_dir(MDS *mds); CDir *set_dir(CDir *newdir); - bool dir_is_hashed() { - if (inode.hash_seed) return true; - return false; - } bool dir_is_auth(); diff --git a/ceph/mds/MDBalancer.cc b/ceph/mds/MDBalancer.cc index 3b0370c442c0d..f502a43b62054 100644 --- a/ceph/mds/MDBalancer.cc +++ b/ceph/mds/MDBalancer.cc @@ -217,6 +217,33 @@ double MDBalancer::try_match(int ex, double& maxex, +void MDBalancer::do_hashing() +{ + if (hash_queue.empty()) { + dout(20) << "do_hashing has nothing to do" << endl; + return; + } + + dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl; + + for (set::iterator i = hash_queue.begin(); + i != hash_queue.end(); + i++) { + inodeno_t dirino = *i; + CInode *in = mds->mdcache->get_inode(dirino); + if (!in) continue; + CDir *dir = in->dir; + if (!dir) continue; + if (!dir->is_auth()) continue; + + dout(0) << "do_hashing hashing " << *dir << endl; + mds->mdcache->hash_dir(dir); + } + hash_queue.clear(); +} + + + void MDBalancer::do_rebalance(int beat) { int cluster_size = mds->get_cluster()->get_num_mds(); @@ -563,13 +590,27 @@ void MDBalancer::hit_inode(CInode *in) } -void MDBalancer::hit_dir(CDir *dir) +void MDBalancer::hit_dir(CDir *dir, bool modify) { // hit me dir->popularity[MDS_POP_JUSTME].hit(); - - hit_recursive(dir); + // hit modify counter, if this was a modify + if (modify) { + float p = dir->popularity[MDS_POP_DIRMOD].hit(); + + if (dir->is_auth()) { + // hash this dir? (later?) + if (p > g_conf.mds_bal_hash_threshold && + !(dir->is_hashed() || dir->is_hashing()) && + hash_queue.count(dir->ino()) == 0) { + dout(0) << "hit_dir DIRMOD pop is " << p << ", putting in hash_queue: " << *dir << endl; + hash_queue.insert(dir->ino()); + } + } + } + + hit_recursive(dir); } diff --git a/ceph/mds/MDBalancer.h b/ceph/mds/MDBalancer.h index 203497042e8da..1c1f83ab8943e 100644 --- a/ceph/mds/MDBalancer.h +++ b/ceph/mds/MDBalancer.h @@ -39,6 +39,9 @@ class MDBalancer { int beat_epoch; + // todo + set hash_queue; + // per-epoch scatter/gathered info hash_map mds_load; map > mds_import_map; @@ -69,6 +72,8 @@ class MDBalancer { void send_heartbeat(); void handle_heartbeat(MHeartbeat *m); + void do_hashing(); + void export_empties(); void do_rebalance(int beat); void find_exports(CDir *dir, @@ -82,7 +87,7 @@ class MDBalancer { void add_import(class CDir *im); void hit_inode(class CInode *in); - void hit_dir(class CDir *dir); + void hit_dir(class CDir *dir, bool modify=false); void hit_recursive(class CDir *dir); diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index 0e2e38b2b5f42..8021a37a87a54 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -7420,7 +7420,7 @@ void MDCache::hash_dir_frozen(CDir *dir) if (!in->is_dir()) continue; if (!in->dir) continue; - int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first ); + int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first ); if (dentryhashcode == mds->get_nodeid()) continue; // msg? @@ -7496,7 +7496,7 @@ void MDCache::hash_dir_go(CDir *dir) } // pick a hash seed. - dir->inode->inode.hash_seed = dir->ino(); + dir->inode->inode.hash_seed = 1;//dir->ino(); // suck up all waiters C_Contexts *fin = new C_Contexts; @@ -7515,7 +7515,7 @@ void MDCache::hash_dir_go(CDir *dir) CDentry *dn = it->second; CInode *in = dn->inode; - int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first ); + int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first ); if (dentryhashcode == mds->get_nodeid()) { continue; // still mine! } @@ -7679,7 +7679,7 @@ void MDCache::handle_hash_dir_ack(MHashDirAck *m) dout(7) << "got notifies too, all done" << endl; hash_dir_finish(dir); } else { - dout(7) << "waiting on notifies" << endl; + dout(7) << "waiting on notifies " << endl; } } else { @@ -7791,7 +7791,7 @@ void MDCache::handle_hash_dir_notify(MHashDirNotify *m) if (!in) continue; if (!in->dir) continue; - int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first ); + int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first ); if (dentryhashcode != from) continue; // we'll import these in a minute if (in->dir->authority() != dentryhashcode) @@ -7889,7 +7889,7 @@ void MDCache::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r) assert(dir->hashed_subset.empty()); // inode state - dir->inode->inode.hash_seed = dir->ino(); + dir->inode->inode.hash_seed = 1;// dir->ino(); if (dir->inode->is_auth()) dir->inode->mark_dirty(); @@ -8504,7 +8504,7 @@ void MDCache::unhash_dir_prep_finish(CDir *dir) if (!in->is_dir()) continue; if (!in->dir) continue; - int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first ); + int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first ); if (dentryhashcode != mds->get_nodeid()) continue; // msg? @@ -8529,7 +8529,7 @@ void MDCache::handle_unhash_dir(MUnhashDir *m) CDir *dir = in->dir; assert(dir); - dout(7) << "handle_unhash_dir " << *dir << endl; + dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl; assert(dir->is_hashed()); assert(dir->is_unhashing()); assert(!dir->is_auth()); @@ -8551,7 +8551,7 @@ void MDCache::handle_unhash_dir(MUnhashDir *m) CDentry *dn = it->second; CInode *in = dn->inode; - int dentryhashcode = mds->get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first ); + int dentryhashcode = mds->get_cluster()->hash_dentry( dir->ino(), it->first ); if (dentryhashcode != mds->get_nodeid()) { // not mine! // twiddle dir_auth? diff --git a/ceph/mds/MDCluster.cc b/ceph/mds/MDCluster.cc index 4379873bd62cf..6eb56dff0351c 100644 --- a/ceph/mds/MDCluster.cc +++ b/ceph/mds/MDCluster.cc @@ -76,7 +76,7 @@ int MDCluster::hash_dentry( inodeno_t dirino, const string& dn ) r %= num_mds; - dout(22) << "hash_dentry(" << dirino << ", " << dn << ") -> " << r << endl; + dout(30) << "hash_dentry(" << dirino << ", " << dn << ") -> " << r << endl; return r; } diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc index 3e3ed29b45bfd..0b18ef30e9223 100644 --- a/ceph/mds/MDS.cc +++ b/ceph/mds/MDS.cc @@ -141,7 +141,7 @@ MDS::MDS(MDCluster *mdc, int whoami, Messenger *m) { mds_paused = false; stat_ops = 0; - last_balancer_heartbeat = g_clock.recent_now(); + last_balancer_hash = last_balancer_heartbeat = g_clock.recent_now(); // log string name; @@ -592,6 +592,14 @@ void MDS::my_dispatch(Message *m) balancer->send_heartbeat(); num_bal_times--; } + + // hash? + if (true && + now.sec() - last_balancer_hash.sec() > g_conf.mds_bal_hash_interval) { + last_balancer_hash = now; + balancer->do_hashing(); + } + // HACK to test hashing stuff @@ -1250,7 +1258,7 @@ int MDS::encode_dir_contents(CDir *dir, list& items) // hashed? if (dir->is_hashed() && - whoami != get_cluster()->hash_dentry( dir->inode->inode.hash_seed, it->first )) + whoami != get_cluster()->hash_dentry( dir->ino(), it->first )) continue; // is dentry readable? @@ -1620,6 +1628,9 @@ CInode *MDS::mknod(MClientRequest *req, CInode *diri, bool okexist) else dir->link_inode(dn, newi); + // bump modify pop + balancer->hit_dir(dir, true); + // mark dirty dn->mark_dirty(); newi->mark_dirty(); diff --git a/ceph/mds/MDS.h b/ceph/mds/MDS.h index 917a21747c7fc..09d1924d1de28 100644 --- a/ceph/mds/MDS.h +++ b/ceph/mds/MDS.h @@ -152,7 +152,7 @@ class MDS : public Dispatcher { protected: __uint64_t stat_ops; - utime_t last_balancer_heartbeat; + utime_t last_balancer_heartbeat, last_balancer_hash; public: MDS(MDCluster *mdc, int whoami, Messenger *m); diff --git a/ceph/mds/MDStore.cc b/ceph/mds/MDStore.cc index a26b1f4fd8e2b..332bf843aa8ed 100644 --- a/ceph/mds/MDStore.cc +++ b/ceph/mds/MDStore.cc @@ -574,10 +574,10 @@ void MDStore::commit_dir_slice( CDir *dir, { if (hashcode >= 0) { assert(dir->is_hashed()); - dout(11) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl; + dout(10) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl; } else { assert(dir->is_auth()); - dout(11) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl; + dout(10) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl; } // get continuation ready diff --git a/ceph/mds/events/EDirUpdate.h b/ceph/mds/events/EDirUpdate.h index aa503e9e3dbec..068544854c5d7 100644 --- a/ceph/mds/events/EDirUpdate.h +++ b/ceph/mds/events/EDirUpdate.h @@ -45,7 +45,7 @@ class EDirUpdate : public LogEvent { CDir *dir = in->dir; if (!dir) return true; - dout(0) << "EDirUpdate v " << version << " on dir " << *dir << endl; + dout(10) << "EDirUpdate v " << version << " on dir " << *dir << endl; if (!dir->is_auth()) return true; // not mine! if (dir->is_frozen()) return true; // frozen -> exporting -> obsolete? FIXME @@ -65,7 +65,7 @@ class EDirUpdate : public LogEvent { CDir *dir = in->dir; assert(dir); - dout(0) << "EDirUpdate committing dir " << *dir << endl; + dout(10) << "EDirUpdate committing dir " << *dir << endl; mds->mdstore->commit_dir(dir, c); } diff --git a/ceph/messages/MClientReply.h b/ceph/messages/MClientReply.h index 0d8185fed042a..0b665cc085541 100644 --- a/ceph/messages/MClientReply.h +++ b/ceph/messages/MClientReply.h @@ -56,8 +56,10 @@ class c_inode_info { bool inode_file_valid; // true if inode info is valid (ie was readable on mds at the time) bool inode_hard_valid; // true if inode info is valid (ie was readable on mds at the time) - bool spec_defined; int dir_auth; + bool hashed, replicated; + + bool spec_defined; set dist; // where am i replicated? @@ -76,11 +78,16 @@ class c_inode_info { this->ref_dn = ref_dn; // replicated where? - spec_defined = in->dir && in->dir->is_auth(); - if (spec_defined) { - dir_auth = in->dir->get_dir_auth(); + if (in->dir && in->dir->is_auth()) { + spec_defined = true; in->dir->get_dist_spec(this->dist, whoami); - } + } else + spec_defined = false; + + // dir info + dir_auth = (in->dir && in->dir->get_dir_auth()); + hashed = (in->dir && in->dir->is_hashed()); // FIXME not quite right. + replicated = (in->dir && in->dir->is_rep()); } void _encode(bufferlist &bl) { @@ -89,6 +96,8 @@ class c_inode_info { bl.append((char*)&inode_hard_valid, sizeof(inode_hard_valid)); bl.append((char*)&spec_defined, sizeof(spec_defined)); bl.append((char*)&dir_auth, sizeof(dir_auth)); + bl.append((char*)&hashed, sizeof(hashed)); + bl.append((char*)&replicated, sizeof(replicated)); ::_encode(ref_dn, bl); ::_encode(symlink, bl); @@ -106,6 +115,10 @@ class c_inode_info { off += sizeof(spec_defined); bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); off += sizeof(dir_auth); + bl.copy(off, sizeof(hashed), (char*)&hashed); + off += sizeof(hashed); + bl.copy(off, sizeof(replicated), (char*)&replicated); + off += sizeof(replicated); ::_decode(ref_dn, bl, off); ::_decode(symlink, bl, off); diff --git a/ceph/osd/OSD.cc b/ceph/osd/OSD.cc index d50723fd44209..196b58c752fa8 100644 --- a/ceph/osd/OSD.cc +++ b/ceph/osd/OSD.cc @@ -191,9 +191,14 @@ int OSD::init() dout(2) << "mkfs" << endl; store->mkfs(); + } int r = store->mount(); + if (g_conf.osd_age > 0.0) + store->age(g_conf.osd_age, g_conf.osd_age / 2.0, 2, g_conf.osd_age); + + monitor->init(); osd_lock.Unlock(); diff --git a/ceph/osd/ObjectStore.cc b/ceph/osd/ObjectStore.cc new file mode 100644 index 0000000000000..62fb76e88b7cf --- /dev/null +++ b/ceph/osd/ObjectStore.cc @@ -0,0 +1,139 @@ + +#include "ObjectStore.h" + +#include "config.h" + + +object_t ObjectStore::age_get_oid() { + if (!age_free_oids.empty()) { + object_t o = age_free_oids.front(); + age_free_oids.pop_front(); + return o; + } + return age_cur_oid++; + } + + ssize_t ObjectStore::age_pick_size() { + ssize_t max = file_size_distn.sample() * 1024; + return max/2 + (rand() % 100) * max/200 + 1; + } + + void ObjectStore::age_fill(float pc) { + static char buf[1024*1024]; + bufferlist bl; + bl.push_back(new buffer(buf, 1024*1024)); + while (1) { + struct statfs st; + statfs(&st); + float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; + if (a >= pc) { + dout(10) << "age_fill at " << a << " / " << pc << " stopping" << endl; + break; + } + + object_t oid = age_get_oid(); + + int b = rand() % 10; + age_objects[b].push_back(oid); + + ssize_t s = age_pick_size(); + + dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl; + + off_t off = 0; + while (s) { + ssize_t t = MIN(s, 1024*1024); + write(oid, t, off, bl, false); + off += t; + s -= t; + } + oid++; + } + } + + void ObjectStore::age_empty(float pc) { + int nper = 20; + int n = nper; + while (1) { + struct statfs st; + statfs(&st); + float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; + if (a <= pc) { + dout(10) << "age_empty at " << a << " / " << pc << " stopping" << endl; + break; + } + + int b = rand() % 10; + n--; + if (n == 0 || age_objects[b].empty()) { + dout(10) << "age_empty sync" << endl; + //sync(); + sync(); + n = nper; + continue; + } + object_t oid = age_objects[b].front(); + age_objects[b].pop_front(); + + dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << endl; + + remove(oid); + age_free_oids.push_back(oid); + } + } + + + void ObjectStore::age(float high_water, // fill to this % + float low_water, // then empty to this % + int count, // this many times + float final_water, // and end here ( <= low_water) + int fake_size_mb) { + while (age_objects.size() < 10) age_objects.push_back( list() ); + + if (fake_size_mb) { + int fake_bl = fake_size_mb * 256; + struct statfs st; + statfs(&st); + float f = (float)fake_bl / (float)st.f_blocks; + high_water = (float)high_water * f; + low_water = (float)low_water * f; + final_water = (float)final_water * f; + dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl; + } + + // init size distn (once) + if (!did_distn) { + did_distn = true; + age_cur_oid = 1; + file_size_distn.add(1, 19.0758125+0.65434375); + file_size_distn.add(512, 35.6566); + file_size_distn.add(1024, 27.7271875); + file_size_distn.add(2*1024, 16.63503125); + //file_size_distn.add(4*1024, 106.82384375); + //file_size_distn.add(8*1024, 81.493375); + //file_size_distn.add(16*1024, 14.13553125); + //file_size_distn.add(32*1024, 2.176); + //file_size_distn.add(256*1024, 0.655938); + //file_size_distn.add(512*1024, 0.1480625); + //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit + file_size_distn.normalize(); + } + + // clear + for (int i=0; i<10; i++) + age_objects[i].clear(); + + for (int c=1; c<=count; c++) { + dout(1) << "age " << c << "/" << count << " filling to " << high_water << endl; + age_fill(high_water); + if (c == count) { + dout(1) << "age final empty to " << final_water << endl; + age_empty(final_water); + } else { + dout(1) << "age " << c << "/" << count << " emptying to " << low_water << endl; + age_empty(low_water); + } + } + dout(1) << "age finished" << endl; + } + diff --git a/ceph/osd/ObjectStore.h b/ceph/osd/ObjectStore.h index aacfbc7d2aa7a..11d4858f5f342 100644 --- a/ceph/osd/ObjectStore.h +++ b/ceph/osd/ObjectStore.h @@ -19,17 +19,32 @@ #include "include/Context.h" #include "include/bufferlist.h" +#include "include/Distribution.h" + #include +#include /* or */ #include using namespace std; +#ifndef MIN +# define MIN(a,b) ((a) < (b) ? (a):(b)) +#endif + /* * low-level interface to the local OSD file system */ class ObjectStore { +private: + list age_free_oids; + object_t age_cur_oid; + vector< list > age_objects; + Distribution file_size_distn; //kb + bool did_distn; + public: + ObjectStore() : did_distn(false) {} virtual ~ObjectStore() {} // mgmt @@ -84,6 +99,20 @@ class ObjectStore { + // age store +private: + void age_empty(float pc); + void age_fill(float pc); + ssize_t age_pick_size(); + object_t age_get_oid(); + +public: + void age(float high_water, // fill to this % + float low_water, // then empty to this % + int count, // this many times + float final_water, // and end here ( <= low_water) + int fake_size_mb=0); + }; #endif -- 2.39.5