From 2484df3ee28c730cbedefee8d881fa459cf0f3b0 Mon Sep 17 00:00:00 2001 From: sage Date: Wed, 16 Nov 2005 21:12:13 +0000 Subject: [PATCH] caps fun, but centralized writers ultimatley busted! git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@507 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/TODO | 6 + ceph/client/Client.cc | 189 ++- ceph/client/Client.h | 11 + ceph/client/SyntheticClient.cc | 13 + ceph/client/SyntheticClient.h | 2 + ceph/config.cc | 6 +- ceph/crush/test/bucket_movement.cc | 164 ++ ceph/crush/test/bucket_variance.cc | 199 +++ ceph/crush/test/cluster_movement.cc | 217 +++ ceph/crush/test/cluster_movement_remove.cc | 229 +++ ceph/crush/test/movement_failed.cc | 246 +++ ceph/crush/test/overload.cc | 335 ++++ ceph/crush/test/smallbucket.cc | 138 ++ ceph/crush/test/speed_bucket.cc | 84 + ceph/crush/test/speed_depth.cc | 139 ++ ceph/crush/test/t.cc | 25 + ceph/crush/test/testbucket.cc | 61 + ceph/mds/CInode.cc | 26 +- ceph/mds/CInode.h | 176 +- ceph/mds/Capability.h | 120 +- ceph/mds/Lock.h | 243 ++- ceph/mds/MDCache.cc | 1689 ++++++++++++-------- ceph/mds/MDCache.h | 42 +- ceph/mds/MDS.cc | 127 +- ceph/mds/MDStore.h | 1 + ceph/messages/MClientFileCaps.h | 40 +- ceph/messages/MClientReply.h | 10 +- ceph/messages/MInodeFileCaps.h | 41 + ceph/messages/MLock.h | 33 +- ceph/msg/Message.h | 2 + ceph/msg/Messenger.cc | 10 + ceph/msg/Messenger.h | 1 + ceph/msg/TCPMessenger.cc | 21 +- 33 files changed, 3482 insertions(+), 1164 deletions(-) create mode 100644 ceph/crush/test/bucket_movement.cc create mode 100644 ceph/crush/test/bucket_variance.cc create mode 100644 ceph/crush/test/cluster_movement.cc create mode 100644 ceph/crush/test/cluster_movement_remove.cc create mode 100644 ceph/crush/test/movement_failed.cc create mode 100644 ceph/crush/test/overload.cc create mode 100644 ceph/crush/test/smallbucket.cc create mode 100644 ceph/crush/test/speed_bucket.cc create mode 100644 ceph/crush/test/speed_depth.cc create mode 100644 ceph/crush/test/t.cc create mode 100644 ceph/crush/test/testbucket.cc create mode 100644 ceph/messages/MInodeFileCaps.h diff --git a/ceph/TODO b/ceph/TODO index b51e33641415a..05247ff9b6b38 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -1,3 +1,9 @@ +- open -> export -> caps update race potential... +- caps from multiple mds's! +- import/export of caps totally busted.. (merging??) + + + cluster issues - general problem: how to do posix ordering on object boundaries using an object store diff --git a/ceph/client/Client.cc b/ceph/client/Client.cc index 767e14192fe52..cf88537c8c80d 100644 --- a/ceph/client/Client.cc +++ b/ceph/client/Client.cc @@ -348,7 +348,7 @@ Dentry *Client::lookup(filepath& path) MClientReply *Client::make_request(MClientRequest *req, bool auth_best, - int use_auth) // this param is icky! + int use_mds) // this param is icky, debug weirdness! { // send to what MDS? find deepest known prefix Inode *cur = root; @@ -388,9 +388,8 @@ MClientReply *Client::make_request(MClientRequest *req, dout(9) << "i have no idea where " << req->get_filepath() << " is" << endl; } - // force use of a particular mds auth? - if (use_auth >= 0) - mds = use_auth; + // force use of a particular mds? + if (use_mds >= 0) mds = use_mds; // drop mutex for duration of call client_lock->Unlock(); @@ -627,18 +626,42 @@ void Client::release_inode_buffers(Inode *in) void Client::handle_file_caps(MClientFileCaps *m) { + if (inode_map.count(m->get_ino()) == 0) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " " << cap_string(m->get_caps()) << ", which we don't have, releasing." << endl; + m->set_caps(0); + m->set_wanted(0); + messenger->send_message(m, m->get_source(), m->get_source_port()); + return; + } + Inode *in = inode_map[ m->get_ino() ]; assert(in); - // auth? + if (in->file_caps_wanted() == 0) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " " << cap_string(m->get_caps()) << ", which we don't want caps for, releasing." << endl; + m->set_caps(0); + m->set_wanted(0); + messenger->send_message(m, m->get_source(), m->get_source_port()); + return; + } + if (m->get_seq() <= in->file_caps_seq) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " old seq " << m->get_seq() << " <= " << in->file_caps_seq << endl; + delete m; + return; + } + + // new mds auth? if (m->get_mds() >= 0) { in->file_mds = m->get_mds(); - dout(5) << "handle_file_caps on in " << m->get_ino() << " mds now " << in->file_mds << endl; + dout(5) << "handle_file_caps on ino " << m->get_ino() << " mds now " << in->file_mds << endl; } - + + + int old_caps = in->file_caps; in->file_caps = m->get_caps(); - dout(5) << "handle_file_caps on in " << m->get_ino() << " caps now " << in->file_caps << endl; - + in->file_caps_seq = m->get_seq(); + dout(5) << "handle_file_caps on in " << m->get_ino() << " seq " << m->get_seq() << " caps now " << cap_string(in->file_caps) << " was " << cap_string(old_caps) << endl; + // update inode in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! @@ -646,18 +669,17 @@ void Client::handle_file_caps(MClientFileCaps *m) if (in->file_caps & CAP_FILE_WRBUFFER == 0) { flush_inode_buffers(in); Filecache *fc = bc->get_fc(in); - fc->wait_for_inflight(client_lock); + fc->wait_for_inflight(client_lock); // FIXME: this isn't actually allowed to block is it?!? } // release buffers? if (in->file_caps & CAP_FILE_RDCACHE == 0) release_inode_buffers(in); - // ack - if (m->needs_ack()) { - dout(5) << "acking" << endl; + // ack? + if (old_caps & ~in->file_caps) { + dout(5) << " we lost caps " << cap_string(old_caps & ~in->file_caps) << ", acking" << endl; messenger->send_message(m, m->get_source(), m->get_source_port()); - return; } // wake up waiters? @@ -684,6 +706,52 @@ void Client::handle_file_caps(MClientFileCaps *m) } +void Client::release_caps(Inode *in, + int retain) +{ + dout(5) << "releasing caps on ino " << in->inode.ino + << " had " << cap_string(in->file_caps) + << " retaining " << cap_string(retain) + << endl; + + in->file_caps = retain; + + // release + MClientFileCaps *m = new MClientFileCaps(in->inode, + in->file_caps_seq, + in->file_caps, + in->file_caps_wanted(), + whoami); + messenger->send_message(m, + MSG_ADDR_MDS(in->file_mds), MDS_PORT_CACHE); + + if ((in->file_caps & CAP_FILE_WR) == 0) { + in->file_wr_mtime = 0; + in->file_wr_size = 0; + } + + // release caps completely? + if (in->file_caps == 0) { + in->file_caps_seq = 0; + in->file_mds = 0; + put_inode(in); + } +} + +void Client::update_caps_wanted(Inode *in) +{ + dout(5) << "updating caps wanted on ino " << in->inode.ino + << " to " << cap_string(in->file_caps_wanted()) + << endl; + + MClientFileCaps *m = new MClientFileCaps(in->inode, + in->file_caps_seq, + in->file_caps, + in->file_caps_wanted(), + whoami); + messenger->send_message(m, + MSG_ADDR_MDS(in->file_mds), MDS_PORT_CACHE); +} @@ -1293,7 +1361,9 @@ int Client::open(const char *path, int mode) req->set_caller_uid(getuid()); req->set_caller_gid(getgid()); - MClientReply *reply = make_request(req, true); + MClientReply *reply = make_request(req, + mode & (O_RDWR|O_WRONLY)); // try auth if writer + assert(reply); dout(3) << "op: open_files[" << reply->get_result() << "] = fh; // fh = " << reply->get_result() << endl; tout << reply->get_result() << endl; @@ -1314,19 +1384,37 @@ int Client::open(const char *path, int mode) f->inode = inode_map[trace[trace.size()-1]->inode.ino]; assert(f->inode); f->inode->get(); - f->inode->file_mds = reply->get_source(); + f->inode->file_mds = MSG_ADDR_NUM(reply->get_source()); if (cmode & FILE_MODE_R) f->inode->num_rd++; if (cmode & FILE_MODE_W) f->inode->num_wr++; - // caps - if (f->inode->file_caps_seq == 0) - f->inode->get(); - f->inode->file_caps = reply->get_file_caps(); - assert(f->inode->file_caps_seq < reply->get_file_caps_seq()); // ordered delivery - f->inode->file_caps_seq = reply->get_file_caps_seq(); + // caps included? + assert(reply->get_file_caps_seq() >= f->inode->file_caps_seq); + if (reply->get_file_caps_seq() > f->inode->file_caps_seq) { + dout(7) << "open got caps " << cap_string(reply->get_file_caps()) << " seq " << reply->get_file_caps_seq() << endl; + + // first ones? + if (f->inode->file_caps_seq == 0) + f->inode->get(); + + int old_caps = f->inode->file_caps; + f->inode->file_caps = reply->get_file_caps(); + f->inode->file_caps_seq = reply->get_file_caps_seq(); + + // ack if we lost any caps + if (old_caps & ~f->inode->file_caps) { + dout(5) << " we lost caps " << cap_string(old_caps & ~f->inode->file_caps) << ", acking" << endl; + messenger->send_message(new MClientFileCaps(f->inode->inode, + f->inode->file_caps_seq, + f->inode->file_caps, + f->inode->file_caps_wanted(), + whoami), + reply->get_source(), reply->get_source_port()); + } + } // put in map result = fh = get_fh(); @@ -1357,60 +1445,45 @@ int Client::close(fh_t fh) Fh *f = fh_map[fh]; Inode *in = f->inode; - // Make sure buffers are all clean! - //flush_inode_buffers(in); - - // update inode + // update inode rd/wr counts + int before = in->file_caps_wanted(); if (f->mode & FILE_MODE_R) in->num_rd--; if (f->mode & FILE_MODE_W) in->num_wr--; + int after = in->file_caps_wanted(); + + // does this change what caps we want? + if (before != after && after) + update_caps_wanted(in); // hose fh fh_map.erase(fh); delete f; - // note mds auth.. we'll send the close there! FIXME this is sort of icky - int mds_auth = in->authority(); - - //release_inode_buffers(in); - put_inode( in ); - int result = 0; - // release caps right away? dout(10) << "num_rd " << in->num_rd << " num_wr " << in->num_wr << endl; if (in->num_rd == 0 && in->num_wr == 0) { - // synchronously; FIXME this is dumb - MClientRequest *req = new MClientRequest(MDS_OP_RELEASE, whoami); - req->set_ino(in->inode.ino); - - req->set_iarg( in->file_caps_seq ); - req->set_targ( in->file_wr_mtime ); - req->set_sizearg( in->file_wr_size ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); + // FIXME THIS IS ALL WRONG! + // this should happen _async_ when the buffers finally flush.. + // _then_ release the caps, if they still need to be released + // also, retain read or write caps, if that's appropriate!! - // release caps locally - in->file_caps_seq = 0; - in->file_caps = 0; - in->file_wr_mtime = 0; - in->file_wr_size = 0; + // Make sure buffers are all clean! + //flush_inode_buffers(in); - put_inode(in); - - // make the call .. FIXME there's no reason this has to block! - MClientReply *reply = make_request(req, true, mds_auth); - assert(reply); - int result = reply->get_result(); - assert(result == 0); - - delete reply; + // bad bad.. + release_caps(in); + + // ech + release_inode_buffers(in); } + put_inode( in ); + int result = 0; + client_lock->Unlock(); return result; } diff --git a/ceph/client/Client.h b/ceph/client/Client.h index b0c08e0d78c31..632a220c6bd51 100644 --- a/ceph/client/Client.h +++ b/ceph/client/Client.h @@ -129,6 +129,13 @@ class Inode { return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } + int file_caps_wanted() { + int w = 0; + if (num_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; + if (num_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER; + return w; + } + int authority() { // my info valid? if (mds_dir_auth >= 0) @@ -344,7 +351,11 @@ class Client : public Dispatcher { // messaging void dispatch(Message *m); + + // file caps void handle_file_caps(class MClientFileCaps *m); + void release_caps(Inode *in, int retain=0); + void update_caps_wanted(Inode *in); // metadata cache Inode* insert_inode_info(Dir *dir, c_inode_info *in_info); diff --git a/ceph/client/SyntheticClient.cc b/ceph/client/SyntheticClient.cc index 3597dbb054c73..e32df1fdbee3c 100644 --- a/ceph/client/SyntheticClient.cc +++ b/ceph/client/SyntheticClient.cc @@ -78,6 +78,9 @@ void parse_syn_options(vector& args) } else if (strcmp(args[i],"sleep") == 0) { syn_modes.push_back( SYNCLIENT_MODE_SLEEP ); syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"opentest") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_OPENTEST ); + syn_iargs.push_back( atoi(args[++i]) ); } else { cerr << "unknown syn mode " << args[i] << endl; assert(0); @@ -335,6 +338,16 @@ int SyntheticClient::run() } break; + case SYNCLIENT_MODE_OPENTEST: + { + int count = iargs.front(); iargs.pop_front(); + for (int i=0; iopen("test", rand()%2 ? (O_WRONLY|O_CREAT):O_RDONLY); + if (fd > 0) client->close(fd); + } + } + break; + default: assert(0); } diff --git a/ceph/client/SyntheticClient.h b/ceph/client/SyntheticClient.h index 3034a5b613b98..7894d54752707 100644 --- a/ceph/client/SyntheticClient.h +++ b/ceph/client/SyntheticClient.h @@ -26,6 +26,8 @@ #define SYNCLIENT_MODE_RANDOMSLEEP 12 #define SYNCLIENT_MODE_SLEEP 13 +#define SYNCLIENT_MODE_OPENTEST 30 + void parse_syn_options(vector& args); class SyntheticClient { diff --git a/ceph/config.cc b/ceph/config.cc index f65a73e201b5b..8fef763f76c6b 100644 --- a/ceph/config.cc +++ b/ceph/config.cc @@ -104,7 +104,7 @@ md_config_t g_conf = { // --- osd --- - osd_pg_bits: 6, + osd_pg_bits: 2, osd_max_rep: 4, osd_fsync: true, osd_writesync: false, @@ -231,9 +231,9 @@ void parse_config_options(vector& args) else if (strcmp(args[i], "--mds_bal_interval") == 0) g_conf.mds_bal_interval = atoi(args[++i]); - else if (strcmp(args[i], "--mds_bal_replicate_threshold") == 0) + else if (strcmp(args[i], "--mds_bal_rep") == 0) g_conf.mds_bal_replicate_threshold = atoi(args[++i]); - else if (strcmp(args[i], "--mds_bal_unreplicate_threshold") == 0) + else if (strcmp(args[i], "--mds_bal_unrep") == 0) g_conf.mds_bal_unreplicate_threshold = atoi(args[++i]); else if (strcmp(args[i], "--mds_bal_max") == 0) g_conf.mds_bal_max = atoi(args[++i]); diff --git a/ceph/crush/test/bucket_movement.cc b/ceph/crush/test/bucket_movement.cc new file mode 100644 index 0000000000000..fdc890b4bf3fb --- /dev/null +++ b/ceph/crush/test/bucket_movement.cc @@ -0,0 +1,164 @@ + + +#include "../crush.h" +using namespace crush; + +#include + +#include +#include +using namespace std; + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int n, float f, int buckettype) +{ + Hash h(73232313); + + // crush + Crush c; + + int ndisks = 0; + + // bucket + Bucket *b; + if (buckettype == 0) + b = new TreeBucket(1); + else if (buckettype == 1 || buckettype == 2) + b = new ListBucket(1); + else if (buckettype == 3) + b = new StrawBucket(1); + + for (int i=0; iadd_item(ndisks++,1); + + c.add_bucket(b); + int root = b->get_id(); + + //c.print(cout,root); + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 1000; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " disks, " << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + + // ORIGINAL + place(c, rule, numpg, numrep, placement1); + + int olddisks = ndisks; + + // add item + if (buckettype == 2) { + // start over! + ndisks = 0; + b = new ListBucket(1); + for (int i=0; i<=n; i++) + b->add_item(ndisks++,1); + c.add_bucket(b); + root = b->get_id(); + + rule.steps.clear(); + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + } + else + b->add_item(ndisks++, 1); + + + // ADDED + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + int moved = 0; + for (int x=1; x<=numpg; x++) + if (placement1[x] != placement2[x]) + for (int j=0; j + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + //Bucket *b = new MixedBucket(h+1); + Bucket *b = new StrawBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + +float go(int dep) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + if (0) { + for (int d=0; dadd_item(ndisks++, 10); + root = c.add_bucket(b); + } + if (0) { + vector disks; + for (int i=0; i<10000; i++) + disks.push_back(ndisks++); + UniformBucket *b = new UniformBucket(1, 0, 10000, disks); + Hash h(123); + b->make_primes(h); + root = c.add_bucket(b); + } + + + + // rule + int numrep = 1; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + //cout << ndisks << " disks, " << endl; + //cout << pg_per << " pgs per disk" << endl; + // cout << numpg << " logical pgs" << endl; + //cout << "numrep is " << numrep << endl; + + + int place = 100000; + int times = place / numpg; + if (!times) times = 1; + + cout << "#looping " << times << " times" << endl; + + float tvar = 0; + int tvarnum = 0; + + int x = 0; + for (int t=0; t v(numrep); + + for (int z=0; z + +#include +#include +using namespace std; + +int buckettype = 0; + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + //Bucket *b = new TreeBucket(h+1); + //Bucket *b = new ListBucket(h+1); + //Bucket *b = new StrawBucket(h+1); + Bucket *b; + if (buckettype == 0) + b = new TreeBucket(h+1); + else if (buckettype == 1 || buckettype == 2) + b = new ListBucket(h+1); + else if (buckettype == 3) + b = new StrawBucket(h+1); + + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int depth, int branching, int udisks, int add, int modifydepth) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + wid.push_back(udisks); + for (int d=1; d > buckets; + + root = make_hierarchy(c, wid, buckets, ndisks); + + //c.print(cout,root); + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " disks, " << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + + // ORIGINAL + place(c, rule, numpg, numrep, placement1); + + int olddisks = ndisks; + + // add disks + //cout << " adding " << add << " disks" << endl; + vector disks; + for (int i=0; imake_primes(h); + + //Bucket *o = buckets[2].back(); + Bucket *o; + if (buckettype == 2) + o = buckets[modifydepth].front(); + else + o = buckets[modifydepth].back(); + + c.add_bucket(b); + //cout << " adding under " << o->get_id() << endl; + c.add_item(o->get_id(), b->get_id(), b->get_weight()); + //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); + //newbucket = b; + + + // ADDED + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + int moved = 0; + for (int x=1; x<=numpg; x++) + if (placement1[x] != placement2[x]) + for (int j=0; j + +#include +#include +using namespace std; + + +int buckettype = 2; // 0 = mixed, 1 = linear, 2 = straw + +int big_one_skip = 255; +int big_one_size; +Bucket *big_one = 0; + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + + int s = wid[h]; + if (big_one_skip > 0) + big_one_skip--; + if (!big_one_skip && !big_one) + s = big_one_size; + + + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks " << disks.size()<< endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + Bucket *b; + if (buckettype == 0) + b = new TreeBucket(h+1); + else if (buckettype == 1) + b = new ListBucket(h+1); + else if (buckettype == 2) + b = new StrawBucket(h+1); + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int depth, int branching, int udisks, int add) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + wid.push_back(udisks); + for (int d=1; d > buckets; + + big_one_size = add; + big_one = 0; + + //cout << "making tree" << endl; + root = make_hierarchy(c, wid, buckets, ndisks); + + //c.print(cout, root); + + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " disks, " << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + int olddisks = ndisks; + + + place(c, rule, numpg, numrep, placement1); + + if (1) { + // remove disks + assert(big_one); + c.adjust_item(big_one->get_id(), 0); + } + + int newdisks = ndisks - add; + + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + int moved = 0; + for (int x=1; x<=numpg; x++) + if (placement1[x] != placement2[x]) + for (int j=0; j >::iterator i = r.begin(); + i != r.end(); + i++) { + cout << i->first; + for (map::iterator j = i->second.begin(); + j != i->second.end(); + j++) + cout << "\t" << j->first << "\t" << j->second; + cout << endl; + } + */ +} + diff --git a/ceph/crush/test/movement_failed.cc b/ceph/crush/test/movement_failed.cc new file mode 100644 index 0000000000000..98c34d96e9ac2 --- /dev/null +++ b/ceph/crush/test/movement_failed.cc @@ -0,0 +1,246 @@ + + +#include "../crush.h" +using namespace crush; + +#include + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + MixedBucket *b = new MixedBucket(h+1); + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int depth, int branching, int udisks) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + wid.push_back(udisks); + for (int d=1; d > buckets; + + if (1) { + root = make_hierarchy(c, wid, buckets, ndisks); + } + if (0) { + MixedBucket *b = new MixedBucket(1); + for (int i=0; i<10000; i++) + b->add_item(ndisks++, 10); + root = c.add_bucket(b); + } + if (0) { + vector disks; + for (int i=0; i<10000; i++) + disks.push_back(ndisks++); + UniformBucket *b = new UniformBucket(1, 0, 1, disks); + Hash h(123); + b->make_primes(h); + root = c.add_bucket(b); + } + + + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " disks, " << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + place(c, rule, numpg, numrep, placement1); + + float over = .5; + + if (1) { + // failed + + //for (int i=500; i<1000; i++) + //c.failed.insert(i); + //c.failed.insert(0); + c.overload[0] = over; + } + + int olddisks = ndisks; + + + + if (0) { + int n = udisks; + //cout << " adding " << n << " disks" << endl; + vector disks; + for (int i=0; imake_primes(h); + Bucket *o = buckets[1].back(); + c.add_bucket(b); + //cout << " adding under " << o->get_id() << endl; + c.add_item(o->get_id(), b->get_id(), b->get_weight()); + //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); + } + + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + vector moved(ndisks); + + //int moved = 0; + for (int d=0; d::iterator it = placement1[d].begin(); + it != placement1[d].end(); + it++) { + placement2[d].erase(*it); + } + } + + float avg = 0; + for (int d=0; d v; + cout << depth; + for (int branching = 3; branching < 16; branching += 1) { + float fac = testmovement(depth, branching, udisks); + v.push_back(fac); + int n = udisks * pow((float)branching, (float)depth-1); + //cout << "\t" << n; + //cout << "\t" << fac; + } + //for (int i=0; i + +#include +#include +using namespace std; + + +Clock g_clock; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; + return b; + } else { + // mixed + Bucket *b = new TreeBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + + +float go(int dep, int utilization ) +{ + Hash h(73232313); + + int overloadcutoff = (int)((float)10000.0 / (float)utilization); + + //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + for (int d=0; d ocount(ndisks); + //cout << ndisks << " disks, " << endl; + //cout << pg_per << " pgs per disk" << endl; + // cout << numpg << " logical pgs" << endl; + //cout << "numrep is " << numrep << endl; + + + int place = 100000; + int times = place / numpg; + if (!times) times = 1; + + + //cout << "looping " << times << " times" << endl; + + float tavg[10]; + float tvar[10]; + for (int j=0;j<10;j++) { + tvar[j] = 0; + tavg[j] = 0; + } + int tvarnum = 0; + + float overloadsum = 0.0; + float adjustsum = 0.0; + float afteroverloadsum = 0.0; + float aslowdown = 0.0; + int chooses = 0; + int xs = 1; + for (int t=0; t v(numrep); + + c.overload.clear(); + + for (int z=0; z cutoff) + overloaded++; + + if (ocount[i] > adjoff) { + adjusted++; + c.overload[i] = (float)target / (float)ocount[i]; + //cout << "setting overload " << i << " to " << c.overload[i] << endl; + //cout << "disk " << i << " has " << ocount[i] << endl; + } + ocount[i] = 0; + } + //cout << overloaded << " overloaded" << endl; + overloadsum += (float)overloaded / (float)ndisks; + adjustsum += (float)adjusted / (float)ndisks; + + + + // keep adjusting! + for (int bla=0; bla<5; bla++) { + utime_t t2a = g_clock.now(); + + // second pass + for (int x=xs; x= adjoff) { + numover++; + if (c.overload.count(i) == 0) { + c.overload[i] = 1.0; + adjusted++; + } + //else cout << "(re)adjusting " << i << endl; + c.overload[i] *= (float)target / (float)ocount[i]; + //cout << "setting overload " << i << " to " << c.overload[i] << endl; + //cout << "disk " << i << " has " << ocount[i] << endl; + } + ocount[i] = 0; + } + if (!numover) break; + cout << "readjusting" << endl; + } + + utime_t t3a = g_clock.now(); + + for (int x=xs; x cutoff) { + still++; + //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; + if (c.overload.count(i)) cout << "[adjusted] "; + cout << "disk " << i << " has " << ocount[i] << endl; + } + } + //if (still) cout << "overload was " << overloaded << " now " << still << endl; + afteroverloadsum += (float)still / (float)ndisks; + + //cout << "collisions: " << c.collisions << endl; + //cout << "r bumps: " << c.bumps << endl; + + int n = ndisks/10; + float avg[10]; + float var[10]; + for (int i=0;i<10;i++) { + int s = n*i; + avg[i] = 0.0; + for (int j=0; j + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + Bucket *b = new TreeBucket(h+1); + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, vector& ocount) +{ + vector v(numrep); + //map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i wid; + wid.push_back(10); + wid.push_back(2); + + map< int, list > buckets; + root = make_hierarchy(c, wid, buckets, ndisks); + + // add small bucket + vector disks; + for (int i=0; i<3; i++) + disks.push_back(ndisks++); + UniformBucket *b = new UniformBucket(1, 0, 1, disks); + b->make_primes(h); + Bucket *o = buckets[1].back(); + c.add_bucket(b); + //cout << " adding under " << o->get_id() << endl; + c.add_item(o->get_id(), b->get_id(), b->get_weight()); + + + // rule + int numrep = 6; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + int pg_per = 10000; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + c.print(cout, root); + + place(c, rule, numpg, numrep, ocount); + + for (int i=0; i + +#include +#include +using namespace std; + + +int numrep = 1; + + +double go(int n, int bucket) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + Bucket *b; + vector items; + if (bucket == 0) b = new UniformBucket(1,0,10,items); + if (bucket == 1) b = new TreeBucket(1); + if (bucket == 2) b = new ListBucket(1); + if (bucket == 3) b = new StrawBucket(1); + + for (int d=0; dadd_item(ndisks++, 1); + + //if (!bucket) ((UniformBucket*)b)->make_primes(h); + + root = c.add_bucket(b); + + // rule + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + + int place = 1000000; + + + vector v(numrep); + + utime_t start = g_clock.now(); + + for (int x=1; x <= place; x++) + c.do_rule(rule, x, v); + + utime_t end = g_clock.now(); + + end -= start; + double el = (double)end; + + //cout << "\t" << ndisks; + + return el; +} + + +int main() +{ + + for (int n=4; n<=50; n += 4) { + cout << n; + for (int b=0; b<4; b++) { + double el = go(n,b); + cout << "\t" << el; + } + cout << endl; + } +} diff --git a/ceph/crush/test/speed_depth.cc b/ceph/crush/test/speed_depth.cc new file mode 100644 index 0000000000000..c4e6693377293 --- /dev/null +++ b/ceph/crush/test/speed_depth.cc @@ -0,0 +1,139 @@ + +#include "../../common/Clock.h" +#include "../crush.h" +using namespace crush; + + +Clock g_clock; + +#include + +#include +#include +using namespace std; + + +int branching = 10; +bool linear = false; +int numrep = 1; + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + Bucket *b; + if (linear) + b = new ListBucket(h+1); + else + b = new TreeBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + +double go(int dep) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + if (1) { + for (int d=0; d v(numrep); + + utime_t start = g_clock.now(); + + for (int x=1; x <= place; x++) + c.do_rule(rule, x, v); + + utime_t end = g_clock.now(); + + end -= start; + double el = (double)end; + + cout << "\t" << ndisks; + + return el; +} + + +int main() +{ + branching = 8; + + for (int d=2; d<=5; d++) { + cout << d << "\t" << branching; + for (numrep = 1; numrep <= 3; numrep++) { + cout << "\t" << numrep; + + //for (linear = false; 1; linear = true) { + double el = go(d); + cout << "\t" << el; + //if (linear) break; + } + cout << endl; + } +} diff --git a/ceph/crush/test/t.cc b/ceph/crush/test/t.cc new file mode 100644 index 0000000000000..0785ef47d6c04 --- /dev/null +++ b/ceph/crush/test/t.cc @@ -0,0 +1,25 @@ + +#include "../../common/Clock.h" +#include "../crush.h" +using namespace crush; + + +Clock g_clock; + +#include + +#include +#include +using namespace std; + + +int branching = 10; +bool linear = false; +int numrep = 1; + +int main() { + + Bucket *b = new UniformBucket(1, 0); + //b = new TreeBucket(1); +} + diff --git a/ceph/crush/test/testbucket.cc b/ceph/crush/test/testbucket.cc new file mode 100644 index 0000000000000..065721c2c1967 --- /dev/null +++ b/ceph/crush/test/testbucket.cc @@ -0,0 +1,61 @@ + + +#include "../Bucket.h" +using namespace crush; + +#include +#include +using namespace std; + + +ostream& operator<<(ostream& out, vector& v) +{ + out << "["; + for (int i=0; i ocount(ndisks); + + vector v(numrep); + int nplace = 0; + for (int x=1; x<1000000; x++) { + //cout << H(x) << "\t" << h(x) << endl; + for (int i=0; i::iterator it = in.get_client_caps().begin(); + it != in.get_client_caps().end(); + it++) { + if (it != in.get_client_caps().begin()) out << ","; + out << it->first; + } + out << "}"; + } out << " " << ∈ out << "]"; return out; @@ -62,7 +74,7 @@ ostream& operator<<(ostream& out, CInode& in) // ====== CInode ======= CInode::CInode(bool auth) : LRUObject(), hardlock(LOCK_TYPE_BASIC), - softlock(LOCK_TYPE_ASYNC) { + filelock(LOCK_TYPE_FILE) { ref = 0; parent = NULL; @@ -197,7 +209,7 @@ void CInode::mark_dirty() { updated below. */ - // only auth can get dirty. "dirty" async data in replicas is relative to (say) softlock state, not dirty flag. + // only auth can get dirty. "dirty" async data in replicas is relative to (say) filelock state, not dirty flag. assert(is_auth()); // touch my private version @@ -226,14 +238,14 @@ void CInode::mark_dirty() { // new state encoders -void CInode::encode_soft_state(crope& r) +void CInode::encode_file_state(crope& r) { r.append((char*)&inode.size, sizeof(inode.size)); r.append((char*)&inode.mtime, sizeof(inode.mtime)); r.append((char*)&inode.atime, sizeof(inode.atime)); // ?? } -void CInode::decode_soft_state(crope& r, int& off) +void CInode::decode_file_state(crope& r, int& off) { r.copy(off, sizeof(inode.size), (char*)&inode.size); off += sizeof(inode.size); @@ -243,7 +255,8 @@ void CInode::decode_soft_state(crope& r, int& off) off += sizeof(inode.atime); } -void CInode::decode_merge_soft_state(crope& r, int& off) +/* not used currently +void CInode::decode_merge_file_state(crope& r, int& off) { __uint64_t size; r.copy(off, sizeof(size), (char*)&size); @@ -259,6 +272,7 @@ void CInode::decode_merge_soft_state(crope& r, int& off) off += sizeof(t); if (t > inode.atime) inode.atime = t; } +*/ void CInode::encode_hard_state(crope& r) { diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h index 2048d26a4d1b4..695367a94bcff 100644 --- a/ceph/mds/CInode.h +++ b/ceph/mds/CInode.h @@ -32,8 +32,7 @@ using namespace __gnu_cxx; #define CINODE_PIN_PROXY 3 // can't expire yet #define CINODE_PIN_WAITER 4 // waiter -#define CINODE_PIN_OPEN 5 // local fh's -#define CINODE_PIN_OPENTOK 6 // fh tokens to replicas +#define CINODE_PIN_CAPS 5 // local fh's #define CINODE_PIN_DNDIRTY 7 // dentry is dirty @@ -43,6 +42,7 @@ using namespace __gnu_cxx; #define CINODE_PIN_RENAMESRC 11 // pinned on dest for foreign rename #define CINODE_PIN_ANCHORING 12 + #define CINODE_PIN_DENTRYLOCK 14 #define CINODE_NUM_PINS 15 @@ -53,8 +53,8 @@ static char *cinode_pin_names[CINODE_NUM_PINS] = { "dirty", "proxy", "waiter", - "open", - "opentok", + "caps", + "--", "dndirty", "authpin", "imping", @@ -72,7 +72,7 @@ static char *cinode_pin_names[CINODE_NUM_PINS] = { // wait reasons #define CINODE_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE - // waiters: write_hard_start, read_soft_start, write_soft_start (mdcache) + // waiters: write_hard_start, read_file_start, write_file_start (mdcache) // handle_client_chmod, handle_client_touch (mds) // trigger: (see CDIR_WAIT_UNFREEZE) #define CINODE_WAIT_GETREPLICA (1<<11) // update/replicate individual inode @@ -93,13 +93,13 @@ static char *cinode_pin_names[CINODE_NUM_PINS] = { #define CINODE_WAIT_HARDRWB (CINODE_WAIT_HARDR|CINODE_WAIT_HARDW|CINODE_WAIT_HARDB) #define CINODE_WAIT_HARDSTABLE (1<<20) #define CINODE_WAIT_HARDNORD (1<<21) -#define CINODE_WAIT_SOFTR (1<<22) -#define CINODE_WAIT_SOFTW (1<<23) -#define CINODE_WAIT_SOFTB (1<<24) -#define CINODE_WAIT_SOFTRWB (CINODE_WAIT_SOFTR|CINODE_WAIT_SOFTW|CINODE_WAIT_SOFTB) -#define CINODE_WAIT_SOFTSTABLE (1<<25) -#define CINODE_WAIT_SOFTNORD (1<<26) -#define CINODE_WAIT_SOFTNOWR (1<<27) +#define CINODE_WAIT_FILER (1<<22) +#define CINODE_WAIT_FILEW (1<<23) +#define CINODE_WAIT_FILEB (1<<24) +#define CINODE_WAIT_FILERWB (CINODE_WAIT_FILER|CINODE_WAIT_FILEW|CINODE_WAIT_FILEB) +#define CINODE_WAIT_FILESTABLE (1<<25) +#define CINODE_WAIT_FILENORD (1<<26) +#define CINODE_WAIT_FILENOWR (1<<27) #define CINODE_WAIT_RENAMEACK (1<<28) #define CINODE_WAIT_RENAMENOTIFYACK (1<<29) @@ -160,7 +160,7 @@ class CInode : LRUObject { // inode metadata locks CLock hardlock; - CLock softlock; + CLock filelock; protected: int ref; // reference count @@ -190,10 +190,13 @@ class CInode : LRUObject { int num_request_pins; // waiters - multimap waiting; + multimap waiting; - // issued client capabilities - map caps; // client -> caps + // file capabilities + map client_caps; // client -> caps + + map mds_caps_wanted; // [auth] mds -> caps wanted + int replica_caps_wanted; // [replica] what i've requested from auth private: @@ -274,34 +277,13 @@ class CInode : LRUObject { crope encode_export_state(); - void encode_soft_state(crope& r); - void decode_soft_state(crope& r, int& off); - void decode_merge_soft_state(crope& r, int& off); + void encode_file_state(crope& r); + void decode_file_state(crope& r, int& off); + void decode_merge_file_state(crope& r, int& off); void encode_hard_state(crope& r); void decode_hard_state(crope& r, int& off); - void replicate_relax_locks() { - assert(is_auth()); - assert(!is_cached_by_anyone()); - dout(10) << " relaxing locks on " << *this << endl; - - if (hardlock.get_state() == LOCK_LOCK && - !hardlock.is_used()) { - dout(10) << " hard now sync " << *this << endl; - hardlock.set_state(LOCK_SYNC); - } - if (softlock.get_state() == LOCK_LOCK && - !softlock.is_used()) { - if (softlock.get_mode() == LOCK_MODE_SYNC) { - softlock.set_state(LOCK_SYNC); - dout(10) << " soft now sync " << *this << endl; - } else { - softlock.set_state(LOCK_ASYNC); - dout(10) << " soft now async " << *this << endl; - } - } - } // -- dirtyness -- __uint64_t get_version() { return version; } @@ -384,40 +366,78 @@ class CInode : LRUObject { // -- caps -- (new) - bool is_caps_issued() { return !caps.empty(); } - void add_cap(int client, Capability& cap) { - assert(caps.count(client) == 0); - caps[client] = cap; - } - void remove_cap(int client) { - assert(caps.count(client) == 1); - caps.erase(client); - } - Capability* get_cap(int client) { - if (caps.count(client)) - return &caps[client]; + // client caps + map& get_client_caps() { return client_caps; } + void add_client_cap(int client, Capability& cap) { + if (client_caps.empty()) + get(CINODE_PIN_CAPS); + assert(client_caps.count(client) == 0); + client_caps[client] = cap; + } + void remove_client_cap(int client) { + assert(client_caps.count(client) == 1); + client_caps.erase(client); + if (client_caps.empty()) + put(CINODE_PIN_CAPS); + } + Capability* get_client_cap(int client) { + if (client_caps.count(client)) + return &client_caps[client]; return 0; } - - void add_caps(map& cl) { - caps.clear(); - caps = cl; + void set_client_caps(map& cl) { + if (client_caps.empty() && !cl.empty()) + get(CINODE_PIN_CAPS); + client_caps.clear(); + client_caps = cl; } - void remove_caps(map& cl) { - cl = caps; - caps.clear(); + void take_client_caps(map& cl) { + if (!client_caps.empty()) + put(CINODE_PIN_CAPS); + cl = client_caps; + client_caps.clear(); } + // caps issued, wanted int get_caps_issued() { int c = 0; - for (map::iterator it = caps.begin(); - it != caps.end(); + for (map::iterator it = client_caps.begin(); + it != client_caps.end(); it++) c |= it->second.issued(); return c; } - bool is_write_caps() { - return get_caps_issued() & CAP_FILE_WR; + int get_caps_wanted() { + int w = 0; + for (map::iterator it = client_caps.begin(); + it != client_caps.end(); + it++) + w |= it->second.wanted(); + if (is_auth()) + for (map::iterator it = mds_caps_wanted.begin(); + it != mds_caps_wanted.end(); + it++) + w |= it->second; + return w; + } + + + void replicate_relax_locks() { + assert(is_auth()); + assert(!is_cached_by_anyone()); + dout(10) << " relaxing locks on " << *this << endl; + + if (hardlock.get_state() == LOCK_LOCK && + !hardlock.is_used()) { + dout(10) << " hard now sync " << *this << endl; + hardlock.set_state(LOCK_SYNC); + } + if (filelock.get_state() == LOCK_LOCK && + !filelock.is_used() && + get_caps_issued() == 0) { + filelock.set_state(LOCK_SYNC); + dout(10) << " soft now sync " << *this << endl; + } } @@ -537,7 +557,7 @@ class CInodeDiscover { int replica_nonce; int hardlock_state; - int softlock_state; + int filelock_state; public: CInodeDiscover() {} @@ -546,7 +566,7 @@ class CInodeDiscover { replica_nonce = nonce; hardlock_state = in->hardlock.get_replica_state(); - softlock_state = in->softlock.get_replica_state(); + filelock_state = in->filelock.get_replica_state(); } inodeno_t get_ino() { return inode.ino; } @@ -557,14 +577,14 @@ class CInodeDiscover { in->replica_nonce = replica_nonce; in->hardlock.set_state(hardlock_state); - in->softlock.set_state(softlock_state); + in->filelock.set_state(filelock_state); } void _encode(bufferlist& bl) { bl.append((char*)&inode, sizeof(inode)); bl.append((char*)&replica_nonce, sizeof(replica_nonce)); bl.append((char*)&hardlock_state, sizeof(hardlock_state)); - bl.append((char*)&softlock_state, sizeof(softlock_state)); + bl.append((char*)&filelock_state, sizeof(filelock_state)); } void _decode(bufferlist& bl, int& off) { @@ -574,8 +594,8 @@ class CInodeDiscover { off += sizeof(int); bl.copy(off, sizeof(hardlock_state), (char*)&hardlock_state); off += sizeof(hardlock_state); - bl.copy(off, sizeof(softlock_state), (char*)&softlock_state); - off += sizeof(softlock_state); + bl.copy(off, sizeof(filelock_state), (char*)&filelock_state); + off += sizeof(filelock_state); } }; @@ -602,7 +622,8 @@ class CInodeExport { map cached_by_nonce; map cap_map; - CLock hardlock,softlock; + CLock hardlock,filelock; + //int remaining_issued; public: CInodeExport() {} @@ -614,14 +635,15 @@ public: cached_by_nonce = in->cached_by_nonce; hardlock = in->hardlock; - softlock = in->softlock; + filelock = in->filelock; st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] ); st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); in->popularity[MDS_POP_ANYDOM].adjust_down(st.popularity_curdom); - // steal caps from inode - in->remove_caps(cap_map); + // steal WRITER caps from inode + in->take_client_caps(cap_map); + //remaining_issued = in->get_caps_issued(); } ~CInodeExport() { } @@ -649,10 +671,10 @@ public: in->get(CINODE_PIN_CACHED); in->hardlock = hardlock; - in->softlock = softlock; + in->filelock = filelock; // caps - in->add_caps(cap_map); + in->set_client_caps(cap_map); } void _encode(bufferlist& bl) { @@ -671,7 +693,7 @@ public: } hardlock.encode_state(bl); - softlock.encode_state(bl); + filelock.encode_state(bl); // caps for (map::iterator it = cap_map.begin(); @@ -697,7 +719,7 @@ public: } hardlock.decode_state(bl, off); - softlock.decode_state(bl, off); + filelock.decode_state(bl, off); // caps for (int i=0; i cap_history; // seq -> cap long last_sent, last_recv; + bool suppress; + public: Capability(int want=0) : wanted_caps(want), last_sent(0), - last_recv(0) { - cap_history[last_sent] = 0; + last_recv(0), + suppress(false) { + //cap_history[last_sent] = 0; } + + bool is_suppress() { return suppress; } + void set_suppress(bool b) { suppress = b; } + + bool is_null() { return cap_history.empty(); } // most recently issued caps. int pending() { - return cap_history[ last_sent ]; + if (cap_history.count(last_sent)) + return cap_history[ last_sent ]; + return 0; } // caps client has confirmed receipt of int confirmed() { - return cap_history[ last_recv ]; + if (cap_history.count(last_recv)) + return cap_history[ last_recv ]; + return 0; } // caps potentially issued int issued() { int c = 0; for (long seq = last_recv; seq <= last_sent; seq++) { - c |= cap_history[seq]; - dout(10) << "cap issued: " << seq << " " << cap_history[seq] << " -> " << c << endl; + if (cap_history.count(seq)) { + c |= cap_history[seq]; + dout(10) << " cap issued: seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(c) << endl; + } } return c; } // caps this client wants to hold - int wanted() { return wanted_caps; } + int wanted() { return wanted_caps; } void set_wanted(int w) { wanted_caps = w; } + // needed + static int needed(int from) { + // strip out wrbuffer, rdcache + return from & (CAP_FILE_WR|CAP_FILE_RD); + } + int needed() { return needed(wanted_caps); } + // conflicts - int conflicts(int from) { + static int conflicts(int from) { int c = 0; + if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD; if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE; if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER; + if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR; return c; } - int wanted_conflicts() { - return conflicts(wanted_caps); - } - int issued_conflicts() { - return conflicts(issued()); - } + int wanted_conflicts() { return conflicts(wanted()); } + int needed_conflicts() { return conflicts(needed()); } + int issued_conflicts() { return conflicts(issued()); } // issue caps; return seq number. long issue(int c) { + //int was = pending(); + //no! if (c == was && last_sent) return -1; // repeat of previous? + ++last_sent; cap_history[last_sent] = c; + + /* no! + // not recalling, just adding? + if (c & ~was && + cap_history.count(last_sent-1)) { + cap_history.erase(last_sent-1); + } + */ return last_sent; } long get_last_seq() { return last_sent; } - void confirm_receipt(long seq) { + // confirm receipt of a previous sent/issued seq. + int confirm_receipt(long seq, int caps) { + int r = 0; + + // old seqs while (last_recv < seq) { + dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << endl; + r |= cap_history[last_recv]; cap_history.erase(last_recv); ++last_recv; } + + // release current? + if (cap_history.count(seq) && + cap_history[seq] != caps) { + dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(caps) << endl; + // note what we're releasing.. + assert(cap_history[seq] & ~caps); + r |= cap_history[seq] & ~caps; + + cap_history[seq] = caps; // confirmed() now less than before.. + } + + // null? + if (caps == 0 && + cap_history.size() == 1 && + cap_history.count(seq)) { + cap_history.clear(); // viola, null! + } + + return r; } // serializers @@ -107,10 +164,7 @@ public: bl.append((char*)&wanted_caps, sizeof(wanted_caps)); bl.append((char*)&last_sent, sizeof(last_sent)); bl.append((char*)&last_recv, sizeof(last_recv)); - for (long seq = last_recv; seq <= last_sent; seq++) { - int c = cap_history[seq]; - bl.append((char*)&c, sizeof(c)); - } + ::_encode(cap_history, bl); } void _decode(bufferlist& bl, int& off) { bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps); @@ -119,12 +173,7 @@ public: off += sizeof(last_sent); bl.copy(off, sizeof(last_recv), (char*)&last_recv); off += sizeof(last_recv); - for (long seq=last_recv; seq<=last_sent; seq++) { - int c; - bl.copy(off, sizeof(c), (char*)&c); - off += sizeof(c); - cap_history[seq] = c; - } + ::_decode(cap_history, bl, off); } }; @@ -132,24 +181,5 @@ public: -/* -// capability helper functions! -inline int file_mode_want_caps(int mode) { - int wants = 0; - if (mode == CFILE_MODE_W ) { - wants |= CFILE_CAP_WR | CFILE_CAP_WRBUFFER; - } - else if (mode == CFILE_MODE_RW) { - wants |= CFILE_CAP_RDCACHE | CFILE_CAP_RD | CFILE_CAP_WR | CFILE_CAP_WRBUFFER; - } - else if (mode == CFILE_MODE_R ) { - wants |= CFILE_CAP_RDCACHE | CFILE_CAP_RD; - } - else assert(0); - return wants; -} -*/ - - #endif diff --git a/ceph/mds/Lock.h b/ceph/mds/Lock.h index 70f48125e3ecf..3fa57416905a3 100644 --- a/ceph/mds/Lock.h +++ b/ceph/mds/Lock.h @@ -7,56 +7,49 @@ using namespace std; #include "include/bufferlist.h" +#include "Capability.h" // STATES // basic lock -#define LOCK_SYNC 0 -#define LOCK_PRELOCK 1 -#define LOCK_LOCK 2 -#define LOCK_DELETING 3 // auth only -#define LOCK_DELETED 4 +#define LOCK_SYNC 0 // AR +#define LOCK_LOCK 1 // AR +#define LOCK_GLOCKR 2 // AR gather to lock from sync -// async lock -#define LOCK_ASYNC 5 -#define LOCK_GSYNC 6 // gather to sync -#define LOCK_GLOCK 7 // gather to lock -#define LOCK_GASYNC 8 // gather to async +// file lock states +#define LOCK_GLOCKW 3 // A gather to lock from wronly +#define LOCK_GLOCKM 4 // A gather to lock from mixed +#define LOCK_MIXED 5 // AR +#define LOCK_GMIXEDR 6 // AR gather to mixed from sync +#define LOCK_GMIXEDW 7 // A gather to mixed from wronly -// waits (on replica) -#define LOCK_WLOCKR 9 -#define LOCK_WLOCKW 10 -#define LOCK_WGASYNC 11 -#define LOCK_WGSYNC 12 +#define LOCK_WRONLY 8 // A +#define LOCK_GWRONLYR 9 // A gather to wronly from sync +#define LOCK_GWRONLYM 10 // A gather to wronly from mixed -#define LOCK_TYPE_BASIC 0 -#define LOCK_TYPE_ASYNC 1 +#define LOCK_GSYNCW 11 // A gather (clients) to sync from wronly +#define LOCK_GSYNCM 12 // A gather (clients) to sync from mixed -#define LOCK_MODE_SYNC 0 // return to sync when writes finish (or read requested) -#define LOCK_MODE_ASYNC 1 // return to async when reads finish (or write requested) +#define LOCK_TYPE_BASIC 0 +#define LOCK_TYPE_FILE 1 -// -- basic lock + +// -- lock (basic or soft lock) class CLock { protected: // lock state char type; char state; - char mode; set gather_set; // auth int nread, nwrite; - // dual meaning: - // on replicas, whether we've requested; - // on auth, whether others have requested. - //bool req_read, req_write; // FIXME: roll these into state, use a mask, whatever. public: CLock() {} CLock(char t) : type(t), state(LOCK_LOCK), - mode(LOCK_MODE_SYNC), nread(0), nwrite(0) { } @@ -65,11 +58,8 @@ class CLock { void encode_state(bufferlist& bl) { bl.append((char*)&type, sizeof(char)); bl.append((char*)&state, sizeof(state)); - bl.append((char*)&mode, sizeof(mode)); bl.append((char*)&nread, sizeof(nread)); bl.append((char*)&nwrite, sizeof(nwrite)); - //r.append((char*)&req_read, sizeof(req_read)); - //r.append((char*)&req_write, sizeof(req_write)); _encode(gather_set, bl); } @@ -78,16 +68,10 @@ class CLock { off += sizeof(type); bl.copy(off, sizeof(state), (char*)&state); off += sizeof(state); - bl.copy(off, sizeof(mode), (char*)&mode); - off += sizeof(mode); bl.copy(off, sizeof(nread), (char*)&nread); off += sizeof(nread); bl.copy(off, sizeof(nwrite), (char*)&nwrite); off += sizeof(nwrite); - //r.copy(off, sizeof(req_read), (char*)&req_read); - //off += sizeof(req_read); - //r.copy(off, sizeof(req_write), (char*)&req_write); - //off += sizeof(req_write); _decode(gather_set, bl, off); } @@ -99,15 +83,32 @@ class CLock { return s; }; - char get_mode() { return mode; } - char set_mode(char m) { - return mode = m; - } - char get_replica_state() { - if (state == LOCK_PRELOCK) return LOCK_LOCK; - if (state == LOCK_GLOCK) return LOCK_LOCK; - return state; // SYNC, LOCK, GASYNC, GSYNC + switch (state) { + case LOCK_LOCK: + case LOCK_GLOCKM: + case LOCK_GLOCKW: + case LOCK_GLOCKR: + case LOCK_WRONLY: + case LOCK_GWRONLYR: + case LOCK_GWRONLYM: + return LOCK_LOCK; + case LOCK_MIXED: + case LOCK_GMIXEDR: + return LOCK_MIXED; + case LOCK_SYNC: + return LOCK_SYNC; + + // after gather auth will bc LOCK_AC_MIXED or whatever + case LOCK_GSYNCM: + return LOCK_MIXED; + case LOCK_GSYNCW: + case LOCK_GMIXEDW: // ** LOCK isn't exact right state, but works. + return LOCK_LOCK; + + default: + assert(0); + } } // gather set @@ -118,6 +119,9 @@ class CLock { bool is_gathering(int i) { return gather_set.count(i); } + void clear_gather() { + gather_set.clear(); + } // ref counting int get_read() { return ++nread; } @@ -137,66 +141,134 @@ class CLock { return (nwrite+nread)>0 ? true:false; } - //bool get_req_read() { return req_read; } - //bool get_req_write() { return req_write; } - //void set_req_read(bool b) { req_read = b; } - //void set_req_write(bool b) { req_write = b; } - + /* void twiddle_export() { // was auth, now replica gather_set.clear(); - if (state == LOCK_PRELOCK || - state == LOCK_GLOCK) state = LOCK_LOCK; + if (state == LOCK_GLOCK) state = LOCK_LOCK; } void twiddle_import() { // was replica, now auth } + */ // stable bool is_stable() { - return (state == LOCK_SYNC) || (state == LOCK_LOCK) || (state == LOCK_ASYNC); + return (state == LOCK_SYNC) || + (state == LOCK_LOCK) || + (state == LOCK_MIXED) || + (state == LOCK_WRONLY); } // read/write access - bool could_read(bool auth) { - if (auth) - return false; - else - return (state == LOCK_WLOCKR) || (state == LOCK_WGASYNC); - } bool can_read(bool auth) { if (auth) - return (state == LOCK_SYNC) || (state == LOCK_PRELOCK) - || (state == LOCK_LOCK) || (state == LOCK_GASYNC); + return (state == LOCK_SYNC) || (state == LOCK_GMIXEDR) + || (state == LOCK_GLOCKR) || (state == LOCK_LOCK); else return (state == LOCK_SYNC); } bool can_read_soon(bool auth) { if (auth) - return (state == LOCK_GSYNC) || (state == LOCK_GLOCK); + return (state == LOCK_GLOCKW); else - return (state == LOCK_GSYNC); - } - - bool could_write(bool auth) { - if (auth) return false; - else - return (state == LOCK_WGSYNC) || (state == LOCK_WLOCKW); } + bool can_write(bool auth) { if (auth) - return (state == LOCK_LOCK) || (state == LOCK_ASYNC) || - (state == LOCK_GLOCK) || (state == LOCK_GSYNC); + return (state == LOCK_LOCK); else - return (state == LOCK_ASYNC); + return false; } bool can_write_soon(bool auth) { if (auth) - return (state == LOCK_PRELOCK) || (state == LOCK_GASYNC); + return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKW) + || (state == LOCK_GLOCKM); else - return (state == LOCK_GASYNC); + return false; } + // client caps allowed + int caps_allowed(bool auth) { + if (auth) + switch (state) { + case LOCK_SYNC: + return CAP_FILE_RDCACHE | CAP_FILE_RD; + case LOCK_GMIXEDR: + case LOCK_GSYNCM: + return CAP_FILE_RD; + case LOCK_MIXED: + return CAP_FILE_RD | CAP_FILE_WR; + case LOCK_GMIXEDW: + case LOCK_GWRONLYM: + return CAP_FILE_WR; + case LOCK_WRONLY: + return CAP_FILE_WR | CAP_FILE_WRBUFFER; + case LOCK_LOCK: + case LOCK_GLOCKR: + case LOCK_GLOCKW: + case LOCK_GLOCKM: + case LOCK_GWRONLYR: + case LOCK_GSYNCW: + return 0; + default: + assert(0); + } + else + switch (state) { + case LOCK_SYNC: + return CAP_FILE_RDCACHE | CAP_FILE_RD; + case LOCK_GMIXEDR: + case LOCK_MIXED: + return CAP_FILE_RD; + case LOCK_LOCK: + case LOCK_GLOCKR: + return 0; + } + assert(0); + return 0; + } + int caps_allowed_soon(bool auth) { + if (auth) + switch (state) { + case LOCK_GSYNCM: + case LOCK_GSYNCW: + case LOCK_SYNC: + return CAP_FILE_RDCACHE | CAP_FILE_RD; + case LOCK_GMIXEDR: + case LOCK_GMIXEDW: + case LOCK_MIXED: + return CAP_FILE_RD | CAP_FILE_WR; + case LOCK_GWRONLYM: + case LOCK_GWRONLYR: + case LOCK_WRONLY: + return CAP_FILE_WR | CAP_FILE_WRBUFFER; + case LOCK_LOCK: + case LOCK_GLOCKR: + case LOCK_GLOCKW: + case LOCK_GLOCKM: + return 0; + default: + assert(0); + } + else + switch (state) { + case LOCK_SYNC: + return CAP_FILE_RDCACHE | CAP_FILE_RD; + case LOCK_GMIXEDR: + case LOCK_MIXED: + return CAP_FILE_RD; + case LOCK_LOCK: + case LOCK_GLOCKR: + return 0; + default: + assert(0); + } + assert(0); + return 0; + } + + friend class MDCache; }; @@ -205,18 +277,18 @@ inline ostream& operator<<(ostream& out, CLock& l) { static char* __lock_states[] = { "sync", - "prelock", "lock", - "deleting", - "deleted", - "async", - "gsync", - "glock", - "gasync", - "wlockr", - "wlockw", - "wgasync", - "wgsync" + "glockr", + "glockw", + "glockm", + "mixed", + "gmixedr", + "gmixedw", + "wronly", + "gwronlyr", + "gwronlym", + "gsyncw", + "gsyncm" }; out << "(" << __lock_states[(int)l.get_state()]; @@ -228,11 +300,6 @@ inline ostream& operator<<(ostream& out, CLock& l) if (l.get_nwrite()) out << " " << l.get_nwrite() << "w"; - if (l.get_mode() == LOCK_MODE_SYNC) - out << " Sm"; - if (l.get_mode() == LOCK_MODE_ASYNC) - out << " Am"; - // rw? /* out << " "; diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index bc9e7f2808f9b..415b38bfe2bc3 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -52,6 +52,8 @@ #include "messages/MDirUpdate.h" #include "messages/MCacheExpire.h" +#include "messages/MInodeFileCaps.h" + #include "messages/MInodeLink.h" #include "messages/MInodeLinkAck.h" #include "messages/MInodeUnlink.h" @@ -926,6 +928,10 @@ int MDCache::proc_message(Message *m) break; // cache fun + case MSG_MDS_INODEFILECAPS: + handle_inode_file_caps((MInodeFileCaps*)m); + break; + case MSG_CLIENT_FILECAPS: handle_client_file_caps((MClientFileCaps*)m); break; @@ -2034,9 +2040,9 @@ void MDCache::handle_discover(MDiscover *dis) } else { // send back to asker - dout(7) << "sending result back to asker " << dis->get_asker() << endl; + dout(7) << "sending result back to asker mds" << dis->get_asker() << endl; mds->messenger->send_message(reply, - dis->get_asker(), MDS_PORT_CACHE, MDS_PORT_CACHE); + MSG_ADDR_MDS(dis->get_asker()), MDS_PORT_CACHE, MDS_PORT_CACHE); } // done. @@ -2322,18 +2328,27 @@ void MDCache::handle_cache_expire(MCacheExpire *m) // remove from our cached_by dout(7) << "inode_expire on " << *in << " from mds" << from << " cached_by was " << in->cached_by << endl; in->cached_by_remove(from); + in->mds_caps_wanted.erase(from); - // fix locks + // note: this code calls _eval more often than it needs to! + // fix lock if (in->hardlock.is_gathering(from)) { in->hardlock.gather_set.erase(from); if (in->hardlock.gather_set.size() == 0) inode_hard_eval(in); } - if (in->softlock.is_gathering(from)) { - in->softlock.gather_set.erase(from); - if (in->softlock.gather_set.size() == 0) - inode_soft_eval(in); + if (in->filelock.is_gathering(from)) { + in->filelock.gather_set.erase(from); + if (in->filelock.gather_set.size() == 0) + inode_file_eval(in); + } + + // alone now? + if (!in->is_cached_by_anyone()) { + inode_hard_eval(in); + inode_file_eval(in); } + } else { // this is an old nonce, ignore expire. @@ -2461,7 +2476,7 @@ void MDCache::handle_dir_update(MDirUpdate *m) } // update - dout(1) << "dir_update on " << m->get_ino() << endl; + dout(1) << "dir_update on " << *in->dir << endl; in->dir->dir_rep = m->get_dir_rep(); in->dir->dir_rep_by = m->get_dir_rep_by(); @@ -3387,12 +3402,12 @@ __uint64_t MDCache::issue_file_data_version(CInode *in) } -Capability* MDCache::issue_file_caps(CInode *in, - int mode, - MClientRequest *req) +Capability* MDCache::issue_new_caps(CInode *in, + int mode, + MClientRequest *req) { - dout(7) << "issue_file_caps for mode " << mode << " on " << *in << endl; - + dout(7) << "issue_new_caps for mode " << mode << " on " << *in << endl; + // my needs int my_client = req->get_client(); int my_want = 0; @@ -3400,117 +3415,256 @@ Capability* MDCache::issue_file_caps(CInode *in, if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR; // register a capability - Capability *cap = in->get_cap(my_client); + Capability *cap = in->get_client_cap(my_client); if (!cap) { + // new cap Capability c(my_want); - in->add_cap(my_client, c); - cap = in->get_cap(my_client); - } + in->add_client_cap(my_client, c); + cap = in->get_client_cap(my_client); + } else { + // make sure it has sufficient caps + if (cap->wanted() & ~my_want) { + // augment wanted caps for this client + cap->set_wanted( cap->wanted() | my_want ); + } + } + + // suppress file cap messages for this guy for a few moments (we'll bundle with the open() reply) + cap->set_suppress(true); + int before = cap->pending(); + + if (in->is_auth()) { + // [auth] twiddle mode? + inode_file_eval(in); + } else { + // [replica] tell auth about any new caps wanted + request_inode_file_caps(in); + } + + // issue caps (pot. incl new one) + issue_caps(in); // note: _eval above may have done this already... + + // re-issue whatever we can + cap->issue(cap->pending()); + + // ok, stop suppressing. + cap->set_suppress(false); + + int now = cap->pending(); + if (before != now && + (before & CAP_FILE_WR) == 0 && + (now & CAP_FILE_WR)) { + // FIXME FIXME FIXME + } + + // twiddle file_data_version? + if ((before & CAP_FILE_WRBUFFER) == 0 && + (now & CAP_FILE_WRBUFFER)) { + in->inode.file_data_version++; + dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << endl; + } + + return cap; +} + + + +bool MDCache::issue_caps(CInode *in) +{ + int allowed = in->filelock.caps_allowed(in->is_auth()); + dout(7) << "issue_caps filelock allows=" << cap_string(allowed) << " on " << *in << endl; - // look at what caps are already issued - int issued = 0; - int want = my_want; - int conflicts = 0; - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); + // look at what allowed, needed caps conflict with + int need_conflicts = 0; + int issued_conflicts = 0; + for (map::iterator it = in->client_caps.begin(); + it != in->client_caps.end(); it++) { - issued |= it->second.issued(); - want |= it->second.wanted(); - if (it->first != my_client) - conflicts |= it->second.wanted_conflicts(); + need_conflicts |= it->second.needed_conflicts(); + issued_conflicts |= it->second.issued_conflicts(); } - dout(10) << " issued: " << cap_string(issued) << endl; - dout(10) << " want: " << cap_string(want) << endl; - dout(10) << " want conflicts: " << cap_string(conflicts) << endl; + for (map::iterator it = in->mds_caps_wanted.begin(); + it != in->mds_caps_wanted.end(); + it++) { + need_conflicts |= Capability::conflicts( Capability::needed( it->second ) ); + } + dout(10) << " need conflicts: " << cap_string(need_conflicts) << endl; + dout(10) << "issued conflicts: " << cap_string(issued_conflicts) << endl; // what's allowed, given this mix? - int allowed = want - (want & conflicts); - dout(10) << " allowed: " << cap_string(allowed) << endl; + allowed -= allowed & need_conflicts; + allowed -= allowed & issued_conflicts; + dout(10) << " allowed: " << cap_string(allowed) << endl; - // problems? - if (issued & conflicts) { - dout(7) << " conflict with existing caps: " << cap_string(issued & conflicts) << endl; + // count conflicts with + int nissued = 0; - // call back caps! - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - int extra = it->second.pending() - (it->second.pending() & allowed); - dout(7) << " client " << it->first << " has pending " << it->second.pending() << " .. extra is " << extra << endl; - if (extra) { - // issue restricted caps - it->second.issue(it->second.pending() - extra); - - // send - dout(7) << " sending MClientFileCaps on " << it->first << " new pending " << it->second.pending() << endl; - mds->messenger->send_message(new MClientFileCaps(in, it->second, true), + // client caps + for (map::iterator it = in->client_caps.begin(); + it != in->client_caps.end(); + it++) { + if (it->second.issued() != (it->second.wanted() & allowed)) { + // issue + nissued++; + + int before = it->second.pending(); + long seq = it->second.issue(it->second.wanted() & allowed); + int after = it->second.pending(); + + // twiddle file_data_version? + if (!(before & CAP_FILE_WRBUFFER) && + (after & CAP_FILE_WRBUFFER)) { + dout(7) << " incrementing file_data_version for " << *in << endl; + in->inode.file_data_version++; + } + + if (seq > 0 && + !it->second.is_suppress()) { + dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << it->second.get_last_seq() << " new pending " << cap_string(it->second.pending()) << " was " << cap_string(before) << endl; + mds->messenger->send_message(new MClientFileCaps(in->inode, + it->second.get_last_seq(), + it->second.pending(), + it->second.wanted(), + it->first), MSG_ADDR_CLIENT(it->first), 0, MDS_PORT_CACHE); } } + } + + return (nissued == 0); // true if no re-issued, no callbacks +} + + + +void MDCache::request_inode_file_caps(CInode *in) +{ + int wanted = in->get_caps_wanted(); + if (wanted != in->replica_caps_wanted) { + in->replica_caps_wanted = wanted; + + int auth = in->authority(); + dout(7) << "request_inode_file_caps " << cap_string(in->replica_caps_wanted) + << " on " << *in << " to mds" << auth << endl; + assert(!in->is_auth()); - in->add_waiter(CINODE_WAIT_CAPS, new C_MDS_RetryRequest(mds, req, in)); - return 0; + mds->messenger->send_message(new MInodeFileCaps(in->ino(), mds->get_nodeid(), + in->replica_caps_wanted), + MSG_ADDR_MDS(auth), MDS_PORT_CACHE, MDS_PORT_CACHE); } +} - // we're okay! - int caps = my_want & allowed; - dout(7) << " issuing caps " << cap_string(caps) << " (i want " << cap_string(my_want) << ", allowed " << cap_string(allowed) << ")" << endl; - assert(caps > 0); +void MDCache::handle_inode_file_caps(MInodeFileCaps *m) +{ + CInode *in = get_inode(m->get_ino()); + assert(in); + assert(in->is_auth() || in->is_proxy()); - // issue - cap->issue(caps); + dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl; - // issuing new write permissions? - if ((issued & CAP_FILE_WR) == 0 && - ( caps & CAP_FILE_WR) != 0) { - dout(7) << " incrementing file_data_version for " << *in << endl; - in->inode.file_data_version++; + if (in->is_proxy()) { + dout(7) << "proxy, fw" << endl; + mds->messenger->send_message(m, MSG_ADDR_MDS(in->authority()), MDS_PORT_CACHE, MDS_PORT_CACHE); + return; } - return cap; + if (m->get_caps()) + in->mds_caps_wanted[m->get_from()] = m->get_caps(); + else + in->mds_caps_wanted.erase(m->get_from()); + + inode_file_eval(in); + delete m; } + +/* + * note: we only get these from the client if + * - we are calling back previously issued caps (fewer than the client previously had) + * - or if the client releases (any of) its caps on its own + */ void MDCache::handle_client_file_caps(MClientFileCaps *m) { CInode *in = get_inode(m->get_ino()); - if (!in || !in->is_auth()) { + Capability *cap = 0; + if (in) + cap = in->get_client_cap(m->get_client()); + + if (!in || !cap) { int next; if (!in) { dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << " passing buck" << endl; - next = mds->get_nodeid() + 1; - if (next >= mds->get_cluster()->get_num_mds()) next = 0; } else { - dout(7) << "handle_client_file_caps not auth on " << *in << endl; - next = in->authority(); + dout(7) << "handle_client_file_caps no cap for client" << m->get_client() << " on " << *in << endl; + //next = in->authority(); } + next = mds->get_nodeid() + 1; + if (next >= mds->get_cluster()->get_num_mds()) next = 0; + mds->messenger->send_message(m, MSG_ADDR_MDS(next), m->get_dest_port()); return; } - - dout(7) << "handle_client_file_caps on " << *in << " confirmed caps " << m->get_caps() << endl; - - Capability *cap = in->get_cap(m->get_client()); + assert(cap); - cap->confirm_receipt(m->get_seq()); - + dout(7) << "handle_client_file_caps seq " << m->get_seq() + << " confirms caps " << cap_string(m->get_caps()) + << " wants " << cap_string(m->get_wanted()) + << " from client" << m->get_client() + << " on " << *in + << endl; + + // update wanted + if (cap->wanted() != m->get_wanted()) + cap->set_wanted(m->get_wanted()); + + // confirm caps + int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); + int has = cap->confirmed(); + if (cap->is_null()) { + dout(7) << " cap for client" << m->get_client() << " is now null, removing from " << *in << endl; + in->remove_client_cap(m->get_client()); + if (!in->is_auth()) + request_inode_file_caps(in); + } + + // merge in atime? + if (m->get_inode().atime > in->inode.atime) { + dout(7) << " taking atime " << m->get_inode().atime << " > " + << in->inode.atime << " for " << *in << endl; + in->inode.atime = m->get_inode().atime; + } + + if ((has|had) & CAP_FILE_WR) { + bool dirty = false; + + // mtime + if (m->get_inode().mtime > in->inode.mtime) { + dout(7) << " taking mtime " << m->get_inode().mtime << " > " + << in->inode.mtime << " for " << *in << endl; + in->inode.mtime = m->get_inode().mtime; + dirty = true; + } + // size + if (m->get_inode().size > in->inode.size) { + dout(7) << " taking size " << m->get_inode().size << " > " + << in->inode.size << " for " << *in << endl; + in->inode.size = m->get_inode().size; + dirty = true; + } + + if (dirty) + mds->mdlog->submit_entry(new EInodeUpdate(in)); + } + // reevaluate, waiters - eval_file_caps(in); + inode_file_eval(in); + in->finish_waiting(CINODE_WAIT_CAPS, 0); delete m; } -void MDCache::eval_file_caps(CInode *in) -{ - dout(7) << "eval_file_caps " << *in << endl; - - in->finish_waiting(CINODE_WAIT_CAPS, 0); - // *** -} - - @@ -3523,49 +3677,23 @@ void MDCache::eval_file_caps(CInode *in) /* -LOCKS: - - three states: - - Auth Replica State - R R normal/sync fw writes to auth - RW - lock ping auth for R/W? - W W async (*) fw reads to auth - - * only defined for soft inode metadata, right? - - we also remember: - auth: - set replicas - bool req_r, req_w - - replica: - last_sync - stamp of last time we were sync - - - - - INODES: = two types of inode metadata: hard - uid/gid, mode - soft - mtime, size + file - mtime, size ? atime - atime (*) <-- we want a lazy update strategy? - * if we want _strict_ atime behavior, atime can be folded into soft. - for lazy atime, should we just leave the atime lock in async state? XXX - = correspondingly, two types of inode locks: hardlock - hard metadata - softlock - soft metadata + filelock - file metadata -> These locks are completely orthogonal! = metadata ops and how they affect inode metadata: sma=size mtime atime - HARD SOFT OP + HARD FILE OP files: R RRR stat RW chmod/chown @@ -3621,8 +3749,8 @@ void MDCache::handle_lock(MLock *m) handle_lock_inode_hard(m); break; - case LOCK_OTYPE_ISOFT: - handle_lock_inode_soft(m); + case LOCK_OTYPE_IFILE: + handle_lock_inode_file(m); break; case LOCK_OTYPE_DIR: @@ -3641,6 +3769,8 @@ void MDCache::handle_lock(MLock *m) } + +// =============================== // hard inode metadata bool MDCache::inode_hard_read_try(CInode *in, Context *con) @@ -3696,8 +3826,7 @@ bool MDCache::inode_hard_read_start(CInode *in, MClientRequest *m) void MDCache::inode_hard_read_finish(CInode *in) { // drop ref - assert(in->hardlock.can_read(in->is_auth()) || - in->hardlock.could_read(in->is_auth())); + assert(in->hardlock.can_read(in->is_auth())); in->hardlock.put_read(); dout(7) << "inode_hard_read_finish on " << *in << endl; @@ -3759,8 +3888,7 @@ bool MDCache::inode_hard_write_start(CInode *in, MClientRequest *m) void MDCache::inode_hard_write_finish(CInode *in) { // drop ref - assert(in->hardlock.can_write(in->is_auth()) || - in->hardlock.could_write(in->is_auth())); + assert(in->hardlock.can_write(in->is_auth())); in->hardlock.put_write(); in->auth_unpin(); dout(7) << "inode_hard_write_finish on " << *in << endl; @@ -3784,10 +3912,10 @@ void MDCache::inode_hard_eval(CInode *in) // finished gather? if (in->is_auth() && !in->hardlock.is_stable() && - in->hardlock.gather_set.size() == 0) { + in->hardlock.gather_set.empty()) { dout(7) << "inode_hard_eval finished gather on " << *in << endl; switch (in->hardlock.get_state()) { - case LOCK_PRELOCK: + case LOCK_GLOCKR: in->hardlock.set_state(LOCK_LOCK); // waiters @@ -3828,7 +3956,7 @@ void MDCache::inode_hard_sync(CInode *in) // check state if (in->hardlock.get_state() == LOCK_SYNC) return; // already sync - if (in->hardlock.get_state() == LOCK_PRELOCK) + if (in->hardlock.get_state() == LOCK_GLOCKR) assert(0); // um... hmm! assert(in->hardlock.get_state() == LOCK_LOCK); @@ -3862,7 +3990,7 @@ void MDCache::inode_hard_lock(CInode *in) // check state if (in->hardlock.get_state() == LOCK_LOCK || - in->hardlock.get_state() == LOCK_PRELOCK) + in->hardlock.get_state() == LOCK_GLOCKR) return; // already lock or locking assert(in->hardlock.get_state() == LOCK_SYNC); @@ -3878,7 +4006,7 @@ void MDCache::inode_hard_lock(CInode *in) } // change lock - in->hardlock.set_state(LOCK_PRELOCK); + in->hardlock.set_state(LOCK_GLOCKR); in->hardlock.init_gather(in->get_cached_by()); } @@ -3905,27 +4033,24 @@ void MDCache::handle_lock_inode_hard(MLock *m) // fw int newauth = in->authority(); assert(newauth >= 0); - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - mds->messenger->send_message(m, - MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, - MDS_PORT_CACHE); + if (from == newauth) { + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; + delete m; + } else { + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; + mds->messenger->send_message(m, + MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } return; } } else { // replica if (!in) { dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl; - /* - * do NOT nak.. if we go that route we need ot duplicate all the nonce funkiness + /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness to keep gather_set a proper/correct subset of cached_by. better to use the existing - cacheexpire mechanism. - */ - /* - MLock *reply = new MLock(m->get_action() + 3, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IHARD); - mds->messenger->send_message(reply, - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); + cacheexpire mechanism instead! */ delete m; return; @@ -3958,16 +4083,16 @@ void MDCache::handle_lock_inode_hard(MLock *m) break; case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_WLOCKR); + assert(lock->get_state() == LOCK_SYNC); + //|| lock->get_state() == LOCK_GLOCKR); // wait for readers to finish? if (lock->get_nread() > 0) { dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl; - lock->set_state(LOCK_WLOCKR); + lock->set_state(LOCK_GLOCKR); in->add_waiter(CINODE_WAIT_HARDNORD, new C_MDS_RetryMessage(mds,m)); - assert(0); // does this every happen? (if so, fix hard_read_finish!) + assert(0); // does this ever happen? (if so, fix hard_read_finish, and CInodeExport.update_inode!) return; } else { @@ -3986,12 +4111,8 @@ void MDCache::handle_lock_inode_hard(MLock *m) // -- auth -- - case LOCK_AC_LOCKNAK: - // do NOT remove from cached_by; we don't know the nonce! - // and somewhere out there there's an expire that will take care of it. - case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_PRELOCK); + assert(lock->state == LOCK_GLOCKR); assert(lock->gather_set.count(from)); lock->gather_set.erase(from); @@ -4012,491 +4133,723 @@ void MDCache::handle_lock_inode_hard(MLock *m) // soft inode metadata -bool MDCache::inode_soft_read_start(CInode *in, MClientRequest *m) +bool MDCache::inode_file_read_start(CInode *in, MClientRequest *m) { - dout(7) << "inode_soft_read_start " << *in << " softlock=" << in->softlock << endl; - - if (in->is_auth() && - !in->softlock.can_read(in->is_auth()) && - !in->is_cached_by_anyone()) { - in->softlock.set_state(LOCK_LOCK); // twiddle lock at will - } + dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl; // can read? grab ref. - if (in->softlock.can_read(in->is_auth())) { - in->softlock.get_read(); + if (in->filelock.can_read(in->is_auth())) { + in->filelock.get_read(); return true; } // can't read, and replicated. - if (in->softlock.can_read_soon(in->is_auth())) { + if (in->filelock.can_read_soon(in->is_auth())) { // wait - dout(7) << "inode_soft_read_start can_read_soon " << *in << endl; + dout(7) << "inode_file_read_start can_read_soon " << *in << endl; } else { if (in->is_auth()) { // auth // FIXME or qsync? - if (in->softlock.is_stable()) { - assert(in->softlock.get_state() == LOCK_ASYNC); // should be async! - inode_soft_lock(in); // lock, easier to back off + if (in->filelock.is_stable()) { + inode_file_lock(in); // lock, bc easiest to back off + + if (in->filelock.can_read(in->is_auth())) { + in->filelock.get_read(); + + in->filelock.get_write(); + in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); + in->filelock.put_write(); + return true; + } } else { - dout(7) << "inode_soft_read_start waiting until stable on " << *in << ", softlock=" << in->softlock << endl; - in->add_waiter(CINODE_WAIT_SOFTSTABLE, new C_MDS_RetryRequest(mds, m, in)); + dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; + in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); return false; } } else { // replica - if (in->softlock.is_stable()) { + if (in->filelock.is_stable()) { - // HACK FIXME - - if (true || in->softlock.get_mode() == LOCK_MODE_ASYNC) { - // fw to auth - int auth = in->authority(); - dout(5) << "inode_soft_read_start " << *in << " on replica and async, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - request_forward(m, auth); - return false; - } else { - // wait. - // recall maybe? - } + // fw to auth + int auth = in->authority(); + dout(5) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + request_forward(m, auth); + return false; } else { // wait until stable - dout(7) << "inode_soft_read_start waiting until stable on " << *in << ", softlock=" << in->softlock << endl; - in->add_waiter(CINODE_WAIT_SOFTSTABLE, new C_MDS_RetryRequest(mds, m, in)); + dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; + in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); return false; } } } // wait - dout(7) << "inode_soft_read_start waiting on " << *in << ", softlock=" << in->softlock << endl; - in->add_waiter(CINODE_WAIT_SOFTR, new C_MDS_RetryRequest(mds, m, in)); + dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl; + in->add_waiter(CINODE_WAIT_FILER, new C_MDS_RetryRequest(mds, m, in)); return false; } -void MDCache::inode_soft_read_finish(CInode *in) +void MDCache::inode_file_read_finish(CInode *in) { // drop ref - assert(in->softlock.can_read(in->is_auth()) || - in->softlock.could_read(in->is_auth())); - in->softlock.put_read(); + assert(in->filelock.can_read(in->is_auth())); + in->filelock.put_read(); - dout(7) << "inode_soft_read_finish on " << *in << ", softlock=" << in->softlock << endl; + dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl; - if (in->softlock.get_nread() == 0) { - in->finish_waiting(CINODE_WAIT_SOFTNORD); - inode_soft_eval(in); + if (in->filelock.get_nread() == 0) { + in->finish_waiting(CINODE_WAIT_FILENORD); + inode_file_eval(in); } } -bool MDCache::inode_soft_write_start(CInode *in, MClientRequest *m) +bool MDCache::inode_file_write_start(CInode *in, MClientRequest *m) { - // if no replicated, i can twiddle lock at will - if (in->is_auth() && - !in->is_cached_by_anyone() && - in->softlock.get_state() != LOCK_LOCK) - in->softlock.set_state(LOCK_LOCK); - // can write? grab ref. - if (in->softlock.can_write(in->is_auth())) { - in->softlock.get_write(); + if (in->filelock.can_write(in->is_auth())) { + in->filelock.get_write(); return true; } // can't write, replicated. if (in->is_auth()) { // auth - if (in->softlock.can_write_soon(in->is_auth())) { + if (in->filelock.can_write_soon(in->is_auth())) { // just wait } else { // initiate lock - // OR async ....... FIXME - inode_soft_lock(in); + inode_file_lock(in); + + if (in->filelock.can_write(in->is_auth())) { + in->filelock.get_write(); + + in->filelock.get_read(); + in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); + in->filelock.put_read(); + return true; + } } - dout(7) << "inode_soft_write_start on auth, waiting for write on " << *in << endl; - in->add_waiter(CINODE_WAIT_SOFTW, new C_MDS_RetryRequest(mds, m, in)); - + dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl; + in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in)); return false; } else { // replica - - if (in->softlock.get_mode() == LOCK_MODE_ASYNC) { - // wait - dout(5) << "inode_soft_write_start " << *in << " on replica, sync but mode async, waiting " << endl; - in->add_waiter(CINODE_WAIT_SOFTW, new C_MDS_RetryRequest(mds, m, in)); - return false; - } else { - // fw to auth - int auth = in->authority(); - dout(5) << "inode_soft_write_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - request_forward(m, auth); - return false; - } + // fw to auth + int auth = in->authority(); + dout(5) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + request_forward(m, auth); + return false; } - } -void MDCache::inode_soft_write_finish(CInode *in) +void MDCache::inode_file_write_finish(CInode *in) { // drop ref - assert(in->softlock.can_write(in->is_auth()) || - in->softlock.could_write(in->is_auth())); - in->softlock.put_write(); - dout(7) << "inode_soft_write_finish on " << *in << ", softlock=" << in->softlock << endl; + assert(in->filelock.can_write(in->is_auth())); + in->filelock.put_write(); + dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl; // drop lock? - if (in->softlock.get_nwrite() == 0) { - in->finish_waiting(CINODE_WAIT_SOFTNOWR); - inode_soft_eval(in); + if (in->filelock.get_nwrite() == 0) { + in->finish_waiting(CINODE_WAIT_FILENOWR); + inode_file_eval(in); } } -void MDCache::inode_soft_eval(CInode *in) +/* + * ... + * + * also called after client caps are acked to us + * - checks if we're in unstable sfot state and can now move on to next state + * - checks if soft state should change (eg bc last writer closed) + */ + +void MDCache::inode_file_eval(CInode *in) { - // finished gather? + int issued = in->get_caps_issued(); + + // [auth] finished gather? if (in->is_auth() && - !in->softlock.is_stable() && - in->softlock.gather_set.size() == 0) { - dout(7) << "inode_soft_eval finished gather on " << *in << endl; - switch (in->softlock.get_state()) { - case LOCK_PRELOCK: - in->softlock.set_state(LOCK_LOCK); - - // waiters - in->softlock.get_read(); - in->softlock.get_write(); - in->finish_waiting(CINODE_WAIT_SOFTRWB|CINODE_WAIT_SOFTSTABLE); - in->softlock.put_read(); - in->softlock.put_write(); + !in->filelock.is_stable() && + in->filelock.gather_set.size() == 0) { + dout(7) << "inode_file_eval finished mds gather on " << *in << endl; + + switch (in->filelock.get_state()) { + // to lock + case LOCK_GLOCKR: + case LOCK_GLOCKM: + case LOCK_GLOCKW: + if (issued == 0) { + in->filelock.set_state(LOCK_LOCK); + + // waiters + in->filelock.get_read(); + in->filelock.get_write(); + in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); + in->filelock.put_read(); + in->filelock.put_write(); + } break; - case LOCK_GASYNC: - in->softlock.set_state(LOCK_ASYNC); - - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *reply = new MLock(LOCK_AC_ASYNC, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - mds->messenger->send_message(reply, - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); + // to mixed + case LOCK_GMIXEDR: + if ((issued & ~(CAP_FILE_RD)) == 0) { + in->filelock.set_state(LOCK_MIXED); + in->finish_waiting(CINODE_WAIT_FILESTABLE); } - - // waiters - in->softlock.get_write(); - in->finish_waiting(CINODE_WAIT_SOFTW|CINODE_WAIT_SOFTSTABLE); - in->softlock.put_write(); break; - - case LOCK_GSYNC: - in->softlock.set_state(LOCK_SYNC); - { // bcast data to replicas - crope softdata; - in->encode_soft_state(softdata); - - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - reply->set_data(softdata); - mds->messenger->send_message(reply, - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); + case LOCK_GMIXEDW: + if ((issued & ~(CAP_FILE_WR)) == 0) { + in->filelock.set_state(LOCK_MIXED); + + if (in->is_cached_by_anyone()) { + // data + crope softdata; + in->encode_file_state(softdata); + + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + m->set_data(softdata); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } } + + in->finish_waiting(CINODE_WAIT_FILESTABLE); + } + break; + + // to wronly + case LOCK_GWRONLYR: + if (issued == 0) { + in->filelock.set_state(LOCK_WRONLY); + in->finish_waiting(CINODE_WAIT_FILESTABLE); + } + break; + + case LOCK_GWRONLYM: + if ((issued & ~CAP_FILE_WR) == 0) { + in->filelock.set_state(LOCK_WRONLY); + in->finish_waiting(CINODE_WAIT_FILESTABLE); } - - // waiters - in->softlock.get_read(); - in->finish_waiting(CINODE_WAIT_SOFTR|CINODE_WAIT_SOFTSTABLE); - in->softlock.put_read(); break; - case LOCK_GLOCK: - in->softlock.set_state(LOCK_LOCK); - - // waiters - in->softlock.get_read(); - in->softlock.get_write(); - in->finish_waiting(CINODE_WAIT_SOFTRWB|CINODE_WAIT_SOFTSTABLE); - in->softlock.put_read(); - in->softlock.put_write(); + // to sync + case LOCK_GSYNCW: + case LOCK_GSYNCM: + if ((issued & ~(CAP_FILE_RD)) == 0) { + in->filelock.set_state(LOCK_SYNC); + + { // bcast data to replicas + crope softdata; + in->encode_file_state(softdata); + + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + reply->set_data(softdata); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + } + + // waiters + in->filelock.get_read(); + in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); + in->filelock.put_read(); + } break; default: assert(0); } } - if (!in->softlock.is_stable()) return; // do nothing - if (in->is_auth()) { - // auth - - // check our mode - /* - if ((in->is_open_write() || in->num_replica_writers()) && - in->softlock.get_mode() != LOCK_MODE_ASYNC) { - inode_soft_mode(in,LOCK_MODE_ASYNC); - } - */ - if (!in->is_write_caps() && - in->softlock.get_mode() != LOCK_MODE_SYNC) { - inode_soft_mode(in,LOCK_MODE_SYNC); - } - - // check our state - if (in->softlock.get_mode() == LOCK_MODE_SYNC) { - // sync mode. bump state to sync? - if (in->is_cached_by_anyone() && - //!in->is_open_write() && // hack? - in->softlock.get_nwrite() == 0 && - in->softlock.get_state() != LOCK_SYNC) { - dout(7) << "inode_soft_eval stable, syncing " << *in << ", softlock=" << in->softlock << endl; - inode_soft_sync(in); + // [replica] finished caps gather? + if (!in->is_auth() && + !in->filelock.is_stable()) { + switch (in->filelock.get_state()) { + case LOCK_GMIXEDR: + if ((issued & ~(CAP_FILE_RD)) == 0) { + in->filelock.set_state(LOCK_MIXED); + + // ack + MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(in->authority()), MDS_PORT_CACHE, + MDS_PORT_CACHE); } - } - - else if (in->softlock.get_mode() == LOCK_MODE_ASYNC) { - // async mode. bump state to async? - if (in->is_cached_by_anyone() && - in->softlock.get_nread() == 0 && - in->softlock.get_state() != LOCK_ASYNC) { - dout(7) << "inode_soft_eval stable, asyncing " << *in << ", softlock=" << in->softlock << endl; - inode_soft_async(in); + break; + + case LOCK_GLOCKR: + if (issued == 0) { + in->filelock.set_state(LOCK_LOCK); + + // ack + MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(in->authority()), MDS_PORT_CACHE, + MDS_PORT_CACHE); } + break; + + default: + assert(0); } + } + + // !stable -> do nothing. + if (!in->filelock.is_stable()) return; + + // stable. + assert(in->filelock.is_stable()); + + if (in->is_auth()) { + // [auth] + int wanted = in->get_caps_wanted(); + dout(7) << "inode_file_eval wanted=" << cap_string(wanted) << " filelock=" << in->filelock << endl; + + // * -> wronly? + if (in->filelock.get_nread() == 0 && + in->filelock.get_nwrite() == 0 && + !(wanted & CAP_FILE_RD) && + (wanted & CAP_FILE_WR) && + in->filelock.get_state() != LOCK_WRONLY) { + dout(7) << "inode_file_eval stable, bump to wronly " << *in << ", filelock=" << in->filelock << endl; + inode_file_wronly(in); + } + + // * -> mixed? + else if (in->filelock.get_nread() == 0 && + in->filelock.get_nwrite() == 0 && + (wanted & CAP_FILE_RD) && + (wanted & CAP_FILE_WR) && + in->filelock.get_state() != LOCK_MIXED) { + dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl; + inode_file_mixed(in); + } + + // * -> sync? + else if (in->filelock.get_nwrite() == 0 && + !(wanted & CAP_FILE_WR) && + ((wanted & CAP_FILE_RD) || in->is_cached_by_anyone()) && + in->filelock.get_state() != LOCK_SYNC) { + dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl; + inode_file_sync(in); + } + + // * -> lock? (if not replicated or open) + else if (!in->is_cached_by_anyone() && + wanted == 0 && + in->filelock.get_state() != LOCK_LOCK) { + inode_file_lock(in); + } + } else { // replica // recall? check wiaters? XXX } } + // mid -void MDCache::inode_soft_mode(CInode *in, int mode) +bool MDCache::inode_file_sync(CInode *in) { + dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl; + assert(in->is_auth()); - - in->softlock.set_mode(mode); - dout(7) << "inode_soft_mode mode=" << mode << " " << *in << " softlock=" << in->softlock << endl; - - // tell replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - int ac; - switch (mode) { - case LOCK_MODE_SYNC: ac = LOCK_AC_SYNC_MODE; break; - case LOCK_MODE_ASYNC: ac = LOCK_AC_ASYNC_MODE; break; - default: assert(0); - } - MLock *m = new MLock(ac, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - mds->messenger->send_message(m, - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } - // caller shoudl probably eval our state -} + // check state + if (in->filelock.get_state() == LOCK_SYNC || + in->filelock.get_state() == LOCK_GSYNCW || + in->filelock.get_state() == LOCK_GSYNCM) + return true; -bool MDCache::inode_soft_sync(CInode *in) -{ - dout(7) << "inode_soft_sync " << *in << " softlock=" << in->softlock << endl; + assert(in->filelock.is_stable()); - assert(in->is_auth()); + int issued = in->get_caps_issued(); - // check state - if (in->softlock.get_state() == LOCK_SYNC) return true; - if (in->softlock.get_state() == LOCK_GSYNC) return false; - - assert(in->softlock.is_stable()); - if (in->softlock.get_state() == LOCK_PRELOCK || - in->softlock.get_state() == LOCK_GLOCK) - assert(0); // hmm! - assert(in->softlock.get_state() == LOCK_LOCK || - in->softlock.get_state() == LOCK_ASYNC); - - if (in->softlock.get_state() == LOCK_LOCK) { - // soft data - crope softdata; - in->encode_soft_state(softdata); - - // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - m->set_data(softdata); - mds->messenger->send_message(m, - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); + assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); + + if (in->filelock.get_state() == LOCK_LOCK) { + if (in->is_cached_by_anyone()) { + // soft data + crope softdata; + in->encode_file_state(softdata); + + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + m->set_data(softdata); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } } - + // change lock - in->softlock.set_state(LOCK_SYNC); + in->filelock.set_state(LOCK_SYNC); + // reissue caps + issue_caps(in); return true; } - else if (in->softlock.get_state() == LOCK_ASYNC) { - // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_GSYNC, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - mds->messenger->send_message(m, - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } - - // change lock - in->softlock.set_state(LOCK_GSYNC); - in->softlock.init_gather(in->get_cached_by()); + else if (in->filelock.get_state() == LOCK_MIXED) { + // writers? + if (issued & CAP_FILE_WR) { + // gather client write caps + in->filelock.set_state(LOCK_GSYNCM); + issue_caps(in); + } else { + // no writers, go straight to sync - return false; - } - else - assert(0); // wtf. -} + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + } + + // change lock + in->filelock.set_state(LOCK_SYNC); + } + return false; + } + else if (in->filelock.get_state() == LOCK_WRONLY) { + // writers? + if (issued & CAP_FILE_WR) { + // gather client write caps + in->filelock.set_state(LOCK_GSYNCW); + issue_caps(in); + } else { + // no writers, go straight to sync + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + } + + // change lock + in->filelock.set_state(LOCK_SYNC); + } + return false; + } + else + assert(0); // wtf. +} -void MDCache::inode_soft_lock(CInode *in) + +void MDCache::inode_file_lock(CInode *in) { - dout(7) << "inode_soft_lock " << *in << " softlock=" << in->softlock << endl; + dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl; assert(in->is_auth()); // check state - if (in->softlock.get_state() == LOCK_LOCK || - in->softlock.get_state() == LOCK_PRELOCK || - in->softlock.get_state() == LOCK_GLOCK) + if (in->filelock.get_state() == LOCK_LOCK || + in->filelock.get_state() == LOCK_GLOCKR || + in->filelock.get_state() == LOCK_GLOCKM || + in->filelock.get_state() == LOCK_GLOCKW) return; // lock or locking - assert(in->softlock.is_stable()); - if (in->softlock.get_state() == LOCK_GSYNC) - assert(0); // hmm! - assert(in->softlock.get_state() == LOCK_SYNC || - in->softlock.get_state() == LOCK_ASYNC); - - if (in->softlock.get_state() == LOCK_SYNC) { - // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - mds->messenger->send_message(m, - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); + + assert(in->filelock.is_stable()); + + int issued = in->get_caps_issued(); + + if (in->filelock.get_state() == LOCK_SYNC) { + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + in->filelock.init_gather(in->get_cached_by()); + + // change lock + in->filelock.set_state(LOCK_GLOCKR); + + // call back caps + if (issued) + issue_caps(in); + } else { + if (issued) { + // call back caps + in->filelock.set_state(LOCK_GLOCKR); + issue_caps(in); + } else { + in->filelock.set_state(LOCK_LOCK); + } } - - // change lock - in->softlock.set_state(LOCK_PRELOCK); - in->softlock.init_gather(in->get_cached_by()); } - else if (in->softlock.get_state() == LOCK_ASYNC) { - // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_GLOCK, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - mds->messenger->send_message(m, - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); + else if (in->filelock.get_state() == LOCK_MIXED) { + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + in->filelock.init_gather(in->get_cached_by()); + } else { + assert(issued); } - + // change lock - in->softlock.set_state(LOCK_GLOCK); - in->softlock.init_gather(in->get_cached_by()); + in->filelock.set_state(LOCK_GLOCKM); + + // call back caps + issue_caps(in); + } + else if (in->filelock.get_state() == LOCK_WRONLY) { + if (issued & CAP_FILE_WR) { + // change lock + in->filelock.set_state(LOCK_GLOCKW); + + // call back caps + issue_caps(in); + } else { + in->filelock.set_state(LOCK_LOCK); + } } else assert(0); // wtf. } -void MDCache::inode_soft_async(CInode *in) +void MDCache::inode_file_mixed(CInode *in) { - dout(7) << "inode_soft_async " << *in << " softlock=" << in->softlock << endl; + dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl; assert(in->is_auth()); // check state - if (in->softlock.get_state() == LOCK_ASYNC) - return; // async - assert(in->softlock.is_stable()); - if (in->softlock.get_state() == LOCK_GSYNC || - in->softlock.get_state() == LOCK_GLOCK || - in->softlock.get_state() == LOCK_PRELOCK) - assert(0); // hmm! - assert(in->softlock.get_state() == LOCK_SYNC || - in->softlock.get_state() == LOCK_LOCK); - - if (in->softlock.get_state() == LOCK_SYNC) { - // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_GASYNC, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - mds->messenger->send_message(m, - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } + if (in->filelock.get_state() == LOCK_GMIXEDR || + in->filelock.get_state() == LOCK_GMIXEDW) + return; // mixed or mixing + + assert(in->filelock.is_stable()); + + int issued = in->get_caps_issued(); + + if (in->filelock.get_state() == LOCK_SYNC) { + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + in->filelock.init_gather(in->get_cached_by()); - // change lock - in->softlock.set_state(LOCK_GASYNC); - in->softlock.init_gather(in->get_cached_by()); + in->filelock.set_state(LOCK_GMIXEDR); + issue_caps(in); + } else { + if (issued) { + in->filelock.set_state(LOCK_GMIXEDR); + issue_caps(in); + } else { + in->filelock.set_state(LOCK_MIXED); + } + } } - else if (in->softlock.get_state() == LOCK_LOCK) { - // data - crope softdata; - in->encode_soft_state(softdata); - - // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_ASYNC, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - m->set_data(softdata); - mds->messenger->send_message(m, - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); + else if (in->filelock.get_state() == LOCK_LOCK) { + if (in->is_cached_by_anyone()) { + // data + crope softdata; + in->encode_file_state(softdata); + + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + m->set_data(softdata); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } } - + // change lock - in->softlock.set_state(LOCK_ASYNC); + in->filelock.set_state(LOCK_MIXED); + issue_caps(in); } + + else if (in->filelock.get_state() == LOCK_WRONLY) { + if (issued & CAP_FILE_WRBUFFER) { + // gather up WRBUFFER caps + in->filelock.set_state(LOCK_GMIXEDW); + issue_caps(in); + } + else if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + in->filelock.set_state(LOCK_MIXED); + issue_caps(in); + } else { + in->filelock.set_state(LOCK_MIXED); + issue_caps(in); + } + } + else assert(0); // wtf. } +void MDCache::inode_file_wronly(CInode *in) +{ + dout(7) << "inode_file_wronly " << *in << " filelock=" << in->filelock << endl; + + assert(in->is_auth()); + + // check state + if (in->filelock.get_state() == LOCK_WRONLY || + in->filelock.get_state() == LOCK_GWRONLYR || + in->filelock.get_state() == LOCK_GWRONLYM) + return; + + assert(in->filelock.is_stable()); + + int issued = in->get_caps_issued(); + assert((in->get_caps_wanted() & CAP_FILE_RD) == 0); + + if (in->filelock.get_state() == LOCK_SYNC) { + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + in->filelock.init_gather(in->get_cached_by()); + + // change lock + in->filelock.set_state(LOCK_GWRONLYR); + + // call back caps + if (issued & CAP_FILE_RD) + issue_caps(in); + } else { + if (issued & CAP_FILE_RD) { + in->filelock.set_state(LOCK_GWRONLYR); + issue_caps(in); + } else { + in->filelock.set_state(LOCK_WRONLY); + issue_caps(in); + } + } + } + + else if (in->filelock.get_state() == LOCK_LOCK) { + // change lock. ignore replicas; they don't know about WRONLY. + in->filelock.set_state(LOCK_WRONLY); + issue_caps(in); + } + + else if (in->filelock.get_state() == LOCK_MIXED) { + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + in->filelock.init_gather(in->get_cached_by()); + + // change lock + in->filelock.set_state(LOCK_GWRONLYM); + } else { + in->filelock.set_state(LOCK_WRONLY); + issue_caps(in); + } + } + + else + assert(0); +} + // messenger -void MDCache::handle_lock_inode_soft(MLock *m) +void MDCache::handle_lock_inode_file(MLock *m) { - assert(m->get_otype() == LOCK_OTYPE_ISOFT); + assert(m->get_otype() == LOCK_OTYPE_IFILE); CInode *in = get_inode(m->get_ino()); int from = m->get_asker(); @@ -4505,31 +4858,28 @@ void MDCache::handle_lock_inode_soft(MLock *m) // auth assert(in); assert(in->is_auth() || in->is_proxy()); - + dout(7) << "handle_lock_inode_file " << *in << " hardlock=" << in->hardlock << endl; + if (in->is_proxy()) { // fw int newauth = in->authority(); assert(newauth >= 0); - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - mds->messenger->send_message(m, - MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, - MDS_PORT_CACHE); + if (from == newauth) { + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; + delete m; + } else { + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; + mds->messenger->send_message(m, + MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } return; } } else { // replica if (!in) { - // nack + // drop it. don't nak. dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl; - - // DONT NAK - /* - MLock *reply = new MLock(m->get_action() + LOCK_AC_NAKOFFSET, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - mds->messenger->send_message(reply, - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); - */ delete m; return; } @@ -4537,30 +4887,20 @@ void MDCache::handle_lock_inode_soft(MLock *m) assert(!in->is_auth()); } - dout(7) << "handle_lock_inode_soft a=" << m->get_action() << " from " << from << " " << *in << " softlock=" << in->softlock << endl; - - CLock *lock = &in->softlock; + dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl; + CLock *lock = &in->filelock; + int issued = in->get_caps_issued(); + switch (m->get_action()) { // -- replica -- - case LOCK_AC_SYNC_MODE: - lock->set_mode(LOCK_MODE_SYNC); - in->finish_waiting(CINODE_WAIT_SOFTW); - break; - - case LOCK_AC_ASYNC_MODE: - lock->set_mode(LOCK_MODE_ASYNC); - in->finish_waiting(CINODE_WAIT_SOFTR); - break; - - case LOCK_AC_SYNC: assert(lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_GSYNC); + lock->get_state() == LOCK_MIXED); { // assim data int off = 0; - in->decode_soft_state(m->get_data(), off); + in->decode_file_state(m->get_data(), off); } // update lock @@ -4569,230 +4909,135 @@ void MDCache::handle_lock_inode_soft(MLock *m) // no need to reply. // waiters - in->softlock.get_read(); - in->finish_waiting(CINODE_WAIT_SOFTR|CINODE_WAIT_SOFTSTABLE); - in->softlock.put_read(); - inode_soft_eval(in); + in->filelock.get_read(); + in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); + in->filelock.put_read(); + inode_file_eval(in); break; case LOCK_AC_LOCK: assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_WLOCKR); + lock->get_state() == LOCK_MIXED); + // call back caps? + if (issued & CAP_FILE_RD) { + dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl; + issue_caps(in); + } if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_soft readers, waiting before ack on " << *in << endl; - lock->set_state(LOCK_WLOCKR); - in->add_waiter(CINODE_WAIT_SOFTNORD, + dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl; + in->add_waiter(CINODE_WAIT_FILENORD, new C_MDS_RetryMessage(mds,m)); + lock->set_state(LOCK_GLOCKR); + assert(0);// i am broken.. why retry message when state captures all the info i need? return; - } else { - // update lock + } + if (issued & CAP_FILE_RD) { + lock->set_state(LOCK_GLOCKR); + break; + } + + // nothing to wait for, lock and ack. + { lock->set_state(LOCK_LOCK); - - // ack - { - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - mds->messenger->send_message(reply, - MSG_ADDR_MDS(from), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } + + MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(from), MDS_PORT_CACHE, + MDS_PORT_CACHE); } break; - case LOCK_AC_ASYNC: - assert(lock->get_state() == LOCK_GASYNC || + case LOCK_AC_MIXED: + assert(lock->get_state() == LOCK_SYNC || lock->get_state() == LOCK_LOCK); - // update lock - lock->set_state(LOCK_ASYNC); - - // waiters - in->softlock.get_write(); - in->finish_waiting(CINODE_WAIT_SOFTW|CINODE_WAIT_SOFTSTABLE); - in->softlock.put_write(); - inode_soft_eval(in); - break; - + if (lock->get_state() == LOCK_SYNC) { + // MIXED + if (issued & CAP_FILE_RD) { + // call back client caps + lock->set_state(LOCK_GMIXEDR); + issue_caps(in); + break; + } else { + // no clients, go straight to mixed + lock->set_state(LOCK_MIXED); - case LOCK_AC_GASYNC: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_WGASYNC); - - // wait for readers to finish? - if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_soft readers, waiting before ack on " << *in << endl; - lock->set_state(LOCK_WGASYNC); - in->add_waiter(CINODE_WAIT_SOFTNORD, - new C_MDS_RetryMessage(mds,m)); - return; - } else { - // update lock - lock->set_state(LOCK_GASYNC); - - // ack - { - MLock *reply = new MLock(LOCK_AC_GASYNCACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + // ack + MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); mds->messenger->send_message(reply, MSG_ADDR_MDS(from), MDS_PORT_CACHE, MDS_PORT_CACHE); } - } - break; - - - case LOCK_AC_GSYNC: - assert(lock->get_state() == LOCK_ASYNC || - lock->get_state() == LOCK_WGSYNC); - - // wait for writers to finish? - if (lock->get_nwrite() > 0) { - dout(7) << "handle_lock_inode_soft writers, waiting before ack on " << *in << endl; - lock->set_state(LOCK_WGSYNC); - in->add_waiter(CINODE_WAIT_SOFTNOWR, - new C_MDS_RetryMessage(mds,m)); - return; } else { - // update lock - lock->set_state(LOCK_GSYNC); + // LOCK + lock->set_state(LOCK_MIXED); - // reply w/ our data - { - MLock *reply = new MLock(LOCK_AC_GSYNCACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - - // payload - crope sd; - in->encode_soft_state(sd); - reply->set_data(sd); - - // mark clean if dirty! - if (in->is_dirty()) in->mark_clean(); - - mds->messenger->send_message(reply, - MSG_ADDR_MDS(from), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } + // no ack needed. } - break; - - case LOCK_AC_GLOCK: - assert(lock->get_state() == LOCK_ASYNC || - lock->get_state() == LOCK_WLOCKW); + + issue_caps(in); - // wait for writers to finish? - if (lock->get_nwrite() > 0) { - dout(7) << "handle_lock_inode_soft writers, waiting before ack on " << *in << endl; - lock->set_state(LOCK_WLOCKW); - in->add_waiter(CINODE_WAIT_SOFTNOWR, - new C_MDS_RetryMessage(mds,m)); - return; - } else { - // update lock - lock->set_state(LOCK_LOCK); - - // reply w/ our data - { - MLock *reply = new MLock(LOCK_AC_GLOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); - - // payload - crope sd; - in->encode_soft_state(sd); - reply->set_data(sd); - - // mark clean if dirty! - if (in->is_dirty()) in->mark_clean(); - - mds->messenger->send_message(reply, - MSG_ADDR_MDS(from), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } - } + // waiters + in->filelock.get_write(); + in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE); + in->filelock.put_write(); + inode_file_eval(in); break; - + + // -- auth -- - case LOCK_AC_LOCKNAK: - // do NOT remove from cached_by; we don't know the nonce! - // and somewhere out there there's an expire that will take care of it. - case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_PRELOCK); + assert(lock->state == LOCK_GLOCKR || + lock->state == LOCK_GLOCKM || + lock->state == LOCK_GWRONLYM || + lock->state == LOCK_GWRONLYR); assert(lock->gather_set.count(from)); lock->gather_set.erase(from); - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", last one" << endl; - inode_soft_eval(in); - } - break; - - - case LOCK_AC_GLOCKNAK: - // do NOT remove from cached_by; we don't know the nonce! - // and somewhere out there there's an expire that will take care of it. - - case LOCK_AC_GLOCKACK: - assert(lock->state == LOCK_GLOCK); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - if (m->get_action() == LOCK_AC_GLOCKACK) { - // merge data (keep largest size, mtime, etc.) - int off = 0; - in->decode_merge_soft_state(m->get_data(), off); - } if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; } else { - dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", last one" << endl; - inode_soft_eval(in); + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; + inode_file_eval(in); } break; - - case LOCK_AC_GSYNCNAK: - // do NOT remove from cached_by; we don't know the nonce! - // and somewhere out there there's an expire that will take care of it. - - case LOCK_AC_GSYNCACK: - assert(lock->state == LOCK_GSYNC); + case LOCK_AC_SYNCACK: + assert(lock->state == LOCK_GSYNCM); assert(lock->gather_set.count(from)); lock->gather_set.erase(from); - if (m->get_action() == LOCK_AC_GSYNCACK) { + /* not used currently + { // merge data (keep largest size, mtime, etc.) int off = 0; - dout(7) << "merging soft state" <decode_merge_soft_state(m->get_data(), off); - dout(7) << "done merging soft state" <decode_merge_file_state(m->get_data(), off); } + */ if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; } else { - dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", last one" << endl; - inode_soft_eval(in); + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; + inode_file_eval(in); } break; - case LOCK_AC_GASYNCNAK: - case LOCK_AC_GASYNCACK: - assert(lock->state == LOCK_GASYNC); + case LOCK_AC_MIXEDACK: + assert(lock->state == LOCK_GMIXEDR); assert(lock->gather_set.count(from)); lock->gather_set.erase(from); if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; } else { - dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", last one" << endl; - inode_soft_eval(in); + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; + inode_file_eval(in); } break; @@ -4805,6 +5050,18 @@ void MDCache::handle_lock_inode_soft(MLock *m) } + + + + + + + + + + + + void MDCache::handle_lock_dir(MLock *m) { @@ -5691,12 +5948,16 @@ void MDCache::encode_export_inode(CInode *in, bufferlist& enc_state, int new_aut { in->version++; // so local log entries are ignored, etc. (FIXME ??) - // tell clients with caps about new inode auth - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); + // tell (all) clients about new inode auth + for (map::iterator it = in->client_caps.begin(); + it != in->client_caps.end(); it++) { dout(7) << "encode_export_inode " << *in << " telling client " << it->first << " new auth " << new_auth << endl; - mds->messenger->send_message(new MClientFileCaps(in, it->second, false, new_auth), + mds->messenger->send_message(new MClientFileCaps(in->inode, + it->second.get_last_seq(), + it->second.pending(), + it->second.wanted(), + it->first, new_auth), MSG_ADDR_CLIENT(it->first)); } @@ -5717,9 +5978,26 @@ void MDCache::encode_export_inode(CInode *in, bufferlist& enc_state, int new_aut // clear/unpin cached_by (we're no longer the authority) in->cached_by_clear(); - // twiddle lock states - in->softlock.twiddle_export(); - in->hardlock.twiddle_export(); + // twiddle lock states for auth -> replica transition + // hard + in->hardlock.clear_gather(); + if (in->hardlock.get_state() == LOCK_GLOCKR) + in->hardlock.set_state(LOCK_LOCK); + + // file : we lost all our caps, so move to stable state! + in->filelock.clear_gather(); + if (in->filelock.get_state() == LOCK_GLOCKR || + in->filelock.get_state() == LOCK_GLOCKM || + in->filelock.get_state() == LOCK_GLOCKW || + in->filelock.get_state() == LOCK_GWRONLYR || + in->filelock.get_state() == LOCK_GWRONLYM) + in->filelock.set_state(LOCK_LOCK); + if (in->filelock.get_state() == LOCK_GMIXEDR) + in->filelock.set_state(LOCK_MIXED); + if (in->filelock.get_state() == LOCK_GSYNCM) + in->filelock.set_state(LOCK_SYNC); + if (in->filelock.get_state() == LOCK_GMIXEDW) + in->filelock.set_state(LOCK_MIXED); // mark auth assert(in->is_auth()); @@ -6475,21 +6753,24 @@ void MDCache::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int old if (in->is_cached_by(mds->get_nodeid())) in->cached_by_remove(mds->get_nodeid()); - /* don't do this - if (!in->hardlock.is_stable() && - in->hardlock.is_gathering(mds->get_nodeid())) { - in->hardlock.gather_set.erase(mds->get_nodeid()); - if (in->hardlock.gather_set.size() == 0) - inode_hard_eval(in); - } - if (!in->softlock.is_stable() && - in->softlock.is_gathering(mds->get_nodeid())) { - in->softlock.gather_set.erase(mds->get_nodeid()); - if (in->softlock.gather_set.size() == 0) - inode_soft_eval(in); - } - */ - + // twiddle locks + // hard + if (in->hardlock.get_state() == LOCK_GLOCKR) { + in->hardlock.gather_set.erase(mds->get_nodeid()); + in->hardlock.gather_set.erase(oldauth); + if (in->hardlock.gather_set.empty()) + inode_hard_eval(in); + } + + // file + if (!in->filelock.is_stable()) { + // take me and old auth out of gather set + in->filelock.gather_set.erase(mds->get_nodeid()); + in->filelock.gather_set.erase(oldauth); + if (in->filelock.gather_set.empty()) // necessary but not suffient... + inode_file_eval(in); + } + // other if (in->is_dirty()) { dout(10) << "logging dirty import " << *in << endl; diff --git a/ceph/mds/MDCache.h b/ceph/mds/MDCache.h index cc6cebcb54cb0..e4ee75d925095 100644 --- a/ceph/mds/MDCache.h +++ b/ceph/mds/MDCache.h @@ -291,14 +291,6 @@ class MDCache { void handle_rename_notify(MRenameNotify *m); // dest -> bystanders - // -- file i/o -- - public: - __uint64_t issue_file_data_version(CInode *in); - Capability* issue_file_caps(CInode *in, int mode, MClientRequest *req); - void eval_file_caps(CInode *in); - protected: - void handle_client_file_caps(class MClientFileCaps *m); - // -- misc auth -- int ino_proxy_auth(inodeno_t ino, @@ -434,29 +426,43 @@ class MDCache { void inode_hard_read_finish(CInode *in); bool inode_hard_write_start(CInode *in, MClientRequest *m); void inode_hard_write_finish(CInode *in); - bool inode_soft_read_start(CInode *in, MClientRequest *m); - void inode_soft_read_finish(CInode *in); - bool inode_soft_write_start(CInode *in, MClientRequest *m); - void inode_soft_write_finish(CInode *in); + bool inode_file_read_start(CInode *in, MClientRequest *m); + void inode_file_read_finish(CInode *in); + bool inode_file_write_start(CInode *in, MClientRequest *m); + void inode_file_write_finish(CInode *in); void inode_hard_eval(CInode *in); - void inode_soft_eval(CInode *in); + void inode_file_eval(CInode *in); protected: void inode_hard_mode(CInode *in, int mode); - void inode_soft_mode(CInode *in, int mode); + void inode_file_mode(CInode *in, int mode); // low level triggers void inode_hard_sync(CInode *in); void inode_hard_lock(CInode *in); - bool inode_soft_sync(CInode *in); - void inode_soft_lock(CInode *in); - void inode_soft_async(CInode *in); + bool inode_file_sync(CInode *in); + void inode_file_lock(CInode *in); + void inode_file_mixed(CInode *in); + void inode_file_wronly(CInode *in); // messengers void handle_lock(MLock *m); void handle_lock_inode_hard(MLock *m); - void handle_lock_inode_soft(MLock *m); + void handle_lock_inode_file(MLock *m); + + // -- file i/o -- + public: + version_t issue_file_data_version(CInode *in); + Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req); + bool issue_caps(CInode *in); + + protected: + void handle_client_file_caps(class MClientFileCaps *m); + + void request_inode_file_caps(CInode *in); + void handle_inode_file_caps(class MInodeFileCaps *m); + // dirs diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc index 6bf9902577394..8011a93666685 100644 --- a/ceph/mds/MDS.cc +++ b/ceph/mds/MDS.cc @@ -212,8 +212,7 @@ int MDS::shutdown_start() if (i == whoami) continue; dout(1) << "sending MShutdownStart to mds" << i << endl; messenger->send_message(new MGenericMessage(MSG_MDS_SHUTDOWNSTART), - i, MDS_PORT_MAIN, - MDS_PORT_MAIN); + MSG_ADDR_MDS(i), MDS_PORT_MAIN, MDS_PORT_MAIN); } if (idalloc) idalloc->shutdown(); @@ -653,7 +652,7 @@ void MDS::handle_client_mount(MClientMount *m) // force empty root dir CDir *dir = root->dir; dir->mark_complete(); - dir->mark_dirty(); + //dir->mark_dirty(); // fake out idalloc (reset, pretend loaded) idalloc->reset(); @@ -996,9 +995,11 @@ void MDS::dispatch_request(Message *m, CInode *ref) handle_client_fsync(req, ref); break; */ + /* case MDS_OP_RELEASE: handle_client_release(req, ref); break; + */ // inodes case MDS_OP_STAT: @@ -1059,7 +1060,7 @@ void MDS::dispatch_request(Message *m, CInode *ref) void MDS::handle_client_stat(MClientRequest *req, CInode *ref) { - if (!mdcache->inode_soft_read_start(ref, req)) + if (!mdcache->inode_file_read_start(ref, req)) return; // sync dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl; @@ -1067,7 +1068,7 @@ void MDS::handle_client_stat(MClientRequest *req, // inode info is in the trace - mdcache->inode_soft_read_finish(ref); + mdcache->inode_file_read_finish(ref); balancer->hit_inode(ref); @@ -1085,7 +1086,7 @@ void MDS::handle_client_utime(MClientRequest *req, CInode *cur) { // write - if (!mdcache->inode_soft_write_start(cur, req)) + if (!mdcache->inode_file_write_start(cur, req)) return; // fw or (wait for) sync // do update @@ -1094,7 +1095,7 @@ void MDS::handle_client_utime(MClientRequest *req, if (cur->is_auth()) cur->mark_dirty(); - mdcache->inode_soft_write_finish(cur); + mdcache->inode_file_write_finish(cur); balancer->hit_inode(cur); @@ -2631,22 +2632,16 @@ void MDS::handle_client_open(MClientRequest *req, dout(10) << " flags = " << flags << " mode = " << mode << endl; - // auth only - if (!cur->is_auth()) { + // auth for write access + if (mode != FILE_MODE_R && + !cur->is_auth()) { int auth = cur->authority(); assert(auth != whoami); - dout(9) << "open [replica] " << *cur << " on replica, fw to auth " << auth << endl; + dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl; mdcache->request_forward(req, auth); return; } - assert(cur->is_auth()); - - // writer? - if (mode == FILE_MODE_W || - mode == FILE_MODE_RW) { - if (!mdcache->inode_soft_write_start(cur, req)) return; - } // hmm, check permissions or something. @@ -2654,16 +2649,11 @@ void MDS::handle_client_open(MClientRequest *req, // can we issue the caps they want? __uint64_t fdv = mdcache->issue_file_data_version(cur); - Capability *cap = mdcache->issue_file_caps(cur, mode, req); + Capability *cap = mdcache->issue_new_caps(cur, mode, req); if (!cap) return; // can't issue (yet), so wait! - dout(12) << "open gets caps " << cap->pending() << endl; + dout(12) << "open gets caps " << cap_string(cap->pending()) << endl; - if (mode == FILE_MODE_W || - mode == FILE_MODE_RW) { - mdcache->inode_soft_write_finish(cur); - } - balancer->hit_inode(cur); // reply @@ -2692,95 +2682,6 @@ void MDS::handle_client_openc(MClientRequest *req, CInode *ref) -void MDS::handle_client_release(MClientRequest *req, CInode *cur) -{ - // auth only - if (!cur->is_auth()) { - int auth = cur->authority(); - assert(auth != whoami); - dout(9) << "release " << *cur << " on replica, fw to auth " << auth << endl; - - mdcache->request_forward(req, auth); - return; - } - assert(cur->is_auth()); - - // verify on read or write list - int client = req->get_client(); - Capability *cap = cur->get_cap(client); - if (!cap) { - dout(1) << "release on non-existant capability client " << client << " inode " << *cur << endl; - assert(0); - } - - dout(10) << "release on " << *cur << " client " << client << endl; - - // update soft metadata - if (cap->issued() & CAP_FILE_WR) { - - // FIXME THIS IS BROKEN - //assert(cur->softlock.can_write(true)); // otherwise we're toast??? - - /* FIXME THIS IS BROKEN - if (!mdcache->inode_soft_write_start(cur, req)) - return; // wait - */ - - // update size, mtime - time_t mtime = req->get_targ(); - size_t size = req->get_sizearg(); - dout(10) << "mtime is " << mtime << " size is " << size << endl; - if (mtime > cur->inode.mtime) { - cur->inode.mtime = mtime; - dout(10) << " extended mtime to " << mtime << endl; - cur->mark_dirty(); - } - if (size > cur->inode.size) { - cur->inode.size = size; - dout(10) << " extended size to " << size << endl; - cur->mark_dirty(); - } - - // inc file_data_version - dout(7) << " incrementing file_data_version for " << *cur << endl; - cur->inode.file_data_version++; - - /* FIXME THIS IS BROKEN - - // release write - mdcache->inode_soft_write_finish(cur); - */ - - mdcache->inode_soft_eval(cur); - } else { - dout(10) << "no WR caps issued, not updating mtime/size" << endl; - } - - // XXX what about atime? - - - // release it. - int had_caps = cap->issued(); - long had_caps_seq = cap->get_last_seq(); - cur->remove_cap(client); - - // reeval caps - mdcache->eval_file_caps(cur); - - - - // give back a file_data_version to client - MClientReply *reply = new MClientReply(req, 0); - __uint64_t fdv = mdcache->issue_file_data_version(cur); - reply->set_file_caps(had_caps); - reply->set_file_caps_seq(had_caps_seq); - reply->set_file_data_version(fdv); - - // commit - commit_request(req, reply, cur, - new EInodeUpdate(cur)); // FIXME wrong message? -} - diff --git a/ceph/mds/MDStore.h b/ceph/mds/MDStore.h index e6c7bdb4a5c9d..534b44987d6aa 100644 --- a/ceph/mds/MDStore.h +++ b/ceph/mds/MDStore.h @@ -57,4 +57,5 @@ class MDStore { friend class C_MDS_CommitSlice; }; + #endif diff --git a/ceph/messages/MClientFileCaps.h b/ceph/messages/MClientFileCaps.h index caf19f9220504..9f1af1bdb79c0 100644 --- a/ceph/messages/MClientFileCaps.h +++ b/ceph/messages/MClientFileCaps.h @@ -6,32 +6,38 @@ class MClientFileCaps : public Message { inode_t inode; int mds; int caps; - bool need_ack; + int wanted; int client; public: inodeno_t get_ino() { return inode.ino; } - inode_t get_inode() { return inode; } + inode_t& get_inode() { return inode; } int get_mds() { return mds; } int get_caps() { return caps; } - bool needs_ack() { return need_ack; } + int get_wanted() { return wanted; } long get_seq() { return seq; } int get_client() { return client; } - void set_client(int c) { client = c; } + //void set_client(int c) { client = c; } + void set_caps(int c) { caps = c; } + void set_wanted(int w) { wanted = w; } MClientFileCaps() {} - MClientFileCaps(CInode *in, - Capability& cap, - bool need_ack, + MClientFileCaps(inode_t& inode, + long seq, + int caps, + int wanted, + int client, int new_mds = -1) : Message(MSG_CLIENT_FILECAPS) { - this->seq = cap.get_last_seq(); - this->caps = cap.pending(); + this->seq = seq; + this->caps = caps; + this->wanted = wanted; - this->inode = in->inode; - this->need_ack = need_ack; + this->client = client; + + this->inode = inode; this->mds = new_mds; } virtual char *get_type_name() { return "Cfcap";} @@ -41,21 +47,21 @@ class MClientFileCaps : public Message { off += sizeof(seq); s.copy(off, sizeof(inode), (char*)&inode); off += sizeof(inode); - s.copy(off, sizeof(mds), (char*)&mds); - off += sizeof(mds); s.copy(off, sizeof(caps), (char*)&caps); off += sizeof(caps); - s.copy(off, sizeof(need_ack), (char*)&need_ack); - off += sizeof(need_ack); + s.copy(off, sizeof(wanted), (char*)&wanted); + off += sizeof(wanted); + s.copy(off, sizeof(mds), (char*)&mds); + off += sizeof(mds); s.copy(off, sizeof(client), (char*)&client); off += sizeof(client); } virtual void encode_payload(crope& s) { s.append((char*)&seq, sizeof(seq)); s.append((char*)&inode, sizeof(inode)); - s.append((char*)&mds,sizeof(mds)); s.append((char*)&caps, sizeof(caps)); - s.append((char*)&need_ack, sizeof(need_ack)); + s.append((char*)&wanted, sizeof(wanted)); + s.append((char*)&mds,sizeof(mds)); s.append((char*)&client, sizeof(client)); } }; diff --git a/ceph/messages/MClientReply.h b/ceph/messages/MClientReply.h index b21ea99ca0093..5d3dc0a6d329f 100644 --- a/ceph/messages/MClientReply.h +++ b/ceph/messages/MClientReply.h @@ -39,7 +39,7 @@ class c_inode_info { string ref_dn; // referring dentry (blank if root) string symlink; // symlink content (if symlink) - bool inode_soft_valid; // true if inode info is valid (ie was readable on mds at the time) + bool inode_file_valid; // true if inode info is valid (ie was readable on mds at the time) bool inode_hard_valid; // true if inode info is valid (ie was readable on mds at the time) bool spec_defined; @@ -52,7 +52,7 @@ class c_inode_info { c_inode_info(CInode *in, int whoami, string ref_dn) { // inode this->inode = in->inode; - this->inode_soft_valid = in->softlock.can_read(in->is_auth()); + this->inode_file_valid = in->filelock.can_read(in->is_auth()); this->inode_hard_valid = in->hardlock.can_read(in->is_auth()); // symlink content? @@ -71,7 +71,7 @@ class c_inode_info { void _encode(bufferlist &bl) { bl.append((char*)&inode, sizeof(inode)); - bl.append((char*)&inode_soft_valid, sizeof(inode_soft_valid)); + bl.append((char*)&inode_file_valid, sizeof(inode_file_valid)); bl.append((char*)&inode_hard_valid, sizeof(inode_hard_valid)); bl.append((char*)&spec_defined, sizeof(spec_defined)); bl.append((char*)&dir_auth, sizeof(dir_auth)); @@ -84,8 +84,8 @@ class c_inode_info { void _decode(bufferlist &bl, int& off) { bl.copy(off, sizeof(inode), (char*)&inode); off += sizeof(inode); - bl.copy(off, sizeof(inode_soft_valid), (char*)&inode_soft_valid); - off += sizeof(inode_soft_valid); + bl.copy(off, sizeof(inode_file_valid), (char*)&inode_file_valid); + off += sizeof(inode_file_valid); bl.copy(off, sizeof(inode_hard_valid), (char*)&inode_hard_valid); off += sizeof(inode_hard_valid); bl.copy(off, sizeof(spec_defined), (char*)&spec_defined); diff --git a/ceph/messages/MInodeFileCaps.h b/ceph/messages/MInodeFileCaps.h new file mode 100644 index 0000000000000..d818423ac599d --- /dev/null +++ b/ceph/messages/MInodeFileCaps.h @@ -0,0 +1,41 @@ +#ifndef __MINODEFILECAPS_H +#define __MINODEFILECAPS_H + +class MInodeFileCaps : public Message { + inodeno_t ino; + int from; + int caps; + + public: + inodeno_t get_ino() { return ino; } + int get_from() { return from; } + int get_caps() { return caps; } + + MInodeFileCaps() {} + // from auth + MInodeFileCaps(inodeno_t ino, int from, int caps) : + Message(MSG_MDS_INODEFILECAPS) { + + this->ino = ino; + this->from = from; + this->caps = caps; + } + + virtual char *get_type_name() { return "Icap";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(from), (char*)&from); + off += sizeof(from); + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + s.copy(off, sizeof(caps), (char*)&caps); + off += sizeof(caps); + } + virtual void encode_payload(crope& s) { + s.append((char*)&from, sizeof(from)); + s.append((char*)&ino, sizeof(ino)); + s.append((char*)&caps, sizeof(caps)); + } +}; + +#endif diff --git a/ceph/messages/MLock.h b/ceph/messages/MLock.h index 71a447c023ad5..f44c2116cb255 100644 --- a/ceph/messages/MLock.h +++ b/ceph/messages/MLock.h @@ -4,41 +4,30 @@ #include "msg/Message.h" #define LOCK_OTYPE_IHARD 1 -#define LOCK_OTYPE_ISOFT 2 +#define LOCK_OTYPE_IFILE 2 #define LOCK_OTYPE_DIR 3 #define LOCK_OTYPE_DN 4 // for replicas #define LOCK_AC_SYNC 0 -#define LOCK_AC_ASYNC 1 +#define LOCK_AC_MIXED 1 +#define LOCK_AC_LOCK 2 -#define LOCK_AC_SYNC_MODE 2 -#define LOCK_AC_LOCK_MODE 3 -#define LOCK_AC_ASYNC_MODE 4 - -#define LOCK_AC_LOCK 5 // nakable -#define LOCK_AC_GSYNC 6 // " -#define LOCK_AC_GLOCK 7 // " -#define LOCK_AC_GASYNC 8 // " - -#define LOCK_AC_REQXLOCKACK 9 -#define LOCK_AC_REQXLOCKNAK 10 +#define LOCK_AC_REQXLOCKACK 9 // req dentry xlock +#define LOCK_AC_REQXLOCKNAK 10 // req dentry xlock +#define LOCK_AC_LOCKNAK 12 // for dentry xlock #define LOCK_AC_FOR_REPLICA(a) ((a) <= 10) #define LOCK_AC_FOR_AUTH(a) ((a) >= 11) -#define LOCK_AC_NAKOFFSET 4 // be careful with numbering! - // for auth -#define LOCK_AC_LOCKNAK 11 -#define LOCK_AC_GSYNCNAK 12 -#define LOCK_AC_GLOCKNAK 13 -#define LOCK_AC_GASYNCNAK 14 + +#define LOCK_AC_SYNCACK 13 +#define LOCK_AC_MIXEDACK 14 #define LOCK_AC_LOCKACK 15 -#define LOCK_AC_GSYNCACK 16 -#define LOCK_AC_GLOCKACK 17 -#define LOCK_AC_GASYNCACK 18 + + #define LOCK_AC_REQREAD 19 #define LOCK_AC_REQWRITE 20 diff --git a/ceph/msg/Message.h b/ceph/msg/Message.h index c692bcb7d9a9a..ebae8eb509770 100644 --- a/ceph/msg/Message.h +++ b/ceph/msg/Message.h @@ -56,6 +56,8 @@ #define MSG_MDS_INODEGETREPLICA 112 #define MSG_MDS_INODEGETREPLICAACK 113 +#define MSG_MDS_INODEFILECAPS 115 + #define MSG_MDS_INODEUPDATE 120 #define MSG_MDS_DIRUPDATE 121 #define MSG_MDS_INODEEXPIRE 122 diff --git a/ceph/msg/Messenger.cc b/ceph/msg/Messenger.cc index 52042cd849828..d63cf15101976 100644 --- a/ceph/msg/Messenger.cc +++ b/ceph/msg/Messenger.cc @@ -92,6 +92,7 @@ using namespace std; #include "messages/MInodeExpire.h" #include "messages/MDirExpire.h" #include "messages/MCacheExpire.h" +#include "messages/MInodeFileCaps.h" #include "messages/MLock.h" @@ -163,6 +164,10 @@ void Messenger::dispatch(Message *m) dout(DEBUGLVL) << "dispatch got reply for " << pcid << " " << m << endl; call_reply[pcid] = m; // set reply call_cond[pcid]->Signal(); + + // wait for delivery + call_reply_finish_cond.Wait(_lock); + _lock.Unlock(); } else { // no, this is an unsolicited message. @@ -226,6 +231,7 @@ Message *Messenger::sendrecv(Message *m, msg_addr_t dest, int port) dout(DEBUGLVL) << "sendrecv got reply " << reply << " on pcid " << pcid << endl; //delete sem; + call_reply_finish_cond.Signal(); _lock.Unlock(); return reply; @@ -471,6 +477,10 @@ decode_message(msg_envelope_t& env, bufferlist& payload) m = new MInodeExpire(); break; + case MSG_MDS_INODEFILECAPS: + m = new MInodeFileCaps(); + break; + case MSG_MDS_DIREXPIRE: m = new MDirExpire(); break; diff --git a/ceph/msg/Messenger.h b/ceph/msg/Messenger.h index 78fdd58681a4e..54d92f620ed1d 100644 --- a/ceph/msg/Messenger.h +++ b/ceph/msg/Messenger.h @@ -31,6 +31,7 @@ class Messenger { Mutex _lock; // protect call_sem, call_reply map call_cond; map call_reply; + Cond call_reply_finish_cond; public: Messenger(msg_addr_t w) : dispatcher(0), _myaddr(w), lamport_clock(0), _last_pcid(1) { } diff --git a/ceph/msg/TCPMessenger.cc b/ceph/msg/TCPMessenger.cc index 37d22ef090b88..93c3d69c53864 100644 --- a/ceph/msg/TCPMessenger.cc +++ b/ceph/msg/TCPMessenger.cc @@ -88,7 +88,7 @@ pthread_t out_thread_id = 0; // thread id of the event loop. init value pthread_t listen_thread_id = 0; map in_threads; // sd -> threadid -bool pending_timer = false; +//bool pending_timer = false; // per-rank fun @@ -303,7 +303,8 @@ int tcpmessenger_init() dout(DBL) << "listen addr is " << listen_addr << endl; // register to execute timer events - g_timer.set_messenger_kicker(new C_TCPKicker()); + //g_timer.set_messenger_kicker(new C_TCPKicker()); + msgr_callback_kicker = new C_TCPKicker(); dout(DBL) << "init done" << endl; return 0; @@ -699,19 +700,21 @@ void* tcp_dispatchthread(void*) while (1) { // callbacks? - Messenger::do_callbacks(); + messenger_do_callbacks(); // timer events? - if (pending_timer) { + /*if (pending_timer) { pending_timer = false; dout(DBL) << "dispatch: pending timer" << endl; g_timer.execute_pending(); } + */ // done? if (tcp_done && - incoming.empty() && - !pending_timer) break; + incoming.empty()) break; + //&& + //!pending_timer) break; // wait? if (incoming.empty()) { @@ -911,7 +914,8 @@ TCPMessenger::TCPMessenger(msg_addr_t myaddr) : Messenger(myaddr) directory_lock.Unlock(); // register to execute timer events - g_timer.set_messenger_kicker(new C_TCPKicker()); + //g_timer.set_messenger_kicker(new C_TCPKicker()); + g_timer.set_messenger(this); // logger @@ -1005,7 +1009,8 @@ int TCPMessenger::shutdown() //pthread_t whoami = pthread_self(); // no more timer events - g_timer.unset_messenger_kicker(); + g_timer.unset_messenger(); + msgr_callback_kicker = 0; // close incoming sockets -- 2.39.5