From 11c907d2ef721704dea95246941b4cca75f621b6 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 13 Jul 2007 21:06:01 +0000 Subject: [PATCH] * finished missing/full rejoin * CInode STATE_REJOINUNDEF * renamed some rejoin_* functions to be more consistent * fixed newsyn/fakesyn mon_stop_on_last_unmount defaultiness git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1501 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/client/Client.cc | 3 +- .../sage/cephmds2/client/SyntheticClient.cc | 2 +- branches/sage/cephmds2/fakesyn.cc | 9 +- branches/sage/cephmds2/mds/CInode.h | 12 +- branches/sage/cephmds2/mds/MDCache.cc | 206 +++++++++++------- branches/sage/cephmds2/mds/MDCache.h | 10 +- branches/sage/cephmds2/mds/MDS.cc | 9 +- branches/sage/cephmds2/newsyn.cc | 8 +- 8 files changed, 161 insertions(+), 98 deletions(-) diff --git a/branches/sage/cephmds2/client/Client.cc b/branches/sage/cephmds2/client/Client.cc index e28e71a3047ed..69c3c4e104e46 100644 --- a/branches/sage/cephmds2/client/Client.cc +++ b/branches/sage/cephmds2/client/Client.cc @@ -517,7 +517,7 @@ int Client::choose_target_mds(MClientRequest *req) if (mds < 0) mds = 0; if (0) { - mds = 1; + mds = 0; dout(0) << "hack: sending all requests to mds" << mds << endl; } } else { @@ -607,6 +607,7 @@ MClientReply *Client::make_request(MClientRequest *req, // open a session? if (mds_sessions.count(mds) == 0) { Cond cond; + if (waiting_for_session.count(mds) == 0) { dout(10) << "opening session to mds" << mds << endl; messenger->send_message(new MClientSession(MClientSession::OP_REQUEST_OPEN), diff --git a/branches/sage/cephmds2/client/SyntheticClient.cc b/branches/sage/cephmds2/client/SyntheticClient.cc index f483dc48c8767..2beb06bea125d 100644 --- a/branches/sage/cephmds2/client/SyntheticClient.cc +++ b/branches/sage/cephmds2/client/SyntheticClient.cc @@ -1370,7 +1370,7 @@ void SyntheticClient::foo() int c = rand() % s; char src[80]; sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - int fd = client->open(src, O_RDWR); + int fd = client->open(src, O_RDONLY); } return; diff --git a/branches/sage/cephmds2/fakesyn.cc b/branches/sage/cephmds2/fakesyn.cc index 75f8917d9cdc9..e4256db646abb 100644 --- a/branches/sage/cephmds2/fakesyn.cc +++ b/branches/sage/cephmds2/fakesyn.cc @@ -58,6 +58,10 @@ int main(int argc, char **argv) vector args; argv_to_vec(argc, argv, args); + // stop on our own (by default) + g_conf.mon_stop_on_last_unmount = true; + g_conf.mon_stop_with_last_mds = true; + parse_config_options(args); int start = 0; @@ -80,11 +84,6 @@ int main(int argc, char **argv) if (g_conf.clock_tare) g_clock.tare(); - // stop on our own - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - - MonMap *monmap = new MonMap(g_conf.num_mon); entity_addr_t a; for (int i=0; i rejoins; @@ -1639,7 +1639,7 @@ void MDCache::send_cache_rejoins() if (auth == mds->get_nodeid()) continue; // skip my own regions! if (rejoins.count(auth) == 0) continue; // don't care about this node's regions - cache_rejoin_walk(dir, rejoins[auth]); + rejoin_walk(dir, rejoins[auth]); } if (!mds->is_rejoin()) { @@ -1705,7 +1705,7 @@ void MDCache::send_cache_rejoins() /** - * cache_rejoin_walk - build rejoin declarations for a subtree + * rejoin_walk - build rejoin declarations for a subtree * * @dir subtree root * @rejoin rejoin message @@ -1719,9 +1719,9 @@ void MDCache::send_cache_rejoins() * strong dentries (no connectivity!) * strong inodes */ -void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) +void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) { - dout(10) << "cache_rejoin_walk " << *dir << endl; + dout(10) << "rejoin_walk " << *dir << endl; list nested; // finish this dir, then do nested items @@ -1771,7 +1771,7 @@ void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) for (list::iterator p = nested.begin(); p != nested.end(); ++p) - cache_rejoin_walk(*p, rejoin); + rejoin_walk(*p, rejoin); } @@ -1961,10 +1961,9 @@ void MDCache::handle_cache_rejoin_weak_rejoin(MMDSCacheRejoin *weak) // done? rejoin_gather.erase(from); if (rejoin_gather.empty()) { - dout(7) << "got all rejoins, sending acks!" << endl; rejoin_gather_finish(); } else { - dout(7) << "still need rejoin from " << rejoin_gather << endl; + dout(7) << "still need rejoin from (" << rejoin_gather << ")" << endl; } } } @@ -2083,6 +2082,18 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack) } +CInode *MDCache::rejoin_invent_inode(inodeno_t ino) +{ + CInode *in = new CInode(this); + memset(&in->inode, 0, sizeof(inode_t)); + in->inode.ino = ino; + in->state_set(CInode::STATE_REJOINUNDEF); + add_inode(in); + rejoin_undef_inodes.insert(in); + dout(10) << " invented " << *in << endl; + return in; +} + void MDCache::handle_cache_rejoin_strong_rejoin(MMDSCacheRejoin *strong) { @@ -2100,9 +2111,13 @@ void MDCache::handle_cache_rejoin_strong_rejoin(MMDSCacheRejoin *strong) ++p) { CDir *dir = get_dirfrag(p->first); if (!dir) { - dout(10) << " missing " << p->first << endl; - if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); - missing->add_weak_dirfrag(p->first); + CInode *in = get_inode(p->first.ino); + if (!in) in = rejoin_invent_inode(p->first.ino); + if (!in->is_dir()) { + assert(in->state_test(CInode::STATE_REJOINUNDEF)); + in->inode.mode = INODE_MODE_DIR; + } + dir = in->get_or_open_dirfrag(this, p->first.frag); } else { dir->add_replica(from); dout(10) << " have " << *dir << endl; @@ -2111,13 +2126,22 @@ void MDCache::handle_cache_rejoin_strong_rejoin(MMDSCacheRejoin *strong) for (map::iterator q = strong->strong_dentries[p->first].begin(); q != strong->strong_dentries[p->first].end(); ++q) { - CDentry *dn; - if (dir) dn = dir->lookup(q->first); - if (!dir || !dn) { - dout(10) << " missing " << p->first << " " << q->first << endl; - if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); - missing->add_weak_null_dentry(p->first, q->first); // doesn't matter what kind for a missing decl. - continue; + CDentry *dn = dir->lookup(q->first); + if (!dn) { + if (q->second.is_remote()) { + dn = dir->add_dentry(q->first, q->second.remote_ino); + } else if (q->second.is_null()) { + dn = dir->add_dentry(q->first); + } else { + CInode *in = get_inode(q->second.ino); + if (!in) in = rejoin_invent_inode(q->second.ino); + dn = dir->add_dentry(q->first, in); + + dout(10) << " missing " << q->second.ino << endl; + if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); + missing->add_weak_inode(q->second.ino); // we want it back! + } + dout(10) << " invented " << *dn << endl; } // dn auth_pin? @@ -2159,10 +2183,11 @@ void MDCache::handle_cache_rejoin_strong_rejoin(MMDSCacheRejoin *strong) MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->ino()]; // caps_wanted - if (is.caps_wanted) + if (is.caps_wanted) { in->mds_caps_wanted[from] = is.caps_wanted; - else - in->mds_caps_wanted.erase(from); + dout(15) << " inode caps_wanted " << cap_string(is.caps_wanted) + << " on " << *in << endl; + } // scatterlock? if (is.dirlock == LOCK_SCATTER || @@ -2212,10 +2237,9 @@ void MDCache::handle_cache_rejoin_strong_rejoin(MMDSCacheRejoin *strong) // done? rejoin_gather.erase(from); if (rejoin_gather.empty()) { - dout(7) << "got all rejoins, sending acks!" << endl; rejoin_gather_finish(); } else { - dout(7) << "still need rejoin from " << rejoin_gather << endl; + dout(7) << "still need rejoin from (" << rejoin_gather << ")" << endl; } } } @@ -2271,10 +2295,12 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) // done? rejoin_ack_gather.erase(from); if (mds->is_rejoin() && + rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. rejoin_ack_gather.empty()) { mds->rejoin_done(); } else { - dout(7) << "still need rejoin_ack from " << rejoin_ack_gather << endl; + dout(7) << "still need rejoin from (" << rejoin_gather << ")" + << ", rejoin_ack from (" << rejoin_ack_gather << ")" << endl; } } @@ -2356,37 +2382,6 @@ void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *missing) MMDSCacheRejoin *full = new MMDSCacheRejoin(MMDSCacheRejoin::OP_FULL); - // dirs - for (map >::iterator p = missing->weak.begin(); - p != missing->weak.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) { - dout(10) << " don't have dirfrag " << p->first << endl; - assert(0); - continue; // we must have trimmed it after the original rejoin? - } - - dout(10) << " in dir " << *dir << endl; - - // dentries - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn) { - dout(10) << " don't have dentry " << q->first << " in " << *dir << endl; - assert(0); - continue; // we must have trimmed it after our original rejoin - } - dout(10) << " sending " << *dn << endl; - full->add_strong_dentry(p->first, q->first, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - dn->get_replica_nonce(), dn->lock.get_state()); - } - } - // inodes for (set::iterator p = missing->weak_inodes.begin(); p != missing->weak_inodes.end(); @@ -2394,20 +2389,11 @@ void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *missing) CInode *in = get_inode(*p); if (!in) { dout(10) << " don't have inode " << *p << endl; - assert(0); //??? continue; // we must have trimmed it after the originalo rejoin } dout(10) << " sending " << *in << endl; full->add_full_inode(in->inode, in->symlink, in->dirfragtree); - full->add_strong_inode(in->ino(), - in->get_replica_nonce(), - in->get_caps_wanted(), - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); } mds->send_message_mds(full, missing->get_source().num(), MDS_PORT_CACHE); @@ -2418,21 +2404,86 @@ void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *full) dout(7) << "handle_cache_rejoin_full from " << full->get_source() << endl; int from = full->get_source().num(); + // integrate full inodes + for (list::iterator p = full->full_inodes.begin(); + p != full->full_inodes.end(); + ++p) { + CInode *in = get_inode(p->inode.ino); + assert(in); - assert(0); // write me - + set::iterator q = rejoin_undef_inodes.find(in); + if (q != rejoin_undef_inodes.end()) { + CInode *in = *q; + in->inode = p->inode; + in->symlink = p->symlink; + in->dirfragtree = p->dirfragtree; + in->state_clear(CInode::STATE_REJOINUNDEF); + dout(10) << " got full " << *in << endl; + rejoin_undef_inodes.erase(q); + } else { + dout(10) << " had full " << *in << endl; + } + } // done? rejoin_gather.erase(from); if (rejoin_gather.empty()) { - dout(7) << "got all rejoins|fulls, sending acks!" << endl; rejoin_gather_finish(); } else { - dout(7) << "still need rejoin from " << rejoin_gather << endl; + dout(7) << "still need rejoin from (" << rejoin_gather << ")" << endl; } } + +void MDCache::rejoin_trim_undef_inodes() +{ + dout(10) << "rejoin_trim_undef_inodes" << endl; + + set::iterator p = rejoin_undef_inodes.begin(); + while (p != rejoin_undef_inodes.end()) { + CInode *in = *p; + in->clear_replicas(); + + // close out dirfrags + if (in->is_dir()) { + list dfls; + in->get_dirfrags(dfls); + for (list::iterator p = dfls.begin(); + p != dfls.end(); + ++p) { + CDir *dir = *p; + dir->clear_replicas(); + + for (map::iterator p = dir->items.begin(); + p != dir->items.end(); + ++p) { + CDentry *dn = p->second; + dn->clear_replicas(); + + dout(10) << " trimming " << *dn << endl; + dir->remove_dentry(dn); + } + + dout(10) << " trimming " << *dir << endl; + in->close_dirfrag(dir->dirfrag().frag); + } + } + + CDentry *dn = in->get_parent_dn(); + if (dn) { + dn->clear_replicas(); + dout(10) << " trimming " << *dn << endl; + dn->dir->remove_dentry(dn); + } else { + dout(10) << " trimming " << *in << endl; + remove_inode(in); + } + } + + rejoin_undef_inodes.clear(); +} + class C_MDC_RejoinGatherFinish : public Context { MDCache *cache; public: @@ -2445,7 +2496,10 @@ public: void MDCache::rejoin_gather_finish() { dout(10) << "rejoin_gather_finish" << endl; - + assert(mds->is_rejoin()); + + rejoin_trim_undef_inodes(); + // fetch paths? if (!cap_import_paths.empty() && !parallel_fetch(cap_import_paths, new C_MDC_RejoinGatherFinish(this))) @@ -2469,10 +2523,14 @@ void MDCache::rejoin_gather_finish() rejoin_import_cap(in, q->first, r->second, r->first); } - // process all reconnected caps (that is, twiddle filelock, before we send acks) mds->server->process_reconnected_caps(); - send_cache_rejoin_acks(); + rejoin_send_acks(); + + // did we already get our acks too? + // this happens when the rejoin_gather has to wait on a MISSING/FULL exchange. + if (rejoin_ack_gather.empty()) + mds->rejoin_done(); } void MDCache::rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& icr, int frommds) @@ -2497,9 +2555,9 @@ void MDCache::rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& 0, MDS_PORT_CACHE); } -void MDCache::send_cache_rejoin_acks() +void MDCache::rejoin_send_acks() { - dout(7) << "send_cache_rejoin_acks" << endl; + dout(7) << "rejoin_send_acks" << endl; // send acks to everyone in the recovery set map ack; diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index bd14026fff7b0..0e979d87bf6f7 100644 --- a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -353,20 +353,24 @@ protected: map > > cap_imports; // ino -> client -> frommds -> capex map cap_import_paths; + + set rejoin_undef_inodes; - void cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); + void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); void handle_cache_rejoin(MMDSCacheRejoin *m); void handle_cache_rejoin_weak_rejoin(MMDSCacheRejoin *m); + CInode* rejoin_invent_inode(inodeno_t ino); void handle_cache_rejoin_strong_rejoin(MMDSCacheRejoin *m); void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack); void handle_cache_rejoin_ack(MMDSCacheRejoin *m); void handle_cache_rejoin_purge(MMDSCacheRejoin *m); void handle_cache_rejoin_missing(MMDSCacheRejoin *m); void handle_cache_rejoin_full(MMDSCacheRejoin *m); - void send_cache_rejoin_acks(); + void rejoin_send_acks(); + void rejoin_trim_undef_inodes(); public: void rejoin_gather_finish(); - void send_cache_rejoins(); + void rejoin_send_rejoins(); void rejoin_export_caps(inodeno_t ino, string& path, int client, inode_caps_reconnect_t& icr) { cap_exports[ino][client] = icr; cap_export_paths[ino] = path; diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc index 5faa99b989bf2..36a755249aac6 100644 --- a/branches/sage/cephmds2/mds/MDS.cc +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -276,14 +276,15 @@ void MDS::send_message_client_maybe_open(Message *m, entity_inst_t clientinst) { int client = clientinst.name.num(); if (!clientmap.have_session(client)) { + // no session! dout(10) << "send_message_client opening session with " << clientinst << endl; clientmap.add_opening(client); mdlog->submit_entry(new ESession(clientinst, true, clientmap.inc_projected()), new C_MDS_SendMessageClientSession(this, m, clientinst)); + } else { + // we have a session. + send_message_client(m, clientinst); } - - send_message_client(m, clientinst); - } @@ -937,7 +938,7 @@ void MDS::reconnect_done() void MDS::rejoin_joint_start() { dout(1) << "rejoin_joint_start" << endl; - mdcache->send_cache_rejoins(); + mdcache->rejoin_send_rejoins(); } void MDS::rejoin_done() { diff --git a/branches/sage/cephmds2/newsyn.cc b/branches/sage/cephmds2/newsyn.cc index dc967ada1caaf..5eadd85bfdb6f 100644 --- a/branches/sage/cephmds2/newsyn.cc +++ b/branches/sage/cephmds2/newsyn.cc @@ -174,6 +174,10 @@ int main(int argc, char **argv) args.swap(nargs); } + // stop on our own (by default) + g_conf.mon_stop_on_last_unmount = true; + g_conf.mon_stop_with_last_mds = true; + parse_config_options(args); parse_syn_options(args); @@ -188,10 +192,6 @@ int main(int argc, char **argv) g_conf.num_client = intabs(g_conf.num_client); g_conf.num_osd = intabs(g_conf.num_osd); - // stop on our own - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - if (g_conf.kill_after) g_timer.add_event_after(g_conf.kill_after, new C_Die); -- 2.39.5