From 4977f3eab0cb265efeceeb02862c09e460549828 Mon Sep 17 00:00:00 2001 From: Sam Lang Date: Tue, 9 Apr 2013 10:35:19 -0500 Subject: [PATCH] mds: Delay export on missing inodes for reconnect The caps a client sends on reconnect may refer to inodes that are not in the inode cache until after clientreplay (when the client creates a new file, for example). Currently, we send an export for such a cap to the client if we don't find the inode in the cache and path_is_mine() returns false (for example, if the client didn't send a path because the file was already unlinked). Instead, we want to delay handling of the reconnect cap until clientreplay completes. This patch modifies handle_client_reconnect() so that we no longer assume a cap isn't ours just because we don't have an inode for it; instead, we defer its recovery until later. An export cap message is sent during reconnect only if the inode exists and the cap isn't ours (non-auth). If any recovered caps remain in the recovered list once the mds goes active, we send export messages for them at that point. Also, after removing the path_is_mine check, MDCache::parallel_fetch_traverse_dir() needs to skip non-auth dirfrags. Fixes #4451. 
Signed-off-by: Sam Lang Signed-off-by: Yan, Zheng Reviewed-by: Yan, Zheng Reviewed-by: Greg Farnum --- src/mds/Capability.h | 1 + src/mds/MDCache.cc | 45 ++++++++++++++++++++++++++++++++++++-------- src/mds/MDCache.h | 1 + src/mds/MDS.cc | 1 + src/mds/Server.cc | 15 ++++++--------- 5 files changed, 46 insertions(+), 17 deletions(-) diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 946afdc02b9..54d2312daeb 100644 --- a/src/mds/Capability.h +++ b/src/mds/Capability.h @@ -272,6 +272,7 @@ public: Export make_export() { return Export(_wanted, issued(), pending(), client_follows, mseq+1, last_issue_stamp); } + void rejoin_import() { mseq++; } void merge(Export& other) { // issued + pending int newpending = other.pending | pending(); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 3f090bb3238..3129ed7c267 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -4097,20 +4097,25 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, frag_t fg = cur->pick_dirfrag(path[i]); CDir *dir = cur->get_or_open_dirfrag(this, fg); CDentry *dn = dir->lookup(path[i]); - CDentry::linkage_t *dnl = dn->get_linkage(); - if (!dn || dnl->is_null()) { - if (!dir->is_complete()) { - // fetch dir - fetch_queue.insert(dir); - return false; - } else { + CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL; + + if (!dnl || dnl->is_null()) { + if (!dir->is_auth()) { + dout(10) << " not dirfrag auth " << *dir << dendl; + return true; + } + if (dnl || dir->is_complete()) { // probably because the client created it and held a cap but it never committed // to the journal, and the op hasn't replayed yet. dout(5) << " dne (not created yet?) 
" << ino << " at " << path << dendl; missing.insert(ino); return true; } + // fetch dir + fetch_queue.insert(dir); + return false; } + cur = dnl->get_inode(); if (!cur) { assert(dnl->is_remote()); @@ -5041,8 +5046,32 @@ void MDCache::rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconn Capability *cap = in->reconnect_cap(client, icr, session); - if (frommds >= 0) + if (frommds >= 0) { + cap->rejoin_import(); do_cap_import(session, in, cap); + } +} + +void MDCache::export_remaining_imported_caps() +{ + dout(10) << "export_remaining_imported_caps" << dendl; + + for (map > >::iterator p = cap_imports.begin(); + p != cap_imports.end(); + ++p) { + for (map >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); + if (session) { + // mark client caps stale. + MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0); + mds->send_message_client_counted(stale, q->first); + } + } + } + + cap_imports.clear(); } void MDCache::try_reconnect_cap(CInode *in, Session *session) diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 73780e26892..d837586a3ac 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -486,6 +486,7 @@ public: void rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconnect& icr, int frommds); void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq); void try_reconnect_cap(CInode *in, Session *session); + void export_remaining_imported_caps(); // cap imports. delayed snap parent opens. 
// realm inode -> client -> cap inodes needing to split to this realm diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 3b3b2d6dc2e..935fb0c417e 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1504,6 +1504,7 @@ void MDS::active_start() mdcache->clean_open_file_lists(); mdcache->scan_stray_dir(); + mdcache->export_remaining_imported_caps(); finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters finish_contexts(g_ceph_context, waiting_for_active); // kick waiters } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index dc7ea23f763..11ab834d856 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -574,7 +574,7 @@ void Server::handle_client_reconnect(MClientReconnect *m) // notify client of success with an OPEN mds->messenger->send_message(new MClientSession(CEPH_SESSION_OPEN), m->get_connection()); - + if (session->is_closed()) { dout(10) << " session is closed, will make best effort to reconnect " << m->get_source_inst() << dendl; @@ -636,15 +636,12 @@ void Server::handle_client_reconnect(MClientReconnect *m) } filepath path(p->second.path, (uint64_t)p->second.capinfo.pathbase); - if ((in && !in->is_auth()) || - !mds->mdcache->path_is_mine(path)) { + if (in && !in->is_auth()) { // not mine. dout(0) << "non-auth " << p->first << " " << path << ", will pass off to authority" << dendl; // mark client caps stale. - inode_t fake_inode; - fake_inode.ino = p->first; MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0); //stale->head.migrate_seq = 0; // FIXME ****** mds->send_message_client_counted(stale, session); @@ -652,11 +649,11 @@ void Server::handle_client_reconnect(MClientReconnect *m) // add to cap export list. mdcache->rejoin_export_caps(p->first, from, p->second); } else { - // mine. fetch later. + // don't know if the inode is mine dout(0) << "missing " << p->first << " " << path - << " (mine), will load later" << dendl; - mdcache->rejoin_recovered_caps(p->first, from, p->second, - -1); // "from" me. 
+ << " will load or export later" << dendl; + mdcache->rejoin_recovered_caps(p->first, from, p->second, -1); + mdcache->rejoin_export_caps(p->first, from, p->second); } } -- 2.47.3