#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"
+#include "messages/MMDSOpenIno.h"
+#include "messages/MMDSOpenInoReply.h"
+
#include "messages/MClientRequest.h"
#include "messages/MClientCaps.h"
#include "messages/MClientSnap.h"
}
kick_find_ino_peers(who);
+ kick_open_ino_peers(who);
show_subtrees();
}
}
kick_discovers(who);
-
+ kick_open_ino_peers(who);
kick_find_ino_peers(who);
// queue them up.
case MSG_MDS_FINDINOREPLY:
handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
break;
+
+ case MSG_MDS_OPENINO:
+ handle_open_ino(static_cast<MMDSOpenIno *>(m));
+ break;
+ case MSG_MDS_OPENINOREPLY:
+ handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
+ break;
default:
dout(7) << "cache unknown message " << m->get_type() << dendl;
}
+// -------------------------------------------------------------------------------
+// Open inode by inode number
+
+class C_MDC_OpenInoBacktraceFetched : public Context {
+ MDCache *cache;
+ inodeno_t ino;
+ public:
+ bufferlist bl;
+ C_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
+ cache(c), ino(i) {}
+ void finish(int r) {
+ cache->_open_ino_backtrace_fetched(ino, bl, r);
+ }
+};
+
+struct C_MDC_OpenInoTraverseDir : public Context {
+ MDCache *cache;
+ inodeno_t ino;
+ public:
+ C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i) : cache(c), ino(i) {}
+ void finish(int r) {
+ assert(cache->opening_inodes.count(ino));
+ cache->_open_ino_traverse_dir(ino, cache->opening_inodes[ino], r);
+ }
+};
+
+struct C_MDC_OpenInoParentOpened : public Context {
+ MDCache *cache;
+ inodeno_t ino;
+ public:
+ C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : cache(c), ino(i) {}
+ void finish(int r) {
+ cache->_open_ino_parent_opened(ino, r);
+ }
+};
+
+void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
+{
+ dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
+
+ assert(opening_inodes.count(ino));
+ open_ino_info_t& info = opening_inodes[ino];
+
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " found cached " << *in << dendl;
+ open_ino_finish(ino, info, in->authority().first);
+ return;
+ }
+
+ inode_backtrace_t backtrace;
+ if (err == 0) {
+ ::decode(backtrace, bl);
+ if (backtrace.pool != info.pool) {
+ dout(10) << " old object in pool " << info.pool
+ << ", retrying pool " << backtrace.pool << dendl;
+ info.pool = backtrace.pool;
+ C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino);
+ fetch_backtrace(ino, info.pool, fin->bl, fin);
+ return;
+ }
+ } else if (err == -ENOENT) {
+ int64_t meta_pool = mds->mdsmap->get_metadata_pool();
+ if (info.pool != meta_pool) {
+ dout(10) << " no object in pool " << info.pool
+ << ", retrying pool " << meta_pool << dendl;
+ info.pool = meta_pool;
+ C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino);
+ fetch_backtrace(ino, info.pool, fin->bl, fin);
+ return;
+ }
+ }
+
+ if (err == 0) {
+ if (backtrace.ancestors.empty()) {
+ dout(10) << " got empty backtrace " << dendl;
+ err = -EIO;
+ } else if (!info.ancestors.empty()) {
+ if (info.ancestors[0] == backtrace.ancestors[0]) {
+ dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
+ err = -EINVAL;
+ }
+ }
+ }
+ if (err) {
+ dout(10) << " failed to open ino " << ino << dendl;
+ open_ino_finish(ino, info, err);
+ return;
+ }
+
+ dout(10) << " got backtrace " << backtrace << dendl;
+ info.ancestors = backtrace.ancestors;
+
+ _open_ino_traverse_dir(ino, info, 0);
+}
+
+void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
+{
+ dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
+
+ assert(opening_inodes.count(ino));
+ open_ino_info_t& info = opening_inodes[ino];
+
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " found cached " << *in << dendl;
+ open_ino_finish(ino, info, in->authority().first);
+ return;
+ }
+
+ if (ret == mds->get_nodeid()) {
+ _open_ino_traverse_dir(ino, info, 0);
+ } else {
+ if (ret >= 0) {
+ info.check_peers = true;
+ info.auth_hint = ret;
+ info.checked.erase(ret);
+ }
+ do_open_ino(ino, info, ret);
+ }
+}
+
+Context* MDCache::_open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m)
+{
+ if (m)
+ return new C_MDS_RetryMessage(mds, m);
+ else
+ return new C_MDC_OpenInoTraverseDir(this, ino);
+}
+
+void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
+{
+ dout(10) << "_open_ino_trvserse_dir ino " << ino << " ret " << ret << dendl;
+
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " found cached " << *in << dendl;
+ open_ino_finish(ino, info, in->authority().first);
+ return;
+ }
+
+ if (ret) {
+ do_open_ino(ino, info, ret);
+ return;
+ }
+
+ int hint = info.auth_hint;
+ ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
+ info.discover, info.want_xlocked, &hint);
+ if (ret > 0)
+ return;
+ if (hint != mds->get_nodeid())
+ info.auth_hint = hint;
+ do_open_ino(ino, info, ret);
+}
+
+int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
+ vector<inode_backpointer_t>& ancestors,
+ bool discover, bool want_xlocked, int *hint)
+{
+ dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
+ int err = 0;
+ for (unsigned i = 0; i < ancestors.size(); i++) {
+ CInode *diri = get_inode(ancestors[i].dirino);
+
+ if (!diri) {
+ if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
+ open_foreign_mdsdir(ancestors[i].dirino, _open_ino_get_waiter(ino, m));
+ return 1;
+ }
+ continue;
+ }
+
+ if (!diri->is_dir()) {
+ dout(10) << " " << *diri << " is not dir" << dendl;
+ if (i == 0)
+ err = -ENOTDIR;
+ break;
+ }
+
+ string &name = ancestors[i].dname;
+ frag_t fg = diri->pick_dirfrag(name);
+ CDir *dir = diri->get_dirfrag(fg);
+ if (!dir) {
+ if (diri->is_auth()) {
+ if (diri->is_frozen()) {
+ dout(10) << " " << *diri << " is frozen, waiting " << dendl;
+ diri->add_waiter(CDir::WAIT_UNFREEZE, _open_ino_get_waiter(ino, m));
+ return 1;
+ }
+ dir = diri->get_or_open_dirfrag(this, fg);
+ } else if (discover) {
+ open_remote_dirfrag(diri, fg, _open_ino_get_waiter(ino, m));
+ return 1;
+ }
+ }
+ if (dir) {
+ inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
+ if (dir->is_auth()) {
+ CDentry *dn = dir->lookup(name);
+ CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
+
+ if (!dnl && !dir->is_complete() &&
+ (!dir->has_bloom() || dir->is_in_bloom(name))) {
+ dout(10) << " fetching incomplete " << *dir << dendl;
+ dir->fetch(_open_ino_get_waiter(ino, m));
+ return 1;
+ }
+
+ dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
+ if (i == 0)
+ err = -ENOENT;
+ } else if (discover) {
+ discover_ino(dir, next_ino, _open_ino_get_waiter(ino, m),
+ (i == 0 && want_xlocked));
+ return 1;
+ }
+ }
+ if (hint && i == 0)
+ *hint = dir ? dir->authority().first : diri->authority().first;
+ break;
+ }
+ return err;
+}
+
+void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
+{
+ dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
+
+ finish_contexts(g_ceph_context, info.waiters, ret);
+ opening_inodes.erase(ino);
+}
+
+void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
+{
+ if (err < 0) {
+ info.checked.clear();
+ info.checked.insert(mds->get_nodeid());
+ info.checking = -1;
+ info.check_peers = true;
+ info.fetch_backtrace = true;
+ if (info.discover) {
+ info.discover = false;
+ info.ancestors.clear();
+ }
+ }
+
+ if (info.check_peers) {
+ info.check_peers = false;
+ info.checking = -1;
+ do_open_ino_peer(ino, info);
+ } else if (info.fetch_backtrace) {
+ info.check_peers = true;
+ info.fetch_backtrace = false;
+ info.checking = mds->get_nodeid();
+ info.checked.clear();
+ info.checked.insert(mds->get_nodeid());
+ C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino);
+ fetch_backtrace(ino, info.pool, fin->bl, fin);
+ } else {
+ assert(!info.ancestors.empty());
+ info.checking = mds->get_nodeid();
+ open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
+ new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
+ }
+}
+
+void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
+{
+ set<int> all, active;
+ mds->mdsmap->get_mds_set(all);
+ mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
+ if (mds->get_state() == MDSMap::STATE_REJOIN)
+ mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
+
+ dout(10) << "do_open_ino_peer " << ino << " active " << active
+ << " all " << all << " checked " << info.checked << dendl;
+
+ int peer = -1;
+ if (info.auth_hint >= 0) {
+ if (active.count(info.auth_hint)) {
+ peer = info.auth_hint;
+ info.auth_hint = -1;
+ }
+ } else {
+ for (set<int>::iterator p = active.begin(); p != active.end(); ++p)
+ if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
+ peer = *p;
+ break;
+ }
+ }
+ if (peer < 0) {
+ if (all.size() > active.size() && all != info.checked) {
+ dout(10) << " waiting for more peers to be active" << dendl;
+ } else {
+ dout(10) << " all MDS peers have been checked " << dendl;
+ do_open_ino(ino, info, 0);
+ }
+ } else {
+ info.checking = peer;
+ mds->send_message_mds(new MMDSOpenIno(info.tid, ino, info.ancestors), peer);
+ }
+}
+
+void MDCache::handle_open_ino(MMDSOpenIno *m)
+{
+ dout(10) << "handle_open_ino " << *m << dendl;
+
+ inodeno_t ino = m->ino;
+ MMDSOpenInoReply *reply;
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " have " << *in << dendl;
+ reply = new MMDSOpenInoReply(m->get_tid(), ino, 0);
+ if (in->is_auth()) {
+ touch_inode(in);
+ while (1) {
+ CDentry *pdn = in->get_parent_dn();
+ if (!pdn)
+ break;
+ CInode *diri = pdn->get_dir()->get_inode();
+ reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name,
+ in->inode.version));
+ in = diri;
+ }
+ } else {
+ reply->hint = in->authority().first;
+ }
+ } else {
+ int hint = -1;
+ int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
+ if (ret > 0)
+ return;
+ reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
+ }
+ mds->messenger->send_message(reply, m->get_connection());
+ m->put();
+}
+
+void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
+{
+ dout(10) << "handle_open_ino_reply " << *m << dendl;
+
+ inodeno_t ino = m->ino;
+ int from = m->get_source().num();
+ if (opening_inodes.count(ino)) {
+ open_ino_info_t& info = opening_inodes[ino];
+
+ if (info.checking == from)
+ info.checking = -1;
+ info.checked.insert(from);
+
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " found cached " << *in << dendl;
+ open_ino_finish(ino, info, in->authority().first);
+ } else if (!m->ancestors.empty()) {
+ dout(10) << " found ino " << ino << " on mds." << from << dendl;
+ if (!info.want_replica) {
+ open_ino_finish(ino, info, from);
+ return;
+ }
+
+ info.ancestors = m->ancestors;
+ info.auth_hint = from;
+ info.checking = mds->get_nodeid();
+ info.discover = true;
+ _open_ino_traverse_dir(ino, info, 0);
+ } else if (m->error) {
+ dout(10) << " error " << m->error << " from mds." << from << dendl;
+ do_open_ino(ino, info, m->error);
+ } else {
+ if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
+ info.auth_hint = m->hint;
+ info.checked.erase(m->hint);
+ }
+ do_open_ino_peer(ino, info);
+ }
+ }
+ m->put();
+}
+
+void MDCache::kick_open_ino_peers(int who)
+{
+ dout(10) << "kick_open_ino_peers mds." << who << dendl;
+
+ for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
+ p != opening_inodes.end();
+ ++p) {
+ open_ino_info_t& info = p->second;
+ if (info.checking == who) {
+ dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
+ info.checking = -1;
+ do_open_ino_peer(p->first, info);
+ } else if (info.checking == -1) {
+ dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
+ do_open_ino_peer(p->first, info);
+ }
+ }
+}
+
+void MDCache::open_ino(inodeno_t ino, int64_t pool, Context* fin,
+ bool want_replica, bool want_xlocked)
+{
+ dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
+ << want_replica << dendl;
+
+ if (opening_inodes.count(ino)) {
+ open_ino_info_t& info = opening_inodes[ino];
+ if (want_replica) {
+ info.want_replica = true;
+ if (want_xlocked)
+ info.want_xlocked = true;
+ }
+ info.waiters.push_back(fin);
+ } else {
+ open_ino_info_t& info = opening_inodes[ino];
+ info.checked.insert(mds->get_nodeid());
+ info.want_replica = want_replica;
+ info.want_xlocked = want_xlocked;
+ info.tid = ++open_ino_last_tid;
+ info.pool = pool >= 0 ? pool : mds->mdsmap->get_first_data_pool();
+ info.waiters.push_back(fin);
+ do_open_ino(ino, info, 0);
+ }
+}
+
/* ---------------------------- */
/*