From 52aed2a12704c5b3673e353be99c7f2f71073ce5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 18 Jul 2008 14:04:26 -0700 Subject: [PATCH] mds: separate snaprealm creation from snap creation --- src/TODO | 4 ++ src/client/Client.cc | 27 +++++------- src/mds/MDCache.cc | 100 +++++++++++++++++++++++++++++++++++++++++- src/mds/MDCache.h | 5 +++ src/mds/Server.cc | 65 ++++++++------------------- src/mds/SnapClient.h | 8 ++++ src/mds/SnapServer.cc | 25 ++++++++--- src/mds/SnapServer.h | 3 ++ 8 files changed, 169 insertions(+), 68 deletions(-) diff --git a/src/TODO b/src/TODO index 5b29743a8dee3..1553792977dd3 100644 --- a/src/TODO +++ b/src/TODO @@ -83,6 +83,8 @@ mds -> i think cache expires are fine; the rejoin_ack handler just has to behave if rejoining items go missing - try_remove_unlinked_dn thing +- rename: importing inode... also journal imported client map? + - rerun destro trace against latest, with various journal lengths - lease length heuristics @@ -243,6 +245,8 @@ todo primary dir link -> multiversion inode remote link -> multiversion inode +** HRM, how to cope with split notifications from multiple mds's racing to client... + - for simplicity, don't replicate any snapshot data. - need rrealms in fraginfo_t diff --git a/src/client/Client.cc b/src/client/Client.cc index 67dacef7077f3..a6891d9627561 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -1629,13 +1629,7 @@ void Client::handle_snap(MClientSnap *m) if (m->split) { SnapRealm *realm = get_snap_realm(m->split); realm->created = m->snap_created; - if (realm->snaps.empty()) { - // new split.. pretend we have one less snap than we do now! - vector& newsnaps = m->realms[m->split]; - realm->snaps.resize(newsnaps.size() - 1); - for (unsigned i=0; isnaps.size(); i++) - realm->snaps[i] = newsnaps[i+1]; - } + realm->snaps = m->realms[m->split]; dout(10) << " splitting off " << *realm << dendl; for (list::iterator p = m->split_inos.begin(); p != m->split_inos.end(); @@ -1660,15 +1654,16 @@ void Client::handle_snap(MClientSnap *m) } } put_snap_realm(realm); - } - - for (map >::iterator p = m->realms.begin(); - p != m->realms.end(); - p++) { - dout(10) << "realm " << p->first << " snaps " << p->second << dendl; - SnapRealm *realm = get_snap_realm(p->first); - maybe_update_snaprealm(realm, 0, m->snap_highwater, p->second); - put_snap_realm(realm); + } else { + // regular update + for (map >::iterator p = m->realms.begin(); + p != m->realms.end(); + p++) { + dout(10) << "realm " << p->first << " snaps " << p->second << dendl; + SnapRealm *realm = get_snap_realm(p->first); + maybe_update_snaprealm(realm, 0, m->snap_highwater, p->second); + put_snap_realm(realm); + } } delete m; } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 0eaf627d3fdc8..756ac1b89f8ca 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -20,9 +20,11 @@ #include "Locker.h" #include "MDLog.h" #include "MDBalancer.h" -#include "AnchorClient.h" #include "Migrator.h" +#include "AnchorClient.h" +#include "SnapClient.h" + #include "MDSMap.h" #include "CInode.h" @@ -66,6 +68,7 @@ #include "messages/MClientRequest.h" #include "messages/MClientFileCaps.h" +#include "messages/MClientSnap.h" #include "messages/MMDSSlaveRequest.h" @@ -5283,6 +5286,101 @@ void MDCache::_anchor_logged(CInode *in, version_t atid, Mutation *mut) } +// ------------------------------------------------------------------------------- +// SNAPREALMS + +struct C_MDC_snaprealm_create_finish : public Context { + MDCache *cache; + MDRequest *mdr; + CInode *in; + C_MDC_snaprealm_create_finish(MDCache *c, MDRequest *m, CInode *i) : cache(c), mdr(m), in(i) {} + void finish(int r) { + cache->_snaprealm_create_finish(mdr, in); + } +}; + +void MDCache::snaprealm_create(MDRequest *mdr, CInode *in) +{ + dout(10) << "snaprealm_create " << *in << dendl; + assert(!in->snaprealm); + + if (!in->inode.anchored) { + mds->mdcache->anchor_create(mdr, in, new C_MDS_RetryRequest(mds->mdcache, mdr)); + return; + } + + // allocate an id.. + if (!mdr->more()->stid) { + mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, + new C_MDS_RetryRequest(this, mdr)); + return; + } + + EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create"); + le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid); + + inode_t *pi = in->project_inode(); + pi->version = in->pre_dirty(); + + SnapRealm t(this, in); + t.created = mdr->more()->stid; + bufferlist snapbl; + ::encode(t, snapbl); + + journal_cow_inode(&le->metablob, in); + le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, 0, pi, 0, &snapbl); + + mds->mdlog->submit_entry(le, new C_MDC_snaprealm_create_finish(this, mdr, in)); +} + +void MDCache::_snaprealm_create_finish(MDRequest *mdr, CInode *in) +{ + dout(10) << "_snaprealm_create_finish " << *in << dendl; + + in->pop_and_dirty_projected_inode(mdr->ls); + mdr->apply(); + + // create + in->open_snaprealm(); + in->snaprealm->created = mdr->more()->stid; + + // split existing caps + SnapRealm *parent = in->snaprealm->parent; + assert(parent); + assert(parent->open_children.count(in->snaprealm)); + parent->split_at(in->snaprealm); + + // notify clients of update|split + list split_inos; + for (xlist::iterator p = in->snaprealm->inodes_with_caps.begin(); !p.end(); ++p) + split_inos.push_back((*p)->ino()); + + const vector snaps = in->snaprealm->get_snap_vector(); + map updates; + for (map >::iterator p = in->snaprealm->client_caps.begin(); + p != in->snaprealm->client_caps.end(); + p++) { + assert(!p->second.empty()); + MClientSnap *update = updates[p->first] = new MClientSnap; + update->snap_created = in->snaprealm->created; + update->split = in->ino(); + update->split_inos = split_inos; + update->realms[in->ino()] = snaps; + updates[p->first] = update; + } + + // send + for (map::iterator p = updates.begin(); + p != updates.end(); + p++) + mds->send_message_client(p->second, p->first); + + // done. + mdr->more()->stid = 0; // caller will likely need to reuse this + dispatch_request(mdr); +} + + // ------------------------------------------------------------------------------- // STRAYS diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index a4d42b861679c..75bf39fb4a471 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -785,6 +785,11 @@ protected: friend class C_MDC_AnchorPrepared; friend class C_MDC_AnchorLogged; + // -- snaprealms -- +public: + void snaprealm_create(MDRequest *mdr, CInode *in); + void _snaprealm_create_finish(MDRequest *mdr, CInode *in); + // -- stray -- public: void eval_stray(CDentry *dn); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 984a9ef4b7cba..fa2ec9cbef81c 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -3255,6 +3255,13 @@ void Server::handle_client_rename(MDRequest *mdr) return; } + // moving between snaprealms? + if (!srci->snaprealm && + srci->find_snaprealm() != destdn->dir->inode->find_snaprealm()) { + dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl; + mds->mdcache->snaprealm_create(mdr, srci); + return; + } // set done_locking flag, to avoid problems with wrlock moving auth target mdr->done_locking = true; @@ -4886,15 +4893,16 @@ void Server::handle_client_mksnap(MDRequest *mdr) return; } - // anchor diri - if (!diri->inode.anchored) { - mds->mdcache->anchor_create(mdr, diri, new C_MDS_RetryRequest(mds->mdcache, mdr)); - return; - } - if (mdr->now == utime_t()) mdr->now = g_clock.now(); + + // create snaprealm? + if (!diri->snaprealm) { + mds->mdcache->snaprealm_create(mdr, diri); + return; + } + // allocate a snapid if (!mdr->more()->stid) { // prepare an stid @@ -4927,16 +4935,9 @@ void Server::handle_client_mksnap(MDRequest *mdr) // project the snaprealm bufferlist snapbl; - if (diri->snaprealm) { - diri->snaprealm->snaps[snapid] = info; - diri->encode_snap_blob(snapbl); - diri->snaprealm->snaps.erase(snapid); - } else { - SnapRealm t(mdcache, diri); - t.created = snapid; - t.snaps[snapid] = info; - ::encode(t, snapbl); - } + diri->snaprealm->snaps[snapid] = info; + diri->encode_snap_blob(snapbl); + diri->snaprealm->snaps.erase(snapid); le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), true, 0, pi, 0, &snapbl); mdlog->submit_entry(le, new C_MDS_mksnap_finish(mds, mdr, diri, info)); @@ -4951,33 +4952,11 @@ void Server::_mksnap_finish(MDRequest *mdr, CInode *diri, SnapInfo &info) mds->snapclient->commit(mdr->more()->stid, mdr->ls); - snapid_t snapid = info.snapid; - - // create realm? - inodeno_t split_parent = 0; - if (!diri->snaprealm) { - dout(10) << "creating snaprealm on " << *diri << dendl; - diri->open_snaprealm(); - diri->snaprealm->created = snapid; - - // split existing caps - SnapRealm *parent = diri->snaprealm->parent; - assert(parent); - assert(parent->open_children.count(diri->snaprealm)); - parent->split_at(diri->snaprealm); - split_parent = parent->inode->ino(); - } - // create snap + snapid_t snapid = info.snapid; diri->snaprealm->snaps[snapid] = info; dout(10) << "snaprealm now " << *diri->snaprealm << dendl; - // notify clients of update|split - list split_inos; - if (split_parent) - for (xlist::iterator p = diri->snaprealm->inodes_with_caps.begin(); !p.end(); ++p) - split_inos.push_back((*p)->ino()); - list realms; map updates; list q; @@ -5003,17 +4982,11 @@ void Server::_mksnap_finish(MDRequest *mdr, CInode *diri, SnapInfo &info) if (!update) { update = new MClientSnap; update->snap_highwater = snapid; - if (split_parent) { - update->snap_created = diri->snaprealm->created; - update->split = diri->ino(); - update->split_inos = split_inos; - split_parent = 0; - } updates[p->first] = update; } update->realms[realm->inode->ino()] = snapvec; } - + // notify for active children, too. dout(10) << " " << realm << " open_children are " << realm->open_children << dendl; for (set::iterator p = realm->open_children.begin(); diff --git a/src/mds/SnapClient.h b/src/mds/SnapClient.h index afaa33b7c1cca..5917e509bda6d 100644 --- a/src/mds/SnapClient.h +++ b/src/mds/SnapClient.h @@ -39,6 +39,14 @@ public: ::encode(stamp, bl); _prepare(bl, pstid, onfinish); } + + void prepare_create_realm(inodeno_t ino, version_t *pstid, Context *onfinish) { + bufferlist bl; + __u32 op = TABLE_OP_CREATE; + ::encode(op, bl); + ::encode(ino, bl); + _prepare(bl, pstid, onfinish); + } }; #endif diff --git a/src/mds/SnapServer.cc b/src/mds/SnapServer.cc index c238ee5fc152f..9f2a50328082f 100644 --- a/src/mds/SnapServer.cc +++ b/src/mds/SnapServer.cc @@ -72,12 +72,18 @@ void SnapServer::_prepare(bufferlist &bl, __u64 reqid, int bymds) switch (what) { case TABLE_OP_CREATE: { + version++; + SnapInfo info; ::decode(info.dirino, p); - ::decode(info.name, p); - ::decode(info.stamp, p); - info.snapid = ++version; - pending_create[version] = info; + if (!p.end()) { + ::decode(info.name, p); + ::decode(info.stamp, p); + info.snapid = version; + pending_create[version] = info; + } else { + pending_noop.insert(version); + } } break; @@ -116,7 +122,11 @@ void SnapServer::_commit(version_t tid) snaps.erase(pending_destroy[tid]); pending_destroy.erase(tid); } - else + else if (pending_noop.count(tid)) { + dout(7) << "commit " << tid << " noop" << dendl; + pending_noop.erase(tid); + } + else assert(0); // bump version. @@ -135,6 +145,11 @@ void SnapServer::_rollback(version_t tid) dout(7) << "rollback " << tid << " destroy " << pending_destroy[tid] << dendl; pending_destroy.erase(tid); } + + else if (pending_noop.count(tid)) { + dout(7) << "rollback " << tid << " noop" << dendl; + pending_noop.erase(tid); + } else assert(0); diff --git a/src/mds/SnapServer.h b/src/mds/SnapServer.h index 2c349d723b219..4b50c59f2678d 100644 --- a/src/mds/SnapServer.h +++ b/src/mds/SnapServer.h @@ -30,6 +30,7 @@ protected: map pending_create; map pending_destroy; + set pending_noop; public: SnapServer(MDS *m) : MDSTableServer(m, TABLE_SNAP) { } @@ -46,6 +47,7 @@ public: ::encode(pending_removal, bl); ::encode(pending_create, bl); ::encode(pending_destroy, bl); + ::encode(pending_noop, bl); ::encode(pending_for_mds, bl); } void decode_state(bufferlist::iterator& bl) { @@ -54,6 +56,7 @@ public: ::decode(pending_removal, bl); ::decode(pending_create, bl); ::decode(pending_destroy, bl); + ::decode(pending_noop, bl); ::decode(pending_for_mds, bl); } -- 2.39.5