From f842c95d92137610e361d44b889a4d4f885930f7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 25 Jul 2017 19:52:29 +0800 Subject: [PATCH] mds: explict notification for snap update Signed-off-by: "Yan, Zheng" --- src/mds/CInode.cc | 5 -- src/mds/MDCache.cc | 145 +++++++++++++++++++++++++++------- src/mds/MDCache.h | 5 +- src/mds/Server.cc | 15 +++- src/messages/MMDSSnapUpdate.h | 59 ++++++++++++++ src/msg/Message.cc | 5 ++ src/msg/Message.h | 2 +- 7 files changed, 198 insertions(+), 38 deletions(-) create mode 100644 src/messages/MMDSSnapUpdate.h diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 9ca121e9f13..fa02cf19ee2 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -1818,12 +1818,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl) decode(inode.version, p); decode(tm, p); if (inode.ctime < tm) inode.ctime = tm; - snapid_t seq = 0; - if (snaprealm) - seq = snaprealm->srnode.seq; decode_snap(p); - if (snaprealm && snaprealm->srnode.seq != seq) - mdcache->do_realm_invalidate_and_update_notify(this, seq ? CEPH_SNAP_OP_UPDATE:CEPH_SNAP_OP_SPLIT); } break; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 26b332a0b5e..d4ff9f90eec 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -94,6 +94,7 @@ #include "messages/MMDSSlaveRequest.h" #include "messages/MMDSFragmentNotify.h" +#include "messages/MMDSSnapUpdate.h" #include "messages/MGatherCaps.h" @@ -4950,6 +4951,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) // for sending cache expire message set isolated_inodes; set refragged_inodes; + list > updated_realms; // dirs for (map::iterator p = ack->strong_dirfrags.begin(); @@ -5098,7 +5100,14 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) CInode *in = get_inode(ino, last); assert(in); bufferlist::iterator q = basebl.begin(); + snapid_t sseq = 0; + if (in->snaprealm) + sseq = in->snaprealm->srnode.seq; in->_decode_base(q); + if (in->snaprealm && in->snaprealm->srnode.seq != sseq) { + int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT; + updated_realms.push_back(pair(in, snap_op)); + } dout(10) << " got inode base " << *in << dendl; } @@ -5160,11 +5169,26 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) assert(cap_exports[p->first].empty()); } + for (auto p : updated_realms) { + CInode *in = p.first; + bool notify_clients; + if (mds->is_rejoin()) { + if (!rejoin_pending_snaprealms.count(in)) { + in->get(CInode::PIN_OPENINGSNAPPARENTS); + rejoin_pending_snaprealms.insert(in); + } + notify_clients = false; + } else { + // notify clients if I'm survivor + notify_clients = true; + } + do_realm_invalidate_and_update_notify(in, p.second, notify_clients); + } + // done? assert(rejoin_ack_gather.count(from)); rejoin_ack_gather.erase(from); if (!survivor) { - if (rejoin_gather.empty()) { // eval unstable scatter locks after all wrlocks are rejoined. while (!rejoin_eval_locks.empty()) { @@ -5816,6 +5840,9 @@ void MDCache::open_snaprealms() } if (gather.has_subs()) { + // for multimds, must succeed the first time + assert(recovery_set.empty()); + dout(10) << "open_snaprealms - waiting for " << gather.num_subs_remaining() << dendl; gather.set_finisher(new C_MDC_OpenSnapRealms(this)); @@ -7824,6 +7851,10 @@ void MDCache::dispatch(Message *m) case MSG_MDS_OPENINOREPLY: handle_open_ino_reply(static_cast(m)); break; + + case MSG_MDS_SNAPUPDATE: + handle_snap_update(static_cast(m)); + break; default: derr << "cache unknown message " << m->get_type() << dendl; @@ -9362,28 +9393,29 @@ void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in) mds->mdlog->flush(); } - -void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend) +void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients) { dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl; vector split_inos; vector split_realms; + bufferlist snapbl; - if (snapop == CEPH_SNAP_OP_SPLIT) { - // notify clients of update|split - for (elist::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps)); - !p.end(); ++p) - split_inos.push_back((*p)->ino()); - - for (set::iterator p = in->snaprealm->open_children.begin(); - p != in->snaprealm->open_children.end(); - ++p) - split_realms.push_back((*p)->inode->ino()); - } + if (notify_clients) { + assert(in->snaprealm->have_past_parents_open()); + if (snapop == CEPH_SNAP_OP_SPLIT) { + // notify clients of update|split + for (elist::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps)); + !p.end(); ++p) + split_inos.push_back((*p)->ino()); - bufferlist snapbl; - in->snaprealm->build_snap_trace(snapbl); + for (set::iterator p = in->snaprealm->open_children.begin(); + p != in->snaprealm->open_children.end(); + ++p) + split_realms.push_back((*p)->inode->ino()); + } + in->snaprealm->build_snap_trace(snapbl); + } set past_children; map updates; @@ -9396,17 +9428,19 @@ void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool dout(10) << " realm " << *realm << " on " << *realm->inode << dendl; realm->invalidate_cached_snaps(); - for (map* >::iterator p = realm->client_caps.begin(); - p != realm->client_caps.end(); - ++p) { - assert(!p->second->empty()); - if (!nosend && updates.count(p->first) == 0) { - MClientSnap *update = new MClientSnap(snapop); - update->head.split = in->ino(); - update->split_inos = split_inos; - update->split_realms = split_realms; - update->bl = snapbl; - updates[p->first] = update; + if (notify_clients) { + for (map* >::iterator p = realm->client_caps.begin(); + p != realm->client_caps.end(); + ++p) { + assert(!p->second->empty()); + if (updates.count(p->first) == 0) { + MClientSnap *update = new MClientSnap(snapop); + update->head.split = in->ino(); + update->split_inos = split_inos; + update->split_realms = split_realms; + update->bl = snapbl; + updates[p->first] = update; + } } } @@ -9425,7 +9459,7 @@ void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool q.push_back(*p); } - if (!nosend) + if (notify_clients) send_snaps(updates); // notify past children and their descendants if we update/delete old snapshots @@ -9503,6 +9537,61 @@ void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CIno dispatch_request(mdr); } +void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op) +{ + dout(10) << __func__ << " " << *in << " stid " << stid << dendl; + + bufferlist snap_blob; + in->encode_snap(snap_blob); + + set mds_set; + mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE); + mds_set.erase(mds->get_nodeid()); + for (auto p : mds_set) { + MMDSSnapUpdate *m = new MMDSSnapUpdate(in->ino(), stid, snap_op); + m->snap_blob = snap_blob; + mds->send_message_mds(m, p); + } +} + +void MDCache::handle_snap_update(MMDSSnapUpdate *m) +{ + mds_rank_t from = mds_rank_t(m->get_source().num()); + dout(10) << __func__ << " " << *m << " from mds." << from << dendl; + + if (mds->get_state() < MDSMap::STATE_RESOLVE && + mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) { + m->put(); + return; + } + + mds->snapclient->notify_commit(m->get_tid()); + + CInode *in = get_inode(m->get_ino()); + if (in) { + assert(!in->is_auth()); + if (mds->get_state() > MDSMap::STATE_REJOIN || + (mds->is_rejoin() && !in->is_rejoining())) { + bufferlist::iterator p = m->snap_blob.begin(); + in->decode_snap(p); + + bool notify_clients = true; + if (mds->is_rejoin()) { + if (rejoin_done) { + // rejoin_done is non-null, it means open_snaprealms() hasn't been called + if (!rejoin_pending_snaprealms.count(in)) { + in->get(CInode::PIN_OPENINGSNAPPARENTS); + rejoin_pending_snaprealms.insert(in); + } + notify_clients = false; + } + } + do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients); + } + } + + m->put(); +} // ------------------------------------------------------------------------------- // STRAYS diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 515e1d30519..e6e966b7a75 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -61,6 +61,7 @@ struct MMDSFindIno; struct MMDSFindInoReply; struct MMDSOpenIno; struct MMDSOpenInoReply; +class MMDSSnapUpdate; class Message; class MClientRequest; @@ -653,7 +654,6 @@ public: void choose_lock_states_and_reconnect_caps(); void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, map& splits); - void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend=false); void send_snaps(map& splits); Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds); void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq, @@ -1041,6 +1041,9 @@ public: public: void snaprealm_create(MDRequestRef& mdr, CInode *in); void _snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in); + void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true); + void send_snap_update(CInode *in, version_t stid, int snap_op); + void handle_snap_update(MMDSSnapUpdate *m); // -- stray -- public: diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 158bde3bd12..29caeb08df4 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -6173,7 +6173,7 @@ void Server::_unlink_local_finish(MDRequestRef& mdr, mdr->apply(); if (snap_is_new) //only new if strayin exists - mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true); + mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false); mdcache->send_dentry_unlink(dn, straydn, mdr); @@ -7493,7 +7493,7 @@ void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, C bool hadrealm = (oldin->snaprealm ? true : false); oldin->pop_and_dirty_projected_inode(mdr->ls); if (oldin->snaprealm && !hadrealm) - mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT); + mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false); } else { // FIXME this snaprealm is not filled out correctly //oldin->open_snaprealm(); might be sufficient.. @@ -8731,6 +8731,9 @@ void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info) // create snap dout(10) << "snaprealm now " << *diri->snaprealm << dendl; + // notify other mds + mdcache->send_snap_update(diri, mdr->more()->stid, op); + mdcache->do_realm_invalidate_and_update_notify(diri, op); // yay @@ -8860,6 +8863,9 @@ void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid) dout(10) << "snaprealm now " << *diri->snaprealm << dendl; + // notify other mds + mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY); + mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY); // yay @@ -9000,7 +9006,10 @@ void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid dout(10) << "snaprealm now " << *diri->snaprealm << dendl; - mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true); + // notify other mds + mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE); + + mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE); // yay mdr->in[0] = diri; diff --git a/src/messages/MMDSSnapUpdate.h b/src/messages/MMDSSnapUpdate.h new file mode 100644 index 00000000000..230e58a3fa2 --- /dev/null +++ b/src/messages/MMDSSnapUpdate.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MMDSSNAPUPDATE_H +#define CEPH_MMDSSNAPUPDATE_H + +#include "msg/Message.h" + +class MMDSSnapUpdate : public Message { + inodeno_t ino; + __s16 snap_op; + +public: + inodeno_t get_ino() { return ino; } + int get_snap_op() { return snap_op; } + + bufferlist snap_blob; + + MMDSSnapUpdate() : Message(MSG_MDS_SNAPUPDATE) {} + MMDSSnapUpdate(inodeno_t i, version_t tid, int op) : + Message(MSG_MDS_SNAPUPDATE), ino(i), snap_op(op) { + set_tid(tid); + } +private: + ~MMDSSnapUpdate() override {} + +public: + const char *get_type_name() const override { return "snap_update"; } + void print(ostream& o) const override { + o << "snap_update(" << ino << " table_tid " << get_tid() << ")"; + } + + void encode_payload(uint64_t features) override { + using ceph::encode; + encode(ino, payload); + encode(snap_op, payload); + encode(snap_blob, payload); + } + void decode_payload() override { + using ceph::decode; + bufferlist::iterator p = payload.begin(); + decode(ino, p); + decode(snap_op, p); + decode(snap_blob, p); + } +}; + +#endif diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 11ad14ae28c..329deb26721 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -128,6 +128,7 @@ #include "messages/MMDSFindInoReply.h" #include "messages/MMDSOpenIno.h" #include "messages/MMDSOpenInoReply.h" +#include "messages/MMDSSnapUpdate.h" #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" @@ -664,6 +665,10 @@ Message *decode_message(CephContext *cct, int crcflags, m = new MMDSOpenInoReply; break; + case MSG_MDS_SNAPUPDATE: + m = new MMDSSnapUpdate(); + break; + case MSG_MDS_FRAGMENTNOTIFY: m = new MMDSFragmentNotify; break; diff --git a/src/msg/Message.h b/src/msg/Message.h index b19991d1cec..22d45a468eb 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -147,7 +147,7 @@ #define MSG_MDS_FINDINOREPLY 0x20e #define MSG_MDS_OPENINO 0x20f #define MSG_MDS_OPENINOREPLY 0x210 - +#define MSG_MDS_SNAPUPDATE 0x211 #define MSG_MDS_LOCK 0x300 #define MSG_MDS_INODEFILECAPS 0x301 -- 2.39.5