From 1ec93753a37a24a7e72566848ccc0baab1df84ce Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 2 Aug 2017 17:26:56 +0800 Subject: [PATCH] mds: track snap inodes through sorted map Currently the mds tracks both head inodes and snap inodes through an unsorted map. The unsorted map makes finding the snap inode that follows a given snapid difficult. Currently MDCache::pick_inode_snap() uses the snap set to guess the snap inode's last. The method isn't reliable because the snap set may change after creating the snap inode. For example: MDS cows inode[2,head] with snap set[5,6], which results in inode[2,6] and inode[7,head]. Later the mds wants to find the snap inode that follows snapid 2. But the snap set has become [5], so the mds can't find snap inode [2,5]. Signed-off-by: "Yan, Zheng" (cherry picked from commit 7b9eae62c8c654ff82684451c222257d2c93be64) Conflicts: src/mds/MDCache.cc: When I cherry-picked 3ca602e, there were conflicts in the same file. To fix the conflicts, I needed to introduce snap_inode_map because it's not in luminous. To introduce this, I needed to cherry-pick 7b9eae62. After cherry-picking 7b9eae62, there were conflicts. I picked the code from head because it's introduced by 3ca602e, which is required to fix http://tracker.ceph.com/issues/21928. --- src/mds/Locker.cc | 75 ++++++-------- src/mds/MDCache.cc | 240 +++++++++++++++++++++------------------------ src/mds/MDCache.h | 21 +++- 3 files changed, 156 insertions(+), 180 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index a0ccf96016be..d432227da57c 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2542,26 +2542,9 @@ void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t last) if (clients.count(client)) { dout(10) << " doing async NULL snapflush on " << snapid << " from client." << client << dendl; - CInode *sin = mdcache->get_inode(head_in->ino(), snapid); - if (!sin) { - // hrm, look forward until we find the inode. 
- // (we can only look it up by the last snapid it is valid for) - dout(10) << " didn't have " << head_in->ino() << " snapid " << snapid << dendl; - for (compact_map >::iterator q = p; // p is already at next entry - q != head_in->client_need_snapflush.end(); - ++q) { - dout(10) << " trying snapid " << q->first << dendl; - sin = mdcache->get_inode(head_in->ino(), q->first); - if (sin) { - assert(sin->first <= snapid); - break; - } - dout(10) << " didn't have " << head_in->ino() << " snapid " << q->first << dendl; - } - if (!sin && head_in->is_multiversion()) - sin = head_in; - assert(sin); - } + CInode *sin = mdcache->pick_inode_snap(head_in, snapid - 1); + assert(sin); + assert(sin->first <= snapid); _do_snap_update(sin, snapid, 0, sin->first - 1, client, NULL, NULL); head_in->remove_need_snapflush(sin, snapid, client); } @@ -2696,29 +2679,21 @@ void Locker::handle_client_caps(MClientCaps *m) mds->set_osd_epoch_barrier(m->osd_epoch_barrier); } - CInode *in = head_in; - if (follows > 0) { - in = mdcache->pick_inode_snap(head_in, follows); - if (in != head_in) - dout(10) << " head inode " << *head_in << dendl; - } - dout(10) << " cap inode " << *in << dendl; + dout(10) << " head inode " << *head_in << dendl; Capability *cap = 0; - cap = in->get_client_cap(client); - if (!cap && in != head_in) - cap = head_in->get_client_cap(client); + cap = head_in->get_client_cap(client); if (!cap) { - dout(7) << "handle_client_caps no cap for client." << client << " on " << *in << dendl; + dout(7) << "handle_client_caps no cap for client." << client << " on " << *head_in << dendl; m->put(); return; } assert(cap); // freezing|frozen? 
- if (should_defer_client_cap_frozen(in)) { - dout(7) << "handle_client_caps freezing|frozen on " << *in << dendl; - in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, m)); + if (should_defer_client_cap_frozen(head_in)) { + dout(7) << "handle_client_caps freezing|frozen on " << *head_in << dendl; + head_in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, m)); return; } if (ceph_seq_cmp(m->get_mseq(), cap->get_mseq()) < 0) { @@ -2732,15 +2707,22 @@ void Locker::handle_client_caps(MClientCaps *m) // flushsnap? if (op == CEPH_CAP_OP_FLUSHSNAP) { - if (!in->is_auth()) { - dout(7) << " not auth, ignoring flushsnap on " << *in << dendl; + if (!head_in->is_auth()) { + dout(7) << " not auth, ignoring flushsnap on " << *head_in << dendl; goto out; } - SnapRealm *realm = in->find_snaprealm(); + SnapRealm *realm = head_in->find_snaprealm(); snapid_t snap = realm->get_snap_following(follows); dout(10) << " flushsnap follows " << follows << " -> snap " << snap << dendl; + CInode *in = head_in; + if (snap != CEPH_NOSNAP) { + in = mdcache->pick_inode_snap(head_in, snap - 1); + if (in != head_in) + dout(10) << " snapped inode " << *in << dendl; + } + // we can prepare the ack now, since this FLUSHEDSNAP is independent of any // other cap ops. (except possibly duplicate FLUSHSNAP requests, but worst // case we get a dup response, so whatever.) @@ -2768,7 +2750,6 @@ void Locker::handle_client_caps(MClientCaps *m) if (in != head_in) head_in->remove_need_snapflush(in, snap, client); - } else { dout(7) << " not expecting flushsnap " << snap << " from client." 
<< client << " on " << *in << dendl; if (ack) @@ -2780,14 +2761,18 @@ void Locker::handle_client_caps(MClientCaps *m) if (cap->get_cap_id() != m->get_cap_id()) { dout(7) << " ignoring client capid " << m->get_cap_id() << " != my " << cap->get_cap_id() << dendl; } else { - // intermediate snap inodes - while (in != head_in) { - assert(in->last != CEPH_NOSNAP); - if (in->is_auth() && m->get_dirty()) { - dout(10) << " updating intermediate snapped inode " << *in << dendl; - _do_cap_update(in, NULL, m->get_dirty(), follows, m); + CInode *in = head_in; + if (follows > 0) { + in = mdcache->pick_inode_snap(head_in, follows); + // intermediate snap inodes + while (in != head_in) { + assert(in->last != CEPH_NOSNAP); + if (in->is_auth() && m->get_dirty()) { + dout(10) << " updating intermediate snapped inode " << *in << dendl; + _do_cap_update(in, NULL, m->get_dirty(), follows, m); + } + in = mdcache->pick_inode_snap(head_in, in->last); } - in = mdcache->pick_inode_snap(head_in, in->last); } // head inode, and cap diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 90aa21848092..ada03ab17da2 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -256,8 +256,15 @@ bool MDCache::shutdown() void MDCache::add_inode(CInode *in) { // add to lru, inode map - assert(inode_map.count(in->vino()) == 0); // should be no dup inos! - inode_map[ in->vino() ] = in; + if (in->last == CEPH_NOSNAP) { + auto &p = inode_map[in->ino()]; + assert(!p); // should be no dup inos! + p = in; + } else { + auto &p = snap_inode_map[in->vino()]; + assert(!p); // should be no dup inos! 
+ p = in; + } if (in->ino() < MDS_INO_SYSTEM_BASE) { if (in->ino() == MDS_INO_ROOT) @@ -302,7 +309,10 @@ void MDCache::remove_inode(CInode *o) export_pin_queue.erase(o); // remove from inode map - inode_map.erase(o->vino()); + if (o->last == CEPH_NOSNAP) + inode_map.erase(o->ino()); + else + snap_inode_map.erase(o->vino()); if (o->ino() < MDS_INO_SYSTEM_BASE) { if (o == root) root = 0; @@ -1459,24 +1469,12 @@ CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows) dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl; assert(in->last == CEPH_NOSNAP); - SnapRealm *realm = in->find_snaprealm(); - const set& snaps = realm->get_snaps(); - dout(10) << " realm " << *realm << " " << *realm->inode << dendl; - dout(10) << " snaps " << snaps << dendl; - - if (snaps.empty()) - return in; - - for (set::const_iterator p = snaps.upper_bound(follows); // first item > follows - p != snaps.end(); - ++p) { - CInode *t = get_inode(in->ino(), *p); - if (t) { - in = t; - dout(10) << "pick_inode_snap snap " << *p << " found " << *in << dendl; - break; - } + auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows)); + if (p != snap_inode_map.end() && p->second->ino() == in->ino()) { + dout(10) << "pick_inode_snap found " << *p->second << dendl; + in = p->second; } + return in; } @@ -1493,22 +1491,7 @@ CInode *MDCache::cow_inode(CInode *in, snapid_t last) { assert(last >= in->first); - SnapRealm *realm = in->find_snaprealm(); - const set& snaps = realm->get_snaps(); - - // make sure snap inode's last match existing snapshots. - // MDCache::pick_inode_snap() requires this. 
- snapid_t last_snap = last; - if (snaps.count(last) == 0) { - set::const_iterator p = snaps.upper_bound(last); - if (p != snaps.begin()) { - --p; - if (*p >= in->first) - last_snap = *p; - } - } - - CInode *oldin = new CInode(this, true, in->first, last_snap); + CInode *oldin = new CInode(this, true, in->first, last); oldin->inode = *in->get_previous_projected_inode(); oldin->symlink = in->symlink; oldin->xattrs = *in->get_previous_projected_xattrs(); @@ -1542,43 +1525,43 @@ CInode *MDCache::cow_inode(CInode *in, snapid_t last) return oldin; } - // clone caps? - for (map::iterator p = in->client_caps.begin(); - p != in->client_caps.end(); - ++p) { - client_t client = p->first; - Capability *cap = p->second; - int issued = cap->issued(); - if ((issued & CEPH_CAP_ANY_WR) && - cap->client_follows < last) { - // note in oldin - for (int i = 0; i < num_cinode_locks; i++) { - if (issued & cinode_lock_info[i].wr_caps) { - int lockid = cinode_lock_info[i].lock; - SimpleLock *lock = oldin->get_lock(lockid); - assert(lock); - oldin->client_snap_caps[lockid].insert(client); - oldin->auth_pin(lock); - lock->set_state(LOCK_SNAP_SYNC); // gathering - lock->get_wrlock(true); - dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps) - << " wrlock lock " << *lock << " on " << *oldin << dendl; + if (!in->client_caps.empty()) { + const set& snaps = in->find_snaprealm()->get_snaps(); + // clone caps? 
+ for (auto p : in->client_caps) { + client_t client = p.first; + Capability *cap = p.second; + int issued = cap->issued(); + if ((issued & CEPH_CAP_ANY_WR) && + cap->client_follows < last) { + // note in oldin + for (int i = 0; i < num_cinode_locks; i++) { + if (issued & cinode_lock_info[i].wr_caps) { + int lockid = cinode_lock_info[i].lock; + SimpleLock *lock = oldin->get_lock(lockid); + assert(lock); + oldin->client_snap_caps[lockid].insert(client); + oldin->auth_pin(lock); + lock->set_state(LOCK_SNAP_SYNC); // gathering + lock->get_wrlock(true); + dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps) + << " wrlock lock " << *lock << " on " << *oldin << dendl; + } } + cap->client_follows = last; + + // we need snapflushes for any intervening snaps + dout(10) << " snaps " << snaps << dendl; + for (auto q = snaps.lower_bound(oldin->first); + q != snaps.end() && *q <= last; + ++q) { + in->add_need_snapflush(oldin, *q, client); + } + } else { + dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl; } - cap->client_follows = last; - - // we need snapflushes for any intervening snaps - dout(10) << " snaps " << snaps << dendl; - for (set::const_iterator q = snaps.lower_bound(oldin->first); - q != snaps.end() && *q <= last; - ++q) { - in->add_need_snapflush(oldin, *q, client); - } - } else { - dout(10) << " ignoring client." 
<< client << " cap follows " << cap->client_follows << dendl; } } - return oldin; } @@ -3737,10 +3720,8 @@ void MDCache::trim_unlinked_inodes() { dout(7) << "trim_unlinked_inodes" << dendl; list q; - for (ceph::unordered_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; + for (auto p : inode_map) { + CInode *in = p.second; if (in->get_parent_dn() == NULL && !in->is_base()) { dout(7) << " will trim from " << *in << dendl; q.push_back(in); @@ -4540,20 +4521,17 @@ void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *a { dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl; - for (ceph::unordered_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - + auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) { // inode? if (in->is_auth() && in->is_replica(from) && - (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) { + (ack == NULL || acked_inodes.count(in->vino()) == 0)) { inode_remove_replica(in, from, false, gather_locks); dout(10) << " rem " << *in << dendl; } - if (!in->is_dir()) continue; + if (!in->is_dir()) + return; list dfs; in->get_dirfrags(dfs); @@ -4585,7 +4563,12 @@ void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *a } } } - } + }; + + for (auto p : inode_map) + scour_func(p.second); + for (auto p : snap_inode_map) + scour_func(p.second); } @@ -5470,10 +5453,8 @@ void MDCache::choose_lock_states_and_reconnect_caps() map splits; - for (ceph::unordered_map::iterator i = inode_map.begin(); - i != inode_map.end(); - ++i) { - CInode *in = i->second; + for (auto i : inode_map) { + CInode *in = i.second; if (in->last != CEPH_NOSNAP) continue; @@ -5481,9 +5462,8 @@ void MDCache::choose_lock_states_and_reconnect_caps() if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat()) in->mark_dirty_rstat(); - auto p = reconnected_caps.find(in->ino()); - int dirty_caps = 
0; + auto p = reconnected_caps.find(in->ino()); if (p != reconnected_caps.end()) { for (const auto &it : p->second) dirty_caps |= it.second.dirty_caps; @@ -6089,10 +6069,8 @@ void MDCache::reissue_all_caps() { dout(10) << "reissue_all_caps" << dendl; - for (ceph::unordered_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; + for (auto p : inode_map) { + CInode *in = p.second; if (in->is_head() && in->is_any_caps()) { // called by MDSRank::active_start(). There shouldn't be any frozen subtree. if (in->is_frozen_inode()) { @@ -6179,10 +6157,8 @@ void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut) void MDCache::identify_files_to_recover() { dout(10) << "identify_files_to_recover" << dendl; - for (ceph::unordered_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; + for (auto p : inode_map) { + CInode *in = p.second; if (!in->is_auth()) continue; @@ -6943,11 +6919,10 @@ void MDCache::trim_non_auth() if (lru.lru_get_size() == 0 && bottom_lru.lru_get_size() == 0) { // root, stray, etc.? 
- ceph::unordered_map::iterator p = inode_map.begin(); + auto p = inode_map.begin(); while (p != inode_map.end()) { - ceph::unordered_map::iterator next = p; - ++next; CInode *in = p->second; + ++p; if (!in->is_auth()) { list ls; in->get_dirfrags(ls); @@ -6964,7 +6939,6 @@ void MDCache::trim_non_auth() assert(in->get_num_ref() == 0); remove_inode(in); } - p = next; } } @@ -11676,10 +11650,8 @@ void MDCache::force_readonly() mds->server->force_clients_readonly(); // revoke write caps - for (ceph::unordered_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; + for (auto p : inode_map) { + CInode *in = p.second; if (in->is_head()) mds->locker->eval(in, CEPH_CAP_LOCKS); } @@ -11827,25 +11799,22 @@ void MDCache::show_subtrees(int dbl) assert(lost == 0); } - void MDCache::show_cache() { dout(7) << "show_cache" << dendl; - - for (ceph::unordered_map::iterator it = inode_map.begin(); - it != inode_map.end(); - ++it) { + + auto show_func = [this](CInode *in) { // unlinked? - if (!it->second->parent) - dout(7) << " unlinked " << *it->second << dendl; - + if (!in->parent) + dout(7) << " unlinked " << *in << dendl; + // dirfrags? 
list dfs; - it->second->get_dirfrags(dfs); + in->get_dirfrags(dfs); for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { CDir *dir = *p; dout(7) << " dirfrag " << *dir << dendl; - + for (CDir::map_t::iterator p = dir->items.begin(); p != dir->items.end(); ++p) { @@ -11856,7 +11825,12 @@ void MDCache::show_cache() dout(7) << " inode " << *dnl->get_inode() << dendl; } } - } + }; + + for (auto p : inode_map) + show_func(p.second); + for (auto p : snap_inode_map) + show_func(p.second); } int MDCache::cache_status(Formatter *f) @@ -11914,11 +11888,8 @@ int MDCache::dump_cache(const char *fn, Formatter *f, } } - for (ceph::unordered_map::iterator it = inode_map.begin(); - it != inode_map.end(); - ++it) { - CInode *in = it->second; - + auto dump_func = [this, fd, f, depth, &dump_root](CInode *in) { + int r; if (!dump_root.empty()) { string ipath; if (in->is_root()) @@ -11928,11 +11899,11 @@ int MDCache::dump_cache(const char *fn, Formatter *f, if (dump_root.length() > ipath.length() || !equal(dump_root.begin(), dump_root.end(), ipath.begin())) - continue; + return 0; if (depth >= 0 && count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth) - continue; + return 0; } if (f) { @@ -11943,9 +11914,8 @@ int MDCache::dump_cache(const char *fn, Formatter *f, ss << *in << std::endl; std::string s = ss.str(); r = safe_write(fd, s.c_str(), s.length()); - if (r < 0) { - goto out; - } + if (r < 0) + return r; } list dfs; @@ -11963,9 +11933,8 @@ int MDCache::dump_cache(const char *fn, Formatter *f, tt << " " << *dir << std::endl; string t = tt.str(); r = safe_write(fd, t.c_str(), t.length()); - if (r < 0) { - goto out; - } + if (r < 0) + return r; } if (f) { @@ -11984,9 +11953,8 @@ int MDCache::dump_cache(const char *fn, Formatter *f, uu << " " << *dn << std::endl; string u = uu.str(); r = safe_write(fd, u.c_str(), u.length()); - if (r < 0) { - goto out; - } + if (r < 0) + return r; } } if (f) { @@ -12004,6 +11972,18 @@ int MDCache::dump_cache(const char *fn, 
Formatter *f, if (f) { f->close_section(); // inode } + return 1; + }; + + for (auto p : inode_map) { + r = dump_func(p.second); + if (r < 0) + goto out; + } + for (auto p : snap_inode_map) { + r = dump_func(p.second); + if (r < 0) + goto out; } out: diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 19fcb1dfb7e8..9429d39422c5 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -123,7 +123,8 @@ class MDCache { LRU lru; // dentry lru for expiring items from cache LRU bottom_lru; // dentries that should be trimmed ASAP protected: - ceph::unordered_map inode_map; // map of inodes by ino + ceph::unordered_map inode_map; // map of head inodes by ino + map snap_inode_map; // map of snap inodes by ino CInode *root; // root inode CInode *myin; // .ceph/mds%d dir @@ -764,14 +765,24 @@ public: // inode_map bool have_inode(vinodeno_t vino) { - return inode_map.count(vino) ? true:false; + if (vino.snapid == CEPH_NOSNAP) + return inode_map.count(vino.ino) ? true : false; + else + return snap_inode_map.count(vino) ? true : false; } bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) { return have_inode(vinodeno_t(ino, snap)); } CInode* get_inode(vinodeno_t vino) { - if (have_inode(vino)) - return inode_map[vino]; + if (vino.snapid == CEPH_NOSNAP) { + auto p = inode_map.find(vino.ino); + if (p != inode_map.end()) + return p->second; + } else { + auto p = snap_inode_map.find(vino); + if (p != snap_inode_map.end()) + return p->second; + } return NULL; } CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) { @@ -1183,7 +1194,7 @@ public: CInode *hack_pick_random_inode() { assert(!inode_map.empty()); int n = rand() % inode_map.size(); - ceph::unordered_map::iterator p = inode_map.begin(); + auto p = inode_map.begin(); while (n--) ++p; return p->second; } -- 2.47.3