From eab59a85e121e3e42de8456f36dcec76863b8aba Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 16 Sep 2019 20:58:23 +0800
Subject: [PATCH] mds: adjust locking for subtree migration

Take snap rdlocks (instead of dentry locks) on the subtree's ancestor
inodes. The path to the subtree is stable once they are all locked.

For the dirfragtree locks on subtree bounds, it's not convenient to
rdlock them in top-down order (the paths to them are not stable). The
solution is to kick them to the SYNC state and then try taking rdlocks
on all of them.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 src/mds/Locker.cc   |  37 +++++++-----
 src/mds/Locker.h    |   6 +-
 src/mds/MDCache.cc  |   2 +-
 src/mds/Migrator.cc | 139 ++++++++++++++++++++++++++------------------
 src/mds/Migrator.h  |   2 +-
 src/mds/Server.cc   |   2 +-
 6 files changed, 111 insertions(+), 77 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index d1754524a5d..60db205f891 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -1235,7 +1235,7 @@ bool Locker::_rdlock_kick(SimpleLock *lock, bool as_anon)
   return false;
 }
 
-bool Locker::rdlock_try(SimpleLock *lock, client_t client, MDSContext *con)
+bool Locker::rdlock_try(SimpleLock *lock, client_t client)
 {
   dout(7) << "rdlock_try on " << *lock << " on " << *lock->get_parent() << dendl;
@@ -1248,11 +1248,6 @@ bool Locker::rdlock_try(SimpleLock *lock, client_t client, MDSContext *con)
   if (lock->can_rdlock(client))
     return true;
 
-  // wait!
-  if (con) {
-    dout(7) << "rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << dendl;
-    lock->add_waiter(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, con);
-  }
   return false;
 }
 
@@ -1347,30 +1342,42 @@ void Locker::rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *
   }
 }
 
-
-bool Locker::can_rdlock_set(MutationImpl::LockOpVec& lov)
+bool Locker::rdlock_try_set(MutationImpl::LockOpVec& lov, MDRequestRef& mdr)
 {
-  dout(10) << "can_rdlock_set " << dendl;
+  dout(10) << __func__ << dendl;
   for (const auto& p : lov) {
     auto lock = p.lock;
     ceph_assert(p.is_rdlock());
-    if (!lock->can_rdlock(-1)) {
-      dout(10) << "can_rdlock_set can't rdlock " << *lock << " on " << *lock->get_parent() << dendl;
-      return false;
+    if (!mdr->is_rdlocked(lock) && !rdlock_try(lock, mdr->get_client())) {
+      lock->add_waiter(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD,
+                       new C_MDS_RetryRequest(mdcache, mdr));
+      goto failed;
     }
+    lock->get_rdlock();
+    mdr->emplace_lock(lock, MutationImpl::LockOp::RDLOCK);
+    dout(20) << " got rdlock on " << *lock << " " << *lock->get_parent() << dendl;
   }
+
   return true;
+failed:
+  dout(10) << __func__ << " failed" << dendl;
+  drop_locks(mdr.get(), nullptr);
+  mdr->drop_local_auth_pins();
+  return false;
 }
 
-
-void Locker::rdlock_take_set(MutationImpl::LockOpVec& lov, MutationRef& mut)
+bool Locker::rdlock_try_set(MutationImpl::LockOpVec& lov, MutationRef& mut)
 {
-  dout(10) << "rdlock_take_set " << dendl;
+  dout(10) << __func__ << dendl;
   for (const auto& p : lov) {
+    auto lock = p.lock;
     ceph_assert(p.is_rdlock());
+    if (!lock->can_rdlock(mut->get_client()))
+      return false;
     p.lock->get_rdlock();
     mut->emplace_lock(p.lock, MutationImpl::LockOp::RDLOCK);
   }
+  return true;
 }
 
 // ------------------
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index e2836fe5bb9..5fbf124db9b 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -85,11 +85,11 @@ public:
   void try_eval(SimpleLock *lock, bool *pneed_issue);
 
   bool _rdlock_kick(SimpleLock *lock, bool as_anon);
-  bool rdlock_try(SimpleLock *lock, client_t client, MDSContext *c);
+  bool rdlock_try(SimpleLock *lock, client_t client);
   bool rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon=false);
   void rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue);
-  bool can_rdlock_set(MutationImpl::LockOpVec& lov);
-  void rdlock_take_set(MutationImpl::LockOpVec& lov, MutationRef& mut);
+  bool rdlock_try_set(MutationImpl::LockOpVec& lov, MDRequestRef& mdr);
+  bool rdlock_try_set(MutationImpl::LockOpVec& lov, MutationRef& mut);
 
   void wrlock_force(SimpleLock *lock, MutationRef& mut);
   bool wrlock_try(SimpleLock *lock, MutationRef& mut);
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 6b782bdb25d..65db633c723 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -6912,7 +6912,7 @@ bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expirema
   // This is because that unconnected replicas are problematic for
   // subtree migration.
   //
-  if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
+  if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
     return true;
   }
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 9ab9824527e..23f0179922c 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -737,37 +737,42 @@ public:
 };
 
-void Migrator::get_export_lock_set(CDir *dir, MutationImpl::LockOpVec& lov)
+bool Migrator::export_try_grab_locks(CDir *dir, MutationRef& mut)
 {
-  // path
-  vector<CDentry*> trace;
-  cache->make_trace(trace, dir->inode);
+  CInode *diri = dir->get_inode();
 
-  set<CDir*> wouldbe_bounds;
-  cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
+  if (!diri->filelock.can_wrlock(diri->get_loner()) ||
+      !diri->nestlock.can_wrlock(diri->get_loner()))
+    return false;
 
-  lov.reserve(trace.size() + wouldbe_bounds.size() + 8);
+  MutationImpl::LockOpVec lov;
 
-  for (auto& dn : trace)
-    lov.add_rdlock(&dn->lock);
+  set<CDir*> wouldbe_bounds;
+  set<CInode*> bound_inodes;
+  cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
+  for (auto& bound : wouldbe_bounds)
+    bound_inodes.insert(bound->get_inode());
+  for (auto& in : bound_inodes)
+    lov.add_rdlock(&in->dirfragtreelock);
+
+  lov.add_rdlock(&diri->dirfragtreelock);
+
+  CInode* in = diri;
+  while (true) {
+    lov.add_rdlock(&in->snaplock);
+    CDentry* pdn = in->get_projected_parent_dn();
+    if (!pdn)
+      break;
+    in = pdn->get_dir()->get_inode();
+  }
 
-  // prevent scatter gather race
-  lov.add_rdlock(&dir->get_inode()->dirfragtreelock);
+  if (!mds->locker->rdlock_try_set(lov, mut))
+    return false;
 
-  // bound dftlocks:
-  // NOTE: We need to take an rdlock on bounding dirfrags during
-  // migration for a rather irritating reason: when we export the
-  // bound inode, we need to send scatterlock state for the dirfrags
-  // as well, so that the new auth also gets the correct info. If we
-  // race with a refragment, this info is useless, as we can't
-  // redivvy it up. And it's needed for the scatterlocks to work
-  // properly: when the auth is in a sync/lock state it keeps each
-  // dirfrag's portion in the local (auth OR replica) dirfrag.
-  for (auto& dir : wouldbe_bounds)
-    lov.add_rdlock(&dir->get_inode()->dirfragtreelock);
+  mds->locker->wrlock_force(&diri->filelock, mut);
+  mds->locker->wrlock_force(&diri->nestlock, mut);
 
-  // above code may add duplicated locks
-  lov.sort_and_merge();
+  return true;
 }
 
@@ -1067,26 +1072,54 @@ void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
   }
 
   // locks?
-  MutationImpl::LockOpVec lov;
-  get_export_lock_set(dir, lov);
-  // If auth MDS of the subtree root inode is neither the exporter MDS
-  // nor the importer MDS and it gathers subtree root's fragstat/neststat
-  // while the subtree is exporting. It's possible that the exporter MDS
-  // and the importer MDS both are auth MDS of the subtree root or both
-  // are not auth MDS of the subtree root at the time they receive the
-  // lock messages. So the auth MDS of the subtree root inode may get no
-  // or duplicated fragstat/neststat for the subtree root dirfrag.
-  lov.lock_scatter_gather(&dir->get_inode()->filelock);
-  lov.lock_scatter_gather(&dir->get_inode()->nestlock);
-  if (dir->get_inode()->is_auth()) {
-    dir->get_inode()->filelock.set_scatter_wanted();
-    dir->get_inode()->nestlock.set_scatter_wanted();
-  }
-
-  if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
-    if (mdr->aborted)
-      export_try_cancel(dir);
-    return;
+  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+    MutationImpl::LockOpVec lov;
+    // If auth MDS of the subtree root inode is neither the exporter MDS
+    // nor the importer MDS and it gathers subtree root's fragstat/neststat
+    // while the subtree is exporting. It's possible that the exporter MDS
+    // and the importer MDS both are auth MDS of the subtree root or both
+    // are not auth MDS of the subtree root at the time they receive the
+    // lock messages. So the auth MDS of the subtree root inode may get no
+    // or duplicated fragstat/neststat for the subtree root dirfrag.
+    lov.lock_scatter_gather(&dir->get_inode()->filelock);
+    lov.lock_scatter_gather(&dir->get_inode()->nestlock);
+    if (dir->get_inode()->is_auth()) {
+      dir->get_inode()->filelock.set_scatter_wanted();
+      dir->get_inode()->nestlock.set_scatter_wanted();
+    }
+    lov.add_rdlock(&dir->get_inode()->dirfragtreelock);
+
+    if (!mds->locker->acquire_locks(mdr, lov, nullptr, true)) {
+      if (mdr->aborted)
+        export_try_cancel(dir);
+      return;
+    }
+
+    lov.clear();
+    // bound dftlocks:
+    // NOTE: We need to take an rdlock on bounding dirfrags during
+    // migration for a rather irritating reason: when we export the
+    // bound inode, we need to send scatterlock state for the dirfrags
+    // as well, so that the new auth also gets the correct info. If we
+    // race with a refragment, this info is useless, as we can't
+    // redivvy it up. And it's needed for the scatterlocks to work
+    // properly: when the auth is in a sync/lock state it keeps each
+    // dirfrag's portion in the local (auth OR replica) dirfrag.
+    set<CDir*> wouldbe_bounds;
+    set<CInode*> bound_inodes;
+    cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
+    for (auto& bound : wouldbe_bounds)
+      bound_inodes.insert(bound->get_inode());
+    for (auto& in : bound_inodes)
+      lov.add_rdlock(&in->dirfragtreelock);
+
+    if (!mds->locker->rdlock_try_set(lov, mdr))
+      return;
+
+    if (!mds->locker->try_rdlock_snap_layout(dir->get_inode(), mdr))
+      return;
+
+    mdr->locking_state |= MutationImpl::ALL_LOCKED;
   }
 
   ceph_assert(g_conf()->mds_kill_export_at != 1);
@@ -1314,28 +1347,20 @@ void Migrator::export_frozen(CDir *dir, uint64_t tid)
   ceph_assert(it->second.state == EXPORT_FREEZING);
   ceph_assert(dir->is_frozen_tree_root());
 
-  CInode *diri = dir->get_inode();
+  it->second.mut = new MutationImpl();
 
   // ok, try to grab all my locks.
-  MutationImpl::LockOpVec lov;
-  get_export_lock_set(dir, lov);
+  CInode *diri = dir->get_inode();
   if ((diri->is_auth() && diri->is_frozen()) ||
-      !mds->locker->can_rdlock_set(lov) ||
-      // for pinning scatter gather. loner has a higher chance to get wrlock
-      !diri->filelock.can_wrlock(diri->get_loner()) ||
-      !diri->nestlock.can_wrlock(diri->get_loner())) {
+      !export_try_grab_locks(dir, it->second.mut)) {
     dout(7) << "export_dir couldn't acquire all needed locks, failing. "
             << *dir << dendl;
     export_try_cancel(dir);
     return;
   }
 
-  it->second.mut = new MutationImpl();
   if (diri->is_auth())
     it->second.mut->auth_pin(diri);
-  mds->locker->rdlock_take_set(lov, it->second.mut);
-  mds->locker->wrlock_force(&diri->filelock, it->second.mut);
-  mds->locker->wrlock_force(&diri->nestlock, it->second.mut);
 
   cache->show_subtrees();
@@ -2301,7 +2326,9 @@ void Migrator::handle_export_discover(const cref_t<MExportDirDiscover> &m, bool
   filepath fpath(m->get_path());
   vector<CDentry*> trace;
   MDRequestRef null_ref;
-  int r = cache->path_traverse(null_ref, cf, fpath, MDS_TRAVERSE_DISCOVER, &trace);
+  int r = cache->path_traverse(null_ref, cf, fpath,
+                               MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
+                               &trace);
   if (r > 0) return;
   if (r < 0) {
     dout(7) << "handle_export_discover failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index dbb299129a3..d7b9baf54aa 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -318,7 +318,7 @@ public:
                           vector<pair<CDir*, size_t> >& results);
   void child_export_finish(std::shared_ptr<export_base_t>& parent, bool success);
 
-  void get_export_lock_set(CDir *dir, MutationImpl::LockOpVec& lov);
+  bool export_try_grab_locks(CDir *dir, MutationRef& mut);
   void get_export_client_set(CDir *dir, std::set<client_t> &client_set);
   void get_export_client_set(CInode *in, std::set<client_t> &client_set);
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index fc825995628..623e7ee6269 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -2605,7 +2605,7 @@ void Server::handle_slave_request(const cref_t<MMDSSlaveRequest> &m)
     return;
   }
 
-  // may get these requests while mdr->slave_request is non-null
+  // may get these while mdr->slave_request is non-null
  if (m->get_op() == MMDSSlaveRequest::OP_DROPLOCKS) {
     mds->locker->drop_locks(mdr.get());
     return;
-- 
2.39.5
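
Note (not part of the patch): the change splits the export lock set into
two groups with different acquisition strategies. The ancestor chain has
a stable order once held (each snap rdlock pins one step of the path to
the subtree), so those locks can be collected in a canonical top-down
order; the bound dirfragtreelocks have no stable order, so they are
try-locked all-or-nothing, with a full backoff on failure. Below is a
minimal, Ceph-free sketch of that pattern in plain C++; Node,
ancestors_top_down() and try_grab_export_locks() are hypothetical
stand-ins for CInode, the projected-parent walk, and
export_try_grab_locks(), not the MDS implementation.

// Hypothetical stand-in for CInode: a parent pointer plus the two lock
// classes this patch cares about.
#include <algorithm>
#include <shared_mutex>
#include <vector>

struct Node {
  Node *parent = nullptr;
  std::shared_mutex snaplock;  // analogue of CInode::snaplock
  std::shared_mutex dftlock;   // analogue of CInode::dirfragtreelock
};

// Collect the subtree-root..fs-root chain and reverse it, so the locks
// are always taken in one canonical (top-down) order.
static std::vector<Node*> ancestors_top_down(Node *subtree) {
  std::vector<Node*> chain;
  for (Node *n = subtree; n; n = n->parent)
    chain.push_back(n);
  std::reverse(chain.begin(), chain.end());
  return chain;
}

// All-or-nothing acquisition: any try_lock failure releases everything
// already held and returns false, so the caller can back off and retry
// later (the Migrator cancels the export in that case).
bool try_grab_export_locks(Node *subtree, const std::vector<Node*> &bounds,
                           std::vector<std::shared_mutex*> &held) {
  auto backoff = [&held]() {
    for (auto it = held.rbegin(); it != held.rend(); ++it)
      (*it)->unlock_shared();
    held.clear();
    return false;
  };
  for (Node *n : ancestors_top_down(subtree)) {  // stable, ordered part
    if (!n->snaplock.try_lock_shared())
      return backoff();
    held.push_back(&n->snaplock);
  }
  for (Node *b : bounds) {  // unordered part: never block here
    if (!b->dftlock.try_lock_shared())
      return backoff();
    held.push_back(&b->dftlock);
  }
  return true;
}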
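A related design point: the two rdlock_try_set() overloads fail
differently on purpose. The MDRequestRef flavor serves the request path
(dispatch_export_dir()): when a lock is busy it registers a
C_MDS_RetryRequest waiter, drops all held locks and local auth pins,
and returns false, so the export request is simply re-driven once the
lock stabilizes. The MutationRef flavor serves export_frozen(), which
runs with the tree already frozen and must not wait; it only reports
failure, and the caller cancels the export.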