From 893b9d94668b36a395c28cfebc7aea1bda29085f Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Fri, 5 Jun 2020 15:18:35 -0400 Subject: [PATCH] osdc: Fix crash on OSD failure OSDSession::lock should not use lockdep, as we violate its rules in a place that's safe. Revert it to std::mutex. Thanks to Or Friedmann for finding the problem. Fixes: https://tracker.ceph.com/issues/45878 Signed-off-by: Adam C. Emerson --- src/osdc/Objecter.cc | 7 ++++--- src/osdc/Objecter.h | 13 ++++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 6c260d57cecc2..79ed8c6185db9 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1506,7 +1506,7 @@ int Objecter::pool_snap_list(int64_t poolid, vector *snaps) } // sl may be unlocked. -void Objecter::_check_op_pool_dne(Op *op, std::unique_lock *sl) +void Objecter::_check_op_pool_dne(Op *op, std::unique_lock *sl) { // rwlock is locked unique @@ -3031,8 +3031,9 @@ int Objecter::_recalc_linger_op_target(LingerOp *linger_op, if (linger_op->session != s) { // NB locking two sessions (s and linger_op->session) at the // same time here is only safe because we are the only one that - // takes two, and we are holding rwlock for write. Disable - // lockdep because it doesn't know that. + // takes two, and we are holding rwlock for write. We use + // std::shared_mutex in OSDSession because lockdep doesn't know + // that. unique_lock sl(s->lock); _session_linger_op_remove(linger_op->session, linger_op); _session_linger_op_assign(s, linger_op); diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 321055b041b5e..18f17d53ac6e8 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -2372,7 +2372,12 @@ public: std::map backoffs_by_id; int osd; - ceph::shared_mutex lock; + // NB locking two sessions at the same time is only safe because + // it is only done in _recalc_linger_op_target with s and + // linger_op->session, and it holds rwlock for write. We disable + // lockdep (using std::sharedMutex) because lockdep doesn't know + // that. + std::shared_mutex lock; int incarnation; ConnectionRef con; @@ -2380,9 +2385,7 @@ public: std::unique_ptr completion_locks; OSDSession(CephContext *cct, int o) : - osd(o), lock(ceph::make_shared_mutex( - fmt::format("OSDSession::lock #{}", o))), - incarnation(0), con(NULL), + osd(o), incarnation(0), con(NULL), num_locks(cct->_conf->objecter_completion_locks_per_session), completion_locks(new std::mutex[num_locks]) {} @@ -2502,7 +2505,7 @@ public: } private: - void _check_op_pool_dne(Op *op, std::unique_lock *sl); + void _check_op_pool_dne(Op *op, std::unique_lock *sl); void _send_op_map_check(Op *op); void _op_cancel_map_check(Op *op); void _check_linger_pool_dne(LingerOp *op, bool *need_unregister); -- 2.39.5