From f2e966818fe6c40fa20d67e379c1e2a02c9d005e Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Mon, 24 May 2021 11:15:51 +0000 Subject: [PATCH] crimson/osd: fix assertion failure in OpSequencer. `OpSequencer` assumes that ID of a previous client request is always lower than ID of current one. This is reflected by the assertion in `OpSequencer::start_op()`. It triggered the following failure [1] in Teuthology: ``` DEBUG 2021-05-07 08:01:41,227 [shard 0] osd - client_request(id=1, detail=osd_op(client.4171.0:1 2.2 2.7c339972 (undecoded) ondisk+retry+read+known_if_redirected e29) v8) same_interval_since: 31 ceph-osd: /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-3910-g1b18e076/rpm/el8/BUILD/ceph- 17.0.0-3910-g1b18e076/src/crimson/osd/osd_operation_sequencer.h:38: seastar::futurize_t crimson::osd::OpSequencer::start_op(HandleT&, uint64_t, uint64_t, FuncT&&) [with HandleT = crimson::PipelineHa ndle; FuncT = crimson::interruptible::interruptor::wrap_function(Func&&) [with Func = crimson::osd::ClientRequest::start():: mutable::)> mutable:: mutable::; InterruptCond = crimson::osd::IOInterruptCondition]::; Result = crimson::interruptible::interruptible_future_detail >; seastar::futurize_t = crimson::interruptible::interruptible_future_detail >; uint64_t = long unsigned int]: Assertion `prev_op < this_op' fai led. Aborting on shard 0. Backtrace: Segmentation fault. Backtrace: 0# 0x00005592B028932F in ceph-osd 1# FatalSignal::signaled(int, siginfo_t const*) in ceph-osd 2# FatalSignal::install_oneshot_signal_handler<6>()::{lambda(int, siginfo_t*, void*)#1}::_FUN(int, siginfo_t*, void*) in ceph-osd 3# 0x00007F57B72E7B20 in /lib64/libpthread.so.0 4# gsignal in /lib64/libc.so.6 5# abort in /lib64/libc.so.6 6# 0x00007F57B58E2B09 in /lib64/libc.so.6 7# 0x00007F57B58F0DE6 in /lib64/libc.so.6 8# 0x00005592ABB8484D in ceph-osd 9# 0x00005592ABB8ACB3 in ceph-osd 10# seastar::continuation >, seastar::noncopyable_function > (boost::intrusive_ptr&&)>, seastar::future >::then_impl_nrvo > (boost::intrusive_ptr&&)>, seastar::future > >(seastar::noncopyable_function > (boost::intrusive_ptr&&)>&&)::{lambda(seastar::internal::promise_base_with_type >&&, seastar::noncopyable_function > (boost::intrusive_ptr&&)>&, seastar::future_state >&&)#1}, boost::intrusive_ptr >::run_and_dispose() in ceph-osd 11# 0x00005592B357F88F in ceph-osd 12# 0x00005592B3584DD0 in ceph-osd ``` [1]: http://pulpito.front.sepia.ceph.com/rzarzynski-2021-05-07_07:41:02-rados-master-distro-basic-smithi/6104530 Crash analysis resulted in two observations: 1. during the request execution the acting set got changed, the request was interrupted and a try to re-execute it emerged; 2. the interrupted request was the very first client request the OSD has ever seen. Code analysis showed a problem in how `ClientRequest` establishes `prev_op_id`: although supposed to be performed only once for a request, it can get executed twice but only for the very first request `OpSequencer` saw. ```cpp void ClientRequest::may_set_prev_op() { // set prev_op_id if it's not set yet if (__builtin_expect(prev_op_id == 0, true)) { prev_op_id = sequencer.get_last_issued(); } } ``` Unfortunately, `0` isn't a distincted value that cannot be returned by `get_last_issued()`: ```cpp class OpSequencer { // ... uint64_t get_last_issued() const { return last_issued; } // ... // the id of last op which is issued uint64_t last_issued = 0; ``` As a result, `OpSequencer` returned on the second call a new value (actually `this_op`) violating the assertion. The commit fixes the problem by switching from a designated value to `std::optional`. Signed-off-by: Radoslaw Zarzynski --- src/crimson/osd/osd_operations/client_request.cc | 7 ++++--- src/crimson/osd/osd_operations/client_request.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc index b7ac62044f9..9719ab08051 100644 --- a/src/crimson/osd/osd_operations/client_request.cc +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -63,8 +63,8 @@ bool ClientRequest::is_pg_op() const void ClientRequest::may_set_prev_op() { // set prev_op_id if it's not set yet - if (__builtin_expect(prev_op_id == 0, true)) { - prev_op_id = sequencer.get_last_issued(); + if (__builtin_expect(!prev_op_id.has_value(), true)) { + prev_op_id.emplace(sequencer.get_last_issued()); } } @@ -88,8 +88,9 @@ seastar::future<> ClientRequest::start() epoch_t same_interval_since = pgref->get_interval_start_epoch(); logger().debug("{} same_interval_since: {}", *this, same_interval_since); may_set_prev_op(); + assert(prev_op_id.has_value()); return sequencer.start_op( - handle, prev_op_id, get_id(), + handle, *prev_op_id, get_id(), interruptor::wrap_function( [this, pgref]() mutable -> interruptible_future<> { PG &pg = *pgref; diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index 11c5ac0c59d..c41035afe95 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -74,7 +74,7 @@ private: PGPipeline &pp(PG &pg); OpSequencer& sequencer; - uint64_t prev_op_id = 0; + std::optional prev_op_id; template using interruptible_errorator = -- 2.39.5