This commit fixes a memory corruption problem found on the intersection
of `seastar::thread` and `interruptor` by a teuthology job [1].
It turns out to be caused by a use-after-free which happens due to
an issue with type deduction in `non_futurized_call_with_interruption`.
However, it might be beneficial to audit other members of the family too.
If `Result` is deduced to be a non-reference type (i.e. the `Func`
returns e.g. `ceph::bufferlist`), `std::forward<Result>` forwards
it as a r-value reference (`Result&&`). `decltype(auto)` makes this
reference the return type of the entire function which isn't what
we want.
Debugging
=========
Instrumentation
---------------
```diff
--- a/src/crimson/osd/ops_executer.cc
+++ b/src/crimson/osd/ops_executer.cc
@@ -28,6 +28,21 @@ namespace {
namespace crimson::osd {
+struct scope_dbgr {
+ scope_dbgr() {
+ logger().debug("{}:{} on {}", __func__, __LINE__, (void*)this);
+ }
+ scope_dbgr(const scope_dbgr& src) {
+ logger().debug("{}:{} copy on {}; src {}", __func__, __LINE__, (void*)this, (void*)&src);
+ }
+ scope_dbgr(scope_dbgr&& src) noexcept {
+ logger().debug("{}:{} move on {}; src {}", __func__, __LINE__, (void*)this, (void*)&src);
+ }
+ ~scope_dbgr() {
+ logger().debug("{}:{} on {}", __func__, __LINE__, (void*)this);
+ }
+};
+
OpsExecuter::call_ierrorator::future<> OpsExecuter::do_op_call(OSDOp& osd_op)
{
std::string cname, mname;
@@ -84,14 +99,16 @@ OpsExecuter::call_ierrorator::future<> OpsExecuter::do_op_call(OSDOp& osd_op)
ceph::bufferlist outdata;
auto cls_context = reinterpret_cast<cls_method_context_t>(this);
const auto ret = method->exec(cls_context, indata, outdata);
- return std::make_pair(ret, std::move(outdata));
+ logger().debug("do_ops_call: outdata just after exec: {}", outdata);
+ return std::make_tuple(ret, std::move(outdata), scope_dbgr{});
}
).then_interruptible(
[this, prev_rd, prev_wr, &osd_op, flags]
(auto outcome) -> call_errorator::future<> {
- auto& [ret, outdata] = outcome;
+ auto& [ret, outdata, dbgr] = outcome;
osd_op.rval = ret;
-
+ logger().debug("do_ops_call: outdata just after then_interruptible: {}",
+ outdata);
logger().debug("do_op_call: method returned ret={}, outdata.length()={}"
" while num_read={}, num_write={}",
ret, outdata.length(), num_read, num_write);
```
After the fix
-------------
```
DEBUG 2021-04-21 15:32:30,012 [shard 0] bluestore - get_attr
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - do_ops_call: outdata just after exec: buffer::list(len=18,
buffer::ptr(0~18 0x6000000ef000 in raw 0x6000000ef000 len 4000 nref 1)
)
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:33 on 0x60000085feb3
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x60000085ff00; src 0x60000085feb3
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x60000085feb3
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x60000085fed0; src 0x60000085ff00
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x60000085ff00
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x60000085ff40; src 0x60000085fed0
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x60000085fed0
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x60000015a740; src 0x60000085ff40
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x60000085ff40
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x7ffc4d018df0; src 0x60000015a740
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x60000015a740
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x6000005196d8; src 0x7ffc4d018df0
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x7ffc4d018df0
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x7ffc4d018df0; src 0x6000005196d8
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x6000005196d8
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x7ffc4d018db0; src 0x7ffc4d018df0
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x7ffc4d018df0
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x600000807b58; src 0x7ffc4d018db0
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x7ffc4d018db0
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - scope_dbgr:39 move on 0x7ffc4d018d30; src 0x600000807b58
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - do_ops_call: outdata just after then_interruptible: buffer::list(len=18,
buffer::ptr(0~18 0x6000000ef000 in raw 0x6000000ef000 len 4000 nref 1)
)
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - do_op_call: method returned ret=0, outdata.length()=18 while num_read=1, num_write=0
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x7ffc4d018d30
DEBUG 2021-04-21 15:32:30,012 [shard 0] osd - ~scope_dbgr:42 on 0x600000807b58
```
Before
------
```
DEBUG 2021-04-21 16:06:24,354 [shard 0] bluestore - get_attr
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - do_ops_call: outdata just after exec: buffer::list(len=18,
buffer::ptr(0~18 0x6000000f7000 in raw 0x6000000f7000 len 4000 nref 1)
)
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:33 on 0x60000085fe84
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x60000085fed0; src 0x60000085fe84
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x60000085fe84
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x60000085fed0
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x60000085fea0; src 0x60000085fed0
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x60000085ff40; src 0x60000085fea0
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x60000085fea0
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x6000008360b0; src 0x60000085ff40
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x60000085ff40
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x7ffe48b3ee00; src 0x6000008360b0
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x6000008360b0
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x600000801a38; src 0x7ffe48b3ee00
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x7ffe48b3ee00
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x7ffe48b3ee00; src 0x600000801a38
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x600000801a38
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x7ffe48b3edc0; src 0x7ffe48b3ee00
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x7ffe48b3ee00
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x600000807bf8; src 0x7ffe48b3edc0
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x7ffe48b3edc0
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - scope_dbgr:39 move on 0x7ffe48b3ed40; src 0x600000807bf8
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - do_ops_call: outdata just after then_interruptible: buffer::list(len=18,
buffer:ptr(0~18 no raw)
)
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - do_op_call: method returned ret=0, outdata.length()=18 while num_read=1, num_write=0
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x7ffe48b3ed40
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - ~scope_dbgr:42 on 0x600000807bf8
DEBUG 2021-04-21 16:06:24,354 [shard 0] osd - do_osd_ops: osd_op(client.4131.0:4 1.e 1:
70fc7f5f:test-rados-api-o06-82166-2::foo:head {call lock.get_info in=31b out=18b}
snapc 0={} ondisk+read+known_if_redirected e6) v8 - object 1:
70fc7f5f:test-rados-api-o06-82166-2::foo:head all operations successful
Reactor stalled for 521 ms on shard 0.
Backtrace:
0x00000000017101b8
0x00000000016d7960
0x00000000016d7edc
0x00000000016d810f
/lib64/libpthread.so.0+0x0000000000012b2f
0x00000000016b563e
0x00000000016b801c
0x00000000016b8504
0x0000000000d68120
```
[1]: http://pulpito.front.sepia.ceph.com/rzarzynski-2021-04-19_12:11:51-rados-master-distro-basic-smithi/
6058778/
Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>