The classical OSD never uses the the `encode_no_oid()`
and `decode_no_oid()` variants of object info's encoding
machinery. crimson departed from this to such an extent
on recovery it sends `OI_ATTR` with meaningless `soid`-
related bytes. The net result is the following crash in
an heterogenous cluster:
```
-7> 2023-08-22T14:36:57.621+0200
7f25113c8700 0 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] get_object_context:11843: soid.pool=1
-6> 2023-08-22T14:36:57.621+0200
7f25113c8700 10 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] get_object_context: obc NOT found in cache: 1:
30306672:devicehealth::main.db.
0000000000000000:head
-5> 2023-08-22T14:36:57.621+0200
7f25113c8700 0 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] get_object_context:11865 bv came from OI_ATTR
-4> 2023-08-22T14:36:57.621+0200
7f25113c8700 0 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] get_object_context:11897 just after bv decode oi.soid.pool=-
9223372036854775808
-3> 2023-08-22T14:36:57.621+0200
7f25113c8700 0 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] soid.pool=-
9223372036854775808 info.pgid.pool()=1
-2> 2023-08-22T14:36:57.641+0200
7f25113c8700 -1 ../src/osd/PrimaryLogPG.cc: In function 'ObjectContextRef PrimaryLogPG::get_object_context(const hobject_t&, bool, const std::map<std::__cxx11::basic_string<char>, ceph::buffer::v15_2_0::list, std::less<void> >*)' thread
7f25113c8700 time 2023-08-22T14:36:57.625282+0200
../src/osd/PrimaryLogPG.cc: 11906: FAILED ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool())
ceph version
18.0.0-4293-g6cd888f2315 (
6cd888f2315e525412ee7d39587d11d0a19245c2) reef (dev)
1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x127) [0x5602381712da]
2: (ceph::register_assert_context(ceph::common::CephContext*)+0) [0x560238171505]
3: (PrimaryLogPG::get_object_context(hobject_t const&, bool, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<void>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > > const*)+0xc9e) [0x560237b5aec6]
4: (non-virtual thunk to PrimaryLogPG::get_obc(hobject_t const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<void>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > > const&)+0x30) [0x560237c2f9d5]
5: (ReplicatedBackend::handle_pull_response(pg_shard_t, PushOp const&, PullOp*, std::__cxx11::list<ReplicatedBackend::pull_complete_info, std::allocator<ReplicatedBackend::pull_complete_info> >*, ceph::os::Transaction*)+0x5dc) [0x560237e9d116]
6: (ReplicatedBackend::_do_pull_response(boost::intrusive_ptr<OpRequest>)+0x477) [0x560237e9e6d5]
7: (ReplicatedBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x1e1) [0x560237e9f2ef]
8: (PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x59) [0x560237c3f3bb]
9: (PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x93f) [0x560237bb7bdd]
10: (OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x531) [0x5602379dcb41]
11: (ceph::osd::scheduler::PGRecoveryMsg::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x22c) [0x560237d5b8f8]
12: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x3386) [0x5602379ff9f4]
13: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x979) [0x56023815f9c3]
14: (ShardedThreadPool::WorkThreadSharded::entry()+0x17) [0x560238163279]
15: (Thread::entry_wrapper()+0x43) [0x56023814c0af]
16: (Thread::_entry_func(void*)+0xd) [0x56023814c0cb]
17: /lib/x86_64-linux-gnu/libpthread.so.0(+0x8609) [0x7f2531844609]
18: clone()
```
Fixes: https://tracker.ceph.com/issues/62526
Signed-off-by: Radosław Zarzyński <rzarzyns@redhat.com>
obc->obs.oi.prior_version = obc->obs.oi.version;
obc->obs.oi.version = osd_op_p.at_version;
ceph::bufferlist bl;
- obc->obs.oi.encode_no_oid(
+ obc->obs.oi.encode(
bl,
pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
txn.setattr(
attrs[SS_ATTR] = std::move(bl);
bl.clear();
- head_obc->obs.oi.encode_no_oid(bl,
+ head_obc->obs.oi.encode(bl,
pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
attrs[OI_ATTR] = std::move(bl);
txn.setattrs(
if (auto oiiter = attrs.find(OI_ATTR); oiiter != attrs.end()) {
bufferlist bl = std::move(oiiter->second);
try {
- ret->os = ObjectState(
- object_info_t(bl, oid),
- true);
+ ret->os = ObjectState(object_info_t(bl), true);
+ ceph_assert(oid == ret->os.oi.soid);
} catch (const buffer::error&) {
logger().warn("unable to decode ObjectState");
throw crimson::osd::invalid_argument();
ceph_assert((obj.is_head() && ss) || (!obj.is_head() && !ss));
{
ceph::bufferlist bv;
- oi.encode_no_oid(bv, CEPH_FEATURES_ALL);
+ oi.encode(bv, CEPH_FEATURES_ALL);
txn.setattr(coll->get_cid(), ghobject_t{obj}, OI_ATTR, bv);
}
if (ss) {
}
DEBUGDPP("{}", pg, push_op->attrset[OI_ATTR]);
object_info_t oi;
- oi.decode_no_oid(push_op->attrset[OI_ATTR]);
+ oi.decode(push_op->attrset[OI_ATTR]);
new_progress.first = false;
return oi.version;
});
t->remove(coll->get_cid(), target_oid);
t->touch(coll->get_cid(), target_oid);
object_info_t oi;
- oi.decode_no_oid(attrs.at(OI_ATTR));
+ oi.decode(attrs.at(OI_ATTR));
t->set_alloc_hint(coll->get_cid(), target_oid,
oi.expected_object_size,
oi.expected_write_size,
{
LOG_PREFIX(ReplicatedRecoveryBackend::get_md_from_push_op);
object_info_t oi;
- oi.decode_no_oid(push_op.attrset.at(OI_ATTR), push_op.soid);
+ oi.decode(push_op.attrset.at(OI_ATTR));
+ assert(oi.soid == push_op.soid);
crimson::osd::SnapSetContextRef ssc;
if (auto ss_attr_iter = push_op.attrset.find(SS_ATTR);