]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/osd: always dencode object_info_t with hobject_t
authorRadosław Zarzyński <rzarzyns@redhat.com>
Tue, 22 Aug 2023 13:17:54 +0000 (15:17 +0200)
committerRadoslaw Zarzynski <rzarzyns@redhat.com>
Tue, 27 Jan 2026 14:37:29 +0000 (14:37 +0000)
The classical OSD never uses the the `encode_no_oid()`
and `decode_no_oid()` variants of object info's encoding
machinery. crimson departed from this to such an extent
on recovery it sends `OI_ATTR` with meaningless `soid`-
related bytes. The net result is the following crash in
an heterogenous cluster:

```
    -7> 2023-08-22T14:36:57.621+0200 7f25113c8700  0 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] get_object_context:11843: soid.pool=1
    -6> 2023-08-22T14:36:57.621+0200 7f25113c8700 10 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] get_object_context: obc NOT found in cache: 1:30306672:devicehealth::main.db.0000000000000000:head
    -5> 2023-08-22T14:36:57.621+0200 7f25113c8700  0 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] get_object_context:11865 bv came from OI_ATTR
    -4> 2023-08-22T14:36:57.621+0200 7f25113c8700  0 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] get_object_context:11897 just after bv decode oi.soid.pool=-9223372036854775808
    -3> 2023-08-22T14:36:57.621+0200 7f25113c8700  0 osd.1 pg_epoch: 62 pg[1.0( v 42'43 lc 39'37 (0'0,42'43] local-lis/les=45/58 n=4 ec=6/6 lis/c=45/6 les/c/f=58/8/0 sis=45) [1] r=0 lpr=53 pi=[6,45)/1 crt=42'43 lcod 0'0 mlcod 0'0 active+recovering rops=2 m=2 mbc={255={(0+1)=2}}] soid.pool=-9223372036854775808 info.pgid.pool()=1
    -2> 2023-08-22T14:36:57.641+0200 7f25113c8700 -1 ../src/osd/PrimaryLogPG.cc: In function 'ObjectContextRef PrimaryLogPG::get_object_context(const hobject_t&, bool, const std::map<std::__cxx11::basic_string<char>, ceph::buffer::v15_2_0::list, std::less<void> >*)' thread 7f25113c8700 time 2023-08-22T14:36:57.625282+0200
../src/osd/PrimaryLogPG.cc: 11906: FAILED ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool())

 ceph version 18.0.0-4293-g6cd888f2315 (6cd888f2315e525412ee7d39587d11d0a19245c2) reef (dev)
 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x127) [0x5602381712da]
 2: (ceph::register_assert_context(ceph::common::CephContext*)+0) [0x560238171505]
 3: (PrimaryLogPG::get_object_context(hobject_t const&, bool, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<void>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > > const*)+0xc9e) [0x560237b5aec6]
 4: (non-virtual thunk to PrimaryLogPG::get_obc(hobject_t const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<void>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > > const&)+0x30) [0x560237c2f9d5]
 5: (ReplicatedBackend::handle_pull_response(pg_shard_t, PushOp const&, PullOp*, std::__cxx11::list<ReplicatedBackend::pull_complete_info, std::allocator<ReplicatedBackend::pull_complete_info> >*, ceph::os::Transaction*)+0x5dc) [0x560237e9d116]
 6: (ReplicatedBackend::_do_pull_response(boost::intrusive_ptr<OpRequest>)+0x477) [0x560237e9e6d5]
 7: (ReplicatedBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x1e1) [0x560237e9f2ef]
 8: (PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x59) [0x560237c3f3bb]
 9: (PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x93f) [0x560237bb7bdd]
 10: (OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x531) [0x5602379dcb41]
 11: (ceph::osd::scheduler::PGRecoveryMsg::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x22c) [0x560237d5b8f8]
 12: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x3386) [0x5602379ff9f4]
 13: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x979) [0x56023815f9c3]
 14: (ShardedThreadPool::WorkThreadSharded::entry()+0x17) [0x560238163279]
 15: (Thread::entry_wrapper()+0x43) [0x56023814c0af]
 16: (Thread::_entry_func(void*)+0xd) [0x56023814c0cb]
 17: /lib/x86_64-linux-gnu/libpthread.so.0(+0x8609) [0x7f2531844609]
 18: clone()
```

Fixes: https://tracker.ceph.com/issues/62526
Signed-off-by: Radosław Zarzyński <rzarzyns@redhat.com>
src/crimson/osd/osd_operations/snaptrim_event.cc
src/crimson/osd/pg_backend.cc
src/crimson/osd/replicated_recovery_backend.cc

index c3ac6c19787e7ba5fa271e32ab343a2be9eb0884..650f37c299a399b5b6270540c16f89996f825805 100644 (file)
@@ -255,7 +255,7 @@ SnapTrimObjSubEvent::adjust_snaps(
   obc->obs.oi.prior_version = obc->obs.oi.version;
   obc->obs.oi.version = osd_op_p.at_version;
   ceph::bufferlist bl;
-  obc->obs.oi.encode_no_oid(
+  obc->obs.oi.encode(
     bl,
     pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
   txn.setattr(
@@ -303,7 +303,7 @@ void SnapTrimObjSubEvent::update_head(
   attrs[SS_ATTR] = std::move(bl);
 
   bl.clear();
-  head_obc->obs.oi.encode_no_oid(bl,
+  head_obc->obs.oi.encode(bl,
     pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
   attrs[OI_ATTR] = std::move(bl);
   txn.setattrs(
index ecdf7efb6fa6b2d7dfba239f1137cb98c0825275..4a66f72d421bdc1310e187776dd6c9cba3db0735 100644 (file)
@@ -94,9 +94,8 @@ PGBackend::load_metadata(const hobject_t& oid)
         if (auto oiiter = attrs.find(OI_ATTR); oiiter != attrs.end()) {
           bufferlist bl = std::move(oiiter->second);
           try {
-            ret->os = ObjectState(
-              object_info_t(bl, oid),
-              true);
+            ret->os = ObjectState(object_info_t(bl), true);
+            ceph_assert(oid == ret->os.oi.soid);
           } catch (const buffer::error&) {
             logger().warn("unable to decode ObjectState");
             throw crimson::osd::invalid_argument();
@@ -1856,7 +1855,7 @@ void PGBackend::set_metadata(
   ceph_assert((obj.is_head() && ss) || (!obj.is_head() && !ss));
   {
     ceph::bufferlist bv;
-    oi.encode_no_oid(bv, CEPH_FEATURES_ALL);
+    oi.encode(bv, CEPH_FEATURES_ALL);
     txn.setattr(coll->get_cid(), ghobject_t{obj}, OI_ATTR, bv);
   }
   if (ss) {
index 5503063cbc9d810d06fb6235755d73034f34a1e9..c9427ef96456c36a8f5e05fab44cbc4e8f9ee8a9 100644 (file)
@@ -635,7 +635,7 @@ ReplicatedRecoveryBackend::read_metadata_for_push_op(
     }
     DEBUGDPP("{}", pg, push_op->attrset[OI_ATTR]);
     object_info_t oi;
-    oi.decode_no_oid(push_op->attrset[OI_ATTR]);
+    oi.decode(push_op->attrset[OI_ATTR]);
     new_progress.first = false;
     return oi.version;
   });
@@ -1197,7 +1197,7 @@ ReplicatedRecoveryBackend::prep_push_target(
     t->remove(coll->get_cid(), target_oid);
     t->touch(coll->get_cid(), target_oid);
     object_info_t oi;
-    oi.decode_no_oid(attrs.at(OI_ATTR));
+    oi.decode(attrs.at(OI_ATTR));
     t->set_alloc_hint(coll->get_cid(), target_oid,
                       oi.expected_object_size,
                       oi.expected_write_size,
@@ -1373,7 +1373,8 @@ ReplicatedRecoveryBackend::get_md_from_push_op(PushOp &push_op)
 {
   LOG_PREFIX(ReplicatedRecoveryBackend::get_md_from_push_op);
   object_info_t oi;
-  oi.decode_no_oid(push_op.attrset.at(OI_ATTR), push_op.soid);
+  oi.decode(push_op.attrset.at(OI_ATTR));
+  assert(oi.soid == push_op.soid);
 
   crimson::osd::SnapSetContextRef ssc;
   if (auto ss_attr_iter = push_op.attrset.find(SS_ATTR);