]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations: Partial write changes to add_next_event
authorBill Scales <156200352+bill-scales@users.noreply.github.com>
Thu, 6 Mar 2025 09:47:17 +0000 (09:47 +0000)
committerAlex Ainscow <aainscow@uk.ibm.com>
Tue, 22 Apr 2025 07:17:03 +0000 (08:17 +0100)
add_next_event is used during peering to process log entries
that a shard is missing to build up a list of missing objects.
With EC optimized pools and partial writes not every update
modifies every shard. The log entry contains details of which
shards were modified and this can be used to work out whether
a missing entry needs to be created/updated.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/PGLog.cc
src/osd/PGLog.h
src/osd/PeeringState.cc
src/osd/PeeringState.h
src/osd/osd_types.h
src/test/osd/TestPGLog.cc
src/test/osd/types.cc

index 5831cef8c2b612d47b7120ecc20060a1538c791f..6f9cadda70714193362b883b92a935fd1ed9c6cf 100644 (file)
@@ -369,8 +369,10 @@ void PGLog::rewind_divergent_log(eversion_t newhead,
 }
 
 void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd,
-                      pg_info_t &info, LogEntryHandler *rollbacker,
-                      bool &dirty_info, bool &dirty_big_info)
+                      pg_info_t &info, const pg_pool_t &pool, pg_shard_t toosd,
+                     LogEntryHandler *rollbacker,
+                      bool &dirty_info, bool &dirty_big_info,
+                     bool ec_optimizations_enabled)
 {
   dout(10) << "merge_log " << olog << " from osd." << fromosd
            << " into " << log << dendl;
@@ -476,6 +478,8 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd,
       &log,
       missing,
       rollbacker,
+      pool,
+      toosd.shard,
       this);
 
     _merge_divergent_entries(
index a6434098d2d34299045bc590aff44e062e7b1df5..d668874e88182bb2ddb2bedd9e85520621f5233b 100644 (file)
@@ -796,8 +796,8 @@ public:
     missing.add(oid, need, have, is_delete);
   }
 
-  void missing_add_next_entry(const pg_log_entry_t& e) {
-    missing.add_next_event(e);
+  void missing_add_next_entry(const pg_log_entry_t& e, const pg_pool_t &pool, shard_id_t shard) {
+    missing.add_next_event(e, pool, shard);
   }
 
   //////////////////// get or std::set log ////////////////////
@@ -1287,8 +1287,12 @@ public:
   void merge_log(pg_info_t &oinfo,
                 pg_log_t&& olog,
                 pg_shard_t from,
-                pg_info_t &info, LogEntryHandler *rollbacker,
-                bool &dirty_info, bool &dirty_big_info);
+                pg_info_t &info,
+                const pg_pool_t &pool,
+                pg_shard_t to,
+                LogEntryHandler *rollbacker,
+                bool &dirty_info, bool &dirty_big_info,
+                bool ec_optimizations_enabled);
 
   template <typename missing_type>
   static bool append_log_entries_update_missing(
@@ -1298,6 +1302,8 @@ public:
     IndexedLog *log,
     missing_type &missing,
     LogEntryHandler *rollbacker,
+    const pg_pool_t &pool,
+    shard_id_t shard,
     const DoutPrefixProvider *dpp) {
     bool invalidate_stats = false;
     if (log && !entries.empty()) {
@@ -1312,12 +1318,12 @@ public:
       if (p->soid <= last_backfill &&
          !p->is_error()) {
        if (missing.may_include_deletes) {
-         missing.add_next_event(*p);
+         missing.add_next_event(*p, pool, shard);
        } else {
          if (p->is_delete()) {
            missing.rm(p->soid, p->version);
          } else {
-           missing.add_next_event(*p);
+           missing.add_next_event(*p, pool, shard);
          }
          if (rollbacker) {
            // hack to match PG::mark_all_unfound_lost
@@ -1335,7 +1341,10 @@ public:
   bool append_new_log_entries(
     const hobject_t &last_backfill,
     const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+    pg_info_t *info,
     LogEntryHandler *rollbacker,
+    const pg_pool_t &pool,
+    shard_id_t shard,
     bool ec_optimizations_enabled) {
     bool invalidate_stats = append_log_entries_update_missing(
       last_backfill,
@@ -1344,6 +1353,8 @@ public:
       &log,
       missing,
       rollbacker,
+      pool,
+      shard,
       this);
     if (!entries.empty()) {
       mark_writeout_from(entries.begin()->version);
index 89f6037406ef7467603963f3d7c638cd6bfe8d7f..09f2b45f0fe9b3527dff4ea66c3d309ac62e5d12 100644 (file)
@@ -3003,7 +3003,7 @@ void PeeringState::activate(
            if (perform_deletes_during_peering() && p->is_delete()) {
              pm.rm(p->soid, p->version);
            } else {
-             pm.add_next_event(*p);
+             pm.add_next_event(*p, pool.info, peer.shard);
            }
          }
        }
@@ -3149,8 +3149,9 @@ void PeeringState::merge_log(
 {
   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
   pg_log.merge_log(
-    oinfo, std::move(olog), from, info, rollbacker.get(),
-    dirty_info, dirty_big_info);
+    oinfo, std::move(olog), from, info, pool.info, pg_whoami,
+    rollbacker.get(), dirty_info, dirty_big_info,
+    pool.info.allows_ecoptimizations());
 }
 
 void PeeringState::rewind_divergent_log(
@@ -4229,7 +4230,10 @@ bool PeeringState::append_log_entries_update_missing(
     pg_log.append_new_log_entries(
       info.last_backfill,
       entries,
+      &info,
       rollbacker.get(),
+      pool.info,
+      pg_whoami.shard,
       pool.info.allows_ecoptimizations());
 
   if (pg_committed_to && entries.rbegin()->soid > info.last_backfill) {
@@ -4288,6 +4292,8 @@ void PeeringState::merge_new_log_entries(
       NULL,
       pmissing,
       NULL,
+      pool.info,
+      peer.shard,
       dpp);
     pinfo.last_update = info.last_update;
     pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
@@ -4542,7 +4548,7 @@ void PeeringState::pre_submit_op(
       continue;
     requires_missing_loc = true;
     for (auto &&entry: logv) {
-      peer_missing[i].add_next_event(entry);
+      peer_missing[i].add_next_event(entry, pool.info, i.shard);
     }
   }
 
index 1a7323b4127c666ec534d564819e32459d0bbe30..a28c09b52765c75f1bf4203d5e23104eafa3f3b2 100644 (file)
@@ -1982,7 +1982,7 @@ public:
 
   /// Update missing set to reflect e (TODOSAM: not sure why this is needed)
   void add_local_next_event(const pg_log_entry_t& e) {
-    pg_log.missing_add_next_entry(e);
+    pg_log.missing_add_next_entry(e, pool.info, pg_whoami.shard);
   }
 
   /// Update log trim boundary
index be34ad0f717b74658bc93bb13d656867b8db88b2..f6e10c675532fe63433e92e38955d23db5882140 100644 (file)
@@ -5091,10 +5091,11 @@ public:
    * this needs to be called in log order as we extend the log.  it
    * assumes missing is accurate up through the previous log entry.
    */
-  void add_next_event(const pg_log_entry_t& e) {
+  void add_next_event(const pg_log_entry_t& e, const pg_pool_t &pool, shard_id_t shard) {
     std::map<hobject_t, item>::iterator missing_it;
     missing_it = missing.find(e.soid);
     bool is_missing_divergent_item = missing_it != missing.end();
+    bool skipped = false;
     if (e.prior_version == eversion_t() || e.is_clone()) {
       // new object.
       if (is_missing_divergent_item) {  // use iterator
@@ -5102,6 +5103,9 @@ public:
         // .have = nil
         missing_it->second = item(e.version, eversion_t(), e.is_delete());
         missing_it->second.clean_regions.mark_fully_dirty();
+      } else if (pool.is_nonprimary_shard(shard) && !e.is_written_shard(shard)) {
+       // new object, partial write and not already missing - skip
+       skipped = true;
       } else {
          // create new element in missing map
          // .have = nil
@@ -5117,6 +5121,9 @@ public:
         missing_it->second.clean_regions.mark_fully_dirty();
       else
         missing_it->second.clean_regions.merge(e.clean_regions);
+    } else if (pool.is_nonprimary_shard(shard) && !e.is_written_shard(shard)) {
+      // existing object, partial write and not already missing - skip
+      skipped = true;
     } else {
       // not missing, we must have prior_version (if any)
       ceph_assert(!is_missing_divergent_item);
@@ -5126,8 +5133,10 @@ public:
       else
         missing[e.soid].clean_regions = e.clean_regions;
     }
-    rmissing[e.version.version] = e.soid;
-    tracker.changed(e.soid);
+    if (!skipped) {
+      rmissing[e.version.version] = e.soid;
+      tracker.changed(e.soid);
+    }
   }
 
   void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
index 5b85e0f9d2e880a1fd50c4770f3d39fcdd79d8ac..6aa7190b97482acd246050562370f2881e3dadf1 100644 (file)
@@ -294,7 +294,7 @@ public:
     bool dirty_big_info = false;
     merge_log(
       oinfo, std::move(olog), pg_shard_t(1, shard_id_t(0)), info,
-      &h, dirty_info, dirty_big_info);
+      pg_pool_t(), pg_shard_t(), &h, dirty_info, dirty_big_info, false);
 
     ASSERT_EQ(info.last_update, oinfo.last_update);
     verify_missing(tcase, missing);
@@ -328,7 +328,7 @@ public:
        if (i->is_delete() && tcase.deletes_during_peering) {
          omissing.rm(i->soid, i->version);
        } else {
-         omissing.add_next_event(*i);
+         omissing.add_next_event(*i, pg_pool_t(), shard_id_t());
        }
       }
     }
@@ -613,7 +613,7 @@ TEST_F(PGLogTest, merge_old_entry) {
     {
       pg_log_entry_t &ne = log.log.front();
       ne.op = pg_log_entry_t::MODIFY;
-      missing.add_next_event(ne);
+      missing.add_next_event(ne, pg_pool_t(), shard_id_t());
       pg_log_entry_t oe;
       oe.mark_unrollbackable();
       oe.version = eversion_t(1,1);
@@ -692,7 +692,7 @@ TEST_F(PGLogTest, merge_old_entry) {
 
     oe.version = eversion_t(2,1);
     oe.op = pg_log_entry_t::MODIFY;
-    missing.add_next_event(oe);
+    missing.add_next_event(oe, pg_pool_t(), shard_id_t());
 
     missing.flush();
     EXPECT_FALSE(is_dirty());
@@ -897,8 +897,8 @@ TEST_F(PGLogTest, merge_log) {
     EXPECT_FALSE(dirty_big_info);
 
     TestHandler h(remove_snap);
-    merge_log(oinfo, std::move(olog), fromosd, info, &h,
-              dirty_info, dirty_big_info);
+    merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(),
+              &h, dirty_info, dirty_big_info, false);
 
     EXPECT_FALSE(missing.have_missing());
     EXPECT_EQ(0U, log.log.size());
@@ -947,8 +947,8 @@ TEST_F(PGLogTest, merge_log) {
     EXPECT_FALSE(dirty_big_info);
 
     TestHandler h(remove_snap);
-    merge_log(oinfo, std::move(olog), fromosd, info, &h,
-              dirty_info, dirty_big_info);
+    merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(),
+              &h,dirty_info, dirty_big_info, false);
 
     EXPECT_FALSE(missing.have_missing());
     EXPECT_EQ(0U, log.log.size());
@@ -1052,8 +1052,8 @@ TEST_F(PGLogTest, merge_log) {
     EXPECT_FALSE(dirty_big_info);
 
     TestHandler h(remove_snap);
-    merge_log(oinfo, std::move(olog), fromosd, info, &h,
-              dirty_info, dirty_big_info);
+    merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(),
+              &h, dirty_info, dirty_big_info, false);
 
     EXPECT_FALSE(missing.have_missing());
     EXPECT_EQ(3U, log.log.size());
@@ -1161,8 +1161,8 @@ TEST_F(PGLogTest, merge_log) {
     EXPECT_FALSE(dirty_big_info);
 
     TestHandler h(remove_snap);
-    merge_log(oinfo, std::move(olog), fromosd, info, &h,
-              dirty_info, dirty_big_info);
+    merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(),
+              &h, dirty_info, dirty_big_info, false);
 
     /* When the divergent entry is a DELETE and the authoritative
        entry is a MODIFY, the object will be added to missing : it is
@@ -1280,8 +1280,8 @@ TEST_F(PGLogTest, merge_log) {
 
     TestHandler h(remove_snap);
     missing.may_include_deletes = false;
-    merge_log(oinfo, std::move(olog), fromosd, info, &h,
-              dirty_info, dirty_big_info);
+    merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(),
+              &h, dirty_info, dirty_big_info, false);
 
     /* When the divergent entry is a DELETE and the authoritative
        entry is a MODIFY, the object will be added to missing : it is
@@ -1382,8 +1382,8 @@ TEST_F(PGLogTest, merge_log) {
 
     TestHandler h(remove_snap);
     missing.may_include_deletes = false;
-    merge_log(oinfo, std::move(olog), fromosd, info, &h,
-              dirty_info, dirty_big_info);
+    merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(),
+              &h, dirty_info, dirty_big_info, false);
 
     EXPECT_FALSE(missing.have_missing());
     EXPECT_EQ(2U, log.log.size());
index 1fd56a272a8c95fe23f5c02e22de3e495502b865..a0da3cae2c2511054f4f3d016a90767875753359 100644 (file)
@@ -1154,7 +1154,7 @@ TEST(pg_missing_t, add_next_event)
     EXPECT_TRUE(e.object_is_indexed());
     EXPECT_TRUE(e.reqid_is_indexed());
     EXPECT_FALSE(missing.is_missing(oid));
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have);
     EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
@@ -1162,7 +1162,7 @@ TEST(pg_missing_t, add_next_event)
     EXPECT_EQ(1U, missing.get_rmissing().size());
 
     // adding the same object replaces the previous one
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_EQ(1U, missing.num_missing());
     EXPECT_EQ(1U, missing.get_rmissing().size());
@@ -1179,7 +1179,7 @@ TEST(pg_missing_t, add_next_event)
     EXPECT_TRUE(e.object_is_indexed());
     EXPECT_FALSE(e.reqid_is_indexed());
     EXPECT_FALSE(missing.is_missing(oid));
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have);
     EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
@@ -1187,7 +1187,7 @@ TEST(pg_missing_t, add_next_event)
     EXPECT_EQ(1U, missing.get_rmissing().size());
 
     // adding the same object replaces the previous one
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_EQ(1U, missing.num_missing());
     EXPECT_EQ(1U, missing.get_rmissing().size());
@@ -1204,7 +1204,7 @@ TEST(pg_missing_t, add_next_event)
     EXPECT_TRUE(e.object_is_indexed());
     EXPECT_TRUE(e.reqid_is_indexed());
     EXPECT_FALSE(missing.is_missing(oid));
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have);
     EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
@@ -1213,7 +1213,7 @@ TEST(pg_missing_t, add_next_event)
 
     // adding the same object with a different version
     e.prior_version = prior_version;
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have);
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_EQ(1U, missing.num_missing());
@@ -1230,7 +1230,7 @@ TEST(pg_missing_t, add_next_event)
     EXPECT_TRUE(e.object_is_indexed());
     EXPECT_TRUE(e.reqid_is_indexed());
     EXPECT_FALSE(missing.is_missing(oid));
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_EQ(prior_version, missing.get_items().at(oid).have);
     EXPECT_EQ(version, missing.get_items().at(oid).need);
@@ -1249,12 +1249,12 @@ TEST(pg_missing_t, add_next_event)
     EXPECT_TRUE(e.object_is_indexed());
     EXPECT_TRUE(e.reqid_is_indexed());
     EXPECT_FALSE(missing.is_missing(oid));
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
 
     e.op = pg_log_entry_t::DELETE;
     EXPECT_TRUE(e.is_delete());
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_TRUE(missing.get_items().at(oid).is_delete());
     EXPECT_EQ(prior_version, missing.get_items().at(oid).have);
@@ -1274,14 +1274,14 @@ TEST(pg_missing_t, add_next_event)
     EXPECT_TRUE(e.object_is_indexed());
     EXPECT_TRUE(e.reqid_is_indexed());
     EXPECT_FALSE(missing.is_missing(oid));
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_FALSE(missing.get_items().at(oid).is_delete());
 
     e.op = pg_log_entry_t::LOST_DELETE;
     e.version.version++;
     EXPECT_TRUE(e.is_delete());
-    missing.add_next_event(e);
+    missing.add_next_event(e, pg_pool_t(), shard_id_t());
     EXPECT_TRUE(missing.is_missing(oid));
     EXPECT_TRUE(missing.get_items().at(oid).is_delete());
     EXPECT_EQ(prior_version, missing.get_items().at(oid).have);