]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations: PG log changes for partial logs
authorBill Scales <bill_scales@uk.ibm.com>
Wed, 26 Mar 2025 13:04:46 +0000 (13:04 +0000)
committerAlex Ainscow <aainscow@uk.ibm.com>
Tue, 22 Apr 2025 07:17:03 +0000 (08:17 +0100)
Optimized EC pools will not add a log entry for shards that
are not modified by a partial write. This means the shard
will have a partial copy of the log.

There are several asserts in PGLog that assume that the log
is contiguous, these need to be relaxed when it is an optimized
EC pool (other pools retain the full strength asserts).

During peering the primary may provide a complete log to a
non-primary shard to merge into its log. This merge can skip
log entries for partial writes that do not update the shard.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/PGLog.cc
src/osd/PGLog.h
src/osd/PeeringState.cc
src/test/osd/TestPGLog.cc

index 6f9cadda70714193362b883b92a935fd1ed9c6cf..07e3f30b6c820e6a1f321f8611ab061f76127822 100644 (file)
@@ -224,7 +224,8 @@ void PGLog::proc_replica_log(
   pg_info_t &oinfo,
   const pg_log_t &olog,
   pg_missing_t& omissing,
-  pg_shard_t from) const
+  pg_shard_t from,
+  bool ec_optimizations_enabled) const
 {
   dout(10) << "proc_replica_log for osd." << from << ": "
           << oinfo << " " << olog << " " << omissing << dendl;
@@ -301,6 +302,7 @@ void PGLog::proc_replica_log(
     olog.get_can_rollback_to(),
     omissing,
     0,
+    ec_optimizations_enabled,
     this);
 
   if (lu < oinfo.last_update) {
@@ -333,7 +335,8 @@ void PGLog::proc_replica_log(
  */
 void PGLog::rewind_divergent_log(eversion_t newhead,
                                 pg_info_t &info, LogEntryHandler *rollbacker,
-                                bool &dirty_info, bool &dirty_big_info)
+                                bool &dirty_info, bool &dirty_big_info,
+                                bool ec_optimizations_enabled)
 {
   dout(10) << "rewind_divergent_log truncate divergent future " <<
     newhead << dendl;
@@ -362,6 +365,7 @@ void PGLog::rewind_divergent_log(eversion_t newhead,
     original_crt,
     missing,
     rollbacker,
+    ec_optimizations_enabled,
     this);
 
   dirty_info = true;
@@ -430,7 +434,8 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd,
 
   // do we have divergent entries to throw out?
   if (olog.head < log.head) {
-    rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info);
+    rewind_divergent_log(olog.head, info, rollbacker,
+                        dirty_info, dirty_big_info, ec_optimizations_enabled);
     changed = true;
   }
 
@@ -489,6 +494,7 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd,
       original_crt,
       missing,
       rollbacker,
+      ec_optimizations_enabled,
       this);
 
     info.last_update = log.head = olog.head;
index d668874e88182bb2ddb2bedd9e85520621f5233b..455fc4464185016d829c0ad4967f7e9f20103bba 100644 (file)
@@ -972,7 +972,8 @@ public:
 
   void proc_replica_log(pg_info_t &oinfo,
                        const pg_log_t &olog,
-                       pg_missing_t& omissing, pg_shard_t from) const;
+                       pg_missing_t& omissing, pg_shard_t from,
+                       bool ec_optimizations_enabled) const;
 
   void set_missing_may_contain_deletes() {
     missing.may_include_deletes = true;
@@ -1022,6 +1023,7 @@ protected:
     eversion_t olog_can_rollback_to,     ///< [in] rollback boundary of input InedexedLog
     missing_type &missing,               ///< [in,out] missing to adjust, use
     LogEntryHandler *rollbacker,         ///< [in] optional rollbacker object
+    bool ec_optimizations_enabled,       ///< [in] relax asserts for allow_ec_optimzations pools
     const DoutPrefixProvider *dpp        ///< [in] logging provider
     ) {
     ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid
@@ -1062,7 +1064,13 @@ protected:
        // in increasing order of version
        ceph_assert(i->version > last);
        // prior_version correct (unless it is an ERROR entry)
-       ceph_assert(i->prior_version == last || i->is_error());
+       if (ec_optimizations_enabled) {
+         // With partial writes prior_verson may be > last because of
+         // skipped log entries
+         ceph_assert(i->prior_version >= last || i->is_error());
+       } else {
+         ceph_assert(i->prior_version == last || i->is_error());
+       }
       }
       if (i->is_error()) {
        ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
@@ -1103,8 +1111,15 @@ protected:
       // ensure missing has been updated appropriately
       if (objiter->second->is_update() ||
          (missing.may_include_deletes && objiter->second->is_delete())) {
-       ceph_assert(missing.is_missing(hoid) &&
-              missing.get_items().at(hoid).need == objiter->second->version);
+       if (ec_optimizations_enabled) {
+         // relax the assert for partial writes - missing may be newer than the
+         // most recent log entry
+         ceph_assert(missing.is_missing(hoid) &&
+                     missing.get_items().at(hoid).need >= objiter->second->version);
+       } else {
+         ceph_assert(missing.is_missing(hoid) &&
+                     missing.get_items().at(hoid).need == objiter->second->version);
+       }
       } else {
        ceph_assert(!missing.is_missing(hoid));
       }
@@ -1235,6 +1250,7 @@ protected:
     eversion_t olog_can_rollback_to,     ///< [in] rollback boundary of input IndexedLog
     missing_type &omissing,              ///< [in,out] missing to adjust, use
     LogEntryHandler *rollbacker,         ///< [in] optional rollbacker object
+    bool ec_optimizations_enabled,       ///< [in] relax asserts for allow_ec_optimzations pools
     const DoutPrefixProvider *dpp        ///< [in] logging provider
     ) {
     std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split;
@@ -1248,6 +1264,7 @@ protected:
        olog_can_rollback_to,
        omissing,
        rollbacker,
+       ec_optimizations_enabled,
        dpp);
     }
   }
@@ -1271,6 +1288,7 @@ protected:
       log.get_can_rollback_to(),
       missing,
       rollbacker,
+      false, // not allow_ec_optimizations pool
       this);
   }
 
@@ -1282,7 +1300,8 @@ public:
                             pg_info_t &info,
                             LogEntryHandler *rollbacker,
                             bool &dirty_info,
-                            bool &dirty_big_info);
+                            bool &dirty_big_info,
+                           bool ec_optimizations_enabled);
 
   void merge_log(pg_info_t &oinfo,
                 pg_log_t&& olog,
@@ -1313,7 +1332,11 @@ public:
       invalidate_stats = invalidate_stats || !p->is_error();
       if (log) {
        ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl;
-       log->add(*p);
+        // Skip the log entry if it is a partial write that did not involve
+        // this shard
+        if (!pool.is_nonprimary_shard(shard) || p->is_written_shard(shard)) {
+         log->add(*p);
+       }
       }
       if (p->soid <= last_backfill &&
          !p->is_error()) {
index 483fe12bd3d8400272a83ffe1717e28b5b2996b0..0f49351d69a1f747a3122582c145445238740246 100644 (file)
@@ -3186,7 +3186,8 @@ void PeeringState::rewind_divergent_log(
 {
   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
   pg_log.rewind_divergent_log(
-    newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
+    newhead, info, rollbacker.get(), dirty_info, dirty_big_info,
+    pool.info.allows_ecoptimizations());
 }
 
 
@@ -3257,7 +3258,7 @@ void PeeringState::proc_replica_log(
   psdout(10) << "proc_replica_log for osd." << from << ": "
             << oinfo << " " << olog << " " << omissing << dendl;
 
-  pg_log.proc_replica_log(oinfo, olog, omissing, from);
+  pg_log.proc_replica_log(oinfo, olog, omissing, from, pool.info.allows_ecoptimizations());
 
   peer_info[from] = oinfo;
   update_peer_info(from, oinfo);
index 6aa7190b97482acd246050562370f2881e3dadf1..ef6b8758dd3584f39f99c9a8b495676ad8e9ada1 100644 (file)
@@ -313,7 +313,7 @@ public:
     pg_info_t oinfo = tcase.get_divinfo();
 
     proc_replica_log(
-      oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)));
+      oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)), false);
 
     ceph_assert(oinfo.last_update >= log.tail);
 
@@ -434,7 +434,7 @@ TEST_F(PGLogTest, rewind_divergent_log) {
 
     TestHandler h(remove_snap);
     rewind_divergent_log(newhead, info, &h,
-                        dirty_info, dirty_big_info);
+                        dirty_info, dirty_big_info, false);
 
     EXPECT_TRUE(log.objects.count(divergent));
     EXPECT_TRUE(missing.is_missing(divergent_object));
@@ -499,7 +499,7 @@ TEST_F(PGLogTest, rewind_divergent_log) {
 
     TestHandler h(remove_snap);
     rewind_divergent_log(newhead, info, &h,
-                        dirty_info, dirty_big_info);
+                        dirty_info, dirty_big_info, false);
 
     EXPECT_TRUE(missing.is_missing(divergent_object));
     EXPECT_EQ(0U, log.objects.count(divergent_object));
@@ -538,7 +538,7 @@ TEST_F(PGLogTest, rewind_divergent_log) {
     TestHandler h(remove_snap);
     roll_forward_to(eversion_t(1, 6), &info, &h);
     rewind_divergent_log(eversion_t(1, 5), info, &h,
-                        dirty_info, dirty_big_info);
+                        dirty_info, dirty_big_info, false);
     pg_log_t log;
     reset_backfill_claim_log(log, &info, &h);
   }
@@ -1417,7 +1417,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(last_complete, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from);
+    proc_replica_log(oinfo, olog, omissing, from, false);
 
     EXPECT_FALSE(omissing.have_missing());
     EXPECT_EQ(last_update, oinfo.last_update);
@@ -1491,7 +1491,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from);
+    proc_replica_log(oinfo, olog, omissing, from, false);
 
     EXPECT_FALSE(omissing.have_missing());
   }
@@ -1593,7 +1593,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from);
+    proc_replica_log(oinfo, olog, omissing, from, false);
 
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.is_missing(divergent_object));
@@ -1680,7 +1680,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from);
+    proc_replica_log(oinfo, olog, omissing, from, false);
 
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.is_missing(divergent_object));
@@ -1770,7 +1770,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from);
+    proc_replica_log(oinfo, olog, omissing, from, false);
 
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.is_missing(divergent_object));
@@ -1864,7 +1864,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from);
+    proc_replica_log(oinfo, olog, omissing, from, false);
 
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.get_items().begin()->second.need == eversion_t(1, 1));
@@ -2944,7 +2944,7 @@ TEST_F(PGLogTest, _merge_object_divergent_entries) {
                                     orig_entries, oinfo,
                                     log.get_can_rollback_to(),
                                     missing, &rollbacker,
-                                    this);
+                                    false, this);
     // No core dump
   }
   {
@@ -2970,7 +2970,7 @@ TEST_F(PGLogTest, _merge_object_divergent_entries) {
                                     orig_entries, oinfo,
                                     log.get_can_rollback_to(),
                                     missing, &rollbacker,
-                                    this);
+                                    false, this);
     // No core dump
   }
 }