From: Alex Ainscow
Date: Wed, 16 Apr 2025 06:44:25 +0000 (+0100)
Subject: osd: Fix written shards policing for multiple loops through generate.
X-Git-Tag: v20.1.0~68^2~21
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=89e5d5c60efedd253bf70cfd2fdb59255303f972;p=ceph.git

osd: Fix written shards policing for multiple loops through generate.

Signed-off-by: Alex Ainscow
(cherry picked from commit c5f67d3771384e7a780ed6488019fca2509e31c2)
---

diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc
index c094738307e5..6a2d3ef32c90 100644
--- a/src/osd/ECTransaction.cc
+++ b/src/osd/ECTransaction.cc
@@ -51,7 +51,9 @@ void ECTransaction::Generate::encode_and_write() {
   // If partial writes are not supported, pad out to_write to a full stripe.
   if (!sinfo.supports_partial_writes()) {
     for (auto &&[shard, eset]: plan.will_write) {
-      if (sinfo.get_raw_shard(shard) >= sinfo.get_k()) continue;
+      if (sinfo.get_raw_shard(shard) >= sinfo.get_k()) {
+        continue;
+      }
 
       for (auto [off, len]: eset) {
         to_write.zero_pad(shard, off, len);
@@ -176,87 +178,89 @@ ECTransaction::WritePlanObj::WritePlanObj(
   write_superset.align(CEPH_PAGE_SIZE);
   shard_id_set writable_parity_shards = shard_id_set::intersection(sinfo.get_parity_shards(), writable_shards);
-  for (auto shard : writable_parity_shards) {
-    will_write[shard].insert(write_superset);
-  }
-
-  ECUtil::shard_extent_set_t reads(sinfo.get_k_plus_m());
-  ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
-
-  if (!sinfo.supports_partial_writes()) {
-    for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) {
+  if (write_superset.size() > 0) {
+    for (auto shard : writable_parity_shards) {
       will_write[shard].insert(write_superset);
     }
-    will_write.align(sinfo.get_chunk_size());
-    reads = will_write;
-    sinfo.ro_size_to_read_mask(sinfo.ro_offset_to_next_stripe_ro_offset(orig_size), read_mask);
-    reads.intersection_of(read_mask);
-    do_parity_delta_write = false;
-  } else {
-    will_write.align(CEPH_PAGE_SIZE);
-    ECUtil::shard_extent_set_t pdw_reads(will_write);
+    ECUtil::shard_extent_set_t reads(sinfo.get_k_plus_m());
+    ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
 
-    sinfo.ro_size_to_read_mask(ECUtil::align_page_next(orig_size), read_mask);
+    if (!sinfo.supports_partial_writes()) {
+      for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) {
+        will_write[shard].insert(write_superset);
+      }
-    /* Next we need to add the reads required for a conventional write */
-    for (auto shard : sinfo.get_data_shards()) {
-      reads[shard].insert(write_superset);
-      if (will_write.contains(shard)) {
-        reads[shard].subtract(will_write.at(shard));
+      will_write.align(sinfo.get_chunk_size());
+      reads = will_write;
+      sinfo.ro_size_to_read_mask(sinfo.ro_offset_to_next_stripe_ro_offset(orig_size), read_mask);
+      reads.intersection_of(read_mask);
+      do_parity_delta_write = false;
+    } else {
+      will_write.align(CEPH_PAGE_SIZE);
+      ECUtil::shard_extent_set_t pdw_reads(will_write);
+
+      sinfo.ro_size_to_read_mask(ECUtil::align_page_next(orig_size), read_mask);
+
+      /* Next we need to add the reads required for a conventional write */
+      for (auto shard : sinfo.get_data_shards()) {
+        reads[shard].insert(write_superset);
+        if (will_write.contains(shard)) {
+          reads[shard].subtract(will_write.at(shard));
+        }
+        if (reads[shard].empty()) {
+          reads.erase(shard);
+        }
       }
-      if (reads[shard].empty()) {
-        reads.erase(shard);
       }
-    }
 
-    /* We now need to add in the partial page ro writes. This is not particularly
-     * efficient as the are many divs in here, but non-4k aligned writes are
-     * not very efficient anyway
-     */
-    for (auto &&[off, len] : partial_page_ro_writes) {
-      sinfo.ro_range_to_shard_extent_set(
-        off, len, reads);
-    }
+      /* We now need to add in the partial page ro writes. This is not
+       * particularly efficient, as there are many divs in here, but non-4k
+       * aligned writes are not very efficient anyway.
+       */
+      for (auto &&[off, len] : partial_page_ro_writes) {
+        sinfo.ro_range_to_shard_extent_set(
+          off, len, reads);
+      }
 
-    reads.intersection_of(read_mask);
+      reads.intersection_of(read_mask);
 
-    /* Here we decide if we want to do a conventional write or a parity delta write. */
-    if (sinfo.supports_parity_delta_writes() && !object_in_cache &&
-        orig_size == projected_size && !reads.empty()) {
+      /* Here we decide if we want to do a conventional write or a parity delta write. */
+      if (sinfo.supports_parity_delta_writes() && !object_in_cache &&
+          orig_size == projected_size && !reads.empty()) {
 
-      shard_id_set read_shards = reads.get_shard_id_set();
-      shard_id_set pdw_read_shards = pdw_reads.get_shard_id_set();
+        shard_id_set read_shards = reads.get_shard_id_set();
+        shard_id_set pdw_read_shards = pdw_reads.get_shard_id_set();
 
-      if (pdw_write_mode != 0) {
-        do_parity_delta_write = (pdw_write_mode == 2);
-      } else if (!shard_id_set::difference(pdw_read_shards, readable_shards).empty()) {
-        // Some kind of reconstruct would be needed for PDW, so don't bother.
-        do_parity_delta_write = false;
-      } else if (!shard_id_set::difference(read_shards, readable_shards).empty()) {
-        // Some kind of reconstruct is needed for conventional, but NOT for PDW!
-        do_parity_delta_write = true;
-      } else {
-        /* Everything we need for both is available, opt for which ever is less
-         * reads.
-         */
-        do_parity_delta_write = pdw_read_shards.size() < read_shards.size();
-      }
+        if (pdw_write_mode != 0) {
+          do_parity_delta_write = (pdw_write_mode == 2);
+        } else if (!shard_id_set::difference(pdw_read_shards, readable_shards).empty()) {
+          // Some kind of reconstruct would be needed for PDW, so don't bother.
+          do_parity_delta_write = false;
+        } else if (!shard_id_set::difference(read_shards, readable_shards).empty()) {
+          // Some kind of reconstruct is needed for conventional, but NOT for PDW!
+          do_parity_delta_write = true;
+        } else {
+          /* Everything we need for both is available, so opt for whichever
+           * needs fewer reads.
+           */
+          do_parity_delta_write = pdw_read_shards.size() < read_shards.size();
+        }
 
-      if (do_parity_delta_write) {
-        to_read = std::move(pdw_reads);
-        reads.clear(); // So we don't stash it at the end.
-      }
-    }
+        if (do_parity_delta_write) {
+          to_read = std::move(pdw_reads);
+          reads.clear(); // So we don't stash it at the end.
+        }
+      }
 
-    /* NOTE: We intentionally leave un-writable shards in the write plan. As
-     * it is actually less efficient to take them out:- PDWs still need to
-     * compute the deltas and conventional writes still need to calcualte the
-     * parity. The transaction will be dropped by generate_transactions.
-     */
-  }
+      /* NOTE: We intentionally leave un-writable shards in the write plan, as
+       * it is actually less efficient to take them out: PDWs still need to
+       * compute the deltas, and conventional writes still need to calculate
+       * the parity. The transaction will be dropped by generate_transactions.
+       */
+    }
 
-  if (!reads.empty()) {
-    to_read = std::move(reads);
+    if (!reads.empty()) {
+      to_read = std::move(reads);
+    }
   }
 
   /* validate post conditions:
@@ -267,18 +271,21 @@ ECTransaction::WritePlanObj::WritePlanObj(
 }
 
 void ECTransaction::Generate::all_shards_written() {
+  ceph_assert(!written_shards_final);
   if (entry) {
     entry->written_shards.insert_range(shard_id_t(0), sinfo.get_k_plus_m());
   }
 }
 
 void ECTransaction::Generate::shard_written(const shard_id_t shard) {
+  ceph_assert(!written_shards_final);
   if (entry) {
     entry->written_shards.insert(shard);
   }
 }
 
 void ECTransaction::Generate::shards_written(const shard_id_set &shards) {
+  ceph_assert(!written_shards_final);
   if (entry) {
     entry->written_shards.insert(shards);
   }
@@ -374,8 +381,9 @@ void ECTransaction::Generate::process_init() {
         ghobject_t(oid, ghobject_t::NO_GEN, shard));
     }
 
-    if (plan.hinfo && plan.shinfo)
+    if (plan.hinfo && plan.shinfo) {
       plan.hinfo->update_to(*plan.shinfo);
+    }
 
     if (obc) {
       auto cobciter = t.obc_map.find(cop.source);
@@ -393,8 +401,9 @@ void ECTransaction::Generate::process_init() {
         coll_t(spg_t(pgid, shard)),
         ghobject_t(oid, ghobject_t::NO_GEN, shard));
     }
-    if (plan.hinfo && plan.shinfo)
+    if (plan.hinfo && plan.shinfo) {
       plan.hinfo->update_to(*plan.shinfo);
+    }
     if (obc) {
       auto cobciter = t.obc_map.find(rop.source);
       ceph_assert(cobciter == t.obc_map.end());
@@ -455,6 +464,13 @@ ECTransaction::Generate::Generate(PGTransaction &t,
     plan(plan),
     read_sem(&sinfo),
     to_write(&sinfo) {
+
+  vector<int> old_transaction_counts(sinfo.get_k_plus_m());
+
+  for (auto &&[shard, t] : transactions) {
+    old_transaction_counts[int(shard)] = t.get_num_ops();
+  }
+
   auto obiter = t.obc_map.find(oid);
   if (obiter != t.obc_map.end()) {
     obc = obiter->second;
@@ -508,6 +524,7 @@ ECTransaction::Generate::Generate(PGTransaction &t,
   ceph_assert(!(op.clear_omap) && !(op.omap_header) && op.omap_updates.empty());
 
   if (op.alloc_hint) {
+    all_shards_written();
     alloc_hint(op, transactions, pgid, oid, sinfo);
   }
 
@@ -520,7 +537,7 @@ ECTransaction::Generate::Generate(PGTransaction &t,
     }
   }
   debug(oid, "to_write", to_write, dpp);
-  ldpp_dout(dpp, 20) << "generate_transactions: plan: " << plan << dendl;
+  ldpp_dout(dpp, 20) << " generate_transactions: plan: " << plan << dendl;
 
   if (op.truncate && op.truncate->first < plan.orig_size) {
     truncate();
@@ -529,15 +546,6 @@ ECTransaction::Generate::Generate(PGTransaction &t,
   overlay_writes();
   appends_and_clone_ranges();
 
-  // On a size change, we want to update OI on all shards
-  if (plan.orig_size != plan.projected_size) {
-    all_shards_written();
-  } else {
-    // All priumary shards must always be written, regardless of the write plan.
-    shards_written(sinfo.get_parity_shards());
-    shard_written(shard_id_t(0));
-  }
-
   if (!to_write.empty()) {
     encode_and_write();
   }
@@ -553,18 +561,51 @@ ECTransaction::Generate::Generate(PGTransaction &t,
     entry->mod_desc.append(ECUtil::align_page_next(plan.orig_size));
   }
 
+  if (op.is_delete()) {
+    handle_deletes();
+  }
+
+  // On a size change, we want to update OI on all shards
+  if (plan.orig_size != plan.projected_size) {
+    all_shards_written();
+  } else {
+    // All primary shards must always be written, regardless of the write plan.
+    shards_written(sinfo.get_parity_shards());
+    shard_written(shard_id_t(0));
+  }
+
   written_and_present_shards();
 
   if (!op.attr_updates.empty()) {
     attr_updates();
   }
 
-  if (entry && !xattr_rollback.empty()) {
+  if (!entry) {
+    return;
+  }
+
+  if (!xattr_rollback.empty()) {
     entry->mod_desc.setattrs(xattr_rollback);
   }
 
-  if (!op.is_delete()) {
-    handle_deletes();
+  /* It is essential for rollback that every shard with a non-empty transaction
+   * is recorded in written_shards. In fact, written_shards contains every
+   * shard that would have a transaction if it were present. This is why we do
+   * not simply construct written_shards here.
+   */
+  for (auto &&[shard, t] : transactions) {
+    if (t.get_num_ops() > old_transaction_counts[int(shard)] &&
+        !entry->is_written_shard(shard)) {
+      ldpp_dout(dpp, 20) << __func__ << " Transaction for shard " << shard << ": ";
+      Formatter *f = Formatter::create("json");
+      f->open_object_section("t");
+      t.dump(f);
+      f->close_section();
+      f->flush(*_dout);
+      delete f;
+      *_dout << dendl;
+      ceph_abort_msg("Written shard not set, but transaction ops present.");
+    }
   }
 }
 
@@ -768,6 +809,7 @@ void ECTransaction::Generate::written_and_present_shards() {
     if (entry->written_shards.size() == sinfo.get_k_plus_m()) {
       // More efficient to encode an empty set for all shards
       entry->written_shards.clear();
+      written_shards_final = true;
     }
     // Calculate set of present shards
     for (auto &&[shard, t]: transactions) {
@@ -822,17 +864,6 @@ void ECTransaction::Generate::written_and_present_shards() {
                          << " written=" << entry->written_shards
                          << " shard_versions=" << oi.shard_versions << dendl;
     }
-
-    /* It is essential for rollback that every shard with a non-empty transaction
-     * is recorded in written_shards. In fact written shards contains every
-     * shard that would have a transaction if it were present. This is why we do
-     * not simply construct written shards here.
-     */
-    for (auto &&[shard, t] : transactions) {
-      if (entry && (!t.empty() || !sinfo.is_nonprimary_shard(shard))) {
-        ceph_assert(entry->is_written_shard(shard));
-      }
-    }
   }
 }
 
@@ -842,12 +873,13 @@ void ECTransaction::Generate::attr_updates() {
     if (update) {
       to_set[attr] = *(update);
     } else {
-      all_shards_written();
       for (auto &&[shard, t]: transactions) {
-        t.rmattr(
-          coll_t(spg_t(pgid, shard)),
-          ghobject_t(oid, ghobject_t::NO_GEN, shard),
-          attr);
+        if (!sinfo.is_nonprimary_shard(shard)) {
+          t.rmattr(
+            coll_t(spg_t(pgid, shard)),
+            ghobject_t(oid, ghobject_t::NO_GEN, shard),
+            attr);
+        }
       }
     }
     if (obc) {
diff --git a/src/osd/ECTransaction.h b/src/osd/ECTransaction.h
index 2393201db739..b2f6157d17e5 100644
--- a/src/osd/ECTransaction.h
+++ b/src/osd/ECTransaction.h
@@ -99,6 +99,7 @@ class Generate {
   std::vector<std::pair<uint64_t, uint64_t>> rollback_extents;
   std::vector<shard_id_t> rollback_shards;
   uint32_t fadvise_flags = 0;
+  bool written_shards_final{false};
 
   void all_shards_written();
   void shard_written(const shard_id_t shard);
diff --git a/src/test/osd/test_ec_transaction.cc b/src/test/osd/test_ec_transaction.cc
index 5c662279a5dd..1f770cfac33a 100644
--- a/src/test/osd/test_ec_transaction.cc
+++ b/src/test/osd/test_ec_transaction.cc
@@ -368,4 +368,36 @@ TEST(ectransaction, test_overwrite_with_missing)
   ref_write[shard_id_t(0)].insert(0, 8192);
   ref_write[shard_id_t(1)].insert(0, 8192);
   ASSERT_EQ(ref_write, plan.will_write);
-}
\ No newline at end of file
+}
+
+TEST(ectransaction, truncate_to_bigger_without_write)
+{
+  hobject_t h;
+  PGTransaction::ObjectOperation op;
+
+  op.truncate = std::pair(8192, 8192);
+
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 2, 8192, &pool);
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 4);
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
+    sinfo,
+    shards,
+    shards,
+    false,
+    4096,
+    std::nullopt,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  ASSERT_FALSE(plan.to_read);
+  ASSERT_EQ(0u, plan.will_write.shard_count());
+}
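Reviewer note (not part of the patch; text after the diff is ignored by git apply/am): the policing loop added at the end of ECTransaction::Generate::Generate() enforces a single invariant. Any shard whose transaction gained ops while generate ran must be recorded in the log entry's written_shards, and, via the new written_shards_final flag, written_shards may not be amended after written_and_present_shards() has canonicalized the set. The sketch below is a minimal standalone illustration of that invariant, not Ceph code: plain ints, std::map and std::set stand in for shard_id_t, per-shard ceph::os::Transaction op counts, and pg_log_entry_t::written_shards.

// Hypothetical, simplified model of the written-shards policing above.
#include <cassert>
#include <map>
#include <set>

int main() {
  std::map<int, int> ops_before = {{0, 1}, {1, 1}, {2, 1}}; // sampled on entry to generate
  std::map<int, int> ops_after  = {{0, 3}, {1, 1}, {2, 2}}; // sampled after generate ran
  std::set<int> written_shards  = {0, 2};                   // recorded via shard_written()

  // The invariant: every shard whose transaction grew during generate must
  // also have been recorded as written; otherwise rollback cannot work.
  for (auto [shard, ops] : ops_after) {
    if (ops > ops_before[shard]) {
      assert(written_shards.count(shard) &&
             "written shard not set, but transaction ops present");
    }
  }
  return 0;
}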