osd: Fix written shards policing for multiple loops through generate.
author    Alex Ainscow <aainscow@uk.ibm.com>
          Wed, 16 Apr 2025 06:44:25 +0000 (07:44 +0100)
committer Alex Ainscow <aainscow@uk.ibm.com>
          Tue, 1 Jul 2025 12:03:29 +0000 (13:03 +0100)
Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
src/osd/ECTransaction.cc
src/osd/ECTransaction.h
src/test/osd/test_ec_transaction.cc
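
The core of this commit is a policing check in ECTransaction::Generate: per-shard op counts are snapshotted when the generator is entered, and once the written-shard set is final, any shard whose transaction gained ops without being recorded as written causes an abort. Below is a minimal, self-contained sketch of that pattern; the types (FakeTransaction, plain int shard ids, a std::set for written shards) are illustrative stand-ins, not the real Ceph classes.

    // Sketch only: models the "written shards policing" introduced in this commit.
    #include <cassert>
    #include <map>
    #include <set>
    #include <vector>

    struct FakeTransaction {
      unsigned num_ops = 0;
      unsigned get_num_ops() const { return num_ops; }
    };

    int main() {
      std::map<int, FakeTransaction> transactions = {{0, {}}, {1, {}}, {2, {}}, {3, {}}};
      std::set<int> written_shards;

      // Snapshot op counts before generating, because Generate may run more than
      // one loop against the same per-shard transactions.
      std::vector<unsigned> old_counts(transactions.size());
      for (auto &[shard, t] : transactions) {
        old_counts[shard] = t.get_num_ops();
      }

      // ... write generation would append ops and record written shards here ...
      transactions[0].num_ops++;
      written_shards.insert(0);

      // Policing: any shard whose transaction grew during this pass must have
      // been recorded as written, otherwise rollback could not cover it.
      for (auto &[shard, t] : transactions) {
        if (t.get_num_ops() > old_counts[shard]) {
          assert(written_shards.count(shard) && "written shard not set, but ops queued");
        }
      }
      return 0;
    }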

diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc
index c094738307e530e32deee8c47c2a6316f5d329b3..6a2d3ef32c90770552994189c943cd6b1f1e648f 100644
@@ -51,7 +51,9 @@ void ECTransaction::Generate::encode_and_write() {
   // If partial writes are not supported, pad out to_write to a full stripe.
   if (!sinfo.supports_partial_writes()) {
     for (auto &&[shard, eset]: plan.will_write) {
-      if (sinfo.get_raw_shard(shard) >= sinfo.get_k()) continue;
+      if (sinfo.get_raw_shard(shard) >= sinfo.get_k()) {
+        continue;
+      }
 
       for (auto [off, len]: eset) {
         to_write.zero_pad(shard, off, len);
@@ -176,87 +178,89 @@ ECTransaction::WritePlanObj::WritePlanObj(
   write_superset.align(CEPH_PAGE_SIZE);
 
   shard_id_set writable_parity_shards = shard_id_set::intersection(sinfo.get_parity_shards(), writable_shards);
-  for (auto shard : writable_parity_shards) {
-    will_write[shard].insert(write_superset);
-  }
-
-  ECUtil::shard_extent_set_t reads(sinfo.get_k_plus_m());
-  ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
-
-  if (!sinfo.supports_partial_writes()) {
-    for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) {
+  if (write_superset.size() > 0) {
+    for (auto shard : writable_parity_shards) {
       will_write[shard].insert(write_superset);
     }
-    will_write.align(sinfo.get_chunk_size());
-    reads = will_write;
-    sinfo.ro_size_to_read_mask(sinfo.ro_offset_to_next_stripe_ro_offset(orig_size), read_mask);
-    reads.intersection_of(read_mask);
-    do_parity_delta_write = false;
-  } else {
-    will_write.align(CEPH_PAGE_SIZE);
-    ECUtil::shard_extent_set_t pdw_reads(will_write);
 
-    sinfo.ro_size_to_read_mask(ECUtil::align_page_next(orig_size), read_mask);
+    ECUtil::shard_extent_set_t reads(sinfo.get_k_plus_m());
+    ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
 
-    /* Next we need to add the reads required for a conventional write */
-    for (auto shard : sinfo.get_data_shards()) {
-      reads[shard].insert(write_superset);
-      if (will_write.contains(shard)) {
-        reads[shard].subtract(will_write.at(shard));
+    if (!sinfo.supports_partial_writes()) {
+      for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) {
+        will_write[shard].insert(write_superset);
       }
-      if (reads[shard].empty()) {
-        reads.erase(shard);
+      will_write.align(sinfo.get_chunk_size());
+      reads = will_write;
+      sinfo.ro_size_to_read_mask(sinfo.ro_offset_to_next_stripe_ro_offset(orig_size), read_mask);
+      reads.intersection_of(read_mask);
+      do_parity_delta_write = false;
+    } else {
+      will_write.align(CEPH_PAGE_SIZE);
+      ECUtil::shard_extent_set_t pdw_reads(will_write);
+
+      sinfo.ro_size_to_read_mask(ECUtil::align_page_next(orig_size), read_mask);
+
+      /* Next we need to add the reads required for a conventional write */
+      for (auto shard : sinfo.get_data_shards()) {
+        reads[shard].insert(write_superset);
+        if (will_write.contains(shard)) {
+          reads[shard].subtract(will_write.at(shard));
+        }
+        if (reads[shard].empty()) {
+          reads.erase(shard);
+        }
       }
-    }
 
-    /* We now need to add in the partial page ro writes. This is not particularly
-     * efficient as the are many divs in here, but non-4k aligned writes are
-     * not very efficient anyway
-     */
-    for (auto &&[off, len] : partial_page_ro_writes) {
-      sinfo.ro_range_to_shard_extent_set(
-        off, len, reads);
-    }
+      /* We now need to add in the partial page ro writes. This is not particularly
+       * efficient as there are many divs in here, but non-4k-aligned writes are
+       * not very efficient anyway.
+       */
+      for (auto &&[off, len] : partial_page_ro_writes) {
+        sinfo.ro_range_to_shard_extent_set(
+          off, len, reads);
+      }
 
-    reads.intersection_of(read_mask);
+      reads.intersection_of(read_mask);
 
-    /* Here we decide if we want to do a conventional write or a parity delta write. */
-    if (sinfo.supports_parity_delta_writes() && !object_in_cache &&
-        orig_size == projected_size && !reads.empty()) {
+      /* Here we decide if we want to do a conventional write or a parity delta write. */
+      if (sinfo.supports_parity_delta_writes() && !object_in_cache &&
+          orig_size == projected_size && !reads.empty()) {
 
-      shard_id_set read_shards = reads.get_shard_id_set();
-      shard_id_set pdw_read_shards = pdw_reads.get_shard_id_set();
+        shard_id_set read_shards = reads.get_shard_id_set();
+        shard_id_set pdw_read_shards = pdw_reads.get_shard_id_set();
 
-      if (pdw_write_mode != 0) {
-        do_parity_delta_write = (pdw_write_mode == 2);
-      } else if (!shard_id_set::difference(pdw_read_shards, readable_shards).empty()) {
-        // Some kind of reconstruct would be needed for PDW, so don't bother.
-        do_parity_delta_write = false;
-      } else if (!shard_id_set::difference(read_shards, readable_shards).empty()) {
-        // Some kind of reconstruct is needed for conventional, but NOT for PDW!
-        do_parity_delta_write = true;
-      } else {
-        /* Everything we need for both is available, opt for which ever is less
-         * reads.
-         */
-        do_parity_delta_write = pdw_read_shards.size() < read_shards.size();
-      }
+        if (pdw_write_mode != 0) {
+          do_parity_delta_write = (pdw_write_mode == 2);
+        } else if (!shard_id_set::difference(pdw_read_shards, readable_shards).empty()) {
+          // Some kind of reconstruct would be needed for PDW, so don't bother.
+          do_parity_delta_write = false;
+        } else if (!shard_id_set::difference(read_shards, readable_shards).empty()) {
+          // Some kind of reconstruct is needed for conventional, but NOT for PDW!
+          do_parity_delta_write = true;
+        } else {
+          /* Everything we need for both is available; opt for whichever needs
+           * fewer reads.
+           */
+          do_parity_delta_write = pdw_read_shards.size() < read_shards.size();
+        }
 
-      if (do_parity_delta_write) {
-        to_read = std::move(pdw_reads);
-        reads.clear(); // So we don't stash it at the end.
-      }
-    }
+        if (do_parity_delta_write) {
+          to_read = std::move(pdw_reads);
+          reads.clear(); // So we don't stash it at the end.
+        }
+      }
 
-    /* NOTE: We intentionally leave un-writable shards in the write plan.  As
-     * it is actually less efficient to take them out:- PDWs still need to
-     * compute the deltas and conventional writes still need to calcualte the
-     * parity. The transaction will be dropped by generate_transactions.
-     */
-  }
+      /* NOTE: We intentionally leave un-writable shards in the write plan, as
+       * it is actually less efficient to take them out: PDWs still need to
+       * compute the deltas and conventional writes still need to calculate the
+       * parity. The transaction will be dropped by generate_transactions.
+       */
+    }
 
-  if (!reads.empty()) {
-    to_read = std::move(reads);
+    if (!reads.empty()) {
+      to_read = std::move(reads);
+    }
   }
 
   /* validate post conditions:
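
The hunk above also contains the decision between a parity delta write (PDW) and a conventional write. A standalone sketch of that decision chain follows, using std::set<int> in place of shard_id_set; the function and helper names are illustrative, not Ceph APIs.

    #include <algorithm>
    #include <cassert>
    #include <set>

    // True if some shard we would need to read is not currently readable.
    static bool needs_unreadable(const std::set<int> &wanted, const std::set<int> &readable) {
      return !std::includes(readable.begin(), readable.end(), wanted.begin(), wanted.end());
    }

    bool choose_parity_delta_write(int pdw_write_mode,
                                   const std::set<int> &conv_read_shards,
                                   const std::set<int> &pdw_read_shards,
                                   const std::set<int> &readable_shards) {
      if (pdw_write_mode != 0) {
        return pdw_write_mode == 2;  // explicit override
      }
      if (needs_unreadable(pdw_read_shards, readable_shards)) {
        return false;                // PDW would need a reconstruct, don't bother
      }
      if (needs_unreadable(conv_read_shards, readable_shards)) {
        return true;                 // only the conventional path would need one
      }
      // Both are possible; pick whichever reads fewer shards.
      return pdw_read_shards.size() < conv_read_shards.size();
    }

    int main() {
      assert(choose_parity_delta_write(0, {0, 1, 2}, {1, 2}, {0, 1, 2, 3}));
      assert(!choose_parity_delta_write(0, {0}, {1, 2}, {0, 1, 2, 3}));
      return 0;
    }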
@@ -267,18 +271,21 @@ ECTransaction::WritePlanObj::WritePlanObj(
 }
 
 void ECTransaction::Generate::all_shards_written() {
+  ceph_assert(!written_shards_final);
   if (entry) {
     entry->written_shards.insert_range(shard_id_t(0), sinfo.get_k_plus_m());
   }
 }
 
 void ECTransaction::Generate::shard_written(const shard_id_t shard) {
+  ceph_assert(!written_shards_final);
   if (entry) {
     entry->written_shards.insert(shard);
   }
 }
 
 void ECTransaction::Generate::shards_written(const shard_id_set &shards) {
+  ceph_assert(!written_shards_final);
   if (entry) {
     entry->written_shards.insert(shards);
   }
@@ -374,8 +381,9 @@ void ECTransaction::Generate::process_init() {
           ghobject_t(oid, ghobject_t::NO_GEN, shard));
       }
 
-      if (plan.hinfo && plan.shinfo)
+      if (plan.hinfo && plan.shinfo) {
         plan.hinfo->update_to(*plan.shinfo);
+      }
 
       if (obc) {
         auto cobciter = t.obc_map.find(cop.source);
@@ -393,8 +401,9 @@ void ECTransaction::Generate::process_init() {
           coll_t(spg_t(pgid, shard)),
           ghobject_t(oid, ghobject_t::NO_GEN, shard));
       }
-      if (plan.hinfo && plan.shinfo)
+      if (plan.hinfo && plan.shinfo) {
         plan.hinfo->update_to(*plan.shinfo);
+      }
       if (obc) {
         auto cobciter = t.obc_map.find(rop.source);
         ceph_assert(cobciter == t.obc_map.end());
@@ -455,6 +464,13 @@ ECTransaction::Generate::Generate(PGTransaction &t,
     plan(plan),
     read_sem(&sinfo),
     to_write(&sinfo) {
+
+  vector<unsigned> old_transaction_counts(sinfo.get_k_plus_m());
+
+  for (auto &&[shard, t] : transactions) {
+    old_transaction_counts[int(shard)] = t.get_num_ops();
+  }
+
   auto obiter = t.obc_map.find(oid);
   if (obiter != t.obc_map.end()) {
     obc = obiter->second;
@@ -508,6 +524,7 @@ ECTransaction::Generate::Generate(PGTransaction &t,
   ceph_assert(!(op.clear_omap) && !(op.omap_header) && op.omap_updates.empty());
 
   if (op.alloc_hint) {
+    all_shards_written();
     alloc_hint(op, transactions, pgid, oid, sinfo);
   }
 
@@ -520,7 +537,7 @@ ECTransaction::Generate::Generate(PGTransaction &t,
     }
   }
   debug(oid, "to_write", to_write, dpp);
-  ldpp_dout(dpp, 20) << "generate_transactions: plan: " << plan << dendl;
+  ldpp_dout(dpp, 20) << " generate_transactions: plan: " << plan << dendl;
 
   if (op.truncate && op.truncate->first < plan.orig_size) {
     truncate();
@@ -529,15 +546,6 @@ ECTransaction::Generate::Generate(PGTransaction &t,
   overlay_writes();
   appends_and_clone_ranges();
 
-  // On a size change, we want to update OI on all shards
-  if (plan.orig_size != plan.projected_size) {
-    all_shards_written();
-  } else {
-    // All priumary shards must always be written, regardless of the write plan.
-    shards_written(sinfo.get_parity_shards());
-    shard_written(shard_id_t(0));
-  }
-
   if (!to_write.empty()) {
     encode_and_write();
   }
@@ -553,18 +561,51 @@ ECTransaction::Generate::Generate(PGTransaction &t,
     entry->mod_desc.append(ECUtil::align_page_next(plan.orig_size));
   }
 
+  if (op.is_delete()) {
+    handle_deletes();
+  }
+
+  // On a size change, we want to update OI on all shards
+  if (plan.orig_size != plan.projected_size) {
+    all_shards_written();
+  } else {
+    // All primary shards must always be written, regardless of the write plan.
+    shards_written(sinfo.get_parity_shards());
+    shard_written(shard_id_t(0));
+  }
+
   written_and_present_shards();
 
   if (!op.attr_updates.empty()) {
     attr_updates();
   }
 
-  if (entry && !xattr_rollback.empty()) {
+  if (!entry) {
+    return;
+  }
+
+  if (!xattr_rollback.empty()) {
     entry->mod_desc.setattrs(xattr_rollback);
   }
 
-  if (!op.is_delete()) {
-    handle_deletes();
+  /* It is essential for rollback that every shard with a non-empty transaction
+   * is recorded in written_shards. In fact, written_shards contains every
+   * shard that would have a transaction if that shard were present. This is
+   * why we do not simply construct written_shards here.
+   */
+  for (auto &&[shard, t] : transactions) {
+    if (t.get_num_ops() > old_transaction_counts[int(shard)] &&
+        !entry->is_written_shard(shard)) {
+      ldpp_dout(dpp, 20) << __func__ << " Transaction for shard " << shard << ": ";
+      Formatter *f = Formatter::create("json");
+      f->open_object_section("t");
+      t.dump(f);
+      f->close_section();
+      f->flush(*_dout);
+      delete f;
+      *_dout << dendl;
+      ceph_abort_msg("Written shard not set, but messages present. ");
+    }
   }
 }
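
The block relocated in the constructor above determines which shards must always be marked written: a size change updates object_info on every shard, otherwise only the parity shards plus shard 0 are unconditionally recorded. A small sketch of that rule, assuming for illustration that shards k..k+m-1 are the parity shards (the real code uses sinfo.get_parity_shards()):

    #include <cassert>
    #include <cstdint>
    #include <set>

    std::set<int> minimum_written_shards(uint64_t orig_size, uint64_t projected_size,
                                         int k, int m) {
      std::set<int> shards;
      if (orig_size != projected_size) {
        for (int s = 0; s < k + m; ++s) shards.insert(s);  // OI changes everywhere
      } else {
        for (int s = k; s < k + m; ++s) shards.insert(s);  // parity shards
        shards.insert(0);                                   // primary data shard
      }
      return shards;
    }

    int main() {
      // k=2, m=2, no size change: parity shards {2,3} plus shard 0.
      assert((minimum_written_shards(4096, 4096, 2, 2) == std::set<int>{0, 2, 3}));
      // Size change: all four shards must be marked written.
      assert(minimum_written_shards(4096, 8192, 2, 2).size() == 4u);
      return 0;
    }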
 
@@ -768,6 +809,7 @@ void ECTransaction::Generate::written_and_present_shards() {
     if (entry->written_shards.size() == sinfo.get_k_plus_m()) {
       // More efficient to encode an empty set for all shards
       entry->written_shards.clear();
+      written_shards_final = true;
     }
     // Calculate set of present shards
     for (auto &&[shard, t]: transactions) {
@@ -822,17 +864,6 @@ void ECTransaction::Generate::written_and_present_shards() {
                          << " written=" << entry->written_shards
                          << " shard_versions=" << oi.shard_versions << dendl;
     }
-
-    /* It is essential for rollback that every shard with a non-empty transaction
-     * is recorded in written_shards. In fact written shards contains every
-     * shard that would have a transaction if it were present. This is why we do
-     * not simply construct written shards here.
-     */
-    for (auto &&[shard, t] : transactions) {
-      if (entry && (!t.empty() || !sinfo.is_nonprimary_shard(shard))) {
-        ceph_assert(entry->is_written_shard(shard));
-      }
-    }
   }
 }
 
@@ -842,12 +873,13 @@ void ECTransaction::Generate::attr_updates() {
     if (update) {
       to_set[attr] = *(update);
     } else {
-      all_shards_written();
       for (auto &&[shard, t]: transactions) {
-        t.rmattr(
-          coll_t(spg_t(pgid, shard)),
-          ghobject_t(oid, ghobject_t::NO_GEN, shard),
-          attr);
+        if (!sinfo.is_nonprimary_shard(shard)) {
+          t.rmattr(
+            coll_t(spg_t(pgid, shard)),
+            ghobject_t(oid, ghobject_t::NO_GEN, shard),
+            attr);
+        }
       }
     }
     if (obc) {
diff --git a/src/osd/ECTransaction.h b/src/osd/ECTransaction.h
index 2393201db7399d20e0d57b820476ed3f233aa979..b2f6157d17e51bc18cd1fa0e7e70692e03f2a556 100644
@@ -99,6 +99,7 @@ class Generate {
   std::vector<std::pair<uint64_t, uint64_t>> rollback_extents;
   std::vector<shard_id_set> rollback_shards;
   uint32_t fadvise_flags = 0;
+  bool written_shards_final{false};
 
   void all_shards_written();
   void shard_written(const shard_id_t shard);
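
The new written_shards_final flag makes the written-shard set effectively write-once: it is set when written_and_present_shards() collapses a full set to the empty "all shards" encoding, and every mutator now asserts it is still clear. A simplified stand-in class showing that guard pattern (not the actual Generate class):

    #include <cassert>
    #include <set>

    class WrittenShardTracker {
      std::set<int> written;
      bool finalized = false;

    public:
      void shard_written(int shard) {
        assert(!finalized);  // mirrors ceph_assert(!written_shards_final)
        written.insert(shard);
      }
      void finalize() { finalized = true; }
      bool is_written(int shard) const { return written.count(shard) > 0; }
    };

    int main() {
      WrittenShardTracker tracker;
      tracker.shard_written(0);
      tracker.finalize();
      // tracker.shard_written(1);  // would assert: the written set is final
      assert(tracker.is_written(0));
      return 0;
    }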
diff --git a/src/test/osd/test_ec_transaction.cc b/src/test/osd/test_ec_transaction.cc
index 5c662279a5dd9990e1195aaa5db1c0b51520023e..1f770cfac33a522db797acb0107c714e1c8ff9ca 100644
@@ -368,4 +368,36 @@ TEST(ectransaction, test_overwrite_with_missing)
   ref_write[shard_id_t(0)].insert(0, 8192);
   ref_write[shard_id_t(1)].insert(0, 8192);
   ASSERT_EQ(ref_write, plan.will_write);
-}
\ No newline at end of file
+}
+
+TEST(ectransaction, truncate_to_bigger_without_write)
+{
+  hobject_t h;
+  PGTransaction::ObjectOperation op;
+
+  op.truncate = std::pair(8192, 8192);
+
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 2, 8192, &pool);
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 4);
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
+    sinfo,
+    shards,
+    shards,
+    false,
+    4096,
+    std::nullopt,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  ASSERT_FALSE(plan.to_read);
+  ASSERT_EQ(0u, plan.will_write.shard_count());
+}