]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Introduce functions required for EC OMAP support
authorMatty Williams <Matty.Williams@ibm.com>
Fri, 12 Dec 2025 11:21:10 +0000 (11:21 +0000)
committerMatty Williams <Matty.Williams@ibm.com>
Tue, 9 Jun 2026 08:46:07 +0000 (09:46 +0100)
Introduced a "supports_omap" pool flag which is always enabled for Replicated pools and currently always disabled for EC pools.
Introduced wrappers around omap read operations in PGBackend to include updates from the journal in EC pools with optimisations enabled.
Introduced a function for encoding an EC_OMAP operation in the ObjectModDesc::Visitor class and a function for committing an operation in the Trimmer struct.

Signed-off-by: Matty Williams <Matty.Williams@ibm.com>
src/mon/MonCommands.h
src/mon/OSDMonitor.cc
src/osd/ECBackend.cc
src/osd/ECBackend.h
src/osd/ECCommon.h
src/osd/ECSwitch.h
src/osd/PGBackend.cc
src/osd/PGBackend.h
src/osd/ReplicatedBackend.h
src/osd/osd_types.h
src/test/osd/MockPGBackend.h

index aae1c46b79c8f75d303e4e755cca386edf49ca08..3c7f13db567b37a6f751f740ddcd292152f2d48b 100644 (file)
@@ -1170,11 +1170,136 @@ COMMAND("osd pool rename "
        "rename <srcpool> to <destpool>", "osd", "rw")
 COMMAND("osd pool get "
        "name=pool,type=CephPoolname "
-       "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay|allow_ec_optimizations|ec_data_shard_count|ec_coding_shard_count",
+       "name=var,type=CephChoices,strings=all"
+          "|allow_ec_optimizations"
+          "|allow_ec_overwrites"
+          "|bulk"
+          "|cache_min_evict_age"
+          "|cache_min_flush_age"
+          "|cache_target_dirty_high_ratio"
+          "|cache_target_dirty_ratio"
+          "|cache_target_full_ratio"
+          "|compression_algorithm"
+          "|compression_max_blob_size"
+          "|compression_min_blob_size"
+          "|compression_mode"
+          "|compression_required_ratio"
+          "|crush_rule"
+          "|csum_max_block"
+          "|csum_min_block"
+          "|csum_type"
+          "|deep_scrub_interval"
+          "|dedup_cdc_chunk_size"
+          "|dedup_chunk_algorithm"
+          "|dedup_tier"
+          "|ec_coding_shard_count"
+          "|ec_data_shard_count"
+          "|eio"
+          "|erasure_code_profile"
+          "|fast_read"
+          "|fingerprint_algorithm"
+          "|hashpspool"
+          "|hit_set_count"
+          "|hit_set_fpp"
+          "|hit_set_grade_decay_rate"
+          "|hit_set_period"
+          "|hit_set_search_last_n"
+          "|hit_set_type"
+          "|min_read_recency_for_promote"
+          "|min_size"
+          "|min_write_recency_for_promote"
+          "|nodeep-scrub"
+          "|nodelete"
+          "|nopgchange"
+          "|noscrub"
+          "|nosizechange"
+          "|pct_update_delay"
+          "|pg_autoscale_bias"
+          "|pg_autoscale_mode"
+          "|pg_num"
+          "|pg_num_max"
+          "|pg_num_min"
+          "|pgp_num"
+          "|read_ratio"
+          "|recovery_op_priority"
+          "|recovery_priority"
+          "|scrub_max_interval"
+          "|scrub_min_interval"
+          "|scrub_priority"
+          "|size"
+          "|supports_omap"
+          "|target_max_bytes"
+          "|target_max_objects"
+          "|target_size_bytes"
+          "|target_size_ratio"
+          "|use_gmt_hitset"
+          "|write_fadvise_dontneed",
        "get pool parameter <var>", "osd", "r")
 COMMAND("osd pool set "
        "name=pool,type=CephPoolname "
-       "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay|allow_ec_optimizations|set_pool_flags|unset_pool_flags "
+       "name=var,type=CephChoices,strings=allow_ec_optimizations"
+          "|allow_ec_overwrites"
+          "|bulk"
+          "|cache_min_evict_age"
+          "|cache_min_flush_age"
+          "|cache_target_dirty_high_ratio"
+          "|cache_target_dirty_ratio"
+          "|cache_target_full_ratio"
+          "|compression_algorithm"
+          "|compression_max_blob_size"
+          "|compression_min_blob_size"
+          "|compression_mode"
+          "|compression_required_ratio"
+          "|crush_rule"
+          "|csum_max_block"
+          "|csum_min_block"
+          "|csum_type"
+          "|deep_scrub_interval"
+          "|dedup_cdc_chunk_size"
+          "|dedup_chunk_algorithm"
+          "|dedup_tier"
+          "|eio"
+          "|fast_read"
+          "|fingerprint_algorithm"
+          "|hashpspool"
+          "|hit_set_count"
+          "|hit_set_fpp"
+          "|hit_set_grade_decay_rate"
+          "|hit_set_period"
+          "|hit_set_search_last_n"
+          "|hit_set_type"
+          "|min_read_recency_for_promote"
+          "|min_size"
+          "|min_write_recency_for_promote"
+          "|nodeep-scrub"
+          "|nodelete"
+          "|nopgchange"
+          "|noscrub"
+          "|nosizechange"
+          "|pct_update_delay"
+          "|pg_autoscale_bias"
+          "|pg_autoscale_mode"
+          "|pg_num"
+          "|pg_num_max"
+          "|pg_num_min"
+          "|pgp_num"
+          "|pgp_num_actual"
+          "|read_ratio"
+          "|recovery_op_priority"
+          "|recovery_priority"
+          "|scrub_max_interval"
+          "|scrub_min_interval"
+          "|scrub_priority"
+          "|set_pool_flags"
+          "|size"
+          "|supports_omap"
+          "|target_max_bytes"
+          "|target_max_objects"
+          "|target_size_bytes"
+          "|target_size_ratio"
+          "|unset_pool_flags"
+          "|use_gmt_hitset"
+          "|write_fadvise_dontneed "
        "name=val,type=CephString "
        "name=yes_i_really_mean_it,type=CephBool,req=false",
        "set pool parameter <var> to <val>", "osd", "rw")
index 71b22918158916ce59921d6e04555f83e782bd6f..c27879fcd442bf6fce9c04d954106d2ff5226fa0 100644 (file)
@@ -1995,6 +1995,17 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
           maybe_enable_pool_split_ops(pending_inc.new_pools[id]);
         }
       }
+
+      // Auto-enable omap support for replicated pools
+      for (auto& [pool_id, pool] : tmp.get_pools()) {
+        if (!pool.has_flag(pg_pool_t::FLAG_OMAP) && pool.is_replicated()) {
+          pg_pool_t p = pool;
+          p.flags |= pg_pool_t::FLAG_OMAP;
+          pending_inc.new_pools[pool_id] = p;
+          dout(10) << __func__ << " replicated pool " << pool_id
+                   << " has OMAP support auto-enabled" << dendl;
+        }
+      }
     }
   }
 
@@ -5471,7 +5482,8 @@ namespace {
     PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
     PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM, 
     DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX, READ_RATIO,
-    EC_OPTIMIZATIONS, EC_DATA_SHARD_COUNT, EC_CODING_SHARD_COUNT };
+    EC_OPTIMIZATIONS, EC_DATA_SHARD_COUNT, EC_CODING_SHARD_COUNT,
+    SUPPORTS_OMAP };
 
   std::set<osd_pool_get_choices>
     subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
@@ -6280,6 +6292,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       {"allow_ec_optimizations", EC_OPTIMIZATIONS},
       {"ec_data_shard_count", EC_DATA_SHARD_COUNT},
       {"ec_coding_shard_count", EC_CODING_SHARD_COUNT},
+      {"supports_omap", SUPPORTS_OMAP},
     };
 
     typedef std::set<osd_pool_get_choices> choices_set_t;
@@ -6552,6 +6565,9 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
             f->dump_unsigned("ec_coding_shard_count",
                              p->ec_coding_shard_count.value_or(0));
          break;
+          case SUPPORTS_OMAP:
+            f->dump_bool("supports_omap", p->supports_omap());
+            break;
        }
       }
       f->close_section();
@@ -6738,6 +6754,10 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
                << static_cast<unsigned int>(p->ec_coding_shard_count.value_or(0))
                << "\n";
             break;
+          case SUPPORTS_OMAP:
+            ss << "supports_omap: " <<
+              (p->supports_omap() ? "true" : "false") << "\n";
+            break;
        }
        rdata.append(ss.str());
        ss.str("");
@@ -8331,6 +8351,10 @@ int OSDMonitor::prepare_new_pool(string& name,
   if (crimson) {
     pi->set_flag(pg_pool_t::FLAG_CRIMSON);
   }
+  if (pool_type == pg_pool_t::TYPE_REPLICATED
+      && osdmap.require_osd_release >= ceph_release_t::umbrella) {
+    pi->set_flag(pg_pool_t::FLAG_OMAP);
+  }
 
   pi->size = size;
   pi->min_size = min_size;
@@ -9125,6 +9149,25 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
     } else {
       p.unset_flag(n);
     }
+  } else if (var == "supports_omap") {
+    if ((val == "true") && osdmap.require_osd_release < ceph_release_t::umbrella) {
+      ss << "supports_omap cannot be enabled until require_osd_release is set to umbrella or later";
+      return -EPERM;
+    }
+    // Disabling omap support will leave omap data in RocksDB which cannot be cleaned up
+    // It will also break any services that depend on this pool to store metadata
+    if ((val == "false") && (p.has_flag(pg_pool_t::FLAG_OMAP))) {
+      ss << "supports_omap cannot be disabled once enabled";
+      return -EINVAL;
+    }
+    // This restriction is temporary until omap support is well tested in Fast EC pools
+    if ((val == "true") && p.is_erasure()) {
+      ss << "supports_omap cannot be enabled in ec pools";
+      return -EINVAL;
+    }
+    if (val == "true") {
+      p.flags |= pg_pool_t::FLAG_OMAP;
+    }
   } else if (var == "target_max_objects") {
     if (interr.length()) {
       ss << "error parsing int '" << val << "': " << interr;
index 3d7725c4bf9967df4971d5356ee9d942b49b4fae..ea9b5511f65dd4869cdbf9cb3fabf2ea1a471307 100644 (file)
@@ -1408,3 +1408,227 @@ int ECBackend::be_deep_scrub(
   o.omap_digest_present = true;
   return 0;
 }
+
+bool ECBackend::remove_ec_omap_journal_entry(const hobject_t &hoid, const ECOmapJournalEntry &entry) {
+  return ec_omap_journal.remove_entry(hoid, entry);
+}
+
+std::pair<gen_t, bool> ECBackend::omap_get_generation(const hobject_t& hoid) {
+  return ec_omap_journal.get_generation(hoid);
+}
+
+void ECBackend::omap_trim_delete_from_journal(const hobject_t &hoid, const version_t version) {
+  return ec_omap_journal.trim_delete(hoid, version);
+}
+
+int ECBackend::omap_iterate (
+  ObjectStore::CollectionHandle &c_, ///< [in] collection
+  const ghobject_t &oid, ///< [in] object
+  const ObjectStore::omap_iter_seek_t &start_from,
+  ///^ [in] where the iterator should point to at the beginning
+  const OmapIterFunction &f, ///< [in] function to call for each key/value pair
+  ObjectStore *store
+) {
+  // Updates in update_map take priority over removed_ranges
+  auto [update_map, removed_ranges] = ec_omap_journal.get_value_updates(oid.hobj);
+
+  auto journal_it = update_map.begin();
+  if (!start_from.seek_position.empty()) {
+    journal_it = update_map.lower_bound(start_from.seek_position);
+  }
+
+  auto wrapper = [&](const std::string_view store_key, const std::string_view store_value) {
+    bool found_store_key_in_journal = false;
+    
+    while (journal_it != update_map.end() && journal_it->first <= store_key) {
+      if (journal_it->first == store_key) {
+        found_store_key_in_journal = true;
+      }
+      if (journal_it->second.value.has_value()) {
+        ObjectStore::omap_iter_ret_t r = f(
+          journal_it->first,
+          std::string_view(
+            journal_it->second.value->c_str(),
+            journal_it->second.value->length()
+          )
+        );
+        if (r == ObjectStore::omap_iter_ret_t::STOP) {
+          return r;
+        }
+      }
+      ++journal_it;
+    }
+
+    if (found_store_key_in_journal) {
+      return ObjectStore::omap_iter_ret_t::NEXT;
+    }
+
+    if (should_be_removed(removed_ranges, store_key)) {
+      return ObjectStore::omap_iter_ret_t::NEXT;
+    }
+
+    return f(store_key, store_value);
+  };
+
+  if (const auto result = store->omap_iterate(c_, oid, start_from, wrapper);
+    result < 0) {
+    return result;
+  } else if (result > 0) {
+    return 1;
+  }
+
+  auto ret = ObjectStore::omap_iter_ret_t::NEXT;
+  while (journal_it != update_map.end()) {
+    if (journal_it->second.value.has_value()) {
+      ret = f(journal_it->first, std::string_view(journal_it->second.value->c_str(), journal_it->second.value->length()));
+      if (ret == ObjectStore::omap_iter_ret_t::STOP) {
+        break;
+      }
+    }
+    ++journal_it;
+  }
+
+  return ret == ObjectStore::omap_iter_ret_t::STOP ? 1 : 0;
+}
+
+int ECBackend::omap_get_values(
+  ObjectStore::CollectionHandle &c_, ///< [in] collection
+  const ghobject_t &oid,              ///< [in] object
+  const std::set<std::string> &keys,  ///< [in] keys to get
+  std::map<std::string, ceph::buffer::list> *out, ///< [out] returned key/values
+  ObjectStore *store
+) {
+  auto [update_map, removed_ranges] = ec_omap_journal.get_value_updates(oid.hobj);
+  
+  set<string> keys_still_to_get;
+  for (auto &key : keys) {
+    if (auto it = update_map.find(key);
+      it != update_map.end()) {
+      if (!it->second.value.has_value()) {
+        continue;
+      }
+      (*out)[key] = *(it->second.value);
+    } else if (should_be_removed(removed_ranges, key)) {
+      continue;
+    } else {
+      keys_still_to_get.insert(key);
+    }
+  }
+  store->omap_get_values(c_, oid, keys_still_to_get, out);
+
+  return 0;
+}
+
+int ECBackend::omap_get_header(
+  ObjectStore::CollectionHandle &c_,    ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  ceph::buffer::list *header,      ///< [out] omap header
+  const bool allow_eio, ///< [in] don't assert on eio
+  ObjectStore *store
+) {
+  std::optional<ceph::buffer::list> header_from_journal = ec_omap_journal.get_updated_header(oid.hobj);
+  if (header_from_journal) {
+    *header = *header_from_journal;
+  } else {
+    store->omap_get_header(c_, oid, header, allow_eio);
+  }
+  return 0;
+}
+
+int ECBackend::omap_get(
+  ObjectStore::CollectionHandle &c_,    ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  ceph::buffer::list *header,      ///< [out] omap header
+  std::map<std::string, ceph::buffer::list> *out, /// < [out] Key to value map
+  ObjectStore *store
+) {
+  // Update map takes priority over removed_ranges
+  auto [update_map, removed_ranges] = ec_omap_journal.get_value_updates(oid.hobj);
+  const auto updated_header = ec_omap_journal.get_updated_header(oid.hobj);
+
+  if (const int r = store->omap_get(c_, oid, header, out);
+    r < 0) {
+    return r;
+  }
+
+  // Update header if present
+  if (updated_header) {
+    *header = *updated_header;
+  }
+
+  // Remove keys in removed_ranges
+  for (auto out_it = out->begin(); out_it != out->end(); ++out_it) {
+    if (should_be_removed(removed_ranges, out_it->first)) {
+      out->erase(out_it->first);
+    }
+  }
+
+  // Apply updates in update_map
+  for (const auto &[key, val_opt] : update_map) {
+    if (val_opt.value.has_value()) {
+      (*out)[key] = *(val_opt.value);
+    } else {
+      out->erase(key);
+    }
+  }
+
+  return 0;
+}
+
+int ECBackend::omap_check_keys(
+  ObjectStore::CollectionHandle &c_,    ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  const std::set<std::string> &keys, ///< [in] Keys to check
+  std::set<std::string> *out,         ///< [out] Subset of keys defined on oid
+  ObjectStore *store
+) {
+  // Update map takes priority over removed_ranges
+  auto [update_map, removed_ranges] = ec_omap_journal.get_value_updates(oid.hobj);
+  auto updated_header = ec_omap_journal.get_updated_header(oid.hobj);
+
+  // First check keys in update_map and removed_ranges
+  set<string> keys_to_check_on_disk;
+  for (const auto &key : keys) {
+    if (auto it = update_map.find(key);
+      it != update_map.end()) {
+      if (it->second.value.has_value()) {
+        out->insert(key);
+      }
+    } else if (should_be_removed(removed_ranges, key)) {
+      continue;
+    } else {
+      keys_to_check_on_disk.insert(key);
+    }
+  }
+
+  const int r = store->omap_check_keys(c_, oid, keys_to_check_on_disk, out);
+
+  return r;
+}
+
+bool ECBackend::should_be_removed(
+  const std::map<std::string, std::optional<std::string>>& removed_ranges,
+  const std::string_view key) {
+  if (removed_ranges.empty()) {
+    return false;
+  }
+  
+  // Find range that comes after this key
+  auto it = removed_ranges.upper_bound(std::string(key));
+
+  // If all ranges start after the key, it can't be in any range
+  if (it == removed_ranges.begin()) {
+    return false;
+  }
+
+  // Go back to the previous range
+  --it;
+  // If this range contains the key, return true
+  const auto& end_opt = it->second;
+  if (!end_opt || key < *end_opt) {
+    return true;
+  }
+
+  // No ranges contain the key, return false
+  return false;
+}
index fbc74d2febc84f50c743bf7bcfd4e05f40c6c7d5..a5f013a0bb38a976700bbc41e88ff191091f35a5 100644 (file)
@@ -406,4 +406,54 @@ public:
     }
     return object_size_to_shard_size(logical_size, shard_id);
   }
+
+  bool remove_ec_omap_journal_entry(const hobject_t &hoid, const ECOmapJournalEntry &entry);
+  std::pair<gen_t, bool> omap_get_generation(const hobject_t &hoid);
+  void omap_trim_delete_from_journal(const hobject_t &hoid, const version_t version);
+
+  using OmapIterFunction = std::function<ObjectStore::omap_iter_ret_t(std::string_view, std::string_view)>;
+  int omap_iterate (
+    ObjectStore::CollectionHandle &c_, ///< [in] collection
+    const ghobject_t &oid, ///< [in] object
+    const ObjectStore::omap_iter_seek_t &start_from, ///< [in] where the iterator should point to at the beginning
+    const OmapIterFunction &f, ///< [in] function to call for each key/value pair
+    ObjectStore *store
+  );
+
+  int omap_get_values(
+    ObjectStore::CollectionHandle &c_, ///< [in] collection
+    const ghobject_t &oid,              ///< [in] object
+    const std::set<std::string> &keys,  ///< [in] keys to get
+    std::map<std::string, ceph::buffer::list> *out, ///< [out] returned key/values
+    ObjectStore *store
+  );
+
+  int omap_get_header(
+    ObjectStore::CollectionHandle &c_,    ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    bool allow_eio, ///< [in] don't assert on eio
+    ObjectStore *store
+  );
+
+  int omap_get(
+    ObjectStore::CollectionHandle &c_,    ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out, /// < [out] Key to value map
+    ObjectStore *store
+  );
+
+  int omap_check_keys(
+    ObjectStore::CollectionHandle &c_,    ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    const std::set<std::string> &keys, ///< [in] Keys to check
+    std::set<std::string> *out,         ///< [out] Subset of keys defined on oid
+    ObjectStore *store
+  );
+
+  static bool should_be_removed(
+    const std::map<std::string, std::optional<std::string>>& removed_ranges,
+    const std::string_view key
+  );
 };
index 370cce17d2910d3eee9618f05178e7c725a853f7..3065f52f1d728565eb21ffee195bdee77bccfd10 100644 (file)
@@ -24,6 +24,7 @@
 #include "ECTypes.h"
 #include "messages/MOSDPGPushReply.h"
 #include "msg/MessageRef.h"
+#include "osd/ECOmapJournal.h"
 #if WITH_CRIMSON
 #include "crimson/osd/object_context.h"
 #include "os/Transaction.h"
index cc0f0f389226d307a6b8e4322bf90c65c776d307..7b3ead51bab1eef07bb833e96abbd938c91aa622 100644 (file)
@@ -452,4 +452,79 @@ public:
   bool get_is_ec_optimized() const final {
     return is_optimized();
   }
+  bool remove_ec_omap_journal_entry(const hobject_t &hoid, const ECOmapJournalEntry &entry) override {
+    ceph_assert(is_optimized());
+    return optimized.remove_ec_omap_journal_entry(hoid, entry);
+  }
+
+  std::pair<gen_t, bool> omap_get_generation(const hobject_t &hoid) override {
+    ceph_assert(is_optimized());
+    return optimized.omap_get_generation(hoid);
+  }
+
+  void omap_trim_delete_from_journal(const hobject_t &hoid, const version_t version) override {
+    ceph_assert(is_optimized());
+    optimized.omap_trim_delete_from_journal(hoid, version);
+  }
+
+  int omap_iterate (
+    ObjectStore::CollectionHandle &c_, ///< [in] collection
+    const ghobject_t &oid, ///< [in] object
+    const ObjectStore::omap_iter_seek_t &start_from,
+    ///^ [in] where the iterator should point to at the beginning
+    const OmapIterFunction &f ///< [in] function to call for each key/value pair
+  ) override {
+    if (!is_optimized()) {
+      return store->omap_iterate(c_, oid, start_from, f);
+    }
+    return optimized.omap_iterate(c_, oid, start_from, f, store);
+  }
+
+  int omap_get_values(
+    ObjectStore::CollectionHandle &c_, ///< [in] collection
+    const ghobject_t &oid, ///< [in] object
+    const std::set<std::string> &keys, ///< [in] keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] returned key/values
+  ) override {
+    if (!is_optimized()) {
+      return store->omap_get_values(c_, oid, keys, out);
+    }
+    return optimized.omap_get_values(c_, oid, keys, out, store);
+  }
+
+  int omap_get_header(
+    ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    ceph::buffer::list *header, ///< [out] omap header
+    bool allow_eio ///< [in] don't assert on eio
+  ) override {
+    if (!is_optimized()) {
+      return store->omap_get_header(c_, oid, header, allow_eio);
+    }
+    return optimized.omap_get_header(c_, oid, header, allow_eio, store);
+  }
+
+  int omap_get(
+    ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    ceph::buffer::list *header, ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
+  ) override {
+    if (!is_optimized()) {
+      return store->omap_get(c_, oid, header, out);
+    }
+    return optimized.omap_get(c_, oid, header, out, store);
+  }
+
+  int omap_check_keys(
+    ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    const std::set<std::string> &keys, ///< [in] Keys to check
+    std::set<std::string> *out ///< [out] Subset of keys defined on oid
+  ) override {
+    if (!is_optimized()) {
+      return store->omap_check_keys(c_, oid, keys, out);
+    }
+    return optimized.omap_check_keys(c_, oid, keys, out, store);
+  }
 };
index bb40c28e66c9179e90290d08e475a43d41f7009a..5be44f75db619f17c1576b03748632cd2f2c90c7 100644 (file)
@@ -341,11 +341,12 @@ struct Trimmer : public ObjectModDesc::Visitor {
   const hobject_t &soid;
   PGBackend *pg;
   ObjectStore::Transaction *t;
+  const pg_log_entry_t &entry;
   Trimmer(
-    const hobject_t &soid,
     PGBackend *pg,
-    ObjectStore::Transaction *t)
-    : soid(soid), pg(pg), t(t) {}
+    ObjectStore::Transaction *t,
+    const pg_log_entry_t &entry)
+    : soid(entry.soid), pg(pg), t(t), entry(entry) {}
   void rmobject(version_t old_version) override {
     pg->trim_rollback_object(
       soid,
@@ -377,6 +378,80 @@ struct Trimmer : public ObjectModDesc::Visitor {
       }
     }
   }
+
+  void ec_omap(bool clear_omap, std::optional<ceph::buffer::list> omap_header, 
+    std::vector<std::pair<OmapUpdateType, ceph::buffer::list>> &omap_updates) override {
+
+    auto shard = pg->get_parent()->whoami_shard().shard;
+    spg_t spg = pg->get_parent()->whoami_spg_t();
+    auto sinfo = pg->ec_get_sinfo();
+    const auto [gen, lost_delete] = pg->omap_get_generation(soid);
+
+    if (!sinfo.is_nonprimary_shard(shard)) {
+      // If lost_delete is true, check if the object exists before performing updates
+      bool should_update = true;
+      if (lost_delete) {
+        struct stat st;
+        int r = pg->store->stat(
+          pg->ch,
+          ghobject_t(soid, gen, shard),
+          &st,
+          true);
+        if (r != 0) {
+          // Object doesn't exist on this shard, skip the update
+          should_update = false;
+        }
+      }
+
+      if (should_update) {
+        if (omap_header) {
+          t->omap_setheader(
+            coll_t(spg),
+            ghobject_t(soid, gen, shard),
+            *(omap_header));
+        }
+
+        if (clear_omap) {
+          t->omap_clear(
+            coll_t(spg),
+            ghobject_t(soid, gen, shard));
+        }
+
+        for (auto &&up: omap_updates) {
+          switch (up.first) {
+            case OmapUpdateType::Remove:
+              t->omap_rmkeys(
+                coll_t(spg),
+                ghobject_t(soid, gen, shard),
+                up.second);
+              break;
+            case OmapUpdateType::Insert:
+              t->omap_setkeys(
+                coll_t(spg),
+                ghobject_t(soid, gen, shard),
+                up.second);
+              break;
+            case OmapUpdateType::RemoveRange:
+              t->omap_rmkeyrange(
+                coll_t(spg),
+                ghobject_t(soid, gen, shard),
+                up.second);
+              break;
+          }
+        }
+      }
+    }
+
+    // Only remove journal entry if generation is NO_GEN (object not deleted)
+    // If gen != NO_GEN, the object has been deleted and journal was already cleared
+    if (gen == ghobject_t::NO_GEN && pg->get_parent()->pgb_is_primary()) {
+      const ECOmapJournalEntry to_remove(
+        entry.version, clear_omap,
+        omap_header, omap_updates
+        );
+      pg->remove_ec_omap_journal_entry(soid, to_remove);
+    }
+  }
 };
 
 void PGBackend::rollforward(
@@ -387,7 +462,7 @@ void PGBackend::rollforward(
   ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
   if (!entry.can_rollback())
     return;
-  Trimmer trimmer(entry.soid, this, t);
+  Trimmer trimmer(this, t, entry);
   entry.mod_desc.visit(&trimmer);
 }
 
@@ -397,7 +472,7 @@ void PGBackend::trim(
 {
   if (!entry.can_rollback())
     return;
-  Trimmer trimmer(entry.soid, this, t);
+  Trimmer trimmer(this, t, entry);
   entry.mod_desc.visit(&trimmer);
 }
 
index d681c36b53c251ebb4f078f960522221bee696e9..57cc018eaa7b80fdd20802e428012b11a1496b5e 100644 (file)
@@ -35,6 +35,9 @@
 #include "common/ostream_temp.h"
 #include "Coroutines.h"
 
+
+class ECOmapJournalEntry;
+
 namespace Scrub {
   class Store;
 }
@@ -448,7 +451,45 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
    virtual shard_id_map<bufferlist> ec_decode_acting_set(
        const shard_id_map<bufferlist> &shard_map, int chunk_size) const = 0;
    virtual ECUtil::stripe_info_t ec_get_sinfo() const = 0;
-
+   virtual bool remove_ec_omap_journal_entry(const hobject_t &hoid, const ECOmapJournalEntry &entry) {
+     return false; // Only EC uses ec_omap_journal
+   };
+   virtual std::pair<gen_t, bool> omap_get_generation(const hobject_t &hoid) {
+     return {0, false}; // Only EC uses ec_omap_journal
+   };
+   virtual void omap_trim_delete_from_journal(const hobject_t &hoid, const version_t version) {};
+   using OmapIterFunction = std::function<ObjectStore::omap_iter_ret_t(std::string_view, std::string_view)>;
+   virtual int omap_iterate(
+     ObjectStore::CollectionHandle &c_, ///< [in] collection
+     const ghobject_t &oid, ///< [in] object
+     const ObjectStore::omap_iter_seek_t &start_from,
+     ///^ [in] where the iterator should point to at the beginning
+     const OmapIterFunction &f ///< [in] function to call for each key/value pair
+   ) = 0;
+   virtual int omap_get_values(
+     ObjectStore::CollectionHandle &c_, ///< [in] collection
+     const ghobject_t &oid, ///< [in] object
+     const std::set<std::string> &keys, ///< [in] keys to get
+     std::map<std::string, ceph::buffer::list> *out ///< [out] returned key/values
+   ) = 0;
+   virtual int omap_get_header(
+     ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+     const ghobject_t &oid, ///< [in] Object containing omap
+     ceph::buffer::list *header, ///< [out] omap header
+     bool allow_eio ///< [in] don't assert on eio
+   ) = 0;
+   virtual int omap_get(
+     ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+     const ghobject_t &oid, ///< [in] Object containing omap
+     ceph::buffer::list *header, ///< [out] omap header
+     std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
+   ) = 0;
+   virtual int omap_check_keys(
+     ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+     const ghobject_t &oid, ///< [in] Object containing omap
+     const std::set<std::string> &keys, ///< [in] Keys to check
+     std::set<std::string> *out ///< [out] Subset of keys defined on oid
+   ) = 0;
  private:
    std::set<hobject_t> temp_contents;
  public:
index da06b20face92e0560e26bd5a2c0f585715465f5..1c1308f7f843b4e0715f6ed1e5b0d95ef1ebab6b 100644 (file)
@@ -134,6 +134,48 @@ public:
     }
   }
 
+  int omap_iterate(
+    ObjectStore::CollectionHandle &c_, ///< [in] collection
+    const ghobject_t &oid, ///< [in] object
+    const ObjectStore::omap_iter_seek_t &start_from,
+    ///^ [in] where the iterator should point to at the beginning
+    const OmapIterFunction &f ///< [in] function to call for each key/value pair
+  ) override {
+    return store->omap_iterate(c_, oid, start_from, f);
+  }
+  int omap_get_values(
+    ObjectStore::CollectionHandle &c_, ///< [in] collection
+    const ghobject_t &oid, ///< [in] object
+    const std::set<std::string> &keys, ///< [in] keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] returned key/values
+  ) override {
+    return store->omap_get_values(c_, oid, keys, out);
+  }
+  int omap_get_header(
+    ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    ceph::buffer::list *header, ///< [out] omap header
+    bool allow_eio ///< [in] don't assert on eio
+  ) override {
+    return store->omap_get_header(c_, oid, header, allow_eio);
+  }
+  int omap_get(
+    ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    ceph::buffer::list *header, ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
+  ) override {
+    return store->omap_get(c_, oid, header, out);
+  }
+  int omap_check_keys(
+    ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    const std::set<std::string> &keys, ///< [in] Keys to check
+    std::set<std::string> *out ///< [out] Subset of keys defined on oid
+  ) override {
+    return store->omap_check_keys(c_, oid, keys, out);
+  }
+
   int objects_read_sync(
     const hobject_t &hoid,
     uint64_t off,
index eed7eaf23a03438f07aa6b191684b0d3ceff09ac..7d6972655401d3ac64d33efb56c251d8d3c9028f 100644 (file)
@@ -1329,6 +1329,7 @@ struct pg_pool_t {
     FLAG_CRIMSON = 1<<18,
     FLAG_EC_OPTIMIZATIONS = 1<<19, // enable optimizations, once enabled, cannot be disabled
     FLAG_CLIENT_SPLIT_READS = 1<<20, // Optimized EC is permitted to do direct reads.
+    FLAG_OMAP = 1<<21, // Pool is permitted to perform OMAP operations
   };
 
   static const char *get_flag_name(uint64_t f) {
@@ -1354,6 +1355,7 @@ struct pg_pool_t {
     case FLAG_CRIMSON: return "crimson";
     case FLAG_EC_OPTIMIZATIONS: return "ec_optimizations";
     case FLAG_CLIENT_SPLIT_READS: return "split_reads";
+    case FLAG_OMAP: return "supports_omap";
     default: return "???";
     }
   }
@@ -1414,6 +1416,8 @@ struct pg_pool_t {
       return FLAG_EC_OPTIMIZATIONS;
     if (name == "split_reads")
       return FLAG_CLIENT_SPLIT_READS;
+    if (name == "supports_omap")
+      return FLAG_OMAP;
     return 0;
   }
 
@@ -1824,7 +1828,7 @@ public:
   bool is_erasure() const { return get_type() == TYPE_ERASURE; }
 
   bool supports_omap() const {
-    return !(get_type() == TYPE_ERASURE);
+    return has_flag(FLAG_OMAP) || is_replicated();
   }
 
   bool requires_aligned_append() const {
@@ -4107,6 +4111,8 @@ public:
   public:
     virtual void append(uint64_t old_offset) {}
     virtual void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) {}
+    virtual void ec_omap(bool clear_omap, std::optional<ceph::buffer::list> omap_header,
+      std::vector<std::pair<OmapUpdateType, ceph::buffer::list>> &omap_updates) {}
     virtual void rmobject(version_t old_version) {}
     /**
      * Used to support the unfound_lost_delete log event: if the stashed
@@ -4135,7 +4141,8 @@ public:
     CREATE = 4,
     UPDATE_SNAPS = 5,
     TRY_DELETE = 6,
-    ROLLBACK_EXTENTS = 7
+    ROLLBACK_EXTENTS = 7,
+    EC_OMAP = 8
   };
   ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
     bl.reassign_to_mempool(mempool::mempool_osd_pglog);
@@ -4187,6 +4194,18 @@ public:
     encode(old_attrs, bl);
     ENCODE_FINISH(bl);
   }
+  void ec_omap(bool clear_omap, std::optional<ceph::buffer::list> omap_header,
+    std::vector<std::pair<OmapUpdateType, ceph::buffer::list>> &omap_updates) {
+    if(!can_local_rollback) {
+      return;
+    }
+    ENCODE_START(1, 1, bl);
+    append_id(EC_OMAP);
+    encode(clear_omap, bl);
+    encode(omap_header, bl);
+    encode(omap_updates, bl);
+    ENCODE_FINISH(bl);
+  }
   bool rmobject(version_t deletion_version) {
     if (!can_local_rollback || rollback_info_completed) {
       return false;
index f2d66a4f3f4c44af38ad0cabe06e11552f5e89e2..ffc8c6b5f494f9def056e60e962241f66399f48d 100644 (file)
@@ -94,6 +94,53 @@ public:
     return {0, 0, 0};
   }
 
+  using OmapIterFunction = std::function<ObjectStore::omap_iter_ret_t(std::string_view, std::string_view)>;
+  int omap_iterate(
+    ObjectStore::CollectionHandle &c_, ///< [in] collection
+    const ghobject_t &oid, ///< [in] object
+    const ObjectStore::omap_iter_seek_t &start_from,
+    ///^ [in] where the iterator should point to at the beginning
+    const OmapIterFunction &f ///< [in] function to call for each key/value pair
+  ) override {
+    return 0;
+  }
+
+  int omap_get_values(
+    ObjectStore::CollectionHandle &c_, ///< [in] collection
+    const ghobject_t &oid, ///< [in] object
+    const std::set<std::string> &keys, ///< [in] keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] returned key/values
+  ) override {
+    return 0;
+  }
+
+  int omap_get_header(
+    ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    ceph::buffer::list *header, ///< [out] omap header
+    bool allow_eio ///< [in] don't assert on eio
+  ) override {
+    return 0;
+  }
+
+  int omap_get(
+    ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    ceph::buffer::list *header, ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
+  ) override {
+    return 0;
+  }
+
+  int omap_check_keys(
+    ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    const std::set<std::string> &keys, ///< [in] Keys to check
+    std::set<std::string> *out ///< [out] Subset of keys defined on oid
+  ) override {
+    return 0;
+  }
+
   // Transaction submission
   void submit_transaction(
     const hobject_t &hoid,