From: Matty Williams Date: Fri, 12 Dec 2025 11:21:10 +0000 (+0000) Subject: osd: Introduce functions required for EC OMAP support X-Git-Tag: v21.0.1^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=df84974a39423261a8d864063979a26c51e51eae;p=ceph.git osd: Introduce functions required for EC OMAP support Introduced a "supports_omap" pool flag which is always enabled for Replicated pools and currently always disabled for EC pools. Introduced wrappers around omap read operations in PGBackend to include updates from the journal in EC pools with optimisations enabled. Introduced a function for encoding an EC_OMAP operation in the ObjectModDesc::Visitor class and a function for committing an operation in the Trimmer struct. Signed-off-by: Matty Williams --- diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index aae1c46b79c..3c7f13db567 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -1170,11 +1170,136 @@ COMMAND("osd pool rename " "rename to ", "osd", "rw") COMMAND("osd pool get " "name=pool,type=CephPoolname " - "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay|allow_ec_optimizations|ec_data_shard_count|ec_coding_shard_count", + "name=var,type=CephChoices,strings=all" + "|allow_ec_optimizations" + "|allow_ec_overwrites" + "|bulk" + "|cache_min_evict_age" + "|cache_min_flush_age" + "|cache_target_dirty_high_ratio" + "|cache_target_dirty_ratio" + "|cache_target_full_ratio" + "|compression_algorithm" + "|compression_max_blob_size" + "|compression_min_blob_size" + "|compression_mode" + "|compression_required_ratio" + "|crush_rule" + "|csum_max_block" + "|csum_min_block" + "|csum_type" + "|deep_scrub_interval" + "|dedup_cdc_chunk_size" + "|dedup_chunk_algorithm" + "|dedup_tier" + "|ec_coding_shard_count" + "|ec_data_shard_count" + "|eio" + "|erasure_code_profile" + "|fast_read" + "|fingerprint_algorithm" + "|hashpspool" + "|hit_set_count" + "|hit_set_fpp" + "|hit_set_grade_decay_rate" + "|hit_set_period" + "|hit_set_search_last_n" + "|hit_set_type" + "|min_read_recency_for_promote" + "|min_size" + "|min_write_recency_for_promote" + "|nodeep-scrub" + "|nodelete" + "|nopgchange" + "|noscrub" + "|nosizechange" + "|pct_update_delay" + "|pg_autoscale_bias" + "|pg_autoscale_mode" + "|pg_num" + "|pg_num_max" + "|pg_num_min" + "|pgp_num" + "|read_ratio" + "|recovery_op_priority" + "|recovery_priority" + "|scrub_max_interval" + "|scrub_min_interval" + "|scrub_priority" + "|size" + "|supports_omap" + "|target_max_bytes" + "|target_max_objects" + "|target_size_bytes" + "|target_size_ratio" + "|use_gmt_hitset" + "|write_fadvise_dontneed", "get pool parameter ", "osd", "r") COMMAND("osd pool set " "name=pool,type=CephPoolname " - "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay|allow_ec_optimizations|set_pool_flags|unset_pool_flags " + "name=var,type=CephChoices,strings=allow_ec_optimizations" + "|allow_ec_overwrites" + "|bulk" + "|cache_min_evict_age" + "|cache_min_flush_age" + "|cache_target_dirty_high_ratio" + "|cache_target_dirty_ratio" + "|cache_target_full_ratio" + "|compression_algorithm" + "|compression_max_blob_size" + "|compression_min_blob_size" + "|compression_mode" + "|compression_required_ratio" + "|crush_rule" + "|csum_max_block" + "|csum_min_block" + "|csum_type" + "|deep_scrub_interval" + "|dedup_cdc_chunk_size" + "|dedup_chunk_algorithm" + "|dedup_tier" + "|eio" + "|fast_read" + "|fingerprint_algorithm" + "|hashpspool" + "|hit_set_count" + "|hit_set_fpp" + "|hit_set_grade_decay_rate" + "|hit_set_period" + "|hit_set_search_last_n" + "|hit_set_type" + "|min_read_recency_for_promote" + "|min_size" + "|min_write_recency_for_promote" + "|nodeep-scrub" + "|nodelete" + "|nopgchange" + "|noscrub" + "|nosizechange" + "|pct_update_delay" + "|pg_autoscale_bias" + "|pg_autoscale_mode" + "|pg_num" + "|pg_num_max" + "|pg_num_min" + "|pgp_num" + "|pgp_num_actual" + "|read_ratio" + "|recovery_op_priority" + "|recovery_priority" + "|scrub_max_interval" + "|scrub_min_interval" + "|scrub_priority" + "|set_pool_flags" + "|size" + "|supports_omap" + "|target_max_bytes" + "|target_max_objects" + "|target_size_bytes" + "|target_size_ratio" + "|unset_pool_flags" + "|use_gmt_hitset" + "|write_fadvise_dontneed " "name=val,type=CephString " "name=yes_i_really_mean_it,type=CephBool,req=false", "set pool parameter to ", "osd", "rw") diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 71b22918158..c27879fcd44 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1995,6 +1995,17 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) maybe_enable_pool_split_ops(pending_inc.new_pools[id]); } } + + // Auto-enable omap support for replicated pools + for (auto& [pool_id, pool] : tmp.get_pools()) { + if (!pool.has_flag(pg_pool_t::FLAG_OMAP) && pool.is_replicated()) { + pg_pool_t p = pool; + p.flags |= pg_pool_t::FLAG_OMAP; + pending_inc.new_pools[pool_id] = p; + dout(10) << __func__ << " replicated pool " << pool_id + << " has OMAP support auto-enabled" << dendl; + } + } } } @@ -5471,7 +5482,8 @@ namespace { PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO, PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM, DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX, READ_RATIO, - EC_OPTIMIZATIONS, EC_DATA_SHARD_COUNT, EC_CODING_SHARD_COUNT }; + EC_OPTIMIZATIONS, EC_DATA_SHARD_COUNT, EC_CODING_SHARD_COUNT, + SUPPORTS_OMAP }; std::set subtract_second_from_first(const std::set& first, @@ -6280,6 +6292,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) {"allow_ec_optimizations", EC_OPTIMIZATIONS}, {"ec_data_shard_count", EC_DATA_SHARD_COUNT}, {"ec_coding_shard_count", EC_CODING_SHARD_COUNT}, + {"supports_omap", SUPPORTS_OMAP}, }; typedef std::set choices_set_t; @@ -6552,6 +6565,9 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) f->dump_unsigned("ec_coding_shard_count", p->ec_coding_shard_count.value_or(0)); break; + case SUPPORTS_OMAP: + f->dump_bool("supports_omap", p->supports_omap()); + break; } } f->close_section(); @@ -6738,6 +6754,10 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) << static_cast(p->ec_coding_shard_count.value_or(0)) << "\n"; break; + case SUPPORTS_OMAP: + ss << "supports_omap: " << + (p->supports_omap() ? "true" : "false") << "\n"; + break; } rdata.append(ss.str()); ss.str(""); @@ -8331,6 +8351,10 @@ int OSDMonitor::prepare_new_pool(string& name, if (crimson) { pi->set_flag(pg_pool_t::FLAG_CRIMSON); } + if (pool_type == pg_pool_t::TYPE_REPLICATED + && osdmap.require_osd_release >= ceph_release_t::umbrella) { + pi->set_flag(pg_pool_t::FLAG_OMAP); + } pi->size = size; pi->min_size = min_size; @@ -9125,6 +9149,25 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, } else { p.unset_flag(n); } + } else if (var == "supports_omap") { + if ((val == "true") && osdmap.require_osd_release < ceph_release_t::umbrella) { + ss << "supports_omap cannot be enabled until require_osd_release is set to umbrella or later"; + return -EPERM; + } + // Disabling omap support will leave omap data in RocksDB which cannot be cleaned up + // It will also break any services that depend on this pool to store metadata + if ((val == "false") && (p.has_flag(pg_pool_t::FLAG_OMAP))) { + ss << "supports_omap cannot be disabled once enabled"; + return -EINVAL; + } + // This restriction is temporary until omap support is well tested in Fast EC pools + if ((val == "true") && p.is_erasure()) { + ss << "supports_omap cannot be enabled in ec pools"; + return -EINVAL; + } + if (val == "true") { + p.flags |= pg_pool_t::FLAG_OMAP; + } } else if (var == "target_max_objects") { if (interr.length()) { ss << "error parsing int '" << val << "': " << interr; diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 3d7725c4bf9..ea9b5511f65 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1408,3 +1408,227 @@ int ECBackend::be_deep_scrub( o.omap_digest_present = true; return 0; } + +bool ECBackend::remove_ec_omap_journal_entry(const hobject_t &hoid, const ECOmapJournalEntry &entry) { + return ec_omap_journal.remove_entry(hoid, entry); +} + +std::pair ECBackend::omap_get_generation(const hobject_t& hoid) { + return ec_omap_journal.get_generation(hoid); +} + +void ECBackend::omap_trim_delete_from_journal(const hobject_t &hoid, const version_t version) { + return ec_omap_journal.trim_delete(hoid, version); +} + +int ECBackend::omap_iterate ( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const ObjectStore::omap_iter_seek_t &start_from, + ///^ [in] where the iterator should point to at the beginning + const OmapIterFunction &f, ///< [in] function to call for each key/value pair + ObjectStore *store +) { + // Updates in update_map take priority over removed_ranges + auto [update_map, removed_ranges] = ec_omap_journal.get_value_updates(oid.hobj); + + auto journal_it = update_map.begin(); + if (!start_from.seek_position.empty()) { + journal_it = update_map.lower_bound(start_from.seek_position); + } + + auto wrapper = [&](const std::string_view store_key, const std::string_view store_value) { + bool found_store_key_in_journal = false; + + while (journal_it != update_map.end() && journal_it->first <= store_key) { + if (journal_it->first == store_key) { + found_store_key_in_journal = true; + } + if (journal_it->second.value.has_value()) { + ObjectStore::omap_iter_ret_t r = f( + journal_it->first, + std::string_view( + journal_it->second.value->c_str(), + journal_it->second.value->length() + ) + ); + if (r == ObjectStore::omap_iter_ret_t::STOP) { + return r; + } + } + ++journal_it; + } + + if (found_store_key_in_journal) { + return ObjectStore::omap_iter_ret_t::NEXT; + } + + if (should_be_removed(removed_ranges, store_key)) { + return ObjectStore::omap_iter_ret_t::NEXT; + } + + return f(store_key, store_value); + }; + + if (const auto result = store->omap_iterate(c_, oid, start_from, wrapper); + result < 0) { + return result; + } else if (result > 0) { + return 1; + } + + auto ret = ObjectStore::omap_iter_ret_t::NEXT; + while (journal_it != update_map.end()) { + if (journal_it->second.value.has_value()) { + ret = f(journal_it->first, std::string_view(journal_it->second.value->c_str(), journal_it->second.value->length())); + if (ret == ObjectStore::omap_iter_ret_t::STOP) { + break; + } + } + ++journal_it; + } + + return ret == ObjectStore::omap_iter_ret_t::STOP ? 1 : 0; +} + +int ECBackend::omap_get_values( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const std::set &keys, ///< [in] keys to get + std::map *out, ///< [out] returned key/values + ObjectStore *store +) { + auto [update_map, removed_ranges] = ec_omap_journal.get_value_updates(oid.hobj); + + set keys_still_to_get; + for (auto &key : keys) { + if (auto it = update_map.find(key); + it != update_map.end()) { + if (!it->second.value.has_value()) { + continue; + } + (*out)[key] = *(it->second.value); + } else if (should_be_removed(removed_ranges, key)) { + continue; + } else { + keys_still_to_get.insert(key); + } + } + store->omap_get_values(c_, oid, keys_still_to_get, out); + + return 0; +} + +int ECBackend::omap_get_header( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + const bool allow_eio, ///< [in] don't assert on eio + ObjectStore *store +) { + std::optional header_from_journal = ec_omap_journal.get_updated_header(oid.hobj); + if (header_from_journal) { + *header = *header_from_journal; + } else { + store->omap_get_header(c_, oid, header, allow_eio); + } + return 0; +} + +int ECBackend::omap_get( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + std::map *out, /// < [out] Key to value map + ObjectStore *store +) { + // Update map takes priority over removed_ranges + auto [update_map, removed_ranges] = ec_omap_journal.get_value_updates(oid.hobj); + const auto updated_header = ec_omap_journal.get_updated_header(oid.hobj); + + if (const int r = store->omap_get(c_, oid, header, out); + r < 0) { + return r; + } + + // Update header if present + if (updated_header) { + *header = *updated_header; + } + + // Remove keys in removed_ranges + for (auto out_it = out->begin(); out_it != out->end(); ++out_it) { + if (should_be_removed(removed_ranges, out_it->first)) { + out->erase(out_it->first); + } + } + + // Apply updates in update_map + for (const auto &[key, val_opt] : update_map) { + if (val_opt.value.has_value()) { + (*out)[key] = *(val_opt.value); + } else { + out->erase(key); + } + } + + return 0; +} + +int ECBackend::omap_check_keys( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::set &keys, ///< [in] Keys to check + std::set *out, ///< [out] Subset of keys defined on oid + ObjectStore *store +) { + // Update map takes priority over removed_ranges + auto [update_map, removed_ranges] = ec_omap_journal.get_value_updates(oid.hobj); + auto updated_header = ec_omap_journal.get_updated_header(oid.hobj); + + // First check keys in update_map and removed_ranges + set keys_to_check_on_disk; + for (const auto &key : keys) { + if (auto it = update_map.find(key); + it != update_map.end()) { + if (it->second.value.has_value()) { + out->insert(key); + } + } else if (should_be_removed(removed_ranges, key)) { + continue; + } else { + keys_to_check_on_disk.insert(key); + } + } + + const int r = store->omap_check_keys(c_, oid, keys_to_check_on_disk, out); + + return r; +} + +bool ECBackend::should_be_removed( + const std::map>& removed_ranges, + const std::string_view key) { + if (removed_ranges.empty()) { + return false; + } + + // Find range that comes after this key + auto it = removed_ranges.upper_bound(std::string(key)); + + // If all ranges start after the key, it can't be in any range + if (it == removed_ranges.begin()) { + return false; + } + + // Go back to the previous range + --it; + // If this range contains the key, return true + const auto& end_opt = it->second; + if (!end_opt || key < *end_opt) { + return true; + } + + // No ranges contain the key, return false + return false; +} diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index fbc74d2febc..a5f013a0bb3 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -406,4 +406,54 @@ public: } return object_size_to_shard_size(logical_size, shard_id); } + + bool remove_ec_omap_journal_entry(const hobject_t &hoid, const ECOmapJournalEntry &entry); + std::pair omap_get_generation(const hobject_t &hoid); + void omap_trim_delete_from_journal(const hobject_t &hoid, const version_t version); + + using OmapIterFunction = std::function; + int omap_iterate ( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const ObjectStore::omap_iter_seek_t &start_from, ///< [in] where the iterator should point to at the beginning + const OmapIterFunction &f, ///< [in] function to call for each key/value pair + ObjectStore *store + ); + + int omap_get_values( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const std::set &keys, ///< [in] keys to get + std::map *out, ///< [out] returned key/values + ObjectStore *store + ); + + int omap_get_header( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + bool allow_eio, ///< [in] don't assert on eio + ObjectStore *store + ); + + int omap_get( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + std::map *out, /// < [out] Key to value map + ObjectStore *store + ); + + int omap_check_keys( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::set &keys, ///< [in] Keys to check + std::set *out, ///< [out] Subset of keys defined on oid + ObjectStore *store + ); + + static bool should_be_removed( + const std::map>& removed_ranges, + const std::string_view key + ); }; diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h index 370cce17d29..3065f52f1d7 100644 --- a/src/osd/ECCommon.h +++ b/src/osd/ECCommon.h @@ -24,6 +24,7 @@ #include "ECTypes.h" #include "messages/MOSDPGPushReply.h" #include "msg/MessageRef.h" +#include "osd/ECOmapJournal.h" #if WITH_CRIMSON #include "crimson/osd/object_context.h" #include "os/Transaction.h" diff --git a/src/osd/ECSwitch.h b/src/osd/ECSwitch.h index cc0f0f38922..7b3ead51bab 100644 --- a/src/osd/ECSwitch.h +++ b/src/osd/ECSwitch.h @@ -452,4 +452,79 @@ public: bool get_is_ec_optimized() const final { return is_optimized(); } + bool remove_ec_omap_journal_entry(const hobject_t &hoid, const ECOmapJournalEntry &entry) override { + ceph_assert(is_optimized()); + return optimized.remove_ec_omap_journal_entry(hoid, entry); + } + + std::pair omap_get_generation(const hobject_t &hoid) override { + ceph_assert(is_optimized()); + return optimized.omap_get_generation(hoid); + } + + void omap_trim_delete_from_journal(const hobject_t &hoid, const version_t version) override { + ceph_assert(is_optimized()); + optimized.omap_trim_delete_from_journal(hoid, version); + } + + int omap_iterate ( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const ObjectStore::omap_iter_seek_t &start_from, + ///^ [in] where the iterator should point to at the beginning + const OmapIterFunction &f ///< [in] function to call for each key/value pair + ) override { + if (!is_optimized()) { + return store->omap_iterate(c_, oid, start_from, f); + } + return optimized.omap_iterate(c_, oid, start_from, f, store); + } + + int omap_get_values( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const std::set &keys, ///< [in] keys to get + std::map *out ///< [out] returned key/values + ) override { + if (!is_optimized()) { + return store->omap_get_values(c_, oid, keys, out); + } + return optimized.omap_get_values(c_, oid, keys, out, store); + } + + int omap_get_header( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) override { + if (!is_optimized()) { + return store->omap_get_header(c_, oid, header, allow_eio); + } + return optimized.omap_get_header(c_, oid, header, allow_eio, store); + } + + int omap_get( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + std::map *out /// < [out] Key to value map + ) override { + if (!is_optimized()) { + return store->omap_get(c_, oid, header, out); + } + return optimized.omap_get(c_, oid, header, out, store); + } + + int omap_check_keys( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::set &keys, ///< [in] Keys to check + std::set *out ///< [out] Subset of keys defined on oid + ) override { + if (!is_optimized()) { + return store->omap_check_keys(c_, oid, keys, out); + } + return optimized.omap_check_keys(c_, oid, keys, out, store); + } }; diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc index bb40c28e66c..5be44f75db6 100644 --- a/src/osd/PGBackend.cc +++ b/src/osd/PGBackend.cc @@ -341,11 +341,12 @@ struct Trimmer : public ObjectModDesc::Visitor { const hobject_t &soid; PGBackend *pg; ObjectStore::Transaction *t; + const pg_log_entry_t &entry; Trimmer( - const hobject_t &soid, PGBackend *pg, - ObjectStore::Transaction *t) - : soid(soid), pg(pg), t(t) {} + ObjectStore::Transaction *t, + const pg_log_entry_t &entry) + : soid(entry.soid), pg(pg), t(t), entry(entry) {} void rmobject(version_t old_version) override { pg->trim_rollback_object( soid, @@ -377,6 +378,80 @@ struct Trimmer : public ObjectModDesc::Visitor { } } } + + void ec_omap(bool clear_omap, std::optional omap_header, + std::vector> &omap_updates) override { + + auto shard = pg->get_parent()->whoami_shard().shard; + spg_t spg = pg->get_parent()->whoami_spg_t(); + auto sinfo = pg->ec_get_sinfo(); + const auto [gen, lost_delete] = pg->omap_get_generation(soid); + + if (!sinfo.is_nonprimary_shard(shard)) { + // If lost_delete is true, check if the object exists before performing updates + bool should_update = true; + if (lost_delete) { + struct stat st; + int r = pg->store->stat( + pg->ch, + ghobject_t(soid, gen, shard), + &st, + true); + if (r != 0) { + // Object doesn't exist on this shard, skip the update + should_update = false; + } + } + + if (should_update) { + if (omap_header) { + t->omap_setheader( + coll_t(spg), + ghobject_t(soid, gen, shard), + *(omap_header)); + } + + if (clear_omap) { + t->omap_clear( + coll_t(spg), + ghobject_t(soid, gen, shard)); + } + + for (auto &&up: omap_updates) { + switch (up.first) { + case OmapUpdateType::Remove: + t->omap_rmkeys( + coll_t(spg), + ghobject_t(soid, gen, shard), + up.second); + break; + case OmapUpdateType::Insert: + t->omap_setkeys( + coll_t(spg), + ghobject_t(soid, gen, shard), + up.second); + break; + case OmapUpdateType::RemoveRange: + t->omap_rmkeyrange( + coll_t(spg), + ghobject_t(soid, gen, shard), + up.second); + break; + } + } + } + } + + // Only remove journal entry if generation is NO_GEN (object not deleted) + // If gen != NO_GEN, the object has been deleted and journal was already cleared + if (gen == ghobject_t::NO_GEN && pg->get_parent()->pgb_is_primary()) { + const ECOmapJournalEntry to_remove( + entry.version, clear_omap, + omap_header, omap_updates + ); + pg->remove_ec_omap_journal_entry(soid, to_remove); + } + } }; void PGBackend::rollforward( @@ -387,7 +462,7 @@ void PGBackend::rollforward( ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl; if (!entry.can_rollback()) return; - Trimmer trimmer(entry.soid, this, t); + Trimmer trimmer(this, t, entry); entry.mod_desc.visit(&trimmer); } @@ -397,7 +472,7 @@ void PGBackend::trim( { if (!entry.can_rollback()) return; - Trimmer trimmer(entry.soid, this, t); + Trimmer trimmer(this, t, entry); entry.mod_desc.visit(&trimmer); } diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index d681c36b53c..57cc018eaa7 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -35,6 +35,9 @@ #include "common/ostream_temp.h" #include "Coroutines.h" + +class ECOmapJournalEntry; + namespace Scrub { class Store; } @@ -448,7 +451,45 @@ typedef std::shared_ptr OSDMapRef; virtual shard_id_map ec_decode_acting_set( const shard_id_map &shard_map, int chunk_size) const = 0; virtual ECUtil::stripe_info_t ec_get_sinfo() const = 0; - + virtual bool remove_ec_omap_journal_entry(const hobject_t &hoid, const ECOmapJournalEntry &entry) { + return false; // Only EC uses ec_omap_journal + }; + virtual std::pair omap_get_generation(const hobject_t &hoid) { + return {0, false}; // Only EC uses ec_omap_journal + }; + virtual void omap_trim_delete_from_journal(const hobject_t &hoid, const version_t version) {}; + using OmapIterFunction = std::function; + virtual int omap_iterate( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const ObjectStore::omap_iter_seek_t &start_from, + ///^ [in] where the iterator should point to at the beginning + const OmapIterFunction &f ///< [in] function to call for each key/value pair + ) = 0; + virtual int omap_get_values( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const std::set &keys, ///< [in] keys to get + std::map *out ///< [out] returned key/values + ) = 0; + virtual int omap_get_header( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) = 0; + virtual int omap_get( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + std::map *out /// < [out] Key to value map + ) = 0; + virtual int omap_check_keys( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::set &keys, ///< [in] Keys to check + std::set *out ///< [out] Subset of keys defined on oid + ) = 0; private: std::set temp_contents; public: diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index da06b20face..1c1308f7f84 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -134,6 +134,48 @@ public: } } + int omap_iterate( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const ObjectStore::omap_iter_seek_t &start_from, + ///^ [in] where the iterator should point to at the beginning + const OmapIterFunction &f ///< [in] function to call for each key/value pair + ) override { + return store->omap_iterate(c_, oid, start_from, f); + } + int omap_get_values( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const std::set &keys, ///< [in] keys to get + std::map *out ///< [out] returned key/values + ) override { + return store->omap_get_values(c_, oid, keys, out); + } + int omap_get_header( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) override { + return store->omap_get_header(c_, oid, header, allow_eio); + } + int omap_get( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + std::map *out /// < [out] Key to value map + ) override { + return store->omap_get(c_, oid, header, out); + } + int omap_check_keys( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::set &keys, ///< [in] Keys to check + std::set *out ///< [out] Subset of keys defined on oid + ) override { + return store->omap_check_keys(c_, oid, keys, out); + } + int objects_read_sync( const hobject_t &hoid, uint64_t off, diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index eed7eaf23a0..7d697265540 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1329,6 +1329,7 @@ struct pg_pool_t { FLAG_CRIMSON = 1<<18, FLAG_EC_OPTIMIZATIONS = 1<<19, // enable optimizations, once enabled, cannot be disabled FLAG_CLIENT_SPLIT_READS = 1<<20, // Optimized EC is permitted to do direct reads. + FLAG_OMAP = 1<<21, // Pool is permitted to perform OMAP operations }; static const char *get_flag_name(uint64_t f) { @@ -1354,6 +1355,7 @@ struct pg_pool_t { case FLAG_CRIMSON: return "crimson"; case FLAG_EC_OPTIMIZATIONS: return "ec_optimizations"; case FLAG_CLIENT_SPLIT_READS: return "split_reads"; + case FLAG_OMAP: return "supports_omap"; default: return "???"; } } @@ -1414,6 +1416,8 @@ struct pg_pool_t { return FLAG_EC_OPTIMIZATIONS; if (name == "split_reads") return FLAG_CLIENT_SPLIT_READS; + if (name == "supports_omap") + return FLAG_OMAP; return 0; } @@ -1824,7 +1828,7 @@ public: bool is_erasure() const { return get_type() == TYPE_ERASURE; } bool supports_omap() const { - return !(get_type() == TYPE_ERASURE); + return has_flag(FLAG_OMAP) || is_replicated(); } bool requires_aligned_append() const { @@ -4107,6 +4111,8 @@ public: public: virtual void append(uint64_t old_offset) {} virtual void setattrs(std::map> &attrs) {} + virtual void ec_omap(bool clear_omap, std::optional omap_header, + std::vector> &omap_updates) {} virtual void rmobject(version_t old_version) {} /** * Used to support the unfound_lost_delete log event: if the stashed @@ -4135,7 +4141,8 @@ public: CREATE = 4, UPDATE_SNAPS = 5, TRY_DELETE = 6, - ROLLBACK_EXTENTS = 7 + ROLLBACK_EXTENTS = 7, + EC_OMAP = 8 }; ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) { bl.reassign_to_mempool(mempool::mempool_osd_pglog); @@ -4187,6 +4194,18 @@ public: encode(old_attrs, bl); ENCODE_FINISH(bl); } + void ec_omap(bool clear_omap, std::optional omap_header, + std::vector> &omap_updates) { + if(!can_local_rollback) { + return; + } + ENCODE_START(1, 1, bl); + append_id(EC_OMAP); + encode(clear_omap, bl); + encode(omap_header, bl); + encode(omap_updates, bl); + ENCODE_FINISH(bl); + } bool rmobject(version_t deletion_version) { if (!can_local_rollback || rollback_info_completed) { return false; diff --git a/src/test/osd/MockPGBackend.h b/src/test/osd/MockPGBackend.h index f2d66a4f3f4..ffc8c6b5f49 100644 --- a/src/test/osd/MockPGBackend.h +++ b/src/test/osd/MockPGBackend.h @@ -94,6 +94,53 @@ public: return {0, 0, 0}; } + using OmapIterFunction = std::function; + int omap_iterate( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const ObjectStore::omap_iter_seek_t &start_from, + ///^ [in] where the iterator should point to at the beginning + const OmapIterFunction &f ///< [in] function to call for each key/value pair + ) override { + return 0; + } + + int omap_get_values( + ObjectStore::CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + const std::set &keys, ///< [in] keys to get + std::map *out ///< [out] returned key/values + ) override { + return 0; + } + + int omap_get_header( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) override { + return 0; + } + + int omap_get( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + ceph::buffer::list *header, ///< [out] omap header + std::map *out /// < [out] Key to value map + ) override { + return 0; + } + + int omap_check_keys( + ObjectStore::CollectionHandle &c_, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const std::set &keys, ///< [in] Keys to check + std::set *out ///< [out] Subset of keys defined on oid + ) override { + return 0; + } + // Transaction submission void submit_transaction( const hobject_t &hoid,