From b4b2ab171c123b6701e8c78dda6c7e20e657639a Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 4 Aug 2022 00:17:50 -0700 Subject: [PATCH] crimson/osd: rework OSDMap handling for multicore OSDMaps can occupy significant space in memory. Duplicating OSDMaps across cores would multiply that memory usage as well as complicate the internal structure sharing we do when applying OSDMap incrementals. Because PeeringState and other interfaces expect efficient copying of OSDMapRef objects we don't want to use foreign_ptr directly. Instead, maintain a single cache and distribute local_shared_foreign_ptrs to other cores. ShardServices becomes the only OSDMapService. Signed-off-by: Samuel Just --- src/crimson/admin/osd_admin.cc | 4 +- src/crimson/admin/pg_commands.cc | 2 +- src/crimson/osd/heartbeat.cc | 20 +++--- src/crimson/osd/ops_executer.h | 1 - src/crimson/osd/osd.cc | 56 ++++++++++------- .../osd/osd_operations/peering_event.cc | 2 +- .../osd/osd_operations/pg_advance_map.cc | 6 +- src/crimson/osd/osdmap_service.h | 7 ++- src/crimson/osd/pg.h | 3 +- src/crimson/osd/pg_backend.h | 1 - src/crimson/osd/pg_shard_manager.cc | 5 ++ src/crimson/osd/pg_shard_manager.h | 17 +++-- src/crimson/osd/shard_services.cc | 22 +++---- src/crimson/osd/shard_services.h | 62 ++++++++++++------- src/osd/OSDMap.h | 4 +- 15 files changed, 122 insertions(+), 90 deletions(-) diff --git a/src/crimson/admin/osd_admin.cc b/src/crimson/admin/osd_admin.cc index 5841e5f0b94..b5368738904 100644 --- a/src/crimson/admin/osd_admin.cc +++ b/src/crimson/admin/osd_admin.cc @@ -369,7 +369,7 @@ public: { ghobject_t obj; try { - obj = test_ops_get_object_name(*shard_services.get_osdmap(), cmdmap); + obj = test_ops_get_object_name(*shard_services.get_map(), cmdmap); } catch (const std::invalid_argument& e) { logger().info("error during data error injection: {}", e.what()); return seastar::make_ready_future(-EINVAL, @@ -411,7 +411,7 @@ public: { ghobject_t obj; try { - obj = test_ops_get_object_name(*shard_services.get_osdmap(), cmdmap); + obj = test_ops_get_object_name(*shard_services.get_map(), cmdmap); } catch (const std::invalid_argument& e) { logger().info("error during metadata error injection: {}", e.what()); return seastar::make_ready_future(-EINVAL, diff --git a/src/crimson/admin/pg_commands.cc b/src/crimson/admin/pg_commands.cc index e116e491fb5..f2c84b254db 100644 --- a/src/crimson/admin/pg_commands.cc +++ b/src/crimson/admin/pg_commands.cc @@ -51,7 +51,7 @@ public: tell_result_t{-EINVAL, fmt::format("couldn't parse pgid '{}'", pgid_str)}); } // am i the primary for this pg? - const auto osdmap = osd.get_shard_services().get_osdmap(); + const auto osdmap = osd.get_shard_services().get_map(); spg_t spg_id; if (!osdmap->get_primary_shard(pgid, &spg_id)) { return seastar::make_ready_future(tell_result_t{ diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc index 4073f80ae3c..049bdac0383 100644 --- a/src/crimson/osd/heartbeat.cc +++ b/src/crimson/osd/heartbeat.cc @@ -137,7 +137,7 @@ Heartbeat::osds_t Heartbeat::remove_down_peers() { osds_t old_osds; // osds not added in this epoch for (auto i = peers.begin(); i != peers.end(); ) { - auto osdmap = service.get_osdmap_service().get_map(); + auto osdmap = service.get_map(); const auto& [osd, peer] = *i; if (!osdmap->is_up(osd)) { i = peers.erase(i); @@ -153,7 +153,7 @@ Heartbeat::osds_t Heartbeat::remove_down_peers() void Heartbeat::add_reporter_peers(int whoami) { - auto osdmap = service.get_osdmap_service().get_map(); + auto osdmap = service.get_map(); // include next and previous up osds to ensure we have a fully-connected set set want; if (auto next = osdmap->get_next_up_osd_after(whoami); next >= 0) { @@ -188,7 +188,7 @@ void Heartbeat::update_peers(int whoami) remove_peer(osd); } // or too few? - auto osdmap = service.get_osdmap_service().get_map(); + auto osdmap = service.get_map(); auto epoch = osdmap->get_epoch(); for (auto next = osdmap->get_next_up_osd_after(whoami); peers.size() < min_peers && next >= 0 && next != whoami; @@ -291,12 +291,12 @@ seastar::future<> Heartbeat::handle_ping(crimson::net::ConnectionRef conn, auto reply = crimson::make_message( m->fsid, - service.get_osdmap_service().get_map()->get_epoch(), + service.get_map()->get_epoch(), MOSDPing::PING_REPLY, m->ping_stamp, m->mono_ping_stamp, service.get_mnow(), - service.get_osdmap_service().get_up_epoch(), + service.get_up_epoch(), min_message); return conn->send(std::move(reply)); } @@ -586,7 +586,7 @@ seastar::future<> Heartbeat::Peer::handle_reply( entity_addr_t Heartbeat::Peer::get_peer_addr(type_t type) { - const auto osdmap = heartbeat.service.get_osdmap_service().get_map(); + const auto osdmap = heartbeat.service.get_map(); if (type == type_t::front) { return osdmap->get_hb_front_addrs(peer).front(); } else { @@ -625,12 +625,12 @@ void Heartbeat::Peer::do_send_heartbeat( local_conf()->osd_heartbeat_min_size); auto ping = crimson::make_message( heartbeat.monc.get_fsid(), - heartbeat.service.get_osdmap_service().get_map()->get_epoch(), + heartbeat.service.get_map()->get_epoch(), MOSDPing::PING, sent_stamp, mnow, mnow, - heartbeat.service.get_osdmap_service().get_up_epoch(), + heartbeat.service.get_up_epoch(), min_message); if (futures) { futures->push_back(conn.send(std::move(ping))); @@ -649,7 +649,7 @@ bool Heartbeat::FailingPeers::add_pending( } auto failed_for = std::chrono::duration_cast( now - failed_since).count(); - auto osdmap = heartbeat.service.get_osdmap_service().get_map(); + auto osdmap = heartbeat.service.get_map(); auto failure_report = crimson::make_message(heartbeat.monc.get_fsid(), peer, @@ -683,7 +683,7 @@ Heartbeat::FailingPeers::send_still_alive( osd, addrs, 0, - heartbeat.service.get_osdmap_service().get_map()->get_epoch(), + heartbeat.service.get_map()->get_epoch(), MOSDFailure::FLAG_ALIVE); logger().info("{}: osd.{}", __func__, osd); return heartbeat.monc.send_message(std::move(still_alive)); diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index f187ce34507..b7ea934e322 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index 07e15168a56..a9d3d1ea131 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -367,11 +367,12 @@ seastar::future<> OSD::start() ); }).then([this](OSDSuperblock&& sb) { superblock = std::move(sb); - return pg_shard_manager.get_map(superblock.current_epoch); - }).then([this](OSDMapService::cached_map_t&& map) { - pg_shard_manager.update_map(map); - pg_shard_manager.got_map(map->get_epoch()); - osdmap = std::move(map); + return pg_shard_manager.get_local_map(superblock.current_epoch); + }).then([this](OSDMapService::local_cached_map_t&& map) { + osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(map)); + return pg_shard_manager.update_map(std::move(map)); + }).then([this] { + pg_shard_manager.got_map(osdmap->get_epoch()); bind_epoch = osdmap->get_epoch(); return pg_shard_manager.load_pgs(); }).then([this] { @@ -796,10 +797,10 @@ void OSD::handle_authentication(const EntityName& name, void OSD::update_stats() { osd_stat_seq++; - osd_stat.up_from = pg_shard_manager.get_up_epoch(); + osd_stat.up_from = get_shard_services().get_up_epoch(); osd_stat.hb_peers = heartbeat->get_peers(); osd_stat.seq = ( - static_cast(pg_shard_manager.get_up_epoch()) << 32 + static_cast(get_shard_services().get_up_epoch()) << 32 ) | osd_stat_seq; gate.dispatch_in_background("statfs", *this, [this] { (void) store.stat().then([this](store_statfs_t&& st) { @@ -915,16 +916,24 @@ seastar::future<> OSD::committed_osd_maps(version_t first, return seastar::do_for_each(boost::make_counting_iterator(first), boost::make_counting_iterator(last + 1), [this](epoch_t cur) { - return pg_shard_manager.get_map(cur).then([this](OSDMapService::cached_map_t&& o) { - osdmap = std::move(o); - pg_shard_manager.update_map(osdmap); - if (pg_shard_manager.get_up_epoch() == 0 && - osdmap->is_up(whoami) && - osdmap->get_addrs(whoami) == public_msgr->get_myaddrs()) { - pg_shard_manager.set_up_epoch(osdmap->get_epoch()); - if (!boot_epoch) { - boot_epoch = osdmap->get_epoch(); - } + return pg_shard_manager.get_local_map( + cur + ).then([this](OSDMapService::local_cached_map_t&& o) { + osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(o)); + return pg_shard_manager.update_map(std::move(o)); + }).then([this] { + if (get_shard_services().get_up_epoch() == 0 && + osdmap->is_up(whoami) && + osdmap->get_addrs(whoami) == public_msgr->get_myaddrs()) { + return pg_shard_manager.set_up_epoch( + osdmap->get_epoch() + ).then([this] { + if (!boot_epoch) { + boot_epoch = osdmap->get_epoch(); + } + }); + } else { + return seastar::now(); } }); }).then([m, this] { @@ -1132,11 +1141,14 @@ seastar::future<> OSD::restart() { beacon_timer.cancel(); tick_timer.cancel(); - pg_shard_manager.set_up_epoch(0); - bind_epoch = osdmap->get_epoch(); - // TODO: promote to shutdown if being marked down for multiple times - // rebind messengers - return start_boot(); + return pg_shard_manager.set_up_epoch( + 0 + ).then([this] { + bind_epoch = osdmap->get_epoch(); + // TODO: promote to shutdown if being marked down for multiple times + // rebind messengers + return start_boot(); + }); } seastar::future<> OSD::shutdown() diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc index e84439b7672..277460d485b 100644 --- a/src/crimson/osd/osd_operations/peering_event.cc +++ b/src/crimson/osd/osd_operations/peering_event.cc @@ -125,7 +125,7 @@ void RemotePeeringEvent::on_pg_absent(ShardServices &shard_services) if (auto& e = get_event().get_event(); e.dynamic_type() == MQuery::static_type()) { const auto map_epoch = - shard_services.get_osdmap_service().get_map()->get_epoch(); + shard_services.get_map()->get_epoch(); const auto& q = static_cast(e); const pg_info_t empty{spg_t{pgid.pgid, q.query.to}}; if (q.query.type == q.query.LOG || diff --git a/src/crimson/osd/osd_operations/pg_advance_map.cc b/src/crimson/osd/osd_operations/pg_advance_map.cc index 4d2b388388d..140dd63fb7e 100644 --- a/src/crimson/osd/osd_operations/pg_advance_map.cc +++ b/src/crimson/osd/osd_operations/pg_advance_map.cc @@ -1,13 +1,13 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#include #include #include "include/types.h" #include "common/Formatter.h" #include "crimson/osd/pg.h" #include "crimson/osd/pg_shard_manager.h" +#include "crimson/osd/osdmap_service.h" #include "crimson/osd/osd_operations/pg_advance_map.h" #include "crimson/osd/osd_operation_external_tracking.h" #include "osd/PeeringState.h" @@ -54,7 +54,7 @@ void PGAdvanceMap::dump_detail(Formatter *f) const seastar::future<> PGAdvanceMap::start() { - using cached_map_t = boost::local_shared_ptr; + using cached_map_t = OSDMapService::cached_map_t; logger().debug("{}: start", *this); @@ -71,7 +71,7 @@ seastar::future<> PGAdvanceMap::start() boost::make_counting_iterator(*from + 1), boost::make_counting_iterator(to + 1), [this](epoch_t next_epoch) { - return shard_manager.get_map(next_epoch).then( + return shard_manager.get_shard_services().get_map(next_epoch).then( [this] (cached_map_t&& next_map) { logger().debug("{}: advancing map to {}", *this, next_map->get_epoch()); diff --git a/src/crimson/osd/osdmap_service.h b/src/crimson/osd/osdmap_service.h index effd45b7946..017303536dc 100644 --- a/src/crimson/osd/osdmap_service.h +++ b/src/crimson/osd/osdmap_service.h @@ -3,15 +3,16 @@ #pragma once -#include - #include "include/types.h" +#include "osd/OSDMap.h" class OSDMap; class OSDMapService { public: - using cached_map_t = boost::local_shared_ptr; + using cached_map_t = OSDMapRef; + using local_cached_map_t = LocalOSDMapRef; + virtual ~OSDMapService() = default; virtual seastar::future get_map(epoch_t e) = 0; /// get the latest map diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 7a5d6f7075e..3a9b709b019 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include @@ -67,7 +66,7 @@ class PG : public boost::intrusive_ref_counter< DoutPrefixProvider { using ec_profile_t = std::map; - using cached_map_t = boost::local_shared_ptr; + using cached_map_t = OSDMapService::cached_map_t; ClientRequest::PGPipeline client_request_pg_pipeline; PGPeeringPipeline peering_request_pg_pipeline; diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h index 7777b53781c..7f38cb03873 100644 --- a/src/crimson/osd/pg_backend.h +++ b/src/crimson/osd/pg_backend.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include "include/rados.h" diff --git a/src/crimson/osd/pg_shard_manager.cc b/src/crimson/osd/pg_shard_manager.cc index bfe4a762f84..8597725eaf1 100644 --- a/src/crimson/osd/pg_shard_manager.cc +++ b/src/crimson/osd/pg_shard_manager.cc @@ -18,4 +18,9 @@ PGShardManager::PGShardManager( shard_services(osd_singleton_state, local_state) {} +seastar::future<> PGShardManager::set_up_epoch(epoch_t e) { + local_state.set_up_epoch(e); + return seastar::now(); +} + } diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h index eccbce358bb..e9138cd63b7 100644 --- a/src/crimson/osd/pg_shard_manager.h +++ b/src/crimson/osd/pg_shard_manager.h @@ -26,6 +26,9 @@ class PGShardManager { ShardServices shard_services; public: + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; + PGShardManager( const int whoami, crimson::net::Messenger &cluster_msgr, @@ -36,9 +39,11 @@ public: auto &get_shard_services() { return shard_services; } - void update_map(OSDMapService::cached_map_t map) { - osd_singleton_state.update_map(map); - local_state.update_map(map); + seastar::future<> update_map(local_cached_map_t &&map) { + auto fmap = make_local_shared_foreign(std::move(map)); + osd_singleton_state.update_map(fmap); + local_state.update_map(std::move(fmap)); + return seastar::now(); } auto stop_registries() { @@ -70,12 +75,12 @@ public: FORWARD_TO_OSD_SINGLETON(get_meta_coll) // Core OSDMap methods - FORWARD_TO_OSD_SINGLETON(get_map) + FORWARD_TO_OSD_SINGLETON(get_local_map) FORWARD_TO_OSD_SINGLETON(load_map_bl) FORWARD_TO_OSD_SINGLETON(load_map_bls) FORWARD_TO_OSD_SINGLETON(store_maps) - FORWARD_TO_OSD_SINGLETON(get_up_epoch) - FORWARD_TO_OSD_SINGLETON(set_up_epoch) + + seastar::future<> set_up_epoch(epoch_t e); FORWARD(pg_created, pg_created, osd_singleton_state.pg_map) auto load_pgs() { diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index 654c3755935..0bf4ba485d3 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -273,20 +273,16 @@ void OSDSingletonState::handle_conf_change( } } -OSDSingletonState::cached_map_t OSDSingletonState::get_map() const -{ - return osdmap; -} - -seastar::future OSDSingletonState::get_map(epoch_t e) +seastar::future +OSDSingletonState::get_local_map(epoch_t e) { // TODO: use LRU cache for managing osdmap, fallback to disk if we have to if (auto found = osdmaps.find(e); found) { - return seastar::make_ready_future(std::move(found)); + return seastar::make_ready_future(std::move(found)); } else { return load_map(e).then([e, this](std::unique_ptr osdmap) { - return seastar::make_ready_future( - osdmaps.insert(e, std::move(osdmap))); + return seastar::make_ready_future( + osdmaps.insert(e, std::move(osdmap))); }); } } @@ -437,13 +433,13 @@ seastar::future> OSDSingletonState::handle_pg_create_info( std::move(info), [this, &shard_manager, &shard_services](auto &info) -> seastar::future> { - return get_map(info->epoch).then( + return shard_services.get_map(info->epoch).then( [&info, &shard_services, this](OSDMapService::cached_map_t startmap) -> seastar::future, OSDMapService::cached_map_t>> { const spg_t &pgid = info->pgid; if (info->by_mon) { int64_t pool_id = pgid.pgid.pool(); - const pg_pool_t *pool = get_osdmap()->get_pg_pool(pool_id); + const pg_pool_t *pool = shard_services.get_map()->get_pg_pool(pool_id); if (!pool) { logger().debug( "{} ignoring pgid {}, pool dne", @@ -593,8 +589,8 @@ seastar::future> OSDSingletonState::load_pg( return seastar::do_with(PGMeta(store, pgid), [](auto& pg_meta) { return pg_meta.get_epoch(); - }).then([this](epoch_t e) { - return get_map(e); + }).then([&shard_services](epoch_t e) { + return shard_services.get_map(e); }).then([pgid, this, &shard_services] (auto&& create_map) { return make_pg(shard_services, std::move(create_map), pgid, false); }).then([this](Ref pg) { diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h index 9698a89eed3..bc68f290f84 100644 --- a/src/crimson/osd/shard_services.h +++ b/src/crimson/osd/shard_services.h @@ -55,6 +55,8 @@ class PGShardManager; class PerShardState { friend class ShardServices; friend class PGShardManager; + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; const int whoami; crimson::common::CephContext cct; @@ -66,11 +68,13 @@ class PerShardState { OSDOperationRegistry registry; OperationThrottler throttler; + epoch_t up_epoch = 0; OSDMapService::cached_map_t osdmap; - OSDMapService::cached_map_t &get_osdmap() { return osdmap; } + const auto &get_osdmap() const { return osdmap; } void update_map(OSDMapService::cached_map_t new_osdmap) { osdmap = std::move(new_osdmap); } + void set_up_epoch(epoch_t epoch) { up_epoch = epoch; } crimson::osd::ObjectContextRegistry obc_registry; @@ -110,9 +114,13 @@ class PerShardState { * OSD-wide singleton holding instances that need to be accessible * from all PGs. */ -class OSDSingletonState : public md_config_obs_t, public OSDMapService { +class OSDSingletonState : public md_config_obs_t { friend class ShardServices; friend class PGShardManager; + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; + +public: OSDSingletonState( int whoami, crimson::net::Messenger &cluster_msgr, @@ -127,6 +135,9 @@ class OSDSingletonState : public md_config_obs_t, public OSDMapService { OSDState osd_state; + SharedLRU osdmaps; + SimpleLRU map_bl_cache; + cached_map_t osdmap; cached_map_t &get_osdmap() { return osdmap; } void update_map(cached_map_t new_osdmap) { @@ -218,20 +229,7 @@ class OSDSingletonState : public md_config_obs_t, public OSDMapService { const ConfigProxy& conf, const std::set &changed) final; - // OSDMapService - epoch_t up_epoch = 0; - epoch_t get_up_epoch() const final { - return up_epoch; - } - void set_up_epoch(epoch_t e) { - up_epoch = e; - } - - SharedLRU osdmaps; - SimpleLRU map_bl_cache; - - seastar::future get_map(epoch_t e) final; - cached_map_t get_map() const final; + seastar::future get_local_map(epoch_t e); seastar::future> load_map(epoch_t e); seastar::future load_map_bl(epoch_t e); seastar::future> @@ -301,11 +299,17 @@ class OSDSingletonState : public md_config_obs_t, public OSDMapService { /** * Represents services available to each PG */ -class ShardServices { - using cached_map_t = boost::local_shared_ptr; +class ShardServices : public OSDMapService { + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; OSDSingletonState &osd_singleton_state; PerShardState &local_state; + + template + auto with_singleton(F &&f, Args&&... args) { + return std::invoke(f, osd_singleton_state, std::forward(args)...); + } public: ShardServices( OSDSingletonState &osd_singleton_state, @@ -322,11 +326,6 @@ public: return &(local_state.cct); } - // OSDMapService - const OSDMapService &get_osdmap_service() const { - return osd_singleton_state; - } - template auto start_operation(Args&&... args) { return local_state.start_operation(std::forward(args)...); @@ -361,7 +360,22 @@ public: return dispatch_context({}, std::move(ctx)); } - FORWARD_TO_LOCAL(get_osdmap) + // OSDMapService + cached_map_t get_map() const final { return local_state.get_osdmap(); } + epoch_t get_up_epoch() const final { return local_state.up_epoch; } + seastar::future get_map(epoch_t e) final { + return with_singleton( + [](auto &sstate, epoch_t e) { + return sstate.get_local_map( + e + ).then([](auto lmap) { + return seastar::foreign_ptr(lmap); + }); + }, e).then([](auto fmap) { + return make_local_shared_foreign(std::move(fmap)); + }); + } + FORWARD_TO_OSD_SINGLETON(get_pg_num) FORWARD(with_throttle_while, with_throttle_while, local_state.throttler) diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 28449c8b63c..534879f1e77 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -1674,7 +1674,9 @@ WRITE_CLASS_ENCODER_FEATURES(OSDMap) WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental) #ifdef WITH_SEASTAR -using OSDMapRef = boost::local_shared_ptr; +#include "crimson/common/local_shared_foreign_ptr.h" +using LocalOSDMapRef = boost::local_shared_ptr; +using OSDMapRef = crimson::local_shared_foreign_ptr; #else using OSDMapRef = std::shared_ptr; #endif -- 2.39.5