From: Leonid Chernin Date: Sun, 18 Aug 2024 05:16:14 +0000 (+0000) Subject: mon/NVMeofGw*: support upgrades from prior out-of-tree nvmeofha implementation (nvmeo... X-Git-Tag: testing/wip-mchangir-testing-20240830.082048-main-debug~19^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=3c50ef6f5c0cec7303de480a1932964157e17318;p=ceph-ci.git mon/NVMeofGw*: support upgrades from prior out-of-tree nvmeofha implementation (nvmeof-reef) This commit adds upgrade support for users running an experimental nvmeofha implementation which can be found in the nvmeof-reef branch in ceph.git. Signed-off-by: Leonid Chernin --- diff --git a/src/messages/MNVMeofGwMap.h b/src/messages/MNVMeofGwMap.h index 3affdd250dc..efa0e91cbe4 100644 --- a/src/messages/MNVMeofGwMap.h +++ b/src/messages/MNVMeofGwMap.h @@ -56,7 +56,7 @@ public: using ceph::encode; encode(VERSION, payload); encode(gwmap_epoch, payload); - encode(map, payload); + encode(map, payload, features); } private: using RefCountedObject::put; diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 646d56d30e6..3ccbd48435e 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -83,6 +83,12 @@ int NVMeofGwMap::cfg_add_gw( return -EEXIST ; } } + if (allocated.size() == MAX_SUPPORTED_ANA_GROUPS) { + dout(4) << "Warning: cannot add GW " << gw_id + << " since number GWs in the group is " + << MAX_SUPPORTED_ANA_GROUPS << dendl; + return -EINVAL; + } // Allocate the new group id NvmeAnaGrpId i = 0; bool was_allocated = false; diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index c128d7c9285..1f13fbfd8d0 100755 --- a/src/mon/NVMeofGwMap.h +++ b/src/mon/NVMeofGwMap.h @@ -106,13 +106,13 @@ public: const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId ANA_groupid, epoch_t &epoch, bool failover); - void encode(ceph::buffer::list &bl) const { + void encode(ceph::buffer::list &bl, uint64_t features) const { using ceph::encode; ENCODE_START(1, 1, bl); encode(epoch, bl);// global map epoch - encode(created_gws, bl); //Encode created GWs - encode(fsm_timers, bl); + encode(created_gws, bl, features); //Encode created GWs + encode(fsm_timers, bl, features); ENCODE_FINISH(bl); } diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index ac4a6e199fb..ce4507281b5 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -158,7 +158,10 @@ void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t) dout(10) << dendl; ceph_assert(get_last_committed() + 1 == pending_map.epoch); bufferlist bl; - pending_map.encode(bl); + uint64_t features = mon.get_quorum_con_features(); + pending_map.encode(bl, features); + dout(10) << " has NVMEOFHA: " + << HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA) << dendl; put_version(t, pending_map.epoch, bl); put_last_committed(t, pending_map.epoch); } diff --git a/src/mon/NVMeofGwSerialize.h b/src/mon/NVMeofGwSerialize.h index ca4b970ef44..cbda90ea379 100755 --- a/src/mon/NVMeofGwSerialize.h +++ b/src/mon/NVMeofGwSerialize.h @@ -17,6 +17,7 @@ #undef dout_prefix #define MODULE_PREFFIX "nvmeofgw " #define dout_prefix *_dout << MODULE_PREFFIX << __PRETTY_FUNCTION__ << " " +#define MAX_SUPPORTED_ANA_GROUPS 16 inline std::ostream& operator<<( std::ostream& os, const gw_exported_states_per_group_t value) { @@ -254,12 +255,30 @@ inline void decode(ana_state_t& st, ceph::buffer::list::const_iterator &bl) { DECODE_FINISH(bl); } -inline void encode(const GwSubsystems& subsystems, ceph::bufferlist &bl) { - ENCODE_START(1, 1, bl); +inline void encode( + const GwSubsystems& subsystems, ceph::bufferlist &bl, uint64_t features) { + uint8_t version = 1; + if (HAVE_FEATURE(features, NVMEOFHA)) { + version = 2; + } + ENCODE_START(version, version, bl); encode((uint32_t)subsystems.size(), bl); for (const auto& sub: subsystems) { encode(sub.second.nqn, bl); - encode(sub.second.ana_state, bl); + if (version == 1) { + dout(20) << "encode ana_state vector version1 = " << version << dendl; + /* Version 1 requires exactly 16 entries */ + ana_state_t filled(sub.second.ana_state); + filled.resize( + MAX_SUPPORTED_ANA_GROUPS, + std::make_pair( + gw_exported_states_per_group_t::GW_EXPORTED_INACCESSIBLE_STATE, + 0)); + encode(filled, bl); + } else { + dout(20) << "encode ana_state vector version2 = " << version << dendl; + encode(sub.second.ana_state, bl); + } } ENCODE_FINISH(bl); } @@ -267,7 +286,7 @@ inline void encode(const GwSubsystems& subsystems, ceph::bufferlist &bl) { inline void decode( GwSubsystems& subsystems, ceph::bufferlist::const_iterator& bl) { uint32_t num_subsystems; - DECODE_START(1, bl); + DECODE_START(2, bl); decode(num_subsystems, bl); subsystems.clear(); for (uint32_t i=0; i= 2) { + encode((uint32_t)state.data.size(), bl); + for (auto &tm_itr:state.data) { + encode((uint32_t)tm_itr.first, bl);// encode key + uint32_t tick = tm_itr.second.timer_started; + uint8_t val = tm_itr.second.timer_value; + encode(tick, bl); + encode(val, bl); + auto endtime = tm_itr.second.end_time; + // Convert the time point to milliseconds since the epoch + uint64_t millisecondsSinceEpoch = std::chrono::duration_cast( - endtime.time_since_epoch()).count(); - encode(millisecondsSinceEpoch , bl); + endtime.time_since_epoch()).count(); + encode(millisecondsSinceEpoch , bl); + } + } else { + encode((uint32_t)MAX_SUPPORTED_ANA_GROUPS, bl); + Tmdata empty; + for (uint32_t i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++) { + auto tmiter = state.data.find(i); + const Tmdata *to_encode = ∅ + if (tmiter != state.data.end()) { + to_encode = &(tmiter->second); + } + encode(to_encode->timer_started, bl); + encode(to_encode->timer_value, bl); + auto endtime = to_encode->end_time; + // Convert the time point to milliseconds since the epoch + uint64_t millisecondsSinceEpoch = + std::chrono::duration_cast( + endtime.time_since_epoch()).count(); + encode(millisecondsSinceEpoch , bl); + } } ENCODE_FINISH(bl); } inline void decode( NvmeGwTimerState& state, ceph::bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + dout(20) << "decode NvmeGwTimers version = " << struct_v << dendl; uint32_t size; - DECODE_START(1, bl); decode(size, bl); for (uint32_t i = 0; i (duration); - state.data[tm_key] = tm; + if (struct_v >= 2) { + decode(tm_key, bl); + decode(tick, bl); + decode(val, bl); + Tmdata tm; + tm.timer_started = tick; + tm.timer_value = val; + uint64_t milliseconds; + decode(milliseconds, bl); + auto duration = std::chrono::milliseconds(milliseconds); + tm.end_time = std::chrono::time_point(duration); + state.data[tm_key] = tm; + } else { + decode(tick, bl); + decode(val, bl); + Tmdata tm; + tm.timer_started = tick; + tm.timer_value = val; + uint64_t milliseconds; + decode(milliseconds, bl); + if (tm.timer_started) { + // relevant only entries with started timers in the state + auto duration = std::chrono::milliseconds(milliseconds); + tm.end_time = std::chrono::time_point(duration); + state.data[i] = tm; + } + } } DECODE_FINISH(bl); } -inline void encode(const NvmeAnaNonceMap& nonce_map, ceph::bufferlist &bl) { +inline void encode(const NvmeAnaNonceMap& nonce_map, ceph::bufferlist &bl, + uint64_t features) { ENCODE_START(1, 1, bl); encode((uint32_t)nonce_map.size(), bl); for (auto& ana_group_nonces : nonce_map) { @@ -359,6 +422,7 @@ inline void encode(const NvmeAnaNonceMap& nonce_map, ceph::bufferlist &bl) { inline void decode( NvmeAnaNonceMap& nonce_map, ceph::buffer::list::const_iterator &bl) { + dout(20) << "decode nonce map " << dendl; uint32_t map_size; NvmeAnaGrpId ana_grp_id; uint32_t vector_size; @@ -376,29 +440,55 @@ inline void decode( DECODE_FINISH(bl); } -inline void encode(const NvmeGwMonStates& gws, ceph::bufferlist &bl) { - ENCODE_START(1, 1, bl); +inline void encode(const NvmeGwMonStates& gws, ceph::bufferlist &bl, + uint64_t features) { + uint8_t version = 1; + if (HAVE_FEATURE(features, NVMEOFHA)) { + version = 2; + } + ENCODE_START(version, version, bl); encode ((uint32_t)gws.size(), bl); // number of gws in the group for (auto& gw : gws) { encode(gw.first, bl);// GW_id encode(gw.second.ana_grp_id, bl); // GW owns this group-id - encode((uint32_t)gw.second.sm_state.size(), bl); - for (auto &state_it:gw.second.sm_state) { - encode((uint32_t)state_it.first, bl); //key of map - encode((uint32_t)state_it.second, bl);//value of map - } - encode((uint32_t)gw.second.availability, bl); - encode((uint16_t)gw.second.performed_full_startup, bl); - encode((uint16_t)gw.second.last_gw_map_epoch_valid, bl); - encode(gw.second.subsystems, bl); - - encode((uint32_t)gw.second.blocklist_data.size(), bl); - for (auto &blklst_itr: gw.second.blocklist_data) { - encode((uint32_t)blklst_itr.first, bl); - encode((uint32_t)blklst_itr.second.osd_epoch, bl); - encode((uint32_t)blklst_itr.second.is_failover, bl); + if (version >= 2) { + encode((uint32_t)gw.second.sm_state.size(), bl); + for (auto &state_it:gw.second.sm_state) { + encode((uint32_t)state_it.first, bl); //key of map + encode((uint32_t)state_it.second, bl);//value of map + } + encode((uint32_t)gw.second.availability, bl); + encode((uint16_t)gw.second.performed_full_startup, bl); + encode((uint16_t)gw.second.last_gw_map_epoch_valid, bl); + encode(gw.second.subsystems, bl); + + encode((uint32_t)gw.second.blocklist_data.size(), bl); + for (auto &blklst_itr: gw.second.blocklist_data) { + encode((uint32_t)blklst_itr.first, bl); + encode((uint32_t)blklst_itr.second.osd_epoch, bl); + encode((uint32_t)blklst_itr.second.is_failover, bl); + } + } else { + gw_states_per_group_t states[MAX_SUPPORTED_ANA_GROUPS]; + for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++) states[i] = gw_states_per_group_t::GW_IDLE_STATE; + for (auto &state_it:gw.second.sm_state) states[state_it.first] = state_it.second; + for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++) encode((uint32_t)states[i], bl); + + encode((uint32_t)gw.second.availability, bl); + encode((uint16_t)gw.second.performed_full_startup, bl); + encode((uint16_t)gw.second.last_gw_map_epoch_valid, bl); + encode(gw.second.subsystems, bl); // TODO reuse but put features - encode version + Blocklist_data bl_data[MAX_SUPPORTED_ANA_GROUPS]; + for (auto &blklst_itr: gw.second.blocklist_data) { + bl_data[blklst_itr.first].osd_epoch = blklst_itr.second.osd_epoch; + bl_data[blklst_itr.first].is_failover = blklst_itr.second.is_failover; + } + for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++) { + encode((uint32_t)bl_data[i].osd_epoch, bl); + encode((bool)bl_data[i].is_failover, bl); + } } - encode(gw.second.nonce_map, bl); + encode(gw.second.nonce_map, bl, features); } ENCODE_FINISH(bl); } @@ -407,28 +497,42 @@ inline void decode( NvmeGwMonStates& gws, ceph::buffer::list::const_iterator &bl) { gws.clear(); uint32_t num_created_gws; - DECODE_START(1, bl); + DECODE_START(2, bl); + dout(20) << "decode NvmeGwMonStates. struct_v: " << struct_v << dendl; decode(num_created_gws, bl); - + dout(20) << "decode NvmeGwMonStates. num gws " << num_created_gws << dendl; + std::set created_anagrps; for (uint32_t i = 0; i= 2) { + decode(size, bl); + for (uint32_t i = 0; i = 2) { + decode(size, bl); + for (uint32_t i=0; i& created_gws, - ceph::bufferlist &bl) { + ceph::bufferlist &bl, uint64_t features) { ENCODE_START(1, 1, bl); encode ((uint32_t)created_gws.size(), bl); // number of groups for (auto& group_gws: created_gws) { @@ -468,7 +598,7 @@ inline void encode( encode(group_key.second, bl); // group auto& gws = group_gws.second; - encode (gws, bl); // encode group gws + encode(gws, bl, features); // encode group gws } ENCODE_FINISH(bl); } @@ -477,7 +607,7 @@ inline void decode( std::map& created_gws, ceph::buffer::list::const_iterator &bl) { created_gws.clear(); - uint32_t ngroups; + uint32_t ngroups = 0; DECODE_START(1, bl); decode(ngroups, bl); for (uint32_t i = 0; i& gmap, - ceph::bufferlist &bl) { + ceph::bufferlist &bl, + uint64_t features) { ENCODE_START(1, 1, bl); encode ((uint32_t)gmap.size(), bl); // number of groups for (auto& group_state: gmap) { auto& group_key = group_state.first; encode(group_key.first, bl); // pool encode(group_key.second, bl); // group - encode(group_state.second, bl); + encode(group_state.second, bl, features); } ENCODE_FINISH(bl); } @@ -555,7 +686,7 @@ inline void decode( inline void encode( const std::map& gmetadata, - ceph::bufferlist &bl) { + ceph::bufferlist &bl, uint64_t features) { ENCODE_START(1, 1, bl); encode ((uint32_t)gmetadata.size(), bl); // number of groups for (auto& group_md: gmetadata) { @@ -563,7 +694,7 @@ inline void encode( encode(group_key.first, bl); // pool encode(group_key.second, bl); // group - encode(group_md.second, bl); + encode(group_md.second, bl, features); } ENCODE_FINISH(bl); } @@ -586,12 +717,13 @@ inline void decode( DECODE_FINISH(bl); } -inline void encode(const NvmeGwTimers& group_md, ceph::bufferlist &bl) { +inline void encode(const NvmeGwTimers& group_md, ceph::bufferlist &bl, + uint64_t features) { ENCODE_START(1, 1, bl); encode ((uint32_t)group_md.size(), bl); // number of groups for (auto& gw_md: group_md) { encode(gw_md.first, bl); // gw - encode(gw_md.second, bl); // map of this gw + encode(gw_md.second, bl, features); // map of this gw } ENCODE_FINISH(bl); } @@ -654,6 +786,7 @@ inline void encode(const BeaconSubsystem& sub, ceph::bufferlist &bl) { inline void decode(BeaconSubsystem& sub, ceph::buffer::list::const_iterator &bl) { DECODE_START(1, bl); + dout(20) << "decode BeaconSubsystems " << dendl; decode(sub.nqn, bl); uint32_t s; sub.listeners.clear(); diff --git a/src/test/test_nvmeof_mon_encoding.cc b/src/test/test_nvmeof_mon_encoding.cc index 8cd2381fa78..d66efb77fe6 100644 --- a/src/test/test_nvmeof_mon_encoding.cc +++ b/src/test/test_nvmeof_mon_encoding.cc @@ -52,7 +52,7 @@ void test_NVMeofGwMap() { dout(0) << pending_map << dendl; ceph::buffer::list bl; - pending_map.encode(bl); + pending_map.encode(bl, CEPH_FEATURES_ALL); auto p = bl.cbegin(); pending_map.decode(p); dout(0) << " == Dump map after Decode: == " <(end_time.time_since_epoch()).count(); dout(0) << "Metadata milliseconds " << millisecondsSinceEpoch << " " << (int)pending_map.fsm_timers[group_key][gwid].data[grpid].timer_value << dendl; ceph::buffer::list bl; - pending_map.encode(bl); + pending_map.encode(bl, CEPH_FEATURES_ALL); auto p = bl.cbegin(); pending_map.decode(p);