From 037537ed8c6f16c2e471dffdb8cdd983fa400420 Mon Sep 17 00:00:00 2001
From: Leonid Chernin
Date: Sun, 1 Sep 2024 13:17:45 +0000
Subject: [PATCH] mon/NVMeofGw*: Fix issue where the ANA group of a deleted GW
 was not serviced. Introduce GW Deleting state

Signed-off-by: Leonid Chernin
---
 src/mon/NVMeofGwMap.cc      | 198 +++++++++++++++++++++++-------------
 src/mon/NVMeofGwMap.h       |   9 +-
 src/mon/NVMeofGwMon.cc      | 102 +++++++++++++------
 src/mon/NVMeofGwMon.h       |   3 +-
 src/mon/NVMeofGwSerialize.h |   3 +
 src/mon/NVMeofGwTypes.h     |  10 +-
 6 files changed, 219 insertions(+), 106 deletions(-)

diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index b20060c68fec..6fe1a611fd71 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -37,9 +37,15 @@ void NVMeofGwMap::to_gmap(
   for (const auto& gw_created_pair: gw_created_map) {
     const auto& gw_id = gw_created_pair.first;
     const auto& gw_created = gw_created_pair.second;
+    gw_availability_t availability = gw_created.availability;
+    // Gateways expect to see UNAVAILABLE, not DELETING,
+    // for entries in DELETING state
+    if (gw_created.availability == gw_availability_t::GW_DELETING) {
+      availability = gw_availability_t::GW_UNAVAILABLE;
+    }
 
     auto gw_state = NvmeGwClientState(
-      gw_created.ana_grp_id, epoch, gw_created.availability);
+      gw_created.ana_grp_id, epoch, availability);
     for (const auto& sub: gw_created.subsystems) {
       gw_state.subsystems.insert({
         sub.nqn,
@@ -78,10 +84,36 @@ int NVMeofGwMap::cfg_add_gw(
   for (auto& itr: created_gws[group_key]) {
     allocated.insert(itr.second.ana_grp_id);
     if (itr.first == gw_id) {
-      dout(1) << __func__ << " ERROR create GW: already exists in map "
-              << gw_id << dendl;
-      return -EEXIST ;
+      if (itr.second.availability != gw_availability_t::GW_DELETING) {
+        dout(1) << __func__ << " ERROR create GW: already exists in map "
+                << gw_id << dendl;
+        return -EEXIST;
+      } else {
+        // This GW already exists in the map in "Deleting" state, but the
+        // user creates it again - just reset its attributes
+        created_gws[group_key][gw_id].performed_full_startup = true;
+        created_gws[group_key][gw_id].availability
+          = gw_availability_t::GW_CREATED;
+        dout(4) << "GW in Deleting state " << gw_id
+                << " was created again" << dendl;
+        return 0;
+      }
     }
+    if (itr.second.availability == gw_availability_t::GW_DELETING) {
+      // Found a GW in "Deleting" state - the new GW inherits its ANA group
+      NvmeGwMonState & gw_created = created_gws[group_key][itr.first];
+      created_gws[group_key][gw_id] = gw_created;
+      // Deep copy of all the data of the "Deleting" GW
+      created_gws[group_key][gw_id].performed_full_startup = true;
+      created_gws[group_key][gw_id].availability
+        = gw_availability_t::GW_CREATED;
+      dout(4) << "Created GW inherits ANA group of deleting GW-id :"
+              << itr.first << " group " << itr.second.ana_grp_id << dendl;
+      do_erase_gw_id(itr.first, group_key);
+      dout(4) << "Created GWS after create/delete: "
+              << created_gws << dendl;
+      return 0;
+    }
   }
   if (allocated.size() == MAX_SUPPORTED_ANA_GROUPS) {
     dout(4) << "Warning: cannot add GW " << gw_id
@@ -125,7 +157,40 @@ int NVMeofGwMap::cfg_add_gw(
 int NVMeofGwMap::cfg_delete_gw(
   const NvmeGwId &gw_id, const NvmeGroupKey& group_key)
 {
-  int rc = 0;
+  if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOFHA)) {
+    dout(10) << " has NVMEOFHA: 1" << dendl;
+    for (auto& gws_states: created_gws[group_key]) {
+      if (gws_states.first == gw_id) {
+        auto& state = gws_states.second;
+        state.availability = gw_availability_t::GW_DELETING;
+        dout(4) << " Deleting GW :" << gw_id << " in state "
+                << state.availability << " Resulting GW availability: "
+                << state.availability << dendl;
+        return 0;
+      }
+    }
+  } else {
+    return do_delete_gw(gw_id, group_key);
+  }
+  return -EINVAL;
+}
+
+int NVMeofGwMap::do_erase_gw_id(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key) {
+
+  fsm_timers[group_key].erase(gw_id);
+  if (fsm_timers[group_key].size() == 0)
+    fsm_timers.erase(group_key);
+
+  created_gws[group_key].erase(gw_id);
+  if (created_gws[group_key].size() == 0)
+    created_gws.erase(group_key);
+  return 0;
+}
+
+int NVMeofGwMap::do_delete_gw(
+  const NvmeGwId &gw_id, const NvmeGroupKey& group_key)
+{
   for (auto& gws_states: created_gws[group_key]) {
 
     if (gws_states.first == gw_id) {
@@ -136,26 +201,52 @@ int NVMeofGwMap::cfg_delete_gw(
         gw_id, group_key,state_itr.second , state_itr.first, modified);
       }
       dout(10) << " Delete GW :"<< gw_id << " ANA grpid: "
-            << state.ana_grp_id << dendl;
+               << state.ana_grp_id << dendl;
       for (auto& itr: created_gws[group_key]) {
         // Update state map and other maps
         remove_grp_id(itr.first, group_key, state.ana_grp_id);
         // of all created gateways. Removed key = anagrp
       }
-      fsm_timers[group_key].erase(gw_id);
-      if (fsm_timers[group_key].size() == 0)
-        fsm_timers.erase(group_key);
-
-      created_gws[group_key].erase(gw_id);
-      if (created_gws[group_key].size() == 0)
-        created_gws.erase(group_key);
-      return rc;
+      return do_erase_gw_id(gw_id, group_key);
     }
   }
 
   return -EINVAL;
 }
 
+int NVMeofGwMap::get_num_namespaces(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key, const BeaconSubsystems& subs)
+{
+  auto grpid = created_gws[group_key][gw_id].ana_grp_id;
+  int num_ns = 0;
+  for (auto& subs_it: subs) {
+    for (auto& ns: subs_it.namespaces) {
+      if (ns.anagrpid == (grpid + 1)) {
+        num_ns++;
+      }
+    }
+  }
+  return num_ns;
+}
+
+void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key,
+    const BeaconSubsystems& subs, bool &propose_pending)
+{
+  propose_pending = false;
+  for (auto& itr: created_gws[group_key]) {
+    auto& gw_id = itr.first;
+    if (itr.second.availability == gw_availability_t::GW_DELETING) {
+      int num_ns = 0;
+      if ((num_ns = get_num_namespaces(gw_id, group_key, subs)) == 0) {
+        do_delete_gw(gw_id, group_key);
+        propose_pending = true;
+      }
+      dout(4) << " to delete ? " << gw_id << " num_ns " << num_ns << dendl;
+      break; // handle just one GW in "Deleting" state at a time
+ } + } +} + int NVMeofGwMap::process_gw_map_gw_down( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending) { @@ -192,17 +283,8 @@ void NVMeofGwMap::process_gw_map_ka( dout(20) << "KA beacon from the GW " << gw_id << " in state " << (int)st.availability << dendl; - if (st.availability == gw_availability_t::GW_CREATED) { - // first time appears - allow IO traffic for this GW - st.availability = gw_availability_t::GW_AVAILABLE; - for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { - state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE; - } - if (st.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { // not a redundand GW - st.active_state(st.ana_grp_id); - } - propose_pending = true; - } else if (st.availability == gw_availability_t::GW_UNAVAILABLE) { + if (st.availability == gw_availability_t::GW_CREATED || + st.availability == gw_availability_t::GW_UNAVAILABLE) { st.availability = gw_availability_t::GW_AVAILABLE; if (st.ana_grp_id == REDUNDANT_GW_ANA_GROUP_ID) { for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { @@ -237,7 +319,9 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose) // 1. Failover missed : is there is a GW in unavailable state? // if yes, is its ANA group handled by some other GW? - if (state.availability == gw_availability_t::GW_UNAVAILABLE && + if ((state.availability == gw_availability_t::GW_UNAVAILABLE || + state.availability == gw_availability_t::GW_DELETING || + state.availability == gw_availability_t::GW_CREATED) && state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { auto found_gw_for_ana_group = false; for (auto& gw_state2 : gws_states) { @@ -251,7 +335,7 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose) } // choose the GW for handle ana group if (found_gw_for_ana_group == false) { - dout(10) << "Was not found the GW " << " that handles ANA grp " + dout(20) << "Was not found the GW " << " that handles ANA grp " << (int)state.ana_grp_id << " find candidate "<< dendl; for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { find_failover_candidate(gw_id, group_key, state_itr.first, propose); @@ -277,14 +361,23 @@ void NVMeofGwMap::set_failover_gw_for_ANA_group( const NvmeGwId &gw_id, NvmeAnaGrpId ANA_groupid) { NvmeGwMonState& gw_state = created_gws[group_key][gw_id]; + NvmeGwMonState& failed_gw_state = created_gws[group_key][failed_gw_id]; epoch_t epoch; dout(10) << "Found failover GW " << gw_id << " for ANA group " << (int)ANA_groupid << dendl; + if (failed_gw_state.availability == gw_availability_t::GW_CREATED) { + dout(10) << "Failover GW " << gw_id << + " takes over the group of GW in Created state " << + failed_gw_id << dendl; + // just take over on the group of created GW + gw_state.active_state(ANA_groupid); + return; + } int rc = blocklist_gw(failed_gw_id, group_key, ANA_groupid, epoch, true); if (rc) { //start failover even when nonces are empty ! 
     gw_state.active_state(ANA_groupid);
-  } else{
+  } else {
     gw_state.sm_state[ANA_groupid] =
       gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL;
     gw_state.blocklist_data[ANA_groupid].osd_epoch = epoch;
@@ -507,7 +600,7 @@ void NVMeofGwMap::fsm_handle_gw_alive(
       // ana group wouldnt be taken back during blocklist wait period
       cancel_timer(gw_id, group_key, grpid);
       map_modified = true;
-    } else{
+    } else {
       dout(20) << "osd epoch not changed from "
                << gw_map.blocklist_data[grpid].osd_epoch
                << " to "<< last_osd_epoch
@@ -576,6 +669,8 @@ void NVMeofGwMap::fsm_handle_gw_down(
 void NVMeofGwMap::fsm_handle_gw_delete(
   const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
   gw_states_per_group_t state , NvmeAnaGrpId grpid, bool &map_modified) {
+  // This function is called when the GW has already passed Failover and
+  // its native ANA group has no volumes, so some states are not relevant
   switch (state) {
   case gw_states_per_group_t::GW_STANDBY_STATE:
   case gw_states_per_group_t::GW_IDLE_STATE:
@@ -583,8 +678,8 @@ void NVMeofGwMap::fsm_handle_gw_delete(
   {
     NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
 
-    // Try to find GW that temporary owns my group - if found,
-    // this GW should pass to standby for this group
+    // Try to find the GW that temporarily owns the group of the GW being
+    // deleted - if found, that GW should pass to standby for this group
     if (grpid == gw_state.ana_grp_id) {
       auto& gateway_states = created_gws[group_key];
       for (auto& gs: gateway_states) {
@@ -605,43 +700,6 @@ void NVMeofGwMap::fsm_handle_gw_delete(
     }
     break;
 
-  case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL:
-  {
-    NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
-    cancel_timer(gw_id, group_key, grpid);
-    map_modified = true;
-    gw_state.standby_state(grpid);
-  }
-  break;
-
-  case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:
-  {
-    cancel_timer(gw_id, group_key, grpid);
-    map_modified = true;
-    for (auto& nqn_gws_state: created_gws[group_key]) {
-      auto& st = nqn_gws_state.second;
-
-      // found GW that was intended for Failback for this ana grp
-      if (st.sm_state[grpid] ==
-          gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) {
-        dout(4) << "Warning: Outgoing Failback when GW is deleted "
-                << "- to rollback it GW " << gw_id << "for ANA Group "
-                << grpid << dendl;
-        st.standby_state(grpid);
-        break;
-      }
-    }
-  }
-  break;
-
-  case gw_states_per_group_t::GW_ACTIVE_STATE:
-  {
-    NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
-    map_modified = true;
-    gw_state.standby_state(grpid);
-  }
-  break;
-
   default: {
     dout(4) << "Error : Invalid state " << state << "for GW " << gw_id << dendl;
   }
@@ -781,14 +839,14 @@ int NVMeofGwMap::blocklist_gw(
           new CMonRequestProposal(this, addr_vect, expires)
         );
         // return false;
-      } else{
+      } else {
        mon->nvmegwmon()->request_proposal(mon->osdmon());
       }
     }
     dout(10) << str << " mon->osdmon()->blocklist: epoch : " << epoch
             << " address vector: " << addr_vect << " " << addr_vect.size() << dendl;
-  } else{
+  } else {
    dout(4) << "Error: No nonces context present for gw: "
            << gw_id << " ANA group: " << grpid << dendl;
    return 1;
diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h
index 688a68662bee..4c9d79664101 100755
--- a/src/mon/NVMeofGwMap.h
+++ b/src/mon/NVMeofGwMap.h
@@ -44,7 +44,8 @@ public:
   std::map fsm_timers;
   void to_gmap(std::map& Gmap) const;
-
+  void track_deleting_gws(const NvmeGroupKey& group_key,
+      const BeaconSubsystems& subs, bool &propose_pending);
   int cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
   int cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
   void process_gw_map_ka(
@@ -64,6 +65,9 @@ public:
   void handle_gw_performing_fast_reboot(const NvmeGwId &gw_id,
     const NvmeGroupKey& group_key, bool &map_modified);
 private:
+  int do_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+  int do_erase_gw_id(const NvmeGwId &gw_id,
+      const NvmeGroupKey& group_key);
   void add_grp_id(
     const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     const NvmeAnaGrpId grpid);
@@ -95,7 +99,8 @@ private:
   void set_failover_gw_for_ANA_group(
     const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key,
     const NvmeGwId &gw_id, NvmeAnaGrpId groupid);
-
+  int get_num_namespaces(const NvmeGwId &gw_id,
+      const NvmeGroupKey& group_key, const BeaconSubsystems& subs);
   int get_timer(
     const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     NvmeAnaGrpId anagrpid);
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc
index 651046515feb..b6faeb2e97ce 100644
--- a/src/mon/NVMeofGwMon.cc
+++ b/src/mon/NVMeofGwMon.cc
@@ -105,6 +105,7 @@ void NVMeofGwMon::tick()
   const auto cutoff = now - nvmegw_beacon_grace;
 
   // Pass over all the stored beacons
+  NvmeGroupKey old_group_key;
   for (auto &itr : last_beacon) {
     auto& lb = itr.first;
     auto last_beacon_time = itr.second;
@@ -114,6 +115,14 @@ void NVMeofGwMon::tick()
       _propose_pending |= propose;
       last_beacon.erase(lb);
     } else {
+      BeaconSubsystems *subsystems =
+        &pending_map.created_gws[lb.group_key][lb.gw_id].subsystems;
+      if (subsystems && subsystems->size() && old_group_key != lb.group_key) {
+        // call track_deleting_gws once per group-key
+        pending_map.track_deleting_gws(lb.group_key, *subsystems, propose);
+        old_group_key = lb.group_key;
+        _propose_pending |= propose;
+      }
       dout(20) << "beacon live for GW key: " << lb.gw_id << dendl;
     }
   }
@@ -299,29 +308,58 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
     auto group_key = std::make_pair(pool, group);
     dout(10) << "nvme-gw show pool " << pool << " group " << group << dendl;
 
-    if (map.created_gws[group_key].size()) {
-      f->open_object_section("common");
-      f->dump_unsigned("epoch", map.epoch);
-      f->dump_string("pool", pool);
-      f->dump_string("group", group);
-      f->dump_unsigned("num gws", map.created_gws[group_key].size());
+    f->open_object_section("common");
+    f->dump_unsigned("epoch", map.epoch);
+    f->dump_string("pool", pool);
+    f->dump_string("group", group);
+    if (HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA)) {
+      f->dump_string("features", "LB");
+    }
+    f->dump_unsigned("num gws", map.created_gws[group_key].size());
+    if (map.created_gws[group_key].size() == 0) {
+      f->close_section();
+      f->flush(rdata);
+      sstrm.str("");
+    } else {
       sstrm << "[ ";
       NvmeGwId gw_id;
+      BeaconSubsystems *subsystems = NULL;
       for (auto& gw_created_pair: map.created_gws[group_key]) {
-        gw_id = gw_created_pair.first;
-        auto& st = gw_created_pair.second;
-        sstrm << st.ana_grp_id+1 << " ";
+        gw_id = gw_created_pair.first;
+        auto& st = gw_created_pair.second;
+        if (st.availability != gw_availability_t::GW_DELETING) {
+          // don't show the ANA group of a deleting GW in the list -
+          // this information is used by the GW in the rebalancing process
+          sstrm << st.ana_grp_id+1 << " ";
+        }
+        if (st.availability == gw_availability_t::GW_AVAILABLE) {
+          subsystems = &st.subsystems;
+        }
       }
       sstrm << "]";
       f->dump_string("Anagrp list", sstrm.str());
-      f->close_section();
-
+      std::map<NvmeAnaGrpId, uint16_t> num_ns;
+      uint16_t total_ns = 0;
+      if (subsystems && subsystems->size()) {
+        for (auto & subs_it:*subsystems) {
+          for (auto & ns :subs_it.namespaces) {
+            if (num_ns.find(ns.anagrpid) == num_ns.end())
+              num_ns[ns.anagrpid] = 0;
+            num_ns[ns.anagrpid] += 1;
+            total_ns += 1;
+          }
+        }
+      }
+      f->dump_unsigned("num-namespaces", total_ns);
+      f->open_array_section("Created Gateways:");
+      uint32_t i = 0;
       for (auto& gw_created_pair: map.created_gws[group_key]) {
         auto& gw_id = gw_created_pair.first;
         auto& state = gw_created_pair.second;
+        i = 0;
         f->open_object_section("stat");
         f->dump_string("gw-id", gw_id);
         f->dump_unsigned("anagrp-id",state.ana_grp_id+1);
+        f->dump_unsigned("num-namespaces", num_ns[state.ana_grp_id+1]);
         f->dump_unsigned("performed-full-startup", state.performed_full_startup);
         std::stringstream sstrm1;
         sstrm1 << state.availability;
@@ -329,17 +367,18 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
         sstrm1.str("");
         for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) {
           sstrm1 << " " << state_itr.first + 1 << ": "
-                 << state.sm_state[state_itr.first] << ",";
+                 << state.sm_state[state_itr.first];
+          if (++i < map.created_gws[group_key][gw_id].sm_state.size())
+            sstrm1 << ", ";
         }
         f->dump_string("ana states", sstrm1.str());
         f->close_section();
       }
+      f->close_section();
+      f->close_section();
       f->flush(rdata);
       sstrm.str("");
     }
-    else {
-      sstrm << "num_gws 0";
-    }
     getline(sstrm, rs);
     mon.reply_command(op, err, rs, rdata, get_last_committed());
     return true;
@@ -388,19 +427,18 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
               << " " << pool << " " << group << " rc " << rc << dendl;
         sstrm.str("");
       }
-    }
-    else{
+    } else {
       rc = pending_map.cfg_delete_gw(id, group_key);
-      if (rc == -EINVAL) {
+      if (rc == 0) {
+        bool propose = false;
+        // Simulate immediate Failover of this GW
+        process_gw_down(id, group_key, propose);
+      } else if (rc == -EINVAL) {
        dout (4) << "Error: GW not found in the database " << id << " "
                 << pool << " " << group << " rc " << rc << dendl;
        err = 0;
        sstrm.str("");
       }
-      if (rc == 0) {
-        LastBeacon lb = {id, group_key};
-        last_beacon.erase(lb);
-      }
     }
     // propose pending would be generated by the PaxosService
     if ((rc != -EEXIST) && (rc != -EINVAL)) {
@@ -423,6 +461,16 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
   return response;
 }
 
+void NVMeofGwMon::process_gw_down(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key, bool &propose_pending)
+{
+  LastBeacon lb = {gw_id, group_key};
+  auto it = last_beacon.find(lb);
+  if (it != last_beacon.end()) {
+    last_beacon.erase(it);
+    pending_map.process_gw_map_gw_down(gw_id, group_key, propose_pending);
+  }
+}
 
 bool NVMeofGwMon::preprocess_beacon(MonOpRequestRef op)
 {
@@ -527,7 +575,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
                 << pending_map.created_gws[group_key][gw_id].nonce_map
                 << dendl;
         nonce_propose = true;
       }
-    } else  {
+    } else {
       dout(10) << "Warning: received empty nonce map in the beacon of GW "
               << gw_id << " " << dendl;
     }
@@ -560,13 +608,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
     pending_map.process_gw_map_ka(gw_id, group_key, last_osd_epoch, propose);
   // state set by GW client application
   } else if (avail == gw_availability_t::GW_UNAVAILABLE) {
-    LastBeacon lb = {gw_id, group_key};
-
-    auto it = last_beacon.find(lb);
-    if (it != last_beacon.end()) {
-      last_beacon.erase(lb);
-      pending_map.process_gw_map_gw_down(gw_id, group_key, propose);
-    }
+    process_gw_down(gw_id, group_key, propose);
   }
   // Periodic: check active FSM timers
   pending_map.update_active_timers(timer_propose);
diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h
index acd72dfe0c42..f132c87d92af 100644
--- a/src/mon/NVMeofGwMon.h
+++ b/src/mon/NVMeofGwMon.h
@@ -84,7 +84,8 @@ public:
 
 private:
   void synchronize_last_beacon();
-
+  void process_gw_down(const NvmeGwId &gw_id,
+      const NvmeGroupKey& group_key, bool &propose_pending);
 };
 
 #endif /* MON_NVMEGWMONITOR_H_ */
diff --git a/src/mon/NVMeofGwSerialize.h b/src/mon/NVMeofGwSerialize.h
index cbda90ea3791..b10eac88c2fd 100755
--- a/src/mon/NVMeofGwSerialize.h
+++ b/src/mon/NVMeofGwSerialize.h
@@ -74,6 +74,9 @@ inline std::ostream& operator<<(
   case gw_availability_t::GW_UNAVAILABLE:
     os << "UNAVAILABLE"; break;
+  case gw_availability_t::GW_DELETING:
+    os << "DELETING"; break;
+
   default:
     os << "Invalid " << (int)value << " ";
   }
diff --git a/src/mon/NVMeofGwTypes.h b/src/mon/NVMeofGwTypes.h
index 057c8cd37950..2dd3e11ba3ab 100755
--- a/src/mon/NVMeofGwTypes.h
+++ b/src/mon/NVMeofGwTypes.h
@@ -42,6 +42,7 @@ enum class gw_availability_t {
   GW_CREATED = 0,
   GW_AVAILABLE,
   GW_UNAVAILABLE,
+  GW_DELETING,
   GW_DELETED
 };
 
@@ -134,9 +135,12 @@ struct NvmeGwMonState {
     : ana_grp_id(id), availability(gw_availability_t::GW_CREATED),
      last_gw_map_epoch_valid(false), performed_full_startup(false) {}
   void set_unavailable_state() {
-    availability = gw_availability_t::GW_UNAVAILABLE;
-    // after setting this state the next time monitor sees GW,
-    // it expects it performed the full startup
+    if (availability != gw_availability_t::GW_DELETING) {
+      // do not override the Deleting state
+      availability = gw_availability_t::GW_UNAVAILABLE;
+    }
+    // after setting this state, the next time the monitor sees the GW
+    // it expects that the GW performed a full startup
    performed_full_startup = false;
   }
   void standby_state(NvmeAnaGrpId grpid) {
-- 
2.47.3
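
Reviewer note, not part of the patch: the change above turns gateway deletion into
a two-phase teardown. Instead of erasing the GW at once, cfg_delete_gw() only marks
it GW_DELETING (when the quorum has the NVMEOFHA feature); the monitor keeps
reporting it to clients as UNAVAILABLE, keeps its ANA group serviced, and only
erases it from tick() via track_deleting_gws() once get_num_namespaces() reports
that no namespaces remain in its ANA group. The sketch below is a minimal,
self-contained model of that lifecycle; all names in it (DemoGwMap, Gw, ns_count,
and so on) are hypothetical stand-ins, not Ceph APIs.

// Illustrative sketch only - simplified model of the Deleting-state lifecycle.
#include <iostream>
#include <map>
#include <string>

enum class Avail { Created, Available, Unavailable, Deleting };

struct Gw {
  int ana_grp_id = 0;
  Avail avail = Avail::Created;
  int ns_count = 0;  // namespaces still served by this GW's ANA group
};

struct DemoGwMap {
  std::map<std::string, Gw> gws;

  // "delete": don't erase immediately - mark Deleting so the ANA group
  // keeps being serviced until its namespaces are rebalanced away.
  void cfg_delete(const std::string& id) {
    auto it = gws.find(id);
    if (it != gws.end()) it->second.avail = Avail::Deleting;
  }

  // Called periodically (the patch does this from tick() via
  // track_deleting_gws): erase a Deleting GW only once its ANA group
  // no longer owns any namespaces.
  void track_deleting() {
    for (auto it = gws.begin(); it != gws.end(); ++it) {
      if (it->second.avail == Avail::Deleting && it->second.ns_count == 0) {
        gws.erase(it);
        break;  // handle just one Deleting GW per tick
      }
    }
  }
};

int main() {
  DemoGwMap m;
  m.gws["gw1"] = {1, Avail::Available, 2};
  m.cfg_delete("gw1");                // gw1 enters Deleting, stays in the map
  m.track_deleting();                 // not erased yet: 2 namespaces remain
  std::cout << m.gws.size() << "\n";  // 1
  m.gws["gw1"].ns_count = 0;          // rebalancing moved the namespaces away
  m.track_deleting();                 // now the GW is finally erased
  std::cout << m.gws.size() << "\n";  // 0
}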