From 8bf309ecd7b59edb2dfb7d0e86e961bca20cd9cd Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Sat, 13 Jul 2024 01:56:18 +0000 Subject: [PATCH] mon/NVMeofGwMap.cc: tabbing, line length, formatting - Retabs file to match emacs/vim modelines at top - Fixes bracing - Adjusts line length to 80 char Signed-off-by: Samuel Just --- src/mon/NVMeofGwMap.cc | 1232 ++++++++++++++++++++++------------------ 1 file changed, 692 insertions(+), 540 deletions(-) diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 9af9f81b7f3e..646d56d30e6c 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -27,633 +27,785 @@ using std::string; #undef dout_prefix #define dout_prefix *_dout << "nvmeofgw " << __PRETTY_FUNCTION__ << " " -void NVMeofGwMap::to_gmap(std::map& Gmap) const { - Gmap.clear(); - for (const auto& created_map_pair: created_gws) { - const auto& group_key = created_map_pair.first; - const NvmeGwMonStates& gw_created_map = created_map_pair.second; - for (const auto& gw_created_pair: gw_created_map) { - const auto& gw_id = gw_created_pair.first; - const auto& gw_created = gw_created_pair.second; - - auto gw_state = NvmeGwClientState(gw_created.ana_grp_id, epoch, gw_created.availability); - for (const auto& sub: gw_created.subsystems) { - gw_state.subsystems.insert({sub.nqn, NqnState(sub.nqn, gw_created.sm_state, gw_created )}); - } - Gmap[group_key][gw_id] = gw_state; - dout (20) << gw_id << " Gw-Client: " << gw_state << dendl; - } +void NVMeofGwMap::to_gmap( + std::map& Gmap) const +{ + Gmap.clear(); + for (const auto& created_map_pair: created_gws) { + const auto& group_key = created_map_pair.first; + const NvmeGwMonStates& gw_created_map = created_map_pair.second; + for (const auto& gw_created_pair: gw_created_map) { + const auto& gw_id = gw_created_pair.first; + const auto& gw_created = gw_created_pair.second; + + auto gw_state = NvmeGwClientState( + gw_created.ana_grp_id, epoch, gw_created.availability); + for (const auto& sub: gw_created.subsystems) { + gw_state.subsystems.insert({ + sub.nqn, + NqnState(sub.nqn, gw_created.sm_state, gw_created) + }); + } + Gmap[group_key][gw_id] = gw_state; + dout (20) << gw_id << " Gw-Client: " << gw_state << dendl; } + } } -void NVMeofGwMap::add_grp_id(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid) +void NVMeofGwMap::add_grp_id( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid) { - Tmdata tm_data; - Blocklist_data blklist_data; - created_gws[group_key][gw_id].sm_state[grpid] = gw_states_per_group_t::GW_STANDBY_STATE; - fsm_timers[group_key][gw_id].data[grpid] = tm_data; - created_gws[group_key][gw_id].blocklist_data[grpid] = blklist_data; + Tmdata tm_data; + Blocklist_data blklist_data; + created_gws[group_key][gw_id].sm_state[grpid] = + gw_states_per_group_t::GW_STANDBY_STATE; + fsm_timers[group_key][gw_id].data[grpid] = tm_data; + created_gws[group_key][gw_id].blocklist_data[grpid] = blklist_data; } -void NVMeofGwMap::remove_grp_id(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid) +void NVMeofGwMap::remove_grp_id( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid) { - created_gws[group_key][gw_id].sm_state.erase(grpid); - created_gws[group_key][gw_id].blocklist_data.erase(grpid); - fsm_timers[group_key][gw_id].data.erase(grpid); + created_gws[group_key][gw_id].sm_state.erase(grpid); + created_gws[group_key][gw_id].blocklist_data.erase(grpid); + fsm_timers[group_key][gw_id].data.erase(grpid); } -int NVMeofGwMap::cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key) { - std::set allocated; - for (auto& itr: created_gws[group_key]) { - allocated.insert(itr.second.ana_grp_id); - if (itr.first == gw_id) { - dout(1) << __func__ << " ERROR create GW: already exists in map " << gw_id << dendl; - return -EEXIST ; - } - } - // Allocate the new group id - NvmeAnaGrpId i = 0; - bool was_allocated = false; - for (NvmeAnaGrpId elem: allocated) {// "allocated" is a sorted set (!),so if found any gap between numbers, it should be filled - if (i != elem) { - allocated.insert(i); - was_allocated = true; - break; - } - i++; - } - if (!was_allocated) allocated.insert(i); - dout(10) << "allocated ANA groupId " << i << " for GW " << gw_id << dendl; - for (auto& itr: created_gws[group_key]) { // add new allocated grp_id to maps of created gateways - add_grp_id(itr.first, group_key, i); +int NVMeofGwMap::cfg_add_gw( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key) +{ + std::set allocated; + for (auto& itr: created_gws[group_key]) { + allocated.insert(itr.second.ana_grp_id); + if (itr.first == gw_id) { + dout(1) << __func__ << " ERROR create GW: already exists in map " + << gw_id << dendl; + return -EEXIST ; } - NvmeGwMonState gw_created(i); - created_gws[group_key][gw_id] = gw_created; - created_gws[group_key][gw_id].performed_full_startup = true; - for (NvmeAnaGrpId elem: allocated) { - add_grp_id(gw_id, group_key, elem); // add all existed grp_ids to newly created gateway - dout(4) << "adding group " << elem << " to gw " << gw_id << dendl; + } + // Allocate the new group id + NvmeAnaGrpId i = 0; + bool was_allocated = false; + + // "allocated" is a sorted set (!),so if found any gap between numbers, + // it should be filled + for (NvmeAnaGrpId elem: allocated) { + if (i != elem) { + allocated.insert(i); + was_allocated = true; + break; } - dout(10) << __func__ << " Created GWS: " << created_gws << dendl; - return 0; + i++; + } + if (!was_allocated) allocated.insert(i); + + dout(10) << "allocated ANA groupId " << i << " for GW " << gw_id << dendl; + // add new allocated grp_id to maps of created gateways + for (auto& itr: created_gws[group_key]) { + add_grp_id(itr.first, group_key, i); + } + NvmeGwMonState gw_created(i); + created_gws[group_key][gw_id] = gw_created; + created_gws[group_key][gw_id].performed_full_startup = true; + for (NvmeAnaGrpId elem: allocated) { + // add all existed grp_ids to newly created gateway + add_grp_id(gw_id, group_key, elem); + dout(4) << "adding group " << elem << " to gw " << gw_id << dendl; + } + dout(10) << __func__ << " Created GWS: " << created_gws << dendl; + return 0; } -int NVMeofGwMap::cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key) { - int rc = 0; - for (auto& gws_states: created_gws[group_key]) { - - if (gws_states.first == gw_id) { - auto& state = gws_states.second; - for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) { - bool modified; - fsm_handle_gw_delete(gw_id, group_key,state_itr.second , state_itr.first, modified); - } - dout(10) << " Delete GW :"<< gw_id << " ANA grpid: " << state.ana_grp_id << dendl; - for (auto& itr: created_gws[group_key]) { - remove_grp_id(itr.first, group_key, state.ana_grp_id);// Update state map and other maps - // of all created gateways. Removed key = anagrp - } - fsm_timers[group_key].erase(gw_id); - if (fsm_timers[group_key].size() == 0) - fsm_timers.erase(group_key); - - created_gws[group_key].erase(gw_id); - if (created_gws[group_key].size() == 0) - created_gws.erase(group_key); - return rc; - } +int NVMeofGwMap::cfg_delete_gw( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key) +{ + int rc = 0; + for (auto& gws_states: created_gws[group_key]) { + + if (gws_states.first == gw_id) { + auto& state = gws_states.second; + for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { + bool modified; + fsm_handle_gw_delete( + gw_id, group_key,state_itr.second , state_itr.first, modified); + } + dout(10) << " Delete GW :"<< gw_id << " ANA grpid: " + << state.ana_grp_id << dendl; + for (auto& itr: created_gws[group_key]) { + // Update state map and other maps + remove_grp_id(itr.first, group_key, state.ana_grp_id); + // of all created gateways. Removed key = anagrp + } + fsm_timers[group_key].erase(gw_id); + if (fsm_timers[group_key].size() == 0) + fsm_timers.erase(group_key); + + created_gws[group_key].erase(gw_id); + if (created_gws[group_key].size() == 0) + created_gws.erase(group_key); + return rc; } + } - return -EINVAL; + return -EINVAL; } - -int NVMeofGwMap::process_gw_map_gw_down(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, - bool &propose_pending) { - int rc = 0; - auto& gws_states = created_gws[group_key]; - auto gw_state = gws_states.find(gw_id); - if (gw_state != gws_states.end()) { - dout(10) << "GW down " << gw_id << dendl; - auto& st = gw_state->second; - st.set_unavailable_state(); - for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) { - fsm_handle_gw_down(gw_id, group_key, state_itr.second, state_itr.first, propose_pending); - state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE; - } - propose_pending = true; // map should reflect that gw becames unavailable - if (propose_pending) validate_gw_map(group_key); - } - else { - dout(1) << __FUNCTION__ << "ERROR GW-id was not found in the map " << gw_id << dendl; - rc = -EINVAL; +int NVMeofGwMap::process_gw_map_gw_down( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending) +{ + int rc = 0; + auto& gws_states = created_gws[group_key]; + auto gw_state = gws_states.find(gw_id); + if (gw_state != gws_states.end()) { + dout(10) << "GW down " << gw_id << dendl; + auto& st = gw_state->second; + st.set_unavailable_state(); + for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { + fsm_handle_gw_down( + gw_id, group_key, state_itr.second, + state_itr.first, propose_pending); + state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE; } - return rc; + propose_pending = true; // map should reflect that gw becames unavailable + if (propose_pending) validate_gw_map(group_key); + } else { + dout(1) << __FUNCTION__ << "ERROR GW-id was not found in the map " + << gw_id << dendl; + rc = -EINVAL; + } + return rc; } - -void NVMeofGwMap::process_gw_map_ka(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, epoch_t& last_osd_epoch, bool &propose_pending) +void NVMeofGwMap::process_gw_map_ka( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, + epoch_t& last_osd_epoch, bool &propose_pending) { - auto& gws_states = created_gws[group_key]; - auto gw_state = gws_states.find(gw_id); - auto& st = gw_state->second; - dout(20) << "KA beacon from the GW " << gw_id << " in state " << (int)st.availability << dendl; - - if (st.availability == gw_availability_t::GW_CREATED) { - // first time appears - allow IO traffic for this GW - st.availability = gw_availability_t::GW_AVAILABLE; - for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE; - if (st.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { // not a redundand GW - st.active_state(st.ana_grp_id); - } - propose_pending = true; + auto& gws_states = created_gws[group_key]; + auto gw_state = gws_states.find(gw_id); + auto& st = gw_state->second; + dout(20) << "KA beacon from the GW " << gw_id + << " in state " << (int)st.availability << dendl; + + if (st.availability == gw_availability_t::GW_CREATED) { + // first time appears - allow IO traffic for this GW + st.availability = gw_availability_t::GW_AVAILABLE; + for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { + state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE; } - else if (st.availability == gw_availability_t::GW_UNAVAILABLE) { - st.availability = gw_availability_t::GW_AVAILABLE; - if (st.ana_grp_id == REDUNDANT_GW_ANA_GROUP_ID) { - for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE; - propose_pending = true; - } - else { - //========= prepare to Failback to this GW ========= - // find the GW that took over on the group st.ana_grp_id - find_failback_gw(gw_id, group_key, propose_pending); - } + if (st.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { // not a redundand GW + st.active_state(st.ana_grp_id); } - else if (st.availability == gw_availability_t::GW_AVAILABLE) { - for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) { - fsm_handle_gw_alive(gw_id, group_key, gw_state->second, state_itr.second, state_itr.first, last_osd_epoch, propose_pending); - } + propose_pending = true; + } else if (st.availability == gw_availability_t::GW_UNAVAILABLE) { + st.availability = gw_availability_t::GW_AVAILABLE; + if (st.ana_grp_id == REDUNDANT_GW_ANA_GROUP_ID) { + for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { + state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE; + } + propose_pending = true; + } else { + //========= prepare to Failback to this GW ========= + // find the GW that took over on the group st.ana_grp_id + find_failback_gw(gw_id, group_key, propose_pending); } - if (propose_pending) validate_gw_map(group_key); + } else if (st.availability == gw_availability_t::GW_AVAILABLE) { + for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { + fsm_handle_gw_alive( + gw_id, group_key, gw_state->second, state_itr.second, + state_itr.first, last_osd_epoch, propose_pending); + } + } + if (propose_pending) validate_gw_map(group_key); } - void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose) { - propose = false; - for (auto& group_state: created_gws) { - auto& group_key = group_state.first; - auto& gws_states = group_state.second; - - for (auto& gw_state : gws_states) { // loop for GWs inside nqn group - auto& gw_id = gw_state.first; - NvmeGwMonState& state = gw_state.second; - - //1. Failover missed : is there is a GW in unavailable state? if yes, is its ANA group handled by some other GW? - if (state.availability == gw_availability_t::GW_UNAVAILABLE && state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { - auto found_gw_for_ana_group = false; - for (auto& gw_state2 : gws_states) { - NvmeGwMonState& state2 = gw_state2.second; - if (state2.availability == gw_availability_t::GW_AVAILABLE && state2.sm_state[state.ana_grp_id] == gw_states_per_group_t::GW_ACTIVE_STATE) { - found_gw_for_ana_group = true; - break; - } - } - if (found_gw_for_ana_group == false) { //choose the GW for handle ana group - dout(10)<< "Was not found the GW " << " that handles ANA grp " << (int)state.ana_grp_id << " find candidate "<< dendl; - for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) { - find_failover_candidate(gw_id, group_key, state_itr.first, propose); - } - } - } - - //2. Failback missed: Check this GW is Available and Standby and no other GW is doing Failback to it - else if (state.availability == gw_availability_t::GW_AVAILABLE - && state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID && - state.sm_state[state.ana_grp_id] == gw_states_per_group_t::GW_STANDBY_STATE) - { - find_failback_gw(gw_id, group_key, propose); - } - } - if (propose) { - validate_gw_map(group_key); - } + propose = false; + for (auto& group_state: created_gws) { + auto& group_key = group_state.first; + auto& gws_states = group_state.second; + + for (auto& gw_state : gws_states) { // loop for GWs inside nqn group + auto& gw_id = gw_state.first; + NvmeGwMonState& state = gw_state.second; + + // 1. Failover missed : is there is a GW in unavailable state? + // if yes, is its ANA group handled by some other GW? + if (state.availability == gw_availability_t::GW_UNAVAILABLE && + state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { + auto found_gw_for_ana_group = false; + for (auto& gw_state2 : gws_states) { + NvmeGwMonState& state2 = gw_state2.second; + if (state2.availability == gw_availability_t::GW_AVAILABLE && + (state2.sm_state[state.ana_grp_id] == + gw_states_per_group_t::GW_ACTIVE_STATE)) { + found_gw_for_ana_group = true; + break; + } + } + // choose the GW for handle ana group + if (found_gw_for_ana_group == false) { + dout(10) << "Was not found the GW " << " that handles ANA grp " + << (int)state.ana_grp_id << " find candidate "<< dendl; + for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { + find_failover_candidate(gw_id, group_key, state_itr.first, propose); + } + } + } else if (state.availability == gw_availability_t::GW_AVAILABLE && + state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID && + (state.sm_state[state.ana_grp_id] == + gw_states_per_group_t::GW_STANDBY_STATE)) { + // 2. Failback missed: Check this GW is Available and Standby and + // no other GW is doing Failback to it + find_failback_gw(gw_id, group_key, propose); + } + } + if (propose) { + validate_gw_map(group_key); } + } } +void NVMeofGwMap::set_failover_gw_for_ANA_group( + const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key, + const NvmeGwId &gw_id, NvmeAnaGrpId ANA_groupid) +{ + NvmeGwMonState& gw_state = created_gws[group_key][gw_id]; + epoch_t epoch; + dout(10) << "Found failover GW " << gw_id + << " for ANA group " << (int)ANA_groupid << dendl; + int rc = blocklist_gw(failed_gw_id, group_key, ANA_groupid, epoch, true); + if (rc) { + //start failover even when nonces are empty ! + gw_state.active_state(ANA_groupid); + } else{ + gw_state.sm_state[ANA_groupid] = + gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL; + gw_state.blocklist_data[ANA_groupid].osd_epoch = epoch; + gw_state.blocklist_data[ANA_groupid].is_failover = true; + // start Failover preparation timer + start_timer(gw_id, group_key, ANA_groupid, 30); + } +} -void NVMeofGwMap::set_failover_gw_for_ANA_group(const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key, const NvmeGwId &gw_id, NvmeAnaGrpId ANA_groupid) +void NVMeofGwMap::find_failback_gw( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose) { - NvmeGwMonState& gw_state = created_gws[group_key][gw_id]; - epoch_t epoch; - dout(10) << "Found failover GW " << gw_id << " for ANA group " << (int)ANA_groupid << dendl; - int rc = blocklist_gw(failed_gw_id, group_key, ANA_groupid, epoch, true); - if (rc) { - gw_state.active_state(ANA_groupid); //start failover even when nonces are empty ! + auto& gws_states = created_gws[group_key]; + auto& gw_state = created_gws[group_key][gw_id]; + bool do_failback = false; + + dout(10) << "Find failback GW for GW " << gw_id << dendl; + for (auto& gw_state_it: gws_states) { + auto& st = gw_state_it.second; + // some other gw owns or owned the desired ana-group + if (st.sm_state[gw_state.ana_grp_id] != + gw_states_per_group_t::GW_STANDBY_STATE) { + // if candidate is in state ACTIVE for the desired ana-group, + // then failback starts immediately, otherwise need to wait + do_failback = true; + dout(10) << "Found some gw " << gw_state_it.first + << " in state " << st.sm_state[gw_state.ana_grp_id] << dendl; + break; } - else{ - gw_state.sm_state[ANA_groupid] = gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL; - gw_state.blocklist_data[ANA_groupid].osd_epoch = epoch; - gw_state.blocklist_data[ANA_groupid].is_failover = true; - start_timer(gw_id, group_key, ANA_groupid, 30); //start Failover preparation timer + } + + if (do_failback == false) { + // No other gw currently performs some activity with desired ana + // group of coming-up GW - so it just takes over on the group + dout(10) << "Failback GW candidate was not found, " + << "just set Optimized to group " << gw_state.ana_grp_id + << " to GW " << gw_id << dendl; + gw_state.active_state(gw_state.ana_grp_id); + propose = true; + return; + } + + // try to do_failback + for (auto& gw_state_it: gws_states) { + auto& failback_gw_id = gw_state_it.first; + auto& st = gw_state_it.second; + if (st.sm_state[gw_state.ana_grp_id] == + gw_states_per_group_t::GW_ACTIVE_STATE) { + dout(10) << "Found Failback GW " << failback_gw_id + << " that previously took over the ANAGRP " + << gw_state.ana_grp_id << " of the available GW " + << gw_id << dendl; + st.sm_state[gw_state.ana_grp_id] = + gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED; + + // Add timestamp of start Failback preparation + start_timer(failback_gw_id, group_key, gw_state.ana_grp_id, 3); + gw_state.sm_state[gw_state.ana_grp_id] = + gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED; + propose = true; + break; } + } } -void NVMeofGwMap::find_failback_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose) +void NVMeofGwMap::find_failover_candidate( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, + NvmeAnaGrpId grpid, bool &propose_pending) { - auto& gws_states = created_gws[group_key]; - auto& gw_state = created_gws[group_key][gw_id]; - bool do_failback = false; - - dout(10) << "Find failback GW for GW " << gw_id << dendl; - for (auto& gw_state_it: gws_states) { - auto& st = gw_state_it.second; - if (st.sm_state[gw_state.ana_grp_id] != gw_states_per_group_t::GW_STANDBY_STATE) {// some other gw owns or owned the desired ana-group - do_failback = true;// if candidate is in state ACTIVE for the desired ana-group, then failback starts immediately, otherwise need to wait - dout(10) << "Found some gw " << gw_state_it.first << " in state " << st.sm_state[gw_state.ana_grp_id] << dendl; - break; - } + dout(10) << __func__<< " " << gw_id << dendl; +#define ILLEGAL_GW_ID " " +#define MIN_NUM_ANA_GROUPS 0xFFF + int min_num_ana_groups_in_gw = 0; + int current_ana_groups_in_gw = 0; + NvmeGwId min_loaded_gw_id = ILLEGAL_GW_ID; + auto& gws_states = created_gws[group_key]; + auto gw_state = gws_states.find(gw_id); + + // this GW may handle several ANA groups and for each + // of them need to found the candidate GW + if ((gw_state->second.sm_state[grpid] == + gw_states_per_group_t::GW_ACTIVE_STATE) || + gw_state->second.ana_grp_id == grpid) { + + // for all the gateways of the subsystem + for (auto& found_gw_state: gws_states) { + auto st = found_gw_state.second; + // some GW already started failover/failback on this group + if (st.sm_state[grpid] == gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL) { + dout(4) << "Warning : Failover" << st.blocklist_data[grpid].is_failover + << " already started for the group " << grpid + << " by GW " << found_gw_state.first << dendl; + gw_state->second.standby_state(grpid); + return ; + } } - - if (do_failback == false) { - // No other gw currently performs some activity with desired ana group of coming-up GW - so it just takes over on the group - dout(10) << "Failback GW candidate was not found, just set Optimized to group " << gw_state.ana_grp_id << " to GW " << gw_id << dendl; - gw_state.active_state(gw_state.ana_grp_id); - propose = true; - return; + // Find a GW that takes over the ANA group(s) + min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS; + min_loaded_gw_id = ILLEGAL_GW_ID; + + // for all the gateways of the subsystem + for (auto& found_gw_state: gws_states) { + auto st = found_gw_state.second; + if (st.availability == gw_availability_t::GW_AVAILABLE) { + current_ana_groups_in_gw = 0; + for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { + NvmeAnaGrpId anagrp = state_itr.first; + if ((st.sm_state[anagrp] == + gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) || + (st.sm_state[anagrp] == + gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) || + (st.sm_state[anagrp] == + gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)) { + current_ana_groups_in_gw = 0xFFFF; + break; // dont take into account GWs in the transitive state + } else if (st.sm_state[anagrp] == + gw_states_per_group_t::GW_ACTIVE_STATE) { + // how many ANA groups are handled by this GW + current_ana_groups_in_gw++; + } + } + if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) { + min_num_ana_groups_in_gw = current_ana_groups_in_gw; + min_loaded_gw_id = found_gw_state.first; + dout(10) << "choose: gw-id min_ana_groups " << min_loaded_gw_id + << current_ana_groups_in_gw << " min " + << min_num_ana_groups_in_gw << dendl; + } + } } - //try to do_failback - for (auto& gw_state_it: gws_states) { - auto& failback_gw_id = gw_state_it.first; - auto& st = gw_state_it.second; - if (st.sm_state[gw_state.ana_grp_id] == gw_states_per_group_t::GW_ACTIVE_STATE) { - dout(10) << "Found Failback GW " << failback_gw_id << " that previously took over the ANAGRP " << gw_state.ana_grp_id << " of the available GW " << gw_id << dendl; - st.sm_state[gw_state.ana_grp_id] = gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED; - start_timer(failback_gw_id, group_key, gw_state.ana_grp_id, 3);// Add timestamp of start Failback preparation - gw_state.sm_state[gw_state.ana_grp_id] = gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED; - propose = true; - break; - } + if (min_loaded_gw_id != ILLEGAL_GW_ID) { + propose_pending = true; + set_failover_gw_for_ANA_group(gw_id, group_key, min_loaded_gw_id, grpid); + } else { + // not found candidate but map changed. + if (gw_state->second.sm_state[grpid] == + gw_states_per_group_t::GW_ACTIVE_STATE) { + propose_pending = true; + dout(10) << "gw down: no candidate found " << dendl; + } } + gw_state->second.standby_state(grpid); + } } -void NVMeofGwMap::find_failover_candidate(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &propose_pending) +void NVMeofGwMap::fsm_handle_gw_alive( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, + NvmeGwMonState & gw_state, gw_states_per_group_t state, + NvmeAnaGrpId grpid, epoch_t& last_osd_epoch, bool &map_modified) { - dout(10) <<__func__<< " " << gw_id << dendl; - #define ILLEGAL_GW_ID " " - #define MIN_NUM_ANA_GROUPS 0xFFF - int min_num_ana_groups_in_gw = 0; - int current_ana_groups_in_gw = 0; - NvmeGwId min_loaded_gw_id = ILLEGAL_GW_ID; - auto& gws_states = created_gws[group_key]; - auto gw_state = gws_states.find(gw_id); - - // this GW may handle several ANA groups and for each of them need to found the candidate GW - if (gw_state->second.sm_state[grpid] == gw_states_per_group_t::GW_ACTIVE_STATE || gw_state->second.ana_grp_id == grpid) { - - for (auto& found_gw_state: gws_states) { // for all the gateways of the subsystem - auto st = found_gw_state.second; - if (st.sm_state[grpid] == gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL) { // some GW already started failover/failback on this group - dout(4) << "Warning : Failover" << st.blocklist_data[grpid].is_failover << " already started for the group " << grpid << " by GW " << found_gw_state.first << dendl; - gw_state->second.standby_state(grpid); - return ; - } - } - // Find a GW that takes over the ANA group(s) - min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS; - min_loaded_gw_id = ILLEGAL_GW_ID; - for (auto& found_gw_state: gws_states) { // for all the gateways of the subsystem - auto st = found_gw_state.second; - if (st.availability == gw_availability_t::GW_AVAILABLE) { - current_ana_groups_in_gw = 0; - for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) { - NvmeAnaGrpId anagrp = state_itr.first; - if (st.sm_state[anagrp] == gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED || st.sm_state[anagrp] == gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED - || st.sm_state[anagrp] == gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL) { - current_ana_groups_in_gw = 0xFFFF; - break; // dont take into account GWs in the transitive state - } - else if (st.sm_state[anagrp] == gw_states_per_group_t::GW_ACTIVE_STATE) { - current_ana_groups_in_gw++; // how many ANA groups are handled by this GW - } - } - if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) { - min_num_ana_groups_in_gw = current_ana_groups_in_gw; - min_loaded_gw_id = found_gw_state.first; - dout(10) << "choose: gw-id min_ana_groups " << min_loaded_gw_id << current_ana_groups_in_gw << " min " << min_num_ana_groups_in_gw << dendl; - } - } - } - if (min_loaded_gw_id != ILLEGAL_GW_ID) { - propose_pending = true; - set_failover_gw_for_ANA_group(gw_id, group_key, min_loaded_gw_id, grpid); - } - else { - if (gw_state->second.sm_state[grpid] == gw_states_per_group_t::GW_ACTIVE_STATE) {// not found candidate but map changed. - propose_pending = true; - dout(10) << "gw down: no candidate found " << dendl; - } - } - gw_state->second.standby_state(grpid); + switch (state) { + case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL: + { + int timer_val = get_timer(gw_id, group_key, grpid); + NvmeGwMonState& gw_map = created_gws[group_key][gw_id]; + if (gw_map.blocklist_data[grpid].osd_epoch <= last_osd_epoch) { + dout(10) << "is-failover: " << gw_map.blocklist_data[grpid].is_failover + << " osd epoch changed from " + << gw_map.blocklist_data[grpid].osd_epoch + << " to "<< last_osd_epoch + << " Ana-grp: " << grpid + << " timer:" << timer_val << dendl; + // Failover Gw still alive and guaranteed that + gw_state.active_state(grpid); + // ana group wouldnt be taken back during blocklist wait period + cancel_timer(gw_id, group_key, grpid); + map_modified = true; + } else{ + dout(20) << "osd epoch not changed from " + << gw_map.blocklist_data[grpid].osd_epoch + << " to "<< last_osd_epoch + << " Ana-grp: " << grpid + << " timer:" << timer_val << dendl; } + } + break; + + default: + break; + } } -void NVMeofGwMap::fsm_handle_gw_alive(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeGwMonState & gw_state, gw_states_per_group_t state, NvmeAnaGrpId grpid, epoch_t& last_osd_epoch, bool &map_modified) +void NVMeofGwMap::fsm_handle_gw_down( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, + gw_states_per_group_t state, NvmeAnaGrpId grpid, bool &map_modified) { - switch (state) { - case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL: - { - int timer_val = get_timer(gw_id, group_key, grpid); - NvmeGwMonState& gw_map = created_gws[group_key][gw_id]; - if (gw_map.blocklist_data[grpid].osd_epoch <= last_osd_epoch) { - dout(10) << "is-failover: " << gw_map.blocklist_data[grpid].is_failover << " osd epoch changed from " << gw_map.blocklist_data[grpid].osd_epoch << " to "<< last_osd_epoch - << " Ana-grp: " << grpid << " timer:" << timer_val << dendl; - gw_state.active_state(grpid); // Failover Gw still alive and guaranteed that - cancel_timer(gw_id, group_key, grpid); // ana group wouldnt be taken back during blocklist wait period - map_modified = true; - } - else{ - dout(20) << "osd epoch not changed from " << gw_map.blocklist_data[grpid].osd_epoch << " to "<< last_osd_epoch - << " Ana-grp: " << grpid << " timer:" << timer_val << dendl; - } - } + switch (state) { + case gw_states_per_group_t::GW_STANDBY_STATE: + case gw_states_per_group_t::GW_IDLE_STATE: + // nothing to do break; - default: - break; + case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL: + { + cancel_timer(gw_id, group_key, grpid); + map_modified = true; + } + break; + + case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED: + cancel_timer(gw_id, group_key, grpid); + map_modified = true; + for (auto& gw_st: created_gws[group_key]) { + auto& st = gw_st.second; + // found GW that was intended for Failback for this ana grp + if (st.sm_state[grpid] == + gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) { + dout(4) << "Warning: Outgoing Failback when GW is down back" + << " - to rollback it" <<" GW " << gw_id << "for ANA Group " + << grpid << dendl; + st.standby_state(grpid); + break; + } } + break; + + case gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED: + // nothing to do - let failback timer expire + break; + + case gw_states_per_group_t::GW_ACTIVE_STATE: + { + find_failover_candidate(gw_id, group_key, grpid, map_modified); + } + break; + + default: + { + dout(4) << "Error : Invalid state " << state << "for GW " << gw_id << dendl; + } + } } - void NVMeofGwMap::fsm_handle_gw_down(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, gw_states_per_group_t state, NvmeAnaGrpId grpid, bool &map_modified) - { - switch (state) - { - case gw_states_per_group_t::GW_STANDBY_STATE: - case gw_states_per_group_t::GW_IDLE_STATE: - // nothing to do - break; - - case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL: - { - cancel_timer(gw_id, group_key, grpid); - map_modified = true; - }break; - - case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED: - cancel_timer(gw_id, group_key, grpid); - map_modified = true; - for (auto& gw_st: created_gws[group_key]) { - auto& st = gw_st.second; - if (st.sm_state[grpid] == gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) { // found GW that was intended for Failback for this ana grp - dout(4) << "Warning: Outgoing Failback when GW is down back - to rollback it" <<" GW " <mon->osdmon()->is_writeable() << dendl; - if (m->mon->osdmon()->is_writeable()) { - epoch_t epoch = m->mon->osdmon()->blocklist(addr_vect, expires); - dout(10) << "epoch " << epoch <mon->nvmegwmon()->request_proposal(m->mon->osdmon()); - } - else { - m->mon->osdmon()->wait_for_writeable_ctx( new CMonRequestProposal(m, addr_vect, expires)); - } + dout(10) << "osdmon is writable? " + << m->mon->osdmon()->is_writeable() << dendl; + if (m->mon->osdmon()->is_writeable()) { + epoch_t epoch = m->mon->osdmon()->blocklist(addr_vect, expires); + dout(10) << "epoch " << epoch << dendl; + m->mon->nvmegwmon()->request_proposal(m->mon->osdmon()); + } else { + m->mon->osdmon()->wait_for_writeable_ctx( + new CMonRequestProposal(m, addr_vect, expires) + ); + } } }; -int NVMeofGwMap::blocklist_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, epoch_t &epoch, bool failover) +int NVMeofGwMap::blocklist_gw( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, + NvmeAnaGrpId grpid, epoch_t &epoch, bool failover) { - NvmeGwMonState& gw_map = created_gws[group_key][gw_id]; //find_already_created_gw(gw_id, group_key); - - if (gw_map.nonce_map[grpid].size() > 0) { - NvmeNonceVector &nonce_vector = gw_map.nonce_map[grpid];; - std::string str = "["; - entity_addrvec_t addr_vect; - - double d = g_conf().get_val("mon_osd_blocklist_default_expire"); - utime_t expires = ceph_clock_now(); - expires += d; - dout(10) << " blocklist timestamp " << expires << dendl; - for (auto &it: nonce_vector ) { - if (str != "[") str += ","; - str += it; - } - str += "]"; - bool rc = addr_vect.parse(&str[0]); - dout(10) << str << " rc " << rc << " network vector: " << addr_vect << " " << addr_vect.size() << dendl; - if (rc) - return 1; - - if (!mon->osdmon()->is_writeable()) { - dout(10) << "osdmon is not writable, waiting, epoch = " << epoch << dendl; - mon->osdmon()->wait_for_writeable_ctx( new CMonRequestProposal(this, addr_vect, expires ));// return false; - } - else { - epoch = mon->osdmon()->blocklist(addr_vect, expires); - if (!mon->osdmon()->is_writeable()) { - dout(10) << "osdmon is not writable after blocklist is done, waiting, epoch = " << epoch << dendl; - mon->osdmon()->wait_for_writeable_ctx( new CMonRequestProposal(this, addr_vect, expires ));// return false; - } - else{ - mon->nvmegwmon()->request_proposal(mon->osdmon()); - } - } - dout(10) << str << " mon->osdmon()->blocklist: epoch : " << epoch << " address vector: " << addr_vect << " " << addr_vect.size() << dendl; + // find_already_created_gw(gw_id, group_key); + NvmeGwMonState& gw_map = created_gws[group_key][gw_id]; + + if (gw_map.nonce_map[grpid].size() > 0) { + NvmeNonceVector &nonce_vector = gw_map.nonce_map[grpid];; + std::string str = "["; + entity_addrvec_t addr_vect; + + double d = g_conf().get_val("mon_osd_blocklist_default_expire"); + utime_t expires = ceph_clock_now(); + expires += d; + dout(10) << " blocklist timestamp " << expires << dendl; + for (auto &it: nonce_vector ) { + if (str != "[") str += ","; + str += it; } - else{ - dout(4) << "Error: No nonces context present for gw: " <osdmon()->is_writeable()) { + dout(10) << "osdmon is not writable, waiting, epoch = " << epoch << dendl; + mon->osdmon()->wait_for_writeable_ctx( + new CMonRequestProposal(this, addr_vect, expires) + ); + // return false; + } else { + epoch = mon->osdmon()->blocklist(addr_vect, expires); + if (!mon->osdmon()->is_writeable()) { + dout(10) << "osdmon is not writable after blocklist is " + << "done, waiting, epoch = " << epoch << dendl; + mon->osdmon()->wait_for_writeable_ctx( + new CMonRequestProposal(this, addr_vect, expires) + ); + // return false; + } else{ + mon->nvmegwmon()->request_proposal(mon->osdmon()); + } + } + dout(10) << str << " mon->osdmon()->blocklist: epoch : " << epoch + << " address vector: " << addr_vect << " " + << addr_vect.size() << dendl; + } else{ + dout(4) << "Error: No nonces context present for gw: " + << gw_id << " ANA group: " << grpid << dendl; + return 1; + } + return 0; } void NVMeofGwMap::validate_gw_map(const NvmeGroupKey& group_key) { - for (auto& gw_created: created_gws[group_key]) { - auto gw_id = gw_created.first; - for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) { - NvmeAnaGrpId ana_group = state_itr.first; - int count = 0; - for (auto& gw_created_pair: created_gws[group_key]) { - auto& st = gw_created_pair.second; - if (st.sm_state[ana_group] == gw_states_per_group_t::GW_ACTIVE_STATE) { - count ++; - if (count == 2) { - dout(1) << "Critical Error : number active states per ana-group " << ana_group << "more than 1 in pool-group " << group_key << dendl; - dout(4) << created_gws[group_key] << dendl; - } - } - } - } - break; + for (auto& gw_created: created_gws[group_key]) { + auto gw_id = gw_created.first; + for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { + NvmeAnaGrpId ana_group = state_itr.first; + int count = 0; + for (auto& gw_created_pair: created_gws[group_key]) { + auto& st = gw_created_pair.second; + if (st.sm_state[ana_group] == gw_states_per_group_t::GW_ACTIVE_STATE) { + count ++; + if (count == 2) { + dout(1) << "Critical Error : number active states per ana-group " + << ana_group << "more than 1 in pool-group " << group_key + << dendl; + dout(4) << created_gws[group_key] << dendl; + } + } + } + } + break; } } -void NVMeofGwMap::update_active_timers( bool &propose_pending ) { - const auto now = std::chrono::system_clock::now(); - for (auto& group_to: fsm_timers) { - auto& group_key = group_to.first; - auto& pool = group_key.first; - auto& group = group_key.second; - for (auto& gw_to: group_to.second) { - auto& gw_id = gw_to.first; - auto& to = gw_to.second; - for (auto &to_itr:to.data) - { - if (to.data[to_itr.first].timer_started == 0) continue; - dout(20) << "Checking timer for GW " << gw_id << " ANA GRP " << to_itr.first<< " value(seconds): "<< (int)to.data[to_itr.first].timer_value << dendl; - if (now >= to.data[to_itr.first].end_time) { - fsm_handle_to_expired(gw_id, std::make_pair(pool, group), to_itr.first, propose_pending); - } - } - } +void NVMeofGwMap::update_active_timers(bool &propose_pending) +{ + const auto now = std::chrono::system_clock::now(); + for (auto& group_to: fsm_timers) { + auto& group_key = group_to.first; + auto& pool = group_key.first; + auto& group = group_key.second; + for (auto& gw_to: group_to.second) { + auto& gw_id = gw_to.first; + auto& to = gw_to.second; + for (auto &to_itr:to.data) { + if (to.data[to_itr.first].timer_started == 0) continue; + dout(20) << "Checking timer for GW " << gw_id << " ANA GRP " + << to_itr.first<< " value(seconds): " + << (int)to.data[to_itr.first].timer_value << dendl; + if (now >= to.data[to_itr.first].end_time) { + fsm_handle_to_expired( + gw_id, + std::make_pair(pool, group), to_itr.first, propose_pending); + } + } } + } } -void NVMeofGwMap::start_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid, uint8_t value_sec) { - fsm_timers[group_key][gw_id].data[anagrpid].timer_started = 1; - fsm_timers[group_key][gw_id].data[anagrpid].timer_value = value_sec; - dout(10) << "start timer for ana " << anagrpid << " gw " << gw_id << "value sec " << (int)value_sec << dendl; - const auto now = std::chrono::system_clock::now(); - fsm_timers[group_key][gw_id].data[anagrpid].end_time = now + std::chrono::seconds(value_sec); +void NVMeofGwMap::start_timer( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, + NvmeAnaGrpId anagrpid, uint8_t value_sec) +{ + fsm_timers[group_key][gw_id].data[anagrpid].timer_started = 1; + fsm_timers[group_key][gw_id].data[anagrpid].timer_value = value_sec; + dout(10) << "start timer for ana " << anagrpid << " gw " + << gw_id << "value sec " << (int)value_sec << dendl; + const auto now = std::chrono::system_clock::now(); + fsm_timers[group_key][gw_id].data[anagrpid].end_time = + now + std::chrono::seconds(value_sec); } -int NVMeofGwMap::get_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid) { - auto timer = fsm_timers[group_key][gw_id].data[anagrpid].timer_value; - return timer; +int NVMeofGwMap::get_timer( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid) +{ + auto timer = fsm_timers[group_key][gw_id].data[anagrpid].timer_value; + return timer; } -void NVMeofGwMap::cancel_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid) { - fsm_timers[group_key][gw_id].data[anagrpid].timer_started = 0; +void NVMeofGwMap::cancel_timer( + const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid) +{ + fsm_timers[group_key][gw_id].data[anagrpid].timer_started = 0; } -- 2.47.3