for (const auto& gw_created_pair: gw_created_map) {
const auto& gw_id = gw_created_pair.first;
const auto& gw_created = gw_created_pair.second;
+ gw_availability_t availability = gw_created.availability;
+ // Gateways expect to see UNAVAILABLE, not DELETING
+ // for entries in DELETING state
+ if (gw_created.availability == gw_availability_t::GW_DELETING) {
+ availability = gw_availability_t::GW_UNAVAILABLE;
+ }
auto gw_state = NvmeGwClientState(
- gw_created.ana_grp_id, epoch, gw_created.availability);
+ gw_created.ana_grp_id, epoch, availability);
for (const auto& sub: gw_created.subsystems) {
gw_state.subsystems.insert({
sub.nqn,
for (auto& itr: created_gws[group_key]) {
allocated.insert(itr.second.ana_grp_id);
if (itr.first == gw_id) {
- dout(1) << __func__ << " ERROR create GW: already exists in map "
- << gw_id << dendl;
- return -EEXIST ;
+ if (itr.second.availability != gw_availability_t::GW_DELETING) {
+ dout(1) << __func__ << " ERROR create GW: already exists in map "
+ << gw_id << dendl;
+ return -EEXIST;
+ } else {
+ // This GW exists in the map in "Deleting" state,
+ // but the user creates it again - just reset its attributes
+ created_gws[group_key][gw_id].performed_full_startup = true;
+ created_gws[group_key][gw_id].availability
+ = gw_availability_t::GW_CREATED;
+ dout(4) << "GW in Deleting state " << gw_id
+ << " was created again" << dendl;
+ return 0;
+ }
}
+ if (itr.second.availability == gw_availability_t::GW_DELETING) {
+ // Found a GW in "Deleting" state - inherit its ANA group
+ NvmeGwMonState & gw_created = created_gws[group_key][itr.first];
+ created_gws[group_key][gw_id] = gw_created;
+ // Deep copy of all data of "Deleting" GW
+ created_gws[group_key][gw_id].performed_full_startup = true;
+ created_gws[group_key][gw_id].availability
+ = gw_availability_t::GW_CREATED;
+ dout(4) << "Created GW inherits ANA group of deleting GW-id :"
+ << itr.first << " group " << itr.second.ana_grp_id << dendl;
+ do_erase_gw_id(itr.first, group_key);
+ dout(4) << "Created GWS after create/delete: "
+ << created_gws << dendl;
+ return 0;
+ }
}
if (allocated.size() == MAX_SUPPORTED_ANA_GROUPS) {
dout(4) << "Warning: cannot add GW " << gw_id
int NVMeofGwMap::cfg_delete_gw(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key)
{
- int rc = 0;
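+ // With the NVMEOFHA feature the delete is deferred: the GW is only marked
+ // Deleting here and is finally erased by track_deleting_gws() once it no
+ // longer owns namespaces; without the feature it is deleted immediately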
+ if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOFHA)) {
+ dout(10) << " has NVMEOFHA: 1" << dendl;
+ for (auto& gws_states: created_gws[group_key]) {
+ if (gws_states.first == gw_id) {
+ auto& state = gws_states.second;
+ auto prev_availability = state.availability;
+ state.availability = gw_availability_t::GW_DELETING;
+ dout(4) << " Deleting GW :" << gw_id << " in state "
+ << prev_availability << " Resulting GW availability: "
+ << state.availability << dendl;
+ return 0;
+ }
+ }
+ } else {
+ return do_delete_gw(gw_id, group_key);
+ }
+ return -EINVAL;
+}
+
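+// Helper shared by do_delete_gw() and the re-create path in cfg_add_gw():
+// removes gw_id from both per-group maps and prunes empty group keys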
+int NVMeofGwMap::do_erase_gw_id(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key)
+{
+ fsm_timers[group_key].erase(gw_id);
+ if (fsm_timers[group_key].size() == 0)
+ fsm_timers.erase(group_key);
+
+ created_gws[group_key].erase(gw_id);
+ if (created_gws[group_key].size() == 0)
+ created_gws.erase(group_key);
+ return 0;
+}
+
+int NVMeofGwMap::do_delete_gw(
+ const NvmeGwId &gw_id, const NvmeGroupKey& group_key)
+{
for (auto& gws_states: created_gws[group_key]) {
if (gws_states.first == gw_id) {
gw_id, group_key, state_itr.second, state_itr.first, modified);
}
dout(10) << " Delete GW :"<< gw_id << " ANA grpid: "
- << state.ana_grp_id << dendl;
+ << state.ana_grp_id << dendl;
for (auto& itr: created_gws[group_key]) {
// Update state map and other maps
remove_grp_id(itr.first, group_key, state.ana_grp_id);
// of all created gateways. Removed key = anagrp
}
- fsm_timers[group_key].erase(gw_id);
- if (fsm_timers[group_key].size() == 0)
- fsm_timers.erase(group_key);
-
- created_gws[group_key].erase(gw_id);
- if (created_gws[group_key].size() == 0)
- created_gws.erase(group_key);
- return rc;
+ return do_erase_gw_id(gw_id, group_key);
}
}
return -EINVAL;
}
+int NVMeofGwMap::get_num_namespaces(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key, const BeaconSubsystems& subs)
+{
+ auto grpid = created_gws[group_key][gw_id].ana_grp_id;
+ int num_ns = 0;
+ for (auto& subs_it : subs) {
+ for (auto& ns : subs_it.namespaces) {
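+ // beacon namespaces carry 1-based ANA group IDs while the monitor stores
+ // ana_grp_id 0-based - hence the +1 below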
+ if (ns.anagrpid == (grpid+1)) {
+ num_ns++;
+ }
+ }
+ }
+ return num_ns;
+}
+
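+// Called from the beacon flow, at most once per group key: once a GW in
+// "Deleting" state no longer owns any namespaces, its deletion is completed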
+void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key,
+ const BeaconSubsystems& subs, bool &propose_pending)
+{
+ propose_pending = false;
+ for (auto& itr: created_gws[group_key]) {
+ auto &gw_id = itr.first;
+ if (itr.second.availability == gw_availability_t::GW_DELETING) {
+ int num_ns = 0;
+ if ((num_ns = get_num_namespaces(gw_id, group_key, subs)) == 0) {
+ do_delete_gw(gw_id, group_key);
+ propose_pending = true;
+ }
+ dout(4) << "GW pending delete: " << gw_id << " num_ns " << num_ns << dendl;
+ break; // handle just one GW in "Deleting" state at a time
+ }
+ }
+}
+
int NVMeofGwMap::process_gw_map_gw_down(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending)
{
dout(20) << "KA beacon from the GW " << gw_id
<< " in state " << (int)st.availability << dendl;
- if (st.availability == gw_availability_t::GW_CREATED) {
- // first time appears - allow IO traffic for this GW
- st.availability = gw_availability_t::GW_AVAILABLE;
- for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
- state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE;
- }
- if (st.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { // not a redundand GW
- st.active_state(st.ana_grp_id);
- }
- propose_pending = true;
- } else if (st.availability == gw_availability_t::GW_UNAVAILABLE) {
+ if (st.availability == gw_availability_t::GW_CREATED ||
+ st.availability == gw_availability_t::GW_UNAVAILABLE) {
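+ // a Created GW takes the same path as an Unavailable one: its ANA group
+ // may already have been taken over while it was still being created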
st.availability = gw_availability_t::GW_AVAILABLE;
if (st.ana_grp_id == REDUNDANT_GW_ANA_GROUP_ID) {
for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
// 1. Failover missed: is there a GW in unavailable state?
// if yes, is its ANA group handled by some other GW?
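+ // GWs in Deleting or Created state do not serve I/O for their ANA group
+ // either, so they are treated like Unavailable in the check below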
- if (state.availability == gw_availability_t::GW_UNAVAILABLE &&
+ if ((state.availability == gw_availability_t::GW_UNAVAILABLE ||
+ state.availability == gw_availability_t::GW_DELETING ||
+ state.availability == gw_availability_t::GW_CREATED) &&
state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) {
auto found_gw_for_ana_group = false;
for (auto& gw_state2 : gws_states) {
}
// choose the GW to handle the ana group
if (found_gw_for_ana_group == false) {
- dout(10) << "Was not found the GW " << " that handles ANA grp "
+ dout(20) << "No GW was found that handles ANA grp "
<< (int)state.ana_grp_id << " find candidate "<< dendl;
for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
find_failover_candidate(gw_id, group_key, state_itr.first, propose);
const NvmeGwId &gw_id, NvmeAnaGrpId ANA_groupid)
{
NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
+ NvmeGwMonState& failed_gw_state = created_gws[group_key][failed_gw_id];
epoch_t epoch;
dout(10) << "Found failover GW " << gw_id
<< " for ANA group " << (int)ANA_groupid << dendl;
+ if (failed_gw_state.availability == gw_availability_t::GW_CREATED) {
+ dout(10) << "Failover GW " << gw_id <<
+ " takes over the group of GW in Created state " <<
+ failed_gw_id << dendl;
+ // just take over the group of the created GW
+ gw_state.active_state(ANA_groupid);
+ return;
+ }
int rc = blocklist_gw(failed_gw_id, group_key, ANA_groupid, epoch, true);
if (rc) {
// start failover even when nonces are empty!
gw_state.active_state(ANA_groupid);
- } else{
+ } else {
gw_state.sm_state[ANA_groupid] =
gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL;
gw_state.blocklist_data[ANA_groupid].osd_epoch = epoch;
// ana group wouldn't be taken back during blocklist wait period
cancel_timer(gw_id, group_key, grpid);
map_modified = true;
- } else{
+ } else {
dout(20) << "osd epoch not changed from "
<< gw_map.blocklist_data[grpid].osd_epoch
<< " to "<< last_osd_epoch
void NVMeofGwMap::fsm_handle_gw_delete(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
gw_states_per_group_t state , NvmeAnaGrpId grpid, bool &map_modified) {
+ // This function is called when the GW has already passed Failover and its
+ // native ANA group has no volumes, so some states are not relevant
switch (state) {
case gw_states_per_group_t::GW_STANDBY_STATE:
case gw_states_per_group_t::GW_IDLE_STATE:
{
NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
- // Try to find GW that temporary owns my group - if found,
- // this GW should pass to standby for this group
+ // Try to find a GW that temporarily owns the group of the gw-id that is
+ // about to disappear - if found, that GW should pass to standby for this group
if (grpid == gw_state.ana_grp_id) {
auto& gateway_states = created_gws[group_key];
for (auto& gs: gateway_states) {
}
break;
- case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL:
- {
- NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
- cancel_timer(gw_id, group_key, grpid);
- map_modified = true;
- gw_state.standby_state(grpid);
- }
- break;
-
- case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:
- {
- cancel_timer(gw_id, group_key, grpid);
- map_modified = true;
- for (auto& nqn_gws_state: created_gws[group_key]) {
- auto& st = nqn_gws_state.second;
-
- // found GW that was intended for Failback for this ana grp
- if (st.sm_state[grpid] ==
- gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) {
- dout(4) << "Warning: Outgoing Failback when GW is deleted "
- << "- to rollback it GW " << gw_id << "for ANA Group "
- << grpid << dendl;
- st.standby_state(grpid);
- break;
- }
- }
- }
- break;
-
- case gw_states_per_group_t::GW_ACTIVE_STATE:
- {
- NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
- map_modified = true;
- gw_state.standby_state(grpid);
- }
- break;
-
default: {
dout(4) << "Error : Invalid state " << state
<< "for GW " << gw_id << dendl;
new CMonRequestProposal(this, addr_vect, expires)
);
// return false;
- } else{
+ } else {
mon->nvmegwmon()->request_proposal(mon->osdmon());
}
}
dout(10) << str << " mon->osdmon()->blocklist: epoch : " << epoch
<< " address vector: " << addr_vect << " "
<< addr_vect.size() << dendl;
- } else{
+ } else {
dout(4) << "Error: No nonces context present for gw: "
<< gw_id << " ANA group: " << grpid << dendl;
return 1;
const auto cutoff = now - nvmegw_beacon_grace;
// Pass over all the stored beacons
+ NvmeGroupKey old_group_key;
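+ // remembers the group key handled last, so that track_deleting_gws below
+ // runs at most once per group key during this pass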
for (auto &itr : last_beacon) {
auto& lb = itr.first;
auto last_beacon_time = itr.second;
_propose_pending |= propose;
last_beacon.erase(lb);
} else {
+ BeaconSubsystems *subsystems =
+ &pending_map.created_gws[lb.group_key][lb.gw_id].subsystems;
+ if (subsystems->size() && old_group_key != lb.group_key) {
+ // call track_deleting_gws at most once per group key
+ pending_map.track_deleting_gws(lb.group_key, *subsystems, propose);
+ old_group_key = lb.group_key;
+ _propose_pending |= propose;
+ }
dout(20) << "beacon live for GW key: " << lb.gw_id << dendl;
}
}
auto group_key = std::make_pair(pool, group);
dout(10) << "nvme-gw show pool " << pool << " group " << group << dendl;
- if (map.created_gws[group_key].size()) {
- f->open_object_section("common");
- f->dump_unsigned("epoch", map.epoch);
- f->dump_string("pool", pool);
- f->dump_string("group", group);
- f->dump_unsigned("num gws", map.created_gws[group_key].size());
+ f->open_object_section("common");
+ f->dump_unsigned("epoch", map.epoch);
+ f->dump_string("pool", pool);
+ f->dump_string("group", group);
+ if (HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA)) {
+ f->dump_string("features", "LB");
+ }
+ f->dump_unsigned("num gws", map.created_gws[group_key].size());
+ if (map.created_gws[group_key].size() == 0) {
+ f->close_section();
+ f->flush(rdata);
+ sstrm.str("");
+ } else {
sstrm << "[ ";
NvmeGwId gw_id;
+ BeaconSubsystems *subsystems = nullptr;
for (auto& gw_created_pair: map.created_gws[group_key]) {
- gw_id = gw_created_pair.first;
- auto& st = gw_created_pair.second;
- sstrm << st.ana_grp_id+1 << " ";
+ gw_id = gw_created_pair.first;
+ auto& st = gw_created_pair.second;
+ if (st.availability != gw_availability_t::GW_DELETING) {
+ // don't show the ana group of a deleting GW in the list -
+ // it is information used by the GW in the rebalancing process
+ sstrm << st.ana_grp_id+1 << " ";
+ }
+ if (st.availability == gw_availability_t::GW_AVAILABLE) {
+ subsystems = &st.subsystems;
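+ // assumption: any available GW reports the group's full subsystem and
+ // namespace layout, so one snapshot is enough for the counts below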
+ }
}
sstrm << "]";
f->dump_string("Anagrp list", sstrm.str());
- f->close_section();
-
+ std::map<NvmeAnaGrpId, uint16_t> num_ns;
+ uint16_t total_ns = 0;
+ if (subsystems && subsystems->size()) {
+ for (auto& subs_it : *subsystems) {
+ for (auto& ns : subs_it.namespaces) {
+ num_ns[ns.anagrpid] += 1; // std::map operator[] value-initializes to 0
+ total_ns += 1;
+ }
+ }
+ }
+ f->dump_unsigned("num-namespaces", total_ns);
+ f->open_array_section("Created Gateways:");
+ uint32_t i = 0;
for (auto& gw_created_pair: map.created_gws[group_key]) {
auto& gw_id = gw_created_pair.first;
auto& state = gw_created_pair.second;
+ i = 0;
f->open_object_section("stat");
f->dump_string("gw-id", gw_id);
f->dump_unsigned("anagrp-id",state.ana_grp_id+1);
+ f->dump_unsigned("num-namespaces", num_ns[state.ana_grp_id+1]);
f->dump_unsigned("performed-full-startup", state.performed_full_startup);
std::stringstream sstrm1;
sstrm1 << state.availability;
sstrm1.str("");
for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) {
sstrm1 << " " << state_itr.first + 1 << ": "
- << state.sm_state[state_itr.first] << ",";
+ << state.sm_state[state_itr.first];
+ if (++i < map.created_gws[group_key][gw_id].sm_state.size())
+ sstrm1 << ", ";
}
f->dump_string("ana states", sstrm1.str());
f->close_section();
}
+ f->close_section();
+ f->close_section();
f->flush(rdata);
sstrm.str("");
}
- else {
- sstrm << "num_gws 0";
- }
getline(sstrm, rs);
mon.reply_command(op, err, rs, rdata, get_last_committed());
return true;
<< " " << pool << " " << group << " rc " << rc << dendl;
sstrm.str("");
}
- }
- else{
+ } else {
rc = pending_map.cfg_delete_gw(id, group_key);
- if (rc == -EINVAL) {
+ if (rc == 0) {
+ bool propose = false;
+ // Simulate immediate Failover of this GW
+ process_gw_down(id, group_key, propose);
+ } else if (rc == -EINVAL) {
dout(4) << "Error: GW not found in the database " << id << " "
<< pool << " " << group << " rc " << rc << dendl;
err = 0;
sstrm.str("");
}
- if (rc == 0) {
- LastBeacon lb = {id, group_key};
- last_beacon.erase(lb);
- }
}
// propose pending would be generated by the PaxosService
if ((rc != -EEXIST) && (rc != -EINVAL)) {
return response;
}
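+// Shared by the "gw delete" command path and UNAVAILABLE beacons: drops the
+// GW's beacon record and runs the gw-down (failover) state machine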
+void NVMeofGwMon::process_gw_down(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key, bool &propose_pending)
+{
+ LastBeacon lb = {gw_id, group_key};
+ auto it = last_beacon.find(lb);
+ if (it != last_beacon.end()) {
+ last_beacon.erase(it);
+ pending_map.process_gw_map_gw_down(gw_id, group_key, propose_pending);
+ }
+}
bool NVMeofGwMon::preprocess_beacon(MonOpRequestRef op)
{
<< pending_map.created_gws[group_key][gw_id].nonce_map << dendl;
nonce_propose = true;
}
- } else {
+ } else {
dout(10) << "Warning: received empty nonce map in the beacon of GW "
<< gw_id << " " << dendl;
}
pending_map.process_gw_map_ka(gw_id, group_key, last_osd_epoch, propose);
// state set by GW client application
} else if (avail == gw_availability_t::GW_UNAVAILABLE) {
- LastBeacon lb = {gw_id, group_key};
-
- auto it = last_beacon.find(lb);
- if (it != last_beacon.end()) {
- last_beacon.erase(lb);
- pending_map.process_gw_map_gw_down(gw_id, group_key, propose);
- }
+ process_gw_down(gw_id, group_key, propose);
}
// Periodic: check active FSM timers
pending_map.update_active_timers(timer_propose);