}
dout(10) << " Delete GW :"<< gw_id << " ANA grpid: "
<< state.ana_grp_id << dendl;
+ if(is_last_gw_in_location(gw_id, group_key, state.location)) {
+ disaster_map_remove_location(group_key, state.location);
+ }
+
for (auto& itr: created_gws[group_key]) {
// Update state map and other maps
remove_grp_id(itr.first, group_key, state.ana_grp_id);
dout(4) << "GW-id same location is set " << group_key
<< " " << gw_id << " " << location << dendl;
return 0;
- } else {
- bool last_gw = true;
+ } else { //is_last_gw_in_location
+ bool last_gw = is_last_gw_in_location(gw_id, group_key, st.location);
for (auto& states: created_gws[group_key]) {
- auto &state = states.second;
- // calculate number set locations
- locations.insert(state.location);
- if (state.location == st.location && states.first != gw_id) {
- last_gw = false;
- break;
- }
+ locations.insert(states.second.location);
}
if (last_gw) { // this location would be removed so erase from set
locations.erase(st.location);
}
locations.insert(location);
dout(10) << "num GWs " << num_gws << " num set locations "
- << locations.size() << dendl;
+ << locations.size() << dendl;
bool rc = validate_number_locations(num_gws, locations.size());
if (rc ==false) {
dout(4) << "defined invalid number of locations "
}
if (last_gw) {
dout(4) << "remove location:last gw-id " << gw_id << " location "
- << st.location << dendl;
+ << st.location << dendl;
+ disaster_map_remove_location(group_key, st.location);
}
st.location = location;
dout(10) << "GW-id location is set " << group_key
}
}
-int NVMeofGwMap::cfg_start_inter_location_failback(
+// Returns true when 'gw_id' is the only created GW of 'group_key' that is
+// configured in 'location' (i.e. removing it leaves the location empty).
+bool NVMeofGwMap::is_last_gw_in_location(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key, NvmeLocation& location) {
+  for (auto& states: created_gws[group_key]) {
+    auto &state = states.second;
+    // any other GW in the same location means gw_id is not the last one
+    if (state.location == location && states.first != gw_id) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Checks whether 'location' of group 'group_key' is currently marked as a
+// disaster location. On success sets 'cleanup_in_process' to the location's
+// failbacks_in_process flag; when the location is not in disaster the flag
+// is left untouched and false is returned.
+bool NVMeofGwMap::is_location_in_disaster(const NvmeGroupKey& group_key,
+    NvmeLocation& location, bool &cleanup_in_process) {
+  auto grp_it = disaster_locations.find(group_key);
+  if (grp_it != disaster_locations.end()) {
+    auto &loc_states = grp_it->second;
+    auto loc_it = loc_states.find(location);
+    if (loc_it != loc_states.end()) {
+      // reuse the iterator found above instead of re-searching both maps
+      // via disaster_locations[group_key][location]
+      cleanup_in_process = loc_it->second.failbacks_in_process;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Scans the disaster locations of 'group_key' and reports the first one
+// whose recovery (failback) is already in progress.
+// Returns true and fills 'returned_location' on success, false otherwise.
+bool NVMeofGwMap::get_location_in_disaster_cleanup(const NvmeGroupKey& group_key,
+    NvmeLocation& returned_location) {
+  auto grp_it = disaster_locations.find(group_key);
+  if (grp_it == disaster_locations.end()) {
+    return false;
+  }
+  for (auto &[loc, loc_state] : grp_it->second) {
+    if (loc_state.failbacks_in_process) {
+      // first location found in recovering state wins
+      returned_location = loc;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Removes 'location' from the disaster map of 'group_key'; erases the whole
+// group entry when its last disaster location goes away.
+// Called when the last GW of a location is deleted from the group, or when
+// a location is cleared from the disaster_locations map.
+void NVMeofGwMap::disaster_map_remove_location(const NvmeGroupKey& group_key,
+    NvmeLocation& location) {
+  auto grp_it = disaster_locations.find(group_key);
+  if (grp_it == disaster_locations.end()) {
+    // group never entered disaster state: nothing to remove; using at()
+    // here would throw std::out_of_range on the GW-delete path
+    return;
+  }
+  auto &locs = grp_it->second;
+  locs.erase(location); // remove location if found
+  if (locs.empty()) {
+    disaster_locations.erase(grp_it);
+  }
+}
+
+// Marks 'location' of group 'group_key' as a disaster location.
+// Rejected when: the monitor quorum lacks NVMEOF_BEACON_DIFF (-EINVAL),
+// the location is already in disaster (-EEXIST), some GW of the location
+// is still AVAILABLE (-EINVAL), or the location was never configured in
+// the group (-EINVAL). Sets 'propose_pending' when the map changed.
+int NVMeofGwMap::cfg_location_disaster_set(
    const NvmeGroupKey& group_key,
    std::string &location, bool &propose_pending) {
+
+  if (!HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
+    dout(4) << "Command is not allowed - feature is not installed"
+            << group_key << dendl;
+    return -EINVAL;
+  }
+  bool cleanup_in_process;
+  bool location_exists = false;
  auto& gws_states = created_gws[group_key];
-  bool accept = false;
-  // for all the gateways of the subsystem
+  if (is_location_in_disaster(group_key, location, cleanup_in_process)) {
+    dout(4) << "command cannot be accepted since location already in disaster "
+            << location << dendl;
+    return -EEXIST;
+  }
+  // validate: check that all gws in location are not available
+  for (auto& found_gw_state: gws_states) {
+    const auto &st = found_gw_state.second; // reference: avoid copying state
+    if (st.location == location &&
+        st.availability == gw_availability_t::GW_AVAILABLE) {
+      dout(4) << "command cannot be accepted since gw " << found_gw_state.first
+              <<" in location " << location << " is available" << dendl;
+      return -EINVAL;
+    }
+    if (st.location == location) {
+      location_exists = true;
+    }
+  }
+  if (!location_exists) {
+    dout(4) << "command cannot be accepted since in group " << group_key
+            <<" location " << location << " was not configured yet" << dendl;
+    return -EINVAL;
+  }
+  dout(10) << " set disaster location " << location << " in group " << group_key << dendl;
+  // operator[] creates the entry with failbacks_in_process == false,
+  // i.e. disaster declared but recovery not yet started
+  disaster_locations[group_key][location];
+  propose_pending = true;
+  return 0;
+}
+
+int NVMeofGwMap::cfg_location_disaster_clear(
+ const NvmeGroupKey& group_key,
+ std::string &location, bool &propose_pending) {
+
if (!HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
dout(4) << "Command is not allowed - feature is not installed"
<< group_key << dendl;
return -EINVAL;
}
- if (failbacks_in_progress.find(group_key) != failbacks_in_progress.end()) {
- dout(4) << "command cannot be accepted since found active failback for a group "
- << failbacks_in_progress[group_key] << dendl;
+ auto& gws_states = created_gws[group_key];
+ bool accept = false;
+ bool cleanup_in_process;
+ // for all the gateways of the subsystem
+ if (!is_location_in_disaster(group_key, location, cleanup_in_process)) {
+ dout(4) << "command cannot be accepted: in a group " << group_key
+ << " disaster location " << location << " was not found" << dendl;
+ return -EINVAL;
+ } else if (cleanup_in_process) {
+ dout(4) << "command cannot be accepted since recovering already started for group "
+ << group_key << "and location " << location << dendl;
return -EEXIST;
}
for (auto& found_gw_state: gws_states) {
}
}
}
- if (accept) {
- failbacks_in_progress[group_key] = location;
+ if (accept) { // conditions for a cleanup
+ disaster_locations[group_key][location].failbacks_in_process = true;
propose_pending = true;
return 0;
} else {
- dout(10) << "command not accepted: not found AVAILABLE GW"
+ dout(10) << "command accepted: but not found AVAILABLE GW"
"with ANA grp in standby state" << dendl;
- return -EINVAL;
+ disaster_map_remove_location(group_key, location);
+ propose_pending = true;
+ return 0;//-EINVAL;
}
}
for (auto& gw_state : gws_states) { // loop for GWs inside nqn group
auto& gw_id = gw_state.first;
NvmeGwMonState& state = gw_state.second;
-
+ bool disaster_cleanup = false;
+ bool in_disaster =
+ is_location_in_disaster(group_key, state.location, disaster_cleanup);
// 1. Failover missed : is there is a GW in unavailable state?
// if yes, is its ANA group handled by some other GW?
- if ((state.availability == gw_availability_t::GW_UNAVAILABLE ||
+ if ( (state.availability == gw_availability_t::GW_UNAVAILABLE ||
state.availability == gw_availability_t::GW_DELETING ||
- state.availability == gw_availability_t::GW_CREATED) &&
+ state.availability == gw_availability_t::GW_CREATED ||
+ (in_disaster && !disaster_cleanup))
+ &&
state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) {
auto found_gw_for_ana_group = false;
for (auto& gw_state2 : gws_states) {
if (state2.availability == gw_availability_t::GW_AVAILABLE &&
(state2.sm_state[state.ana_grp_id] ==
gw_states_per_group_t::GW_ACTIVE_STATE)) {
- found_gw_for_ana_group = true;
+ found_gw_for_ana_group = true; // found GW that currently handles the ana group
break;
}
}
// choose the GW for handle ana group
if (found_gw_for_ana_group == false) {
- dout(20) << "Was not found the GW " << " that handles ANA grp "
+ dout(10) << "No GW currently" << " handles ANA grp "
<< (int)state.ana_grp_id << " find candidate "<< dendl;
for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
find_failover_candidate(gw_id, group_key, state_itr.first, propose);
void NVMeofGwMap::check_relocate_ana_groups(const NvmeGroupKey& group_key,
bool &propose) {
- /* if location exists in failbacks_in_progress find all gws in location.
+ /* if location in disaster_locations found in recovering state state - find all gws in location.
* add ana-grp of not Available gws to the list.
* if ana-grp is already active on some gw in location skip it
* for ana-grp in list make relocation.
- * if all ana-grps in location active remove location from the map failbacks_in_progress
+ * if all ana-grps in location active remove location from the map disaster_locations.group
*/
+ if (!HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
+ dout(20) << "relocate is not allowed - feature is not installed"
+ << group_key << dendl;
+ return ;
+ }
std::list<NvmeAnaGrpId> reloc_list;
auto& gws_states = created_gws[group_key];
- if (failbacks_in_progress.find(group_key) != failbacks_in_progress.end()) {
- FailbackLocation location = failbacks_in_progress[group_key];
+ FailbackLocation location;
+ if (get_location_in_disaster_cleanup(group_key, location)) {
uint32_t num_gw_in_location = 0;
uint32_t num_active_ana_in_location = 0;
for (auto& gw_state : gws_states) { // loop for GWs inside group-key
}
}
}
- if (num_gw_in_location == num_active_ana_in_location) {
- failbacks_in_progress.erase(group_key); // All ana groups of location are in Active
+ if (num_gw_in_location == num_active_ana_in_location) {// All ana groups of disaster location are in Active
+ disaster_map_remove_location(group_key, location);
dout(4) << "the location entry is erased "<< location
- << " num_ana_groups in location " << num_gw_in_location
+ << " from disaster-locations num_ana_groups in location " << num_gw_in_location
<< " from the failbacks-in-progress of group " << group_key <<dendl;
propose = true;
return;
break;
} else { // try to relocate
dout(10) << "ana " << anagrp
- << " to relocate to " << location << dendl;
+ << " to relocate to " << location << dendl;
relocate_ana_grp(gw_state.first, group_key, anagrp,
location, propose);
}
// how many ANA groups are handled by this GW
current_ana_groups_in_gw ++;
if (current_ana_groups_in_gw < min_num_ana_groups_in_gw ) {
- min_num_ana_groups_in_gw = current_ana_groups_in_gw;
+ min_num_ana_groups_in_gw = current_ana_groups_in_gw;
min_gw_id = gw_state.first;
}
}
NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
NvmeGwMonState& failed_gw_state = created_gws[group_key][failed_gw_id];
epoch_t epoch;
- dout(10) << "Found failover GW " << gw_id
+ dout(10) << "Found Failover GW " << gw_id
<< " for ANA group " << (int)ANA_groupid << dendl;
if (failed_gw_state.availability == gw_availability_t::GW_CREATED) {
dout(10) << "Failover GW " << gw_id <<
auto& gws_states = created_gws[group_key];
auto& gw_state = created_gws[group_key][gw_id];
bool do_failback = false;
- bool force_inter_location = false;
-
- if (failbacks_in_progress.find(group_key) !=
- failbacks_in_progress.end()) {
- FailbackLocation location = failbacks_in_progress[group_key];
- if (gw_state.location == location) {
- force_inter_location = true;
+ bool allow_inter_location = true;
+ bool cleanup_in_process;
+ if (is_location_in_disaster(group_key, gw_state.location, cleanup_in_process)) {
+ if (!cleanup_in_process) {
+ allow_inter_location = false;
}
}
dout(10) << "Find failback GW for GW " << gw_id << "location "
if (do_failback == false) {
// No other gw currently performs some activity with desired ana
// group of coming-up GW - so it just takes over on the group
+ if(allow_inter_location == false) {
+ dout (10) << "Failback GW candidate was not found but "
+ << "location of the gw " << gw_id
+ << "is currently in disaster state" <<dendl;
+ return;
+ }
dout(10) << "Failback GW candidate was not found, "
<< "just set Optimized to group " << gw_state.ana_grp_id
<< " to GW " << gw_id << dendl;
<< " that previously took over the ANAGRP "
<< gw_state.ana_grp_id << " of the available GW "
<< gw_id << "location " << st.location << dendl;
- if (st.location != gw_state.location && !force_inter_location ) {
+ if (st.location != gw_state.location && !allow_inter_location ) {
//not allowed inter-location failbacks
dout(10) << "not allowed interlocation failbacks. GW "
<< gw_id << dendl;
}
-int NVMeofGwMap::find_failover_gw_logic(NvmeGwMonStates& gws_states, NvmeLocation& location,
+int NVMeofGwMap::find_failover_gw_logic(const NvmeGroupKey& group_key,
+ NvmeGwMonStates& gws_states, NvmeLocation& location,
NvmeGwId& min_loaded_gw_id, bool ignore_locations)
{
#define ILLEGAL_GW_ID " "
(ignore_locations || st.location == location)) {
num_gws ++;
active_ana_groups_in_gw = 0;
- //for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
+ bool cleanup_in_process;
+ if (is_location_in_disaster(group_key, st.location, cleanup_in_process)) {
+ continue;
+ }
for (auto& state_itr: st.sm_state) {
NvmeAnaGrpId anagrp = state_itr.first;
if ((st.sm_state[anagrp] ==
active_ana_groups_in_gw++;
}
}
- if (active_ana_groups_in_gw == 0) {
- // dont take into account Available GW with no Active states
- // it is probably GW of the cluster that was started after disaster
- // so no failbacks are allowed and num active groups = 0
- // failovers to such GW also not allowed
- continue;
- }
if (min_num_ana_groups_in_gw > active_ana_groups_in_gw) {
min_num_ana_groups_in_gw = active_ana_groups_in_gw;
min_loaded_gw_id = found_gw_state.first;
}
// Find a GW that takes over the ANA group(s)
//Find GW among the GWs belong to the same location
- int rc = find_failover_gw_logic(gws_states, ana_location,
+ int rc = find_failover_gw_logic(group_key, gws_states, ana_location,
min_loaded_gw_id, false);
if (rc == -ENOENT) {
// looks at all GWs
dout(10) << "Find Failover GW -look at all Gateways in the pool/group" << dendl;
- rc = find_failover_gw_logic(gws_states, ana_location,
+ rc = find_failover_gw_logic(group_key, gws_states, ana_location,
min_loaded_gw_id, true);
}
if (min_loaded_gw_id != ILLEGAL_GW_ID) {
#include "common/ceph_time.h"
#include "NVMeofGwTypes.h"
+inline void encode(
+ const std::map<NvmeGroupKey, LocationStates> &disaster_locations, ceph::bufferlist &bl);
+inline void decode(
+ std::map<NvmeGroupKey, LocationStates> &disaster_locations, ceph::buffer::list::const_iterator &bl);
+
using ceph::coarse_mono_clock;
class health_check_map_t;
* updating the map with a change affecting gws in group_key.
*/
std::map<NvmeGroupKey, epoch_t> gw_epoch;
- /* in stretched cluster configuration
- * failbacks between locations does not happen automatically
+ /* in stretched cluster configuration:
+ * remember disaster state per location
+ * failbacks/failovers to disaster location not allowed
+ * disaster recovery state - relocate ana groups, failbacks allowed
* */
- std::map<NvmeGroupKey, FailbackLocation> failbacks_in_progress;
+ std::map<NvmeGroupKey, LocationStates> disaster_locations;
void to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const;
void track_deleting_gws(const NvmeGroupKey& group_key,
gw_admin_state_t state, bool &propose_pending);
int cfg_set_location(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
std::string &location, bool &propose_pending);
- int cfg_start_inter_location_failback(const NvmeGroupKey& group_key,
+ bool is_last_gw_in_location(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key, NvmeLocation& location);
+ int cfg_location_disaster_set(const NvmeGroupKey& group_key,
+ std::string &location, bool &propose_pending);
+ int cfg_location_disaster_clear(const NvmeGroupKey& group_key,
std::string &location, bool &propose_pending);
void process_gw_map_ka(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
uint64_t& old_sequence);
bool set_gw_beacon_sequence_number(const NvmeGwId &gw_id, int gw_version,
const NvmeGroupKey& group_key, uint64_t beacon_sequence);
+ bool is_location_in_disaster(const NvmeGroupKey& group_key,
+ NvmeLocation& location, bool &cleanup_in_process);
private:
int do_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
int do_erase_gw_id(const NvmeGwId &gw_id,
void validate_gw_map(
const NvmeGroupKey& group_key);
void increment_gw_epoch(const NvmeGroupKey& group_key);
- int find_failover_gw_logic(NvmeGwMonStates& gws_states,
+ int find_failover_gw_logic(const NvmeGroupKey& group_key, NvmeGwMonStates& gws_states,
NvmeLocation& location, NvmeGwId& min_loaded_gw_id, bool ignore_locations);
+ bool get_location_in_disaster_cleanup(const NvmeGroupKey& group_key,
+ NvmeLocation& returned_location);
+ void disaster_map_remove_location(const NvmeGroupKey& group_key,
+ NvmeLocation& location);
bool validate_number_locations(int num_gws, int num_locations);
void check_relocate_ana_groups(const NvmeGroupKey& group_key,
bool &propose);
encode(gw_epoch, bl);
}
if (version >=3) {
- encode(failbacks_in_progress, bl);
+ encode(disaster_locations, bl);
}
ENCODE_FINISH(bl);
}
decode(gw_epoch, bl);
}
if (struct_v >=3) {
- decode(failbacks_in_progress, bl);
+ decode(disaster_locations, bl);
}
DECODE_FINISH(bl);
}
return os;
}
+// Pretty-printer for the per-group disaster-location states.
+// Takes the map by const reference: passing by value copied the whole
+// std::map on every print.
+inline std::ostream& operator<<(std::ostream& os, const LocationStates& value) {
+  if (value.size()) os << "\n" << MODULE_PREFFIX;
+
+  for (auto &locations : value) {
+    os << "location " << locations.first << " recovering state "
+       << locations.second.failbacks_in_process;
+    os << "\n" << MODULE_PREFFIX;
+  }
+  return os;
+}
+
inline std::ostream& operator<<(std::ostream& os, const NVMeofGwMap value) {
os << "\n" << MODULE_PREFFIX << "== NVMeofGwMap [ Created_gws: epoch "
<< value.epoch;
os << "\n" << MODULE_PREFFIX << "{ " << group_gws.first
<< " } -> GW epoch: " << group_gws.second << " }";
}
- for (auto& group_gws: value.failbacks_in_progress) {
+ for (auto& group_gws: value.disaster_locations) {
os << "\n" << MODULE_PREFFIX << "{ " << group_gws.first
- << " } -> failback-to: " << group_gws.second << " }";
+ << " } -> disaster-locations: " << group_gws.second << " }";
}
for (auto& group_gws: value.created_gws) {
os << "\n" << MODULE_PREFFIX << "{ " << group_gws.first
}
+// Wire format (v1): <ngroups> then per group { pool, group, <nlocations>,
+// per location { location, failbacks_in_process } }. Must stay in sync
+// with the matching decode() below.
inline void encode(
-  const std::map<NvmeGroupKey, FailbackLocation> &failbacks_in_progress,
-  ceph::bufferlist &bl) {
+  const std::map<NvmeGroupKey, LocationStates> &disaster_locations, ceph::bufferlist &bl) {
  ENCODE_START(1, 1, bl);
-  encode ((uint32_t)failbacks_in_progress.size(), bl); // number of groups
-  for (auto& group_failbacks: failbacks_in_progress) {
-    auto& group_key = group_failbacks.first;
-    encode(group_key.first, bl); // pool
-    encode(group_key.second, bl); // group
-    encode(group_failbacks.second, bl);
+  encode ((uint32_t)disaster_locations.size(), bl); // number of groups
+  for (auto& group_disaster: disaster_locations) {
+    auto& group_key = group_disaster.first;
+    encode(group_key.first, bl); // pool
+    encode(group_key.second, bl); // group
+    const LocationStates &locations = group_disaster.second;
+    encode((uint32_t)locations.size(), bl);
+    for( auto &locations_it: locations) {
+      NvmeLocation location = locations_it.first;
+      encode(location, bl);
+      encode(locations_it.second.failbacks_in_process, bl);
+    }
  }
  ENCODE_FINISH(bl);
}
inline void decode(
- std::map<NvmeGroupKey, FailbackLocation> &failbacks_in_progress,
+ std::map<NvmeGroupKey, LocationStates> &disaster_locations,
ceph::buffer::list::const_iterator &bl) {
- failbacks_in_progress.clear();
+ disaster_locations.clear();
uint32_t ngroups;
DECODE_START(1, bl);
decode(ngroups, bl);
- for(uint32_t i = 0; i<ngroups; i++){
+ for (uint32_t i = 0; i<ngroups; i++) {
std::string pool, group;
decode(pool, bl);
decode(group, bl);
- FailbackLocation location;
- decode(location, bl);
- failbacks_in_progress[std::make_pair(pool, group)] = location;
+ uint32_t nlocations;
+ decode(nlocations, bl);
+ for (uint32_t i = 0; i<nlocations; i++) {
+ NvmeLocation location;
+ bool failback_in_progress;
+ decode(location, bl);
+ decode(failback_in_progress, bl);
+ disaster_locations[std::make_pair(pool, group)][location]
+ .failbacks_in_process = failback_in_progress;
+ }
}
DECODE_FINISH(bl);
}
-
inline void encode(
const std::map<NvmeGroupKey, NvmeGwMonStates>& created_gws,
ceph::bufferlist &bl, uint64_t features) {