const NvmeGroupKey& group_key,
std::string &location, bool &propose_pending) {
auto& gws_states = created_gws[group_key];
+ bool accept = false;
// for all the gateways of the subsystem
- for (auto& found_gw_state: gws_states) {
- auto st = found_gw_state.second;
- if (st.location == location) {
- if(st.availability != gw_availability_t::GW_AVAILABLE ||
- st.sm_state[st.ana_grp_id] != gw_states_per_group_t::GW_STANDBY_STATE) {
- dout(4) << "command rejected found gw in state " << st.availability
- << " ana grp state " << st.sm_state[st.ana_grp_id] << dendl;
- return -EINVAL;
- }
- }
+ if (!HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
+ dout(4) << "Command is not allowed - feature is not installed"
+ << group_key << dendl;
+ return -EINVAL;
+ }
+ if (failbacks_in_progress.find(group_key) != failbacks_in_progress.end())
+ {
+ dout(4) << "command cannot be accepted since found active failback for a group "
+ << failbacks_in_progress[group_key] << dendl;
+ return -EEXIST;
}
for (auto& found_gw_state: gws_states) {
auto st = found_gw_state.second;
if (st.location == location) {
- auto gw_id = found_gw_state.first;
- find_failback_gw(gw_id, group_key, propose_pending, true);
+ if(st.availability == gw_availability_t::GW_AVAILABLE &&
+ st.sm_state[st.ana_grp_id] == gw_states_per_group_t::GW_STANDBY_STATE) {
+ dout(10) << "command accepted found gw in state " << st.availability
+ << " ana grp state " << st.sm_state[st.ana_grp_id] << dendl;
+ accept = true;
+ break;
+ }
}
}
- return 0;
+ if (accept) {
+ failbacks_in_progress[group_key] = location;
+ propose_pending = true;
+ return 0;
+ } else {
+ dout(10) << "command not accepted: not found AVAILABLE GW"
+ "with ANA grp in standby state" << dendl;
+ return -EINVAL;
+ }
}
void NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
find_failback_gw(gw_id, group_key, propose);
}
}
+ check_relocate_ana_groups(group_key, propose);
if (propose) {
validate_gw_map(group_key);
increment_gw_epoch(group_key);
}
}
+void NVMeofGwMap::check_relocate_ana_groups(const NvmeGroupKey& group_key,
+    bool &propose) {
+  /* Drive an in-progress failback toward its target location.
+   * If location exists in failbacks_in_progress, find all gws in location:
+   *  - the ana-grp of every not-Available gw is queued for relocation;
+   *  - if an ana-grp is already active on some gw in location it is skipped;
+   *  - for each ana-grp in the queue a relocation is attempted.
+   * If all ana-grps in location are active, the location is removed from the
+   * map failbacks_in_progress (failback completed).
+   */
+  FailbackLocation location = "";
+  std::list<NvmeAnaGrpId> reloc_list; // ana groups that still need relocation
+  auto& gws_states = created_gws[group_key];
+  if (failbacks_in_progress.find(group_key) != failbacks_in_progress.end()) {
+    location = failbacks_in_progress[group_key];
+    uint32_t num_gw_in_location = 0;
+    uint32_t num_active_ana_in_location = 0;
+    for (auto& gw_state : gws_states) { // loop for GWs inside group-key
+      NvmeGwMonState& state = gw_state.second;
+      if (state.location == location) {
+        num_gw_in_location ++;
+        if (state.availability != gw_availability_t::GW_AVAILABLE) {
+          // unavailable GW in the target location: its own ana group must
+          // currently be served elsewhere and should be relocated back
+          reloc_list.push_back(state.ana_grp_id);
+        } else { // in parallel check condition to complete failback-in-process
+          for (auto& state_it: state.sm_state) {
+            if (state_it.second == gw_states_per_group_t::GW_ACTIVE_STATE) {
+              num_active_ana_in_location ++;
+            }
+          }
+        }
+      }
+    }
+    // NOTE(review): completion test assumes one ana group per GW in the
+    // location, so "GW count == active-group count" means every group has
+    // failed back — confirm this invariant holds for all deployments
+    if (num_gw_in_location == num_active_ana_in_location) {
+      failbacks_in_progress.erase(group_key); // All ana groups of location are in Active
+      dout(4) << "the location entry is erased "<< location
+        << " num_ana_groups in location " << num_gw_in_location
+        << " from the failbacks-in-progress of group " << group_key <<dendl;
+      propose = true;
+      return;
+    }
+    // for all ana groups in the list do relocate
+    for (auto& anagrp : reloc_list) {
+      for (auto& gw_state : gws_states) { // loop for GWs inside group-key
+        NvmeGwMonState& state = gw_state.second;
+        if (state.sm_state[anagrp] == gw_states_per_group_t::GW_ACTIVE_STATE) {
+          if (state.location == location) { // already relocated to the location
+            dout(10) << "ana " << anagrp << " already in " << location << dendl;
+            break;
+          } else { // try to relocate
+            dout(10) << "ana " << anagrp
+              << " to relocate to " << location << dendl;
+            relocate_ana_grp(gw_state.first, group_key, anagrp,
+              location, propose);
+          }
+        }
+      }
+    }
+  }
+}
+
+int NVMeofGwMap::relocate_ana_grp(const NvmeGwId &src_gw_id,
+    const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, NvmeLocation& location,
+    bool &propose){
+  /* Precondition: grpid is in Active state on src_gw_id.
+   * Find the minimum-loaded Available GW in 'location' (fewest Active ANA
+   * groups) and, if found, relocate grpid to it using the regular failback
+   * FSM (WAIT_FAILBACK_PREPARED on the source, OWNER_WAIT_FAILBACK_PREPARED
+   * on the target). Returns 0 always; relocation is silently postponed if
+   * any candidate GW has a group in an intermediate FSM state, since the
+   * Active-group load cannot be counted correctly then.
+   */
+#define MIN_NUM_ANA_GROUPS 0xFFF
+  uint32_t min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
+  NvmeGwId min_gw_id = "empty";
+  auto& gws_states = created_gws[group_key];
+
+  for (auto& gw_state : gws_states) { // loop for GWs inside group-key
+    NvmeGwMonState& state = gw_state.second;
+    uint32_t current_ana_groups_in_gw = 0;
+    if (state.location == location && state.availability ==
+        gw_availability_t::GW_AVAILABLE) {
+      // count how many ANA groups are handled by this GW
+      for (auto& state_itr: state.sm_state) {
+        if ((state_itr.second !=
+              gw_states_per_group_t::GW_ACTIVE_STATE) &&
+            (state_itr.second !=
+              gw_states_per_group_t::GW_STANDBY_STATE)) {
+          // a group is mid-transition: exit the whole relocation attempt,
+          // loads cannot be measured reliably right now
+          dout(10) << "relocation found gw in intermediate state "
+            << gw_state.first << " state " << state_itr.second << dendl;
+          return 0;
+        }
+        if (state_itr.second == gw_states_per_group_t::GW_ACTIVE_STATE) {
+          current_ana_groups_in_gw ++;
+        }
+      }
+      // Compare the GW's total load only after all its groups are counted.
+      // The previous in-loop comparison used partial counts, which always
+      // selected the first GW with any Active group and could never select
+      // an idle GW (zero Active groups) — the best possible target.
+      if (current_ana_groups_in_gw < min_num_ana_groups_in_gw) {
+        min_num_ana_groups_in_gw = current_ana_groups_in_gw;
+        min_gw_id = gw_state.first;
+      }
+    }
+  }
+  dout(10) << "found min loaded gw for relocate " << min_gw_id << " location "
+    << location << " min load " << min_num_ana_groups_in_gw << dendl;
+
+  if (min_num_ana_groups_in_gw < MIN_NUM_ANA_GROUPS) {
+    dout(4) << "relocate starts " << grpid << " location " << location << dendl;
+    gws_states[src_gw_id].sm_state[grpid] =
+      gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED;
+    // Add timestamp of start Failback preparation
+    start_timer(src_gw_id, group_key, grpid, 3);
+    gws_states[min_gw_id].sm_state[grpid] =
+      gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED;
+    propose = true;
+  }
+  return 0;
+}
+
void NVMeofGwMap::set_failover_gw_for_ANA_group(
const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key,
const NvmeGwId &gw_id, NvmeAnaGrpId ANA_groupid)
}
void NVMeofGwMap::find_failback_gw(
- const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose,
- bool force_inter_location)
+ const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose)
{
auto& gws_states = created_gws[group_key];
auto& gw_state = created_gws[group_key][gw_id];
bool do_failback = false;
+ bool force_inter_location = false;
+ FailbackLocation location = "";
+
+ if (failbacks_in_progress.find(group_key) !=
+ failbacks_in_progress.end()) {
+ location = failbacks_in_progress[group_key];
+ if (gw_state.location == location) {
+ force_inter_location = true;
+ }
+ }
dout(10) << "Find failback GW for GW " << gw_id << "location "
<< gw_state.location << dendl;
for (auto& gw_state_it: gws_states) {
for (auto& gw_state: created_gws[group_key]) {
auto& st = gw_state.second;
// group owner
- if (st.ana_grp_id == grpid) {
+ if (st.sm_state[grpid] ==
+ gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) {
grp_owner_found = true;
if (st.availability == gw_availability_t::GW_AVAILABLE) {
if (!(fbp_gw_state.last_gw_map_epoch_valid &&
}
cancel_timer(gw_id, group_key, grpid);
map_modified = true;
- if ((st.sm_state[grpid] ==
- gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) &&
- (st.availability == gw_availability_t::GW_AVAILABLE)) {
- // Previous failover GW set to standby
- fbp_gw_state.standby_state(grpid);
+ // unconditionaly previous failover GW set to standby
+ fbp_gw_state.standby_state(grpid);
+ if (st.availability == gw_availability_t::GW_AVAILABLE) {
st.active_state(grpid);
dout(10) << "Expired Failback-preparation timer from GW "
<< gw_id << " ANA groupId "<< grpid << dendl;
- map_modified = true;
break;
- } else if ((st.sm_state[grpid] ==
- gw_states_per_group_t::GW_STANDBY_STATE) &&
- (st.availability == gw_availability_t::GW_AVAILABLE)) {
- // GW failed during the persistency interval
- st.standby_state(grpid);
- dout(10) << "Failback unsuccessfull. GW: " << gw_state.first
- << " becomes Standby for the ANA groupId " << grpid << dendl;
}
- fbp_gw_state.standby_state(grpid);
- dout(10) << "Failback unsuccessfull GW: " << gw_id
- << " becomes Standby for the ANA groupId " << grpid << dendl;
- map_modified = true;
- break;
+ else {
+ st.standby_state(grpid);
+ dout(10) << "GW failed durind failback/relocation persistency interval"
+ << gw_state.first << dendl;
+ }
}
}
if (grp_owner_found == false) {
- // when GW group owner is deleted the fbk gw is put to standby
- dout(4) << "group owner not found " << grpid << " GW: " << gw_id << dendl;
+ cancel_timer(gw_id, group_key, grpid);
+ fbp_gw_state.standby_state(grpid);
+ dout(4) << "group owner/relocation target not found "
+ << grpid << " GW: " << gw_id << dendl;
+ dout(10) << "Failback unsuccessfull GW: " << gw_id
+ << " becomes Standby for the ANA groupId " << grpid << dendl;
+ map_modified = true;
}
} else if (fbp_gw_state.sm_state[grpid] ==
gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL) {
* updating the map with a change affecting gws in group_key.
*/
std::map<NvmeGroupKey, epoch_t> gw_epoch;
+ /* in stretched cluster configuration
+ * failbacks between locations not happens automatically
+ * */
+ std::map<NvmeGroupKey, FailbackLocation> failbacks_in_progress;
void to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const;
void track_deleting_gws(const NvmeGroupKey& group_key,
void find_failover_candidate(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
NvmeAnaGrpId grpid, bool &propose_pending);
- void find_failback_gw(
- const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
- bool &propose_pending, bool force_inter_location = false);
+ void find_failback_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+ bool &propose_pending);
void set_failover_gw_for_ANA_group(
const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key,
const NvmeGwId &gw_id, NvmeAnaGrpId groupid);
int find_failover_gw_logic(NvmeGwMonStates& gws_states,
NvmeLocation& location, NvmeGwId& min_loaded_gw_id);
bool validate_number_locations(int num_gws, int num_locations);
+ void check_relocate_ana_groups(const NvmeGroupKey& group_key,
+ bool &propose);
+ int relocate_ana_grp(const NvmeGwId &src_gw_id,
+ const NvmeGroupKey& group_key, NvmeAnaGrpId grpid,
+ NvmeLocation& location, bool &propose);
public:
int blocklist_gw(
using ceph::encode;
uint8_t version = 1;
if (HAVE_FEATURE(features, NVMEOFHAMAP)) {
- version = 2;
+ version = 2;
+ }
+ if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
+ version = 3;
}
ENCODE_START(version, version, bl);
encode(epoch, bl);// global map epoch
if (version >= 2) {
encode(gw_epoch, bl);
}
+ if (version >=3) {
+ encode(failbacks_in_progress, bl);
+ }
ENCODE_FINISH(bl);
}
void decode(ceph::buffer::list::const_iterator &bl) {
using ceph::decode;
- DECODE_START(2, bl);
+ DECODE_START(3, bl);
decode(epoch, bl);
decode(created_gws, bl);
if (struct_v >= 2) {
decode(gw_epoch, bl);
}
+ if (struct_v >=3) {
+ decode(failbacks_in_progress, bl);
+ }
DECODE_FINISH(bl);
}
os << " " << state_itr.first <<": " << state_itr.second << ",";
}
os << "]\n"<< MODULE_PREFFIX << " entity-addr : " << value.addr_vect
- << " availability " << value.availability
+ << " availability " << value.availability << "location " << value.location
<< " full-startup " << value.performed_full_startup << " ]";
return os;
os << "\n" << MODULE_PREFFIX << "{ " << group_gws.first
<< " } -> GW epoch: " << group_gws.second << " }";
}
+ for (auto& group_gws: value.failbacks_in_progress) {
+ os << "\n" << MODULE_PREFFIX << "{ " << group_gws.first
+ << " } -> failback-to: " << group_gws.second << " }";
+ }
for (auto& group_gws: value.created_gws) {
os << "\n" << MODULE_PREFFIX << "{ " << group_gws.first
<< " } -> { " << group_gws.second << " }";
DECODE_FINISH(bl);
}
+inline void encode(
+  const std::map<NvmeGroupKey, FailbackLocation> &failbacks_in_progress,
+  ceph::bufferlist &bl) {
+  // Wire format (v1): a 32-bit entry count, then for each entry the
+  // (pool, group) strings of the group key followed by the failback location.
+  ENCODE_START(1, 1, bl);
+  const uint32_t ngroups = failbacks_in_progress.size(); // number of groups
+  encode(ngroups, bl);
+  for (const auto& [group_key, fbk_location] : failbacks_in_progress) {
+    encode(group_key.first, bl);  // pool
+    encode(group_key.second, bl); // group
+    encode(fbk_location, bl);     // location the group fails back to
+  }
+  ENCODE_FINISH(bl);
+}
+
+inline void decode(
+  std::map<NvmeGroupKey, FailbackLocation> &failbacks_in_progress,
+  ceph::buffer::list::const_iterator &bl) {
+  // Mirror of the matching encode(): read a 32-bit entry count, then
+  // (pool, group, location) triples, rebuilding the map from scratch.
+  failbacks_in_progress.clear();
+  DECODE_START(1, bl);
+  uint32_t ngroups = 0;
+  decode(ngroups, bl);
+  while (ngroups--) {
+    std::string pool;
+    std::string group;
+    FailbackLocation location;
+    decode(pool, bl);
+    decode(group, bl);
+    decode(location, bl);
+    failbacks_in_progress[std::make_pair(pool, group)] = location;
+  }
+  DECODE_FINISH(bl);
+}
+
+
inline void encode(
const std::map<NvmeGroupKey, NvmeGwMonStates>& created_gws,
ceph::bufferlist &bl, uint64_t features) {