" show nvmeof gateways within (pool, group)",
"mon", "r")
+COMMAND("nvme-gw enable"
+ " name=id,type=CephString"
+ " name=pool,type=CephString"
+ " name=group,type=CephString",
+ "administratively enables nvmeof gateway id for (pool, group)",
+ "mgr", "rw")
+
+COMMAND("nvme-gw disable"
+ " name=id,type=CephString"
+ " name=pool,type=CephString"
+ " name=group,type=CephString",
+ "administratively disables nvmeof gateway id for (pool, group)",
+ "mgr", "rw")
+
+COMMAND("nvme-gw set-locale"
+ " name=id,type=CephString"
+ " name=pool,type=CephString"
+ " name=group,type=CephString"
+ " name=locale,type=CephString",
+ "set location for nvmeof gateway id for (pool, group)",
+ "mgr", "rw")
+
// these are tell commands that were implemented as CLI commands in
// the broken pre-octopus way that we want to allow to work when a
// monitor has upgraded to octopus+ but the monmap min_mon_release is
return -EINVAL;
}
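+// Change the administrative state of a created GW. Disabling an available
+// GW marks it down so its ANA groups move to other GWs
+// (skip_failovers_for_group is invoked first, presumably to damp
+// concurrent failovers); re-enabling lets the GW become available again
+// on subsequent beacons. Sets propose_pending when the change must be
+// committed via Paxos. Returns -EINVAL if the GW was never created for
+// this (pool, group).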
+int NVMeofGwMap::cfg_admin_state_change(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key,
+    gw_admin_state_t state, bool &propose_pending, bool test)
+{
+  auto& gws_states = created_gws[group_key];
+  auto gw_state = gws_states.find(gw_id);
+  if (gw_state != gws_states.end()) {
+    auto& st = gw_state->second;
+    if (state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+      if (st.gw_admin_state == gw_admin_state_t::GW_ADMIN_ENABLED) {
+        dout(4) << "GW admin state set to Disabled " << group_key
+                << " " << gw_id << dendl;
+        if (st.availability == gw_availability_t::GW_AVAILABLE) {
+          skip_failovers_for_group(group_key, 5);
+          process_gw_map_gw_down(gw_id, group_key, propose_pending);
+        }
+        propose_pending = true;
+      }
+    } else if (state == gw_admin_state_t::GW_ADMIN_ENABLED) {
+      if (st.gw_admin_state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+        dout(4) << "GW admin state set to Enabled " << group_key
+                << " " << gw_id << dendl;
+        propose_pending = true;
+      }
+    }
+    st.gw_admin_state = state;
+  } else {
+    dout(4) << "GW not created yet " << group_key << " " << gw_id << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
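+// Set the location (locale) of a created GW. A no-op when the GW already
+// has that location; otherwise the new location is stored and the change
+// is proposed. Returns -EINVAL if the GW was never created for this
+// (pool, group).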
+int NVMeofGwMap::cfg_set_location(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key,
+    const std::string &location, bool &propose_pending, bool test)
+{
+  auto& gws_states = created_gws[group_key];
+  auto gw_state = gws_states.find(gw_id);
+  if (gw_state != gws_states.end()) {
+    auto& st = gw_state->second;
+    // no-op when the requested location equals the GW's current location
+    if (st.location == location) {
+      dout(4) << "GW location already set " << group_key
+              << " " << gw_id << " " << location << dendl;
+      return 0;
+    } else {
+      st.location = location;
+      dout(10) << "GW location set " << group_key
+               << " " << gw_id << " " << location << dendl;
+      propose_pending = true;
+      return 0;
+    }
+  } else {
+    dout(4) << "GW not created yet " << group_key << " " << gw_id << dendl;
+    return -EINVAL;
+  }
+}
+
void NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
const NvmeGroupKey& group_key, bool &propose_pending)
{
}
}
+
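+// Pick a failover target among the available GWs whose location matches
+// 'location' (an empty location matches every GW): the candidate that owns
+// the fewest ANA groups wins. GWs in a transitional (failback/blocklist)
+// state are skipped. Returns 0 with the winner in min_loaded_gw_id,
+// -EBUSY if candidates exist but all are in transition, and -ENOENT if no
+// GW is active for this location.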
+int NVMeofGwMap::find_failover_gw_logic(NvmeGwMonStates& gws_states,
+    NvmeLocation& location, NvmeGwId& min_loaded_gw_id)
+{
+#define ILLEGAL_GW_ID " "
+#define MIN_NUM_ANA_GROUPS 0xFFF
+  int min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
+  min_loaded_gw_id = ILLEGAL_GW_ID;
+  int current_ana_groups_in_gw = 0;
+  int num_busy = 0, num_gws = 0;
+  // for all the gateways of the subsystem,
+  // consider only the GWs in the same location as the ANA group
+  for (auto& found_gw_state: gws_states) {
+    auto& st = found_gw_state.second;
+    if (st.availability == gw_availability_t::GW_AVAILABLE &&
+        (location == "" || st.location == location)) {
+      num_gws++;
+      current_ana_groups_in_gw = 0;
+      for (auto& state_itr: st.sm_state) {
+        NvmeAnaGrpId anagrp = state_itr.first;
+        if ((st.sm_state[anagrp] ==
+             gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) ||
+            (st.sm_state[anagrp] ==
+             gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) ||
+            (st.sm_state[anagrp] ==
+             gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)) {
+          current_ana_groups_in_gw = 0xFFFF;
+          num_busy++;
+          break; // don't take into account GWs in the transitive state
+        } else if (st.sm_state[anagrp] ==
+                   gw_states_per_group_t::GW_ACTIVE_STATE) {
+          // how many ANA groups are handled by this GW
+          current_ana_groups_in_gw++;
+        }
+      }
+      if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) {
+        min_num_ana_groups_in_gw = current_ana_groups_in_gw;
+        min_loaded_gw_id = found_gw_state.first;
+        dout(10) << "choose: gw-id min_ana_groups " << min_loaded_gw_id
+                 << " " << current_ana_groups_in_gw << " min "
+                 << min_num_ana_groups_in_gw << dendl;
+      }
+    }
+  }
+  if (min_loaded_gw_id != ILLEGAL_GW_ID) { // some GW chosen
+    return 0;
+  } else if (num_busy) {
+    dout(4) << "some GWs are busy " << num_busy
+            << " num Available " << num_gws << dendl;
+    return -EBUSY;
+  } else {
+    dout(4) << "no GWs in Active state. num Available " << num_gws << dendl;
+    return -ENOENT;
+  }
+}
+
void NVMeofGwMap::find_failover_candidate(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
NvmeAnaGrpId grpid, bool &propose_pending)
{
dout(10) << __func__<< " " << gw_id << dendl;
-#define ILLEGAL_GW_ID " "
-#define MIN_NUM_ANA_GROUPS 0xFFF
- int min_num_ana_groups_in_gw = 0;
- int current_ana_groups_in_gw = 0;
+  NvmeLocation ana_location = "";
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
NvmeGwId min_loaded_gw_id = ILLEGAL_GW_ID;
auto& gws_states = created_gws[group_key];
gw_state->second.standby_state(grpid);
return ;
}
+    if (st.ana_grp_id == grpid) {
+      ana_location = st.location; // found the original location of the ANA group
+      dout(10) << "Found location " << ana_location
+               << " for anagrp " << grpid << dendl;
+    }
}
// Find a GW that takes over the ANA group(s)
- min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
- min_loaded_gw_id = ILLEGAL_GW_ID;
-
- // for all the gateways of the subsystem
- for (auto& found_gw_state: gws_states) {
- auto st = found_gw_state.second;
- if (st.availability == gw_availability_t::GW_AVAILABLE) {
- current_ana_groups_in_gw = 0;
- for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
- NvmeAnaGrpId anagrp = state_itr.first;
- if ((st.sm_state[anagrp] ==
- gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) ||
- (st.sm_state[anagrp] ==
- gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) ||
- (st.sm_state[anagrp] ==
- gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)) {
- current_ana_groups_in_gw = 0xFFFF;
- break; // dont take into account GWs in the transitive state
- } else if (st.sm_state[anagrp] ==
- gw_states_per_group_t::GW_ACTIVE_STATE) {
- // how many ANA groups are handled by this GW
- current_ana_groups_in_gw++;
- }
- }
- if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) {
- min_num_ana_groups_in_gw = current_ana_groups_in_gw;
- min_loaded_gw_id = found_gw_state.first;
- dout(10) << "choose: gw-id min_ana_groups " << min_loaded_gw_id
- << current_ana_groups_in_gw << " min "
- << min_num_ana_groups_in_gw << dendl;
- }
- }
+  // First pass: prefer a GW in the same location as the failed ANA group;
+  // if none is active there, retry across all GWs in the pool/group.
+  int rc = find_failover_gw_logic(gws_states, ana_location, min_loaded_gw_id);
+  if (rc == -ENOENT) {
+    ana_location = ""; // look at all GWs
+    dout(10) << "Find failover GW - look at all gateways in the pool/group"
+             << dendl;
+    rc = find_failover_gw_logic(gws_states, ana_location, min_loaded_gw_id);
}
if (min_loaded_gw_id != ILLEGAL_GW_ID) {
propose_pending = true;
int cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
bool test = false);
int cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+  int cfg_admin_state_change(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+      gw_admin_state_t state, bool &propose_pending, bool test = false);
+  int cfg_set_location(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+      const std::string &location, bool &propose_pending, bool test = false);
void process_gw_map_ka(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
epoch_t& last_osd_epoch, bool &propose_pending);
void validate_gw_map(
const NvmeGroupKey& group_key);
void increment_gw_epoch(const NvmeGroupKey& group_key);
+ int find_failover_gw_logic(NvmeGwMonStates& gws_states,
+ NvmeLocation& location, NvmeGwId& min_loaded_gw_id);
public:
int blocklist_gw(
f->open_object_section("stat");
f->dump_string("gw-id", gw_id);
f->dump_unsigned("anagrp-id",state.ana_grp_id+1);
+ f->dump_string("location", state.location);
+ f->dump_unsigned("admin state", (uint32_t)state.gw_admin_state);
f->dump_unsigned("num-namespaces", num_ns[state.ana_grp_id+1]);
f->dump_unsigned("performed-full-startup", state.performed_full_startup);
std::stringstream sstrm1;
response = true;
}
}
+ else if (prefix == "nvme-gw enable" || prefix == "nvme-gw disable") {
+ std::string id, pool, group;
+ cmd_getval(cmdmap, "id", id);
+ cmd_getval(cmdmap, "pool", pool);
+ cmd_getval(cmdmap, "group", group);
+ auto group_key = std::make_pair(pool, group);
+ dout(10) << " id "<< id <<" pool "<< pool << " group "<< group
+ << " " << prefix << dendl;
+ gw_admin_state_t set = (prefix == "nvme-gw enable") ?
+ gw_admin_state_t::GW_ADMIN_ENABLED :
+ gw_admin_state_t::GW_ADMIN_DISABLED;
+ bool propose = false;
+ rc = pending_map.cfg_admin_state_change(id, group_key, set, propose);
+ if (rc == -EINVAL) {
+ err = rc;
+ dout (4) << "Error: GW cannot be set to admin state " << id
+ << " " << pool << " " << group << " rc " << rc << dendl;
+ sstrm.str("");
+ }
+ // propose pending would be generated by the PaxosService
+ if (rc == 0 && propose == true) {
+ response = true;
+ }
+ }
+ else if (prefix == "nvme-gw set-locale") {
+
+ std::string id, pool, group, locale;
+ cmd_getval(cmdmap, "id", id);
+ cmd_getval(cmdmap, "pool", pool);
+ cmd_getval(cmdmap, "group", group);
+ cmd_getval(cmdmap, "locale", locale);
+ auto group_key = std::make_pair(pool, group);
+ dout(10) << " id "<< id <<" pool "<< pool << " group "<< group
+ <<" locale "<< locale << dendl;
+ bool propose = false;
+ rc = pending_map.cfg_set_location(id, group_key, locale, propose);
+ if (rc == -EINVAL) {
+ err = rc;
+ dout (4) << "Error: GW cannot set location " << id
+ << " " << pool << " " << group << " rc " << rc << dendl;
+ sstrm.str("");
+ }
+ // propose pending would be generated by the PaxosService
+ if (rc == 0 && propose == true) {
+ response = true;
+ }
+ }
getline(sstrm, rs);
if (response == false) {
if (err < 0 && rs.length() == 0) {
rs = cpp_strerror(err);
dout(10) << "Error command err : "<< err << " rs-len: "
- << rs.length() << dendl;
+ << rs.length() << dendl;
}
mon.reply_command(op, err, rs, rdata, get_last_committed());
} else {
if (changed) {
avail = gw_availability_t::GW_AVAILABLE;
}
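+  // an administratively disabled GW keeps beaconing, but it is reported
+  // as merely created, so it is never treated as available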
+  if (state.gw_admin_state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+    avail = gw_availability_t::GW_CREATED;
+  }
if (gw_subs.size() == 0) {
avail = gw_availability_t::GW_CREATED;
dout(10) << "No-subsystems condition detected for GW " << gw_id <<dendl;
decode(state.gw_map_epoch, bl);
decode(state.subsystems, bl);
uint32_t avail;
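+  // the beacon sequence number stays a uint64_t on the wire; decode it
+  // into a local first (presumably because the in-memory representation
+  // changed with the NVMEOF_BEACON_DIFF work) and then assign it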
+  uint64_t last_beacon_seq_number = 0;
decode(avail, bl);
state.availability = (gw_availability_t)avail;
if (struct_v >= 2) {
- decode(state.last_beacon_seq_number, bl);
+ decode(last_beacon_seq_number, bl);
+ state.last_beacon_seq_number = last_beacon_seq_number;
decode(state.last_beacon_seq_ooo, bl);
}
DECODE_FINISH(bl);
if (HAVE_FEATURE(features, NVMEOFHAMAP)) {
version = 3;
}
+ if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
+ version = 4;
+ }
ENCODE_START(version, version, bl);
dout(20) << "encode NvmeGwMonStates. struct_v: " << (int)version << dendl;
encode ((uint32_t)gws.size(), bl); // number of gws in the group
gw.second.addr_vect.encode(bl, features);
encode(gw.second.beacon_index, bl);
}
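+    // struct_v 4 (gated on the NVMEOF_BEACON_DIFF feature) appends the
+    // administrative state and the location of each GW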
+    if (version >= 4) {
+      encode((int)gw.second.gw_admin_state, bl);
+      dout(20) << "encode location " << gw.second.location << dendl;
+      encode(gw.second.location, bl);
+    }
}
ENCODE_FINISH(bl);
}
NvmeGwMonStates& gws, ceph::buffer::list::const_iterator &bl) {
gws.clear();
uint32_t num_created_gws;
- DECODE_START(3, bl);
+ DECODE_START(4, bl);
dout(20) << "decode NvmeGwMonStates. struct_v: " << struct_v << dendl;
decode(num_created_gws, bl);
dout(20) << "decode NvmeGwMonStates. num gws " << num_created_gws << dendl;
decode(gw_created.beacon_index, bl);
dout(20) << "decoded beacon_index " << gw_created.beacon_index << dendl;
}
+    if (struct_v >= 4) {
+      dout(20) << "decode admin state and location" << dendl;
+      int admin_state;
+      decode(admin_state, bl);
+      gw_created.gw_admin_state = (gw_admin_state_t)admin_state;
+      decode(gw_created.location, bl);
+      dout(20) << "decoded location " << gw_created.location << dendl;
+    }
gws[gw_name] = gw_created;
}
#include "msg/msg_types.h"
using NvmeGwId = std::string;
+using NvmeLocation = std::string;
using NvmeGroupKey = std::pair<std::string, std::string>;
using NvmeNqnId = std::string;
using NvmeAnaGrpId = uint32_t;
GW_DELETED
};
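+// administrative state of a GW, driven by the "nvme-gw enable" and
+// "nvme-gw disable" mon commands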
+enum class gw_admin_state_t {
+ GW_ADMIN_ENABLED = 0,
+ GW_ADMIN_DISABLED,
+};
+
enum class subsystem_change_t {
SUBSYSTEM_ADDED,
SUBSYSTEM_CHANGED,
* it from being overriden by new epochs in monitor's function create_pending -
* function restore_pending_map_info is called for this purpose
*/
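+  // administrative state and location of this GW, set via the
+  // "nvme-gw enable"/"nvme-gw disable" and "nvme-gw set-locale" commands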
+  gw_admin_state_t gw_admin_state = gw_admin_state_t::GW_ADMIN_ENABLED;
+  NvmeLocation location;
std::chrono::system_clock::time_point allow_failovers_ts =
std::chrono::system_clock::now();
std::chrono::system_clock::time_point last_gw_down_ts =