From: Leonid Chernin Date: Wed, 21 Aug 2024 16:30:14 +0000 (+0000) Subject: mon/NVMeofGw*: fixing bugs - handle gw fast-reboot, proper handle of gw delete scenarios X-Git-Tag: v20.0.0~1182^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=8d402e31f5eaadfefe0afe909f5ad3bc5a907e18;p=ceph.git mon/NVMeofGw*: fixing bugs - handle gw fast-reboot, proper handle of gw delete scenarios Signed-off-by: Leonid Chernin --- diff --git a/NVMeofGwMap.h b/NVMeofGwMap.h new file mode 100755 index 0000000000000..f7955e5b53115 --- /dev/null +++ b/NVMeofGwMap.h @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2023 IBM, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MON_NVMEOFGWMAP_H_ +#define MON_NVMEOFGWMAP_H_ +#include +#include +#include "include/encoding.h" +#include "include/utime.h" +#include "common/Formatter.h" +#include "common/ceph_releases.h" +#include "common/version.h" +#include "common/options.h" +#include "common/Clock.h" +#include "msg/Message.h" +#include "common/ceph_time.h" +#include "NVMeofGwTypes.h" +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define MODULE_PREFFIX "nvmeofgw " +#define dout_prefix *_dout << MODULE_PREFFIX << __PRETTY_FUNCTION__ << " " + + +static const version_t STRUCT_VERSION = 2; +static const version_t OLD_STRUCT_VERSION = 1; + +using ceph::coarse_mono_clock; +class Monitor; +/*-------------------*/ +class NVMeofGwMap +{ +public: + Monitor* mon = NULL; + epoch_t epoch = 0; // epoch is for Paxos synchronization mechanizm + bool delay_propose = false; + std::map peer_addr_2_version; + std::map created_gws; + std::map fsm_timers;// map that handles timers 
started by all Gateway FSMs + void to_gmap(std::map& Gmap) const; + + int cfg_add_gw (const NvmeGwId &gw_id, const NvmeGroupKey& group_key); + int cfg_delete_gw (const NvmeGwId &gw_id, const NvmeGroupKey& group_key); + void process_gw_map_ka (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, epoch_t& last_osd_epoch, bool &propose_pending); + int process_gw_map_gw_down (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending); + void update_active_timers (bool &propose_pending); + void handle_abandoned_ana_groups (bool &propose_pending); + void handle_removed_subsystems (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const std::vector &current_subsystems, bool &propose_pending); + void start_timer (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid, uint8_t value); +private: + void add_grp_id (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid); + void remove_grp_id(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid); + void fsm_handle_gw_down (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, gw_states_per_group_t state, NvmeAnaGrpId grpid, bool &map_modified); + void fsm_handle_gw_delete (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, gw_states_per_group_t state, NvmeAnaGrpId grpid, bool &map_modified); + void fsm_handle_gw_alive (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeGwMonState & gw_state, gw_states_per_group_t state, + NvmeAnaGrpId grpid, epoch_t& last_osd_epoch, bool &map_modified); + void fsm_handle_to_expired (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &map_modified); + + void find_failover_candidate(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &propose_pending); + void find_failback_gw (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending); + void set_failover_gw_for_ANA_group (const NvmeGwId &failed_gw_id, const NvmeGroupKey& 
group_key, const NvmeGwId &gw_id, + NvmeAnaGrpId groupid); + + + int get_timer (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid); + void cancel_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid); + void validate_gw_map(const NvmeGroupKey& group_key); + +public: + int blocklist_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId ANA_groupid, epoch_t &epoch, bool failover); + + void encode(ceph::buffer::list &bl, uint64_t features) const { + uint8_t version; + if (HAVE_FEATURE(features, SERVER_SQUID)) version = STRUCT_VERSION; + else version = OLD_STRUCT_VERSION; + ENCODE_START(version, 1, bl); + dout(4) << "encode1 version " << (uint64_t)version << version << " features " << features << dendl; + using ceph::encode; + encode(epoch, bl);// global map epoch + if (version == STRUCT_VERSION) { + //encode(peer_addr_2_version, bl); + } + encode(created_gws, bl, features); //Encode created GWs + encode(fsm_timers, bl, features); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator &bl) { + using ceph::decode; + epoch_t struct_version = 0; + DECODE_START(STRUCT_VERSION, bl); + DECODE_OLDEST(1); + struct_version = struct_v; + dout(4) << "decode version " << struct_version << dendl; + decode(epoch, bl); + if (struct_version == STRUCT_VERSION) { + //dout(4) << "Decode peer_2_addr " << dendl; + //decode(peer_addr_2_version, bl); + } + decode(created_gws, bl); + decode(fsm_timers, bl); + DECODE_FINISH(bl); + } +}; + +#include "NVMeofGwSerialize.h" + +#endif /* SRC_MON_NVMEOFGWMAP_H_ */ diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 646d56d30e6cf..658708a069fa6 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -424,6 +424,61 @@ void NVMeofGwMap::find_failover_candidate( } } +void NVMeofGwMap::handle_gw_performing_fast_reboot(const NvmeGwId &gw_id, + const NvmeGroupKey& group_key, bool &map_modified) +{ + for (auto& state_itr: 
created_gws[group_key][gw_id].sm_state ) { + fsm_handle_gw_fast_reboot(gw_id,group_key, state_itr.first, map_modified); + } +} + +void NVMeofGwMap::fsm_handle_gw_fast_reboot(const NvmeGwId &gw_id, + const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &map_modified) +{ + // GW that appears in the internal map as Available, performed reboot, + // need to re-apply this GW: to load proper states for all active ANA groups + auto& gw_state = created_gws[group_key][gw_id]; + map_modified = true; + gw_states_per_group_t state = gw_state.sm_state[grpid]; + dout(10) << "GW " << gw_id << " ANA groupId: " << grpid << " state " + << state << dendl; + switch (state){ + case gw_states_per_group_t::GW_IDLE_STATE: + case gw_states_per_group_t::GW_STANDBY_STATE: + case gw_states_per_group_t::GW_ACTIVE_STATE: + break; + + case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED: + { + //restart timeout + start_timer(gw_id, group_key, grpid, 3); + } + break; + + case gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED: + { + // since owner was reseted for this group, wait for the background process + // to choose it again + gw_state.standby_state(grpid); + } + break; + + case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL: + { + //restart timer + // The blocklist was started, need to wait for the epoch in the GW + start_timer(gw_id, group_key, grpid, 30); + } + break; + + default: + { + dout(4) << "Warning: GW " << gw_id << " Invalid state " << state << dendl; + } + } + validate_gw_map(group_key); +} + void NVMeofGwMap::fsm_handle_gw_alive( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeGwMonState & gw_state, gw_states_per_group_t state, diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index c128d7c928564..1141b8ec09022 100755 --- a/src/mon/NVMeofGwMap.h +++ b/src/mon/NVMeofGwMap.h @@ -61,6 +61,8 @@ public: void start_timer( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid, uint8_t value); + void handle_gw_performing_fast_reboot(const 
NvmeGwId &gw_id, + const NvmeGroupKey& group_key, bool &map_modified); private: void add_grp_id( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, @@ -81,7 +83,9 @@ private: void fsm_handle_to_expired( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &map_modified); - + void fsm_handle_gw_fast_reboot(const NvmeGwId &gw_id, + const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, + bool &map_modified); void find_failover_candidate( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &propose_pending); diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index ac4a6e199fbd2..5f41ebab5a770 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -394,6 +394,10 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op) err = 0; sstrm.str(""); } + if (rc == 0) { + LastBeacon lb = {id, group_key}; + last_beacon.erase(lb); + } } // propose pending would be generated by the PaxosService if ((rc != -EEXIST) && (rc != -EINVAL)) { @@ -450,6 +454,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) auto& group_gws = map.created_gws[group_key]; auto gw = group_gws.find(gw_id); const BeaconSubsystems& sub = m->get_subsystems(); + auto now = ceph::coarse_mono_clock::now(); if (avail == gw_availability_t::GW_CREATED) { if (gw == group_gws.end()) { @@ -466,17 +471,18 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) if (pending_map.created_gws[group_key][gw_id].availability == gw_availability_t::GW_AVAILABLE) { dout(4) << " Warning :GW marked as Available in the NVmeofGwMon " - << "database, performed full startup - Force gw to exit!" + << "database, performed full startup - Apply GW!" 
<< gw_id << dendl; - avail = gw_availability_t::GW_UNAVAILABLE; - // Monitor performs Force Failover for this GW in process_gw_map_gw_down + pending_map.handle_gw_performing_fast_reboot(gw_id, group_key, propose); + LastBeacon lb = {gw_id, group_key}; + last_beacon[lb] = now; //Update last beacon } else if ( pending_map.created_gws[group_key][gw_id].performed_full_startup == false) { pending_map.created_gws[group_key][gw_id].performed_full_startup = true; propose = true; - goto set_propose; } + goto set_propose; } // gw already created } else { @@ -542,7 +548,6 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) << " beacon_epoch " << m->get_last_gwmap_epoch() << dendl; } if (avail == gw_availability_t::GW_AVAILABLE) { - auto now = ceph::coarse_mono_clock::now(); // check pending_map.epoch vs m->get_version() - // if different - drop the beacon