git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/NVMeofGw*: fixing bugs - handle gw fast-reboot, proper handle of gw delete scenarios 59385/head
author: Leonid Chernin <leonidc@il.ibm.com>
Wed, 21 Aug 2024 16:30:14 +0000 (16:30 +0000)
committer: Leonid Chernin <leonidc@il.ibm.com>
Mon, 26 Aug 2024 08:21:38 +0000 (08:21 +0000)
Signed-off-by: Leonid Chernin <leonidc@il.ibm.com>
NVMeofGwMap.h [new file with mode: 0755]
src/mon/NVMeofGwMap.cc
src/mon/NVMeofGwMap.h
src/mon/NVMeofGwMon.cc

diff --git a/NVMeofGwMap.h b/NVMeofGwMap.h
new file mode 100755 (executable)
index 0000000..f7955e5
--- /dev/null
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef MON_NVMEOFGWMAP_H_
+#define MON_NVMEOFGWMAP_H_
+#include <map>
+#include <iostream>
+#include "include/encoding.h"
+#include "include/utime.h"
+#include "common/Formatter.h"
+#include "common/ceph_releases.h"
+#include "common/version.h"
+#include "common/options.h"
+#include "common/Clock.h"
+#include "msg/Message.h"
+#include "common/ceph_time.h"
+#include "NVMeofGwTypes.h"
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define MODULE_PREFFIX "nvmeofgw "
+#define dout_prefix *_dout << MODULE_PREFFIX << __PRETTY_FUNCTION__ << " "
+
+
+static const version_t STRUCT_VERSION = 2;
+static const version_t OLD_STRUCT_VERSION = 1;
+
+using ceph::coarse_mono_clock;
+class Monitor;
+/*-------------------*/
+// Monitor-side, Paxos-replicated map of NVMe-oF gateway state: the gateways
+// created per group, their per-ANA-group FSM states, and the timers started
+// by those FSMs.  encode()/decode() serialize it for mon<->mon / mon<->client
+// exchange.
+class NVMeofGwMap
+{
+public:
+    Monitor*                            mon           = NULL;
+    epoch_t                             epoch         = 0;      // epoch used for Paxos synchronization
+    bool                                delay_propose = false;
+    std::map<entity_addrvec_t , uint32_t>   peer_addr_2_version;
+    std::map<NvmeGroupKey, NvmeGwMonStates>  created_gws;
+    std::map<NvmeGroupKey, NvmeGwTimers> fsm_timers;// timers started by all Gateway FSMs
+    void to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const;
+
+    int   cfg_add_gw                    (const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+    int   cfg_delete_gw                 (const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+    void  process_gw_map_ka             (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, epoch_t& last_osd_epoch,  bool &propose_pending);
+    int   process_gw_map_gw_down        (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending);
+    void  update_active_timers          (bool &propose_pending);
+    void  handle_abandoned_ana_groups   (bool &propose_pending);
+    void  handle_removed_subsystems     (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const std::vector<NvmeNqnId> &current_subsystems, bool &propose_pending);
+    void  start_timer (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid, uint8_t value);
+private:
+    void add_grp_id   (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid);
+    void remove_grp_id(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid);
+    void fsm_handle_gw_down    (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  gw_states_per_group_t state, NvmeAnaGrpId grpid,  bool &map_modified);
+    void fsm_handle_gw_delete  (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  gw_states_per_group_t state, NvmeAnaGrpId grpid,  bool &map_modified);
+    void fsm_handle_gw_alive   (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeGwMonState & gw_state, gw_states_per_group_t state,
+                                                                                   NvmeAnaGrpId grpid, epoch_t& last_osd_epoch, bool &map_modified);
+    void fsm_handle_to_expired (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeAnaGrpId grpid,  bool &map_modified);
+
+    void find_failover_candidate(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeAnaGrpId grpid, bool &propose_pending);
+    void find_failback_gw       (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  bool &propose_pending);
+    void set_failover_gw_for_ANA_group (const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key, const NvmeGwId &gw_id,
+                                                                                                     NvmeAnaGrpId groupid);
+
+
+    int  get_timer   (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid);
+    void cancel_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid);
+    void validate_gw_map(const NvmeGroupKey& group_key);
+
+public:
+    int  blocklist_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId ANA_groupid, epoch_t &epoch, bool failover);
+
+    // Serialize the map.  Encodes as STRUCT_VERSION (v2) when the peer
+    // advertises SERVER_SQUID, otherwise as OLD_STRUCT_VERSION (v1).  The
+    // v2-only payload (peer_addr_2_version) is commented out below, so both
+    // versions currently produce an identical wire format.
+    void encode(ceph::buffer::list &bl, uint64_t features) const {
+      uint8_t   version;
+      if (HAVE_FEATURE(features, SERVER_SQUID)) version = STRUCT_VERSION;
+      else                                      version = OLD_STRUCT_VERSION;
+      ENCODE_START(version, 1, bl);
+      // NOTE(review): 'version' is streamed twice here; the second insertion
+      // is a raw uint8_t and will print as an unprintable character - the
+      // trailing "<< version" looks unintended.
+      dout(4) << "encode1 version " << (uint64_t)version  << version << " features " << features << dendl;
+      using ceph::encode;
+      encode(epoch, bl);// global map epoch
+      if (version == STRUCT_VERSION) {
+        //encode(peer_addr_2_version, bl);
+      }
+      encode(created_gws, bl, features); //Encode created GWs
+      encode(fsm_timers, bl, features);
+      ENCODE_FINISH(bl);
+    }
+
+    // Deserialize the map; accepts struct versions back to 1 (DECODE_OLDEST).
+    // The v2-only branch is a placeholder matching the one in encode().
+    void decode(ceph::buffer::list::const_iterator &bl) {
+      using ceph::decode;
+      epoch_t struct_version = 0;
+      DECODE_START(STRUCT_VERSION, bl);
+      DECODE_OLDEST(1);
+      struct_version = struct_v;
+      dout(4) << "decode version " << struct_version   << dendl;
+      decode(epoch, bl);
+      if (struct_version == STRUCT_VERSION) {
+        //dout(4) << "Decode peer_2_addr " << dendl;
+        //decode(peer_addr_2_version, bl);
+      }
+      decode(created_gws, bl);
+      decode(fsm_timers, bl);
+      DECODE_FINISH(bl);
+    }
+};
+
+#include "NVMeofGwSerialize.h"
+
+#endif /* SRC_MON_NVMEOFGWMAP_H_ */
index 646d56d30e6cfc9adedd061197558769aab7313b..658708a069fa684a6dc0beeb1ccd830864451c46 100755 (executable)
@@ -424,6 +424,61 @@ void  NVMeofGwMap::find_failover_candidate(
   }
 }
 
+// A gateway that was still marked Available in the monitor map has performed
+// a fast reboot: walk every ANA group tracked in its FSM state and re-apply
+// that group's state via fsm_handle_gw_fast_reboot().
+void  NVMeofGwMap::handle_gw_performing_fast_reboot(const NvmeGwId &gw_id,
+     const NvmeGroupKey& group_key, bool &map_modified)
+{
+  for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) {
+    fsm_handle_gw_fast_reboot(gw_id,group_key, state_itr.first, map_modified);
+  }
+}
+
+// Re-apply the FSM state of one (gateway, ANA group) pair after the gateway
+// fast-rebooted while still marked Available in the monitor map.  Stable
+// states are left alone; the two wait states restart their timers; the
+// owner-failback state is demoted back to standby.  map_modified is set to
+// true unconditionally.
+void NVMeofGwMap::fsm_handle_gw_fast_reboot(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &map_modified)
+{
+  // GW that appears in the internal map as Available, performed reboot,
+  // need to re-apply this GW: to load proper states for all active ANA groups
+  auto& gw_state = created_gws[group_key][gw_id];
+  map_modified = true;
+  gw_states_per_group_t  state = gw_state.sm_state[grpid];
+  dout(10) << "GW " << gw_id  << " ANA groupId: " << grpid << " state "
+        << state << dendl;
+  switch (state){
+  case gw_states_per_group_t::GW_IDLE_STATE:
+  case gw_states_per_group_t::GW_STANDBY_STATE:
+  case gw_states_per_group_t::GW_ACTIVE_STATE:
+    // stable states - nothing to redo after the reboot
+    break;
+
+  case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:
+  {
+    //restart timeout
+    // NOTE(review): magic timer value 3 - presumably seconds; confirm units
+    start_timer(gw_id, group_key, grpid, 3);
+  }
+  break;
+
+  case gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED:
+  {
+    // since the owner was reset for this group, wait for the background
+    // process to choose it again
+    gw_state.standby_state(grpid);
+  }
+  break;
+
+  case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL:
+  {
+    //restart timer
+    // The blocklist was started, need to wait for the epoch in the GW
+    // NOTE(review): magic timer value 30 - presumably seconds; confirm units
+    start_timer(gw_id, group_key, grpid, 30);
+  }
+  break;
+
+  default:
+  {
+    dout(4) << "Warning: GW " << gw_id  << " Invalid state " << state << dendl;
+  }
+  }
+  validate_gw_map(group_key);
+}
+
 void NVMeofGwMap::fsm_handle_gw_alive(
   const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
   NvmeGwMonState & gw_state, gw_states_per_group_t state,
index c128d7c9285646b61ae7509a58f219c5a5745041..1141b8ec09022569d8cdd2375082a3133f872d94 100755 (executable)
@@ -61,6 +61,8 @@ public:
   void start_timer(
     const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     NvmeAnaGrpId anagrpid, uint8_t value);
+  void handle_gw_performing_fast_reboot(const NvmeGwId &gw_id,
+       const NvmeGroupKey& group_key, bool &map_modified);
 private:
   void add_grp_id(
     const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
@@ -81,7 +83,9 @@ private:
   void fsm_handle_to_expired(
     const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     NvmeAnaGrpId grpid,  bool &map_modified);
-
+  void fsm_handle_gw_fast_reboot(const NvmeGwId &gw_id,
+      const NvmeGroupKey& group_key, NvmeAnaGrpId grpid,
+      bool &map_modified);
   void find_failover_candidate(
     const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     NvmeAnaGrpId grpid, bool &propose_pending);
index ac4a6e199fbd226748ae70d8d12d96480c2fc48c..5f41ebab5a770bcfa8f2cf57366fac271af0dccb 100644 (file)
@@ -394,6 +394,10 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
        err = 0;
        sstrm.str("");
       }
+      if (rc == 0) {
+        LastBeacon lb = {id, group_key};
+        last_beacon.erase(lb);
+      }
     }
     // propose pending would be generated by the PaxosService
     if ((rc != -EEXIST) && (rc != -EINVAL)) {
@@ -450,6 +454,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
   auto& group_gws = map.created_gws[group_key];
   auto gw = group_gws.find(gw_id);
   const BeaconSubsystems& sub = m->get_subsystems();
+  auto now = ceph::coarse_mono_clock::now();
 
   if (avail == gw_availability_t::GW_CREATED) {
     if (gw == group_gws.end()) {
@@ -466,17 +471,18 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
       if (pending_map.created_gws[group_key][gw_id].availability ==
          gw_availability_t::GW_AVAILABLE) {
        dout(4) << " Warning :GW marked as Available in the NVmeofGwMon "
-               << "database, performed full startup - Force gw to exit!"
+               << "database, performed full startup - Apply GW!"
                << gw_id << dendl;
-       avail = gw_availability_t::GW_UNAVAILABLE;
-       // Monitor performs Force Failover for this GW in process_gw_map_gw_down
+        pending_map.handle_gw_performing_fast_reboot(gw_id, group_key, propose);
+        LastBeacon lb = {gw_id, group_key};
+        last_beacon[lb] = now; //Update last beacon
       } else if (
        pending_map.created_gws[group_key][gw_id].performed_full_startup ==
        false) {
        pending_map.created_gws[group_key][gw_id].performed_full_startup = true;
        propose = true;
-       goto set_propose;
       }
+      goto set_propose;
     }
   // gw already created
   } else {
@@ -542,7 +548,6 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
             << " beacon_epoch " << m->get_last_gwmap_epoch() <<  dendl;
   }
   if (avail == gw_availability_t::GW_AVAILABLE) {
-    auto now = ceph::coarse_mono_clock::now();
     // check pending_map.epoch vs m->get_version() -
     // if different - drop the beacon