mon: add NVMe-oF gateway monitor and HA

author Leonid Chernin <lechernin@gmail.com>

Tue, 17 Oct 2023 13:25:07 +0000 (13:25 +0000)

committer Alexander Indenbaum <aindenba@redhat.com>

Wed, 31 Jul 2024 08:50:10 +0000 (08:50 +0000)
author Leonid Chernin <lechernin@gmail.com>
Tue, 17 Oct 2023 13:25:07 +0000 (13:25 +0000)
committer Alexander Indenbaum <aindenba@redhat.com>
Wed, 31 Jul 2024 08:50:10 +0000 (08:50 +0000)
diff --git a/.gitmodules b/.gitmodules

index c4f68c6b2fa4302a3f49d342b8f3028079b51818..a44fccbedcfeb91df5e3efcd64d8bfab2055a7ee 100644 (file)
--- a/.gitmodules
+++ b/.gitmodules
@@ -78,4 +78,11 @@
  [submodule "src/BLAKE3"]
         path = src/BLAKE3
         url = https://github.com/BLAKE3-team/BLAKE3.git
-
+[submodule "src/boost_redis"]
+       path = src/boost_redis
+       url = https://github.com/boostorg/redis.git
+[submodule "src/nvmeof/gateway"]
+       path = src/nvmeof/gateway
+       url = https://github.com/ceph/ceph-nvmeof.git
+       fetchRecurseSubmodules = false
+       shallow = true
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index 25fcbb70db08b306754b5d1900a14f1e3dad1076..391b8e69cfbbbaf4ae2db3835b37ab7572b69cbe 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -506,3 +506,11 @@ Relevant tracker: https://tracker.ceph.com/issues/57090
  set using the `fs set` command. This flag prevents using a standby for another
  file system (join_fs = X) when standby for the current filesystem is not available.
  Relevant tracker: https://tracker.ceph.com/issues/61599
+* mon: add NVMe-oF gateway monitor and HA
+  This PR adds high availability support for the nvmeof Ceph service. High availability
+means that even in the case that a certain GW is down, there will be another available
+path for the initiator to be able to continue the IO through another GW.
+It is also adding 2 new mon commands, to notify monitor about the gateway creation/deletion:
+  - nvme-gw create
+  - nvme-gw delete
+Relevant tracker: https://tracker.ceph.com/issues/64777
diff --git a/ceph.spec.in b/ceph.spec.in

index fae1e390ebab76303554088ced247cadcdb02e1f..686b9388c9427d3281353a701ecda36093fdb52e 100644 (file)
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -250,6 +250,7 @@ BuildRequires:      gperf
  BuildRequires:  cmake > 3.5
  BuildRequires: fuse-devel
  BuildRequires: git
+BuildRequires: grpc-devel
  %if 0%{?fedora} || 0%{?suse_version} > 1500 || 0%{?rhel} == 9 || 0%{?openEuler}
  BuildRequires: gcc-c++ >= 11
  %endif
@@ -642,6 +643,17 @@ system. One or more instances of ceph-mon form a Paxos part-time
  parliament cluster that provides extremely reliable and durable storage
  of cluster membership, configuration, and state.
  
+%package mon-client-nvmeof
+Summary:       Ceph NVMeoF Gateway Monitor Client
+%if 0%{?suse_version}
+Group:         System/Filesystems
+%endif
+Provides:      ceph-test:/usr/bin/ceph-nvmeof-monitor-client
+Requires:      librados2 = %{_epoch_prefix}%{version}-%{release}
+%description mon-client-nvmeof
+Ceph NVMeoF Gateway Monitor Client distributes Paxos ANA info
+to NVMeoF Gateway and provides beacons to the monitor daemon
+
  %package mgr
  Summary:        Ceph Manager Daemon
  %if 0%{?suse_version}
@@ -2077,6 +2089,9 @@ if [ $1 -ge 1 ] ; then
    fi
  fi
  
+%files mon-client-nvmeof
+%{_bindir}/ceph-nvmeof-monitor-client
+
  %files fuse
  %{_bindir}/ceph-fuse
  %{_mandir}/man8/ceph-fuse.8*
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

index 79b45ef171f9706c5d5c3179c2c7c2aa3e18711f..591ea5f357e1eda9b0e8ee31f4fc43148bce9630 100644 (file)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -305,6 +305,12 @@ endif(WITH_BLKIN)
  
  if(WITH_JAEGER)
    find_package(thrift 0.13.0 REQUIRED)
+
+  if(EXISTS "/etc/redhat-release" OR EXISTS "/etc/fedora-release")
+    # absl is installed as grpc build dependency on RPM based systems
+    add_definitions(-DHAVE_ABSEIL)
+  endif()
+
    include(BuildOpentelemetry)
    build_opentelemetry()
    add_library(jaeger_base INTERFACE)
@@ -875,6 +881,112 @@ if(WITH_FUSE)
    install(PROGRAMS mount.fuse.ceph DESTINATION ${CMAKE_INSTALL_SBINDIR})
  endif(WITH_FUSE)
  
+# NVMEOF GATEWAY MONITOR CLIENT
+# Supported on RPM-based platforms only, depends on grpc devel libraries/tools
+if(EXISTS "/etc/redhat-release" OR EXISTS "/etc/fedora-release")
+  option(WITH_NVMEOF_GATEWAY_MONITOR_CLIENT "build nvmeof gateway monitor client" ON)
+else()
+  option(WITH_NVMEOF_GATEWAY_MONITOR_CLIENT "build nvmeof gateway monitor client" OFF)
+endif()
+
+if(WITH_NVMEOF_GATEWAY_MONITOR_CLIENT)
+
+  # Find Protobuf installation
+  # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation.
+  option(protobuf_MODULE_COMPATIBLE TRUE)
+  find_package(Protobuf REQUIRED)
+
+  set(_REFLECTION grpc++_reflection)
+  if(CMAKE_CROSSCOMPILING)
+    find_program(_PROTOBUF_PROTOC protoc)
+  else()
+    set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)
+  endif()
+
+  # Find gRPC installation
+  # Looks for gRPCConfig.cmake file installed by gRPC's cmake installation.
+  find_package(gRPC CONFIG REQUIRED)
+  message(STATUS "Using gRPC ${gRPC_VERSION}")
+  set(_GRPC_GRPCPP gRPC::grpc++)
+  if(CMAKE_CROSSCOMPILING)
+    find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+  else()
+    set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>)
+  endif()
+
+  # Gateway Proto file
+  get_filename_component(nvmeof_gateway_proto "nvmeof/gateway/control/proto/gateway.proto" ABSOLUTE)
+  get_filename_component(nvmeof_gateway_proto_path "${nvmeof_gateway_proto}" PATH)
+
+  # Generated sources
+  set(nvmeof_gateway_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/gateway.pb.cc")
+  set(nvmeof_gateway_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/gateway.pb.h")
+  set(nvmeof_gateway_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/gateway.grpc.pb.cc")
+  set(nvmeof_gateway_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/gateway.grpc.pb.h")
+
+  add_custom_command(
+        OUTPUT "${nvmeof_gateway_proto_srcs}" "${nvmeof_gateway_proto_hdrs}" "${nvmeof_gateway_grpc_srcs}" "${nvmeof_gateway_grpc_hdrs}"
+        COMMAND ${_PROTOBUF_PROTOC}
+        ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+          --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+          -I "${nvmeof_gateway_proto_path}"
+          --experimental_allow_proto3_optional
+          --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+          "${nvmeof_gateway_proto}"
+        DEPENDS "${nvmeof_gateway_proto}")
+
+
+  # Monitor Proto file
+  get_filename_component(nvmeof_monitor_proto "nvmeof/gateway/control/proto/monitor.proto" ABSOLUTE)
+  get_filename_component(nvmeof_monitor_proto_path "${nvmeof_monitor_proto}" PATH)
+
+  # Generated sources
+  set(nvmeof_monitor_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/monitor.pb.cc")
+  set(nvmeof_monitor_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/monitor.pb.h")
+  set(nvmeof_monitor_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/monitor.grpc.pb.cc")
+  set(nvmeof_monitor_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/monitor.grpc.pb.h")
+
+  add_custom_command(
+        OUTPUT "${nvmeof_monitor_proto_srcs}" "${nvmeof_monitor_proto_hdrs}" "${nvmeof_monitor_grpc_srcs}" "${nvmeof_monitor_grpc_hdrs}"
+        COMMAND ${_PROTOBUF_PROTOC}
+        ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+          --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+          -I "${nvmeof_monitor_proto_path}"
+          --experimental_allow_proto3_optional
+          --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+          "${nvmeof_monitor_proto}"
+        DEPENDS "${nvmeof_monitor_proto}")
+
+  # Include generated *.pb.h files
+  include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+
+  set(ceph_nvmeof_monitor_client_srcs
+    ${nvmeof_gateway_proto_srcs}
+    ${nvmeof_gateway_proto_hdrs}
+    ${nvmeof_gateway_grpc_srcs}
+    ${nvmeof_gateway_grpc_hdrs}
+    ${nvmeof_monitor_proto_srcs}
+    ${nvmeof_monitor_proto_hdrs}
+    ${nvmeof_monitor_grpc_srcs}
+    ${nvmeof_monitor_grpc_hdrs}
+    ceph_nvmeof_monitor_client.cc
+    nvmeof/NVMeofGwClient.cc
+    nvmeof/NVMeofGwMonitorGroupClient.cc
+    nvmeof/NVMeofGwMonitorClient.cc)
+  add_executable(ceph-nvmeof-monitor-client ${ceph_nvmeof_monitor_client_srcs})
+  add_dependencies(ceph-nvmeof-monitor-client ceph-common)
+  target_link_libraries(ceph-nvmeof-monitor-client
+    client
+    mon
+    global-static
+    ceph-common
+    ${_REFLECTION}
+    ${_GRPC_GRPCPP}
+    )
+  install(TARGETS ceph-nvmeof-monitor-client DESTINATION bin)
+endif()
+# END OF NVMEOF GATEWAY MONITOR CLIENT
+
  if(WITH_DOKAN)
    add_subdirectory(dokan)
  endif(WITH_DOKAN)
diff --git a/src/ceph_nvmeof_monitor_client.cc b/src/ceph_nvmeof_monitor_client.cc

new file mode 100644 (file)

index 0000000..0545799
--- /dev/null
+++ b/src/ceph_nvmeof_monitor_client.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM Inc
+ *
+ * Author: Alexander Indenbaum <aindenba@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <pthread.h>
+
+#include "include/types.h"
+#include "include/compat.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "common/pick_address.h"
+#include "global/global_init.h"
+
+#include "nvmeof/NVMeofGwMonitorClient.h"
+
+static void usage()
+{
+  std::cout << "usage: ceph-nvmeof-monitor-client\n"
+               "        --gateway-name <GW_NAME>\n"
+               "        --gateway-address <GW_ADDRESS>\n"
+               "        --gateway-pool <CEPH_POOL>\n"
+               "        --gateway-group <GW_GROUP>\n"
+               "        --monitor-group-address <MONITOR_GROUP_ADDRESS>\n"
+               "        [flags]\n"
+           << std::endl;
+  generic_server_usage();
+}
+
+/**
+ * A short main() which just instantiates a Nvme and
+ * hands over control to that.
+ */
+int main(int argc, const char **argv)
+{
+  ceph_pthread_setname(pthread_self(), "ceph-nvmeof-monitor-client");
+
+  auto args = argv_to_vec(argc, argv);
+  if (args.empty()) {
+    std::cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    usage();
+    exit(0);
+  }
+
+  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, // maybe later use CODE_ENVIRONMENT_DAEMON,
+                        CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
+  pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
+
+  global_init_daemonize(g_ceph_context);
+  global_init_chdir(g_ceph_context);
+  common_init_finish(g_ceph_context);
+
+  NVMeofGwMonitorClient gw_monitor_client(argc, argv);
+  int rc = gw_monitor_client.init();
+  if (rc != 0) {
+      std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+      return rc;
+  }
+
+  return gw_monitor_client.main(args);
+}
+
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in

index 1b355d6e03ada8ef3e85f9bdf56e01f9b77e6ef0..b34e3c5a337bae28b4f56bb391a3ee854248be3c 100644 (file)
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -1755,6 +1755,13 @@ options:
    default: 500
    services:
    - mon
+- name: mon_max_nvmeof_epochs
+  type: int
+  level: advanced
+  desc: max number of nvmeof gateway maps to store
+  default: 500
+  services:
+  - mon
  - name: mon_max_osd
    type: int
    level: advanced
diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in

index 1ec9871b6a8ea3d06f5fe042b9f0a902ddfff019..ab1634bc154bfaef5572718cbdec020332fd881f 100644 (file)
--- a/src/common/options/mon.yaml.in
+++ b/src/common/options/mon.yaml.in
@@ -72,6 +72,25 @@ options:
    default: 30
    services:
    - mon
+- name: mon_nvmeofgw_beacon_grace
+  type: secs
+  level: advanced
+  desc: Period in seconds from last beacon to monitor marking a  NVMeoF gateway as
+    failed
+  default: 10
+  services:
+  - mon
+- name: mon_nvmeofgw_set_group_id_retry
+  type: uint
+  level: advanced
+  desc: Retry wait time in microsecond for set group id between the monitor client
+    and gateway
+  long_desc: The monitor server determines the gateway's group ID. If the monitor client
+    receives a monitor group ID assignment before the gateway is fully up during
+    initialization, a retry is required.
+  default: 1000
+  services:
+  - mon
  - name: mon_mgr_inactive_grace
    type: int
    level: advanced
@@ -1341,3 +1360,18 @@ options:
    with_legacy: true
    see_also:
    - osd_heartbeat_use_min_delay_socket
+- name: nvmeof_mon_client_disconnect_panic
+  type: secs
+  level: advanced
+  desc: The duration, expressed in seconds, after which the nvmeof gateway
+    should trigger a panic if it loses connection to the monitor
+  default: 100
+  services:
+  - mon
+- name: nvmeof_mon_client_tick_period
+  type: secs
+  level: advanced
+  desc: Period in seconds of nvmeof gateway beacon messages to monitor
+  default: 2
+  services:
+  - mon
diff --git a/src/messages/MNVMeofGwBeacon.h b/src/messages/MNVMeofGwBeacon.h

new file mode 100644 (file)

index 0000000..26fc8dc
--- /dev/null
+++ b/src/messages/MNVMeofGwBeacon.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_NVMEOFGWBEACON_H
+#define CEPH_NVMEOFGWBEACON_H
+
+#include <cstddef>
+#include <vector>
+#include "messages/PaxosServiceMessage.h"
+#include "mon/MonCommand.h"
+#include "mon/NVMeofGwMap.h"
+#include "include/types.h"
+
+class MNVMeofGwBeacon final : public PaxosServiceMessage {
+private:
+  static constexpr int HEAD_VERSION = 1;
+  static constexpr int COMPAT_VERSION = 1;
+
+protected:
+    std::string       gw_id;
+    std::string       gw_pool;
+    std::string       gw_group;
+    BeaconSubsystems  subsystems;                           // gateway susbsystem and their state machine states
+    gw_availability_t availability;                         // in absence of  beacon  heartbeat messages it becomes inavailable
+    epoch_t           last_osd_epoch;
+    epoch_t           last_gwmap_epoch;
+
+public:
+  MNVMeofGwBeacon()
+    : PaxosServiceMessage{MSG_MNVMEOF_GW_BEACON, 0, HEAD_VERSION, COMPAT_VERSION}
+  {
+    set_priority(CEPH_MSG_PRIO_HIGH);
+  }
+
+  MNVMeofGwBeacon(const std::string &gw_id_,
+        const std::string& gw_pool_,
+        const std::string& gw_group_,
+        const BeaconSubsystems& subsystems_,
+        const gw_availability_t& availability_,
+        const epoch_t& last_osd_epoch_,
+        const epoch_t& last_gwmap_epoch_
+  )
+    : PaxosServiceMessage{MSG_MNVMEOF_GW_BEACON, 0, HEAD_VERSION, COMPAT_VERSION},
+      gw_id(gw_id_), gw_pool(gw_pool_), gw_group(gw_group_), subsystems(subsystems_),
+      availability(availability_), last_osd_epoch(last_osd_epoch_), last_gwmap_epoch(last_gwmap_epoch_)
+  {
+    set_priority(CEPH_MSG_PRIO_HIGH);
+  }
+
+  const std::string& get_gw_id() const { return gw_id; }
+  const std::string& get_gw_pool() const { return gw_pool; }
+  const std::string& get_gw_group() const { return gw_group; }
+  NvmeAnaNonceMap get_nonce_map() const {
+    NvmeAnaNonceMap nonce_map;
+    for (const auto& sub: subsystems) {
+      for (const auto& ns: sub.namespaces) {
+        auto& nonce_vec = nonce_map[ns.anagrpid-1];//Converting   ana groups to offsets
+        if (std::find(nonce_vec.begin(), nonce_vec.end(), ns.nonce) == nonce_vec.end())
+          nonce_vec.push_back(ns.nonce);
+      }
+    }
+    return nonce_map;
+  }
+
+  const gw_availability_t& get_availability()   const   { return availability; }
+  const epoch_t&           get_last_osd_epoch() const   { return last_osd_epoch; }
+  const epoch_t&           get_last_gwmap_epoch() const { return last_gwmap_epoch; }
+  const BeaconSubsystems&  get_subsystems()     const   { return subsystems; };
+
+private:
+  ~MNVMeofGwBeacon() final {}
+
+public:
+
+  std::string_view get_type_name() const override { return "nvmeofgwbeacon"; }
+
+  void encode_payload(uint64_t features) override {
+    using ceph::encode;
+    paxos_encode();
+    encode(gw_id, payload);
+    encode(gw_pool, payload);
+    encode(gw_group, payload);
+    encode(subsystems, payload);
+    encode((uint32_t)availability, payload);
+    encode(last_osd_epoch, payload);
+    encode(last_gwmap_epoch, payload);
+  }
+
+  void decode_payload() override {
+    using ceph::decode;
+    auto p = payload.cbegin();
+    
+    paxos_decode(p);
+    decode(gw_id, p);
+    decode(gw_pool, p);
+    decode(gw_group, p);
+    decode(subsystems, p);
+    uint32_t tmp;
+    decode(tmp, p);
+    availability = static_cast<gw_availability_t>(tmp);
+    decode(last_osd_epoch, p);
+    decode(last_gwmap_epoch, p);
+  }
+
+private:
+  template<class T, typename... Args>
+  friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args);
+};
+
+
+#endif
diff --git a/src/messages/MNVMeofGwMap.h b/src/messages/MNVMeofGwMap.h

new file mode 100644 (file)

index 0000000..3affdd2
--- /dev/null
+++ b/src/messages/MNVMeofGwMap.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MNVMEOFGWMAP_H
+#define CEPH_MNVMEOFGWMAP_H
+
+#include "msg/Message.h"
+#include "mon/NVMeofGwMap.h"
+
+class MNVMeofGwMap final : public Message {
+private:
+  static constexpr int VERSION = 1;
+
+protected:
+  std::map<NvmeGroupKey, NvmeGwMonClientStates> map;
+  epoch_t                           gwmap_epoch;
+
+public:
+  const std::map<NvmeGroupKey, NvmeGwMonClientStates>& get_map() {return map;}
+  const epoch_t& get_gwmap_epoch() {return gwmap_epoch;}
+
+private:
+  MNVMeofGwMap() :
+    Message{MSG_MNVMEOF_GW_MAP} {}
+  MNVMeofGwMap(const NVMeofGwMap &map_) :
+    Message{MSG_MNVMEOF_GW_MAP}, gwmap_epoch(map_.epoch)
+  {
+    map_.to_gmap(map);
+  }
+  ~MNVMeofGwMap() final {}
+
+public:
+  std::string_view get_type_name() const override { return "nvmeofgwmap"; }
+
+  void decode_payload() override {
+    auto p = payload.cbegin();
+    int version;
+    decode(version, p);
+    if (version > VERSION)
+      throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, VERSION, version));
+    decode(gwmap_epoch, p);
+    decode(map, p);
+  }
+  void encode_payload(uint64_t features) override {
+    using ceph::encode;
+    encode(VERSION, payload);
+    encode(gwmap_epoch, payload);
+    encode(map, payload);
+  }
+private:
+  using RefCountedObject::put;
+  using RefCountedObject::get;
+  template<class T, typename... Args>
+  friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args);
+  template<class T, typename... Args>
+  friend MURef<T> crimson::make_message(Args&&... args);
+};
+
+#endif
diff --git a/src/mon/CMakeLists.txt b/src/mon/CMakeLists.txt

index 4019f854c99135e1d9c8fc224107d2f385c97c83..c5bf64f8c153a221b154e1b1e1d00c6c55e2ee23 100644 (file)
--- a/src/mon/CMakeLists.txt
+++ b/src/mon/CMakeLists.txt
@@ -21,6 +21,8 @@ set(lib_mon_srcs
    ConnectionTracker.cc
    HealthMonitor.cc
    KVMonitor.cc
+  NVMeofGwMon.cc
+  NVMeofGwMap.cc
    ../mds/MDSAuthCaps.cc
    ../mgr/mgr_commands.cc
    ../osd/OSDCap.cc
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h

index e9025b05ef772d359dc67be20575f50ed85c80a8..438cbcfd6d5805e913704439d0d7c8deea6a8d12 100644 (file)
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -1378,8 +1378,25 @@ COMMAND("config generate-minimal-conf",
         "Generate a minimal ceph.conf file",
         "config", "r")
  
+/* NVMeofGwMon*/
+COMMAND("nvme-gw create"
+    " name=id,type=CephString"
+    " name=pool,type=CephString"
+    " name=group,type=CephString",
+    "create nvmeof gateway id for (pool, group)",
+    "mgr", "rw")
+COMMAND("nvme-gw delete"
+    " name=id,type=CephString"
+    " name=pool,type=CephString"
+    " name=group,type=CephString",
+    "delete nvmeof gateway id for (pool, group)",
+    "mgr", "rw")
  
-
+COMMAND("nvme-gw show"
+   " name=pool,type=CephString"
+   " name=group,type=CephString",
+   " show nvmeof gateways within (pool, group)",
+   "mon", "r")
  
  // these are tell commands that were implemented as CLI commands in
  // the broken pre-octopus way that we want to allow to work when a
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc

index a70bfbe33c9deb8e5fa0960310c6fe9589d178fb..07e6bebab4971f5130cb5993636fe14c19a364ee 100644 (file)
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -84,6 +84,7 @@
  #include "MgrStatMonitor.h"
  #include "ConfigMonitor.h"
  #include "KVMonitor.h"
+#include "NVMeofGwMon.h"
  #include "mon/HealthMonitor.h"
  #include "common/config.h"
  #include "common/cmdparse.h"
@@ -247,6 +248,7 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
    paxos_service[PAXOS_HEALTH].reset(new HealthMonitor(*this, *paxos, "health"));
    paxos_service[PAXOS_CONFIG].reset(new ConfigMonitor(*this, *paxos, "config"));
    paxos_service[PAXOS_KV].reset(new KVMonitor(*this, *paxos, "kv"));
+  paxos_service[PAXOS_NVMEGW].reset(new NVMeofGwMon(*this, *paxos, "nvmeofgw"));
  
    bool r = mon_caps.parse("allow *", NULL);
    ceph_assert(r);
@@ -3617,7 +3619,10 @@ void Monitor::handle_command(MonOpRequestRef op)
      mgrmon()->dispatch(op);
      return;
    }
-
+  if (module == "nvme-gw"){
+      nvmegwmon()->dispatch(op);
+      return;
+  }
    if (prefix == "fsid") {
      if (f) {
        f->open_object_section("fsid");
@@ -4551,6 +4556,7 @@ void Monitor::_ms_dispatch(Message *m)
  void Monitor::dispatch_op(MonOpRequestRef op)
  {
    op->mark_event("mon:dispatch_op");
+
    MonSession *s = op->get_session();
    ceph_assert(s);
    if (s->closed) {
@@ -4664,6 +4670,11 @@ void Monitor::dispatch_op(MonOpRequestRef op)
        paxos_service[PAXOS_MGR]->dispatch(op);
        return;
  
+    case MSG_MNVMEOF_GW_BEACON:
+       paxos_service[PAXOS_NVMEGW]->dispatch(op);
+       return;
+
+
      // MgrStat
      case MSG_MON_MGR_REPORT:
      case CEPH_MSG_STATFS:
@@ -5351,6 +5362,9 @@ void Monitor::handle_subscribe(MonOpRequestRef op)
      } else if (p->first.find("kv:") == 0) {
        kvmon()->check_sub(s->sub_map[p->first]);
      }
+    else if (p->first == "NVMeofGw") {
+        nvmegwmon()->check_sub(s->sub_map[p->first]);
+    }
    }
  
    if (reply) {
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h

index 13afacafde7dd6b01ce4450499eb3a121d48eaf3..0f8481eea6dc90f097bcb8aa6f33e0cb42b0343c 100644 (file)
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -712,6 +712,11 @@ public:
      return (class KVMonitor*) paxos_service[PAXOS_KV].get();
    }
  
+  class NVMeofGwMon *nvmegwmon() {
+      return (class NVMeofGwMon*) paxos_service[PAXOS_NVMEGW].get();
+  }
+
+
    friend class Paxos;
    friend class OSDMonitor;
    friend class MDSMonitor;
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc

new file mode 100755 (executable)

index 0000000..9af9f81
--- /dev/null
+++ b/src/mon/NVMeofGwMap.cc
@@ -0,0 +1,659 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include <boost/tokenizer.hpp>
+#include "include/stringify.h"
+#include "NVMeofGwMon.h"
+#include "NVMeofGwMap.h"
+#include "OSDMonitor.h"
+
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::string;
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix *_dout << "nvmeofgw " << __PRETTY_FUNCTION__ << " "
+
+void NVMeofGwMap::to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const {
+    Gmap.clear();
+    for (const auto& created_map_pair: created_gws) {
+        const auto& group_key = created_map_pair.first;
+        const NvmeGwMonStates& gw_created_map = created_map_pair.second;
+        for (const auto& gw_created_pair: gw_created_map) {
+            const auto& gw_id = gw_created_pair.first;
+            const auto& gw_created  = gw_created_pair.second;
+
+            auto gw_state = NvmeGwClientState(gw_created.ana_grp_id, epoch, gw_created.availability);
+            for (const auto& sub: gw_created.subsystems) {
+                gw_state.subsystems.insert({sub.nqn, NqnState(sub.nqn, gw_created.sm_state, gw_created )});
+            }
+            Gmap[group_key][gw_id] = gw_state;
+            dout (20) << gw_id << " Gw-Client: " << gw_state << dendl;
+        }
+    }
+}
+
+void NVMeofGwMap::add_grp_id(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid)
+{
+    Tmdata tm_data;
+    Blocklist_data blklist_data;
+    created_gws[group_key][gw_id].sm_state[grpid] = gw_states_per_group_t::GW_STANDBY_STATE;
+    fsm_timers[group_key][gw_id].data[grpid] = tm_data;
+    created_gws[group_key][gw_id].blocklist_data[grpid] = blklist_data;
+}
+
+void NVMeofGwMap::remove_grp_id(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid)
+{
+    created_gws[group_key][gw_id].sm_state.erase(grpid);
+    created_gws[group_key][gw_id].blocklist_data.erase(grpid);
+    fsm_timers[group_key][gw_id].data.erase(grpid);
+}
+
+int  NVMeofGwMap::cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key) {
+    std::set<NvmeAnaGrpId> allocated;
+    for (auto& itr: created_gws[group_key]) {
+        allocated.insert(itr.second.ana_grp_id);
+        if (itr.first == gw_id) {
+            dout(1) << __func__ << " ERROR create GW: already exists in map " << gw_id << dendl;
+            return -EEXIST ;
+        }
+    }
+    // Allocate the new group id
+    NvmeAnaGrpId i = 0;
+    bool was_allocated = false;
+    for (NvmeAnaGrpId elem: allocated) {// "allocated" is a sorted set (!),so if found any gap between numbers, it should be filled
+        if (i != elem) {
+            allocated.insert(i);
+            was_allocated = true;
+            break;
+        }
+        i++;
+    }
+    if (!was_allocated) allocated.insert(i);
+    dout(10) << "allocated ANA groupId " << i << " for GW " << gw_id << dendl;
+    for (auto& itr: created_gws[group_key]) { // add new allocated grp_id to maps of created gateways
+         add_grp_id(itr.first, group_key, i);
+    }
+    NvmeGwMonState gw_created(i);
+    created_gws[group_key][gw_id] = gw_created;
+    created_gws[group_key][gw_id].performed_full_startup = true;
+    for (NvmeAnaGrpId elem: allocated) {
+        add_grp_id(gw_id, group_key, elem); // add all existed grp_ids to newly created gateway
+        dout(4) << "adding group " << elem << " to gw " << gw_id << dendl;
+    }
+    dout(10) << __func__ << " Created GWS:  " << created_gws  <<  dendl;
+    return 0;
+}
+
+int NVMeofGwMap::cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key) {
+    int rc = 0;
+    for (auto& gws_states: created_gws[group_key]) {
+
+        if (gws_states.first == gw_id) {
+            auto& state = gws_states.second;
+            for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) {
+                bool modified;
+                fsm_handle_gw_delete(gw_id, group_key,state_itr.second , state_itr.first, modified);
+            }
+            dout(10) << " Delete GW :"<< gw_id  << " ANA grpid: " << state.ana_grp_id  << dendl;
+            for (auto& itr: created_gws[group_key]) {
+                remove_grp_id(itr.first, group_key, state.ana_grp_id);// Update state map and other maps
+                                                                      // of all created gateways. Removed key = anagrp
+            }
+            fsm_timers[group_key].erase(gw_id);
+            if (fsm_timers[group_key].size() == 0)
+                fsm_timers.erase(group_key);
+
+            created_gws[group_key].erase(gw_id);
+            if (created_gws[group_key].size() == 0)
+                created_gws.erase(group_key);
+            return rc;
+        }
+    }
+
+    return -EINVAL;
+}
+
+
+int NVMeofGwMap::process_gw_map_gw_down(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+                                            bool &propose_pending) {
+    int rc = 0;
+    auto& gws_states = created_gws[group_key];
+    auto  gw_state = gws_states.find(gw_id);
+    if (gw_state != gws_states.end()) {
+        dout(10) << "GW down " << gw_id << dendl;
+        auto& st = gw_state->second;
+        st.set_unavailable_state();
+        for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) {
+            fsm_handle_gw_down(gw_id, group_key, state_itr.second, state_itr.first, propose_pending);
+            state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE;
+        }
+        propose_pending = true; // map should reflect that gw becames unavailable
+        if (propose_pending) validate_gw_map(group_key);
+    }
+    else {
+        dout(1)  << __FUNCTION__ << "ERROR GW-id was not found in the map " << gw_id << dendl;
+        rc = -EINVAL;
+    }
+    return rc;
+}
+
+
+void NVMeofGwMap::process_gw_map_ka(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  epoch_t& last_osd_epoch, bool &propose_pending)
+{
+    auto& gws_states = created_gws[group_key];
+    auto  gw_state = gws_states.find(gw_id);
+    auto& st = gw_state->second;
+    dout(20)  << "KA beacon from the GW " << gw_id << " in state " << (int)st.availability << dendl;
+
+    if (st.availability == gw_availability_t::GW_CREATED) {
+        // first time appears - allow IO traffic for this GW
+        st.availability = gw_availability_t::GW_AVAILABLE;
+        for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE;
+        if (st.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { // not a redundand GW
+            st.active_state(st.ana_grp_id);
+        }
+        propose_pending = true;
+    }
+    else if (st.availability == gw_availability_t::GW_UNAVAILABLE) {
+        st.availability = gw_availability_t::GW_AVAILABLE;
+        if (st.ana_grp_id == REDUNDANT_GW_ANA_GROUP_ID) {
+            for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE;
+            propose_pending = true;
+        }
+        else {
+            //========= prepare to Failback to this GW =========
+            // find the GW that took over on the group st.ana_grp_id
+            find_failback_gw(gw_id, group_key, propose_pending);
+        }
+    }
+    else if (st.availability == gw_availability_t::GW_AVAILABLE) {
+        for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) {
+          fsm_handle_gw_alive(gw_id, group_key, gw_state->second, state_itr.second, state_itr.first, last_osd_epoch, propose_pending);
+        }
+    }
+    if (propose_pending) validate_gw_map(group_key);
+}
+
+
+void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose)
+{
+    propose = false;
+    for (auto& group_state: created_gws) {
+        auto& group_key = group_state.first;
+        auto& gws_states = group_state.second;
+
+            for (auto& gw_state : gws_states) { // loop for GWs inside nqn group
+                auto& gw_id = gw_state.first;
+                NvmeGwMonState& state = gw_state.second;
+
+                //1. Failover missed : is there is a GW in unavailable state? if yes, is its ANA group handled by some other GW?
+                if (state.availability == gw_availability_t::GW_UNAVAILABLE && state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) {
+                    auto found_gw_for_ana_group = false;
+                    for (auto& gw_state2 : gws_states) {
+                        NvmeGwMonState& state2 = gw_state2.second;
+                        if (state2.availability == gw_availability_t::GW_AVAILABLE && state2.sm_state[state.ana_grp_id] == gw_states_per_group_t::GW_ACTIVE_STATE) {
+                            found_gw_for_ana_group = true;
+                            break;
+                        }
+                    }
+                    if (found_gw_for_ana_group == false) { //choose the GW for handle ana group
+                        dout(10)<< "Was not found the GW " << " that handles ANA grp " << (int)state.ana_grp_id << " find candidate "<< dendl;
+                        for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) {
+                            find_failover_candidate(gw_id, group_key, state_itr.first, propose);
+                        }
+                    }
+                }
+
+                //2. Failback missed: Check this GW is Available and Standby and no other GW is doing Failback to it
+                else if (state.availability == gw_availability_t::GW_AVAILABLE
+                            && state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID &&
+                            state.sm_state[state.ana_grp_id] == gw_states_per_group_t::GW_STANDBY_STATE)
+                {
+                    find_failback_gw(gw_id, group_key, propose);
+                }
+            }
+            if (propose) {
+                validate_gw_map(group_key);
+            }
+    }
+}
+
+
+void  NVMeofGwMap::set_failover_gw_for_ANA_group(const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key, const NvmeGwId &gw_id,  NvmeAnaGrpId ANA_groupid)
+{
+    NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
+    epoch_t epoch;
+    dout(10) << "Found failover GW " << gw_id << " for ANA group " << (int)ANA_groupid << dendl;
+    int rc = blocklist_gw(failed_gw_id, group_key, ANA_groupid, epoch, true);
+    if (rc) {
+        gw_state.active_state(ANA_groupid); //start failover even when nonces are empty !
+    }
+    else{
+        gw_state.sm_state[ANA_groupid] = gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL;
+        gw_state.blocklist_data[ANA_groupid].osd_epoch = epoch;
+        gw_state.blocklist_data[ANA_groupid].is_failover = true;
+        start_timer(gw_id, group_key, ANA_groupid, 30); //start Failover preparation timer
+    }
+}
+
+void  NVMeofGwMap::find_failback_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,   bool &propose)
+{
+    auto& gws_states = created_gws[group_key];
+    auto& gw_state = created_gws[group_key][gw_id];
+    bool do_failback = false;
+
+    dout(10) << "Find failback GW for GW " << gw_id << dendl;
+    for (auto& gw_state_it: gws_states) {
+        auto& st = gw_state_it.second;
+        if (st.sm_state[gw_state.ana_grp_id] != gw_states_per_group_t::GW_STANDBY_STATE) {// some other gw owns or owned the desired ana-group
+            do_failback = true;// if candidate is in state ACTIVE for the desired ana-group, then failback starts immediately, otherwise need to wait
+            dout(10) << "Found some gw " << gw_state_it.first  <<  " in state " << st.sm_state[gw_state.ana_grp_id]  << dendl;
+            break;
+        }
+    }
+
+    if (do_failback == false) {
+        // No other gw currently performs some activity with desired ana group of coming-up GW - so it just takes over on the group
+        dout(10)  << "Failback GW candidate was not found, just set Optimized to group " << gw_state.ana_grp_id << " to GW " << gw_id << dendl;
+        gw_state.active_state(gw_state.ana_grp_id);
+        propose = true;
+        return;
+    }
+    //try to do_failback
+    for (auto& gw_state_it: gws_states) {
+        auto& failback_gw_id = gw_state_it.first;
+        auto& st = gw_state_it.second;
+        if (st.sm_state[gw_state.ana_grp_id] == gw_states_per_group_t::GW_ACTIVE_STATE) {
+            dout(10)  << "Found Failback GW " << failback_gw_id << " that previously took over the ANAGRP " << gw_state.ana_grp_id << " of the available GW " << gw_id << dendl;
+            st.sm_state[gw_state.ana_grp_id] = gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED;
+            start_timer(failback_gw_id, group_key, gw_state.ana_grp_id, 3);// Add timestamp of start Failback preparation
+            gw_state.sm_state[gw_state.ana_grp_id] = gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED;
+            propose = true;
+            break;
+        }
+    }
+}
+
+void  NVMeofGwMap::find_failover_candidate(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &propose_pending)
+{
+    dout(10) <<__func__<< " " << gw_id << dendl;
+    #define ILLEGAL_GW_ID " "
+    #define MIN_NUM_ANA_GROUPS 0xFFF
+    int min_num_ana_groups_in_gw = 0;
+    int current_ana_groups_in_gw = 0;
+    NvmeGwId min_loaded_gw_id = ILLEGAL_GW_ID;
+    auto& gws_states = created_gws[group_key];
+    auto gw_state = gws_states.find(gw_id);
+
+    // this GW may handle several ANA groups and  for each of them need to found the candidate GW
+    if (gw_state->second.sm_state[grpid] == gw_states_per_group_t::GW_ACTIVE_STATE || gw_state->second.ana_grp_id == grpid) {
+
+        for (auto& found_gw_state: gws_states) { // for all the gateways of the subsystem
+            auto st = found_gw_state.second;
+            if (st.sm_state[grpid] ==  gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL) {   // some GW already started failover/failback on this group
+               dout(4) << "Warning : Failover" << st.blocklist_data[grpid].is_failover <<  " already started for the group " << grpid <<  " by GW " << found_gw_state.first << dendl;
+               gw_state->second.standby_state(grpid);
+               return ;
+            }
+        }
+        // Find a GW that takes over the ANA group(s)
+        min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
+        min_loaded_gw_id = ILLEGAL_GW_ID;
+        for (auto& found_gw_state: gws_states) { // for all the gateways of the subsystem
+            auto st = found_gw_state.second;
+            if (st.availability == gw_availability_t::GW_AVAILABLE) {
+                current_ana_groups_in_gw = 0;
+                for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) {
+                    NvmeAnaGrpId anagrp = state_itr.first;
+                    if (st.sm_state[anagrp] == gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED || st.sm_state[anagrp] == gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED
+                                                                                          || st.sm_state[anagrp] == gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL) {
+                        current_ana_groups_in_gw = 0xFFFF;
+                        break; // dont take into account   GWs in the transitive state
+                    }
+                    else if (st.sm_state[anagrp] == gw_states_per_group_t::GW_ACTIVE_STATE) {
+                        current_ana_groups_in_gw++; // how many ANA groups are handled by this GW
+                    }
+                }
+                if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) {
+                    min_num_ana_groups_in_gw = current_ana_groups_in_gw;
+                    min_loaded_gw_id = found_gw_state.first;
+                    dout(10) << "choose: gw-id  min_ana_groups " << min_loaded_gw_id << current_ana_groups_in_gw << " min " << min_num_ana_groups_in_gw << dendl;
+                }
+            }
+        }
+        if (min_loaded_gw_id != ILLEGAL_GW_ID) {
+            propose_pending = true;
+            set_failover_gw_for_ANA_group(gw_id, group_key, min_loaded_gw_id, grpid);
+        }
+        else {
+            if (gw_state->second.sm_state[grpid] == gw_states_per_group_t::GW_ACTIVE_STATE) {// not found candidate but map changed.
+                propose_pending = true;
+                dout(10) << "gw down:  no candidate found " << dendl;
+            }
+        }
+        gw_state->second.standby_state(grpid);
+    }
+}
+
+void NVMeofGwMap::fsm_handle_gw_alive(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeGwMonState & gw_state, gw_states_per_group_t state, NvmeAnaGrpId grpid, epoch_t& last_osd_epoch, bool &map_modified)
+{
+    switch (state) {
+    case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL:
+    {
+        int timer_val = get_timer(gw_id, group_key, grpid);
+        NvmeGwMonState& gw_map = created_gws[group_key][gw_id];
+        if (gw_map.blocklist_data[grpid].osd_epoch <= last_osd_epoch) {
+            dout(10) << "is-failover: " << gw_map.blocklist_data[grpid].is_failover << " osd epoch changed from " << gw_map.blocklist_data[grpid].osd_epoch << " to "<< last_osd_epoch
+                    << " Ana-grp: " << grpid  << " timer:" << timer_val << dendl;
+            gw_state.active_state(grpid);                   // Failover Gw still alive and guaranteed that
+            cancel_timer(gw_id, group_key, grpid);          // ana group wouldnt be taken back  during blocklist wait period
+            map_modified = true;
+        }
+        else{
+            dout(20) << "osd epoch not changed from " <<  gw_map.blocklist_data[grpid].osd_epoch << " to "<< last_osd_epoch
+                    << " Ana-grp: " << grpid  << " timer:" << timer_val << dendl;
+        }
+    }
+    break;
+
+    default:
+        break;
+    }
+}
+
+ void NVMeofGwMap::fsm_handle_gw_down(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,   gw_states_per_group_t state, NvmeAnaGrpId grpid,  bool &map_modified)
+ {
+    switch (state)
+    {
+        case gw_states_per_group_t::GW_STANDBY_STATE:
+        case gw_states_per_group_t::GW_IDLE_STATE:
+            // nothing to do
+            break;
+
+        case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL:
+        {
+            cancel_timer(gw_id, group_key, grpid);
+            map_modified = true;
+        }break;
+
+        case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:
+            cancel_timer(gw_id, group_key,  grpid);
+            map_modified = true;
+            for (auto& gw_st: created_gws[group_key]) {
+                auto& st = gw_st.second;
+                if (st.sm_state[grpid] == gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) { // found GW   that was intended for  Failback for this ana grp
+                    dout(4) << "Warning: Outgoing Failback when GW is down back - to rollback it"  <<" GW "  <<gw_id << "for ANA Group " << grpid << dendl;
+                    st.standby_state(grpid);
+                    break;
+                }
+            }
+            break;
+
+        case gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED:
+            // nothing to do - let failback timer expire
+            break;
+
+        case gw_states_per_group_t::GW_ACTIVE_STATE:
+        {
+            find_failover_candidate(gw_id, group_key, grpid, map_modified);
+        }
+        break;
+
+        default:{
+            dout(4) << "Error : Invalid state " << state << "for GW " << gw_id  << dendl;
+        }
+
+    }
+ }
+
+
+void NVMeofGwMap::fsm_handle_gw_delete(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+     gw_states_per_group_t state , NvmeAnaGrpId grpid, bool &map_modified) {
+    switch (state)
+    {
+        case gw_states_per_group_t::GW_STANDBY_STATE:
+        case gw_states_per_group_t::GW_IDLE_STATE:
+        case gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED:
+        {
+            NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
+
+            if (grpid == gw_state.ana_grp_id) {// Try to find GW that temporary owns my group - if found, this GW should pass to standby for  this group
+                auto& gateway_states = created_gws[group_key];
+                for (auto& gs: gateway_states) {
+                    if (gs.second.sm_state[grpid] == gw_states_per_group_t::GW_ACTIVE_STATE  || gs.second.sm_state[grpid] == gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) {
+                        gs.second.standby_state(grpid);
+                        map_modified = true;
+                        if (gs.second.sm_state[grpid] == gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED)
+                            cancel_timer(gs.first, group_key, grpid);
+                        break;
+                    }
+                }
+            }
+        }
+        break;
+
+        case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL:
+        {
+            NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
+            cancel_timer(gw_id, group_key, grpid);
+            map_modified = true;
+            gw_state.standby_state(grpid);
+        }
+        break;
+
+        case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:
+        {
+            cancel_timer(gw_id, group_key, grpid);
+            map_modified = true;
+            for (auto& nqn_gws_state: created_gws[group_key]) {
+                auto& st = nqn_gws_state.second;
+
+                if (st.sm_state[grpid] == gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) { // found GW   that was intended for  Failback for this ana grp
+                    dout(4) << "Warning: Outgoing Failback when GW is deleted - to rollback it" << " GW " << gw_id << "for ANA Group " << grpid << dendl;
+                    st.standby_state(grpid);
+                    break;
+                }
+            }
+        }
+        break;
+
+        case gw_states_per_group_t::GW_ACTIVE_STATE:
+        {
+            NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
+            map_modified = true;
+            gw_state.standby_state(grpid);
+        }
+        break;
+
+        default: {
+            dout(4) << "Error : Invalid state " << state << "for GW " << gw_id  << dendl;
+        }
+    }
+    if (map_modified) validate_gw_map(group_key);
+}
+
+void NVMeofGwMap::fsm_handle_to_expired(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid,  bool &map_modified)
+{
+    auto& fbp_gw_state = created_gws[group_key][gw_id];// GW in Fail-back preparation state fbp
+    bool grp_owner_found = false;
+    if (fbp_gw_state.sm_state[grpid] == gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) {
+        for (auto& gw_state: created_gws[group_key]) {
+            auto& st = gw_state.second;
+            if (st.ana_grp_id == grpid) {// group owner
+                grp_owner_found = true;
+                if (st.availability == gw_availability_t::GW_AVAILABLE) {
+                   if ( !(fbp_gw_state.last_gw_map_epoch_valid  && st.last_gw_map_epoch_valid) ) {
+                     //Timer is not cancelled so it would expire over and over as long as both gws are not updated
+                     dout(10) << "gw " << gw_id  <<" or gw " << gw_state.first  << "map epochs are not updated "<< dendl;
+                     return;
+                   }
+                }
+                cancel_timer(gw_id, group_key, grpid);
+                map_modified = true;
+                if (st.sm_state[grpid] == gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED && st.availability == gw_availability_t::GW_AVAILABLE )
+                {
+                    fbp_gw_state.standby_state(grpid);// Previous failover GW  set to standby
+                    st.active_state(grpid);
+                    dout(10)  << "Expired Failback-preparation timer from GW " << gw_id << " ANA groupId "<< grpid << dendl;
+                    map_modified = true;
+                    break;
+                }
+                else if (st.sm_state[grpid] == gw_states_per_group_t::GW_STANDBY_STATE  &&  st.availability == gw_availability_t::GW_AVAILABLE) {
+                   st.standby_state(grpid);// GW failed during the persistency interval
+                   dout(10)  << "Failback unsuccessfull. GW: " << gw_state.first << " becomes Standby for the ANA groupId " << grpid  << dendl;
+                }
+                fbp_gw_state.standby_state(grpid);
+                dout(10) << "Failback unsuccessfull GW: " << gw_id << " becomes Standby for the ANA groupId " << grpid  << dendl;
+                map_modified = true;
+                break;
+            }
+       }
+      if (grp_owner_found == false) {
+         dout(4) << "group owner not found " << grpid << " GW: " << gw_id << dendl;// when  GW group owner is deleted the fbk gw is put to standby
+      }
+    }
+    else if (fbp_gw_state.sm_state[grpid] == gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL) {
+        dout(4) << "Warning: Expired GW_WAIT_FAILOVER_PREPARED timer from GW, Force exit the GW " << gw_id << " ANA groupId: "<< grpid << dendl;
+        fbp_gw_state.set_unavailable_state();
+        map_modified = true;
+    }
+    if (map_modified) validate_gw_map(group_key);
+}
+
+struct CMonRequestProposal : public Context {
+  NVMeofGwMap *m;
+  entity_addrvec_t addr_vect;
+  utime_t expires;
+  CMonRequestProposal(NVMeofGwMap *mon , entity_addrvec_t addr_vector, utime_t until) : m(mon), addr_vect(addr_vector), expires (until)  {}
+  void finish(int r) {
+      dout(10) << "osdmon is  writable? " << m->mon->osdmon()->is_writeable() << dendl;
+      if (m->mon->osdmon()->is_writeable()) {
+        epoch_t epoch = m->mon->osdmon()->blocklist(addr_vect, expires);
+        dout(10) << "epoch " << epoch <<dendl;
+        m->mon->nvmegwmon()->request_proposal(m->mon->osdmon());
+      }
+      else {
+          m->mon->osdmon()->wait_for_writeable_ctx( new CMonRequestProposal(m, addr_vect, expires));
+      }
+  }
+};
+
+int NVMeofGwMap::blocklist_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, epoch_t &epoch, bool failover)
+{
+    NvmeGwMonState& gw_map =  created_gws[group_key][gw_id];  //find_already_created_gw(gw_id, group_key);
+
+     if (gw_map.nonce_map[grpid].size() > 0) {
+        NvmeNonceVector &nonce_vector = gw_map.nonce_map[grpid];;
+        std::string str = "[";
+        entity_addrvec_t addr_vect;
+
+        double d = g_conf().get_val<double>("mon_osd_blocklist_default_expire");
+        utime_t expires = ceph_clock_now();
+        expires += d;
+        dout(10) << " blocklist timestamp " << expires << dendl;
+        for (auto &it: nonce_vector ) {
+            if (str != "[") str += ",";
+            str += it;
+        }
+        str += "]";
+        bool rc = addr_vect.parse(&str[0]);
+        dout(10) << str << " rc " << rc <<  " network vector: " << addr_vect << " " << addr_vect.size() << dendl;
+        if (rc)
+            return 1;
+
+        if (!mon->osdmon()->is_writeable()) {
+            dout(10) << "osdmon is not writable, waiting, epoch = " << epoch << dendl;
+            mon->osdmon()->wait_for_writeable_ctx( new CMonRequestProposal(this, addr_vect, expires ));// return false;
+        }
+        else {
+            epoch = mon->osdmon()->blocklist(addr_vect, expires);
+            if (!mon->osdmon()->is_writeable()) {
+              dout(10) << "osdmon is not writable after blocklist is done, waiting, epoch = " << epoch << dendl;
+              mon->osdmon()->wait_for_writeable_ctx( new CMonRequestProposal(this, addr_vect, expires ));// return false;
+            }
+            else{
+               mon->nvmegwmon()->request_proposal(mon->osdmon());
+            }
+        }
+        dout(10) << str << " mon->osdmon()->blocklist: epoch : " << epoch <<  " address vector: " << addr_vect << " " << addr_vect.size() << dendl;
+    }
+    else{
+        dout(4) << "Error: No nonces context present for gw: " <<gw_id  << " ANA group: " << grpid << dendl;
+        return 1;
+    }
+    return 0;
+}
+
+void  NVMeofGwMap::validate_gw_map(const NvmeGroupKey& group_key)
+{
+   for (auto& gw_created: created_gws[group_key]) {
+        auto gw_id = gw_created.first;
+        for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) {
+            NvmeAnaGrpId ana_group = state_itr.first;
+            int count = 0;
+            for (auto& gw_created_pair: created_gws[group_key]) {
+               auto& st = gw_created_pair.second;
+               if (st.sm_state[ana_group] == gw_states_per_group_t::GW_ACTIVE_STATE) {
+                  count ++;
+                  if (count == 2) {
+                     dout(1) << "Critical Error : number active states per ana-group " << ana_group << "more than 1 in pool-group " << group_key << dendl;
+                     dout(4) << created_gws[group_key] << dendl;
+                 }
+              }
+            }
+        }
+        break;
+  }
+}
+
+void NVMeofGwMap::update_active_timers( bool &propose_pending ) {
+    const auto now = std::chrono::system_clock::now();
+    for (auto& group_to: fsm_timers) {
+        auto& group_key = group_to.first;
+        auto& pool = group_key.first;
+        auto& group = group_key.second;
+        for (auto& gw_to: group_to.second) {
+            auto& gw_id = gw_to.first;
+            auto& to = gw_to.second;
+            for (auto &to_itr:to.data)
+            {
+                if (to.data[to_itr.first].timer_started == 0) continue;
+                dout(20) << "Checking timer for GW " << gw_id << " ANA GRP " << to_itr.first<< " value(seconds): "<< (int)to.data[to_itr.first].timer_value << dendl;
+                if (now >= to.data[to_itr.first].end_time) {
+                    fsm_handle_to_expired(gw_id, std::make_pair(pool, group), to_itr.first, propose_pending);
+                }
+            }
+        }
+    }
+}
+
+void NVMeofGwMap::start_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid, uint8_t value_sec) {
+    fsm_timers[group_key][gw_id].data[anagrpid].timer_started = 1;
+    fsm_timers[group_key][gw_id].data[anagrpid].timer_value = value_sec;
+    dout(10) << "start timer for ana " << anagrpid << " gw " << gw_id << "value sec " << (int)value_sec << dendl;
+    const auto now = std::chrono::system_clock::now();
+    fsm_timers[group_key][gw_id].data[anagrpid].end_time = now + std::chrono::seconds(value_sec);
+}
+
+int  NVMeofGwMap::get_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid) {
+    auto timer = fsm_timers[group_key][gw_id].data[anagrpid].timer_value;
+    return timer;
+}
+
+void NVMeofGwMap::cancel_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid) {
+    fsm_timers[group_key][gw_id].data[anagrpid].timer_started = 0;
+}
diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h

new file mode 100755 (executable)

index 0000000..2390176
--- /dev/null
+++ b/src/mon/NVMeofGwMap.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef MON_NVMEOFGWMAP_H_
+#define MON_NVMEOFGWMAP_H_
+#include <map>
+#include <iostream>
+#include "include/encoding.h"
+#include "include/utime.h"
+#include "common/Formatter.h"
+#include "common/ceph_releases.h"
+#include "common/version.h"
+#include "common/options.h"
+#include "common/Clock.h"
+#include "msg/Message.h"
+#include "common/ceph_time.h"
+#include "NVMeofGwTypes.h"
+
+using ceph::coarse_mono_clock;
+class Monitor;
+/*-------------------*/
+class NVMeofGwMap
+{
+public:
+    Monitor*                            mon           = NULL;
+    epoch_t                             epoch         = 0;      // epoch is for Paxos synchronization  mechanizm
+    bool                                delay_propose = false;
+
+    std::map<NvmeGroupKey, NvmeGwMonStates>  created_gws;
+    std::map<NvmeGroupKey, NvmeGwTimers> fsm_timers;// map that handles timers started by all Gateway FSMs
+    void to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const;
+
+    int   cfg_add_gw                    (const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+    int   cfg_delete_gw                 (const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+    void  process_gw_map_ka             (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, epoch_t& last_osd_epoch,  bool &propose_pending);
+    int   process_gw_map_gw_down        (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending);
+    void  update_active_timers          (bool &propose_pending);
+    void  handle_abandoned_ana_groups   (bool &propose_pending);
+    void  handle_removed_subsystems     (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const std::vector<NvmeNqnId> &current_subsystems, bool &propose_pending);
+    void  start_timer (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid, uint8_t value);
+private:
+    void add_grp_id   (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid);
+    void remove_grp_id(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid);
+    void fsm_handle_gw_down    (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  gw_states_per_group_t state, NvmeAnaGrpId grpid,  bool &map_modified);
+    void fsm_handle_gw_delete  (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  gw_states_per_group_t state, NvmeAnaGrpId grpid,  bool &map_modified);
+    void fsm_handle_gw_alive   (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeGwMonState & gw_state, gw_states_per_group_t state,
+                                                                                   NvmeAnaGrpId grpid, epoch_t& last_osd_epoch, bool &map_modified);
+    void fsm_handle_to_expired (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeAnaGrpId grpid,  bool &map_modified);
+
+    void find_failover_candidate(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeAnaGrpId grpid, bool &propose_pending);
+    void find_failback_gw       (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  bool &propose_pending);
+    void set_failover_gw_for_ANA_group (const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key, const NvmeGwId &gw_id,
+                                                                                                     NvmeAnaGrpId groupid);
+
+
+    int  get_timer   (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid);
+    void cancel_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid);
+    void validate_gw_map(const NvmeGroupKey& group_key);
+
+public:
+    int  blocklist_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId ANA_groupid, epoch_t &epoch, bool failover);
+    void encode(ceph::buffer::list &bl) const {
+        using ceph::encode;
+        ENCODE_START(1, 1, bl);
+        encode(epoch, bl);// global map epoch
+
+        encode(created_gws, bl); //Encode created GWs
+        encode(fsm_timers, bl);
+        ENCODE_FINISH(bl);
+    }
+
+    void decode(ceph::buffer::list::const_iterator &bl) {
+        using ceph::decode;
+        DECODE_START(1, bl);
+        decode(epoch, bl);
+
+        decode(created_gws, bl);
+        decode(fsm_timers, bl);
+        DECODE_FINISH(bl);
+    }
+};
+
+#include "NVMeofGwSerialize.h"
+
+#endif /* SRC_MON_NVMEOFGWMAP_H_ */
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc

new file mode 100644 (file)

index 0000000..6111f76
--- /dev/null
+++ b/src/mon/NVMeofGwMon.cc
@@ -0,0 +1,532 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include <boost/tokenizer.hpp>
+#include "include/stringify.h"
+#include "NVMeofGwMon.h"
+#include "messages/MNVMeofGwBeacon.h"
+#include "messages/MNVMeofGwMap.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix *_dout << "nvmeofgw " << __PRETTY_FUNCTION__ << " "
+
+using std::string;
+
+void NVMeofGwMon::init() {
+    dout(10) <<  "called " << dendl;
+}
+
+void NVMeofGwMon::on_restart() {
+    dout(10) <<  "called " << dendl;
+    last_beacon.clear();
+    last_tick = ceph::coarse_mono_clock::now();
+    synchronize_last_beacon();
+}
+
+
+void NVMeofGwMon::synchronize_last_beacon() {
+    dout(10) <<  "called, is leader : " << mon.is_leader()  <<" active " << is_active()  << dendl;
+    // Initialize last_beacon to identify transitions of available  GWs to unavailable state
+    for (const auto& created_map_pair: map.created_gws) {
+      const auto& group_key = created_map_pair.first;
+      const NvmeGwMonStates& gw_created_map = created_map_pair.second;
+      for (const auto& gw_created_pair: gw_created_map) {
+          const auto& gw_id = gw_created_pair.first;
+          if (gw_created_pair.second.availability == gw_availability_t::GW_AVAILABLE) {
+             dout(10) << "synchronize last_beacon for  GW :" << gw_id << dendl;
+             LastBeacon lb = {gw_id, group_key};
+             last_beacon[lb] = last_tick;
+          }
+      }
+    }
+}
+
+void NVMeofGwMon::on_shutdown() {
+    dout(10) <<  "called " << dendl;
+}
+
+void NVMeofGwMon::tick() {
+    if (!is_active() || !mon.is_leader()) {
+        dout(10) << "NVMeofGwMon leader : " << mon.is_leader() << "active : " << is_active()  << dendl;
+        return;
+    }
+    bool _propose_pending = false;
+  
+    const auto now = ceph::coarse_mono_clock::now();
+    const auto nvmegw_beacon_grace = g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_beacon_grace"); 
+    dout(15) <<  "NVMeofGwMon leader got a tick, pending epoch "<< pending_map.epoch     << dendl;
+
+    const auto client_tick_period = g_conf().get_val<std::chrono::seconds>("nvmeof_mon_client_tick_period");
+    //handle exception of tick overdued in order to avoid false detection of overdued beacons, like it done in  MgrMonitor::tick
+    if (last_tick != ceph::coarse_mono_clock::zero()
+          && (now - last_tick > (nvmegw_beacon_grace - client_tick_period))) {
+        // This case handles either local slowness (calls being delayed
+        // for whatever reason) or cluster election slowness (a long gap
+        // between calls while an election happened)
+        dout(10) << ": resetting beacon timeouts due to mon delay "
+                "(slow election?) of " << now - last_tick << " seconds" << dendl;
+        for (auto &i : last_beacon) {
+          i.second = now;
+        }
+    }
+
+    last_tick = now;
+    bool propose = false;
+
+    pending_map.update_active_timers(propose);  // Periodic: check active FSM timers
+    _propose_pending |= propose;
+
+    const auto cutoff = now - nvmegw_beacon_grace;
+    for (auto &itr : last_beacon) {// Pass over all the stored beacons
+        auto& lb = itr.first;
+        auto last_beacon_time = itr.second;
+        if (last_beacon_time < cutoff) {
+            dout(10) << "beacon timeout for GW " << lb.gw_id << dendl;
+            pending_map.process_gw_map_gw_down(lb.gw_id, lb.group_key, propose);
+            _propose_pending |= propose;
+            last_beacon.erase(lb);
+        }
+        else {
+           dout(20) << "beacon live for GW key: " << lb.gw_id << dendl;
+        }
+    }
+
+    pending_map.handle_abandoned_ana_groups(propose); // Periodic: take care of not handled ANA groups
+    _propose_pending |= propose;
+
+    if (_propose_pending) {
+       dout(10) << "propose pending " <<dendl;
+       propose_pending();
+    }
+}
+
+const char **NVMeofGwMon::get_tracked_conf_keys() const
+{
+  static const char* KEYS[] = {
+    NULL
+  };
+  return KEYS;
+}
+
+version_t NVMeofGwMon::get_trim_to() const
+{
+  // we don't actually need *any* old states, but keep a few.
+  int64_t max = g_conf().get_val<int64_t>("mon_max_nvmeof_epochs");
+  if (map.epoch > max) {
+    return map.epoch - max;
+  }
+  return 0;
+}
+
+void NVMeofGwMon::create_pending() {
+
+    pending_map = map;// deep copy of the object
+    pending_map.epoch++;
+    dout(10) << " pending " << pending_map  << dendl;
+}
+
+void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t) {
+
+    dout(10) <<  dendl;
+    ceph_assert(get_last_committed() + 1 == pending_map.epoch);
+    bufferlist bl;
+    pending_map.encode(bl);
+    put_version(t, pending_map.epoch, bl);
+    put_last_committed(t, pending_map.epoch);
+}
+
+void NVMeofGwMon::update_from_paxos(bool *need_bootstrap) {
+    version_t version = get_last_committed();
+
+    if (version != map.epoch) {
+        dout(10) << " NVMeGW loading version " << version  << " " << map.epoch << dendl;
+
+        bufferlist bl;
+        int err = get_version(version, bl);
+        ceph_assert(err == 0);
+
+        auto p = bl.cbegin();
+        map.decode(p);
+        if (!mon.is_leader()) {
+            dout(10) << "leader map: " << map <<  dendl;
+        }
+        check_subs(true);
+    }
+}
+
+void NVMeofGwMon::check_sub(Subscription *sub)
+{
+    dout(10) << "sub->next , map-epoch " << sub->next << " " << map.epoch << dendl;
+    if (sub->next <= map.epoch)
+    {
+      dout(10) << "Sending map to subscriber " << sub->session->con << " " << sub->session->con->get_peer_addr() << dendl;
+      sub->session->con->send_message2(make_message<MNVMeofGwMap>(map));
+
+      if (sub->onetime) {
+        mon.session_map.remove_sub(sub);
+      } else {
+        sub->next = map.epoch + 1;
+      }
+    }
+}
+
+void NVMeofGwMon::check_subs(bool t)
+{
+  const std::string type = "NVMeofGw";
+  dout(10) <<  "count " << mon.session_map.subs.count(type) << dendl;
+
+  if (mon.session_map.subs.count(type) == 0) {
+      return;
+  }
+  for (auto sub : *(mon.session_map.subs[type])) {
+    check_sub(sub);
+  }
+}
+
+bool NVMeofGwMon::preprocess_query(MonOpRequestRef op) {
+    dout(20) << dendl;
+
+    auto m = op->get_req<PaxosServiceMessage>();
+      switch (m->get_type()) {
+        case MSG_MNVMEOF_GW_BEACON:
+          return preprocess_beacon(op);
+
+        case MSG_MON_COMMAND:
+          try {
+        return preprocess_command(op);
+          } catch (const bad_cmd_get& e) {
+          bufferlist bl;
+          mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+          return true;
+        }
+
+        default:
+          mon.no_reply(op);
+          derr << "Unhandled message type " << m->get_type() << dendl;
+          return true;
+      }
+    return false;
+}
+
+bool NVMeofGwMon::prepare_update(MonOpRequestRef op) {
+    auto m = op->get_req<PaxosServiceMessage>();
+      switch (m->get_type()) {
+        case MSG_MNVMEOF_GW_BEACON:
+          return prepare_beacon(op);
+
+        case MSG_MON_COMMAND:
+          try {
+        return prepare_command(op);
+          } catch (const bad_cmd_get& e) {
+        bufferlist bl;
+        mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+        return false; /* nothing to propose! */
+          }
+
+        default:
+          mon.no_reply(op);
+          dout(1) << "Unhandled message type " << m->get_type() << dendl;
+          return false; /* nothing to propose! */
+      }
+}
+
+bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
+{
+    dout(10) << dendl;
+    auto m = op->get_req<MMonCommand>();
+    std::stringstream sstrm;
+    bufferlist rdata;
+    string rs;
+    int err = 0;
+    cmdmap_t cmdmap;
+    if (!cmdmap_from_json(m->cmd, &cmdmap, sstrm))
+    {
+        string rs = sstrm.str();
+        dout(4) << "Error : Invalid command "  << m->cmd << "Error " << rs << dendl;
+        mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+        return true;
+    }
+
+    string prefix;
+    cmd_getval(cmdmap, "prefix", prefix);
+    dout(10) << "MonCommand : "<< prefix <<  dendl;
+    string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+    boost::scoped_ptr<Formatter> f(Formatter::create(format));
+    if (prefix == "nvme-gw show") {
+        std::string  pool, group;
+        if (!f) {
+            f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
+        }
+        cmd_getval(cmdmap, "pool", pool);
+        cmd_getval(cmdmap, "group", group);
+        auto group_key = std::make_pair(pool, group);
+        dout(10) <<"nvme-gw show  pool "<< pool << " group "<< group << dendl;
+
+        if (map.created_gws[group_key].size()) {
+            f->open_object_section("common");
+            f->dump_unsigned("epoch", map.epoch);
+            f->dump_string("pool", pool);
+            f->dump_string("group", group);
+            f->dump_unsigned("num gws", map.created_gws[group_key].size());
+            sstrm <<"[ ";
+            NvmeGwId gw_id;
+            for (auto& gw_created_pair: map.created_gws[group_key]) {
+                gw_id = gw_created_pair.first;
+                auto& st = gw_created_pair.second;
+                sstrm << st.ana_grp_id+1 << " ";
+            }
+            sstrm << "]";
+            f->dump_string("Anagrp list", sstrm.str());
+            f->close_section();
+
+            for (auto& gw_created_pair: map.created_gws[group_key]) {
+                auto& gw_id = gw_created_pair.first;
+                auto& state = gw_created_pair.second;
+                f->open_object_section("stat");
+                f->dump_string("gw-id", gw_id);
+                f->dump_unsigned("anagrp-id",state.ana_grp_id+1);
+                f->dump_unsigned("performed-full-startup", state.performed_full_startup);
+                std::stringstream  sstrm1;
+                sstrm1 << state.availability;
+                f->dump_string("Availability", sstrm1.str());
+                sstrm1.str("");
+                for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) {
+                    sstrm1 << " " << state_itr.first + 1 << ": " << state.sm_state[state_itr.first] << ",";
+                }
+                f->dump_string("ana states", sstrm1.str());
+                f->close_section();
+            }
+            f->flush(rdata);
+            sstrm.str("");
+        }
+        else {
+            sstrm << "num_gws  0";
+        }
+        getline(sstrm, rs);
+        mon.reply_command(op, err, rs, rdata, get_last_committed());
+        return true;
+    }
+    return false;
+}
+
+bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
+{
+    dout(10)  << dendl;
+    auto m = op->get_req<MMonCommand>();
+    int rc;
+    std::stringstream sstrm;
+    bufferlist rdata;
+    string rs;
+    int err = 0;
+    cmdmap_t cmdmap;
+    bool response = false;
+
+    if (!cmdmap_from_json(m->cmd, &cmdmap, sstrm))
+    {
+        string rs = sstrm.str();
+        mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+        return true;
+    }
+
+    string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+    boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+    const auto prefix = cmd_getval_or<string>(cmdmap, "prefix", string{});
+
+    dout(10) << "MonCommand : "<< prefix <<  dendl;
+    if (prefix == "nvme-gw create" || prefix == "nvme-gw delete") {
+        std::string id, pool, group;
+
+        cmd_getval(cmdmap, "id", id);
+        cmd_getval(cmdmap, "pool", pool);
+        cmd_getval(cmdmap, "group", group);
+        auto group_key = std::make_pair(pool, group);
+        dout(10) << " id "<< id <<" pool "<< pool << " group "<< group << dendl;
+        if (prefix == "nvme-gw create") {
+            rc = pending_map.cfg_add_gw(id, group_key);
+            if (rc == -EINVAL) {
+                err = rc;
+                dout (4) << "Error: GW cannot be created " << id << " " << pool << " " << group << "  rc " << rc << dendl;
+                sstrm.str("");
+            }
+        }
+        else{
+            rc = pending_map.cfg_delete_gw(id, group_key);
+            if (rc == -EINVAL) {
+                dout (4) << "Error: GW not found in the database " << id << " " << pool << " " << group << "  rc " << rc << dendl;
+                err = 0;
+                sstrm.str("");
+            }
+        }
+        if ((rc != -EEXIST) && (rc != -EINVAL)) //propose pending would be generated by the PaxosService
+            response = true;
+    }
+
+    getline(sstrm, rs);
+    if (response == false) {
+       if (err < 0 && rs.length() == 0)
+       {
+         rs = cpp_strerror(err);
+         dout(10) << "Error command  err : "<< err  << " rs-len: " << rs.length() <<  dendl;
+       }
+       mon.reply_command(op, err, rs, rdata, get_last_committed());
+    }
+    else
+       wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                            get_last_committed() + 1));
+    return response;
+}
+
+
+bool NVMeofGwMon::preprocess_beacon(MonOpRequestRef op) {
+    auto m = op->get_req<MNVMeofGwBeacon>();
+    const BeaconSubsystems& sub = m->get_subsystems();
+    dout(15) << "beacon from " << m->get_type() << " GW : " << m->get_gw_id()  << " num subsystems " << sub.size() <<  dendl;
+
+    return false; // allways  return false to call leader's prepare beacon
+}
+
+
+bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) {
+    auto m = op->get_req<MNVMeofGwBeacon>();
+
+    dout(20) << "availability " <<  m->get_availability() << " GW : " << m->get_gw_id() <<
+        " osdmap_epoch " << m->get_last_osd_epoch() << " subsystems " << m->get_subsystems() << dendl;
+
+    NvmeGwId gw_id = m->get_gw_id();
+    NvmeGroupKey group_key = std::make_pair(m->get_gw_pool(),  m->get_gw_group());
+    gw_availability_t  avail = m->get_availability();
+    bool propose = false;
+    bool nonce_propose = false;
+    bool timer_propose = false;
+    bool gw_created = true;
+    NVMeofGwMap ack_map;
+    auto& group_gws = map.created_gws[group_key];
+    auto gw = group_gws.find(gw_id);
+    const BeaconSubsystems& sub = m->get_subsystems();
+
+    if (avail == gw_availability_t::GW_CREATED) {
+        if (gw == group_gws.end()) {
+           gw_created = false;
+           dout(10) << "Warning: GW " << gw_id << " group_key " << group_key << " was not found in the  map.Created_gws "<< map.created_gws <<dendl;
+           goto set_propose;
+        }
+        else {
+            dout(10) << "GW  prepares the full startup " << gw_id << " GW availability: " << pending_map.created_gws[group_key][gw_id].availability << dendl;
+            if (pending_map.created_gws[group_key][gw_id].availability == gw_availability_t::GW_AVAILABLE) {
+                dout(4) << " Warning :GW marked as Available in the NVmeofGwMon database, performed full startup - Force gw to exit!" << gw_id <<dendl;
+                avail = gw_availability_t::GW_UNAVAILABLE;
+                // Monitor performs Force Failover for this GW in process_gw_map_gw_down
+            }
+            else if (pending_map.created_gws[group_key][gw_id].performed_full_startup == false) {
+                pending_map.created_gws[group_key][gw_id].performed_full_startup = true;
+                propose = true;
+                goto set_propose;
+            }
+        }
+    }
+    else { // gw already created
+        if (gw != group_gws.end()) // if GW reports Available but in monitor's database it is Unavailable
+                                   // it means it did not perform "exit" after failover was set by NVMeofGWMon
+           if (pending_map.created_gws[group_key][gw_id].availability == gw_availability_t::GW_UNAVAILABLE  &&
+               pending_map.created_gws[group_key][gw_id].performed_full_startup == false &&
+               avail == gw_availability_t::GW_AVAILABLE) {
+               ack_map.created_gws[group_key][gw_id] = pending_map.created_gws[group_key][gw_id];
+               ack_map.epoch = map.epoch;
+               dout(4) << " Force gw to exit: Sending ack_map to GW: " << gw_id << dendl;
+               auto msg = make_message<MNVMeofGwMap>(ack_map);
+               mon.send_reply(op, msg.detach());
+               goto false_return;
+           }
+    }
+
+    // At this stage the gw has to be in the Created_gws
+    if (gw == group_gws.end()) {
+        dout(4) << "Administratively deleted GW sends beacon " << gw_id <<dendl;
+        goto false_return; // not sending ack to this beacon
+    }
+
+    // deep copy the whole nonce map of this GW
+    if (m->get_nonce_map().size()) {
+        if (pending_map.created_gws[group_key][gw_id].nonce_map != m->get_nonce_map())
+        {
+            dout(10) << "nonce map of GW  changed , propose pending " << gw_id << dendl;
+            pending_map.created_gws[group_key][gw_id].nonce_map = m->get_nonce_map();
+            dout(10) << "nonce map of GW " << gw_id << " "<< pending_map.created_gws[group_key][gw_id].nonce_map  << dendl;
+            nonce_propose = true;
+        }
+    }
+    else  {
+        dout(10) << "Warning: received empty nonce map in the beacon of GW " << gw_id << " "<< dendl;
+    }
+
+    if (sub.size() == 0) {
+        avail = gw_availability_t::GW_UNAVAILABLE;
+    }
+    if (pending_map.created_gws[group_key][gw_id].subsystems != sub)
+    {
+        dout(10) << "subsystems of GW changed, propose pending " << gw_id << dendl;
+        pending_map.created_gws[group_key][gw_id].subsystems =  sub;
+        dout(20) << "subsystems of GW " << gw_id << " "<< pending_map.created_gws[group_key][gw_id].subsystems << dendl;
+        nonce_propose = true;
+    }
+    pending_map.created_gws[group_key][gw_id].last_gw_map_epoch_valid = ( map.epoch == m->get_last_gwmap_epoch() );
+    if (pending_map.created_gws[group_key][gw_id].last_gw_map_epoch_valid == false) {
+      dout(20) <<  "map epoch of gw is not up-to-date " << gw_id << " epoch " << map.epoch << " beacon_epoch " << m->get_last_gwmap_epoch() <<  dendl;
+    }
+    if (avail == gw_availability_t::GW_AVAILABLE)
+    {
+        auto now = ceph::coarse_mono_clock::now();
+        // check pending_map.epoch vs m->get_version() - if different - drop the beacon
+
+        LastBeacon lb = {gw_id, group_key};
+        last_beacon[lb] = now;
+        epoch_t last_osd_epoch = m->get_last_osd_epoch();
+        pending_map.process_gw_map_ka(gw_id, group_key, last_osd_epoch, propose);
+    }
+    else if (avail == gw_availability_t::GW_UNAVAILABLE) { // state set by GW client application
+        LastBeacon lb = {gw_id, group_key};
+
+        auto it = last_beacon.find(lb);
+        if (it != last_beacon.end()) {
+            last_beacon.erase(lb);
+            pending_map.process_gw_map_gw_down(gw_id, group_key, propose);
+        }
+    }
+    pending_map.update_active_timers(timer_propose);  // Periodic: check active FSM timers
+    propose |= timer_propose;
+    propose |= nonce_propose;
+
+set_propose:
+    if (!propose) {
+       if (gw_created) {
+           ack_map.created_gws[group_key][gw_id] = map.created_gws[group_key][gw_id];// respond with a map slice correspondent to the same GW
+       }
+       ack_map.epoch = map.epoch;
+       dout(20) << "ack_map " << ack_map <<dendl;
+       auto msg = make_message<MNVMeofGwMap>(ack_map);
+       mon.send_reply(op, msg.detach());
+    }
+    else {
+       mon.no_reply(op);
+    }
+false_return:
+    if (propose) {
+      dout(10) << "decision in prepare_beacon" <<dendl;
+      return true;
+    }
+    else 
+     return false; // if no changes are need in the map
+}
diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h

new file mode 100644 (file)

index 0000000..960a896
--- /dev/null
+++ b/src/mon/NVMeofGwMon.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef  MON_NVMEGWMONITOR_H_
+#define  MON_NVMEGWMONITOR_H_
+
+#include "PaxosService.h"
+#include "NVMeofGwMap.h"
+
+struct LastBeacon {
+    NvmeGwId gw_id;
+    NvmeGroupKey group_key;
+
+    // Comparison operators to allow usage as a map key
+    bool operator<(const LastBeacon& other) const {
+        if (gw_id != other.gw_id) return gw_id < other.gw_id;
+        return group_key < other.group_key;
+    }
+
+    bool operator==(const LastBeacon& other) const {
+        return gw_id == other.gw_id &&
+               group_key == other.group_key;
+    }
+};
+
+class NVMeofGwMon: public PaxosService,
+                   public md_config_obs_t
+{
+    NVMeofGwMap map;  //NVMeGWMap
+    NVMeofGwMap pending_map;
+    std::map<LastBeacon, ceph::coarse_mono_clock::time_point> last_beacon;
+    ceph::coarse_mono_clock::time_point last_tick;
+
+public:
+    NVMeofGwMon(Monitor &mn, Paxos &p, const std::string& service_name): PaxosService(mn, p, service_name) {map.mon = &mn;}
+    ~NVMeofGwMon() override {}
+
+
+    // config observer
+    const char** get_tracked_conf_keys() const override;
+    void handle_conf_change(const ConfigProxy& conf, const std::set<std::string> &changed) override {};
+
+    // 3 pure virtual methods of the paxosService
+    void create_initial() override {};
+    void create_pending() override;
+    void encode_pending(MonitorDBStore::TransactionRef t) override;
+
+    void init() override;
+    void on_shutdown() override;
+    void on_restart() override;
+    void update_from_paxos(bool *need_bootstrap) override;
+
+    version_t get_trim_to() const override;
+
+    bool preprocess_query(MonOpRequestRef op) override;
+    bool prepare_update(MonOpRequestRef op) override;
+
+    bool preprocess_command(MonOpRequestRef op);
+    bool prepare_command(MonOpRequestRef op);
+
+    void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+    bool preprocess_beacon(MonOpRequestRef op);
+    bool prepare_beacon(MonOpRequestRef op);
+
+    void tick() override;
+    void print_summary(ceph::Formatter *f, std::ostream *ss) const;
+
+    void check_subs(bool type);
+    void check_sub(Subscription *sub);
+
+private:
+    void synchronize_last_beacon();
+
+};
+
+#endif /* MON_NVMEGWMONITOR_H_ */
diff --git a/src/mon/NVMeofGwSerialize.h b/src/mon/NVMeofGwSerialize.h

new file mode 100755 (executable)

index 0000000..cd70554
--- /dev/null
+++ b/src/mon/NVMeofGwSerialize.h
@@ -0,0 +1,610 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+#ifndef MON_NVMEOFGWSERIALIZE_H_
+#define MON_NVMEOFGWSERIALIZE_H_
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define MODULE_PREFFIX "nvmeofgw "
+#define dout_prefix *_dout << MODULE_PREFFIX << __PRETTY_FUNCTION__ << " "
+
+inline std::ostream& operator<<(std::ostream& os, const gw_exported_states_per_group_t value) {
+    switch (value) {
+        case gw_exported_states_per_group_t::GW_EXPORTED_OPTIMIZED_STATE: os << "OPTIMIZED "; break;
+        case gw_exported_states_per_group_t::GW_EXPORTED_INACCESSIBLE_STATE: os << "INACCESSIBLE "; break;
+        default: os << "Invalid " << (int)value << " ";
+    }
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const gw_states_per_group_t value) {
+    switch (value) {
+        case gw_states_per_group_t::GW_IDLE_STATE:                  os << "IDLE "; break;
+        case gw_states_per_group_t::GW_STANDBY_STATE:               os << "STANDBY "; break;
+        case gw_states_per_group_t::GW_ACTIVE_STATE:                os << "ACTIVE "; break;
+        case gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED: os << "OWNER_FAILBACK_PREPARED "; break;
+        case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:      os << "WAIT_FAILBACK_PREPARED "; break;
+        case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL:       os <<   "WAIT_BLOCKLIST_CMPL "; break;
+        default: os << "Invalid " << (int)value << " ";
+    }
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const gw_availability_t value) {
+    switch (value) {
+
+        case gw_availability_t::GW_CREATED: os << "CREATED"; break;
+        case gw_availability_t::GW_AVAILABLE: os << "AVAILABLE"; break;
+        case gw_availability_t::GW_UNAVAILABLE: os << "UNAVAILABLE"; break;
+
+        default: os << "Invalid " << (int)value << " ";
+    }
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const SmState value) {
+    os << "SM_STATE [ ";
+    for (auto& state_itr: value )
+        os << value.at(state_itr.first);
+    os << "]";
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const BeaconNamespace value) {
+    os << "BeaconNamespace( anagrpid:" << value.anagrpid << ", nonce:" << value.nonce <<" )";
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const BeaconListener value) {
+    os << "BeaconListener( addrfam:" << value.address_family
+                        << ", addr:" << value.address
+                        << ", svcid:" << value.svcid << " )";
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const BeaconSubsystem value) {
+    os << "BeaconSubsystem( nqn:" << value.nqn << ", listeners [ ";
+    for (const auto& list: value.listeners) os << list << " ";
+    os << "] namespaces [ ";
+    for (const auto& ns: value.namespaces) os << ns << " ";
+    os << "] )";
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const NqnState value) {
+    os << "NqnState( nqn: " << value.nqn << ", " << value.ana_state << " )";
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const NvmeGwClientState value) {
+    os <<  "NvmeGwState { group id: " << value.group_id <<  " gw_map_epoch " <<  value.gw_map_epoch << " availablilty "<< value.availability
+        << " GwSubsystems: [ ";
+    for (const auto& sub: value.subsystems) os << sub.second << " ";
+    os << " ] }";
+
+    return os;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const NvmeGroupKey value) {
+    os << "NvmeGroupKey {" << value.first << "," << value.second << "}";
+    return os;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const NvmeGwMonClientStates value) {
+    os << "NvmeGwMap ";
+    for (auto& gw_state: value) {
+        os << "\n" << MODULE_PREFFIX <<" { == gw_id: " << gw_state.first << " -> " <<  gw_state.second << "}";
+    }
+    os << "}";
+
+    return os;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const NvmeNonceVector value) {
+    for (auto & nonces : value) {
+        os <<  nonces << " ";
+    }
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const NvmeAnaNonceMap value) {
+    if(value.size()) os << "\n" << MODULE_PREFFIX;
+    for (auto &nonce_map : value) {
+        os  << "  ana_grp: " << nonce_map.first  << " [ " << nonce_map.second << "]\n"<< MODULE_PREFFIX ;
+    }
+    return os;
+}
+
+inline std::ostream& print_gw_created_t(std::ostream& os, const NvmeGwMonState value, size_t num_ana_groups) {
+    os << "==Internal map ==NvmeGwCreated { ana_group_id " << value.ana_grp_id << " osd_epochs: ";
+    for (auto& blklst_itr: value.blocklist_data)
+    {
+        os << " " << blklst_itr.first <<": " << blklst_itr.second.osd_epoch << ":" <<blklst_itr.second.is_failover ;
+    }
+    os << "\n" << MODULE_PREFFIX << "nonces: " << value.nonce_map << " }";
+
+    for (auto& state_itr: value.sm_state )
+    {
+        os << " " << state_itr.first <<": " << state_itr.second << ",";
+    }
+
+    os << "]\n"<< MODULE_PREFFIX << "availability " << value.availability << " full-startup " << value.performed_full_startup  << " ]";
+
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const NvmeGwMonState value) {
+    os << "==Internal map ==G W_CREATED_T { ana_group_id " << value.ana_grp_id << " osd_epochs: ";
+    for (auto &blklst_itr: value.blocklist_data) {
+        os << " " << blklst_itr.second.osd_epoch;
+    }
+    os << "\n" << MODULE_PREFFIX << "nonces: " << value.nonce_map << " }";
+
+    for (auto& state_itr: value.sm_state ) {
+        os << value.sm_state.at(state_itr.first) << ",";
+    }
+
+    os <<  "]\n"<< MODULE_PREFFIX << " beacon-subsystems ";
+    for (const auto& sub: value.subsystems) {
+        os << sub << ",";
+    }
+
+    os << "]\n"<< MODULE_PREFFIX << "availability " << value.availability << "]";
+
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const NvmeGwMonStates value) {
+    if(value.size()) os << "\n" << MODULE_PREFFIX;;
+
+    for (auto &gw_created_map : value) {
+        os  <<  "gw_id: " << gw_created_map.first  << " [ " ;//<< gw_created_map.second << "] \n"<< MODULE_PREFFIX;
+        print_gw_created_t(os, gw_created_map.second,  value.size());
+        os << "] \n"<< MODULE_PREFFIX;
+    }
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const NVMeofGwMap value) {
+    os << "NVMeofGwMap [ Created_gws: ";
+    for (auto& group_gws: value.created_gws) {
+        os <<  "\n" <<  MODULE_PREFFIX  << "{ " << group_gws.first << " } -> { " << group_gws.second << " }";
+    }
+    os << "]";
+    return os;
+}
+
+inline void encode(const ana_state_t& st,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)st.size(), bl);
+    for (const auto& gr: st) {
+        encode((uint32_t)gr.first, bl);
+        encode((uint32_t)gr.second, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(ana_state_t& st, ceph::buffer::list::const_iterator &bl) {
+    uint32_t n;
+    DECODE_START(1, bl);
+    decode(n, bl);
+    st.resize(n);
+    for (uint32_t i = 0; i < n; i++) {
+        uint32_t a, b;
+        decode(a, bl);
+        decode(b, bl);
+        st[i].first  = (gw_exported_states_per_group_t)a;
+        st[i].second = (epoch_t)b;
+    }
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const GwSubsystems& subsystems,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)subsystems.size(), bl);
+    for (const auto& sub: subsystems) {
+        encode(sub.second.nqn, bl);
+        encode(sub.second.ana_state, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline  void decode(GwSubsystems& subsystems,  ceph::bufferlist::const_iterator& bl) {
+  uint32_t num_subsystems;
+  DECODE_START(1, bl);
+  decode(num_subsystems, bl);
+  subsystems.clear();
+  for (uint32_t i=0; i<num_subsystems; i++) {
+     std::string  nqn;
+     decode(nqn, bl);
+     ana_state_t st;
+     decode(st, bl);
+     subsystems.insert({nqn, NqnState(nqn, st)});
+  }
+  DECODE_FINISH(bl);
+}
+
+inline void encode(const NvmeGwClientState& state,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode(state.group_id, bl);
+    encode(state.gw_map_epoch, bl);
+    encode (state.subsystems, bl);
+    encode((uint32_t)state.availability, bl);
+    ENCODE_FINISH(bl);
+}
+
+inline  void decode(NvmeGwClientState& state,  ceph::bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(state.group_id, bl);
+    decode(state.gw_map_epoch, bl);
+    decode(state.subsystems, bl);
+    uint32_t avail;
+    decode(avail, bl);
+    state.availability = (gw_availability_t)avail;
+    DECODE_FINISH(bl);
+}
+
+inline  void encode(const NvmeGwTimerState& state,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)state.data.size(), bl);
+    for (auto &tm_itr:state.data) {
+        encode((uint32_t)tm_itr.first, bl);// encode key
+        uint32_t tick = tm_itr.second.timer_started;
+        uint8_t  val  = tm_itr.second.timer_value;
+        encode(tick, bl);
+        encode(val,  bl);
+        auto endtime  = tm_itr.second.end_time;
+        // Convert the time point to milliseconds since the epoch
+        uint64_t  millisecondsSinceEpoch = std::chrono::duration_cast<std::chrono::milliseconds>(endtime.time_since_epoch()).count();
+        encode(millisecondsSinceEpoch , bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline  void decode(NvmeGwTimerState& state,  ceph::bufferlist::const_iterator& bl) {
+    uint32_t size;
+    DECODE_START(1, bl);
+    decode(size, bl);
+    for (uint32_t i = 0; i <size; i ++) {
+        uint32_t tm_key;
+        uint32_t tick;
+        uint8_t val;
+        decode(tm_key, bl);
+        decode(tick, bl);
+        decode(val,  bl);
+        Tmdata tm;
+        tm.timer_started = tick;
+        tm.timer_value = val;
+        uint64_t milliseconds;
+        decode(milliseconds, bl);
+        auto duration = std::chrono::milliseconds(milliseconds);
+        tm.end_time = std::chrono::time_point<std::chrono::system_clock>(duration);
+        state.data[tm_key] = tm;
+    }
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const NvmeAnaNonceMap& nonce_map,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)nonce_map.size(), bl);
+    for (auto& ana_group_nonces : nonce_map) {
+        encode(ana_group_nonces.first, bl); // ana group id
+        encode ((uint32_t)ana_group_nonces.second.size(), bl); // encode the vector size
+        for (auto& nonce: ana_group_nonces.second) encode(nonce, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(NvmeAnaNonceMap& nonce_map, ceph::buffer::list::const_iterator &bl) {
+    uint32_t map_size;
+    NvmeAnaGrpId ana_grp_id;
+    uint32_t vector_size;
+    std::string nonce;
+    DECODE_START(1, bl);
+    decode(map_size, bl);
+    for (uint32_t i = 0; i<map_size; i++) {
+        decode(ana_grp_id, bl);
+        decode(vector_size,bl);
+        for (uint32_t j = 0; j < vector_size; j++) {
+            decode (nonce, bl);
+            nonce_map[ana_grp_id].push_back(nonce);
+        }
+    }
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const NvmeGwMonStates& gws,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)gws.size(), bl); // number of gws in the group
+    for (auto& gw : gws) {
+        encode(gw.first, bl);// GW_id
+        encode(gw.second.ana_grp_id, bl); // GW owns this group-id
+        encode((uint32_t)gw.second.sm_state.size(), bl);
+        for (auto &state_it:gw.second.sm_state) {
+            encode((uint32_t)state_it.first, bl); //key of map
+            encode((uint32_t)state_it.second, bl);//value of map
+        }
+        encode((uint32_t)gw.second.availability, bl);
+        encode((uint16_t)gw.second.performed_full_startup, bl);
+        encode((uint16_t)gw.second.last_gw_map_epoch_valid, bl);
+        encode(gw.second.subsystems, bl);
+
+        encode((uint32_t)gw.second.blocklist_data.size(), bl);
+        for (auto &blklst_itr: gw.second.blocklist_data) {
+            encode((uint32_t)blklst_itr.first, bl);
+            encode((uint32_t)blklst_itr.second.osd_epoch, bl);
+            encode((uint32_t)blklst_itr.second.is_failover, bl);
+        }
+        encode(gw.second.nonce_map, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(NvmeGwMonStates& gws, ceph::buffer::list::const_iterator &bl) {
+    gws.clear();
+    uint32_t num_created_gws;
+    DECODE_START(1, bl);
+    decode(num_created_gws, bl);
+
+    for (uint32_t i = 0; i<num_created_gws; i++) {
+        std::string gw_name;
+        decode(gw_name, bl);
+        NvmeAnaGrpId ana_grp_id;
+        decode(ana_grp_id, bl);
+
+        NvmeGwMonState gw_created(ana_grp_id);
+        uint32_t sm_state;
+        uint32_t sm_key;
+        NvmeGwId peer_name;
+        uint32_t size;
+        decode(size, bl);
+        for (uint32_t i = 0; i <size; i ++) {
+            decode(sm_key, bl);
+            decode(sm_state, bl);
+            gw_created.sm_state[sm_key] = ((gw_states_per_group_t)sm_state);
+        }
+        uint32_t avail;
+        decode(avail, bl);
+        gw_created.availability = (gw_availability_t)avail;
+        uint16_t performed_startup;
+        decode(performed_startup, bl);
+        gw_created.performed_full_startup = (bool)performed_startup;
+        uint16_t last_epoch_valid;
+        decode(last_epoch_valid, bl);
+        gw_created.last_gw_map_epoch_valid = (bool)last_epoch_valid;
+        BeaconSubsystems   subsystems;
+        decode(subsystems, bl);
+        gw_created.subsystems = subsystems;
+        decode(size, bl);
+        for (uint32_t i=0; i<size; i++) {
+            uint32_t blklist_key;
+            uint32_t osd_epoch;
+            uint32_t is_failover;
+            decode(blklist_key, bl);
+            decode(osd_epoch,   bl);
+            decode(is_failover, bl);
+            Blocklist_data blst((epoch_t)osd_epoch, (bool)is_failover);
+
+            gw_created.blocklist_data[blklist_key] = blst;
+        }
+        decode(gw_created.nonce_map, bl);
+        gws[gw_name] = gw_created;
+    }
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const std::map<NvmeGroupKey, NvmeGwMonStates>& created_gws,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)created_gws.size(), bl); // number of groups
+    for (auto& group_gws: created_gws) {
+        auto& group_key = group_gws.first;
+        encode(group_key.first, bl); // pool
+        encode(group_key.second, bl); // group
+
+        auto& gws = group_gws.second;
+        encode (gws, bl); // encode group gws
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(std::map<NvmeGroupKey, NvmeGwMonStates>& created_gws, ceph::buffer::list::const_iterator &bl) {
+    created_gws.clear();
+    uint32_t ngroups;
+    DECODE_START(1, bl);
+    decode(ngroups, bl);
+    for (uint32_t i = 0; i<ngroups; i++) {
+        std::string pool, group;
+        decode(pool, bl);
+        decode(group, bl);
+        NvmeGwMonStates cmap;
+        decode(cmap, bl);
+        created_gws[std::make_pair(pool, group)] = cmap;
+    }
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const NvmeGwMonClientStates& subsyst_gwmap,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)subsyst_gwmap.size(), bl);
+    for (auto& subsyst: subsyst_gwmap) {
+        encode(subsyst.first, bl);
+        encode(subsyst.second, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(NvmeGwMonClientStates& subsyst_gwmap, ceph::buffer::list::const_iterator &bl) {
+    subsyst_gwmap.clear();
+    uint32_t num_gws;
+    DECODE_START(1, bl);
+    decode(num_gws, bl);
+
+    for (uint32_t i = 0; i < num_gws; i++) {
+        NvmeGwId gw_id;
+        decode(gw_id, bl);
+        NvmeGwClientState gw_st;
+        decode(gw_st, bl);
+        subsyst_gwmap[gw_id] = gw_st;
+    }
+    DECODE_FINISH(bl);
+}
+
+// Start encode  NvmeGroupKey, GMAP
+inline void encode(const std::map<NvmeGroupKey, NvmeGwMonClientStates>& gmap,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)gmap.size(), bl); // number of groups
+    for (auto& group_state: gmap) {
+        auto& group_key = group_state.first;
+        encode(group_key.first, bl); // pool
+        encode(group_key.second, bl); // group
+        encode(group_state.second, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+// Start decode NvmeGroupKey, NvmeGwMap
+inline void decode(std::map<NvmeGroupKey, NvmeGwMonClientStates>& gmap, ceph::buffer::list::const_iterator &bl) {
+    gmap.clear();
+    uint32_t ngroups;
+    DECODE_START(1, bl);
+    decode(ngroups, bl);
+    for (uint32_t i = 0; i<ngroups; i++) {
+        std::string pool, group;
+        decode(pool, bl);
+        decode(group, bl);
+        NvmeGwMonClientStates grp_map;
+        decode(grp_map, bl);
+        gmap[std::make_pair(pool, group)] = grp_map;
+    }
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const std::map<NvmeGroupKey, NvmeGwTimers>& gmetadata,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)gmetadata.size(), bl); // number of groups
+    for (auto& group_md: gmetadata) {
+        auto& group_key = group_md.first;
+        encode(group_key.first, bl); // pool
+        encode(group_key.second, bl); // group
+
+        encode(group_md.second, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(std::map<NvmeGroupKey, NvmeGwTimers>& gmetadata, ceph::buffer::list::const_iterator &bl) {
+    gmetadata.clear();
+    uint32_t ngroups;
+    DECODE_START(1, bl);
+    decode(ngroups, bl);
+    for (uint32_t i = 0; i<ngroups; i++) {
+        std::string pool, group;
+        decode(pool, bl);
+        decode(group, bl);
+        NvmeGwTimers gmd;
+        decode(gmd, bl);
+        gmetadata[std::make_pair(pool, group)] = gmd;
+    }
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const NvmeGwTimers& group_md,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)group_md.size(), bl); // number of groups
+    for (auto& gw_md: group_md) {
+        encode(gw_md.first, bl); // gw
+        encode(gw_md.second, bl); //  map of this gw
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(NvmeGwTimers& md, ceph::buffer::list::const_iterator &bl) {
+    uint32_t num_gws;
+    DECODE_START(1, bl);
+    decode(num_gws, bl);
+    for (uint32_t i = 0; i < num_gws; i++) {
+        std::string gw_id;
+        decode(gw_id, bl);
+        NvmeGwTimerState gw_meta;
+        decode(gw_meta, bl);
+        md[gw_id] = gw_meta;
+    }
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const BeaconNamespace& ns,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode(ns.anagrpid, bl);
+    encode(ns.nonce, bl);
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(BeaconNamespace& ns, ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(1, bl);
+    decode(ns.anagrpid, bl);
+    decode(ns.nonce, bl);
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const BeaconListener& ls,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode(ls.address_family, bl);
+    encode(ls.address, bl);
+    encode(ls.svcid, bl);
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(BeaconListener& ls, ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(1, bl);
+    decode(ls.address_family, bl);
+    decode(ls.address, bl);
+    decode(ls.svcid, bl);
+    DECODE_FINISH(bl);
+}
+
+inline void encode(const BeaconSubsystem& sub,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode(sub.nqn, bl);
+    encode((uint32_t)sub.listeners.size(), bl);
+    for (const auto& ls: sub.listeners)
+        encode(ls, bl);
+    encode((uint32_t)sub.namespaces.size(), bl);
+    for (const auto& ns: sub.namespaces)
+        encode(ns, bl);
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(BeaconSubsystem& sub, ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(1, bl);
+    decode(sub.nqn, bl);
+    uint32_t s;
+    sub.listeners.clear();
+    decode(s, bl);
+    for (uint32_t i = 0; i < s; i++) {
+        BeaconListener ls;
+        decode(ls, bl);
+        sub.listeners.push_back(ls);
+    }
+
+    sub.namespaces.clear();
+    decode(s, bl);
+    for (uint32_t i = 0; i < s; i++) {
+        BeaconNamespace ns;
+        decode(ns, bl);
+        sub.namespaces.push_back(ns);
+    }
+    DECODE_FINISH(bl);
+}
+
+
+#undef dout_subsys
+#endif /* SRC_MON_NVMEOFGWSERIALIZEP_H_ */
diff --git a/src/mon/NVMeofGwTypes.h b/src/mon/NVMeofGwTypes.h

new file mode 100755 (executable)

index 0000000..5b3989d
--- /dev/null
+++ b/src/mon/NVMeofGwTypes.h
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef MON_NVMEOFGWTYPES_H_
+#define MON_NVMEOFGWTYPES_H_
+#include <string>
+#include <iomanip>
+#include <map>
+#include <iostream>
+
+using NvmeGwId      = std::string;
+using NvmeGroupKey  = std::pair<std::string, std::string>;
+using NvmeNqnId     = std::string;
+using NvmeAnaGrpId  = uint32_t;
+
+
+enum class gw_states_per_group_t {
+    GW_IDLE_STATE = 0, //invalid state
+    GW_STANDBY_STATE,
+    GW_ACTIVE_STATE,
+    GW_OWNER_WAIT_FAILBACK_PREPARED,
+    GW_WAIT_FAILBACK_PREPARED,
+    GW_WAIT_BLOCKLIST_CMPL
+};
+
+enum class gw_exported_states_per_group_t {
+    GW_EXPORTED_OPTIMIZED_STATE = 0,
+    GW_EXPORTED_INACCESSIBLE_STATE
+};
+
+enum class gw_availability_t {
+    GW_CREATED = 0,
+    GW_AVAILABLE,
+    GW_UNAVAILABLE,
+    GW_DELETED
+};
+
+#define REDUNDANT_GW_ANA_GROUP_ID 0xFF
+using SmState  = std::map < NvmeAnaGrpId, gw_states_per_group_t>;
+
+using ana_state_t = std::vector<std::pair<gw_exported_states_per_group_t, epoch_t>>;
+
+struct BeaconNamespace {
+    NvmeAnaGrpId anagrpid;
+    std::string  nonce;
+
+    // Define the equality operator
+    bool operator==(const BeaconNamespace& other) const {
+        return anagrpid == other.anagrpid &&
+               nonce == other.nonce;
+    }
+};
+
+// Beacon Listener represents an NVME Subsystem listener,
+// which generally does not have to use TCP/IP.
+// It is derived from the SPDK listener JSON RPC representation.
+// For more details, see https://spdk.io/doc/jsonrpc.html#rpc_nvmf_listen_address.
+struct BeaconListener {
+    std::string address_family; // IPv4 or IPv6
+    std::string address;        //
+    std::string svcid;          // port
+
+    // Define the equality operator
+    bool operator==(const BeaconListener& other) const {
+        return address_family == other.address_family &&
+               address == other.address &&
+               svcid == other.svcid;
+    }
+};
+
+struct BeaconSubsystem {
+    NvmeNqnId nqn;
+    std::list<BeaconListener>  listeners;
+    std::list<BeaconNamespace> namespaces;
+
+    // Define the equality operator
+    bool operator==(const BeaconSubsystem& other) const {
+        return nqn == other.nqn &&
+               listeners == other.listeners &&
+               namespaces == other.namespaces;
+    }
+};
+
+using BeaconSubsystems = std::list<BeaconSubsystem>;
+
+using NvmeNonceVector    = std::vector<std::string>;
+using NvmeAnaNonceMap  = std::map <NvmeAnaGrpId, NvmeNonceVector>;
+
+struct Blocklist_data{
+   epoch_t     osd_epoch;
+   bool        is_failover;
+   Blocklist_data() {
+       osd_epoch = 0;
+       is_failover = true;
+   };
+   Blocklist_data(epoch_t epoch, bool failover):osd_epoch(epoch), is_failover(failover) {};
+};
+
+using BlocklistData    = std::map < NvmeAnaGrpId, Blocklist_data>;
+
+struct NvmeGwMonState {
+    NvmeAnaGrpId       ana_grp_id;                    // ana-group-id allocated for this GW, GW owns this group-id
+    gw_availability_t  availability;                  // in absence of  beacon  heartbeat messages it becomes inavailable
+    bool               last_gw_map_epoch_valid;       // "true" if the last epoch seen by the gw-client is up-to-date
+    bool               performed_full_startup;        // in order to identify gws that did not exit upon failover
+    BeaconSubsystems   subsystems;                    // gateway susbsystem and their state machine states
+    NvmeAnaNonceMap    nonce_map;
+    SmState           sm_state;                      // state machine states per ANA group
+    BlocklistData      blocklist_data;
+
+    NvmeGwMonState(): ana_grp_id(REDUNDANT_GW_ANA_GROUP_ID) {};
+
+    NvmeGwMonState(NvmeAnaGrpId id): ana_grp_id(id), availability(gw_availability_t::GW_CREATED), last_gw_map_epoch_valid(false),
+                                    performed_full_startup(false) {};
+    void set_unavailable_state() {
+        availability = gw_availability_t::GW_UNAVAILABLE;
+        performed_full_startup = false; // after setting this state the next time monitor sees GW, it expects it performed the full startup
+    }
+    void standby_state(NvmeAnaGrpId grpid) {
+           sm_state[grpid]       = gw_states_per_group_t::GW_STANDBY_STATE;
+    };
+    void active_state(NvmeAnaGrpId grpid) {
+           sm_state[grpid]       = gw_states_per_group_t::GW_ACTIVE_STATE;
+           blocklist_data[grpid].osd_epoch = 0;
+    };
+};
+
+struct NqnState {
+    std::string   nqn;          // subsystem NQN
+    ana_state_t     ana_state;    // subsystem's ANA state
+
+    // constructors
+    NqnState(const std::string& _nqn, const ana_state_t& _ana_state):
+        nqn(_nqn), ana_state(_ana_state)  {}
+    NqnState(const std::string& _nqn, const SmState& sm_state, const NvmeGwMonState & gw_created) : nqn(_nqn)  {
+        uint32_t i = 0;
+        for (auto& state_itr: sm_state) {
+            if (state_itr.first > i) {
+                uint32_t num_to_add = state_itr.first - i;
+                for (uint32_t j = 0; j<num_to_add; j++) { // add fake elements to the ana_state in order to preserve vector index == correct ana_group_id
+                    std::pair<gw_exported_states_per_group_t, epoch_t> state_pair;
+                    state_pair.first = gw_exported_states_per_group_t::GW_EXPORTED_INACCESSIBLE_STATE;
+                    state_pair.second = 0;
+                    ana_state.push_back(state_pair);
+                }
+                i += num_to_add;
+            }
+            std::pair<gw_exported_states_per_group_t, epoch_t> state_pair;
+            state_pair.first = (sm_state.at(state_itr.first) == gw_states_per_group_t::GW_ACTIVE_STATE
+                    || sm_state.at(state_itr.first) == gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)
+                               ? gw_exported_states_per_group_t::GW_EXPORTED_OPTIMIZED_STATE
+                                       : gw_exported_states_per_group_t::GW_EXPORTED_INACCESSIBLE_STATE;
+            state_pair.second = gw_created.blocklist_data.at(state_itr.first).osd_epoch;
+            ana_state.push_back(state_pair);
+            i ++;
+        }
+    }
+};
+
+typedef std::map<NvmeNqnId, NqnState> GwSubsystems;
+
+struct NvmeGwClientState {
+    NvmeAnaGrpId              group_id;
+    epoch_t                   gw_map_epoch;
+    GwSubsystems              subsystems;
+    gw_availability_t         availability;
+    NvmeGwClientState(NvmeAnaGrpId id, epoch_t epoch, gw_availability_t available):
+        group_id(id),
+        gw_map_epoch(epoch),
+        availability(available)
+    {};
+
+    NvmeGwClientState() : NvmeGwClientState(REDUNDANT_GW_ANA_GROUP_ID, 0, gw_availability_t::GW_UNAVAILABLE) {};
+};
+
+
+struct Tmdata{
+   uint32_t     timer_started; // statemachine timer(timestamp) set in some state
+   uint8_t      timer_value;
+   std::chrono::system_clock::time_point end_time;
+    Tmdata() {
+       timer_started = 0;
+       timer_value   = 0;
+   }
+};
+
+using TmData = std::map < NvmeAnaGrpId, Tmdata>;
+
+struct NvmeGwTimerState {
+    TmData data;
+    NvmeGwTimerState() {};
+};
+
+using NvmeGwMonClientStates      = std::map<NvmeGwId, NvmeGwClientState>;
+using NvmeGwTimers               = std::map<NvmeGwId, NvmeGwTimerState>;
+using NvmeGwMonStates            = std::map<NvmeGwId, NvmeGwMonState>;
+
+#endif /* SRC_MON_NVMEOFGWTYPES_H_ */
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h

index 3429a8e9991629c68daf474fbc0a1a07f5596c2e..9dd2797852d41cfda6f86c1463fe821e22835d55 100644 (file)
--- a/src/mon/mon_types.h
+++ b/src/mon/mon_types.h
@@ -36,6 +36,7 @@ enum {
    PAXOS_HEALTH,
    PAXOS_CONFIG,
    PAXOS_KV,
+  PAXOS_NVMEGW,
    PAXOS_NUM
  };
  
diff --git a/src/msg/Message.cc b/src/msg/Message.cc

index 22208d2d1f4280ed5bd6c33a4774d34e1338a6e5..f649e0f3d3ee223e01fadf8d166658af6ae24437 100644 (file)
--- a/src/msg/Message.cc
+++ b/src/msg/Message.cc
@@ -219,6 +219,9 @@
  #include "messages/MOSDPGUpdateLogMissing.h"
  #include "messages/MOSDPGUpdateLogMissingReply.h"
  
+#include "messages/MNVMeofGwBeacon.h"
+#include "messages/MNVMeofGwMap.h"
+
  #ifdef WITH_BLKIN
  #include "Messenger.h"
  #endif
@@ -885,6 +888,10 @@ Message *decode_message(CephContext *cct,
      m = make_message<MMgrBeacon>();
      break;
  
+  case MSG_MNVMEOF_GW_BEACON:
+    m = make_message<MNVMeofGwBeacon>();
+  break;
+
    case MSG_MON_MGR_REPORT:
      m = make_message<MMonMgrReport>();
      break;
@@ -944,6 +951,9 @@ Message *decode_message(CephContext *cct,
      m = make_message<MMonHealthChecks>();
      break;
  
+  case MSG_MNVMEOF_GW_MAP:
+    m = make_message<MNVMeofGwMap>();
+    break;
      // -- simple messages without payload --
  
    case CEPH_MSG_SHUTDOWN:
diff --git a/src/msg/Message.h b/src/msg/Message.h

index 15eb3feadcede30de49661fd0a05017e2657c5c2..78557f90e48f07ea15ef217140c68f9eaa180ffd 100644 (file)
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -242,6 +242,12 @@
  // *** ceph-mgr <-> MON daemons ***
  #define MSG_MGR_UPDATE     0x70b
  
+// *** nvmeof mon -> gw daemons ***
+#define MSG_MNVMEOF_GW_MAP        0x800
+
+// *** gw daemons -> nvmeof mon  ***
+#define MSG_MNVMEOF_GW_BEACON     0x801
+
  // ======================================================
  
  // abstract Message class
diff --git a/src/nvmeof/NVMeofGwClient.cc b/src/nvmeof/NVMeofGwClient.cc

new file mode 100644 (file)

index 0000000..c82423d
--- /dev/null
+++ b/src/nvmeof/NVMeofGwClient.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include "NVMeofGwClient.h"
+
+bool NVMeofGwClient::get_subsystems(subsystems_info& reply) {
+  get_subsystems_req request;
+  ClientContext context;
+
+  Status status = stub_->get_subsystems(&context, request, &reply);
+
+  return status.ok();
+}
+
+bool NVMeofGwClient::set_ana_state(const ana_info& info) {
+  req_status reply;
+  ClientContext context;
+
+  Status status = stub_->set_ana_state(&context, info, &reply);
+
+  return status.ok() && reply.status();
+}
diff --git a/src/nvmeof/NVMeofGwClient.h b/src/nvmeof/NVMeofGwClient.h

new file mode 100644 (file)

index 0000000..0224852
--- /dev/null
+++ b/src/nvmeof/NVMeofGwClient.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#ifndef  __NVMEOFGWCLIENT_H__
+#define  __NVMEOFGWCLIENT_H__
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include <grpcpp/grpcpp.h>
+
+#include "gateway.grpc.pb.h"
+
+using grpc::Channel;
+using grpc::ClientContext;
+using grpc::Status;
+
+class NVMeofGwClient {
+ public:
+  NVMeofGwClient(std::shared_ptr<Channel> channel)
+      : stub_(Gateway::NewStub(channel)) {}
+
+  bool get_subsystems(subsystems_info& reply);
+  bool set_ana_state(const ana_info& info);
+
+ private:
+  std::unique_ptr<Gateway::Stub> stub_;
+};
+#endif
diff --git a/src/nvmeof/NVMeofGwMonitorClient.cc b/src/nvmeof/NVMeofGwMonitorClient.cc

new file mode 100644 (file)

index 0000000..fc4358f
--- /dev/null
+++ b/src/nvmeof/NVMeofGwMonitorClient.cc
@@ -0,0 +1,451 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023,2024 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include <boost/algorithm/string/replace.hpp>
+
+#include "common/errno.h"
+#include "common/signal.h"
+#include "common/ceph_argparse.h"
+#include "include/compat.h"
+
+#include "include/stringify.h"
+#include "global/global_context.h"
+#include "global/signal_handler.h"
+
+
+#include "messages/MNVMeofGwBeacon.h"
+#include "messages/MNVMeofGwMap.h"
+#include "NVMeofGwMonitorClient.h"
+#include "NVMeofGwClient.h"
+#include "NVMeofGwMonitorGroupClient.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix *_dout << "nvmeofgw " << __PRETTY_FUNCTION__ << " "
+
+NVMeofGwMonitorClient::NVMeofGwMonitorClient(int argc, const char **argv) :
+  Dispatcher(g_ceph_context),
+  osdmap_epoch(0),
+  gwmap_epoch(0),
+  last_map_time(std::chrono::steady_clock::now()),
+  monc{g_ceph_context, poolctx},
+  client_messenger(Messenger::create(g_ceph_context, "async", entity_name_t::CLIENT(-1), "client", getpid())),
+  objecter{g_ceph_context, client_messenger.get(), &monc, poolctx},
+  client{client_messenger.get(), &monc, &objecter},
+  timer(g_ceph_context, lock),
+  orig_argc(argc),
+  orig_argv(argv)
+{
+}
+
+NVMeofGwMonitorClient::~NVMeofGwMonitorClient() = default;
+
+const char** NVMeofGwMonitorClient::get_tracked_conf_keys() const
+{
+  static const char* KEYS[] = {
+    NULL
+  };
+  return KEYS;
+}
+
+std::string read_file(const std::string& filename) {
+    std::ifstream file(filename);
+    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+    return content;
+}
+
+void NVMeofGwMonitorClient::init_gw_ssl_opts()
+{
+  if (server_cert.empty() && client_key.empty() && client_cert.empty())
+    return;
+
+  // load the certificates content
+  // create SSL/TLS credentials
+  gw_ssl_opts.pem_root_certs = read_file(server_cert);
+  gw_ssl_opts.pem_private_key = read_file(client_key);
+  gw_ssl_opts.pem_cert_chain = read_file(client_cert);
+}
+
+std::shared_ptr<grpc::ChannelCredentials> NVMeofGwMonitorClient::gw_creds()
+{
+  // use insecure channel if no keys/certs defined
+  if (server_cert.empty() && client_key.empty() && client_cert.empty())
+    return grpc::InsecureChannelCredentials();
+  else
+    return grpc::SslCredentials(gw_ssl_opts);
+}
+
+int NVMeofGwMonitorClient::init()
+{
+  dout(10) << dendl;
+  std::string val;
+  auto args = argv_to_vec(orig_argc, orig_argv);
+
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    } else if (ceph_argparse_witharg(args, i, &val, "--gateway-name", (char*)NULL)) {
+      name = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--gateway-pool", (char*)NULL)) {
+      pool = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--gateway-group", (char*)NULL)) {
+      group = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--gateway-address", (char*)NULL)) {
+      gateway_address = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--monitor-group-address", (char*)NULL)) {
+      monitor_address = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--server-cert", (char*)NULL)) {
+      server_cert = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--client-key", (char*)NULL)) {
+      client_key = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--client-cert", (char*)NULL)) {
+      client_cert = val;
+    } else {
+      ++i;
+    }
+  }
+
+  dout(10) << "gateway name: " << name <<
+    " pool:" << pool <<
+    " group:" << group <<
+    " address: " << gateway_address << dendl;
+  ceph_assert(name != "" && pool != "" && gateway_address != "" && monitor_address != "");
+
+  // ensures that either all are empty or all are non-empty.
+  ceph_assert((server_cert.empty() == client_key.empty()) && (client_key.empty() == client_cert.empty()));
+  init_gw_ssl_opts();
+
+  init_async_signal_handler();
+  register_async_signal_handler(SIGHUP, sighup_handler);
+
+  std::lock_guard l(lock);
+
+  // Initialize Messenger
+  client_messenger->add_dispatcher_tail(this);
+  client_messenger->add_dispatcher_head(&objecter);
+  client_messenger->add_dispatcher_tail(&client);
+  client_messenger->start();
+
+  poolctx.start(2);
+
+  // Initialize MonClient
+  if (monc.build_initial_monmap() < 0) {
+    client_messenger->shutdown();
+    client_messenger->wait();
+    return -1;
+  }
+
+  monc.sub_want("NVMeofGw", 0, 0);
+  monc.set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD
+      |CEPH_ENTITY_TYPE_MDS|CEPH_ENTITY_TYPE_MGR);
+  monc.set_messenger(client_messenger.get());
+
+  // We must register our config callback before calling init(), so
+  // that we see the initial configuration message
+  monc.register_config_callback([this](const std::string &k, const std::string &v){
+      // leaving this for debugging purposes
+      dout(10) << "nvmeof config_callback: " << k << " : " << v << dendl;
+      
+      return false;
+    });
+  monc.register_config_notify_callback([this]() {
+      dout(4) << "nvmeof monc config notify callback" << dendl;
+    });
+  dout(4) << "nvmeof Registered monc callback" << dendl;
+
+  int r = monc.init();
+  if (r < 0) {
+    monc.shutdown();
+    client_messenger->shutdown();
+    client_messenger->wait();
+    return r;
+  }
+  dout(10) << "nvmeof Registered monc callback" << dendl;
+
+  r = monc.authenticate();
+  if (r < 0) {
+    derr << "Authentication failed, did you specify an ID with a valid keyring?" << dendl;
+    monc.shutdown();
+    client_messenger->shutdown();
+    client_messenger->wait();
+    return r;
+  }
+  dout(10) << "monc.authentication done" << dendl;
+  monc.set_passthrough_monmap();
+
+  client_t whoami = monc.get_global_id();
+  client_messenger->set_myname(entity_name_t::MGR(whoami.v));
+  objecter.set_client_incarnation(0);
+  objecter.init();
+  objecter.enable_blocklist_events();
+  objecter.start();
+  client.init();
+  timer.init();
+
+  tick();
+
+  dout(10) << "Complete." << dendl;
+  return 0;
+}
+
+static bool get_gw_state(const char* desc, const std::map<NvmeGroupKey, NvmeGwMonClientStates>& m, const NvmeGroupKey& group_key, const NvmeGwId& gw_id, NvmeGwClientState& out)
+{
+  auto gw_group = m.find(group_key);
+  if (gw_group == m.end()) {
+    dout(10) << "can not find group (" << group_key.first << "," << group_key.second << ") "  << desc << " map: " << m << dendl;
+    return false;
+  }
+  auto gw_state = gw_group->second.find(gw_id);
+  if (gw_state == gw_group->second.end()) {
+    dout(10) << "can not find gw id: " << gw_id << " in " << desc << "group: " << gw_group->second  << dendl;
+    return false;
+  }
+  out = gw_state->second;
+  return true;
+}
+
+void NVMeofGwMonitorClient::send_beacon()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+  gw_availability_t gw_availability = gw_availability_t::GW_CREATED;
+  BeaconSubsystems subs;
+  NVMeofGwClient gw_client(
+     grpc::CreateChannel(gateway_address, gw_creds()));
+  subsystems_info gw_subsystems;
+  bool ok = gw_client.get_subsystems(gw_subsystems);
+  if (ok) {
+    for (int i = 0; i < gw_subsystems.subsystems_size(); i++) {
+      const subsystem& sub = gw_subsystems.subsystems(i);
+      BeaconSubsystem bsub;
+      bsub.nqn = sub.nqn();
+      for (int j = 0; j < sub.namespaces_size(); j++) {
+        const auto& ns = sub.namespaces(j);
+        BeaconNamespace bns = {ns.anagrpid(), ns.nonce()};
+        bsub.namespaces.push_back(bns);
+      }
+      for (int k = 0; k < sub.listen_addresses_size(); k++) {
+        const auto& ls = sub.listen_addresses(k);
+        BeaconListener bls = { ls.adrfam(), ls.traddr(), ls.trsvcid() };
+        bsub.listeners.push_back(bls);
+      }
+      subs.push_back(bsub);
+    }
+  }
+
+  auto group_key = std::make_pair(pool, group);
+  NvmeGwClientState old_gw_state;
+  // if already got gateway state in the map
+  if (get_gw_state("old map", map, group_key, name, old_gw_state))
+    gw_availability = ok ? gw_availability_t::GW_AVAILABLE : gw_availability_t::GW_UNAVAILABLE;
+  dout(10) << "sending beacon as gid " << monc.get_global_id() << " availability " << (int)gw_availability <<
+    " osdmap_epoch " << osdmap_epoch << " gwmap_epoch " << gwmap_epoch << dendl;
+  auto m = ceph::make_message<MNVMeofGwBeacon>(
+      name,
+      pool,
+      group,
+      subs,
+      gw_availability,
+      osdmap_epoch,
+      gwmap_epoch);
+  monc.send_mon_message(std::move(m));
+}
+
+void NVMeofGwMonitorClient::disconnect_panic()
+{
+  auto disconnect_panic_duration = g_conf().get_val<std::chrono::seconds>("nvmeof_mon_client_disconnect_panic").count();
+  auto now = std::chrono::steady_clock::now();
+  auto elapsed_seconds = std::chrono::duration_cast<std::chrono::seconds>(now - last_map_time).count();
+  if (elapsed_seconds > disconnect_panic_duration) {
+    dout(4) << "Triggering a panic upon disconnection from the monitor, elapsed " << elapsed_seconds << ", configured disconnect panic duration " << disconnect_panic_duration << dendl;
+    throw std::runtime_error("Lost connection to the monitor (beacon timeout).");
+  }
+}
+
+void NVMeofGwMonitorClient::tick()
+{
+  dout(10) << dendl;
+
+  disconnect_panic();
+  send_beacon();
+
+  timer.add_event_after(
+      g_conf().get_val<std::chrono::seconds>("nvmeof_mon_client_tick_period").count(),
+      new LambdaContext([this](int r){
+          tick();
+      }
+  ));
+}
+
+void NVMeofGwMonitorClient::shutdown()
+{
+  std::lock_guard l(lock);
+
+  dout(4) << "nvmeof Shutting down" << dendl;
+
+
+  // stop sending beacon first, I use monc to talk with monitors
+  timer.shutdown();
+  // client uses monc and objecter
+  client.shutdown();
+  // Stop asio threads, so leftover events won't call into shut down
+  // monclient/objecter.
+  poolctx.finish();
+  // stop monc
+  monc.shutdown();
+
+  // objecter is used by monc
+  objecter.shutdown();
+  // client_messenger is used by all of them, so stop it in the end
+  client_messenger->shutdown();
+}
+
+void NVMeofGwMonitorClient::handle_nvmeof_gw_map(ceph::ref_t<MNVMeofGwMap> nmap)
+{
+  last_map_time = std::chrono::steady_clock::now(); // record time of last monitor message
+
+  auto &new_map = nmap->get_map();
+  gwmap_epoch = nmap->get_gwmap_epoch();
+  auto group_key = std::make_pair(pool, group);
+  dout(10) << "handle nvmeof gw map: " << new_map << dendl;
+
+  NvmeGwClientState old_gw_state;
+  auto got_old_gw_state = get_gw_state("old map", map, group_key, name, old_gw_state); 
+  NvmeGwClientState new_gw_state;
+  auto got_new_gw_state = get_gw_state("new map", new_map, group_key, name, new_gw_state); 
+
+  // ensure that the gateway state has not vanished
+  ceph_assert(got_new_gw_state || !got_old_gw_state);
+
+  if (!got_old_gw_state) {
+    if (!got_new_gw_state) {
+      dout(10) << "Can not find new gw state" << dendl;
+      return;
+    }
+    bool set_group_id = false;
+    while (!set_group_id) {
+      NVMeofGwMonitorGroupClient monitor_group_client(
+          grpc::CreateChannel(monitor_address, gw_creds()));
+      dout(10) << "GRPC set_group_id: " <<  new_gw_state.group_id << dendl;
+      set_group_id = monitor_group_client.set_group_id( new_gw_state.group_id);
+      if (!set_group_id) {
+             dout(10) << "GRPC set_group_id failed" << dendl;
+             auto retry_timeout = g_conf().get_val<uint64_t>("mon_nvmeofgw_set_group_id_retry");
+             usleep(retry_timeout);
+      }
+    }
+  }
+
+  if (got_old_gw_state && got_new_gw_state) {
+    dout(10) << "got_old_gw_state: " << old_gw_state << "got_new_gw_state: " << new_gw_state << dendl;
+    // Make sure we do not get out of order state changes from the monitor
+    ceph_assert(new_gw_state.gw_map_epoch >= old_gw_state.gw_map_epoch);
+
+    // If the monitor previously identified this gateway as accessible but now
+    // flags it as unavailable, it suggests that the gateway lost connection
+    // to the monitor.
+    if (old_gw_state.availability == gw_availability_t::GW_AVAILABLE &&
+       new_gw_state.availability == gw_availability_t::GW_UNAVAILABLE) {
+      dout(4) << "Triggering a panic upon disconnection from the monitor, gw state - unavailable" << dendl;
+      throw std::runtime_error("Lost connection to the monitor (gw map unavailable).");
+    }
+  }
+
+  // Gather all state changes
+  ana_info ai;
+  epoch_t max_blocklist_epoch = 0;
+  for (const auto& nqn_state_pair: new_gw_state.subsystems) {
+    auto& sub = nqn_state_pair.second;
+    const auto& nqn = nqn_state_pair.first;
+    nqn_ana_states nas;
+    nas.set_nqn(nqn);
+    const auto& old_nqn_state_pair = old_gw_state.subsystems.find(nqn);
+    auto found_old_nqn_state = (old_nqn_state_pair != old_gw_state.subsystems.end());
+
+    // old and new ana group id ranges could be different
+    auto ana_state_size = (found_old_nqn_state) ?
+       std::max(old_nqn_state_pair->second.ana_state.size(), sub.ana_state.size()) :
+       sub.ana_state.size();
+
+    for (NvmeAnaGrpId  ana_grp_index = 0; ana_grp_index < ana_state_size; ana_grp_index++) {
+      const auto initial_ana_state = std::make_pair(gw_exported_states_per_group_t::GW_EXPORTED_INACCESSIBLE_STATE, (epoch_t)0);
+      auto new_group_state = (ana_grp_index < sub.ana_state.size()) ?
+       sub.ana_state[ana_grp_index] :
+       initial_ana_state;
+      auto old_group_state = (got_old_gw_state && found_old_nqn_state && ana_grp_index < old_nqn_state_pair->second.ana_state.size()) ?
+        old_nqn_state_pair->second.ana_state[ana_grp_index] :
+       initial_ana_state;
+
+      // if no state change detected for this nqn, group id
+      if (new_group_state.first == old_group_state.first) {
+        continue;
+      }
+      ana_group_state gs;
+      gs.set_grp_id(ana_grp_index + 1); // offset by 1, index 0 is ANAGRP1
+      const auto& new_agroup_state = new_group_state.first;
+      const epoch_t& blocklist_epoch = new_group_state.second;
+
+      if (new_agroup_state == gw_exported_states_per_group_t::GW_EXPORTED_OPTIMIZED_STATE &&
+          blocklist_epoch != 0) {
+        if (blocklist_epoch > max_blocklist_epoch) max_blocklist_epoch = blocklist_epoch;
+      }
+      gs.set_state(new_agroup_state == gw_exported_states_per_group_t::GW_EXPORTED_OPTIMIZED_STATE ? OPTIMIZED : INACCESSIBLE); // Set the ANA state
+      nas.mutable_states()->Add(std::move(gs));
+      dout(10) << " grpid " << (ana_grp_index + 1) << " state: " << new_gw_state << dendl;
+    }
+    if (nas.states_size()) ai.mutable_states()->Add(std::move(nas));
+  }
+
+  // if there is state change, notify the gateway
+  if (ai.states_size()) {
+    bool set_ana_state = false;
+    while (!set_ana_state) {
+      NVMeofGwClient gw_client(
+          grpc::CreateChannel(gateway_address, gw_creds()));
+      set_ana_state = gw_client.set_ana_state(ai);
+      if (!set_ana_state) {
+       dout(10) << "GRPC set_ana_state failed" << dendl;
+       usleep(1000); // TODO conf option
+      }
+    }
+    // Update latest accepted osdmap epoch, for beacons
+    if (max_blocklist_epoch > osdmap_epoch) {
+      osdmap_epoch = max_blocklist_epoch;
+      dout(10) << "Ready for blocklist osd map epoch: " << osdmap_epoch << dendl;
+    }
+  }
+  map = new_map;
+}
+
+bool NVMeofGwMonitorClient::ms_dispatch2(const ref_t<Message>& m)
+{
+  std::lock_guard l(lock);
+  dout(10) << "got map type " << m->get_type() << dendl;
+
+  if (m->get_type() == MSG_MNVMEOF_GW_MAP) {
+    handle_nvmeof_gw_map(ref_cast<MNVMeofGwMap>(m));
+  }
+  bool handled = false;
+  return handled;
+}
+
+int NVMeofGwMonitorClient::main(std::vector<const char *> args)
+{
+  client_messenger->wait();
+
+  // Disable signal handlers
+  unregister_async_signal_handler(SIGHUP, sighup_handler);
+  shutdown_async_signal_handler();
+
+  return 0;
+}
diff --git a/src/nvmeof/NVMeofGwMonitorClient.h b/src/nvmeof/NVMeofGwMonitorClient.h

new file mode 100644 (file)

index 0000000..5bcca91
--- /dev/null
+++ b/src/nvmeof/NVMeofGwMonitorClient.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023,2024 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#ifndef NVMEOFGWMONITORCLIENT_H_
+#define NVMEOFGWMONITORCLIENT_H_
+
+#include "auth/Auth.h"
+#include "common/async/context_pool.h"
+#include "common/Finisher.h"
+#include "common/Timer.h"
+#include "common/LogClient.h"
+
+#include "client/Client.h"
+#include "mon/MonClient.h"
+#include "osdc/Objecter.h"
+#include "messages/MNVMeofGwMap.h"
+
+#include <grpcpp/grpcpp.h>
+#include <grpcpp/security/credentials.h>
+
+class NVMeofGwMonitorClient: public Dispatcher,
+                  public md_config_obs_t {
+private:
+  std::string name;
+  std::string pool;
+  std::string group;
+  std::string gateway_address;
+  std::string monitor_address;
+  std::string server_cert;
+  std::string client_key;
+  std::string client_cert;
+  grpc::SslCredentialsOptions
+              gw_ssl_opts;  // gateway grpc ssl options
+  epoch_t     osdmap_epoch; // last awaited osdmap_epoch
+  epoch_t     gwmap_epoch;  // last received gw map epoch
+  std::chrono::time_point<std::chrono::steady_clock>
+              last_map_time; // used to panic on disconnect
+
+  // init gw ssl opts
+  void init_gw_ssl_opts();
+
+  // returns gateway grpc credentials
+  std::shared_ptr<grpc::ChannelCredentials> gw_creds();
+
+protected:
+  ceph::async::io_context_pool poolctx;
+  MonClient monc;
+  std::unique_ptr<Messenger> client_messenger;
+  Objecter objecter;
+  Client client;
+  std::map<NvmeGroupKey, NvmeGwMonClientStates> map;
+  ceph::mutex lock = ceph::make_mutex("NVMeofGw::lock");
+  SafeTimer timer;
+
+  int orig_argc;
+  const char **orig_argv;
+
+  void send_config_beacon(); 
+  void send_beacon();
+ 
+public:
+  NVMeofGwMonitorClient(int argc, const char **argv);
+  ~NVMeofGwMonitorClient() override;
+
+  // Dispatcher interface
+  bool ms_dispatch2(const ceph::ref_t<Message>& m) override;
+  bool ms_handle_reset(Connection *con) override { return false; }
+  void ms_handle_remote_reset(Connection *con) override {}
+  bool ms_handle_refused(Connection *con) override { return false; };
+
+  // config observer bits
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                         const std::set <std::string> &changed) override {};
+
+  int init();
+  void shutdown();
+  int main(std::vector<const char *> args);
+  void tick();
+  void disconnect_panic();
+
+  void handle_nvmeof_gw_map(ceph::ref_t<MNVMeofGwMap> m);
+};
+
+#endif
+
diff --git a/src/nvmeof/NVMeofGwMonitorGroupClient.cc b/src/nvmeof/NVMeofGwMonitorGroupClient.cc

new file mode 100644 (file)

index 0000000..27ed7b1
--- /dev/null
+++ b/src/nvmeof/NVMeofGwMonitorGroupClient.cc
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include "NVMeofGwMonitorGroupClient.h"
+
+bool NVMeofGwMonitorGroupClient::set_group_id(const uint32_t& id) {
+  group_id_req request;
+  request.set_id(id);
+  google::protobuf::Empty reply;
+  ClientContext context;
+
+  Status status = stub_->group_id(&context, request, &reply);
+
+  return status.ok();
+}
diff --git a/src/nvmeof/NVMeofGwMonitorGroupClient.h b/src/nvmeof/NVMeofGwMonitorGroupClient.h

new file mode 100644 (file)

index 0000000..805e182
--- /dev/null
+++ b/src/nvmeof/NVMeofGwMonitorGroupClient.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#ifndef  __NVMEOFGWMONITORGROUPCLIENT_H__
+#define  __NVMEOFGWMONITORGROUPCLIENT_H__
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include <grpcpp/grpcpp.h>
+
+#include "monitor.grpc.pb.h"
+
+using grpc::Channel;
+using grpc::ClientContext;
+using grpc::Status;
+
+class NVMeofGwMonitorGroupClient {
+ public:
+  NVMeofGwMonitorGroupClient(std::shared_ptr<Channel> channel)
+      : stub_(MonitorGroup::NewStub(channel)) {}
+
+  bool set_group_id(const uint32_t& id);
+
+ private:
+  std::unique_ptr<MonitorGroup::Stub> stub_;
+};
+#endif
diff --git a/src/nvmeof/gateway b/src/nvmeof/gateway

new file mode 160000 (submodule)

index 0000000..322a86f
--- /dev/null
+++ b/src/nvmeof/gateway
@@ -0,0 +1 @@
+Subproject commit 322a86f7348af1bc173f01e6cc4b64e9a8075727
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py

index ac258887f6a510174e9ba7ed59e7ba26132aefff..9f9ba94557b3465464a10a440508fab351757e39 100644 (file)
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -21,6 +21,9 @@ class NvmeofService(CephService):
      def config(self, spec: NvmeofServiceSpec) -> None:  # type: ignore
          assert self.TYPE == spec.service_type
          assert spec.pool
+        self.pool = spec.pool
+        assert spec.group is not None
+        self.group = spec.group
          self.mgr._check_pool_exists(spec.pool, spec.service_name())
  
      def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
@@ -77,8 +80,36 @@ class NvmeofService(CephService):
  
          daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
          daemon_spec.deps = []
+        if not hasattr(self, 'gws'):
+            self.gws = {} # id -> name map of gateways for this service.
+        self.gws[nvmeof_gw_id] = name # add to map of service's gateway names
          return daemon_spec
  
+    def daemon_check_post(self, daemon_descrs: List[DaemonDescription]) -> None:
+        """ Overrides the daemon_check_post to add nvmeof gateways safely
+        """
+        self.mgr.log.info(f"nvmeof daemon_check_post {daemon_descrs}")
+        # Assert configured
+        assert self.pool
+        assert self.group is not None
+        for dd in daemon_descrs:
+            self.mgr.log.info(f"nvmeof daemon_descr {dd}")
+            assert dd.daemon_id in self.gws
+            name = self.gws[dd.daemon_id]
+            self.mgr.log.info(f"nvmeof daemon name={name}")
+            # Notify monitor about this gateway creation
+            cmd = {
+                'prefix': 'nvme-gw create',
+                'id': name,
+                'group': self.group,
+                'pool': self.pool
+            }
+            self.mgr.log.info(f"create gateway: monitor command {cmd}")
+            _, _, err = self.mgr.mon_command(cmd)
+            if err:
+                self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
+        super().daemon_check_post(daemon_descrs)
+
      def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
          def get_set_cmd_dicts(out: str) -> List[dict]:
              gateways = json.loads(out)['gateways']
@@ -151,10 +182,41 @@ class NvmeofService(CephService):
          if not ret:
              logger.info(f'{daemon.hostname} removed from nvmeof gateways dashboard config')
  
-        # and any certificates being used for mTLS
+        # Assert configured
+        assert self.pool
+        assert self.group is not None
+        assert daemon.daemon_id in self.gws
+        name = self.gws[daemon.daemon_id]
+        self.gws.pop(daemon.daemon_id)
+        # Notify monitor about this gateway deletion
+        cmd = {
+            'prefix': 'nvme-gw delete',
+            'id': name,
+            'group': self.group,
+            'pool': self.pool
+        }
+        self.mgr.log.info(f"delete gateway: monitor command {cmd}")
+        _, _, err = self.mgr.mon_command(cmd)
+        if err:
+            self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
  
      def purge(self, service_name: str) -> None:
-        """Removes configuration
+        """Make sure no zombie gateway is left behind
          """
-        #  TODO: what should we purge in this case (if any)?
-        pass
+        # Assert configured
+        assert self.pool
+        assert self.group is not None
+        for daemon_id in self.gws:
+            name = self.gws[daemon_id]
+            self.gws.pop(daemon_id)
+            # Notify monitor about this gateway deletion
+            cmd = {
+                'prefix': 'nvme-gw delete',
+                'id': name,
+                'group': self.group,
+                'pool': self.pool
+            }
+            self.mgr.log.info(f"purge delete gateway: monitor command {cmd}")
+            _, _, err = self.mgr.mon_command(cmd)
+            if err:
+                self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2

index 18786f95bbe8d36f37039d16a7ab2edb87684205..644ca586ba93f418924b1ff71102ecc1e3ca2e9b 100644 (file)
--- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
@@ -1,7 +1,7 @@
  # {{ cephadm_managed }}
  [gateway]
  name = {{ name }}
-group = {{ spec.group if spec.group is not none else '' }}
+group = {{ spec.group }}
  addr = {{ addr }}
  port = {{ spec.port }}
  enable_auth = {{ spec.enable_auth }}
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py

index 4b88cf80442694ea57e964a83b67943a01fbbc77..1664c4de74eca1ba25b8b17414ce20e17eec060c 100644 (file)
--- a/src/python-common/ceph/deployment/service_spec.py
+++ b/src/python-common/ceph/deployment/service_spec.py
@@ -1355,7 +1355,7 @@ class NvmeofServiceSpec(ServiceSpec):
                   max_log_directory_backups: Optional[int] = 10,
                   log_directory: Optional[str] = '/var/log/ceph/',
                   monitor_timeout: Optional[float] = 1.0,
-                 enable_monitor_client: bool = False,
+                 enable_monitor_client: bool = True,
                   placement: Optional[PlacementSpec] = None,
                   unmanaged: bool = False,
                   preview_only: bool = False,
@@ -1381,7 +1381,7 @@ class NvmeofServiceSpec(ServiceSpec):
          #: ``name`` name of the nvmeof gateway
          self.name = name
          #: ``group`` name of the nvmeof gateway
-        self.group = group
+        self.group = group or ''
          #: ``enable_auth`` enables user authentication on nvmeof gateway
          self.enable_auth = enable_auth
          #: ``state_update_notify`` enables automatic update from OMAP in nvmeof gateway
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt

index 2e756eeb583807710f1d9488d482576f791bd1a1..6272b3b1ed6762e9d3fab06f884024ea6eca11e5 100644 (file)
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -1008,3 +1008,11 @@ add_ceph_unittest(unittest_weighted_shuffle)
  add_executable(unittest_intarith test_intarith.cc)
  add_ceph_unittest(unittest_intarith)
  #make check ends here
+
+# test_nvmeof_mon_encoding
+add_executable(test_nvmeof_mon_encoding
+  test_nvmeof_mon_encoding.cc
+  )
+target_link_libraries(test_nvmeof_mon_encoding
+  mon ceph-common global-static
+  )
diff --git a/src/test/test_nvmeof_mon_encoding.cc b/src/test/test_nvmeof_mon_encoding.cc

new file mode 100644 (file)

index 0000000..8cd2381
--- /dev/null
+++ b/src/test/test_nvmeof_mon_encoding.cc
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2024 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "include/ceph_assert.h"
+#include "global/global_init.h"
+#include "mon/NVMeofGwMon.h"
+#include "messages/MNVMeofGwMap.h"
+#include "messages/MNVMeofGwBeacon.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix *_dout
+
+using namespace std;
+
+void test_NVMeofGwMap() {
+  dout(0) << __func__ << "\n\n" << dendl;
+
+  NVMeofGwMap pending_map;
+  std::string pool = "pool1";
+  std::string group = "grp1";
+  auto group_key = std::make_pair(pool, group);
+  pending_map.cfg_add_gw("GW1" ,group_key);
+  pending_map.cfg_add_gw("GW2" ,group_key);
+  pending_map.cfg_add_gw("GW3" ,group_key);
+  NvmeNonceVector new_nonces = {"abc", "def","hij"};
+  pending_map.created_gws[group_key]["GW1"].nonce_map[1] = new_nonces;
+  pending_map.created_gws[group_key]["GW1"].performed_full_startup = true;
+  int i = 0;
+  for (auto & blklst_itr : pending_map.created_gws[group_key]["GW1"].blocklist_data){
+    blklst_itr.second.osd_epoch = 2*(i++);
+    blklst_itr.second.is_failover = false;
+  }
+
+  pending_map.created_gws[group_key]["GW2"].nonce_map[2] = new_nonces;
+  dout(0) << " == Dump map before Encode : == " <<dendl;
+  dout(0) << pending_map << dendl;
+
+  ceph::buffer::list bl;
+  pending_map.encode(bl);
+  auto p = bl.cbegin();
+  pending_map.decode(p);
+  dout(0) << " == Dump map after Decode: == " <<dendl;
+  dout(0) << pending_map << dendl;
+}
+
+void test_MNVMeofGwMap() {
+  dout(0) << __func__ << "\n\n" << dendl;
+  std::map<NvmeGroupKey, NvmeGwMonClientStates> map;
+
+  std::string pool = "pool1";
+  std::string group = "grp1";
+  std::string gw_id = "GW1";
+  NvmeGwClientState state(1, 32, gw_availability_t::GW_UNAVAILABLE);
+  std::string nqn = "nqn";
+  ana_state_t ana_state;
+  NqnState nqn_state(nqn, ana_state);
+  state.subsystems.insert({nqn, nqn_state});
+
+  auto group_key = std::make_pair(pool, group);
+  map[group_key][gw_id] = state;
+
+
+
+  ceph::buffer::list bl;
+  encode(map, bl);
+  dout(0) << "encode: " << map << dendl;
+  decode(map, bl);
+  dout(0) << "decode: " << map << dendl;
+
+  BeaconSubsystem sub = { nqn, {}, {} };
+  NVMeofGwMap pending_map;
+  pending_map.cfg_add_gw("GW1" ,group_key);
+  pending_map.cfg_add_gw("GW2" ,group_key);
+  pending_map.cfg_add_gw("GW3" ,group_key);
+  NvmeNonceVector new_nonces = {"abc", "def","hij"};
+  pending_map.created_gws[group_key]["GW1"].nonce_map[1] = new_nonces;
+  pending_map.created_gws[group_key]["GW1"].subsystems.push_back(sub);
+  int i = 0;
+  for (auto & blklst_itr : pending_map.created_gws[group_key]["GW1"].blocklist_data){
+     blklst_itr.second.osd_epoch = 2*(i++);
+     blklst_itr.second.is_failover = false;
+  }
+
+  pending_map.created_gws[group_key]["GW2"].nonce_map[2] = new_nonces;
+  dout(0) << "False pending map: " << pending_map << dendl;
+
+  auto msg = make_message<MNVMeofGwMap>(pending_map);
+  msg->encode_payload(0);
+  msg->decode_payload();
+  dout(0) << "decode msg: " << *msg << dendl;
+
+  dout(0)   << "\n == Test GW Delete ==" << dendl;
+  pending_map.cfg_delete_gw("GW1" ,group_key);
+  dout(0) << "deleted GW1 " << pending_map << dendl;
+
+  pending_map.cfg_delete_gw("GW1" ,group_key);
+  dout(0) << "duplicated delete of GW1 " << pending_map << dendl;
+
+  pending_map.cfg_delete_gw("GW2" ,group_key);
+  dout(0) << "deleted GW2 " << pending_map << dendl;
+
+  dout(0) << "delete of wrong gw id" << dendl;
+  pending_map.cfg_delete_gw("wow" ,group_key);
+
+  pending_map.cfg_delete_gw("GW3" ,group_key);
+  dout(0) << "deleted GW3 . we should see the empty map " << pending_map << dendl;
+
+
+}
+
+void test_MNVMeofGwBeacon() {
+  std::string gw_id = "GW";
+  std::string gw_pool = "pool";
+  std::string gw_group = "group";
+  gw_availability_t availability = gw_availability_t::GW_AVAILABLE;
+  std::string nqn = "nqn";
+  BeaconSubsystem sub = { nqn, {}, {} };
+  BeaconSubsystems subs = { sub };
+  epoch_t osd_epoch = 17;
+  epoch_t gwmap_epoch = 42;
+
+  auto msg = make_message<MNVMeofGwBeacon>(
+      gw_id,
+      gw_pool,
+      gw_group,
+      subs,
+      availability,
+      osd_epoch,
+      gwmap_epoch);
+  msg->encode_payload(0);
+  msg->decode_payload();
+  dout(0) << "decode msg: " << *msg << dendl;
+  ceph_assert(msg->get_gw_id() == gw_id);
+  ceph_assert(msg->get_gw_pool() == gw_pool);
+  ceph_assert(msg->get_gw_group() == gw_group);
+  ceph_assert(msg->get_availability() == availability);
+  ceph_assert(msg->get_last_osd_epoch() == osd_epoch);
+  ceph_assert(msg->get_last_gwmap_epoch() == gwmap_epoch);
+  const auto& dsubs = msg->get_subsystems();
+  auto it = std::find_if(dsubs.begin(), dsubs.end(),
+                           [&nqn](const auto& element) {
+                               return element.nqn == nqn;
+                           });
+  ceph_assert(it != dsubs.end());
+}
+
+void test_NVMeofGwTimers()
+{
+    NVMeofGwMap pending_map;
+    //pending_map.Gmetadata;
+    const NvmeGroupKey group_key = std::make_pair("a","b");
+    std::string gwid = "GW1";
+    NvmeAnaGrpId  grpid = 2;
+    pending_map.start_timer(gwid, group_key, grpid, 30);
+    auto end_time  = pending_map.fsm_timers[group_key][gwid].data[grpid].end_time;
+    uint64_t  millisecondsSinceEpoch = std::chrono::duration_cast<std::chrono::milliseconds>(end_time.time_since_epoch()).count();
+    dout(0) << "Metadata milliseconds " << millisecondsSinceEpoch << " " << (int)pending_map.fsm_timers[group_key][gwid].data[grpid].timer_value << dendl;
+    ceph::buffer::list bl;
+    pending_map.encode(bl);
+    auto p = bl.cbegin();
+    pending_map.decode(p);
+
+    end_time  = pending_map.fsm_timers[group_key][gwid].data[2].end_time;
+    millisecondsSinceEpoch = std::chrono::duration_cast<std::chrono::milliseconds>(end_time.time_since_epoch()).count();
+    dout(0) << "After encode decode Metadata milliseconds " << millisecondsSinceEpoch << " " <<  (int)pending_map.fsm_timers[group_key][gwid].data[grpid].timer_value<<dendl;
+
+}
+
+int main(int argc, const char **argv)
+{
+  // Init ceph
+  auto args = argv_to_vec(argc, argv);
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY,
+                         CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+  common_init_finish(g_ceph_context);
+
+  // Run tests
+  test_NVMeofGwMap();
+  test_MNVMeofGwMap();
+  test_MNVMeofGwBeacon();
+  test_NVMeofGwTimers();
+}
+
diff --git a/src/tools/ceph-dencoder/CMakeLists.txt b/src/tools/ceph-dencoder/CMakeLists.txt

index 86cab179590c596f44f6fbbc6d86adf0c39d641f..3c8087413cb65cfea17de51ff4560eba9cdc5f1d 100644 (file)
--- a/src/tools/ceph-dencoder/CMakeLists.txt
+++ b/src/tools/ceph-dencoder/CMakeLists.txt
@@ -88,6 +88,16 @@ if(WITH_RBD)
    endif()
  endif()
  
+if(WITH_NVMEOF_GATEWAY_MONITOR_CLIENT)
+  add_denc_mod(denc-mod-nvmeof
+    nvmeof_types.cc)
+  target_link_libraries(denc-mod-nvmeof
+    mon
+    ceph-common
+    )
+endif()
+
+
  if(WITH_CEPHFS)
    add_denc_mod(denc-mod-cephfs
      mds_types.cc)
diff --git a/src/tools/ceph-dencoder/nvmeof_types.cc b/src/tools/ceph-dencoder/nvmeof_types.cc

new file mode 100644 (file)

index 0000000..86b76e4
--- /dev/null
+++ b/src/tools/ceph-dencoder/nvmeof_types.cc
@@ -0,0 +1,36 @@
+#include "acconfig.h"
+#include <cstdint>
+using namespace std;
+#include "include/ceph_features.h"
+
+#define TYPE(t)
+#define TYPE_STRAYDATA(t)
+#define TYPE_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL(t)
+#define TYPE_FEATUREFUL_STRAYDATA(t)
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL_NOCOPY(t)
+#define TYPE_NOCOPY(t)
+#define MESSAGE(t)
+#include "nvmeof_types.h"
+#undef TYPE
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
+#undef TYPE_NOCOPY
+#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
+#undef TYPE_FEATUREFUL_NOCOPY
+#undef MESSAGE
+
+#include "denc_plugin.h"
+
+DENC_API void register_dencoders(DencoderPlugin* plugin)
+{
+#include "nvmeof_types.h"
+}
+
+DENC_API void unregister_dencoders(DencoderPlugin* plugin)
+{
+  plugin->unregister_dencoders();
+}
diff --git a/src/tools/ceph-dencoder/nvmeof_types.h b/src/tools/ceph-dencoder/nvmeof_types.h

new file mode 100644 (file)

index 0000000..96cff73
--- /dev/null
+++ b/src/tools/ceph-dencoder/nvmeof_types.h
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2024 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_NVMEOF_TYPES_H
+#define CEPH_NVMEOF_TYPES_H
+
+#ifdef WITH_NVMEOF_GATEWAY_MONITOR_CLIENT
+#include "mon/NVMeofGwMon.h"
+#include "messages/MNVMeofGwMap.h"
+#include "messages/MNVMeofGwBeacon.h"
+TYPE(NVMeofGwMap)
+// Implement the dencoder interface
+class NVMeofGwMapDencoder {
+ private:
+   NVMeofGwMap m;
+ public:
+  NVMeofGwMapDencoder() = default;
+  explicit NVMeofGwMapDencoder(const NVMeofGwMap& m) : m(m) {}
+
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(t, bl);
+  }
+  void decode(bufferlist::const_iterator &p) {
+    using ceph::decode;
+    decode(t, p);
+  }
+  void dump(Formatter* f) {
+    f->dump_stream("NVMeofGwMap") << m;
+  }
+
+  static void generate_test_instances(std::list<NVMeofGwMapDencoder*>& ls) {
+    std::string pool = "pool1";
+    std::string group = "grp1";
+    auto group_key = std::make_pair(pool, group);
+    m.cfg_add_gw("GW1" ,group_key);
+    m.cfg_add_gw("GW2" ,group_key);
+    m.cfg_add_gw("GW3" ,group_key);
+    NvmeNonceVector new_nonces = {"abc", "def","hij"};
+    m.created_gws[group_key]["GW1"].nonce_map[1] = new_nonces;
+    m.created_gws[group_key]["GW1"].performed_full_startup = true;
+    for(int i=0; i< MAX_SUPPORTED_ANA_GROUPS; i++){
+      m.created_gws[group_key]["GW1"].blocklist_data[i].osd_epoch = i*2;
+      m.created_gws[group_key]["GW1"].blocklist_data[i].is_failover = false;
+    }
+
+    m.created_gws[group_key]["GW2"].nonce_map[2] = new_nonces;
+
+    ls.push_back(new NVMeofGwMapDencoder(m));
+
+  }
+};
+WRITE_CLASS_ENCODER(NVMeofGwMapDencoder)
+
+TYPE(MNVMeofGwMap)
+// Implement the dencoder interface
+class MNVMeofGwMapDencoder {
+ private:
+   MNVMeofGwMap m;
+ public:
+  MNVMeofGwMapDencoder() = default;
+  explicit MNVMeofGwMapDencoder(const MNVMeofGwMap& m) : m(m) {}
+
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(t, bl);
+  }
+  void decode(bufferlist::const_iterator &p) {
+    using ceph::decode;
+    decode(t, p);
+  }
+  void dump(Formatter* f) {
+    f->dump_stream("MNVMeofGwMap") << m;
+  }
+
+  static void generate_test_instances(std::list<MNVMeofGwMapDencoder*>& ls) {
+    std::map<NvmeGroupKey, NvmeGwMonClientStates> map;
+    std::string pool = "pool1";
+    std::string group = "grp1";
+    std::string gw_id = "GW1";
+    NvmeGwClientState state(1, 32, gw_availability_t::GW_UNAVAILABLE);
+    std::string nqn = "nqn";
+    ANA_STATE ana_state;
+    NqnState nqn_state(nqn, ana_state);
+    state.subsystems.insert({nqn, nqn_state});
+
+    auto group_key = std::make_pair(pool, group);
+    map[group_key][gw_id] = state;
+    BeaconSubsystem sub = { nqn, {}, {} };
+    NVMeofGwMap pending_map;
+    pending_map.cfg_add_gw("GW1" ,group_key);
+    pending_map.cfg_add_gw("GW2" ,group_key);
+    pending_map.cfg_add_gw("GW3" ,group_key);
+    NvmeNonceVector new_nonces = {"abc", "def","hij"};
+    pending_map.created_gws[group_key]["GW1"].nonce_map[1] = new_nonces;
+    pending_map.created_gws[group_key]["GW1"].subsystems.push_back(sub);
+    for(int i=0; i< MAX_SUPPORTED_ANA_GROUPS; i++){
+      pending_map.created_gws[group_key]["GW1"].blocklist_data[i].osd_epoch = i*2;
+      pending_map.created_gws[group_key]["GW1"].blocklist_data[i].is_failover = false;
+    }
+
+    pending_map.created_gws[group_key]["GW2"].nonce_map[2] = new_nonces;
+    pending_map.start_timer(gw_id, group_key, group, 30);
+
+    m = MNVMeofGwMap(pending_map);
+    ls.push_back(new MNVMeofGwMapDencoder(m));
+
+  }
+};
+WRITE_CLASS_ENCODER(MNVMeofGwMapDencoder)
+
+TYPE(MNVMeofGwBeacon)
+// Implement the dencoder interface
+class MNVMeofGwBeaconDencoder {
+ private:
+   MNVMeofGwBeacon m;
+ public:
+  MNVMeofGwBeaconDencoder() = default;
+  explicit MNVMeofGwBeaconDencoder(const MNVMeofGwBeacon& m) : m(m) {}
+
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(t, bl);
+  }
+  void decode(bufferlist::const_iterator &p) {
+    using ceph::decode;
+    decode(t, p);
+  }
+  void dump(Formatter* f) {
+    f->dump_stream("MNVMeofGwBeacon") << m;
+  }
+
+  static void generate_test_instances(std::list<MNVMeofGwBeaconDencoder*>& ls) {
+    std::string gw_id = "GW";
+    std::string gw_pool = "pool";
+    std::string gw_group = "group";
+    gw_availability_t availability = gw_availability_t::GW_AVAILABLE;
+    std::string nqn = "nqn";
+    BeaconSubsystem sub = { nqn, {}, {} };
+    std::string nqn = "nqn";
+    BeaconSubsystem sub = { nqn, {}, {} };
+    BeaconSubsystems subs = { sub };
+    epoch_t osd_epoch = 17;
+    epoch_t gwmap_epoch = 42;
+    m = MNVMeofGwBeacon(
+      gw_id,
+      gw_pool,
+      gw_group,
+      subs,
+      availability,
+      osd_epoch,
+      gwmap_epoch);
+
+    ls.push_back(new MNVMeofGwBeaconDencoder(m));
+
+  }
+};
+WRITE_CLASS_ENCODER(MNVMeofGwBeaconDencoder)
+
+
+#endif // WITH_NVMEOF_GATEWAY_MONITOR_CLIENT
+
+#endif // CEPH_NVMEOF_TYPES_H
author	Leonid Chernin <lechernin@gmail.com>
	Tue, 17 Oct 2023 13:25:07 +0000 (13:25 +0000)
committer	Alexander Indenbaum <aindenba@redhat.com>
	Wed, 31 Jul 2024 08:50:10 +0000 (08:50 +0000)
.gitmodules		patch \| blob \| history
PendingReleaseNotes		patch \| blob \| history
ceph.spec.in		patch \| blob \| history
src/CMakeLists.txt		patch \| blob \| history
src/ceph_nvmeof_monitor_client.cc	[new file with mode: 0644]	patch \| blob
src/common/options/global.yaml.in		patch \| blob \| history
src/common/options/mon.yaml.in		patch \| blob \| history
src/messages/MNVMeofGwBeacon.h	[new file with mode: 0644]	patch \| blob
src/messages/MNVMeofGwMap.h	[new file with mode: 0644]	patch \| blob
src/mon/CMakeLists.txt		patch \| blob \| history
src/mon/MonCommands.h		patch \| blob \| history
src/mon/Monitor.cc		patch \| blob \| history
src/mon/Monitor.h		patch \| blob \| history
src/mon/NVMeofGwMap.cc	[new file with mode: 0755]	patch \| blob
src/mon/NVMeofGwMap.h	[new file with mode: 0755]	patch \| blob
src/mon/NVMeofGwMon.cc	[new file with mode: 0644]	patch \| blob
src/mon/NVMeofGwMon.h	[new file with mode: 0644]	patch \| blob
src/mon/NVMeofGwSerialize.h	[new file with mode: 0755]	patch \| blob
src/mon/NVMeofGwTypes.h	[new file with mode: 0755]	patch \| blob
src/mon/mon_types.h		patch \| blob \| history
src/msg/Message.cc		patch \| blob \| history
src/msg/Message.h		patch \| blob \| history
src/nvmeof/NVMeofGwClient.cc	[new file with mode: 0644]	patch \| blob
src/nvmeof/NVMeofGwClient.h	[new file with mode: 0644]	patch \| blob
src/nvmeof/NVMeofGwMonitorClient.cc	[new file with mode: 0644]	patch \| blob
src/nvmeof/NVMeofGwMonitorClient.h	[new file with mode: 0644]	patch \| blob
src/nvmeof/NVMeofGwMonitorGroupClient.cc	[new file with mode: 0644]	patch \| blob
src/nvmeof/NVMeofGwMonitorGroupClient.h	[new file with mode: 0644]	patch \| blob
src/nvmeof/gateway	[new submodule]	patch \| blob
src/pybind/mgr/cephadm/services/nvmeof.py		patch \| blob \| history
src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2		patch \| blob \| history
src/python-common/ceph/deployment/service_spec.py		patch \| blob \| history
src/test/CMakeLists.txt		patch \| blob \| history
src/test/test_nvmeof_mon_encoding.cc	[new file with mode: 0644]	patch \| blob
src/tools/ceph-dencoder/CMakeLists.txt		patch \| blob \| history
src/tools/ceph-dencoder/nvmeof_types.cc	[new file with mode: 0644]	patch \| blob
src/tools/ceph-dencoder/nvmeof_types.h	[new file with mode: 0644]	patch \| blob