]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mon: add NVMe-oF gateway monitor and HA
authorLeonid Chernin <lechernin@gmail.com>
Tue, 17 Oct 2023 13:25:07 +0000 (13:25 +0000)
committerAlexander Indenbaum <aindenba@redhat.com>
Thu, 20 Nov 2025 08:55:23 +0000 (10:55 +0200)
- gateway submodule

Fixes: https://tracker.ceph.com/issues/64777
This PR adds high availability support for the nvmeof Ceph service. High availability means that even in the case that a certain GW is down, there will be another available path for the initiator to be able to continue the IO through another GW. High availability is achieved by running nvmeof service consisting of at least 2 nvmeof GWs in the Ceph cluster. Every GW will be seen by the host (initiator) as a separate path to the nvme namespaces (volumes).

The implementation consists of the following main modules:

- NVMeofGWMon - a PaxosService. It is a monitor that tracks the status of the running nvmeof services, and takes actions in case services fail and in case services are restored.
- NVMeofGwMonitorClient – It is an agent that runs as part of each nvmeof GW. It sends beacons to the monitor to signal that the GW is alive. As part of the beacon, the client also sends information about the service. This information is used by the monitor to make decisions and perform some operations.
- MNVMeofGwBeacon – It is a structure used by the client and the monitor to send/recv the beacons.
- MNVMeofGwMap – The map tracks the status of the nvmeof GWs. It also defines what the new role of every GW should be. So in the event that GWs go down or GWs are restored, the map will reflect the new role of each GW resulting from these events. The map is distributed to the NVMeofGwMonitorClient on each GW, which knows to update the GW with the required changes.

It is also adding 3 new mon commands:
- nvme-gw create
- nvme-gw delete
- nvme-gw show

The commands are used by cephadm to notify the monitor that a new
GW is deployed. The monitor will update the map accordingly and will
start tracking this GW until it is deleted.

Resolves rhbz#2272661

Signed-off-by: Leonid Chernin <lechernin@gmail.com>
Signed-off-by: Alexander Indenbaum <aindenba@redhat.com>
(cherry picked from commit bf9505fb569e9b95a78f9700ed8c4bd20508ef55)
additional commits:
(cherry picked from commit 4f543c150503fc9ee3c4d840b6eb5ed43e15905b)
(cherry picked from commit 7af4d96530531b1458e5d02bd9f706bb8c30c2ea)
(cherry picked from commit 249951b5545d857861562ef7a90f876cb8f3d778)
(cherry picked from commit f704d0c84257701fcbf8891dd5f285c890343219)

35 files changed:
PendingReleaseNotes
ceph.spec.in
doc/nvmeof/beacon.puml [new file with mode: 0644]
doc/nvmeof/gateway-state.puml [new file with mode: 0644]
install-deps.sh
src/CMakeLists.txt
src/ceph_nvmeof_monitor_client.cc [new file with mode: 0644]
src/common/options/mon.yaml.in
src/messages/MNVMeofGwBeacon.h [new file with mode: 0644]
src/messages/MNVMeofGwMap.h [new file with mode: 0644]
src/mon/CMakeLists.txt
src/mon/MonCommands.h
src/mon/Monitor.cc
src/mon/Monitor.h
src/mon/NVMeofGwMap.cc [new file with mode: 0755]
src/mon/NVMeofGwMap.h [new file with mode: 0755]
src/mon/NVMeofGwMon.cc [new file with mode: 0644]
src/mon/NVMeofGwMon.h [new file with mode: 0644]
src/mon/NVMeofGwSerialize.h [new file with mode: 0755]
src/mon/NVMeofGwTypes.h [new file with mode: 0755]
src/mon/mon_types.h
src/msg/Message.cc
src/msg/Message.h
src/nvmeof/NVMeofGwClient.cc [new file with mode: 0644]
src/nvmeof/NVMeofGwClient.h [new file with mode: 0644]
src/nvmeof/NVMeofGwMonitorClient.cc [new file with mode: 0644]
src/nvmeof/NVMeofGwMonitorClient.h [new file with mode: 0644]
src/nvmeof/NVMeofGwMonitorGroupClient.cc [new file with mode: 0644]
src/nvmeof/NVMeofGwMonitorGroupClient.h [new file with mode: 0644]
src/nvmeof/gateway/control/proto/gateway.proto [new file with mode: 0644]
src/nvmeof/gateway/control/proto/monitor.proto [new file with mode: 0644]
src/pybind/mgr/cephadm/services/nvmeof.py
src/python-common/ceph/deployment/service_spec.py
src/test/CMakeLists.txt
src/test/test_nvmeof_mon_encoding.cc [new file with mode: 0644]

index 20b090562f828cde7edc7441ab9f3090fb1d379b..9efc063fb37616b45e1c383b92fe0041fff1dca9 100644 (file)
@@ -571,3 +571,11 @@ Relevant tracker: https://tracker.ceph.com/issues/57090
 set using the `fs set` command. This flag prevents using a standby for another
 file system (join_fs = X) when standby for the current filesystem is not available.
 Relevant tracker: https://tracker.ceph.com/issues/61599
+* mon: add NVMe-oF gateway monitor and HA
+  This PR adds high availability support for the nvmeof Ceph service. High availability
+means that even in the case that a certain GW is down, there will be another available
+path for the initiator to be able to continue the IO through another GW.
+It is also adding 2 new mon commands, to notify monitor about the gateway creation/deletion:
+  - nvme-gw create
+  - nvme-gw delete
+Relevant tracker: https://tracker.ceph.com/issues/64777
index 3a7538fb5e31871e8d4b0e5b2880812a95ea10b9..6cdd0a7f1ff606c1d7b31cae3588bcf107998434 100644 (file)
@@ -212,6 +212,7 @@ BuildRequires:      gperf
 BuildRequires:  cmake > 3.5
 BuildRequires: fuse-devel
 BuildRequires: git
+BuildRequires: grpc-devel
 %if 0%{?fedora} || 0%{?suse_version} > 1500 || 0%{?rhel} == 9 || 0%{?openEuler}
 BuildRequires: gcc-c++ >= 11
 %endif
@@ -586,6 +587,17 @@ system. One or more instances of ceph-mon form a Paxos part-time
 parliament cluster that provides extremely reliable and durable storage
 of cluster membership, configuration, and state.
 
+%package mon-client-nvmeof
+Summary:       Ceph NVMeoF Gateway Monitor Client
+%if 0%{?suse_version}
+Group:         System/Filesystems
+%endif
+Provides:      ceph-test:/usr/bin/ceph-nvmeof-monitor-client
+Requires:      librados2 = %{_epoch_prefix}%{version}-%{release}
+%description mon-client-nvmeof
+Ceph NVMeoF Gateway Monitor Client distributes Paxos ANA info
+to NVMeoF Gateway and provides beacons to the monitor daemon
+
 %package mgr
 Summary:        Ceph Manager Daemon
 %if 0%{?suse_version}
@@ -1993,6 +2005,9 @@ if [ $1 -ge 1 ] ; then
   fi
 fi
 
+%files mon-client-nvmeof
+%{_bindir}/ceph-nvmeof-monitor-client
+
 %files fuse
 %{_bindir}/ceph-fuse
 %{_mandir}/man8/ceph-fuse.8*
diff --git a/doc/nvmeof/beacon.puml b/doc/nvmeof/beacon.puml
new file mode 100644 (file)
index 0000000..c4279b7
--- /dev/null
@@ -0,0 +1,63 @@
+@startuml
+
+package "MNVMeofGwBeacon" #DDDDDD {
+
+    class Beacon {
+        gw_id: String
+        gw_pool: String
+        gw_group: String
+        availability: Availability
+        subsystems: Subsystem[*]
+    }
+
+    note top
+        NVMeoF gateways regularly transmit
+        a Beacon message to the monitor,
+        conveying their current state.
+    end note
+
+    class Subsystem {
+        nqn: String
+        listeners: Listener[*]
+        namespaces: Namespace[*]
+    }
+
+    class Listener {
+        address: String
+        family: String
+        service: Decimal
+        
+    }
+
+    class Namespace {
+        nonce: String
+        ana_grp_id: Decimal
+    }
+
+    note right of Namespace::nonce
+        <b>RADOS</b> connection
+    end note
+
+    enum Availability {
+        CREATED
+        AVAILABLE
+        INACCESSIBLE
+    }
+
+    note right of Availability::CREATED
+        The initial state, 
+        until a <b>MNVMeofGwMap</b>
+        that contains the entry for 
+        this gateway
+    end note
+
+    Beacon::availability --> "1" Availability
+    Beacon::subsystems "1" *-- "n" Subsystem : contains
+
+    Subsystem::listeners "1" *-- "n" Listener : contains
+    Subsystem::namespaces "1" *-- "n" Namespace : contains
+}
+@enduml
+
+
+
diff --git a/doc/nvmeof/gateway-state.puml b/doc/nvmeof/gateway-state.puml
new file mode 100644 (file)
index 0000000..dbfb614
--- /dev/null
@@ -0,0 +1,41 @@
+@startuml
+
+package "MNVMeofGwMap" #DDDDDD {
+
+    class GatewayState {
+        group_id: Decimal
+        ana_info: SubsystemAnaInfo[*]
+    }
+
+    note top
+        The data structure distributed by the Monitors to NVMEoF gateways includes
+        a mapping of the <b>GatewayState</b> objects.
+
+        NVMeoF gateways are segmented into groups identified
+        by <b>(pool, group)</b> pair, with each gateway uniquely
+        identified by a gateway <b>ID</b>.
+    end note
+
+    note right of GatewayState::group_id
+        Identification of this gateway within its group
+    end note
+
+    class SubsystemAnaInfo {
+        nqn: String
+        ana_state: AnaState[0..n]
+    }
+
+    note right of SubsystemAnaInfo
+        Denotes the condition of the ANA groups
+        of the subsystem identified by <b>nqn</b>.
+    end note
+
+    enum AnaState {
+        OPTIMIZED
+        INACCESSIBLE
+    }
+
+    GatewayState "1" *-- "many" SubsystemAnaInfo : contains
+    SubsystemAnaInfo::ana_state "1" *-- "n" AnaState : contains
+}
+@enduml
index c0117175711894878809a6868b002bb5508bad47..bb452ff46786ab7ad7f77f3f60b3ebbdfd883d4c 100755 (executable)
@@ -572,6 +572,11 @@ else
                 $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
                 $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
                 if test $ID = centos -a $MAJOR_VERSION = 8 ; then
+                    # for grpc-devel
+                    # See https://copr.fedorainfracloud.org/coprs/ceph/grpc/
+                    # epel is enabled for all major versions couple of lines above
+                    $SUDO dnf copr enable -y ceph/grpc
+
                     # Enable 'powertools' or 'PowerTools' repo
                     $SUDO dnf config-manager --set-enabled $(dnf repolist --all 2>/dev/null|gawk 'tolower($0) ~ /^powertools\s/{print $1}')
                     dts_ver=11
index f91dd1abe7fcb97f289bab547378c2b5cb2dd626..484a38484ffb2a522eefeb128abc97ac8b0cfbe6 100644 (file)
@@ -860,6 +860,112 @@ if(WITH_FUSE)
   install(PROGRAMS mount.fuse.ceph DESTINATION ${CMAKE_INSTALL_SBINDIR})
 endif(WITH_FUSE)
 
+# NVMEOF GATEWAY MONITOR CLIENT
+# Supported on RPM-based platforms only, depends on grpc devel libraries/tools
+if(EXISTS "/etc/redhat-release" OR EXISTS "/etc/fedora-release")
+  option(WITH_NVMEOF_GATEWAY_MONITOR_CLIENT "build nvmeof gateway monitor client" ON)
+else()
+  option(WITH_NVMEOF_GATEWAY_MONITOR_CLIENT "build nvmeof gateway monitor client" OFF)
+endif()
+
+if(WITH_NVMEOF_GATEWAY_MONITOR_CLIENT)
+
+  # Find Protobuf installation
+  # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation.
+  option(protobuf_MODULE_COMPATIBLE TRUE)
+  find_package(Protobuf REQUIRED)
+
+  set(_REFLECTION grpc++_reflection)
+  if(CMAKE_CROSSCOMPILING)
+    find_program(_PROTOBUF_PROTOC protoc)
+  else()
+    set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)
+  endif()
+
+  # Find gRPC installation
+  # Looks for gRPCConfig.cmake file installed by gRPC's cmake installation.
+  find_package(gRPC CONFIG REQUIRED)
+  message(STATUS "Using gRPC ${gRPC_VERSION}")
+  set(_GRPC_GRPCPP gRPC::grpc++)
+  if(CMAKE_CROSSCOMPILING)
+    find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+  else()
+    set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>)
+  endif()
+
+  # Gateway Proto file
+  get_filename_component(nvmeof_gateway_proto "nvmeof/gateway/control/proto/gateway.proto" ABSOLUTE)
+  get_filename_component(nvmeof_gateway_proto_path "${nvmeof_gateway_proto}" PATH)
+
+  # Generated sources
+  set(nvmeof_gateway_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/gateway.pb.cc")
+  set(nvmeof_gateway_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/gateway.pb.h")
+  set(nvmeof_gateway_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/gateway.grpc.pb.cc")
+  set(nvmeof_gateway_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/gateway.grpc.pb.h")
+
+  add_custom_command(
+        OUTPUT "${nvmeof_gateway_proto_srcs}" "${nvmeof_gateway_proto_hdrs}" "${nvmeof_gateway_grpc_srcs}" "${nvmeof_gateway_grpc_hdrs}"
+        COMMAND ${_PROTOBUF_PROTOC}
+        ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+          --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+          -I "${nvmeof_gateway_proto_path}"
+          --experimental_allow_proto3_optional
+          --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+          "${nvmeof_gateway_proto}"
+        DEPENDS "${nvmeof_gateway_proto}")
+
+
+  # Monitor Proto file
+  get_filename_component(nvmeof_monitor_proto "nvmeof/gateway/control/proto/monitor.proto" ABSOLUTE)
+  get_filename_component(nvmeof_monitor_proto_path "${nvmeof_monitor_proto}" PATH)
+
+  # Generated sources
+  set(nvmeof_monitor_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/monitor.pb.cc")
+  set(nvmeof_monitor_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/monitor.pb.h")
+  set(nvmeof_monitor_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/monitor.grpc.pb.cc")
+  set(nvmeof_monitor_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/monitor.grpc.pb.h")
+
+  add_custom_command(
+        OUTPUT "${nvmeof_monitor_proto_srcs}" "${nvmeof_monitor_proto_hdrs}" "${nvmeof_monitor_grpc_srcs}" "${nvmeof_monitor_grpc_hdrs}"
+        COMMAND ${_PROTOBUF_PROTOC}
+        ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+          --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+          -I "${nvmeof_monitor_proto_path}"
+          --experimental_allow_proto3_optional
+          --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+          "${nvmeof_monitor_proto}"
+        DEPENDS "${nvmeof_monitor_proto}")
+
+  # Include generated *.pb.h files
+  include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+
+  set(ceph_nvmeof_monitor_client_srcs
+    ${nvmeof_gateway_proto_srcs}
+    ${nvmeof_gateway_proto_hdrs}
+    ${nvmeof_gateway_grpc_srcs}
+    ${nvmeof_gateway_grpc_hdrs}
+    ${nvmeof_monitor_proto_srcs}
+    ${nvmeof_monitor_proto_hdrs}
+    ${nvmeof_monitor_grpc_srcs}
+    ${nvmeof_monitor_grpc_hdrs}
+    ceph_nvmeof_monitor_client.cc
+    nvmeof/NVMeofGwClient.cc
+    nvmeof/NVMeofGwMonitorGroupClient.cc
+    nvmeof/NVMeofGwMonitorClient.cc)
+  add_executable(ceph-nvmeof-monitor-client ${ceph_nvmeof_monitor_client_srcs})
+  add_dependencies(ceph-nvmeof-monitor-client ceph-common)
+  target_link_libraries(ceph-nvmeof-monitor-client
+    client
+    mon
+    global-static
+    ceph-common
+    ${_REFLECTION}
+    ${_GRPC_GRPCPP}
+    )
+  install(TARGETS ceph-nvmeof-monitor-client DESTINATION bin)
+endif()
+# END OF NVMEOF GATEWAY MONITOR CLIENT
+
 if(WITH_DOKAN)
   add_subdirectory(dokan)
 endif(WITH_DOKAN)
diff --git a/src/ceph_nvmeof_monitor_client.cc b/src/ceph_nvmeof_monitor_client.cc
new file mode 100644 (file)
index 0000000..0545799
--- /dev/null
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM Inc
+ *
+ * Author: Alexander Indenbaum <aindenba@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <pthread.h>
+
+#include "include/types.h"
+#include "include/compat.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "common/pick_address.h"
+#include "global/global_init.h"
+
+#include "nvmeof/NVMeofGwMonitorClient.h"
+
+static void usage()
+{
+  std::cout << "usage: ceph-nvmeof-monitor-client\n"
+               "        --gateway-name <GW_NAME>\n"
+               "        --gateway-address <GW_ADDRESS>\n"
+               "        --gateway-pool <CEPH_POOL>\n"
+               "        --gateway-group <GW_GROUP>\n"
+               "        --monitor-group-address <MONITOR_GROUP_ADDRESS>\n"
+               "        [flags]\n"
+           << std::endl;
+  generic_server_usage();
+}
+
+/**
+ * A short main() which just instantiates a Nvme and
+ * hands over control to that.
+ */
+int main(int argc, const char **argv)
+{
+  ceph_pthread_setname(pthread_self(), "ceph-nvmeof-monitor-client");
+
+  auto args = argv_to_vec(argc, argv);
+  if (args.empty()) {
+    std::cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    usage();
+    exit(0);
+  }
+
+  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, // maybe later use CODE_ENVIRONMENT_DAEMON,
+                        CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
+  pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
+
+  global_init_daemonize(g_ceph_context);
+  global_init_chdir(g_ceph_context);
+  common_init_finish(g_ceph_context);
+
+  NVMeofGwMonitorClient gw_monitor_client(argc, argv);
+  int rc = gw_monitor_client.init();
+  if (rc != 0) {
+      std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+      return rc;
+  }
+
+  return gw_monitor_client.main(args);
+}
+
index 8300e31d022c7c3b8f766f8574455c8019ae80f2..7ff915903eb562e87cf83d1bdd5c959f47bb8414 100644 (file)
@@ -72,6 +72,14 @@ options:
   default: 30
   services:
   - mon
+- name: mon_nvmeofgw_beacon_grace
+  type: secs
+  level: advanced
+  desc: Period in seconds from last beacon to monitor marking a  NVMeoF gateway as
+    failed
+  default: 10
+  services:
+  - mon
 - name: mon_mgr_inactive_grace
   type: int
   level: advanced
@@ -1337,3 +1345,10 @@ options:
   with_legacy: true
   see_also:
   - osd_heartbeat_use_min_delay_socket
+- name: nvmeof_mon_client_tick_period
+  type: secs
+  level: advanced
+  desc: Period in seconds of nvmeof gateway beacon messages to monitor
+  default: 2
+  services:
+  - mon
diff --git a/src/messages/MNVMeofGwBeacon.h b/src/messages/MNVMeofGwBeacon.h
new file mode 100644 (file)
index 0000000..202d77a
--- /dev/null
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_NVMEOFGWBEACON_H
+#define CEPH_NVMEOFGWBEACON_H
+
+#include <cstddef>
+#include <vector>
+#include "messages/PaxosServiceMessage.h"
+#include "mon/MonCommand.h"
+#include "mon/NVMeofGwMap.h"
+#include "include/types.h"
+
+class MNVMeofGwBeacon final : public PaxosServiceMessage {
+private:
+  static constexpr int HEAD_VERSION = 1;
+  static constexpr int COMPAT_VERSION = 1;
+
+protected:
+    std::string       gw_id;
+    std::string       gw_pool;
+    std::string       gw_group;
+    BeaconSubsystems  subsystems;                           // gateway susbsystem and their state machine states
+    GW_AVAILABILITY_E availability;                         // in absence of  beacon  heartbeat messages it becomes inavailable
+    epoch_t           last_osd_epoch;
+    epoch_t           last_gwmap_epoch;
+
+public:
+  MNVMeofGwBeacon()
+    : PaxosServiceMessage{MSG_MNVMEOF_GW_BEACON, 0, HEAD_VERSION, COMPAT_VERSION}
+  {
+    set_priority(CEPH_MSG_PRIO_HIGH);
+  }
+
+  MNVMeofGwBeacon(const std::string &gw_id_,
+        const std::string& gw_pool_,
+        const std::string& gw_group_,
+        const BeaconSubsystems& subsystems_,
+        const GW_AVAILABILITY_E& availability_,
+        const epoch_t& last_osd_epoch_,
+        const epoch_t& last_gwmap_epoch_
+  )
+    : PaxosServiceMessage{MSG_MNVMEOF_GW_BEACON, 0, HEAD_VERSION, COMPAT_VERSION},
+      gw_id(gw_id_), gw_pool(gw_pool_), gw_group(gw_group_), subsystems(subsystems_),
+      availability(availability_), last_osd_epoch(last_osd_epoch_), last_gwmap_epoch(last_gwmap_epoch_)
+  {
+    set_priority(CEPH_MSG_PRIO_HIGH);
+  }
+
+  const std::string& get_gw_id() const { return gw_id; }
+  const std::string& get_gw_pool() const { return gw_pool; }
+  const std::string& get_gw_group() const { return gw_group; }
+  NvmeAnaNonceMap get_nonce_map() const {
+    NvmeAnaNonceMap nonce_map;
+    for (const auto& sub: subsystems) {
+      for (const auto& ns: sub.namespaces) {
+        auto& nonce_vec = nonce_map[ns.anagrpid-1];//Converting   ana groups to offsets
+        if (std::find(nonce_vec.begin(), nonce_vec.end(), ns.nonce) == nonce_vec.end())
+          nonce_vec.push_back(ns.nonce);
+      }
+    }
+    return nonce_map;
+  }
+
+  const GW_AVAILABILITY_E& get_availability()   const   { return availability; }
+  const epoch_t&           get_last_osd_epoch() const   { return last_osd_epoch; }
+  const epoch_t&           get_last_gwmap_epoch() const { return last_gwmap_epoch; }
+  const BeaconSubsystems&  get_subsystems()     const   { return subsystems; };
+
+private:
+  ~MNVMeofGwBeacon() final {}
+
+public:
+
+  std::string_view get_type_name() const override { return "nvmeofgwbeacon"; }
+
+  void encode_payload(uint64_t features) override {
+    using ceph::encode;
+    paxos_encode();
+    encode(gw_id, payload);
+    encode(gw_pool, payload);
+    encode(gw_group, payload);
+    encode((uint32_t)subsystems.size(), payload);
+    for (const auto& st: subsystems) {
+      encode(st, payload);
+    }
+    encode((uint32_t)availability, payload);
+    encode(last_osd_epoch, payload);
+    encode(last_gwmap_epoch, payload);
+  }
+
+  void decode_payload() override {
+    using ceph::decode;
+    auto p = payload.cbegin();
+    
+    paxos_decode(p);
+    decode(gw_id, p);
+    decode(gw_pool, p);
+    decode(gw_group, p);
+    uint32_t n;
+    decode(n, p);
+    subsystems.clear();
+    for (uint32_t i = 0; i < n; i++) {
+      BeaconSubsystem sub;
+      decode(sub, p);
+      subsystems.push_back(sub);
+    }
+    uint32_t tmp;
+    decode(tmp, p);
+    availability = static_cast<GW_AVAILABILITY_E>(tmp);
+    decode(last_osd_epoch, p);
+    decode(last_gwmap_epoch, p);
+  }
+
+private:
+  template<class T, typename... Args>
+  friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args);
+};
+
+
+#endif
diff --git a/src/messages/MNVMeofGwMap.h b/src/messages/MNVMeofGwMap.h
new file mode 100644 (file)
index 0000000..add2554
--- /dev/null
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MNVMEOFGWMAP_H
+#define CEPH_MNVMEOFGWMAP_H
+
+#include "msg/Message.h"
+#include "mon/NVMeofGwMap.h"
+
+class MNVMeofGwMap final : public Message {
+protected:
+  std::map<NvmeGroupKey, NvmeGwMap> map;
+  epoch_t                           gwmap_epoch;
+
+public:
+  const std::map<NvmeGroupKey, NvmeGwMap>& get_map() {return map;}
+  const epoch_t& get_gwmap_epoch() {return gwmap_epoch;}
+
+private:
+  MNVMeofGwMap() :
+    Message{MSG_MNVMEOF_GW_MAP} {}
+  MNVMeofGwMap(const NVMeofGwMap &map_) :
+    Message{MSG_MNVMEOF_GW_MAP}, gwmap_epoch(map_.epoch)
+  {
+    map_.to_gmap(map);
+  }
+  ~MNVMeofGwMap() final {}
+
+public:
+  std::string_view get_type_name() const override { return "nvmeofgwmap"; }
+
+  void decode_payload() override {
+    auto p = payload.cbegin();
+    decode(gwmap_epoch, p);
+    decode(map, p);
+  }
+  void encode_payload(uint64_t features) override {
+    using ceph::encode;
+    encode(gwmap_epoch, payload);
+    encode(map, payload);
+  }
+private:
+  using RefCountedObject::put;
+  using RefCountedObject::get;
+  template<class T, typename... Args>
+  friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args);
+  template<class T, typename... Args>
+  friend MURef<T> crimson::make_message(Args&&... args);
+};
+
+#endif
index 784b4c3ee0b34c274930b83117f6b7b4cb9d46bb..35e27d35e85b2120447e743565be99fec09a7fff 100644 (file)
@@ -21,6 +21,8 @@ set(lib_mon_srcs
   ConnectionTracker.cc
   HealthMonitor.cc
   KVMonitor.cc
+  NVMeofGwMon.cc
+  NVMeofGwMap.cc
   ../mds/MDSAuthCaps.cc
   ../mgr/mgr_commands.cc
   ../osd/OSDCap.cc
index 8dafbd397afd4a968bbd8635d6e8cd400b111fe1..3f3e5ba9a479bd7ce226a60f4094a307b33709fa 100644 (file)
@@ -1394,8 +1394,25 @@ COMMAND("config generate-minimal-conf",
        "Generate a minimal ceph.conf file",
        "config", "r")
 
+/* NVMeofGwMon*/
+COMMAND("nvme-gw create"
+    " name=id,type=CephString"
+    " name=pool,type=CephString"
+    " name=group,type=CephString",
+    "create nvmeof gateway id for (pool, group)",
+    "mgr", "rw")
+COMMAND("nvme-gw delete"
+    " name=id,type=CephString"
+    " name=pool,type=CephString"
+    " name=group,type=CephString",
+    "delete nvmeof gateway id for (pool, group)",
+    "mgr", "rw")
 
-
+COMMAND("nvme-gw show"
+   " name=pool,type=CephString"
+   " name=group,type=CephString",
+   " show nvmeof gateways within (pool, group)",
+   "mgr", "rw")
 
 // these are tell commands that were implemented as CLI commands in
 // the broken pre-octopus way that we want to allow to work when a
index 8d759285d1fcab61353a30485afbe6f5a6818ab0..1d96397b9e167211f16cd6ecaec05f7425b1ea2e 100644 (file)
@@ -84,6 +84,7 @@
 #include "MgrStatMonitor.h"
 #include "ConfigMonitor.h"
 #include "KVMonitor.h"
+#include "NVMeofGwMon.h"
 #include "mon/HealthMonitor.h"
 #include "common/config.h"
 #include "common/cmdparse.h"
@@ -247,6 +248,7 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
   paxos_service[PAXOS_HEALTH].reset(new HealthMonitor(*this, *paxos, "health"));
   paxos_service[PAXOS_CONFIG].reset(new ConfigMonitor(*this, *paxos, "config"));
   paxos_service[PAXOS_KV].reset(new KVMonitor(*this, *paxos, "kv"));
+  paxos_service[PAXOS_NVMEGW].reset(new NVMeofGwMon(*this, *paxos, "nvmeofgw"));
 
   bool r = mon_caps.parse("allow *", NULL);
   ceph_assert(r);
@@ -3632,7 +3634,11 @@ void Monitor::handle_command(MonOpRequestRef op)
     mgrmon()->dispatch(op);
     return;
   }
-
+  if (module == "nvme-gw"){
+      nvmegwmon()->dispatch(op);
+      dout(10) << " Dispatching module " << module << " to NVMeofGwMon"  << dendl;
+      return;
+  }
   if (prefix == "fsid") {
     if (f) {
       f->open_object_section("fsid");
@@ -4461,6 +4467,7 @@ void Monitor::_ms_dispatch(Message *m)
   }
 
   MonOpRequestRef op = op_tracker.create_request<MonOpRequest>(m);
+  dout(10) << "Received message: " << op->get_req()->get_type() << dendl;
   bool src_is_mon = op->is_src_mon();
   op->mark_event("mon:_ms_dispatch");
   MonSession *s = op->get_session();
@@ -4566,6 +4573,9 @@ void Monitor::_ms_dispatch(Message *m)
 void Monitor::dispatch_op(MonOpRequestRef op)
 {
   op->mark_event("mon:dispatch_op");
+
+  dout(10) << "Received message: " << op->get_req()->get_type() << dendl;
+
   MonSession *s = op->get_session();
   ceph_assert(s);
   if (s->closed) {
@@ -4679,6 +4689,11 @@ void Monitor::dispatch_op(MonOpRequestRef op)
       paxos_service[PAXOS_MGR]->dispatch(op);
       return;
 
+    case MSG_MNVMEOF_GW_BEACON:
+       paxos_service[PAXOS_NVMEGW]->dispatch(op);
+       return;
+
+
     // MgrStat
     case MSG_MON_MGR_REPORT:
     case CEPH_MSG_STATFS:
@@ -5366,6 +5381,10 @@ void Monitor::handle_subscribe(MonOpRequestRef op)
     } else if (p->first.find("kv:") == 0) {
       kvmon()->check_sub(s->sub_map[p->first]);
     }
+    else if (p->first == "NVMeofGw") {
+        dout(10) << "NVMeofGw->check_sub " << dendl;
+        nvmegwmon()->check_sub(s->sub_map[p->first]);
+    }
   }
 
   if (reply) {
index 629846ad49032f2a7a75f9bdf6f3713775e8761e..66baaa67a650cc8b79fd0c85874302526565f23b 100644 (file)
@@ -713,6 +713,11 @@ public:
     return (class KVMonitor*) paxos_service[PAXOS_KV].get();
   }
 
+  class NVMeofGwMon *nvmegwmon() {
+      return (class NVMeofGwMon*) paxos_service[PAXOS_NVMEGW].get();
+  }
+
+
   friend class Paxos;
   friend class OSDMonitor;
   friend class MDSMonitor;
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
new file mode 100755 (executable)
index 0000000..869c378
--- /dev/null
@@ -0,0 +1,598 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include <boost/tokenizer.hpp>
+#include "include/stringify.h"
+#include "NVMeofGwMon.h"
+#include "NVMeofGwMap.h"
+#include "OSDMonitor.h"
+
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::string;
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix *_dout << "nvmeofgw " << __PRETTY_FUNCTION__ << " "
+
+// Flatten the created-GWs structure into the per-group GW state map (Gmap)
+// that is distributed to clients: one NvmeGwState per GW, stamped with the
+// current map epoch, with one NqnState per subsystem the GW exposes.
+void NVMeofGwMap::to_gmap(std::map<NvmeGroupKey, NvmeGwMap>& Gmap) const {
+    Gmap.clear();
+    for (const auto& created_map_pair: Created_gws) {
+        const auto& group_key = created_map_pair.first;
+        const NvmeGwCreatedMap& gw_created_map = created_map_pair.second;
+        for (const auto& gw_created_pair: gw_created_map) {
+            const auto& gw_id = gw_created_pair.first;
+            const auto& gw_created  = gw_created_pair.second;
+
+            // Per-GW state snapshot at the current epoch.
+            auto gw_state = NvmeGwState(gw_created.ana_grp_id, epoch);
+            for (const auto& sub: gw_created.subsystems) {
+                gw_state.subsystems.insert({sub.nqn, NqnState(sub.nqn, gw_created.sm_state, gw_created )});
+            }
+            Gmap[group_key][gw_id] = gw_state;
+        }
+    }
+}
+
+int  NVMeofGwMap::cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key) {
+    // Calculate allocated group bitmask
+    bool allocated[MAX_SUPPORTED_ANA_GROUPS] = {false};
+    for (auto& itr: Created_gws[group_key]) {
+        allocated[itr.second.ana_grp_id] = true;
+        if(itr.first == gw_id) {
+            dout(1) << __func__ << " ERROR create GW: already exists in map " << gw_id << dendl;
+            return -EEXIST ;
+        }
+    }
+
+    // Allocate the new group id
+    for(int i=0; i<=MAX_SUPPORTED_ANA_GROUPS; i++) {
+        if (allocated[i] == false) {
+            NvmeGwCreated gw_created(i);
+            Created_gws[group_key][gw_id] = gw_created;
+            dout(4) << __func__ << "Created GWS:  " << Created_gws  <<  dendl;
+            return 0;
+        }
+    }
+    dout(1) << __func__ << " ERROR create GW: " << gw_id << "   ANA groupId was not allocated "   << dendl;
+    return -EINVAL;
+}
+
+// Remove a GW from its group: run the delete FSM for every ANA group the GW
+// might reference, then drop its metadata and created-GW entries (erasing the
+// group containers entirely once they become empty).
+// Returns 0 on success, -EINVAL if the GW id is not found in the group.
+int NVMeofGwMap::cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key) {
+    int rc = 0;
+    for (auto& gws_states: Created_gws[group_key]) {
+
+        if (gws_states.first == gw_id) {
+            auto& state = gws_states.second;
+            for(int i=0; i<MAX_SUPPORTED_ANA_GROUPS; i++){
+                // 'modified' is an out-param of the FSM handler; its value is
+                // deliberately unused here - the caller proposes regardless.
+                bool modified;
+                fsm_handle_gw_delete (gw_id, group_key, state.sm_state[i], i, modified);
+            }
+            dout(4) << " Delete GW :"<< gw_id  << " ANA grpid: " << state.ana_grp_id  << dendl;
+            // Drop per-GW timer metadata, then the created-GW record itself.
+            Gmetadata[group_key].erase(gw_id);
+            if(Gmetadata[group_key].size() == 0)
+                Gmetadata.erase(group_key);
+
+            Created_gws[group_key].erase(gw_id);
+            if(Created_gws[group_key].size() == 0)
+                Created_gws.erase(group_key);
+            return rc;
+        }
+    }
+
+    return -EINVAL;
+}
+
+
+// Handle loss of a GW (missed beacons): mark it UNAVAILABLE, run the gw-down
+// FSM for every ANA group (which may trigger failover to another GW), and
+// finally force each of its per-group states back to standby.
+// Sets propose_pending when the map changed. Returns -EINVAL if the GW id is
+// unknown in this group.
+int NVMeofGwMap::process_gw_map_gw_down(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+                                            bool &propose_pending) {
+    int rc = 0;
+    auto& gws_states = Created_gws[group_key];
+    auto  gw_state = gws_states.find(gw_id);
+    if (gw_state != gws_states.end()) {
+        dout(4) << "GW down " << gw_id << dendl;
+        auto& st = gw_state->second;
+        st.availability = GW_AVAILABILITY_E::GW_UNAVAILABLE;
+        for (NvmeAnaGrpId i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i ++) {
+            fsm_handle_gw_down (gw_id, group_key, st.sm_state[i], i, propose_pending);
+            st.standby_state(i);
+        }
+    }
+    else {
+        dout(1)  << __FUNCTION__ << "ERROR GW-id was not found in the map " << gw_id << dendl;
+        rc = -EINVAL;
+    }
+    return rc;
+}
+
+
+// Handle a keep-alive beacon from a GW. Three cases by current availability:
+//  - GW_CREATED:     first beacon ever; mark AVAILABLE and, unless it is a
+//                    redundant GW, activate it on its own ANA group.
+//  - GW_UNAVAILABLE: GW came back; either reset a redundant GW to standby or
+//                    start failback of its original ANA group.
+//  - GW_AVAILABLE:   steady state; drive the per-group alive FSM (e.g. to
+//                    complete a pending blocklist wait via last_osd_epoch).
+// Sets propose_pending when the map must be proposed to Paxos.
+void NVMeofGwMap::process_gw_map_ka(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  epoch_t& last_osd_epoch, bool &propose_pending)
+{
+    auto& gws_states = Created_gws[group_key];
+    auto  gw_state = gws_states.find(gw_id);
+    ceph_assert (gw_state != gws_states.end());
+    auto& st = gw_state->second;
+    dout(20)  << "KA beacon from the GW " << gw_id << " in state " << (int)st.availability << dendl;
+
+    if (st.availability == GW_AVAILABILITY_E::GW_CREATED) {
+        // first time appears - allow IO traffic for this GW
+        st.availability = GW_AVAILABILITY_E::GW_AVAILABLE;
+        for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++) st.sm_state[i] = GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE;
+        if (st.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { // not a redundand GW
+            st.active_state(st.ana_grp_id);
+        }
+        propose_pending = true;
+    }
+    else if (st.availability == GW_AVAILABILITY_E::GW_UNAVAILABLE) {
+        st.availability = GW_AVAILABILITY_E::GW_AVAILABLE;
+        if (st.ana_grp_id == REDUNDANT_GW_ANA_GROUP_ID) {
+            for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++) st.sm_state[i] = GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE;
+            propose_pending = true; //TODO  try to find the 1st GW overloaded by ANA groups and start  failback for ANA group that it is not an owner of
+        }
+        else {
+            //========= prepare to Failback to this GW =========
+            // find the GW that took over on the group st.ana_grp_id
+            find_failback_gw(gw_id, group_key, propose_pending);
+        }
+    }
+    else if (st.availability == GW_AVAILABILITY_E::GW_AVAILABLE) {
+        for(int i=0; i<MAX_SUPPORTED_ANA_GROUPS; i++){
+          fsm_handle_gw_alive (gw_id, group_key, gw_state->second, st.sm_state[i], i, last_osd_epoch, propose_pending);
+        }
+    }
+}
+
+
+// Periodic sweep for ANA groups left without an active GW:
+//  1. "Failover missed": an UNAVAILABLE non-redundant GW whose ANA group no
+//     available GW currently serves -> pick a failover candidate.
+//  2. "Failback missed": an AVAILABLE non-redundant GW still standing by on
+//     its own ANA group -> try to failback to it.
+// Sets 'propose' when any of these changed the map.
+void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose)
+{
+    propose = false;
+    for (auto& group_state: Created_gws) {
+        auto& group_key = group_state.first;
+        auto& gws_states = group_state.second;
+
+            for (auto& gw_state : gws_states) { // loop for GWs inside nqn group
+                auto& gw_id = gw_state.first;
+                NvmeGwCreated& state = gw_state.second;
+
+                //1. Failover missed : is there is a GW in unavailable state? if yes, is its ANA group handled by some other GW?
+                if (state.availability == GW_AVAILABILITY_E::GW_UNAVAILABLE && state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) {
+                    auto found_gw_for_ana_group = false;
+                    for (auto& gw_state2 : gws_states) {
+                        NvmeGwCreated& state2 = gw_state2.second;
+                        if (state2.availability == GW_AVAILABILITY_E::GW_AVAILABLE && state2.sm_state[state.ana_grp_id] == GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE) {
+                            found_gw_for_ana_group = true; // dout(4) << "Found GW " << ptr2.first << " that handles ANA grp " << (int)state->optimized_ana_group_id << dendl;
+                            break;
+                        }
+                    }
+                    if (found_gw_for_ana_group == false) { //choose the GW for handle ana group
+                        dout(4)<< "Was not found the GW " << " that handles ANA grp " << (int)state.ana_grp_id << " find candidate "<< dendl;
+
+                        // NOTE(review): this scans all group ids, not just
+                        // state.ana_grp_id - presumably intentional so every
+                        // group the down GW served is re-homed; confirm.
+                        for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++)
+                            find_failover_candidate( gw_id, group_key, i, propose );
+                    }
+                }
+
+                //2. Failback missed: Check this GW is Available and Standby and no other GW is doing Failback to it
+                else if (state.availability == GW_AVAILABILITY_E::GW_AVAILABLE
+                            && state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID &&
+                            state.sm_state[state.ana_grp_id] == GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE)
+                {
+                    find_failback_gw(gw_id, group_key, propose);
+                }
+            }
+    }
+}
+
+
+// Make gw_id take over ANA_groupid from failed_gw_id: record the failed peer,
+// blocklist the failed GW's addresses, and either activate immediately (if
+// blocklisting failed, e.g. no nonces yet) or enter the blocklist-wait state
+// with a 30s failover-preparation timer.
+void  NVMeofGwMap::set_failover_gw_for_ANA_group(const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key, const NvmeGwId &gw_id,  NvmeAnaGrpId ANA_groupid)
+{
+    NvmeGwCreated& gw_state = Created_gws[group_key][gw_id];
+    gw_state.failover_peer[ANA_groupid] = failed_gw_id;
+    // Filled in by blocklist_gw() on success (rc == 0); unused otherwise.
+    epoch_t epoch;
+    dout(4) << "Found failower GW " << gw_id << " for ANA group " << (int)ANA_groupid << dendl;
+    int rc = blocklist_gw (failed_gw_id, group_key, ANA_groupid, epoch, true);
+    if(rc){
+        gw_state.active_state(ANA_groupid); //TODO check whether it is valid to  start failover when nonces are empty !
+        //ceph_assert(false);
+    }
+    else{
+        gw_state.sm_state[ANA_groupid] = GW_STATES_PER_AGROUP_E::GW_WAIT_BLOCKLIST_CMPL;
+        gw_state.blocklist_data[ANA_groupid].osd_epoch = epoch;
+        gw_state.blocklist_data[ANA_groupid].is_failover = true;
+        start_timer(gw_id, group_key, ANA_groupid, 30); //start Failover preparation timer
+    }
+}
+
+// A previously-down GW (gw_id) is back: return its own ANA group to it.
+// If no other GW touches that group, activate gw_id on it directly; otherwise
+// find the GW currently ACTIVE on it, move that GW to the failback-prepared
+// state with a short (3s) preparation timer, and mark gw_id as the owner
+// waiting for failback.
+void  NVMeofGwMap::find_failback_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,   bool &propose)
+{
+    auto& gws_states = Created_gws[group_key];
+    auto& gw_state = Created_gws[group_key][gw_id];
+    bool do_failback = false;
+
+    dout(4) << "Find failback GW for GW " << gw_id << dendl;
+    for (auto& gw_state_it: gws_states) {
+        auto& st = gw_state_it.second;
+        if (st.sm_state[gw_state.ana_grp_id] != GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE) {// some other gw owns or owned the desired ana-group
+            do_failback = true;// if candidate is in state ACTIVE for the desired ana-group, then failback starts immediately, otherwise need to wait
+            dout(4) << "Found some gw " << gw_state_it.first  <<  " in state " << st.sm_state[gw_state.ana_grp_id]  << dendl;
+            break;
+        }
+    }
+
+    if (do_failback == false) {
+        // No other gw currently performs some activity with desired ana group of coming-up GW - so it just takes over on the group
+        dout(4)  << "Failback GW candidate was not found, just set Optimized to group " << gw_state.ana_grp_id << " to GW " << gw_id << dendl;
+        gw_state.active_state(gw_state.ana_grp_id);
+        propose = true;
+        return;
+    }
+    //try to do_failback
+    for (auto& gw_state_it: gws_states) {
+        auto& failback_gw_id = gw_state_it.first;
+        auto& st = gw_state_it.second;
+        if (st.sm_state[gw_state.ana_grp_id] == GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE) {
+            // The GW active on our group must have recorded us as its
+            // failover peer, otherwise the map is inconsistent.
+            ceph_assert(st.failover_peer[gw_state.ana_grp_id] == gw_id);
+
+            dout(4)  << "Found Failback GW " << failback_gw_id << " that previously took over the ANAGRP " << gw_state.ana_grp_id << " of the available GW " << gw_id << dendl;
+            st.sm_state[gw_state.ana_grp_id] = GW_STATES_PER_AGROUP_E::GW_WAIT_FAILBACK_PREPARED;
+            start_timer(failback_gw_id, group_key, gw_state.ana_grp_id, 3);// Add timestamp of start Failback preparation
+            gw_state.sm_state[gw_state.ana_grp_id] = GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED;
+            propose = true;
+            break;
+        }
+    }
+}
+
+
+// TODO When decision to change ANA state of group is prepared, need to consider that last seen FSM state is "approved" - means it was returned in beacon alone with map version
+// Pick an available GW to take over 'grpid' from gw_id (which is down or was
+// active on it). Skips the group if any GW is already mid-failover/failback
+// on it; otherwise chooses the available GW serving the fewest ACTIVE groups
+// (GWs in transitive states are weighted out with 0xFFFF). Finally resets
+// gw_id's own state for the group to standby.
+// NOTE(review): the indentation from L1111 onward is misleading - the braces
+// balance such that the min-selection 'if' runs per candidate inside the scan
+// loop, and the final candidate handling runs after it; verify carefully
+// before reformatting.
+void  NVMeofGwMap::find_failover_candidate(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &propose_pending)
+{
+    dout(4) <<__func__<< " " << gw_id << dendl;
+    #define ILLEGAL_GW_ID " "
+    #define MIN_NUM_ANA_GROUPS 0xFFF
+    int min_num_ana_groups_in_gw = 0;
+    int current_ana_groups_in_gw = 0;
+    NvmeGwId min_loaded_gw_id = ILLEGAL_GW_ID;
+
+    auto& gws_states = Created_gws[group_key];
+
+    auto gw_state = gws_states.find(gw_id);
+    ceph_assert(gw_state != gws_states.end());
+
+    // this GW may handle several ANA groups and  for each of them need to found the candidate GW
+    if (gw_state->second.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE || gw_state->second.ana_grp_id == grpid) {
+
+        for (auto& found_gw_state: gws_states) { // for all the gateways of the subsystem
+            auto st = found_gw_state.second;
+            if(st.sm_state[grpid] ==  GW_STATES_PER_AGROUP_E::GW_WAIT_BLOCKLIST_CMPL){   // some GW already started failover/failback on this group
+               dout(4) << "Failover" << st.blocklist_data[grpid].is_failover <<  " already started for the group " << grpid <<  " by GW " << found_gw_state.first << dendl;
+               gw_state->second.standby_state(grpid);
+               return ;
+            }
+        }
+
+        // Find a GW that takes over the ANA group(s)
+        min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
+        min_loaded_gw_id = ILLEGAL_GW_ID;
+        for (auto& found_gw_state: gws_states) { // for all the gateways of the subsystem
+            auto st = found_gw_state.second;
+            if (st.availability == GW_AVAILABILITY_E::GW_AVAILABLE) {
+                current_ana_groups_in_gw = 0;
+                for (int j = 0; j < MAX_SUPPORTED_ANA_GROUPS; j++) {
+                    if (st.sm_state[j] == GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED || st.sm_state[j] == GW_STATES_PER_AGROUP_E::GW_WAIT_FAILBACK_PREPARED
+                                                                                          || st.sm_state[j] == GW_STATES_PER_AGROUP_E::GW_WAIT_BLOCKLIST_CMPL){
+                        current_ana_groups_in_gw = 0xFFFF;
+                        break; // dont take into account   GWs in the transitive state
+                    }
+                    else if (st.sm_state[j] == GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE)
+                        //dout(4) << " process GW down " << current_ana_groups_in_gw << dendl;
+                        current_ana_groups_in_gw++; // how many ANA groups are handled by this GW
+                    }
+
+                    if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) {
+                        min_num_ana_groups_in_gw = current_ana_groups_in_gw;
+                        min_loaded_gw_id = found_gw_state.first;
+                        dout(4) << "choose: gw-id  min_ana_groups " << min_loaded_gw_id << current_ana_groups_in_gw << " min " << min_num_ana_groups_in_gw << dendl;
+                    }
+                }
+            }
+            if (min_loaded_gw_id != ILLEGAL_GW_ID) {
+                propose_pending = true;
+                set_failover_gw_for_ANA_group(gw_id, group_key, min_loaded_gw_id, grpid);
+            }
+            else {
+                if (gw_state->second.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE){// not found candidate but map changed.
+                    propose_pending = true;
+                    dout(4) << "gw down no candidate found " << dendl;
+                }
+            }
+            gw_state->second.standby_state(grpid);
+        }
+}
+
+// Per-group FSM step for an alive GW. Only the blocklist-wait state acts:
+// once the OSDMap epoch reported in the GW's beacon (last_osd_epoch) has
+// caught up with the epoch that carried the blocklist, the takeover is safe -
+// activate the group, cancel the preparation timer and mark the map modified.
+void NVMeofGwMap::fsm_handle_gw_alive (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeGwCreated & gw_state, GW_STATES_PER_AGROUP_E state, NvmeAnaGrpId grpid, epoch_t& last_osd_epoch, bool &map_modified)
+{
+    switch (state) {
+    case GW_STATES_PER_AGROUP_E::GW_WAIT_BLOCKLIST_CMPL:
+    {
+        int timer_val = get_timer(gw_id, group_key, grpid);
+        NvmeGwCreated& gw_map = Created_gws[group_key][gw_id];
+            if(gw_map.blocklist_data[grpid].osd_epoch <= last_osd_epoch){
+                dout(4) << "is-failover: " << gw_map.blocklist_data[grpid].is_failover << " osd epoch changed from " << gw_map.blocklist_data[grpid].osd_epoch << " to "<< last_osd_epoch
+                                       << " Ana-grp: " << grpid  << " timer:" << timer_val << dendl;
+                gw_state.active_state(grpid);                   // Failover Gw still alive and guaranteed that
+                cancel_timer(gw_id, group_key, grpid);          // ana group wouldnt be taken back  during blocklist wait period
+                map_modified = true;
+            }
+            else{
+                dout(20) << "osd epoch not changed from " <<  gw_map.blocklist_data[grpid].osd_epoch << " to "<< last_osd_epoch
+                                                   << " Ana-grp: " << grpid  << " timer:" << timer_val << dendl;
+            }
+    }
+    break;
+
+    default:
+        break;
+    }
+}
+
+ // Per-group FSM step when a GW goes down. Cancels pending timers, rolls back
+ // an in-flight failback (returning the waiting owner GW to standby), and for
+ // an ACTIVE group starts the search for a failover candidate.
+ void NVMeofGwMap::fsm_handle_gw_down(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,   GW_STATES_PER_AGROUP_E state, NvmeAnaGrpId grpid,  bool &map_modified)
+ {
+    switch (state)
+    {
+        case GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE:
+        case GW_STATES_PER_AGROUP_E::GW_IDLE_STATE:
+            // nothing to do
+            break;
+
+        case GW_STATES_PER_AGROUP_E::GW_WAIT_BLOCKLIST_CMPL:
+        {
+            cancel_timer(gw_id, group_key, grpid);
+        }break;
+
+        case GW_STATES_PER_AGROUP_E::GW_WAIT_FAILBACK_PREPARED:
+            cancel_timer(gw_id, group_key,  grpid);
+
+            for (auto& gw_st: Created_gws[group_key]) {
+                auto& st = gw_st.second;
+                if (st.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED) { // found GW   that was intended for  Failback for this ana grp
+                    dout(4) << "Warning: Outgoing Failback when GW is down back - to rollback it"  <<" GW "  <<gw_id << "for ANA Group " << grpid << dendl;
+                    st.standby_state(grpid);
+                    map_modified = true;
+                    break;
+                }
+            }
+            break;
+
+        case GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED:
+            // nothing to do - let failback timer expire
+            break;
+
+        case GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE:
+        {
+            find_failover_candidate( gw_id, group_key, grpid, map_modified);
+        }
+        break;
+
+        default:{
+            // Unknown per-group state - map corruption, abort.
+            ceph_assert(false);
+        }
+
+    }
+ }
+
+
+// Per-group FSM step when a GW is deleted from the configuration. Unlike
+// gw-down, deletion also releases the GW's own ANA group: any GW temporarily
+// serving it (ACTIVE or failback-prepared) is returned to standby, and all
+// pending timers for the deleted GW are cancelled.
+void NVMeofGwMap::fsm_handle_gw_delete (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+     GW_STATES_PER_AGROUP_E state , NvmeAnaGrpId grpid, bool &map_modified) {
+    switch (state)
+    {
+        case GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE:
+        case GW_STATES_PER_AGROUP_E::GW_IDLE_STATE:
+        case GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED:
+        {
+            NvmeGwCreated& gw_state = Created_gws[group_key][gw_id];
+
+            if (grpid == gw_state.ana_grp_id) {// Try to find GW that temporary owns my group - if found, this GW should pass to standby for  this group
+                auto& gateway_states = Created_gws[group_key];
+                for (auto& gs: gateway_states) {
+                    if (gs.second.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE  || gs.second.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_WAIT_FAILBACK_PREPARED){
+                        gs.second.standby_state(grpid);
+                        map_modified = true;
+                        // NOTE(review): this condition can no longer be true -
+                        // standby_state() above was just applied; presumably
+                        // the timer-cancel was meant to happen before the
+                        // state change. Confirm against upstream.
+                        if (gs.second.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_WAIT_FAILBACK_PREPARED)
+                            cancel_timer(gs.first, group_key, grpid);
+                        break;
+                    }
+                }
+            }
+        }
+        break;
+
+        case GW_STATES_PER_AGROUP_E::GW_WAIT_BLOCKLIST_CMPL:
+        {
+            NvmeGwCreated& gw_state = Created_gws[group_key][gw_id];
+            cancel_timer(gw_id, group_key, grpid);
+            map_modified = true;
+            gw_state.standby_state(grpid);
+        }
+        break;
+
+        case GW_STATES_PER_AGROUP_E::GW_WAIT_FAILBACK_PREPARED:
+        {
+            cancel_timer(gw_id, group_key, grpid);
+            for (auto& nqn_gws_state: Created_gws[group_key]) {
+                auto& st = nqn_gws_state.second;
+
+                if (st.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED) { // found GW   that was intended for  Failback for this ana grp
+                    dout(4) << "Warning: Outgoing Failback when GW is deleted - to rollback it" << " GW " << gw_id << "for ANA Group " << grpid << dendl;
+                    st.standby_state(grpid);
+                    map_modified = true;
+                    break;
+                }
+            }
+        }
+        break;
+
+        case GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE:
+        {
+            NvmeGwCreated& gw_state = Created_gws[group_key][gw_id];
+            map_modified = true;
+            gw_state.standby_state(grpid);
+        }
+        break;
+
+        default: {
+            // Unknown per-group state - map corruption, abort.
+            ceph_assert(false);
+        }
+    }
+}
+
+// Timer expiry handler for gw_id / grpid. For an expired failback-preparation
+// timer: locate the group's owner GW; if both GWs' map epochs are current,
+// complete the failback (owner becomes active, the preparing GW goes standby)
+// or, if the owner meanwhile failed/restarted, abort the failback. An expired
+// blocklist-wait timer is treated as fatal (assert).
+void NVMeofGwMap::fsm_handle_to_expired(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid,  bool &map_modified)
+{
+    auto& fbp_gw_state = Created_gws[group_key][gw_id];// GW in Fail-back preparation state fbp
+    bool grp_owner_found = false;
+    if (fbp_gw_state.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_WAIT_FAILBACK_PREPARED) {
+        for (auto& gw_state: Created_gws[group_key]) {
+            auto& st = gw_state.second;
+            if (st.ana_grp_id == grpid){// group owner
+                grp_owner_found = true;
+                if( ! (fbp_gw_state.last_gw_map_epoch_valid  && st.last_gw_map_epoch_valid) ){
+                   //Timer is not cancelled so it would expire over and over as long as both gws are not updated
+                   dout(1) << "gw " << gw_id  <<" or gw " << gw_state.first  << "map epochs are not updated "<< dendl;
+                   return;
+                }
+                cancel_timer(gw_id, group_key, grpid);
+                if (st.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED && st.availability == GW_AVAILABILITY_E::GW_AVAILABLE )
+                {
+                    fbp_gw_state.standby_state(grpid);// Previous failover GW  set to standby
+                    st.active_state(grpid);
+                    dout(4)  << "Expired Failback-preparation timer from GW " << gw_id << " ANA groupId "<< grpid << dendl;
+                    map_modified = true;
+                    break;
+                }
+                else if(st.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE  &&  st.availability == GW_AVAILABILITY_E::GW_AVAILABLE) {
+                   st.active_state(grpid);// GW failed and started during the persistency interval
+                   dout(4)  << "Failback unsuccessfull. GW: " << gw_state.first << "becomes Active for the ana group " << grpid  << dendl;
+                }
+                fbp_gw_state.standby_state(grpid);
+                dout(4) << "Failback unsuccessfull GW: " << gw_id << "becomes standby for the ana group " << grpid  << dendl;
+                map_modified = true;
+                break;
+            }
+       }
+      ceph_assert(grp_owner_found);// when  GW group owner is deleted the fbk gw is put to standby
+    }
+    else if(fbp_gw_state.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_WAIT_BLOCKLIST_CMPL){
+        dout(1) << " Expired GW_WAIT_FAILOVER_PREPARED timer from GW " << gw_id << " ANA groupId: "<< grpid << dendl;
+        ceph_assert(false);
+    }
+}
+
+// Return a reference to the created-GW entry for gw_id within group_key.
+// The caller guarantees the entry exists; a missing entry asserts.
+NvmeGwCreated& NVMeofGwMap::find_already_created_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key)
+{
+    auto& gws = Created_gws[group_key];
+    auto found = gws.find(gw_id);
+    ceph_assert(found != gws.end());//should not happen
+    return found->second;
+}
+
+// Completion context used when the OSD monitor is not writeable at blocklist
+// time: once invoked, it requests the proposal if the osdmon is writeable,
+// otherwise re-queues itself on the writeable waitlist.
+struct CMonRequestProposal : public Context {
+  NVMeofGwMap *m;
+  CMonRequestProposal(NVMeofGwMap *mon) : m(mon) {}
+  void finish(int r) {
+      dout(4) << "osdmon is  writable? " << m->mon->osdmon()->is_writeable() << dendl;
+      if(m->mon->osdmon()->is_writeable()){
+        m->mon->nvmegwmon()->request_proposal(m->mon->osdmon());
+      }
+      else {
+          m->mon->osdmon()->wait_for_writeable_ctx( new CMonRequestProposal(m));
+      }
+  }
+};
+
+int NVMeofGwMap::blocklist_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, epoch_t &epoch, bool failover)
+{
+    NvmeGwCreated& gw_map =  Created_gws[group_key][gw_id];  //find_already_created_gw(gw_id, group_key);
+
+     if (gw_map.nonce_map[grpid].size() > 0){
+        NvmeNonceVector &nonce_vector = gw_map.nonce_map[grpid];;
+        std::string str = "[";
+        entity_addrvec_t addr_vect;
+
+        double d = g_conf().get_val<double>("mon_osd_blocklist_default_expire");
+        utime_t expires = ceph_clock_now();
+        expires += d;
+        dout(4) << " blocklist timestamp " << expires << dendl;
+        for(auto &it: nonce_vector ){
+            if(str != "[") str += ",";
+            str += it;
+        }
+        str += "]";
+        bool rc = addr_vect.parse(&str[0]);
+        dout(10) << str << " rc " << rc <<  " network vector: " << addr_vect << " " << addr_vect.size() << dendl;
+        ceph_assert(rc);
+        epoch = mon->osdmon()->blocklist(addr_vect, expires);
+
+        if (!mon->osdmon()->is_writeable()) {
+            dout(4) << "osdmon is not writable, waiting " << dendl;
+            mon->osdmon()->wait_for_writeable_ctx( new CMonRequestProposal(this ));// return false;
+        }
+        else  mon->nvmegwmon()->request_proposal(mon->osdmon());
+        dout(4) << str << " mon->osdmon()->blocklist:  " << epoch <<  " address vector: " << addr_vect << " " << addr_vect.size() << dendl;
+    }
+    else{
+        dout(1) << "Error: No nonces context present for gw: " <<gw_id  << " ANA group: " << grpid << dendl;
+        return 1;
+    }
+    return 0;
+}
+
+// Scan all armed per-(GW, ANA-group) timers and fire fsm_handle_to_expired()
+// for every one whose deadline has passed; propose_pending is set by the
+// expiry handlers when the map changed.
+void NVMeofGwMap::update_active_timers( bool &propose_pending ){
+
+    //dout(4) << __func__  <<  " called,  p_monitor: " << mon << dendl;
+    const auto now = std::chrono::system_clock::now();
+    for (auto& group_md: Gmetadata) {
+        auto& group_key = group_md.first;
+        auto& pool = group_key.first;
+        auto& group = group_key.second;
+        for (auto& gw_md: group_md.second) {
+            auto& gw_id = gw_md.first;
+            auto& md = gw_md.second;
+            for (size_t ana_grpid = 0; ana_grpid < MAX_SUPPORTED_ANA_GROUPS; ana_grpid ++) {
+                // timer_started == 0 means this slot's timer is disarmed.
+                if (md.data[ana_grpid].timer_started == 0) continue;
+                dout(20) << "Checking timer for GW " << gw_id << " ANA GRP " << ana_grpid<< " value(seconds): "<< (int)md.data[ana_grpid].timer_value << dendl;
+                if(now >= md.data[ana_grpid].end_time){
+                    fsm_handle_to_expired (gw_id, std::make_pair(pool, group), ana_grpid, propose_pending);
+                }
+            }
+        }
+    }
+}
+
+
+void NVMeofGwMap::start_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid, uint8_t value_sec) {
+    Gmetadata[group_key][gw_id].data[anagrpid].timer_started = 1;
+    Gmetadata[group_key][gw_id].data[anagrpid].timer_value = value_sec;
+    dout(4) << "start timer for ana " << anagrpid << " gw " << gw_id << "value sec " << (int)value_sec << dendl;
+    const auto now = std::chrono::system_clock::now();
+    Gmetadata[group_key][gw_id].data[anagrpid].end_time = now + std::chrono::seconds(value_sec);
+}
+
+// Return the configured timer value (seconds) for this GW / ANA group.
+int  NVMeofGwMap::get_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid) {
+    return Gmetadata[group_key][gw_id].data[anagrpid].timer_value;
+}
+
+// Disarm the per-(GW, ANA-group) timer; update_active_timers() skips slots
+// with timer_started == 0.
+void NVMeofGwMap::cancel_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid) {
+    Gmetadata[group_key][gw_id].data[anagrpid].timer_started = 0;
+}
diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h
new file mode 100755 (executable)
index 0000000..bf46b31
--- /dev/null
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef MON_NVMEOFGWMAP_H_
+#define MON_NVMEOFGWMAP_H_
+#include <map>
+#include <iostream>
+#include "include/encoding.h"
+#include "include/utime.h"
+#include "common/Formatter.h"
+#include "common/ceph_releases.h"
+#include "common/version.h"
+#include "common/options.h"
+#include "common/Clock.h"
+#include "msg/Message.h"
+#include "common/ceph_time.h"
+#include "NVMeofGwTypes.h"
+
+using ceph::coarse_mono_clock;
+class Monitor;
+/*-------------------*/
+// Monitor-side map of NVMe-oF gateways: tracks created GWs per (pool, group),
+// their per-ANA-group FSM state, and the failover/failback timers. Encoded
+// into the Paxos store and distributed to GW clients (see to_gmap()).
+class NVMeofGwMap
+{
+public:
+    Monitor*                            mon           = NULL;   // owning monitor; set externally
+    epoch_t                             epoch         = 0;      // epoch is for Paxos synchronization  mechanizm
+    bool                                delay_propose = false;
+
+    // (pool, group) -> gw_id -> created-GW record (FSM state, ANA group, nonces).
+    std::map<NvmeGroupKey, NvmeGwCreatedMap>  Created_gws;
+    // (pool, group) -> gw_id -> timer metadata per ANA group.
+    std::map<NvmeGroupKey, NvmeGwMetaDataMap> Gmetadata;
+    void to_gmap(std::map<NvmeGroupKey, NvmeGwMap>& Gmap) const;
+
+    // Configuration (mon command) entry points.
+    int   cfg_add_gw                    (const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+    int   cfg_delete_gw                 (const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+    // Beacon / liveness driven entry points.
+    void  process_gw_map_ka             (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, epoch_t& last_osd_epoch,  bool &propose_pending);
+    int   process_gw_map_gw_down        (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending);
+    void  update_active_timers          (bool &propose_pending);
+    void  handle_abandoned_ana_groups   (bool &propose_pending);
+    void  handle_removed_subsystems     (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const std::vector<NvmeNqnId> &current_subsystems, bool &propose_pending);
+    void  start_timer (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid, uint8_t value);
+private:
+    NvmeGwCreated&   find_already_created_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+    // Per-ANA-group FSM transitions.
+    void fsm_handle_gw_down    (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  GW_STATES_PER_AGROUP_E state, NvmeAnaGrpId grpid,  bool &map_modified);
+    void fsm_handle_gw_delete  (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  GW_STATES_PER_AGROUP_E state, NvmeAnaGrpId grpid,  bool &map_modified);
+    void fsm_handle_gw_alive   (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeGwCreated & gw_state, GW_STATES_PER_AGROUP_E state,
+                                                                                   NvmeAnaGrpId grpid, epoch_t& last_osd_epoch, bool &map_modified);
+    void fsm_handle_to_expired (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeAnaGrpId grpid,  bool &map_modified);
+
+    // Failover / failback orchestration helpers.
+    void find_failover_candidate(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  NvmeAnaGrpId grpid, bool &propose_pending);
+    void find_failback_gw       (const NvmeGwId &gw_id, const NvmeGroupKey& group_key,  bool &propose_pending);
+    void set_failover_gw_for_ANA_group (const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key, const NvmeGwId &gw_id,
+                                                                                                     NvmeAnaGrpId groupid);
+    int  blocklist_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId ANA_groupid, epoch_t &epoch, bool failover);
+
+    int  get_timer   (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid);
+    void cancel_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid);
+
+public:
+    // Paxos serialization (versioned encode/decode, v1).
+    void encode(ceph::buffer::list &bl) const {
+        using ceph::encode;
+        ENCODE_START(1, 1, bl);
+        encode(epoch, bl);// global map epoch
+
+        encode(Created_gws, bl); //Encode created GWs
+        encode(Gmetadata, bl);
+        ENCODE_FINISH(bl);
+    }
+
+    void decode(ceph::buffer::list::const_iterator &bl) {
+        using ceph::decode;
+        DECODE_START(1, bl);
+        decode(epoch, bl);
+
+        decode(Created_gws, bl);
+        decode(Gmetadata, bl);
+        DECODE_FINISH(bl);
+    }
+};
+
+#include "NVMeofGwSerialize.h"
+
+#endif /* SRC_MON_NVMEOFGWMAP_H_ */
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc
new file mode 100644 (file)
index 0000000..bdbdec7
--- /dev/null
@@ -0,0 +1,546 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include <vector>
+
+#include <boost/tokenizer.hpp>
+#include "include/stringify.h"
+#include "NVMeofGwMon.h"
+#include "messages/MNVMeofGwBeacon.h"
+#include "messages/MNVMeofGwMap.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix *_dout << "nvmeofgw " << __PRETTY_FUNCTION__ << " "
+
+using std::string;
+
+// One-time service init: register as a config observer so
+// handle_conf_change() is invoked on relevant option changes.
+void NVMeofGwMon::init(){
+    dout(4) <<  "called " << dendl;
+    g_conf().add_observer(this);
+}
+
+// Called when paxos restarts (e.g. after an election): drop the stale
+// beacon bookkeeping so GWs are not falsely timed out.
+void NVMeofGwMon::on_restart(){
+    dout(4) <<  "called " << dendl;
+    last_beacon.clear();
+    last_tick = ceph::coarse_mono_clock::now();
+}
+
+
+// Rebuild last_beacon from the pending map when this mon becomes leader,
+// seeding every currently-AVAILABLE GW with "seen now" so the new leader
+// can detect their transition to unavailable.
+void NVMeofGwMon::synchronize_last_beacon(){
+    dout(4) <<  "called " << dendl;
+    last_beacon.clear();
+    last_tick = ceph::coarse_mono_clock::now();
+    // Initialize last_beacon to identify transitions of available  GWs to unavailable state
+    for (const auto& created_map_pair: pending_map.Created_gws) {
+      const auto& group_key = created_map_pair.first;
+      const NvmeGwCreatedMap& gw_created_map = created_map_pair.second;
+      for (const auto& gw_created_pair: gw_created_map) {
+          const auto& gw_id = gw_created_pair.first;
+          if (gw_created_pair.second.availability == GW_AVAILABILITY_E::GW_AVAILABLE){
+             dout(4) << "synchronize last_beacon for  GW :" << gw_id << dendl;
+             LastBeacon lb = {gw_id, group_key};
+             last_beacon[lb] = last_tick;
+          }
+      }
+    }
+}
+
+// Mirror of init(): unregister the config observer on service shutdown.
+void NVMeofGwMon::on_shutdown() {
+     g_conf().remove_observer(this);
+}
+
+void NVMeofGwMon::tick(){
+    if (!is_active() || !mon.is_leader()){
+        dout(10) << "NVMeofGwMon leader : " << mon.is_leader() << "active : " << is_active()  << dendl;
+        last_leader = false;
+        return;
+    }
+    bool _propose_pending = false;
+  
+    const auto now = ceph::coarse_mono_clock::now();
+    const auto nvmegw_beacon_grace = g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_beacon_grace"); 
+    dout(15) <<  "NVMeofGwMon leader got a real tick, pending epoch "<< pending_map.epoch     << dendl;
+
+    const auto mgr_tick_period = g_conf().get_val<std::chrono::seconds>("mgr_tick_period");
+
+    if (last_tick != ceph::coarse_mono_clock::zero()
+          && (now - last_tick > (nvmegw_beacon_grace - mgr_tick_period))) {
+        // This case handles either local slowness (calls being delayed
+        // for whatever reason) or cluster election slowness (a long gap
+        // between calls while an election happened)
+        dout(4) << ": resetting beacon timeouts due to mon delay "
+                "(slow election?) of " << now - last_tick << " seconds" << dendl;
+        for (auto &i : last_beacon) {
+          i.second = now;
+        }
+    }
+
+    last_tick = now;
+    bool propose = false;
+
+    pending_map.update_active_timers(propose);  // Periodic: check active FSM timers
+    _propose_pending |= propose;
+
+
+    //handle exception of tick overdued in order to avoid false detection of overdued beacons , see MgrMonitor::tick
+    const auto cutoff = now - nvmegw_beacon_grace;
+    for(auto &itr : last_beacon){// Pass over all the stored beacons
+        auto& lb = itr.first;
+        auto last_beacon_time = itr.second;
+        if(last_beacon_time < cutoff){
+            dout(4) << "beacon timeout for GW " << lb.gw_id << dendl;
+            pending_map.process_gw_map_gw_down( lb.gw_id, lb.group_key, propose);
+            _propose_pending |= propose;
+            last_beacon.erase(lb);
+        }
+        else {
+           dout(20) << "beacon live for GW key: " << lb.gw_id << dendl;
+        }
+    }
+
+    pending_map.handle_abandoned_ana_groups(propose); // Periodic: take care of not handled ANA groups
+    _propose_pending |= propose;
+
+    if(_propose_pending){
+       //pending_map.delay_propose = true; // not to send map to clients immediately in "update_from_paxos"
+       dout(4) << "propose pending " <<dendl;
+       propose_pending();
+    }
+
+    // if propose_pending returned true , call propose_pending method of the paxosService
+    // todo understand the logic of paxos.plugged for sending several propose_pending see MgrMonitor::tick
+}
+
+// Config-observer hook: the list of watched option names.
+// Currently empty (NULL-terminated), i.e. no options are tracked yet.
+const char **NVMeofGwMon::get_tracked_conf_keys() const
+{
+  static const char* KEYS[] = {
+    NULL
+  };
+  return KEYS;
+}
+
+// Config-observer hook; only logs for now since no keys are tracked.
+void NVMeofGwMon::handle_conf_change(const ConfigProxy& conf,
+                                    const std::set<std::string> &changed)
+{
+  dout(4) << "changed " << changed << dendl;
+}
+
+// Start a new pending epoch from the last committed map. On a peon that
+// just became leader, also re-seed last_beacon from the fresh map.
+void NVMeofGwMon::create_pending(){
+
+    pending_map = map;// deep copy of the object
+    // TODO  since "pending_map"  can be reset  each time during paxos re-election even in the middle of the changes ...
+    pending_map.epoch++;
+    dout(4) << " pending " << pending_map  << dendl;
+    if(last_leader == false){ // peon becomes leader and gets updated map , need to synchronize the last_beacon
+        synchronize_last_beacon();
+        last_leader = true;
+    }
+}
+
+// Write the pending map into the paxos transaction under its epoch.
+// The epoch must be exactly one past the last committed version.
+void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t){
+
+    dout(10) <<  dendl;
+    ceph_assert(get_last_committed() + 1 == pending_map.epoch);
+    bufferlist bl;
+    pending_map.encode(bl);
+    put_version(t, pending_map.epoch, bl);
+    put_last_committed(t, pending_map.epoch);
+}
+
+// Load the latest committed map from the store when our in-memory epoch
+// is stale, then push it to subscribers.
+void NVMeofGwMon::update_from_paxos(bool *need_bootstrap){
+    version_t version = get_last_committed();
+
+    //dout(4) <<  MY_MON_PREFFIX << __func__ << " version "  << version  << " map.epoch " << map.epoch << dendl;
+
+    if (version != map.epoch) {
+        dout(4) << " NVMeGW loading version " << version  << " " << map.epoch << dendl;
+
+        bufferlist bl;
+        int err = get_version(version, bl);
+        ceph_assert(err == 0);
+
+        auto p = bl.cbegin();
+        map.decode(p);
+        // NOTE(review): this branch logs on non-leaders although the message
+        // says "leader map" — confirm the intended log text.
+        if(!mon.is_leader()) {
+            dout(4) << "leader map: " << map <<  dendl;
+        }
+        check_subs(true);
+    }
+}
+
+// Send the current map to one subscriber if its next wanted epoch is
+// already committed; one-time subs are removed after delivery, persistent
+// subs advance their next-epoch cursor.
+void NVMeofGwMon::check_sub(Subscription *sub)
+{
+   /* MgrMonitor::check_sub*/
+    //if (sub->type == "NVMeofGw") {
+    dout(10) << "sub->next , map-epoch " << sub->next << " " << map.epoch << dendl;
+    if (sub->next <= map.epoch)
+    {
+      dout(4) << "Sending map to subscriber " << sub->session->con << " " << sub->session->con->get_peer_addr() << dendl;
+      sub->session->con->send_message2(make_message<MNVMeofGwMap>(map));
+
+      if (sub->onetime) {
+        mon.session_map.remove_sub(sub);
+      } else {
+        sub->next = map.epoch + 1;
+      }
+    }
+}
+
+// Push the current map to every "NVMeofGw" subscriber, if there are any.
+// The bool argument is currently unused.
+void NVMeofGwMon::check_subs(bool t)
+{
+  const std::string type = "NVMeofGw";
+  dout(4) <<  "count " << mon.session_map.subs.count(type) << dendl;
+
+  // Look the subscriber list up once; bail out when nobody subscribed.
+  auto sub_it = mon.session_map.subs.find(type);
+  if (sub_it == mon.session_map.subs.end()) {
+      return;
+  }
+  for (auto sub : *(sub_it->second)) {
+    check_sub(sub);
+  }
+}
+
+bool NVMeofGwMon::preprocess_query(MonOpRequestRef op){
+    dout(20) << dendl;
+
+    auto m = op->get_req<PaxosServiceMessage>();
+      switch (m->get_type()) {
+        case MSG_MNVMEOF_GW_BEACON:
+          return preprocess_beacon(op);
+
+        case MSG_MON_COMMAND:
+          try {
+        return preprocess_command(op);
+          } catch (const bad_cmd_get& e) {
+          bufferlist bl;
+          mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+          return true;
+        }
+
+        default:
+          mon.no_reply(op);
+          derr << "Unhandled message type " << m->get_type() << dendl;
+          return true;
+      }
+    return false;
+}
+
+bool NVMeofGwMon::prepare_update(MonOpRequestRef op){
+    auto m = op->get_req<PaxosServiceMessage>();
+      switch (m->get_type()) {
+        case MSG_MNVMEOF_GW_BEACON:
+          return prepare_beacon(op);
+
+        case MSG_MON_COMMAND:
+          try {
+        return prepare_command(op);
+          } catch (const bad_cmd_get& e) {
+        bufferlist bl;
+        mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+        return false; /* nothing to propose! */
+          }
+
+        default:
+          mon.no_reply(op);
+          dout(1) << "Unhandled message type " << m->get_type() << dendl;
+          return false; /* nothing to propose! */
+      }
+    return true;
+}
+
+// Command read path: only validates the command JSON here and then returns
+// false so the op is forwarded to prepare_command() on the leader.
+bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
+{
+    dout(4) << dendl;
+    auto m = op->get_req<MMonCommand>();
+    std::stringstream ss;
+    bufferlist rdata;
+
+    cmdmap_t cmdmap;
+    if (!cmdmap_from_json(m->cmd, &cmdmap, ss))
+    {
+        string rs = ss.str();
+        dout(1) << "Invalid command "  << m->cmd << "Error " << rs << dendl;
+        mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+        return true;
+    }
+
+    string prefix;
+    cmd_getval(cmdmap, "prefix", prefix);
+    dout(4) << "MonCommand : "<< prefix <<  dendl;
+
+   /* MonSession *session = op->get_session();
+    if (!session)
+    {
+        dout(4) << "MonCommand : "<< prefix << " access denied due to lack of session" <<  dendl;
+        mon.reply_command(op, -EACCES, "access denied", rdata,
+                          get_last_committed());
+        return true;
+    }
+   */
+    // Formatter is created but not used on this path; prepare_command does
+    // the actual rendering.
+    string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+    boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+    // TODO   need to check formatter per preffix  - if f is NULL
+
+    // false => not handled here, forward to the leader's prepare_command.
+    return false;
+}
+
+bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
+{
+    dout(4)  << dendl;
+    auto m = op->get_req<MMonCommand>();
+    int rc;
+    std::stringstream ss;
+    bufferlist rdata;
+    string rs;
+    int err = 0;
+    cmdmap_t cmdmap;
+
+    if (!cmdmap_from_json(m->cmd, &cmdmap, ss))
+    {
+        string rs = ss.str();
+        mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+        return true;
+    }
+
+    MonSession *session = op->get_session();
+    if (!session)
+    {
+        mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+        return true;
+    }
+
+    string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+    boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+    const auto prefix = cmd_getval_or<string>(cmdmap, "prefix", string{});
+
+    dout(4) << "MonCommand : "<< prefix <<  dendl;
+    if( prefix == "nvme-gw create" || prefix == "nvme-gw delete" ) {
+        std::string id, pool, group;
+
+        cmd_getval(cmdmap, "id", id);
+        cmd_getval(cmdmap, "pool", pool);
+        cmd_getval(cmdmap, "group", group);
+        auto group_key = std::make_pair(pool, group);
+        dout(4) << " id "<< id <<" pool "<< pool << " group "<< group << dendl;
+        if(prefix == "nvme-gw create"){
+            rc = pending_map.cfg_add_gw(id, group_key);
+            ceph_assert(rc!= -EINVAL);
+        }
+        else{
+            rc = pending_map.cfg_delete_gw(id, group_key);
+            if(rc== -EINVAL){
+                dout (1) << "Error: GW not found in the database " << id << " " << pool << " " << group << "  rc " << rc << dendl;
+                err = rc;
+                ss.str("");
+            }
+        }
+        if((rc != -EEXIST) && (rc != -EINVAL)){
+            //propose pending would be generated by the PaxosService
+            goto update;
+        }
+        else {
+            goto reply_no_propose;
+        }
+    }
+    else if ( prefix == "nvme-gw show" ){
+        std::string  pool, group;
+        if (!f) {
+          f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
+        }
+        cmd_getval(cmdmap, "pool", pool);
+        cmd_getval(cmdmap, "group", group);
+        auto group_key = std::make_pair(pool, group);
+        dout(4) <<"nvme-gw show  pool "<< pool << " group "<< group << dendl;
+
+        if( map.Created_gws[group_key].size()){
+            f->open_object_section("common");
+            f->dump_string("pool", pool);
+            f->dump_string("group", group);
+            f->dump_unsigned("num gws", map.Created_gws[group_key].size());
+            ss <<"[ ";
+            NvmeAnaGrpId anas[MAX_SUPPORTED_ANA_GROUPS];
+            int i = 0;
+            for (auto& gw_created_pair: map.Created_gws[group_key]) {
+                 auto& st = gw_created_pair.second;
+                 ss << st.ana_grp_id+1 << " ";
+                 anas[i++] = st.ana_grp_id;
+            }
+            ss << "]";
+            f->dump_string("Anagrp list", ss.str());
+            f->close_section();
+
+            for (auto& gw_created_pair: map.Created_gws[group_key]) {
+                 auto& gw_id = gw_created_pair.first;
+                 auto& state = gw_created_pair.second;
+                 f->open_object_section("stat");
+                 f->dump_string("gw-id", gw_id);
+                 f->dump_unsigned("anagrp-id",state.ana_grp_id+1);
+                 f->dump_unsigned("last-gw_map-epoch-valid",state.last_gw_map_epoch_valid);
+                 std::stringstream  ss1;
+                 ss1 << state.availability;
+                 f->dump_string("Availability", ss1.str());
+                 ss1.str("");
+                 for (size_t i = 0; i < map.Created_gws[group_key].size(); i++) {
+                         ss1 << " " << anas[i]+1 <<": " << state.sm_state[anas[i]] << ",";
+                 }
+                 f->dump_string("ana states", ss1.str());
+                 f->close_section();
+            }
+            f->flush(rdata);
+            ss.str("");
+        }
+        else {
+            ss << "num_gws  0";
+        }
+    }
+
+  reply_no_propose:
+    getline(ss, rs);
+    if (err < 0 && rs.length() == 0)
+    {
+        rs = cpp_strerror(err);
+        dout(1) << "Error command  err : "<< err  << " rs-len: " << rs.length() <<  dendl;
+    }
+    mon.reply_command(op, err, rs, rdata, get_last_committed());
+    return false; /* nothing to propose */
+
+  update:
+    getline(ss, rs);
+    wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                            get_last_committed() + 1));
+    return true;
+}
+
+
+// Beacon read path: only validates that the op has a session, then returns
+// false so the leader's prepare_beacon() processes (and acks) the beacon.
+bool NVMeofGwMon::preprocess_beacon(MonOpRequestRef op){
+    //dout(4)   << dendl;
+    auto m = op->get_req<MNVMeofGwBeacon>();
+    const BeaconSubsystems& sub = m->get_subsystems();
+     //mon.no_reply(op); // we never reply to beacons
+     dout(15) << "beacon from " << m->get_type() << " GW : " << m->get_gw_id()  << " num subsystems " << sub.size() <<  dendl;
+     MonSession *session = op->get_session();
+     if (!session){
+         // No session => drop the beacon; nothing to ack.
+         dout(4) << "beacon no session "  << dendl;
+         return true;
+     }
+
+    return false; // allways  return false to call leader's prepare beacon
+}
+
+
+//#define BYPASS_GW_CREATE_CLI
+
+// Process a GW beacon on the leader: refresh the GW's nonce map, subsystem
+// list and liveness bookkeeping, drive the pending map's FSM, and — when no
+// map change is proposed — reply directly with the GW's slice of the map.
+// Returns true iff the pending map changed and must be proposed via paxos.
+bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op){
+    //dout(4)  << dendl;
+    auto m = op->get_req<MNVMeofGwBeacon>();
+
+    dout(20) << "availability " <<  m->get_availability() << " GW : " << m->get_gw_id() <<
+        " osdmap_epoch " << m->get_last_osd_epoch() << " subsystems " << m->get_subsystems() << dendl;
+
+    NvmeGwId gw_id = m->get_gw_id();
+    NvmeGroupKey group_key = std::make_pair(m->get_gw_pool(),  m->get_gw_group());
+    GW_AVAILABILITY_E  avail = m->get_availability();
+    bool propose = false;
+    bool nonce_propose = false;
+    bool timer_propose = false;
+    bool gw_created = true;
+    NVMeofGwMap ack_map;
+    auto& group_gws = map.Created_gws[group_key];
+    auto gw = group_gws.find(gw_id);
+    const BeaconSubsystems& sub = m->get_subsystems();
+
+    // A GW announcing itself as just-created is acked with its map slice
+    // (if known) without touching the pending map.
+    if (avail == GW_AVAILABILITY_E::GW_CREATED){
+        if (gw == group_gws.end()) {
+           gw_created = false;
+           dout(4) << "Warning: GW " << gw_id << " group_key " << group_key << " was not found in the  map.Created_gws "<< map.Created_gws <<dendl;
+        }
+        goto set_propose;
+    }
+
+    // At this stage the gw has to be in the Created_gws
+    if(gw == group_gws.end()){
+        dout(1) << "Error : Administratively deleted GW sends beacon " << gw_id <<dendl;
+        goto false_return; // not sending ack to this beacon
+    }
+
+    // deep copy the whole nonce map of this GW
+    if(m->get_nonce_map().size()) {
+        if(pending_map.Created_gws[group_key][gw_id].nonce_map != m->get_nonce_map())
+        {
+            dout(4) << "nonce map of GW  changed , propose pending " << gw_id << dendl;
+            pending_map.Created_gws[group_key][gw_id].nonce_map = m->get_nonce_map();
+            dout(4) << "nonce map of GW " << gw_id << " "<< pending_map.Created_gws[group_key][gw_id].nonce_map  << dendl;
+            nonce_propose = true;
+        }
+    }
+    else  {
+        dout(4) << "Warning: received empty nonce map in the beacon of GW " << gw_id << " "<< dendl;
+    }
+
+    //pending_map.handle_removed_subsystems(gw_id, group_key, configured_subsystems, propose);
+
+    //if  no subsystem configured set gw as avail = GW_AVAILABILITY_E::GW_UNAVAILABLE
+
+    if(sub.size() == 0) {
+        avail = GW_AVAILABILITY_E::GW_UNAVAILABLE;
+    }
+    pending_map.Created_gws[group_key][gw_id].subsystems =  sub;
+    // Record whether the GW has seen the current map epoch; a stale GW is
+    // flagged so "nvme-gw show" can surface it.
+    pending_map.Created_gws[group_key][gw_id].last_gw_map_epoch_valid = ( map.epoch == m->get_last_gwmap_epoch() );
+    if( pending_map.Created_gws[group_key][gw_id].last_gw_map_epoch_valid == false ){
+      dout(1) <<  "map epoch of gw is not up-to-date " << gw_id << " epoch " << map.epoch << " beacon_epoch " << m->get_last_gwmap_epoch() <<  dendl;
+    }
+    if(avail == GW_AVAILABILITY_E::GW_AVAILABLE)
+    {
+        //dout(4) <<"subsystems from beacon " << pending_map.Created_gws << dendl;
+        auto now = ceph::coarse_mono_clock::now();
+        // check pending_map.epoch vs m->get_version() - if different - drop the beacon
+
+        // Keep-alive: refresh the liveness timestamp and drive the FSM.
+        LastBeacon lb = {gw_id, group_key};
+        last_beacon[lb] = now;
+        epoch_t last_osd_epoch = m->get_last_osd_epoch();
+        pending_map.process_gw_map_ka(gw_id, group_key, last_osd_epoch, propose);
+    }
+    else if(avail == GW_AVAILABILITY_E::GW_UNAVAILABLE){ // state set by GW client application
+        //  TODO: remove from last_beacon if found . if gw was found in last_beacon call process_gw_map_gw_down
+
+        LastBeacon lb = {gw_id, group_key};
+
+        auto it = last_beacon.find(lb);
+        if (it != last_beacon.end()){
+            last_beacon.erase(lb);
+            pending_map.process_gw_map_gw_down(gw_id, group_key, propose);
+        }
+    }
+    pending_map.update_active_timers(timer_propose);  // Periodic: check active FSM timers
+    propose |= timer_propose;
+    propose |= nonce_propose;
+
+set_propose:
+    if(!propose) {
+      if(gw_created){
+          ack_map.Created_gws[group_key][gw_id] = map.Created_gws[group_key][gw_id];// respond with a map slice correspondent to the same GW
+      }
+      ack_map.epoch = map.epoch;
+      dout(20) << "ack_map " << ack_map <<dendl;
+      auto msg = make_message<MNVMeofGwMap>(ack_map);
+      mon.send_reply(op, msg.detach());
+    }
+false_return:
+    if (propose){
+      dout(4) << "decision in prepare_beacon" <<dendl;
+      return true;
+    }
+    else 
+     return false; // if no changes are need in the map
+}
diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h
new file mode 100644 (file)
index 0000000..33313e8
--- /dev/null
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef  MON_NVMEGWMONITOR_H_
+#define  MON_NVMEGWMONITOR_H_
+
+#include "PaxosService.h"
+#include "NVMeofGwMap.h"
+
+struct LastBeacon {
+    NvmeGwId gw_id;
+    NvmeGroupKey group_key;
+
+    // Comparison operators to allow usage as a map key
+    bool operator<(const LastBeacon& other) const {
+        if (gw_id != other.gw_id) return gw_id < other.gw_id;
+        return group_key < other.group_key;
+    }
+
+    bool operator==(const LastBeacon& other) const {
+        return gw_id == other.gw_id &&
+               group_key == other.group_key;
+    }
+};
+
+// Monitor PaxosService tracking NVMe-oF gateway liveness (via beacons) and
+// driving failover/failback of ANA groups through the NVMeofGwMap.
+class NVMeofGwMon: public PaxosService,
+                   public md_config_obs_t
+{
+    NVMeofGwMap map;  //NVMeGWMap
+    NVMeofGwMap pending_map;
+
+    // Per-GW timestamp of the most recently received beacon.
+    std::map<LastBeacon, ceph::coarse_mono_clock::time_point> last_beacon;
+
+    // when the mon was not updating us for some period (e.g. during slow
+    // election) to reset last_beacon timeouts
+    ceph::coarse_mono_clock::time_point last_tick;
+
+    std::vector<MonCommand> command_descs;
+    std::vector<MonCommand> pending_command_descs;
+
+public:
+    NVMeofGwMon(Monitor &mn, Paxos &p, const std::string& service_name): PaxosService(mn, p, service_name) {map.mon = &mn; last_leader = false;}
+    ~NVMeofGwMon() override {}
+
+
+    // config observer
+    const char** get_tracked_conf_keys() const override;
+    void handle_conf_change(const ConfigProxy& conf, const std::set<std::string> &changed) override;
+
+    // 3 pure virtual methods of the paxosService
+    void create_initial()override{};
+    void create_pending()override ;
+    void encode_pending(MonitorDBStore::TransactionRef t)override ;
+
+    void init() override;
+    void on_shutdown() override;
+    void on_restart() override;
+    void update_from_paxos(bool *need_bootstrap) override;
+
+
+    // Message dispatch: read path / write (paxos) path.
+    bool preprocess_query(MonOpRequestRef op) override;
+    bool prepare_update(MonOpRequestRef op) override;
+
+    // "nvme-gw create/delete/show" command handling.
+    bool preprocess_command(MonOpRequestRef op);
+    bool prepare_command(MonOpRequestRef op);
+
+    void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+    // Gateway keep-alive beacons.
+    bool preprocess_beacon(MonOpRequestRef op);
+    bool prepare_beacon(MonOpRequestRef op);
+
+    void tick() override;
+    void print_summary(ceph::Formatter *f, std::ostream *ss) const;
+
+    // Map distribution to "NVMeofGw" subscribers.
+    void check_subs(bool type);
+    void check_sub(Subscription *sub);
+
+private:
+    // True while this mon is acting leader; used to detect the peon->leader
+    // transition that requires re-seeding last_beacon.
+    bool last_leader;
+    void synchronize_last_beacon();
+
+};
+
+#endif /* MON_NVMEGWMONITOR_H_ */
diff --git a/src/mon/NVMeofGwSerialize.h b/src/mon/NVMeofGwSerialize.h
new file mode 100755 (executable)
index 0000000..0c8414e
--- /dev/null
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+#ifndef MON_NVMEOFGWSERIALIZE_H_
+#define MON_NVMEOFGWSERIALIZE_H_
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define MODULE_PREFFIX "nvmeofgw "
+#define dout_prefix *_dout << MODULE_PREFFIX << __PRETTY_FUNCTION__ << " "
+
+// Render an exported (client-facing) per-ANA-group access state for logging.
+inline std::ostream& operator<<(std::ostream& os, const GW_EXPORTED_STATES_PER_AGROUP_E value) {
+    if (value == GW_EXPORTED_STATES_PER_AGROUP_E::GW_EXPORTED_OPTIMIZED_STATE) {
+        os << "OPTIMIZED ";
+    } else if (value == GW_EXPORTED_STATES_PER_AGROUP_E::GW_EXPORTED_INACCESSIBLE_STATE) {
+        os << "INACCESSIBLE ";
+    } else {
+        os << "Invalid " << (int)value << " ";
+    }
+    return os;
+}
+
+// Render an internal per-ANA-group FSM state for logging.
+inline std::ostream& operator<<(std::ostream& os, const GW_STATES_PER_AGROUP_E value) {
+    switch (value) {
+        case GW_STATES_PER_AGROUP_E::GW_IDLE_STATE:                  os << "IDLE "; break;
+        case GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE:               os << "STANDBY "; break;
+        case GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE:                os << "ACTIVE "; break;
+        case GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED: os << "OWNER_FAILBACK_PREPARED "; break;
+        case GW_STATES_PER_AGROUP_E::GW_WAIT_FAILBACK_PREPARED:      os << "WAIT_FAILBACK_PREPARED "; break;
+        case GW_STATES_PER_AGROUP_E::GW_WAIT_BLOCKLIST_CMPL:       os <<   "WAIT_BLOCKLIST_CMPL "; break;
+        default: os << "Invalid " << (int)value << " ";
+    }
+    return os;
+}
+
+// Render a gateway availability value for logging.
+inline std::ostream& operator<<(std::ostream& os, const GW_AVAILABILITY_E value) {
+    switch (value) {
+
+        case GW_AVAILABILITY_E::GW_CREATED: os << "CREATED"; break;
+        case GW_AVAILABILITY_E::GW_AVAILABLE: os << "AVAILABLE"; break;
+        case GW_AVAILABILITY_E::GW_UNAVAILABLE: os << "UNAVAILABLE"; break;
+
+        default: os << "Invalid " << (int)value << " ";
+    }
+    return os;
+}
+
+// Dump the whole per-ANA-group state array.
+// NOTE(review): assumes SM_STATE is indexable over MAX_SUPPORTED_ANA_GROUPS
+// entries — confirm against its typedef.
+inline std::ostream& operator<<(std::ostream& os, const SM_STATE value) {
+    os << "SM_STATE [ ";
+    for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++) os << value[i];
+    os << "]";
+    return os;
+}
+
+// Log formatter for a BeaconNamespace. Takes a const reference: the
+// original pass-by-value copied the struct on every log statement.
+inline std::ostream& operator<<(std::ostream& os, const BeaconNamespace& value) {
+    os << "BeaconNamespace( anagrpid:" << value.anagrpid << ", nonce:" << value.nonce <<" )";
+    return os;
+}
+
+// Log formatter for a BeaconListener; const reference avoids copying the
+// address strings on every log statement.
+inline std::ostream& operator<<(std::ostream& os, const BeaconListener& value) {
+    os << "BeaconListener( addrfam:" << value.address_family
+                        << ", addr:" << value.address
+                        << ", svcid:" << value.svcid << " )";
+    return os;
+}
+
+// Log formatter for a BeaconSubsystem (nqn + listeners + namespaces).
+// Const reference: the original pass-by-value deep-copied both vectors.
+inline std::ostream& operator<<(std::ostream& os, const BeaconSubsystem& value) {
+    os << "BeaconSubsystem( nqn:" << value.nqn << ", listeners [ ";
+    for (const auto& list: value.listeners) os << list << " ";
+    os << "] namespaces [ ";
+    for (const auto& ns: value.namespaces) os << ns << " ";
+    os << "] )";
+    return os;
+}
+
+// Log formatter for an NqnState; const reference avoids copying the nqn
+// string and ana_state vector.
+inline std::ostream& operator<<(std::ostream& os, const NqnState& value) {
+    os << "NqnState( nqn: " << value.nqn << ", " << value.ana_state << " )";
+    return os;
+}
+
+// Log formatter for an NvmeGwState; const reference avoids copying its
+// whole subsystem map per log line (also drops the stray trailing ';').
+inline std::ostream& operator<<(std::ostream& os, const NvmeGwState& value) {
+    os <<  "NvmeGwState { group id: " << value.group_id <<  " gw_map_epoch " <<  value.gw_map_epoch
+        << " GwSubsystems: [ ";
+    for (const auto& sub: value.subsystems) os << sub.second << " ";
+    os << " ] }";
+
+    return os;
+}
+
+// Log formatter for a {pool, group} key; const reference avoids copying
+// the two strings on every log statement.
+inline std::ostream& operator<<(std::ostream& os, const NvmeGroupKey& value) {
+    os << "NvmeGroupKey {" << value.first << "," << value.second << "}";
+    return os;
+}
+
+// Log formatter for a gw_id -> NvmeGwState map; const reference avoids
+// deep-copying the entire map each time it is logged.
+inline std::ostream& operator<<(std::ostream& os, const NvmeGwMap& value) {
+    os << "NvmeGwMap ";
+    for (const auto& gw_state: value) {
+        os << "\n" << MODULE_PREFFIX <<" { == gw_id: " << gw_state.first << " -> " <<  gw_state.second << "}";
+    }
+    os << "}";
+
+    return os;
+}
+
+// Log formatter for a nonce vector; const reference avoids copying every
+// nonce string per log statement.
+inline std::ostream& operator<<(std::ostream& os, const NvmeNonceVector& value) {
+    for (const auto& nonces : value) {
+        os <<  nonces << " ";
+    }
+    return os;
+}
+
+// Log formatter for the ana-group -> nonce-vector map; const reference
+// avoids deep-copying the whole map per log statement.
+inline std::ostream& operator<<(std::ostream& os, const NvmeAnaNonceMap& value) {
+    if(value.size()) os << "\n" << MODULE_PREFFIX;
+    for (const auto& nonce_map : value) {
+        os  << "  ana_grp: " << nonce_map.first  << " [ " << nonce_map.second << "]\n"<< MODULE_PREFFIX ;
+    }
+    return os;
+}
+
+inline std::ostream& print_gw_created_t(std::ostream& os, const NvmeGwCreated value, size_t num_ana_groups, NvmeAnaGrpId *anas) {
+    os << "==Internal map ==NvmeGwCreated { ana_group_id " << value.ana_grp_id << " osd_epochs: ";
+    for(size_t i = 0; i < num_ana_groups; i ++){
+        os << " " << anas[i] <<": " << value.blocklist_data[anas[i]].osd_epoch << ":" <<value.blocklist_data[anas[i]].is_failover ;
+    }
+    os << "\n" << MODULE_PREFFIX << "nonces: " << value.nonce_map << " }";
+    os << "\n" << MODULE_PREFFIX << "saved-nonces: " << value.copied_nonce_map << " }";
+    for (size_t i = 0; i < num_ana_groups; i++) {
+        os << " " << anas[i] <<": " << value.sm_state[anas[i]] << ",";
+    }
+    os <<  "]\n"<< MODULE_PREFFIX << " failover peers ";
+    for (size_t i = 0; i < num_ana_groups; i++) {
+        os << anas[i] <<": "  << value.failover_peer[anas[i]] << ",";
+    }
+    os << "]\n"<< MODULE_PREFFIX << "availability " << value.availability << "]";
+
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const NvmeGwCreated value) {
+    os << "==Internal map ==G W_CREATED_T { ana_group_id " << value.ana_grp_id << " osd_epochs: ";
+    for(int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i ++){
+        os << " " << value.blocklist_data[i].osd_epoch;
+    }
+    os << "\n" << MODULE_PREFFIX << "nonces: " << value.nonce_map << " }";
+
+    for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++) {
+        os << value.sm_state[i] << ",";
+    }
+    os <<  "]\n"<< MODULE_PREFFIX << " failover peers ";
+    for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++) {
+        os << value.failover_peer[i] << ",";
+    }
+    os <<  "]\n"<< MODULE_PREFFIX << " beacon-subsystems ";
+    for (const auto& sub: value.subsystems) {
+        os << sub << ",";
+    }
+
+    os << "]\n"<< MODULE_PREFFIX << "availability " << value.availability << "]";
+
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const NvmeGwCreatedMap value) {
+    if(value.size()) os << "\n" << MODULE_PREFFIX;;
+    NvmeAnaGrpId anas[MAX_SUPPORTED_ANA_GROUPS];
+    int i = 0;
+    for(auto &it: value ){
+       anas[i++] = it.second.ana_grp_id; // effective ana groups for these GWs within group_pool
+    }
+    for (auto &gw_created_map : value) {
+        os  <<  "gw_id: " << gw_created_map.first  << " [ " ;//<< gw_created_map.second << "] \n"<< MODULE_PREFFIX;
+        print_gw_created_t(os, gw_created_map.second,  value.size(), anas);
+        os << "] \n"<< MODULE_PREFFIX;
+    }
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const NVMeofGwMap value) {
+    os << "NVMeofGwMap [ Created_gws: ";
+    for (auto& group_gws: value.Created_gws) {
+        os <<  "\n" <<  MODULE_PREFFIX  << "{ " << group_gws.first << " } -> { " << group_gws.second << " }";
+    }
+    os << "]";
+    return os;
+}
+
+// Wire format v1: element count, then (state, epoch) pairs, each widened
+// to u32.
+inline void encode(const ANA_STATE& st,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)st.size(), bl);
+    for (const auto& gr: st){
+        encode((uint32_t)gr.first, bl);
+        encode((uint32_t)gr.second, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+// Inverse of encode(ANA_STATE): count followed by (state, epoch) u32 pairs,
+// narrowed back to their enum/epoch types.
+inline void decode(ANA_STATE& st, ceph::buffer::list::const_iterator &bl) {
+    uint32_t n;
+    DECODE_START(1, bl);
+    decode(n, bl);
+    st.resize(n);
+    for (uint32_t i = 0; i < n; i++) {
+        uint32_t a, b;
+        decode(a, bl);
+        decode(b, bl);
+        st[i].first  = (GW_EXPORTED_STATES_PER_AGROUP_E)a;
+        st[i].second = (epoch_t)b;
+    }
+    DECODE_FINISH(bl);
+}
+
+// Wire format v1: subsystem count, then per subsystem its nqn and ANA_STATE.
+inline void encode(const GwSubsystems& subsystems,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)subsystems.size(), bl);
+    for (const auto& sub: subsystems){
+        encode(sub.second.nqn, bl);
+        encode(sub.second.ana_state, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+// Inverse of encode(GwSubsystems): rebuilds the nqn -> NqnState map from
+// the serialized (nqn, ana_state) entries.
+inline  void decode(GwSubsystems& subsystems,  ceph::bufferlist::const_iterator& bl) {
+  uint32_t num_subsystems;
+  DECODE_START(1, bl);
+  decode(num_subsystems, bl);
+  subsystems.clear();
+  for (uint32_t i=0; i<num_subsystems; i++){
+     std::string  nqn;
+     decode(nqn, bl);
+     ANA_STATE st;
+     decode(st, bl);
+     subsystems.insert({nqn, NqnState(nqn, st)});
+  }
+  DECODE_FINISH(bl);
+}
+
+// Wire format v1: group id, map epoch, then the subsystem map.
+inline void encode(const NvmeGwState& state,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode(state.group_id, bl);
+    encode(state.gw_map_epoch, bl);
+    encode (state.subsystems, bl);
+    ENCODE_FINISH(bl);
+}
+
+// Deserialize a single gateway's monitor-side state; mirror of the encoder.
+inline  void decode(NvmeGwState& state,  ceph::bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(state.group_id, bl);
+    decode(state.gw_map_epoch, bl);
+    decode(state.subsystems, bl);
+    DECODE_FINISH(bl);
+}
+
+// Serialize per-gateway timer metadata for every ANA group.  The array size
+// is encoded first so decode can assert both sides agree on
+// MAX_SUPPORTED_ANA_GROUPS.
+inline  void encode(const NvmeGwMetaData& state,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)MAX_SUPPORTED_ANA_GROUPS, bl);
+    for(int i = 0; i <MAX_SUPPORTED_ANA_GROUPS; i ++){
+        int    tick = state.data[i].timer_started;
+        uint8_t val = state.data[i].timer_value;
+        encode(tick, bl);
+        encode(val,  bl);
+        auto endtime = state.data[i].end_time;
+        // Convert the time point to milliseconds since the epoch
+        uint64_t  millisecondsSinceEpoch = std::chrono::duration_cast<std::chrono::milliseconds>(endtime.time_since_epoch()).count();
+        encode(millisecondsSinceEpoch , bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+// Deserialize per-gateway timer metadata; asserts the encoded array size
+// matches this build's MAX_SUPPORTED_ANA_GROUPS (mixed-size peers would
+// corrupt the stream otherwise).
+inline  void decode(NvmeGwMetaData& state,  ceph::bufferlist::const_iterator& bl) {
+    uint32_t s;
+
+    DECODE_START(1, bl);
+    decode(s, bl);
+    ceph_assert(s == (uint32_t)MAX_SUPPORTED_ANA_GROUPS);
+    for(int i = 0; i <MAX_SUPPORTED_ANA_GROUPS; i ++){
+        int tick;
+        uint8_t val;
+        decode(tick, bl);
+        decode(val,  bl);
+        state.data[i].timer_started = tick;
+        state.data[i].timer_value = val;
+        uint64_t milliseconds;
+        decode(milliseconds, bl);
+        auto duration = std::chrono::milliseconds(milliseconds);
+        // Rebuild the time point from milliseconds-since-epoch (lossy below 1ms).
+        state.data[i].end_time = std::chrono::time_point<std::chrono::system_clock>(duration);
+    }
+    DECODE_FINISH(bl);
+}
+
+// Serialize the ANA-group -> nonce-vector map: map size, then for each
+// entry the group id followed by its nonce strings.
+inline void encode(const NvmeAnaNonceMap& nonce_map,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)nonce_map.size(), bl);
+    for (auto& ana_group_nonces : nonce_map) {
+        encode(ana_group_nonces.first, bl); // ana group id
+        encode ((uint32_t)ana_group_nonces.second.size(), bl); // encode the vector size
+        for (auto& nonce: ana_group_nonces.second) encode(nonce, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(NvmeAnaNonceMap& nonce_map, ceph::buffer::list::const_iterator &bl) {
+    uint32_t map_size;
+    NvmeAnaGrpId ana_grp_id;
+    uint32_t vector_size;
+    std::string nonce;
+    DECODE_START(1, bl);
+    decode(map_size, bl);
+    for(uint32_t i = 0; i<map_size; i++){
+        decode(ana_grp_id, bl);
+        decode(vector_size,bl);
+        for(uint32_t j = 0; j < vector_size; j++){
+            decode (nonce, bl);
+            nonce_map[ana_grp_id].push_back(nonce);
+        }
+    }
+    DECODE_FINISH(bl);
+}
+
+// Serialize every created gateway in a group: gw id, owned ana group, the
+// full per-group state-machine arrays, availability, epoch-valid flag,
+// beacon subsystems, blocklist data and both nonce maps.
+inline void encode(const NvmeGwCreatedMap& gws,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)gws.size(), bl); // number of gws in the group
+    for(auto& gw : gws){
+        encode(gw.first, bl);// GW_id
+        encode(gw.second.ana_grp_id, bl); // GW owns this group-id
+
+        for(int i = 0; i <MAX_SUPPORTED_ANA_GROUPS; i ++){
+            encode((uint32_t)(gw.second.sm_state[i]), bl);
+        }
+        for(int i = 0; i <MAX_SUPPORTED_ANA_GROUPS; i ++){
+            encode((gw.second.failover_peer[i]), bl);
+        }
+        encode((uint32_t)gw.second.availability, bl);
+        encode((uint32_t)gw.second.last_gw_map_epoch_valid, bl);
+        encode(gw.second.subsystems, bl);
+
+        for(int i=0; i< MAX_SUPPORTED_ANA_GROUPS; i++){
+            encode(gw.second.blocklist_data[i].osd_epoch, bl);
+            encode(gw.second.blocklist_data[i].is_failover, bl);
+        }
+        encode(gw.second.nonce_map, bl);
+        encode(gw.second.copied_nonce_map, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+// Deserialize a NvmeGwCreatedMap; field order must match the encoder above
+// exactly.  Clears the destination before filling it.
+inline void decode(NvmeGwCreatedMap& gws, ceph::buffer::list::const_iterator &bl) {
+    gws.clear();
+    uint32_t num_created_gws;
+    DECODE_START(1, bl);
+    decode(num_created_gws, bl);
+
+    for(uint32_t i = 0; i<num_created_gws; i++){
+        std::string gw_name;
+        decode(gw_name, bl);
+        NvmeAnaGrpId ana_grp_id;
+        decode(ana_grp_id, bl);
+
+        // NB: the inner loops shadow the outer loop variable `i`.
+        NvmeGwCreated gw_created(ana_grp_id);
+        uint32_t sm_state;
+        NvmeGwId peer_name;
+        for(int i = 0; i <MAX_SUPPORTED_ANA_GROUPS; i ++){
+            decode(sm_state, bl);
+            gw_created.sm_state[i] = (GW_STATES_PER_AGROUP_E)  sm_state;
+        }
+        for(int i = 0; i <MAX_SUPPORTED_ANA_GROUPS; i ++){
+            decode(peer_name, bl);
+            gw_created.failover_peer[i] = peer_name;
+        }
+        uint32_t avail;
+        decode(avail, bl);
+        gw_created.availability = (GW_AVAILABILITY_E)avail;
+        uint32_t gwmap_epoch;
+        decode(gwmap_epoch, bl);
+        gw_created.last_gw_map_epoch_valid = (bool)gwmap_epoch;
+        BeaconSubsystems   subsystems;
+        decode(subsystems, bl);
+        gw_created.subsystems = subsystems;
+
+        for(int i=0; i< MAX_SUPPORTED_ANA_GROUPS; i++){
+            decode(gw_created.blocklist_data[i].osd_epoch, bl);
+            decode(gw_created.blocklist_data[i].is_failover, bl);
+        }
+        decode(gw_created.nonce_map, bl);
+        decode(gw_created.copied_nonce_map, bl);
+
+        gws[gw_name] = gw_created;
+    }
+    DECODE_FINISH(bl);
+}
+
+// Serialize the full created-gateways map keyed by (pool, group).
+inline void encode(const std::map<NvmeGroupKey, NvmeGwCreatedMap>& created_gws,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)created_gws.size(), bl); // number of groups
+    for (auto& group_gws: created_gws) {
+        auto& group_key = group_gws.first;
+        encode(group_key.first, bl); // pool
+        encode(group_key.second, bl); // group
+
+        auto& gws = group_gws.second;
+        encode (gws, bl); // encode group gws
+    }
+    ENCODE_FINISH(bl);
+}
+
+// Deserialize the full created-gateways map; mirror of the encoder above.
+inline void decode(std::map<NvmeGroupKey, NvmeGwCreatedMap>& created_gws, ceph::buffer::list::const_iterator &bl) {
+    created_gws.clear();
+    uint32_t ngroups;
+    DECODE_START(1, bl);
+    decode(ngroups, bl);
+    for(uint32_t i = 0; i<ngroups; i++){
+        std::string pool, group;
+        decode(pool, bl);
+        decode(group, bl);
+        NvmeGwCreatedMap cmap;
+        decode(cmap, bl);
+        created_gws[std::make_pair(pool, group)] = cmap;
+    }
+    DECODE_FINISH(bl);
+}
+
+// Serialize a gw-id -> NvmeGwState map (count, then each id/state pair).
+inline void encode(const NvmeGwMap& subsyst_gwmap,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)subsyst_gwmap.size(), bl);
+    for (auto& subsyst: subsyst_gwmap) {
+        encode(subsyst.first, bl);
+        encode(subsyst.second, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+// Deserialize a gw-id -> NvmeGwState map; clears the destination first.
+inline void decode(NvmeGwMap& subsyst_gwmap, ceph::buffer::list::const_iterator &bl) {
+    subsyst_gwmap.clear();
+    uint32_t num_gws;
+    DECODE_START(1, bl);
+    decode(num_gws, bl);
+
+    for (uint32_t i = 0; i < num_gws; i++) {
+        NvmeGwId gw_id;
+        decode(gw_id, bl);
+        NvmeGwState gw_st;
+        decode(gw_st, bl);
+        subsyst_gwmap[gw_id] = gw_st;
+    }
+    DECODE_FINISH(bl);
+}
+
+// Start encode  NvmeGroupKey, GMAP
+// Serialize the per-(pool, group) gateway-state map (the GMAP).
+inline void encode(const std::map<NvmeGroupKey, NvmeGwMap>& gmap,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)gmap.size(), bl); // number of groups
+    for (auto& group_state: gmap) {
+        auto& group_key = group_state.first;
+        encode(group_key.first, bl); // pool
+        encode(group_key.second, bl); // group
+        encode(group_state.second, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+// Start decode NvmeGroupKey, NvmeGwMap
+// Deserialize the per-(pool, group) gateway-state map; mirror of the encoder.
+inline void decode(std::map<NvmeGroupKey, NvmeGwMap>& gmap, ceph::buffer::list::const_iterator &bl) {
+    gmap.clear();
+    uint32_t ngroups;
+    DECODE_START(1, bl);
+    decode(ngroups, bl);
+    for(uint32_t i = 0; i<ngroups; i++){
+        std::string pool, group;
+        decode(pool, bl);
+        decode(group, bl);
+        NvmeGwMap grp_map;
+        decode(grp_map, bl);
+        gmap[std::make_pair(pool, group)] = grp_map;
+    }
+    DECODE_FINISH(bl);
+}
+
+// Serialize the per-(pool, group) gateway metadata map.
+inline void encode(const std::map<NvmeGroupKey, NvmeGwMetaDataMap>& gmetadata,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)gmetadata.size(), bl); // number of groups
+    for (auto& group_md: gmetadata) {
+        auto& group_key = group_md.first;
+        encode(group_key.first, bl); // pool
+        encode(group_key.second, bl); // group
+
+        encode(group_md.second, bl);
+    }
+    ENCODE_FINISH(bl);
+}
+
+// Deserialize the per-(pool, group) gateway metadata map; mirror of encoder.
+inline void decode(std::map<NvmeGroupKey, NvmeGwMetaDataMap>& gmetadata, ceph::buffer::list::const_iterator &bl) {
+    gmetadata.clear();
+    uint32_t ngroups;
+    DECODE_START(1, bl);
+    decode(ngroups, bl);
+    for(uint32_t i = 0; i<ngroups; i++){
+        std::string pool, group;
+        decode(pool, bl);
+        decode(group, bl);
+        NvmeGwMetaDataMap gmd;
+        decode(gmd, bl);
+        gmetadata[std::make_pair(pool, group)] = gmd;
+    }
+    DECODE_FINISH(bl);
+}
+
+// Serialize a gw-id -> NvmeGwMetaData map for one group.
+inline void encode(const NvmeGwMetaDataMap& group_md,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode ((uint32_t)group_md.size(), bl); // number of gws in this group
+    for (auto& gw_md: group_md) {
+        encode(gw_md.first, bl); // gw
+        encode(gw_md.second, bl); //  map of this gw
+    }
+    ENCODE_FINISH(bl);
+}
+
+inline void decode(NvmeGwMetaDataMap& md, ceph::buffer::list::const_iterator &bl) {
+    uint32_t num_gws;
+    DECODE_START(1, bl);
+    decode(num_gws, bl);
+    for (uint32_t i = 0; i < num_gws; i++) {
+        std::string gw_id;
+        decode(gw_id, bl);
+        NvmeGwMetaData gw_meta;
+        decode(gw_meta, bl);
+        md[gw_id] = gw_meta;
+    }
+    DECODE_FINISH(bl);
+}
+
+// Serialize a beacon namespace entry (ana group id + nonce).
+inline void encode(const BeaconNamespace& ns,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode(ns.anagrpid, bl);
+    encode(ns.nonce, bl);
+    ENCODE_FINISH(bl);
+}
+
+// Deserialize a beacon namespace entry; mirror of the encoder above.
+inline void decode(BeaconNamespace& ns, ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(1, bl);
+    decode(ns.anagrpid, bl);
+    decode(ns.nonce, bl);
+    DECODE_FINISH(bl);
+}
+
+// Serialize a beacon listener entry (address family, address, service id).
+inline void encode(const BeaconListener& ls,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode(ls.address_family, bl);
+    encode(ls.address, bl);
+    encode(ls.svcid, bl);
+    ENCODE_FINISH(bl);
+}
+
+// Deserialize a beacon listener entry; mirror of the encoder above.
+inline void decode(BeaconListener& ls, ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(1, bl);
+    decode(ls.address_family, bl);
+    decode(ls.address, bl);
+    decode(ls.svcid, bl);
+    DECODE_FINISH(bl);
+}
+
+// Serialize a beacon subsystem: nqn, then its listeners, then its namespaces,
+// each prefixed with an element count.
+inline void encode(const BeaconSubsystem& sub,  ceph::bufferlist &bl) {
+    ENCODE_START(1, 1, bl);
+    encode(sub.nqn, bl);
+    encode((uint32_t)sub.listeners.size(), bl);
+    for (const auto& ls: sub.listeners)
+        encode(ls, bl);
+    encode((uint32_t)sub.namespaces.size(), bl);
+    for (const auto& ns: sub.namespaces)
+        encode(ns, bl);
+    ENCODE_FINISH(bl);
+}
+
+// Deserialize a beacon subsystem; mirror of the encoder above.  The count
+// variable `s` is reused for both the listener and namespace lists.
+inline void decode(BeaconSubsystem& sub, ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(1, bl);
+    decode(sub.nqn, bl);
+    uint32_t s;
+    sub.listeners.clear();
+    decode(s, bl);
+    for (uint32_t i = 0; i < s; i++) {
+        BeaconListener ls;
+        decode(ls, bl);
+        sub.listeners.push_back(ls);
+    }
+
+    sub.namespaces.clear();
+    decode(s, bl);
+    for (uint32_t i = 0; i < s; i++) {
+        BeaconNamespace ns;
+        decode(ns, bl);
+        sub.namespaces.push_back(ns);
+    }
+    DECODE_FINISH(bl);
+}
+
+
+#undef dout_subsys
+#endif /* SRC_MON_NVMEOFGWSERIALIZEP_H_ */
diff --git a/src/mon/NVMeofGwTypes.h b/src/mon/NVMeofGwTypes.h
new file mode 100755 (executable)
index 0000000..5eb1a3b
--- /dev/null
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef MON_NVMEOFGWTYPES_H_
+#define MON_NVMEOFGWTYPES_H_
+#include <string>
+#include <iomanip>
+#include <map>
+#include <iostream>
+
+using NvmeGwId      = std::string;
+using NvmeGroupKey  = std::pair<std::string, std::string>;
+using NvmeNqnId     = std::string;
+using NvmeAnaGrpId  = uint32_t;
+
+
+// Monitor-internal state machine states a gateway can be in, per ANA group.
+enum class GW_STATES_PER_AGROUP_E {
+    GW_IDLE_STATE = 0, //invalid state
+    GW_STANDBY_STATE,                    // not serving this ANA group
+    GW_ACTIVE_STATE,                     // serving (optimized path for) this ANA group
+    GW_OWNER_WAIT_FAILBACK_PREPARED,     // group owner waiting for failback preparation
+    GW_WAIT_FAILBACK_PREPARED,           // peer waiting for failback preparation
+    GW_WAIT_BLOCKLIST_CMPL               // waiting for OSD blocklist completion
+};
+
+// ANA group states as exported to the gateway/initiator side.
+enum class GW_EXPORTED_STATES_PER_AGROUP_E {
+    GW_EXPORTED_OPTIMIZED_STATE = 0,     // ANA optimized (active path)
+    GW_EXPORTED_INACCESSIBLE_STATE       // ANA inaccessible
+};
+
+// Gateway availability as tracked by the monitor from beacons.
+enum class GW_AVAILABILITY_E {
+    GW_CREATED = 0,      // created via "nvme-gw create", no beacon seen yet
+    GW_AVAILABLE,        // beacons arriving
+    GW_UNAVAILABLE,      // beacons stopped
+    GW_DELETED           // removed via "nvme-gw delete"
+};
+
+#define MAX_SUPPORTED_ANA_GROUPS 16
+#define REDUNDANT_GW_ANA_GROUP_ID 0xFF
+
+typedef GW_STATES_PER_AGROUP_E          SM_STATE         [MAX_SUPPORTED_ANA_GROUPS];
+
+using ANA_STATE = std::vector<std::pair<GW_EXPORTED_STATES_PER_AGROUP_E, epoch_t>>;
+
+// A namespace reported by a gateway in its beacon.
+struct BeaconNamespace {
+    NvmeAnaGrpId anagrpid;   // ANA group the namespace belongs to
+    std::string  nonce;      // nonce associated with this namespace
+};
+
+// A listen address reported by a gateway subsystem in its beacon.
+struct BeaconListener {
+    std::string address_family; // IPv4 or IPv6
+    std::string address;        //
+    std::string svcid;          // port
+};
+
+// A subsystem reported by a gateway beacon: its NQN plus the listeners and
+// namespaces it exports.
+struct BeaconSubsystem {
+    NvmeNqnId nqn;
+    std::list<BeaconListener>  listeners;
+    std::list<BeaconNamespace> namespaces;
+};
+
+using BeaconSubsystems = std::list<BeaconSubsystem>;
+
+using NvmeNonceVector    = std::vector<std::string>;
+using NvmeAnaNonceMap  = std::map <NvmeAnaGrpId, NvmeNonceVector>;
+
+struct NvmeGwCreated {
+    NvmeAnaGrpId       ana_grp_id;                    // ana-group-id allocated for this GW, GW owns this group-id
+    GW_AVAILABILITY_E  availability;                  // in absence of  beacon  heartbeat messages it becomes inavailable
+    bool               last_gw_map_epoch_valid;       // "true" if the last epoch seen by the gw-client is up-to-date
+    BeaconSubsystems   subsystems;                    // gateway susbsystem and their state machine states
+    NvmeAnaNonceMap    nonce_map;
+    NvmeAnaNonceMap    copied_nonce_map;
+    SM_STATE           sm_state;                      // state machine states per ANA group
+    NvmeGwId           failover_peer[MAX_SUPPORTED_ANA_GROUPS];
+    struct{
+       epoch_t     osd_epoch;
+       bool        is_failover;
+    }blocklist_data[MAX_SUPPORTED_ANA_GROUPS];
+
+    NvmeGwCreated(): ana_grp_id(REDUNDANT_GW_ANA_GROUP_ID) {};
+
+    NvmeGwCreated(NvmeAnaGrpId id): ana_grp_id(id), availability(GW_AVAILABILITY_E::GW_CREATED), last_gw_map_epoch_valid(false)
+    {
+        for (int i = 0; i < MAX_SUPPORTED_ANA_GROUPS; i++){
+            sm_state[i] = GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE;
+            failover_peer[i]  = "";
+            blocklist_data[i].osd_epoch = 0;
+            blocklist_data[i].is_failover = true;
+        }
+    };
+
+    void standby_state(NvmeAnaGrpId grpid) {
+           sm_state[grpid]       = GW_STATES_PER_AGROUP_E::GW_STANDBY_STATE;
+           failover_peer[grpid]  = "";
+    };
+    void active_state(NvmeAnaGrpId grpid) {
+           sm_state[grpid]       = GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE;
+           blocklist_data[grpid].osd_epoch = 0;
+    };
+};
+
+// Per-subsystem ANA state as published to a gateway client.
+struct NqnState {
+    std::string   nqn;          // subsystem NQN
+    ANA_STATE     ana_state;    // subsystem's ANA state
+
+    // constructors
+    NqnState(const std::string& _nqn, const ANA_STATE& _ana_state):
+        nqn(_nqn), ana_state(_ana_state)  {}
+    // Build the exported ANA state from the monitor's internal state machine:
+    // ACTIVE / WAIT_BLOCKLIST_CMPL map to OPTIMIZED, everything else to
+    // INACCESSIBLE; each entry also carries the group's blocklist osd epoch.
+    NqnState(const std::string& _nqn, const SM_STATE& sm_state, const NvmeGwCreated & gw_created) : nqn(_nqn)  {
+        for (int i=0; i < MAX_SUPPORTED_ANA_GROUPS; i++){
+            std::pair<GW_EXPORTED_STATES_PER_AGROUP_E, epoch_t> state_pair;
+            state_pair.first = (  sm_state[i] == GW_STATES_PER_AGROUP_E::GW_ACTIVE_STATE
+                              || sm_state[i] == GW_STATES_PER_AGROUP_E::GW_WAIT_BLOCKLIST_CMPL)
+                           ? GW_EXPORTED_STATES_PER_AGROUP_E::GW_EXPORTED_OPTIMIZED_STATE
+                           : GW_EXPORTED_STATES_PER_AGROUP_E::GW_EXPORTED_INACCESSIBLE_STATE;
+            state_pair.second = gw_created.blocklist_data[i].osd_epoch;
+            ana_state.push_back(state_pair);
+        }
+    }
+};
+
+typedef std::map<NvmeNqnId, NqnState> GwSubsystems;
+
+// The state of one gateway as distributed to gw-clients in the map:
+// its assigned group id, the map epoch it was built from, and the ANA
+// states of all its subsystems.
+struct NvmeGwState {
+    NvmeAnaGrpId              group_id;
+    epoch_t                   gw_map_epoch;
+    GwSubsystems              subsystems;
+
+    NvmeGwState(NvmeAnaGrpId id, epoch_t epoch):
+        group_id(id),
+        gw_map_epoch(epoch)
+    {};
+
+    // Default: invalid group id, epoch 0 (used when decoding).
+    NvmeGwState() : NvmeGwState(REDUNDANT_GW_ANA_GROUP_ID, 0) {};
+};
+
+// Per-gateway timer bookkeeping used by the monitor's state machine,
+// one slot per ANA group.
+struct NvmeGwMetaData {
+   struct{
+      uint32_t     timer_started; // statemachine timer(timestamp) set in some state
+      uint8_t      timer_value;
+      std::chrono::system_clock::time_point end_time;  // timer expiry
+   } data[MAX_SUPPORTED_ANA_GROUPS];
+
+    // Zero all timers; end_time is left default-constructed (clock epoch).
+    NvmeGwMetaData() {
+        for (int i=0; i<MAX_SUPPORTED_ANA_GROUPS; i++){
+            data[i].timer_started = 0;
+            data[i].timer_value = 0;
+        }
+    };
+};
+
+using NvmeGwMap             = std::map<NvmeGwId, NvmeGwState>;
+using NvmeGwMetaDataMap     = std::map<NvmeGwId, NvmeGwMetaData>;
+using NvmeGwCreatedMap      = std::map<NvmeGwId, NvmeGwCreated>;
+
+#endif /* SRC_MON_NVMEOFGWTYPES_H_ */
index 02aad5a0b870b4bef7eae32563eba536c3de848e..8d774fc54d44bec91d5d04ded4443519b4ed42d5 100644 (file)
@@ -36,6 +36,7 @@ enum {
   PAXOS_HEALTH,
   PAXOS_CONFIG,
   PAXOS_KV,
+  PAXOS_NVMEGW,
   PAXOS_NUM
 };
 
index 348546abdf01102a9bdf09ed79af323ecf64017f..024f167ae72cbfcb8a4c5b7c0a014ee51010088e 100644 (file)
 #include "messages/MOSDPGUpdateLogMissing.h"
 #include "messages/MOSDPGUpdateLogMissingReply.h"
 
+#include "messages/MNVMeofGwBeacon.h"
+#include "messages/MNVMeofGwMap.h"
+
 #ifdef WITH_BLKIN
 #include "Messenger.h"
 #endif
@@ -866,6 +869,10 @@ Message *decode_message(CephContext *cct,
     m = make_message<MMgrBeacon>();
     break;
 
+  case MSG_MNVMEOF_GW_BEACON:
+    m = make_message<MNVMeofGwBeacon>();
+  break;
+
   case MSG_MON_MGR_REPORT:
     m = make_message<MMonMgrReport>();
     break;
@@ -925,6 +932,9 @@ Message *decode_message(CephContext *cct,
     m = make_message<MMonHealthChecks>();
     break;
 
+  case MSG_MNVMEOF_GW_MAP:
+    m = make_message<MNVMeofGwMap>();
+    break;
     // -- simple messages without payload --
 
   case CEPH_MSG_SHUTDOWN:
index f27c5448ea2daa5fce762bae87374704edab799b..f62e27ff4f3037396c9c34772a3fac4c1358ad5d 100644 (file)
 #define MSG_MGR_MAP               0x704
 
 // *** ceph-mon(MgrMonitor) -> ceph-mgr
-#define MSG_MGR_DIGEST               0x705
+#define MSG_MGR_DIGEST            0x705
 // *** cephmgr -> ceph-mon
 #define MSG_MON_MGR_REPORT        0x706
 #define MSG_SERVICE_MAP           0x707
 #define MSG_MGR_COMMAND_REPLY     0x70a
 
 // *** ceph-mgr <-> MON daemons ***
-#define MSG_MGR_UPDATE     0x70b
+#define MSG_MGR_UPDATE            0x70b
+
+// *** nvmeof mon -> gw daemons ***
+#define MSG_MNVMEOF_GW_MAP        0x800
+
+// *** gw daemons -> nvmeof mon  ***
+#define MSG_MNVMEOF_GW_BEACON     0x801
 
 // ======================================================
 
diff --git a/src/nvmeof/NVMeofGwClient.cc b/src/nvmeof/NVMeofGwClient.cc
new file mode 100644 (file)
index 0000000..c82423d
--- /dev/null
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include "NVMeofGwClient.h"
+
+// Query the local gateway over gRPC for its subsystem list.  `reply` is
+// filled by the stub; returns true iff the RPC itself completed OK.
+bool NVMeofGwClient::get_subsystems(subsystems_info& reply) {
+  ClientContext context;
+  get_subsystems_req request;
+
+  const Status rpc_status = stub_->get_subsystems(&context, request, &reply);
+  return rpc_status.ok();
+}
+
+// Push the desired ANA states to the local gateway.  Succeeds only when the
+// RPC completed OK *and* the gateway itself reported a good status.
+bool NVMeofGwClient::set_ana_state(const ana_info& info) {
+  ClientContext context;
+  req_status reply;
+
+  const Status rpc_status = stub_->set_ana_state(&context, info, &reply);
+  return rpc_status.ok() && reply.status();
+}
diff --git a/src/nvmeof/NVMeofGwClient.h b/src/nvmeof/NVMeofGwClient.h
new file mode 100644 (file)
index 0000000..0224852
--- /dev/null
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#ifndef  __NVMEOFGWCLIENT_H__
+#define  __NVMEOFGWCLIENT_H__
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include <grpcpp/grpcpp.h>
+
+#include "gateway.grpc.pb.h"
+
+using grpc::Channel;
+using grpc::ClientContext;
+using grpc::Status;
+
+// Thin synchronous gRPC client for talking to the local nvmeof gateway
+// (get_subsystems / set_ana_state) over the given channel.
+class NVMeofGwClient {
+ public:
+  NVMeofGwClient(std::shared_ptr<Channel> channel)
+      : stub_(Gateway::NewStub(channel)) {}
+
+  // Fetch the gateway's subsystems; true on RPC success.
+  bool get_subsystems(subsystems_info& reply);
+  // Apply ANA state changes; true when RPC and gateway both report success.
+  bool set_ana_state(const ana_info& info);
+
+ private:
+  std::unique_ptr<Gateway::Stub> stub_;
+};
+#endif
diff --git a/src/nvmeof/NVMeofGwMonitorClient.cc b/src/nvmeof/NVMeofGwMonitorClient.cc
new file mode 100644 (file)
index 0000000..28330e4
--- /dev/null
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023,2024 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include <boost/algorithm/string/replace.hpp>
+
+#include "common/errno.h"
+#include "common/signal.h"
+#include "common/ceph_argparse.h"
+#include "include/compat.h"
+
+#include "include/stringify.h"
+#include "global/global_context.h"
+#include "global/signal_handler.h"
+
+
+#include "messages/MNVMeofGwBeacon.h"
+#include "messages/MNVMeofGwMap.h"
+#include "NVMeofGwMonitorClient.h"
+#include "NVMeofGwClient.h"
+#include "NVMeofGwMonitorGroupClient.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix *_dout << "nvmeofgw " << __PRETTY_FUNCTION__ << " "
+
+// Construct the monitor-client agent: wires up messenger, monclient,
+// objecter and timer; argv is parsed later in init().
+NVMeofGwMonitorClient::NVMeofGwMonitorClient(int argc, const char **argv) :
+  Dispatcher(g_ceph_context),
+  osdmap_epoch(0),
+  gwmap_epoch(0),
+  monc{g_ceph_context, poolctx},
+  client_messenger(Messenger::create(g_ceph_context, "async", entity_name_t::CLIENT(-1), "client", getpid())),
+  objecter{g_ceph_context, client_messenger.get(), &monc, poolctx},
+  client{client_messenger.get(), &monc, &objecter},
+  timer(g_ceph_context, lock),
+  orig_argc(argc),
+  orig_argv(argv)
+{
+}
+
+// Defaulted out of line; members clean up via their own destructors (RAII).
+NVMeofGwMonitorClient::~NVMeofGwMonitorClient() = default;
+
+// md_config_obs_t interface: no config keys are tracked yet, so return a
+// NULL-terminated empty list.
+const char** NVMeofGwMonitorClient::get_tracked_conf_keys() const
+{
+  static const char* KEYS[] = {
+    NULL
+  };
+  return KEYS;
+}
+
+// Start the agent: parse command-line identity/addresses, bring up the
+// messenger, mon client and objecter (in that order), then kick off the
+// periodic beacon tick.  Returns 0 on success, negative errno on failure.
+// The startup sequence is order-sensitive; later steps assume earlier ones.
+int NVMeofGwMonitorClient::init()
+{
+  dout(0) << dendl;
+  std::string val;
+  auto args = argv_to_vec(orig_argc, orig_argv);
+
+  // Identity and endpoints of this gateway instance.
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    } else if (ceph_argparse_witharg(args, i, &val, "--gateway-name", (char*)NULL)) {
+      name = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--gateway-pool", (char*)NULL)) {
+      pool = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--gateway-group", (char*)NULL)) {
+      group = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--gateway-address", (char*)NULL)) {
+      gateway_address = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--monitor-group-address", (char*)NULL)) {
+      monitor_address = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--server-key", (char*)NULL)) {
+      server_key = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--server-cert", (char*)NULL)) {
+      server_cert = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--client-cert", (char*)NULL)) {
+      client_cert = val;
+    } else {
+      ++i;
+    }
+  }
+
+  dout(0) << "gateway name: " << name <<
+    " pool:" << pool <<
+    " group:" << group <<
+    " address: " << gateway_address << dendl;
+  ceph_assert(name != "" && pool != "" && gateway_address != "" && monitor_address != "");
+
+  // todo: mTLS support not implemented yet — reject cert/key options.
+  ceph_assert(server_key == "" && server_cert == "" && client_cert == "");
+
+  init_async_signal_handler();
+  register_async_signal_handler(SIGHUP, sighup_handler);
+
+  std::lock_guard l(lock);
+
+  // Initialize Messenger
+  client_messenger->add_dispatcher_tail(this);
+  client_messenger->add_dispatcher_head(&objecter);
+  client_messenger->add_dispatcher_tail(&client);
+  client_messenger->start();
+
+  poolctx.start(2);
+
+  // Initialize MonClient
+  if (monc.build_initial_monmap() < 0) {
+    client_messenger->shutdown();
+    client_messenger->wait();
+    return -1;
+  }
+
+  // Subscribe to NVMeofGw map updates from the monitor.
+  monc.sub_want("NVMeofGw", 0, 0);
+  monc.set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD
+      |CEPH_ENTITY_TYPE_MDS|CEPH_ENTITY_TYPE_MGR);
+  monc.set_messenger(client_messenger.get());
+
+  // We must register our config callback before calling init(), so
+  // that we see the initial configuration message
+  monc.register_config_callback([this](const std::string &k, const std::string &v){
+      // leaving this for debugging purposes
+      dout(10) << "nvmeof config_callback: " << k << " : " << v << dendl;
+
+      return false;
+    });
+  monc.register_config_notify_callback([this]() {
+      dout(4) << "nvmeof monc config notify callback" << dendl;
+    });
+  dout(4) << "nvmeof Registered monc callback" << dendl;
+
+  int r = monc.init();
+  if (r < 0) {
+    monc.shutdown();
+    client_messenger->shutdown();
+    client_messenger->wait();
+    return r;
+  }
+  dout(0) << "nvmeof Registered monc callback" << dendl;
+
+  r = monc.authenticate();
+  if (r < 0) {
+    derr << "Authentication failed, did you specify an ID with a valid keyring?" << dendl;
+    monc.shutdown();
+    client_messenger->shutdown();
+    client_messenger->wait();
+    return r;
+  }
+  dout(0) << "monc.authentication done" << dendl;
+  monc.set_passthrough_monmap();
+
+  client_t whoami = monc.get_global_id();
+  client_messenger->set_myname(entity_name_t::MGR(whoami.v));
+  objecter.set_client_incarnation(0);
+  objecter.init();
+  objecter.enable_blocklist_events();
+  objecter.start();
+  client.init();
+  timer.init();
+
+  // First beacon + schedule the periodic tick.
+  tick();
+
+  dout(0) << "Complete." << dendl;
+  return 0;
+}
+
+static bool get_gw_state(const char* desc, const std::map<NvmeGroupKey, NvmeGwMap>& m, const NvmeGroupKey& group_key, const NvmeGwId& gw_id, NvmeGwState& out)
+{
+  auto gw_group = m.find(group_key);
+  if (gw_group == m.end()) {
+    dout(0) << "can not find group (" << group_key.first << "," << group_key.second << ") "  << desc << " map: " << m << dendl;
+    return false;
+  }
+  auto gw_state = gw_group->second.find(gw_id);
+  if (gw_state == gw_group->second.end()) {
+    dout(0) << "can not find gw id: " << gw_id << " in " << desc << "group: " << gw_group->second  << dendl;
+    return false;
+  }
+  out = gw_state->second;
+  return true;
+}
+
+// Build and send one beacon to the monitor.  Queries the local gateway over
+// gRPC for its current subsystems; availability is GW_CREATED until this
+// gateway appears in a received map, then AVAILABLE/UNAVAILABLE depending on
+// whether the gRPC query succeeded.  Caller must hold `lock`.
+void NVMeofGwMonitorClient::send_beacon()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+  //dout(0) << "sending beacon as gid " << monc.get_global_id() << dendl;
+  GW_AVAILABILITY_E gw_availability = GW_AVAILABILITY_E::GW_CREATED;
+  BeaconSubsystems subs;
+  NVMeofGwClient gw_client(
+     grpc::CreateChannel(gateway_address, grpc::InsecureChannelCredentials()));
+  subsystems_info gw_subsystems;
+  bool ok = gw_client.get_subsystems(gw_subsystems);
+  if (ok) {
+    // Convert the protobuf reply into BeaconSubsystems for the MON message.
+    for (int i = 0; i < gw_subsystems.subsystems_size(); i++) {
+      const subsystem& sub = gw_subsystems.subsystems(i);
+      BeaconSubsystem bsub;
+      bsub.nqn = sub.nqn();
+      for (int j = 0; j < sub.namespaces_size(); j++) {
+        const auto& ns = sub.namespaces(j);
+        BeaconNamespace bns = {ns.anagrpid(), ns.nonce()};
+        bsub.namespaces.push_back(bns);
+      }
+      for (int k = 0; k < sub.listen_addresses_size(); k++) {
+        const auto& ls = sub.listen_addresses(k);
+        BeaconListener bls = { ls.adrfam(), ls.traddr(), ls.trsvcid() };
+        bsub.listeners.push_back(bls);
+      }
+      subs.push_back(bsub);
+    }
+  }
+
+  auto group_key = std::make_pair(pool, group);
+  NvmeGwState old_gw_state;
+  // if already got gateway state in the map
+  if (get_gw_state("old map", map, group_key, name, old_gw_state))
+    gw_availability = ok ? GW_AVAILABILITY_E::GW_AVAILABLE : GW_AVAILABILITY_E::GW_UNAVAILABLE;
+  dout(0) << "sending beacon as gid " << monc.get_global_id() << " availability " << (int)gw_availability <<
+    " osdmap_epoch " << osdmap_epoch << " gwmap_epoch " << gwmap_epoch << dendl;
+  auto m = ceph::make_message<MNVMeofGwBeacon>(
+      name,
+      pool,
+      group,
+      subs,
+      gw_availability,
+      osdmap_epoch,
+      gwmap_epoch);
+  monc.send_mon_message(std::move(m));
+}
+
+// Periodic heartbeat: send a beacon now and re-arm the timer with the
+// configured tick period (nvmeof_mon_client_tick_period).
+void NVMeofGwMonitorClient::tick()
+{
+  dout(0) << dendl;
+  send_beacon();
+
+  timer.add_event_after(
+      g_conf().get_val<std::chrono::seconds>("nvmeof_mon_client_tick_period").count(),
+      new LambdaContext([this](int r){
+          tick();
+      }
+  ));
+}
+
+// Orderly teardown.  Shutdown order is significant: stop the beacon timer
+// first, then consumers (client, asio pool), then monc/objecter, and the
+// messenger last since everything else uses it.
+void NVMeofGwMonitorClient::shutdown()
+{
+  std::lock_guard l(lock);
+
+  dout(4) << "nvmeof Shutting down" << dendl;
+
+
+  // stop sending beacon first, I use monc to talk with monitors
+  timer.shutdown();
+  // client uses monc and objecter
+  client.shutdown();
+  // Stop asio threads, so leftover events won't call into shut down
+  // monclient/objecter.
+  poolctx.finish();
+  // stop monc
+  monc.shutdown();
+
+  // objecter is used by monc
+  objecter.shutdown();
+  // client_messenger is used by all of them, so stop it in the end
+  client_messenger->shutdown();
+}
+
+// Process a map update from the monitor: on first sighting of this gateway,
+// push its assigned group id to the gateway (retrying until it sticks);
+// then diff the old vs. new per-subsystem ANA states and push only the
+// changed ones to the gateway via gRPC, tracking the highest blocklist osd
+// epoch so subsequent beacons can acknowledge it.
+void NVMeofGwMonitorClient::handle_nvmeof_gw_map(ceph::ref_t<MNVMeofGwMap> nmap)
+{
+  auto &new_map = nmap->get_map();
+  gwmap_epoch = nmap->get_gwmap_epoch();
+  auto group_key = std::make_pair(pool, group);
+  dout(0) << "handle nvmeof gw map: " << new_map << dendl;
+
+  NvmeGwState old_gw_state;
+  auto got_old_gw_state = get_gw_state("old map", map, group_key, name, old_gw_state);
+  NvmeGwState new_gw_state;
+  auto got_new_gw_state = get_gw_state("new map", new_map, group_key, name, new_gw_state);
+
+  if (!got_old_gw_state) {
+    if (!got_new_gw_state) {
+      dout(0) << "Can not find new gw state" << dendl;
+      return;
+    }
+    // First time we appear in a map: tell the gateway its group id.
+    // NOTE(review): retries forever while holding `lock` — confirm acceptable.
+    bool set_group_id = false;
+    while (!set_group_id) {
+      NVMeofGwMonitorGroupClient monitor_group_client(
+          grpc::CreateChannel(monitor_address, grpc::InsecureChannelCredentials()));
+      dout(0) << "GRPC set_group_id: " <<  new_gw_state.group_id << dendl;
+      set_group_id = monitor_group_client.set_group_id( new_gw_state.group_id);
+      if (!set_group_id) {
+             dout(0) << "GRPC set_group_id failed" << dendl;
+             usleep(1000); // TODO: conf options
+      }
+    }
+  }
+
+  // Make sure we do not get out of order state changes from the monitor
+  if (got_old_gw_state && got_new_gw_state) {
+    dout(0) << "got_old_gw_state: " << old_gw_state << "got_new_gw_state: " << new_gw_state << dendl;
+    ceph_assert(new_gw_state.gw_map_epoch >= old_gw_state.gw_map_epoch);
+  }
+
+  // Gather all state changes
+  ana_info ai;
+  epoch_t max_blocklist_epoch = 0;
+  for (const auto& nqn_state_pair: new_gw_state.subsystems) {
+    auto& sub = nqn_state_pair.second;
+    const auto& nqn = nqn_state_pair.first;
+    nqn_ana_states nas;
+    nas.set_nqn(nqn);
+    for (NvmeAnaGrpId  ana_grp_index = 0; ana_grp_index < sub.ana_state.size(); ana_grp_index++) {
+      const auto& old_nqn_state_pair = old_gw_state.subsystems.find(nqn);
+      auto found_old_nqn_state = (old_nqn_state_pair != old_gw_state.subsystems.end());
+      auto new_group_state = sub.ana_state[ana_grp_index];
+
+      // if no state change detected for this nqn, group id
+      if (got_old_gw_state && found_old_nqn_state &&
+           new_group_state == old_nqn_state_pair->second.ana_state[ana_grp_index]) {
+        continue;
+      }
+      ana_group_state gs;
+      gs.set_grp_id(ana_grp_index + 1); // offset by 1, index 0 is ANAGRP1
+      const auto& new_agroup_state = new_group_state.first;
+      const epoch_t& blocklist_epoch = new_group_state.second;
+
+      // Remember the newest blocklist epoch carried by any OPTIMIZED group.
+      if (new_agroup_state == GW_EXPORTED_STATES_PER_AGROUP_E::GW_EXPORTED_OPTIMIZED_STATE &&
+          blocklist_epoch != 0) {
+        if (blocklist_epoch > max_blocklist_epoch) max_blocklist_epoch = blocklist_epoch;
+      }
+      gs.set_state(new_agroup_state == GW_EXPORTED_STATES_PER_AGROUP_E::GW_EXPORTED_OPTIMIZED_STATE ? OPTIMIZED : INACCESSIBLE); // Set the ANA state
+      nas.mutable_states()->Add(std::move(gs));
+      dout(0) << " grpid " << (ana_grp_index + 1) << " state: " << new_gw_state << dendl;
+    }
+    if (nas.states_size()) ai.mutable_states()->Add(std::move(nas));
+  }
+
+  // if there is state change, notify the gateway
+  if (ai.states_size()) {
+    bool set_ana_state = false;
+    while (!set_ana_state) {
+      NVMeofGwClient gw_client(
+          grpc::CreateChannel(gateway_address, grpc::InsecureChannelCredentials()));
+      set_ana_state = gw_client.set_ana_state(ai);
+      if (!set_ana_state) {
+       dout(0) << "GRPC set_ana_state failed" << dendl;
+       usleep(1000); // TODO conf option
+      }
+    }
+    // Update latest accepted osdmap epoch, for beacons
+    if (max_blocklist_epoch > osdmap_epoch) {
+      osdmap_epoch = max_blocklist_epoch;
+      dout(0) << "Ready for blocklist osd map epoch: " << osdmap_epoch << dendl;
+    }
+  }
+  map = new_map;
+}
+
+// Dispatcher entry point: handle incoming NVMeofGw map messages under `lock`.
+bool NVMeofGwMonitorClient::ms_dispatch2(const ref_t<Message>& m)
+{
+  std::lock_guard l(lock);
+  dout(0) << "got map type " << m->get_type() << dendl;
+
+  if (m->get_type() == MSG_MNVMEOF_GW_MAP) {
+    handle_nvmeof_gw_map(ref_cast<MNVMeofGwMap>(m));
+  }
+  // NOTE(review): always returns false — even after consuming a GW map — so
+  // the message is also offered to subsequent dispatchers.  Confirm this is
+  // intentional rather than returning true for handled messages.
+  bool handled = false;
+  return handled;
+}
+
+// Run until the messenger stops (i.e. until shutdown()), then tear down the
+// signal handlers installed by init().  `args` is currently unused.
+int NVMeofGwMonitorClient::main(std::vector<const char *> args)
+{
+  client_messenger->wait();
+
+  // Disable signal handlers
+  unregister_async_signal_handler(SIGHUP, sighup_handler);
+  shutdown_async_signal_handler();
+
+  return 0;
+}
diff --git a/src/nvmeof/NVMeofGwMonitorClient.h b/src/nvmeof/NVMeofGwMonitorClient.h
new file mode 100644 (file)
index 0000000..1c4c58f
--- /dev/null
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023,2024 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#ifndef NVMEOFGWMONITORCLIENT_H_
+#define NVMEOFGWMONITORCLIENT_H_
+
+#include "auth/Auth.h"
+#include "common/async/context_pool.h"
+#include "common/Finisher.h"
+#include "common/Timer.h"
+#include "common/LogClient.h"
+
+#include "client/Client.h"
+#include "mon/MonClient.h"
+#include "osdc/Objecter.h"
+#include "messages/MNVMeofGwMap.h"
+
+class NVMeofGwMonitorClient: public Dispatcher,
+                  public md_config_obs_t {
+private:
+  std::string name;
+  std::string pool;
+  std::string group;
+  std::string gateway_address;
+  std::string monitor_address;
+  std::string server_key;
+  std::string server_cert;
+  std::string client_cert;
+  epoch_t     osdmap_epoch; // last awaited osdmap_epoch
+  epoch_t     gwmap_epoch; // last received gw map epoch
+
+protected:
+  ceph::async::io_context_pool poolctx;
+  MonClient monc;
+  std::unique_ptr<Messenger> client_messenger;
+  Objecter objecter;
+  Client client;
+  std::map<NvmeGroupKey, NvmeGwMap> map;
+  ceph::mutex lock = ceph::make_mutex("NVMeofGw::lock");
+  SafeTimer timer;
+
+  int orig_argc;
+  const char **orig_argv;
+
+  void send_config_beacon(); 
+  void send_beacon();
+public:
+  NVMeofGwMonitorClient(int argc, const char **argv);
+  ~NVMeofGwMonitorClient() override;
+
+  // Dispatcher interface
+  bool ms_dispatch2(const ceph::ref_t<Message>& m) override;
+  bool ms_handle_reset(Connection *con) override { return false; }
+  void ms_handle_remote_reset(Connection *con) override {}
+  bool ms_handle_refused(Connection *con) override { return false; };
+
+  // config observer bits
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                         const std::set <std::string> &changed) override {};
+
+  int init();
+  void shutdown();
+  int main(std::vector<const char *> args);
+  void tick();
+
+  void handle_nvmeof_gw_map(ceph::ref_t<MNVMeofGwMap> m);
+};
+
+#endif
+
diff --git a/src/nvmeof/NVMeofGwMonitorGroupClient.cc b/src/nvmeof/NVMeofGwMonitorGroupClient.cc
new file mode 100644 (file)
index 0000000..27ed7b1
--- /dev/null
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include "NVMeofGwMonitorGroupClient.h"
+
+bool NVMeofGwMonitorGroupClient::set_group_id(const uint32_t& id) {
+  group_id_req request;
+  request.set_id(id);
+  google::protobuf::Empty reply;
+  ClientContext context;
+
+  Status status = stub_->group_id(&context, request, &reply);
+
+  return status.ok();
+}
diff --git a/src/nvmeof/NVMeofGwMonitorGroupClient.h b/src/nvmeof/NVMeofGwMonitorGroupClient.h
new file mode 100644 (file)
index 0000000..805e182
--- /dev/null
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#ifndef  __NVMEOFGWMONITORGROUPCLIENT_H__
+#define  __NVMEOFGWMONITORGROUPCLIENT_H__
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include <grpcpp/grpcpp.h>
+
+#include "monitor.grpc.pb.h"
+
+using grpc::Channel;
+using grpc::ClientContext;
+using grpc::Status;
+
+class NVMeofGwMonitorGroupClient {
+ public:
+  NVMeofGwMonitorGroupClient(std::shared_ptr<Channel> channel)
+      : stub_(MonitorGroup::NewStub(channel)) {}
+
+  bool set_group_id(const uint32_t& id);
+
+ private:
+  std::unique_ptr<MonitorGroup::Stub> stub_;
+};
+#endif
diff --git a/src/nvmeof/gateway/control/proto/gateway.proto b/src/nvmeof/gateway/control/proto/gateway.proto
new file mode 100644 (file)
index 0000000..3b44515
--- /dev/null
@@ -0,0 +1,488 @@
+//
+//  Copyright (c) 2021 International Business Machines
+//  All rights reserved.
+//
+//  SPDX-License-Identifier: MIT
+//
+//  Authors: anita.shekar@ibm.com, sandy.kaur@ibm.com
+//
+
+
+syntax = "proto3";
+
+enum AddressFamily {
+    ipv4 = 0;
+    ipv6 = 1;
+}
+
+enum LogLevel {
+    ERROR = 0;
+    WARNING = 1;
+    NOTICE = 2;
+    INFO = 3;
+    DEBUG = 4;
+}
+
+enum GwLogLevel {
+    notset = 0;
+    debug = 10;
+    info = 20;
+    warning = 30;
+    error = 40;
+    critical = 50;
+}
+
+service Gateway {
+       // Creates a namespace from an RBD image
+       rpc namespace_add(namespace_add_req) returns (nsid_status) {}
+
+       // Creates a subsystem
+       rpc create_subsystem(create_subsystem_req) returns(req_status) {}
+
+       // Deletes a subsystem
+       rpc delete_subsystem(delete_subsystem_req) returns(req_status) {}
+
+       // List namespaces
+       rpc list_namespaces(list_namespaces_req) returns(namespaces_info) {}
+
+       // Resizes a namespace
+       rpc namespace_resize(namespace_resize_req) returns (req_status) {}
+
+       // Gets namespace's IO stats
+       rpc namespace_get_io_stats(namespace_get_io_stats_req) returns (namespace_io_stats_info) {}
+
+       // Sets namespace's qos limits
+       rpc namespace_set_qos_limits(namespace_set_qos_req) returns (req_status) {}
+
+       // Changes namespace's load balancing group
+       rpc namespace_change_load_balancing_group(namespace_change_load_balancing_group_req) returns (req_status) {}
+
+       // Deletes a namespace
+       rpc namespace_delete(namespace_delete_req) returns (req_status) {}
+
+       // Adds a host to a subsystem
+       rpc add_host(add_host_req) returns (req_status) {}
+
+       // Removes a host from a subsystem
+       rpc remove_host(remove_host_req) returns (req_status) {}
+
+       // List hosts
+       rpc list_hosts(list_hosts_req) returns(hosts_info) {}
+
+       // List connections
+       rpc list_connections(list_connections_req) returns(connections_info) {}
+
+       // Creates a listener for a subsystem at a given IP/Port
+       rpc create_listener(create_listener_req) returns(req_status) {}
+
+       // Deletes a listener from a subsystem at a given IP/Port
+       rpc delete_listener(delete_listener_req) returns(req_status) {}
+
+       // List listeners
+       rpc list_listeners(list_listeners_req) returns(listeners_info) {}
+
+       // List subsystems
+       rpc list_subsystems(list_subsystems_req) returns(subsystems_info_cli) {}
+
+        // Gets subsystems
+        rpc get_subsystems(get_subsystems_req) returns(subsystems_info) {}
+
+       // Set gateway ANA states
+       rpc set_ana_state(ana_info) returns(req_status) {}
+
+       // Gets spdk nvmf log flags and level
+       rpc get_spdk_nvmf_log_flags_and_level(get_spdk_nvmf_log_flags_and_level_req) returns(spdk_nvmf_log_flags_and_level_info) {}
+
+        // Disables spdk nvmf logs
+        rpc disable_spdk_nvmf_logs(disable_spdk_nvmf_logs_req) returns(req_status) {}
+
+       // Set spdk nvmf logs
+       rpc set_spdk_nvmf_logs(set_spdk_nvmf_logs_req) returns(req_status) {}
+
+       // Get gateway info
+       rpc get_gateway_info(get_gateway_info_req) returns(gateway_info) {}
+
+       // Get gateway log level
+       rpc get_gateway_log_level(get_gateway_log_level_req) returns(gateway_log_level_info) {}
+
+       // Set gateway log level
+       rpc set_gateway_log_level(set_gateway_log_level_req) returns(req_status) {}
+}
+
+// Request messages
+
+message namespace_add_req {
+       string rbd_pool_name = 1;
+       string rbd_image_name = 2;
+       string subsystem_nqn = 3;
+       optional uint32 nsid = 4;
+       uint32 block_size = 5;
+       optional string uuid = 6;
+       optional int32 anagrpid = 7;
+       optional bool create_image = 8;
+       optional uint64 size = 9;
+       optional bool force = 10;
+}
+
+message namespace_resize_req {
+       string subsystem_nqn = 1;
+       optional uint32 nsid = 2;
+       optional string uuid = 3;
+       uint64 new_size = 4;
+}
+
+message namespace_get_io_stats_req {
+       string subsystem_nqn = 1;
+       optional uint32 nsid = 2;
+       optional string uuid = 3;
+}
+
+message namespace_set_qos_req {
+       string subsystem_nqn = 1;
+       optional uint32 nsid = 2;
+       optional string uuid = 3;
+       optional uint64 rw_ios_per_second = 4;
+       optional uint64 rw_mbytes_per_second = 5;
+       optional uint64 r_mbytes_per_second = 6;
+       optional uint64 w_mbytes_per_second = 7;
+}
+
+message namespace_change_load_balancing_group_req {
+       string subsystem_nqn = 1;
+       optional uint32 nsid = 2;
+       optional string uuid = 3;
+       int32 anagrpid = 4;
+}
+
+message namespace_delete_req {
+       string subsystem_nqn = 1;
+       optional uint32 nsid = 2;
+       optional string uuid = 3;
+}
+
+message create_subsystem_req {
+       string subsystem_nqn = 1;
+       string serial_number = 2;
+       optional uint32 max_namespaces = 3;
+       bool enable_ha = 4;
+}
+
+message delete_subsystem_req {
+       string subsystem_nqn = 1;
+       optional bool force = 2;
+}
+
+message list_namespaces_req {
+       string subsystem = 1;
+       optional uint32 nsid = 2;
+       optional string uuid = 3;
+}
+
+message add_host_req {
+       string subsystem_nqn = 1;
+       string host_nqn = 2;
+}
+
+message remove_host_req {
+       string subsystem_nqn = 1;
+       string host_nqn = 2;
+}
+
+message list_hosts_req {
+       string subsystem = 1;
+}
+
+message list_connections_req {
+       string subsystem = 1;
+}
+
+message create_listener_req {
+       string nqn = 1;
+       string host_name = 2;
+       string traddr = 3;
+       optional AddressFamily adrfam = 5;
+       optional uint32 trsvcid = 6;
+}
+
+message delete_listener_req {
+       string nqn = 1;
+       string host_name = 2;
+       string traddr = 3;
+       optional AddressFamily adrfam = 5;
+       optional uint32 trsvcid = 6;
+       optional bool force = 7;
+}
+
+message list_listeners_req {
+       string subsystem = 1;
+}
+
+message list_subsystems_req {
+       optional string subsystem_nqn = 1;
+       optional string serial_number = 2;
+}
+
+message get_subsystems_req {
+}
+
+message get_spdk_nvmf_log_flags_and_level_req {
+}
+
+message disable_spdk_nvmf_logs_req {
+}
+
+message set_spdk_nvmf_logs_req {
+       optional LogLevel log_level = 1;
+       optional LogLevel print_level = 2;
+}
+
+message get_gateway_info_req {
+       optional string cli_version = 1;
+}
+
+message get_gateway_log_level_req {
+}
+
+message set_gateway_log_level_req {
+       GwLogLevel log_level = 1;
+}
+
+// From https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf page 138
+// Asymmetric Namespace Access state for all namespaces in this ANA
+// Group when accessed through this controller.
+// Value Description Reference
+// 01h ANA Optimized state 8.20.3.1
+// 02h ANA Non-Optimized state 8.20.3.2
+// 03h ANA Inaccessible state 8.20.3.3
+// 04h ANA Persistent Loss state 8.20.3.4
+// 0Fh ANA Change state 8.20.3.5
+// All others Reserved
+enum ana_state {
+  UNSET         = 0;
+  OPTIMIZED     = 1;
+  NON_OPTIMIZED = 2;
+  INACCESSIBLE  = 3;
+}
+
+message ana_group_state {
+       uint32     grp_id = 1;  // group id
+       ana_state  state  = 2;  // ANA state
+}
+
+message nqn_ana_states {
+       string     nqn                     = 1; // subsystem nqn
+       repeated   ana_group_state  states = 2; // list of group states
+}
+
+message ana_info {
+       repeated nqn_ana_states states = 1; // list of nqn states
+}
+
+// Return messages 
+
+message req_status {
+       int32 status = 1;
+       string error_message = 2;
+}
+
+message nsid_status {
+       int32 status = 1;
+       string error_message = 2;
+       uint32 nsid = 3;
+}
+
+message subsystems_info {
+        repeated subsystem subsystems = 1;
+}
+
+message subsystem {
+        string nqn = 1;
+        string subtype = 2;
+        repeated listen_address listen_addresses = 3;
+        repeated host hosts = 4;
+        bool allow_any_host = 5;
+        optional string serial_number = 6;
+        optional string model_number = 7;
+        optional uint32 max_namespaces = 8;
+        optional uint32 min_cntlid = 9;
+        optional uint32 max_cntlid = 10;
+        repeated namespace namespaces = 11;
+}
+
+message listen_address {
+        string trtype = 1;
+        string adrfam = 2;
+        string traddr = 3;
+        string trsvcid = 4;
+        optional string transport = 5;
+}
+
+message namespace {
+    uint32 nsid = 1;
+    string name = 2;
+    optional string bdev_name = 3;
+    optional string nguid = 4;
+    optional string uuid = 5;
+    optional uint32 anagrpid = 6;
+    optional string nonce = 7;
+}
+
+message subsystems_info_cli {
+       int32 status = 1;
+       string error_message = 2;
+       repeated subsystem_cli subsystems = 3;
+}
+
+message subsystem_cli {
+       string nqn = 1;
+       bool enable_ha = 2;
+       string serial_number = 3;
+       string model_number = 4;
+       uint32 min_cntlid = 5;
+       uint32 max_cntlid = 6;
+       uint32 namespace_count = 7;
+       string subtype = 8;
+       uint32 max_namespaces = 9;
+}
+
+message gateway_info {
+       string cli_version = 1;
+       string version = 2;
+       string name = 3;
+       string group = 4;
+       string addr = 5;
+       string port = 6;
+       bool bool_status = 7;
+       int32 status = 8;
+       string error_message = 9;
+       optional string spdk_version = 10;
+        uint32 load_balancing_group = 11;
+       string hostname = 12;
+}
+
+message cli_version {
+       int32 status = 1;
+       string error_message = 2;
+       string version = 3;
+}
+
+message gw_version {
+       int32 status = 1;
+       string error_message = 2;
+       string version = 3;
+}
+
+message listener_info {
+       string host_name = 1;
+       string trtype = 2;
+       AddressFamily adrfam = 3;
+       string traddr = 4;
+       uint32 trsvcid = 5;
+}
+
+message listeners_info {
+       int32 status = 1;
+       string error_message = 2;
+       repeated listener_info listeners = 3;
+}
+
+message host {
+    string nqn = 1;
+}
+
+message hosts_info {
+       int32 status = 1;
+       string error_message = 2;
+       bool allow_any_host = 3;
+       string subsystem_nqn = 4;
+       repeated host hosts = 5;
+}
+
+message connection {
+       string nqn = 1;
+       string traddr = 2;
+       uint32 trsvcid = 3;
+        string trtype = 4;
+       AddressFamily adrfam = 5;
+       bool connected = 6;
+       int32 qpairs_count = 7;
+       int32 controller_id = 8;
+}
+
+message connections_info {
+       int32 status = 1;
+       string error_message = 2;
+       string subsystem_nqn = 3;
+       repeated connection connections = 4;
+}
+
+message namespace_cli {
+       uint32 nsid = 1;
+       string bdev_name = 2;
+       string rbd_image_name = 3;
+       string rbd_pool_name = 4;
+       uint32 load_balancing_group = 5;
+       uint32 block_size = 6;
+       uint64 rbd_image_size = 7;
+       string uuid = 8;
+       uint64 rw_ios_per_second = 9;
+       uint64 rw_mbytes_per_second = 10;
+       uint64 r_mbytes_per_second = 11;
+       uint64 w_mbytes_per_second = 12;
+}
+
+message namespaces_info {
+       int32 status = 1;
+       string error_message = 2;
+       string subsystem_nqn = 3;
+       repeated namespace_cli namespaces = 4;
+}
+
+message namespace_io_stats_info {
+       int32 status = 1;
+       string error_message = 2;
+       string subsystem_nqn = 3;
+       uint32 nsid = 4;
+       string uuid = 5;
+       string bdev_name = 6;
+       uint64 tick_rate = 7;
+       uint64 ticks = 8;
+       uint64 bytes_read = 9;
+       uint64 num_read_ops = 10;
+       uint64 bytes_written = 11;
+       uint64 num_write_ops = 12;
+       uint64 bytes_unmapped = 13;
+       uint64 num_unmap_ops = 14;
+       uint64 read_latency_ticks = 15;
+       uint64 max_read_latency_ticks = 16;
+       uint64 min_read_latency_ticks = 17;
+       uint64 write_latency_ticks = 18;
+       uint64 max_write_latency_ticks = 19;
+       uint64 min_write_latency_ticks = 20;
+       uint64 unmap_latency_ticks = 21;
+       uint64 max_unmap_latency_ticks = 22;
+       uint64 min_unmap_latency_ticks = 23;
+       uint64 copy_latency_ticks = 24;
+       uint64 max_copy_latency_ticks = 25;
+       uint64 min_copy_latency_ticks = 26;
+       repeated uint32 io_error = 27;
+}
+
+message spdk_log_flag_info {
+    string name = 1;
+    bool enabled = 2;
+}
+
+message spdk_nvmf_log_flags_and_level_info {
+       int32 status = 1;
+       string error_message = 2;
+       repeated spdk_log_flag_info nvmf_log_flags = 3;
+       LogLevel log_level = 4;
+       LogLevel log_print_level = 5;
+}
+
+message gateway_log_level_info {
+       int32 status = 1;
+       string error_message = 2;
+       GwLogLevel log_level = 3;
+}
diff --git a/src/nvmeof/gateway/control/proto/monitor.proto b/src/nvmeof/gateway/control/proto/monitor.proto
new file mode 100644 (file)
index 0000000..0882141
--- /dev/null
@@ -0,0 +1,19 @@
+//
+//  Copyright (c) 2023 International Business Machines
+//  All rights reserved.
+//
+//  SPDX-License-Identifier: MIT
+//
+
+syntax = "proto3";
+import "google/protobuf/empty.proto";
+
+service MonitorGroup {
+       // Called by the monitor client to set the gateway's group id
+       rpc group_id(group_id_req) returns (google.protobuf.Empty) {}
+}
+
+// Request messages
+message group_id_req {
+       uint32 id = 1;
+}
index af54240b842c24ae3fdf5ffae9dc1ce885c41958..454f5b8c11a5b35e493a22affa8f310733496b2b 100644 (file)
@@ -20,6 +20,9 @@ class NvmeofService(CephService):
     def config(self, spec: NvmeofServiceSpec) -> None:  # type: ignore
         assert self.TYPE == spec.service_type
         assert spec.pool
+        self.pool = spec.pool
+        assert spec.group is not None
+        self.group = spec.group
         self.mgr._check_pool_exists(spec.pool, spec.service_name())
 
     def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
@@ -53,6 +56,19 @@ class NvmeofService(CephService):
         daemon_spec.extra_files = {'ceph-nvmeof.conf': gw_conf}
         daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
         daemon_spec.deps = []
+        # Notify monitor about this gateway creation
+        cmd = {
+            'prefix': 'nvme-gw create',
+            'id': name,
+            'group': spec.group,
+            'pool': spec.pool
+        }
+        _, _, err = self.mgr.mon_command(cmd)
+        # if sending the monitor command failed, raise an AssertionError, failing the daemon creation
+        assert not err, f"Unable to send monitor command {cmd}, error {err}"
+        if not hasattr(self, 'gws'):
+            self.gws = {} # id -> name map of gateways for this service.
+        self.gws[nvmeof_gw_id] = name # add to map of service's gateway names
         return daemon_spec
 
     def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
@@ -86,8 +102,31 @@ class NvmeofService(CephService):
         logger.debug(f'Post remove daemon {self.TYPE}.{daemon.daemon_id}')
         # to clean the keyring up
         super().post_remove(daemon, is_failed_deploy=is_failed_deploy)
-        # TODO: remove config for dashboard nvmeof gateways if any
-        # and any certificates being used for mTLS
+
+        # remove config for dashboard nvmeof gateways if any
+        ret, out, err = self.mgr.mon_command({
+            'prefix': 'dashboard nvmeof-gateway-rm',
+            'name': daemon.hostname,
+        })
+        if not ret:
+            logger.info(f'{daemon.hostname} removed from nvmeof gateways dashboard config')
+
+        # Assert configured
+        assert self.pool
+        assert self.group is not None
+        assert daemon.daemon_id in self.gws
+        name = self.gws[daemon.daemon_id]
+        self.gws.pop(daemon.daemon_id)
+        # Notify monitor about this gateway deletion
+        cmd = {
+            'prefix': 'nvme-gw delete',
+            'id': name,
+            'group': self.group,
+            'pool': self.pool
+        }
+        _, _, err = self.mgr.mon_command(cmd)
+        if err:
+            self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
 
     def purge(self, service_name: str) -> None:
         """Removes configuration
index 6cf481feacf063bfffe13c3b5c12291ffc0732b2..45e858fe20fb9f478ea40e1de6ea4d52d3154572 100644 (file)
@@ -1269,7 +1269,7 @@ class NvmeofServiceSpec(ServiceSpec):
         #: ``name`` name of the nvmeof gateway
         self.name = name
         #: ``group`` name of the nvmeof gateway
-        self.group = group
+        self.group = group or ''
         #: ``enable_auth`` enables user authentication on nvmeof gateway
         self.enable_auth = enable_auth
         #: ``server_key`` gateway server key
index 841b7ffa6cac9baf003e052a4422bc2c90250b15..53f34ef2ca662f47f017c6c10e400d96a34c2535 100644 (file)
@@ -981,3 +981,11 @@ add_ceph_unittest(unittest_weighted_shuffle)
 add_executable(unittest_intarith test_intarith.cc)
 add_ceph_unittest(unittest_intarith)
 #make check ends here
+
+# test_nvmeof_mon_encoding
+add_executable(test_nvmeof_mon_encoding
+  test_nvmeof_mon_encoding.cc
+  )
+target_link_libraries(test_nvmeof_mon_encoding
+  mon ceph-common global-static
+  )
diff --git a/src/test/test_nvmeof_mon_encoding.cc b/src/test/test_nvmeof_mon_encoding.cc
new file mode 100644 (file)
index 0000000..9d84b58
--- /dev/null
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2024 IBM, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "include/ceph_assert.h"
+#include "global/global_init.h"
+#include "mon/NVMeofGwMon.h"
+#include "messages/MNVMeofGwMap.h"
+#include "messages/MNVMeofGwBeacon.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix *_dout
+
+using namespace std;
+
+void test_NVMeofGwMap() {
+  dout(0) << __func__ << "\n\n" << dendl;
+
+  NVMeofGwMap pending_map;
+  std::string pool = "pool1";
+  std::string group = "grp1";
+  auto group_key = std::make_pair(pool, group);
+  pending_map.cfg_add_gw("GW1" ,group_key);
+  pending_map.cfg_add_gw("GW2" ,group_key);
+  pending_map.cfg_add_gw("GW3" ,group_key);
+  NvmeNonceVector new_nonces = {"abc", "def","hij"};
+  pending_map.Created_gws[group_key]["GW1"].nonce_map[1] = new_nonces;
+  for(int i=0; i< MAX_SUPPORTED_ANA_GROUPS; i++){
+    pending_map.Created_gws[group_key]["GW1"].blocklist_data[i].osd_epoch = i*2;
+    pending_map.Created_gws[group_key]["GW1"].blocklist_data[i].is_failover = false;
+  }
+
+  pending_map.Created_gws[group_key]["GW2"].nonce_map[2] = new_nonces;
+  dout(0) << pending_map << dendl;
+
+  ceph::buffer::list bl;
+  pending_map.encode(bl);
+  auto p = bl.cbegin();
+  pending_map.decode(p);
+  dout(0) << "Dump map after decode encode:" <<dendl;
+  dout(0) << pending_map << dendl;
+}
+
+void test_MNVMeofGwMap() {
+  dout(0) << __func__ << "\n\n" << dendl;
+  std::map<NvmeGroupKey, NvmeGwMap> map;
+
+  std::string pool = "pool1";
+  std::string group = "grp1";
+  std::string gw_id = "GW1";
+  NvmeGwState state(1, 32);
+  std::string nqn = "nqn";
+  ANA_STATE ana_state;
+  NqnState nqn_state(nqn, ana_state);
+  state.subsystems.insert({nqn, nqn_state});
+
+  auto group_key = std::make_pair(pool, group);
+  map[group_key][gw_id] = state;
+
+
+
+  ceph::buffer::list bl;
+  encode(map, bl);
+  dout(0) << "encode: " << map << dendl;
+  decode(map, bl);
+  dout(0) << "decode: " << map << dendl;
+
+  BeaconSubsystem sub = { nqn, {}, {} };
+  NVMeofGwMap pending_map;
+  pending_map.cfg_add_gw("GW1" ,group_key);
+  pending_map.cfg_add_gw("GW2" ,group_key);
+  pending_map.cfg_add_gw("GW3" ,group_key);
+  NvmeNonceVector new_nonces = {"abc", "def","hij"};
+  pending_map.Created_gws[group_key]["GW1"].nonce_map[1] = new_nonces;
+  pending_map.Created_gws[group_key]["GW1"].copied_nonce_map[1] = new_nonces;
+  pending_map.Created_gws[group_key]["GW1"].subsystems.push_back(sub);
+  for(int i=0; i< MAX_SUPPORTED_ANA_GROUPS; i++){
+    pending_map.Created_gws[group_key]["GW1"].blocklist_data[i].osd_epoch = i*2;
+    pending_map.Created_gws[group_key]["GW1"].blocklist_data[i].is_failover = false;
+  }
+
+  pending_map.Created_gws[group_key]["GW2"].nonce_map[2] = new_nonces;
+  dout(0) << "False pending map: " << pending_map << dendl;
+
+  auto msg = make_message<MNVMeofGwMap>(pending_map);
+  msg->encode_payload(0);
+  msg->decode_payload();
+  dout(0) << "decode msg: " << *msg << dendl;
+
+  dout(0)   << "\n == Test GW Delete ==" << dendl;
+  pending_map.cfg_delete_gw("GW1" ,group_key);
+  dout(0) << "deleted GW1 " << pending_map << dendl;
+
+  pending_map.cfg_delete_gw("GW1" ,group_key);
+  dout(0) << "duplicated delete of GW1 " << pending_map << dendl;
+
+  pending_map.cfg_delete_gw("GW2" ,group_key);
+  dout(0) << "deleted GW2 " << pending_map << dendl;
+
+  dout(0) << "delete of wrong gw id" << dendl;
+  pending_map.cfg_delete_gw("wow" ,group_key);
+
+  pending_map.cfg_delete_gw("GW3" ,group_key);
+  dout(0) << "deleted GW3 . we should see the empty map " << pending_map << dendl;
+
+
+}
+
+void test_MNVMeofGwBeacon() {
+  std::string gw_id = "GW";
+  std::string gw_pool = "pool";
+  std::string gw_group = "group";
+  GW_AVAILABILITY_E availability = GW_AVAILABILITY_E::GW_AVAILABLE;
+  std::string nqn = "nqn";
+  BeaconSubsystem sub = { nqn, {}, {} };
+  BeaconSubsystems subs = { sub };
+  epoch_t osd_epoch = 17;
+  epoch_t gwmap_epoch = 42;
+
+  auto msg = make_message<MNVMeofGwBeacon>(
+      gw_id,
+      gw_pool,
+      gw_group,
+      subs,
+      availability,
+      osd_epoch,
+      gwmap_epoch);
+  msg->encode_payload(0);
+  msg->decode_payload();
+  dout(0) << "decode msg: " << *msg << dendl;
+  ceph_assert(msg->get_gw_id() == gw_id);
+  ceph_assert(msg->get_gw_pool() == gw_pool);
+  ceph_assert(msg->get_gw_group() == gw_group);
+  ceph_assert(msg->get_availability() == availability);
+  ceph_assert(msg->get_last_osd_epoch() == osd_epoch);
+  ceph_assert(msg->get_last_gwmap_epoch() == gwmap_epoch);
+  const auto& dsubs = msg->get_subsystems();
+  auto it = std::find_if(dsubs.begin(), dsubs.end(),
+                           [&nqn](const auto& element) {
+                               return element.nqn == nqn;
+                           });
+  ceph_assert(it != dsubs.end());
+}
+
+void test_NVMeofGwTimers()
+{
+    NVMeofGwMap pending_map;
+    //pending_map.Gmetadata;
+    const NvmeGroupKey group_key = std::make_pair("a","b");
+    std::string gwid = "GW1";
+    NvmeAnaGrpId  grpid = 2;
+    pending_map.start_timer(gwid, group_key, grpid, 30);
+    auto end_time  = pending_map.Gmetadata[group_key][gwid].data[grpid].end_time;
+    uint64_t  millisecondsSinceEpoch = std::chrono::duration_cast<std::chrono::milliseconds>(end_time.time_since_epoch()).count();
+    dout(0) << "Metadata milliseconds " << millisecondsSinceEpoch << " " << (int)pending_map.Gmetadata[group_key][gwid].data[grpid].timer_value << dendl;
+    ceph::buffer::list bl;
+    pending_map.encode(bl);
+    auto p = bl.cbegin();
+    pending_map.decode(p);
+
+    end_time  = pending_map.Gmetadata[group_key][gwid].data[2].end_time;
+    millisecondsSinceEpoch = std::chrono::duration_cast<std::chrono::milliseconds>(end_time.time_since_epoch()).count();
+    dout(0) << "After encode decode Metadata milliseconds " << millisecondsSinceEpoch << " " <<  (int)pending_map.Gmetadata[group_key][gwid].data[grpid].timer_value<<dendl;
+
+}
+
+int main(int argc, const char **argv)
+{
+  // Init ceph
+  auto args = argv_to_vec(argc, argv);
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY,
+                         CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+  common_init_finish(g_ceph_context);
+
+  // Run tests
+  test_NVMeofGwMap();
+  test_MNVMeofGwMap();
+  test_MNVMeofGwBeacon();
+  test_NVMeofGwTimers();
+}
+