mgr: ensure that all modules have started before advertising active mgr

author Laura Flores <lflores@ibm.com>

Fri, 25 Apr 2025 22:11:19 +0000 (22:11 +0000)

committer Laura Flores <lflores@ibm.com>

Tue, 21 Apr 2026 15:56:37 +0000 (10:56 -0500)
author Laura Flores <lflores@ibm.com>
Fri, 25 Apr 2025 22:11:19 +0000 (22:11 +0000)
committer Laura Flores <lflores@ibm.com>
Tue, 21 Apr 2026 15:56:37 +0000 (10:56 -0500)
diff --git a/doc/mgr/administrator.rst b/doc/mgr/administrator.rst

index 140c4751551d16325e2fb724dc57b1f9bcdc98ee..12c6a9bb420cee0ecc3e02e95437b3abe40d1881 100644 (file)
--- a/doc/mgr/administrator.rst
+++ b/doc/mgr/administrator.rst
@@ -31,6 +31,52 @@ which should now include a mgr status line::
  
      mgr active: $name
  
+Interpreting Ceph-Mgr Statuses
+==============================
+
+A cluster's health status will show each ``ceph-mgr`` daemon in one of three states:
+
+1. **active**
+
+   This mgr daemon has been fully initialized, which means it is ready to receive
+   and execute commands. Only one mgr will be in this state at a time.
+
+2. **active (starting)**
+
+   This mgr daemon has been chosen to be ``active``, but it is not done initializing.
+   Although it is not yet ready to execute commands, an operator may still issue commands,
+   which will be held and executed once the manager becomes ``active``. Only one mgr will
+   be in this state at a time.
+
+3. **standby**
+
+   This mgr daemon is not currently receiving or executing commands, but it is there to
+   take over if the current active mgr becomes unavailable. An operator may also manually
+   promote standby manager to active via ``ceph mgr fail`` if desired. All other mgr daemons
+   which are not ``active`` or ``active (starting)`` will be in this state.
+
+Each of these states are visible in the output of the ``ceph -s``. For example:
+
+.. code-block:: console
+
+   $ ceph -s
+     cluster:
+       id:     b150f540-745a-460c-a566-376b28b95ac3
+       health: HEALTH_OK
+
+     services:
+       mon: 3 daemons, quorum a,b,c (age 47m) [leader: a]
+       mgr: x(active, starting, since 3s)
+       mds: 1/1 daemons up, 2 standby
+       osd: 4 osds: 4 up (since 47m), 4 in (since 47m)
+
+     data:
+       volumes: 1/1 healthy
+       pools:   4 pools, 177 pgs
+       objects: 24 objects, 451 KiB
+       usage:   4.0 GiB used, 400 GiB / 404 GiB avail
+       pgs:     177 active+clean
+
  Client authentication
  ---------------------
  
diff --git a/qa/suites/rados/mgr/tasks/4-units/mgr_module_loading_time.yaml b/qa/suites/rados/mgr/tasks/4-units/mgr_module_loading_time.yaml

new file mode 100644 (file)

index 0000000..bf80f1f
--- /dev/null
+++ b/qa/suites/rados/mgr/tasks/4-units/mgr_module_loading_time.yaml
@@ -0,0 +1,13 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(CEPHADM_STRAY_DAEMON\)
+      - \(CEPHADM_STRAY_HOST\)
+      - \(MGR_DOWN\)
+      - evicting unresponsive client
+
+tasks:
+  - workunit:
+      clients:
+        client.0:
+          - mgr/test_mgr_module_loading_time.sh
diff --git a/qa/workunits/mgr/test_mgr_module_loading_time.sh b/qa/workunits/mgr/test_mgr_module_loading_time.sh

new file mode 100755 (executable)

index 0000000..e55a761
--- /dev/null
+++ b/qa/workunits/mgr/test_mgr_module_loading_time.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+setup_cephadm() {
+    # This will create CEPHADM_STRAY_HOST warnings, but we just want to be able to run an orch command.
+    echo "Enabling cephadm module..."
+    ceph mgr module enable cephadm
+    ceph orch set backend cephadm
+}
+
+check_cluster_status() {
+    echo "Checking cluster status..."
+    ceph -s
+}
+
+set_balancer_delay() {
+    echo "Setting balancer module load delay..."
+    ceph config set mgr mgr_module_load_delay_name balancer
+    ceph config set mgr mgr_module_load_delay 10000
+}
+
+test_loading_time() {
+    echo "Testing with module load delay of 10000 ms..."
+    ceph mgr fail
+
+    local orch_status_output
+    if ! orch_status_output=$(ceph orch status 2>&1); then
+        echo "FAIL: 'ceph orch status' failed to run:"
+        echo "$orch_status_output"
+        exit 1
+    fi
+
+    echo "$orch_status_output"
+
+    if [[ "$orch_status_output" == *"Backend: cephadm"* ]]; then
+        echo "PASS: Excess loading time was properly supported."
+    elif [[ "$orch_status_output" == *"Error ENOTSUP: Warning: due to ceph-mgr restart, some PG states may not be up to date"* ]]; then
+        echo "FAIL: Excess loading time was not properly supported."
+        exit 1
+    else
+        echo "FAIL: Unexpected error in 'ceph orch status':"
+        echo "$orch_status_output"
+        exit 1
+    fi
+}
+
+main() {
+    setup_cephadm || return 1
+    check_cluster_status || return 1
+    set_balancer_delay || return 1
+    test_loading_time || return 1
+}
+
+main "$@"
diff --git a/src/common/options/mgr.yaml.in b/src/common/options/mgr.yaml.in

index 5b923e507dab9d379276ef4ee4bd26e63ac6aaa5..87f6c36cca1e86c2d03202c4cd83a06b2665853a 100644 (file)
--- a/src/common/options/mgr.yaml.in
+++ b/src/common/options/mgr.yaml.in
@@ -160,6 +160,29 @@ options:
    flags:
    - no_mon_update
    - cluster_create
+- name: mgr_module_load_delay
+  type: millisecs
+  level: dev
+  default: 0
+  desc: Number of milliseconds for Manager modules to delay loading. For testing purposes only.
+  long_desc: Number of milliseconds for Manager modules to delay loading. For testing purposes only.
+  see_also:
+  - mgr_module_load_delay_name
+  flags:
+  - runtime
+  services:
+  - mgr
+- name: mgr_module_load_delay_name
+  type: str
+  level: dev
+  desc: Specify which Manager module is to delay loading by mgr_module_load_delay milliseconds. For testing purposes only.
+  long_desc: Specify which Manager module is to delay loading by mgr_module_load_delay milliseconds. For testing purposes only.
+  see_also:
+  - mgr_module_load_delay
+  flags:
+  - runtime
+  services:
+  - mgr
  - name: cephadm_path
    type: str
    level: advanced
diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc

index 3a9e8620a6082a631e5291eec1c657d9ecb01755..adaa422754b80741e3e25ed46c6e27e36828bdde 100644 (file)
--- a/src/mgr/ActivePyModules.cc
+++ b/src/mgr/ActivePyModules.cc
@@ -537,6 +537,14 @@ void ActivePyModules::start_one(PyModuleRef py_module)
    // Send all python calls down a Finisher to avoid blocking
    // C++ code, and avoid any potential lock cycles.
    finisher.queue(new LambdaContext([this, active_module, name](int) {
+    // Delay loading in testing scenarios
+    auto delay = g_conf().get_val<std::chrono::milliseconds>("mgr_module_load_delay");
+    std::string delayed_module = g_conf().get_val<std::string>("mgr_module_load_delay_name");
+    if ((name == delayed_module) && (delay > std::chrono::milliseconds{0})) {
+      dout(4) << "Delaying load time for module '" << name
+              << "' by " << delay << "..." << dendl;
+      std::this_thread::sleep_for(delay);
+    }
      int r = active_module->load(this);
      std::lock_guard l(lock);
      pending_modules.erase(name);
@@ -553,6 +561,12 @@ void ActivePyModules::start_one(PyModuleRef py_module)
          << active_module->get_fin_thread_name() << dendl;
        active_module->finisher.start();
      }
+
+    // Signal when we're finally done starting up modules
+    if (pending_modules.empty() && recheck_modules_start) {
+      finisher.queue(recheck_modules_start);
+      recheck_modules_start = nullptr;
+    }
    }));
  }
  
@@ -1758,3 +1772,13 @@ PyObject* ActivePyModules::get_daemon_health_metrics()
        return f.get();
    });
  }
+
+void ActivePyModules::check_all_modules_started(Context *modules_start_complete) {
+  std::lock_guard l(lock);
+  if (pending_modules.empty()) {
+    // Modules are already done starting, signal completion right away
+    finisher.queue(modules_start_complete);
+  } else {
+    recheck_modules_start = modules_start_complete; // signal that we need to check again later
+  }
+}
diff --git a/src/mgr/ActivePyModules.h b/src/mgr/ActivePyModules.h

index 31ebb0c71e637681aee39230158d47a7d86bdc61..f17113390382ce1a2291c0b5942f994059207f86 100644 (file)
--- a/src/mgr/ActivePyModules.h
+++ b/src/mgr/ActivePyModules.h
@@ -46,6 +46,11 @@ class ActivePyModules
  {
    // module class instances not yet created
    std::set<std::string, std::less<>> pending_modules;
+  // This context is set during mgr initialization when
+  // modules need more time to finish starting up.
+  // See ActivePyModules::check_all_modules_started() and
+  // ActivePyModules::start_one().
+  Context *recheck_modules_start = nullptr;
    // module class instances already created
    std::map<std::string, std::shared_ptr<ActivePyModule>> modules;
    PyModuleConfig &module_config;
@@ -251,5 +256,11 @@ public:
  
    bool inject_python_on() const;
    void update_cache_metrics();
+  // Sends the "active" beacon right away if all mgr modules
+  // have finished startup. If some modules are still pending
+  // startup, the "active" beacon is scheduled to send later
+  // only after all modules are ready.
+  // See "PyModuleRegistry::check_all_modules_started()".
+  void check_all_modules_started(Context *modules_start_complete);
  };
  
diff --git a/src/mgr/Mgr.cc b/src/mgr/Mgr.cc

index 418e8ae984aee0c5fe3d402d7dee0645adb7e760..bde8732ae66ed94f6a7ff5fbc6a2b53de299e91d 100644 (file)
--- a/src/mgr/Mgr.cc
+++ b/src/mgr/Mgr.cc
@@ -171,7 +171,15 @@ void Mgr::background_init(Context *completion)
  
    finisher.queue(new LambdaContext([this, completion](int r){
      init();
-    completion->complete(0);
+    py_module_registry->check_all_modules_started(
+       new LambdaContext([this, completion](int){
+         {
+           std::lock_guard l(lock);
+           initializing = false;
+           initialized = true;
+         }
+       completion->complete(0);
+      }));
    }));
  }
  
@@ -389,8 +397,6 @@ void Mgr::init()
  #endif
  
    dout(4) << "Complete." << dendl;
-  initializing = false;
-  initialized = true;
  }
  
  void Mgr::load_all_metadata()
diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc

index 42555289b4f07fbed14f75a0a34ffd92d1b0b6e8..e4e0b3cd6b79c0f0166f1b6d39d8a3090cc0ca96 100644 (file)
--- a/src/mgr/PyModuleRegistry.cc
+++ b/src/mgr/PyModuleRegistry.cc
@@ -507,3 +507,7 @@ void PyModuleRegistry::handle_config_notify()
      active_modules->config_notify();
    }
  }
+
+void PyModuleRegistry::check_all_modules_started(Context *modules_start_complete) {
+  active_modules->check_all_modules_started(modules_start_complete);
+}
diff --git a/src/mgr/PyModuleRegistry.h b/src/mgr/PyModuleRegistry.h

index a89ae6c380fb8cc72beb1e0e23870c7f0f51abdf..ba5a61d01b2bc95d00598031cd51801a694d7c0e 100644 (file)
--- a/src/mgr/PyModuleRegistry.h
+++ b/src/mgr/PyModuleRegistry.h
@@ -238,5 +238,12 @@ public:
      return active_modules->get_module_finisher(name);
    }
  
+  // Sends the "active" beacon right away if all mgr modules
+  // have finished startup. If some modules are still pending
+  // startup, the "active" beacon is scheduled to send later
+  // after all modules are ready.
+  // See "Mgr::background_init()".
+  void check_all_modules_started(Context *modules_start_complete);
+
    // <<< (end of ActivePyModules cheeky call-throughs)
  };
author	Laura Flores <lflores@ibm.com>
	Fri, 25 Apr 2025 22:11:19 +0000 (22:11 +0000)
committer	Laura Flores <lflores@ibm.com>
	Tue, 21 Apr 2026 15:56:37 +0000 (10:56 -0500)
doc/mgr/administrator.rst		patch \| blob \| history
qa/suites/rados/mgr/tasks/4-units/mgr_module_loading_time.yaml	[new file with mode: 0644]	patch \| blob
qa/workunits/mgr/test_mgr_module_loading_time.sh	[new file with mode: 0755]	patch \| blob
src/common/options/mgr.yaml.in		patch \| blob \| history
src/mgr/ActivePyModules.cc		patch \| blob \| history
src/mgr/ActivePyModules.h		patch \| blob \| history
src/mgr/Mgr.cc		patch \| blob \| history
src/mgr/PyModuleRegistry.cc		patch \| blob \| history
src/mgr/PyModuleRegistry.h		patch \| blob \| history