From: John Spray Date: Mon, 27 Nov 2017 12:42:37 +0000 (-0500) Subject: mgr: add health checks for failed modules X-Git-Tag: v13.0.2~448^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d9a47181c45eaad2312476f6565a20b21b09ff97;p=ceph.git mgr: add health checks for failed modules Signed-off-by: John Spray --- diff --git a/qa/suites/rados/mgr/tasks/module_selftest.yaml b/qa/suites/rados/mgr/tasks/module_selftest.yaml index ffdfe8be2c2..6c7ce49a681 100644 --- a/qa/suites/rados/mgr/tasks/module_selftest.yaml +++ b/qa/suites/rados/mgr/tasks/module_selftest.yaml @@ -14,6 +14,8 @@ tasks: - Reduced data availability - Degraded data redundancy - objects misplaced + - Synthetic exception in serve + - influxdb python module not found - cephfs_test_runner: modules: - tasks.mgr.test_module_selftest diff --git a/qa/tasks/mgr/mgr_test_case.py b/qa/tasks/mgr/mgr_test_case.py index 70791af7d7e..cc0222b487d 100644 --- a/qa/tasks/mgr/mgr_test_case.py +++ b/qa/tasks/mgr/mgr_test_case.py @@ -74,13 +74,23 @@ class MgrTestCase(CephTestCase): "{1} are required".format( len(self.mgr_cluster.mgr_ids), self.MGRS_REQUIRED)) - # Restart all the daemons + # Stop all the daemons for daemon in self.mgr_cluster.mgr_daemons.values(): daemon.stop() for mgr_id in self.mgr_cluster.mgr_ids: self.mgr_cluster.mgr_fail(mgr_id) + # Unload all non-default plugins + loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd( + "mgr", "module", "ls"))['enabled_modules'] + unload_modules = set(loaded) - {"status", "restful"} + + for m in unload_modules: + self.mgr_cluster.mon_manager.raw_cluster_cmd( + "mgr", "module", "disable", m) + + # Start all the daemons for daemon in self.mgr_cluster.mgr_daemons.values(): daemon.restart() diff --git a/qa/tasks/mgr/test_module_selftest.py b/qa/tasks/mgr/test_module_selftest.py index cd85a3e0d0a..a25c7d65fcb 100644 --- a/qa/tasks/mgr/test_module_selftest.py +++ b/qa/tasks/mgr/test_module_selftest.py @@ -119,3 +119,14 @@ class TestModuleSelftest(MgrTestCase): "mgr", "self-test", "run" ) self.assertEqual(exc_raised.exception.exitstatus, errno.EIO) + + # A health alert should be raised for a module that has thrown + # an exception from its serve() method + self.wait_for_health( + "Module 'selftest' has failed: Synthetic exception in serve", + timeout=30) + + self.mgr_cluster.mon_manager.raw_cluster_cmd( + "mgr", "module", "disable", "selftest") + + self.wait_for_health_clear(timeout=30) diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc index 5c114b9e8b6..9a8b2684931 100644 --- a/src/mgr/PyModuleRegistry.cc +++ b/src/mgr/PyModuleRegistry.cc @@ -290,3 +290,61 @@ std::vector PyModuleRegistry::get_commands() const return result; } +void PyModuleRegistry::get_health_checks(health_check_map_t *checks) +{ + Mutex::Locker l(lock); + + // Only the active mgr reports module issues + if (active_modules) { + active_modules->get_health_checks(checks); + + std::map dependency_modules; + std::map failed_modules; + + /* + * Break up broken modules into two categories: + * - can_run=false: the module is working fine but explicitly + * telling you that a dependency is missing. Advise the user to + * read the message from the module and install what's missing. + * - failed=true or loaded=false: something unexpected is broken, + * either at runtime (from serve()) or at load time. This indicates + * a bug and the user should be guided to inspect the mgr log + * to investigate and gather evidence. + */ + + for (const auto &i : modules) { + auto module = i.second; + if (module->is_enabled() && !module->get_can_run()) { + dependency_modules[module->get_name()] = module->get_error_string(); + } else if ((module->is_enabled() && !module->is_loaded()) + || module->is_failed()) { + failed_modules[module->get_name()] = module->get_error_string(); + } + } + + if (!dependency_modules.empty()) { + std::ostringstream ss; + if (dependency_modules.size() == 1) { + auto iter = dependency_modules.begin(); + ss << "Module '" << iter->first << "' has failed dependency: " + << iter->second; + } else if (dependency_modules.size() > 1) { + ss << dependency_modules.size() << " modules have failed dependencies"; + } + checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str()); + } + + if (!failed_modules.empty()) { + std::ostringstream ss; + if (failed_modules.size() == 1) { + auto iter = failed_modules.begin(); + ss << "Module '" << iter->first << "' has failed: " + << iter->second; + } else if (failed_modules.size() > 1) { + ss << failed_modules.size() << " modules have failed"; + } + checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str()); + } + } +} + diff --git a/src/mgr/PyModuleRegistry.h b/src/mgr/PyModuleRegistry.h index 397c46de720..1064c19b151 100644 --- a/src/mgr/PyModuleRegistry.h +++ b/src/mgr/PyModuleRegistry.h @@ -130,12 +130,27 @@ public: return modules.at(module_name); } + /** + * Pass through command to the named module for execution. + * + * The command must exist in the COMMANDS reported by the module. If it + * doesn't then this will abort. + * + * If ActivePyModules has not been instantiated yet then this will + * return EAGAIN. + */ int handle_command( std::string const &module_name, const cmdmap_t &cmdmap, std::stringstream *ds, std::stringstream *ss); + /** + * Pass through health checks reported by modules, and report any + * modules that have failed (i.e. unhandled exceptions in serve()) + */ + void get_health_checks(health_check_map_t *checks); + // FIXME: breaking interface so that I don't have to go rewrite all // the places that call into these (for now) // >>> @@ -154,11 +169,6 @@ public: } } - void get_health_checks(health_check_map_t *checks) - { - assert(active_modules); - active_modules->get_health_checks(checks); - } std::map get_services() const { assert(active_modules);