From 14b78e45383f77dc18de5d31a36e0220c8cd13ad Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Fri, 29 Jan 2021 09:54:10 +0000 Subject: [PATCH] rbd: improve Windows remap failure handling At the moment, if an image can't be remapped when the centralized RBD service starts, the service will stop and already started daemons will continue running. This change adds a new option: "--remap-failure-fatal". If set, when an image can't be remmaped, the service stops AND cleans up the running daemons. By default, an error will be logged and the service will continue running. Signed-off-by: Lucian Petrut --- src/common/win32/service.cc | 13 ++++++++++--- src/common/win32/service.h | 2 +- src/tools/rbd_wnbd/rbd_wnbd.cc | 25 ++++++++++++++++++++----- src/tools/rbd_wnbd/rbd_wnbd.h | 1 + 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/common/win32/service.cc b/src/common/win32/service.cc index 20ee5ccae6c5a..846c5d09dfaa8 100644 --- a/src/common/win32/service.cc +++ b/src/common/win32/service.cc @@ -74,14 +74,14 @@ void WINAPI ServiceBase::run() if (err) { lderr(s_service->cct) << "Failed to start service. Error code: " << err << dendl; - s_service->set_status(SERVICE_STOPPED); + s_service->shutdown(true); } else { ldout(s_service->cct, 5) << "Successfully started service." << dendl; s_service->set_status(SERVICE_RUNNING); } } -void ServiceBase::shutdown() +void ServiceBase::shutdown(bool ignore_errors) { DWORD original_state = status.dwCurrentState; set_status(SERVICE_STOP_PENDING); @@ -89,7 +89,14 @@ void ServiceBase::shutdown() int err = shutdown_hook(); if (err) { derr << "Shutdown service hook failed. Error code: " << err << dendl; - set_status(original_state); + if (ignore_errors) { + derr << "Ignoring shutdown hook failure, marking the service as stopped." + << dendl; + set_status(SERVICE_STOPPED); + } else { + derr << "Reverting to original service state." << dendl; + set_status(original_state); + } } else { dout(5) << "Shutdown hook completed." << dendl; set_status(SERVICE_STOPPED); diff --git a/src/common/win32/service.h b/src/common/win32/service.h index 5adcdea223f1c..20d37a876639a 100644 --- a/src/common/win32/service.h +++ b/src/common/win32/service.h @@ -24,7 +24,7 @@ protected: static void run(); static void control_handler(DWORD request); - void shutdown(); + void shutdown(bool ignore_errors = false); void stop(); void set_status(DWORD current_state, DWORD exit_code = NO_ERROR); diff --git a/src/tools/rbd_wnbd/rbd_wnbd.cc b/src/tools/rbd_wnbd/rbd_wnbd.cc index ce85ae94c01ea..5d45fa75fa66c 100644 --- a/src/tools/rbd_wnbd/rbd_wnbd.cc +++ b/src/tools/rbd_wnbd/rbd_wnbd.cc @@ -687,7 +687,7 @@ int disconnect_all_mappings( dout(5) << "Removing mapping: " << cfg.devpath << ". Timeout: " << cfg.soft_disconnect_timeout - << "ms. Hard disconnect: " << cfg.hard_disconnect + << "s. Hard disconnect: " << cfg.hard_disconnect << dendl; r = do_unmap(&cfg, unregister); @@ -718,19 +718,22 @@ class RBDService : public ServiceBase { int thread_count; int service_start_timeout; int image_map_timeout; + bool remap_failure_fatal; public: RBDService(bool _hard_disconnect, int _soft_disconnect_timeout, int _thread_count, int _service_start_timeout, - int _image_map_timeout) + int _image_map_timeout, + bool _remap_failure_fatal) : ServiceBase(g_ceph_context) , hard_disconnect(_hard_disconnect) , soft_disconnect_timeout(_soft_disconnect_timeout) , thread_count(_thread_count) , service_start_timeout(_service_start_timeout) , image_map_timeout(_image_map_timeout) + , remap_failure_fatal(_remap_failure_fatal) { } @@ -877,8 +880,14 @@ exit: // Restart registered mappings before accepting new ones. int r = restart_registered_mappings( thread_count, service_start_timeout, image_map_timeout); - if (r) - return r; + if (r) { + if (remap_failure_fatal) { + derr << "Couldn't remap all images. Cleaning up." << dendl; + return r; + } else { + dout(0) << "Ignoring image remap failure." << dendl; + } + } return create_pipe_server(); } @@ -937,6 +946,9 @@ Service options: unmapping images. Default: 8 --start-timeout The service start timeout in seconds. Default: 120 --map-timeout Individual image map timeout in seconds. Default: 20 + --remap-failure-fatal If set, the service will stop when failing to remap + an image at start time, unmapping images that have + been mapped so far. Show|List options: --format plain|json|xml Output format (default: plain) @@ -1424,6 +1436,8 @@ static int parse_args(std::vector& args, cfg->persistent = false; } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) { cfg->pretty_format = true; + } else if (ceph_argparse_flag(args, i, "--remap-failure-fatal", (char *)NULL)) { + cfg->remap_failure_fatal = true; } else if (ceph_argparse_witharg(args, i, &cfg->parent_pipe, err, "--pipe-name", (char *)NULL)) { if (!err.str().empty()) { @@ -1638,7 +1652,8 @@ static int rbd_wnbd(int argc, const char *argv[]) RBDService service(cfg.hard_disconnect, cfg.soft_disconnect_timeout, cfg.service_thread_count, cfg.service_start_timeout, - cfg.image_map_timeout); + cfg.image_map_timeout, + cfg.remap_failure_fatal); // This call will block until the service stops. r = RBDService::initialize(&service); if (r < 0) diff --git a/src/tools/rbd_wnbd/rbd_wnbd.h b/src/tools/rbd_wnbd/rbd_wnbd.h index 62cb1dd6a3882..d17eb792b0ac0 100644 --- a/src/tools/rbd_wnbd/rbd_wnbd.h +++ b/src/tools/rbd_wnbd/rbd_wnbd.h @@ -66,6 +66,7 @@ struct Config { int service_start_timeout = DEFAULT_SERVICE_START_TIMEOUT; int image_map_timeout = DEFAULT_IMAGE_MAP_TIMEOUT; + bool remap_failure_fatal = false; // TODO: consider moving those fields to a separate structure. Those // provide connection information without actually being configurable. -- 2.39.5