]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
rbd: improve Windows remap failure handling
authorLucian Petrut <lpetrut@cloudbasesolutions.com>
Fri, 29 Jan 2021 09:54:10 +0000 (09:54 +0000)
committerLucian Petrut <lpetrut@cloudbasesolutions.com>
Wed, 3 Feb 2021 07:20:46 +0000 (07:20 +0000)
At the moment, if an image can't be remapped when the centralized
RBD service starts, the service will stop and already started
daemons will continue running.

This change adds a new option: "--remap-failure-fatal". If set,
when an image can't be remmaped, the service stops AND cleans up
the running daemons. By default, an error will be logged and the
service will continue running.

Signed-off-by: Lucian Petrut <lpetrut@cloudbasesolutions.com>
src/common/win32/service.cc
src/common/win32/service.h
src/tools/rbd_wnbd/rbd_wnbd.cc
src/tools/rbd_wnbd/rbd_wnbd.h

index 20ee5ccae6c5a24f167864def2b24766b56e0378..846c5d09dfaa8859e1a0c6fd586042418a0c5c8c 100644 (file)
@@ -74,14 +74,14 @@ void WINAPI ServiceBase::run()
   if (err) {
     lderr(s_service->cct) << "Failed to start service. Error code: "
                           << err << dendl;
-    s_service->set_status(SERVICE_STOPPED);
+    s_service->shutdown(true);
   } else {
     ldout(s_service->cct, 5) << "Successfully started service." << dendl;
     s_service->set_status(SERVICE_RUNNING);
   }
 }
 
-void ServiceBase::shutdown()
+void ServiceBase::shutdown(bool ignore_errors)
 {
   DWORD original_state = status.dwCurrentState;
   set_status(SERVICE_STOP_PENDING);
@@ -89,7 +89,14 @@ void ServiceBase::shutdown()
   int err = shutdown_hook();
   if (err) {
     derr << "Shutdown service hook failed. Error code: " << err << dendl;
-    set_status(original_state);
+    if (ignore_errors) {
+      derr << "Ignoring shutdown hook failure, marking the service as stopped."
+           << dendl;
+      set_status(SERVICE_STOPPED);
+    } else {
+      derr << "Reverting to original service state." << dendl;
+      set_status(original_state);
+    }
   } else {
     dout(5) << "Shutdown hook completed." << dendl;
     set_status(SERVICE_STOPPED);
index 5adcdea223f1caa5378b59a66bc97400f0fd6b5f..20d37a876639ad0d4f0c0cfb1c97e6c4ba434e71 100644 (file)
@@ -24,7 +24,7 @@ protected:
   static void run();
   static void control_handler(DWORD request);
 
-  void shutdown();
+  void shutdown(bool ignore_errors = false);
   void stop();
 
   void set_status(DWORD current_state, DWORD exit_code = NO_ERROR);
index ce85ae94c01ea5ba3f53576f8013495fa5c90ff8..5d45fa75fa66c09b22b345a5f52ddecfba3c684f 100644 (file)
@@ -687,7 +687,7 @@ int disconnect_all_mappings(
 
         dout(5) << "Removing mapping: " << cfg.devpath
                 << ". Timeout: " << cfg.soft_disconnect_timeout
-                << "ms. Hard disconnect: " << cfg.hard_disconnect
+                << "s. Hard disconnect: " << cfg.hard_disconnect
                 << dendl;
 
         r = do_unmap(&cfg, unregister);
@@ -718,19 +718,22 @@ class RBDService : public ServiceBase {
     int thread_count;
     int service_start_timeout;
     int image_map_timeout;
+    bool remap_failure_fatal;
 
   public:
     RBDService(bool _hard_disconnect,
                int _soft_disconnect_timeout,
                int _thread_count,
                int _service_start_timeout,
-               int _image_map_timeout)
+               int _image_map_timeout,
+               bool _remap_failure_fatal)
       : ServiceBase(g_ceph_context)
       , hard_disconnect(_hard_disconnect)
       , soft_disconnect_timeout(_soft_disconnect_timeout)
       , thread_count(_thread_count)
       , service_start_timeout(_service_start_timeout)
       , image_map_timeout(_image_map_timeout)
+      , remap_failure_fatal(_remap_failure_fatal)
     {
     }
 
@@ -877,8 +880,14 @@ exit:
       // Restart registered mappings before accepting new ones.
       int r = restart_registered_mappings(
         thread_count, service_start_timeout, image_map_timeout);
-      if (r)
-        return r;
+      if (r) {
+        if (remap_failure_fatal) {
+          derr << "Couldn't remap all images. Cleaning up." << dendl;
+          return r;
+        } else {
+          dout(0) << "Ignoring image remap failure." << dendl;
+        }
+      }
 
       return create_pipe_server();
     }
@@ -937,6 +946,9 @@ Service options:
                               unmapping images. Default: 8
   --start-timeout             The service start timeout in seconds. Default: 120
   --map-timeout               Individual image map timeout in seconds. Default: 20
+  --remap-failure-fatal       If set, the service will stop when failing to remap
+                              an image at start time, unmapping images that have
+                              been mapped so far.
 
 Show|List options:
   --format plain|json|xml Output format (default: plain)
@@ -1424,6 +1436,8 @@ static int parse_args(std::vector<const char*>& args,
       cfg->persistent = false;
     } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) {
       cfg->pretty_format = true;
+    } else if (ceph_argparse_flag(args, i, "--remap-failure-fatal", (char *)NULL)) {
+      cfg->remap_failure_fatal = true;
     } else if (ceph_argparse_witharg(args, i, &cfg->parent_pipe, err,
                                      "--pipe-name", (char *)NULL)) {
       if (!err.str().empty()) {
@@ -1638,7 +1652,8 @@ static int rbd_wnbd(int argc, const char *argv[])
       RBDService service(cfg.hard_disconnect, cfg.soft_disconnect_timeout,
                          cfg.service_thread_count,
                          cfg.service_start_timeout,
-                         cfg.image_map_timeout);
+                         cfg.image_map_timeout,
+                         cfg.remap_failure_fatal);
       // This call will block until the service stops.
       r = RBDService::initialize(&service);
       if (r < 0)
index 62cb1dd6a3882deb888c8ac4fe6c6ce490304f5b..d17eb792b0ac0bc37db1e28dec32a71a15de00ba 100644 (file)
@@ -66,6 +66,7 @@ struct Config {
 
   int service_start_timeout = DEFAULT_SERVICE_START_TIMEOUT;
   int image_map_timeout = DEFAULT_IMAGE_MAP_TIMEOUT;
+  bool remap_failure_fatal = false;
 
   // TODO: consider moving those fields to a separate structure. Those
   // provide connection information without actually being configurable.