From: Sage Weil <sage@redhat.com>
Date: Tue, 10 Mar 2020 14:28:57 +0000 (-0500)
Subject: cephadm: bootstrap: wait for mgr to restart after enabling a module
X-Git-Tag: v15.1.1~45^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c565db4595e8dd041f8e28ebadd6a8f40de265e4;p=ceph.git

cephadm: bootstrap: wait for mgr to restart after enabling a module

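Previously it was possible to enable a module (which causes the mon to
update the mgrmap) and then issue a mgr command that reached the mgr
before it had received the latest mgrmap and restarted.

After enabling the cephadm and dashboard modules, bootstrap now waits
until the mgr reports a mgrmap epoch at least as new as the one the
mon has before it sends further mgr commands.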

Fixes: https://tracker.ceph.com/issues/44531
Signed-off-by: Sage Weil <sage@redhat.com>
---

diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm
index 0bb96ef3041..4f35d4284b7 100755
--- a/src/cephadm/cephadm
+++ b/src/cephadm/cephadm
@@ -1999,10 +1999,31 @@ def command_bootstrap():
         return j.get('mgrmap', {}).get('available', False)
     is_available('mgr', is_mgr_available)
 
+    # wait for mgr to restart (after enabling a module)
+    def wait_for_mgr_restart():
+        # first get latest mgrmap epoch from the mon
+        out = cli(['mgr', 'dump'])
+        j = json.loads(out)
+        epoch = j['epoch']
+        # wait for mgr to have it
+        logger.info('Waiting for the mgr to restart...')
+        def mgr_has_latest_epoch():
+            # type: () -> bool
+            try:
+                out = cli(['tell', 'mgr', 'mgr_status'])
+                j = json.loads(out)
+                return j['mgrmap_epoch'] >= epoch
+            except Exception as e:
+                logger.debug('tell mgr mgr_status failed: %s' % e)
+                return False
+        is_available('Mgr epoch %d' % epoch, mgr_has_latest_epoch)
+
     # ssh
     if not args.skip_ssh:
         logger.info('Enabling cephadm module...')
         cli(['mgr', 'module', 'enable', 'cephadm'])
+        wait_for_mgr_restart()
+
         logger.info('Setting orchestrator backend to cephadm...')
         cli(['orch', 'set', 'backend', 'cephadm'])
 
@@ -2042,22 +2063,7 @@ def command_bootstrap():
     if not args.skip_dashboard:
         logger.info('Enabling the dashboard module...')
         cli(['mgr', 'module', 'enable', 'dashboard'])
-
-        # wait for the service to become available
-        logger.info('Waiting for the dashboard to start...')
-        def is_dashboard_available():
-            # type: () -> bool
-            timeout=args.timeout if args.timeout else 30 # seconds
-            try:
-                out = cli(['-h'], timeout=timeout)
-                return 'dashboard' in out
-            except RuntimeError as e:
-                # sometimes -h command times out/errors out
-                logger.debug('Command errored out: %s' % e)
-                return False
-        is_available('Dashboard', is_dashboard_available)
-
-
+        wait_for_mgr_restart()
 
         # dashboard crt and key
         if args.dashboard_key and args.dashboard_crt: