git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
nuke: bring stale kernel client handling back
author Ilya Dryomov <idryomov@gmail.com>
Wed, 1 Feb 2017 19:37:49 +0000 (20:37 +0100)
committer Ilya Dryomov <idryomov@gmail.com>
Wed, 1 Feb 2017 20:26:45 +0000 (21:26 +0100)
Commit 1d47a121b385 ("Fix nuke, redo some cleanup functions") broke
stale kernel client map/mount handling by dropping reboot arguments.
While for kcephfs we can use 'umount -f' to avoid sync (it used to not
work, but is mostly fixed now, I believe), currently there is nothing
we can do for a local filesystem mounted on top of krbd.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
teuthology/nuke/actions.py

index 2de8b49a7d6a0389a08ccd7148ba1ecef7486465..d3a3a53d5e69ef5f720b0406c5871db913d161f9 100644 (file)
@@ -151,28 +151,36 @@ def remove_osd_tmpfs(ctx):
     )
 
 
+def stale_kernel_mount(remote):
+    proc = remote.run(
+        args=[
+            'grep', '-q', ' ceph ', '/etc/mtab',
+            run.Raw('||'),
+            'grep', '-q', '^/dev/rbd', '/etc/mtab',
+        ],
+        check_status=False
+    )
+    # grep exits with 1 if no lines were selected
+    return proc.exitstatus != 1
+
+
 def reboot(ctx, remotes):
-    nodes = {}
     for remote in remotes:
-        log.info('rebooting %s', remote.name)
+        if stale_kernel_mount(remote):
+            log.warn('Stale kernel mount on %s!', remote.name)
+            log.info('force/no-sync rebooting %s', remote.name)
+            args = ['sync', run.Raw('&'),
+                    'sleep', '5', run.Raw(';'),
+                    'sudo', 'reboot', '-f', '-n']
+        else:
+            log.info('rebooting %s', remote.name)
+            args = ['sudo', 'reboot']
         try:
-            proc = remote.run(
-                args=[
-                    'sync',
-                    run.Raw('&'),
-                    'sleep', '5',
-                    run.Raw(';'),
-                    'sudo', 'reboot',
-                    ],
-                wait=False,
-                )
+            remote.run(args=args, wait=False)
         except Exception:
             log.exception('ignoring exception during reboot command')
-        nodes[remote] = proc
         # we just ignore these procs because reboot -f doesn't actually
         # send anything back to the ssh client!
-        # for remote, proc in nodes.iteritems():
-        # proc.wait()
     if remotes:
         log.info('waiting for nodes to reboot')
         time.sleep(8)  # if we try and reconnect too quickly, it succeeds!