]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph: delay raising exceptions until all daemons are stopped
author Josh Durgin <josh.durgin@dreamhost.com>
Thu, 2 Feb 2012 17:26:25 +0000 (09:26 -0800)
committer Josh Durgin <josh.durgin@dreamhost.com>
Thu, 2 Feb 2012 17:26:25 +0000 (09:26 -0800)
If a daemon crashes, the exception is raised when we stop it. This
caused some daemons to continue running during cleanup, since the rest
of the daemons of the same type would not be shut down. Also log each
daemon that crashed, for easier debugging.

Fixes: #1744
teuthology/task/ceph.py

index 4af35022ba81f4678e6f614036b2f8d8c5176e2f..6acb3ccde520427f90a64f77423fd69bf4a61417 100644 (file)
@@ -7,6 +7,7 @@ import logging
 import os
 import shutil
 import subprocess
+import sys
 import tempfile
 
 from teuthology import misc as teuthology
@@ -31,15 +32,24 @@ class DaemonState(object):
             self.logger.info("%s.%s: %s"%(self.role, self.id_, msg))
 
     def stop(self):
-        if self.proc is not None:
-            self.proc.stdin.close()
-            run.wait([self.proc])
-            self.proc = None
-            self.log("Stopped")
+        """
+        Note: this can raise a run.CommandFailedError,
+        run.CommandCrashedError, or run.ConnectionLostError.
+        """
+        if not self.running():
+            self.log('tried to stop a non-running daemon')
+            return
+        self.proc.stdin.close()
+        self.log('waiting for process to exit')
+        run.wait([self.proc])
+        self.proc = None
+        self.log("Stopped")
 
     def restart(self):
         self.log("Restarting")
-        self.stop()
+        if self.proc is not None:
+            self.log("stopping old one...")
+            self.stop()
         self.proc = self.remote.run(*self.command_args, **self.command_kwargs)
         self.log("Started")
 
@@ -795,8 +805,17 @@ def run_daemon(ctx, config, type):
         yield
     finally:
         log.info('Shutting down %s daemons...' % type)
-        [i.stop() for i in ctx.daemons.iter_daemons_of_role(type)]
-
+        exc_info = (None, None, None)
+        for daemon in ctx.daemons.iter_daemons_of_role(type):
+            try:
+                daemon.stop()
+            except (run.CommandFailedError,
+                    run.CommandCrashedError,
+                    run.ConnectionLostError):
+                exc_info = sys.exc_info()
+                log.exception('Saw exception from %s.%s', daemon.role, daemon.id_)
+        if exc_info != (None, None, None):
+            raise exc_info[0], exc_info[1], exc_info[2]
 
 def healthy(ctx, config):
     log.info('Waiting until ceph is healthy...')