import logging
import random
import signal
+import socket
import time
from gevent import sleep
from gevent.event import Event
from teuthology import misc
+from teuthology.exceptions import CommandFailedError
from teuthology.task import Task
+from teuthology.orchestra import run
log = logging.getLogger(__name__)
continue
self.log('kill {label}'.format(label=daemon.id_))
- daemon.signal(signal.SIGTERM)
+ try:
+ daemon.signal(signal.SIGTERM)
+ except socket.error:
+ pass
killed_daemons.append(daemon)
stats['kill'] += 1
if killed_daemons:
# wait for a while before restarting
-
delay = self.max_revive_delay
if self.randomize:
delay = random.randrange(0.0, self.max_revive_delay)
for daemon in killed_daemons:
self.log('waiting for {label}'.format(label=daemon.id_))
- daemon.stop()
+ try:
+ run.wait([daemon.proc], timeout=600)
+ except CommandFailedError:
+ pass
+ except:
+ self.log('Failed to stop {label}'.format(label=daemon.id_))
+
+ try:
+ # try to capture a core dump
+ daemon.signal(signal.SIGABRT)
+ except socket.error:
+ pass
+ raise
+ finally:
+ daemon.reset()
+
for daemon in killed_daemons:
self.log('reviving {label}'.format(label=daemon.id_))
daemon.start()