From a0457492ccaaf1cc3342f5284459402563c6224a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 5 Jul 2013 18:04:40 -0700 Subject: [PATCH] mon_thrasher: add pause/unpause of mons to thrashing This adds an additional element of lagginess to the cluster which should cause mons to call new elections. Signed-off-by: Sage Weil --- teuthology/task/ceph_manager.py | 3 ++ teuthology/task/mon_thrash.py | 59 +++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py index bfa867b09a6c1..a82e648777465 100644 --- a/teuthology/task/ceph_manager.py +++ b/teuthology/task/ceph_manager.py @@ -872,6 +872,9 @@ class CephManager: ## monitors + def signal_mon(self, mon, sig): + self.ctx.daemons.get_daemon('mon', mon).signal(sig) + def kill_mon(self, mon): if self.config.get('powercycle'): (remote,) = self.ctx.cluster.only('mon.{m}'.format(m=mon)).remotes.iterkeys() diff --git a/teuthology/task/mon_thrash.py b/teuthology/task/mon_thrash.py index 122c26d1818e9..e56d8761ffb60 100644 --- a/teuthology/task/mon_thrash.py +++ b/teuthology/task/mon_thrash.py @@ -50,6 +50,9 @@ class MonitorThrasher: quorum. Setting it to false however would allow the task to run with as many as just one single monitor. 
(default: True) + freeze_mon_probability: how often to freeze the mon instead of killing it, + in % (default: 10) + freeze_mon_duration: how many seconds to freeze the mon (default: 15) scrub Scrub after each iteration (default: True) Note: if 'store-thrash' is set to True, then 'maintain-quorum' must also @@ -62,8 +65,8 @@ class MonitorThrasher: - mon_thrash: revive_delay: 20 thrash_delay: 1 - store_thrash: true - store_thrash_probability: 40 + thrash_store: true + thrash_store_probability: 40 seed: 31337 maintain_quorum: true thrash_many: true @@ -103,6 +106,9 @@ class MonitorThrasher: self.scrub = self.config.get('scrub', True) + self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10)) + self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0)) + assert self.max_killable() > 0, \ 'Unable to kill at least one monitor with the current config.' @@ -128,7 +134,7 @@ class MonitorThrasher: def should_thrash_store(self): if not self.store_thrash: return False - return self.rng.randrange(0,101) >= self.store_thrash_probability + return self.rng.randrange(0,101) < self.store_thrash_probability def thrash_store(self, mon): addr = self.ctx.ceph.conf['mon.%s' % mon]['mon addr'] @@ -139,6 +145,17 @@ class MonitorThrasher: 'error forcing store sync on mon.{id}:\n{ret}'.format( id=mon,ret=out) + def should_freeze_mon(self): + return self.rng.randrange(0,101) < self.freeze_mon_probability + + def freeze_mon(self, mon): + log.info('Sending STOP to mon %s', mon) + self.manager.signal_mon(mon, 19) # STOP + + def unfreeze_mon(self, mon): + log.info('Sending CONT to mon %s', mon) + self.manager.signal_mon(mon, 18) # CONT + def kill_mon(self, mon): self.log('killing mon.{id}'.format(id=mon)) self.manager.kill_mon(mon) @@ -157,11 +174,14 @@ class MonitorThrasher: def do_thrash(self): self.log('start thrashing') self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\ - 'thrash many: {tm}, maintain quorum: {mq} '\ - 'store thrash: {st}, 
probability: {stp}'.format( + 'thrash many: {tm}, maintain quorum: {mq} '\ + 'store thrash: {st}, probability: {stp} '\ + 'freeze mon: prob {fp} duration {fd}'.format( s=self.random_seed,r=self.revive_delay,t=self.thrash_delay, tm=self.thrash_many, mq=self.maintain_quorum, - st=self.store_thrash,stp=self.store_thrash_probability)) + st=self.store_thrash,stp=self.store_thrash_probability, + fp=self.freeze_mon_probability,fd=self.freeze_mon_duration, + )) while not self.stopping: mons = _get_mons(self.ctx) @@ -176,6 +196,14 @@ class MonitorThrasher: mons_to_kill = self.rng.sample(mons, kill_up_to) self.log('monitors to thrash: {m}'.format(m=mons_to_kill)) + mons_to_freeze = [] + for mon in mons: + if mon in mons_to_kill: + continue + if self.should_freeze_mon(): + mons_to_freeze.append(mon) + self.log('monitors to freeze: {m}'.format(m=mons_to_freeze)) + for mon in mons_to_kill: self.log('thrashing mon.{m}'.format(m=mon)) @@ -185,6 +213,15 @@ class MonitorThrasher: self.kill_mon(mon) + if mons_to_freeze: + for mon in mons_to_freeze: + self.freeze_mon(mon) + self.log('waiting for {delay} secs to unfreeze mons'.format( + delay=self.freeze_mon_duration)) + time.sleep(self.freeze_mon_duration) + for mon in mons_to_freeze: + self.unfreeze_mon(mon) + if self.maintain_quorum: self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill)) for m in mons: @@ -201,6 +238,16 @@ class MonitorThrasher: for mon in mons_to_kill: self.revive_mon(mon) + # do more freezes + if mons_to_freeze: + for mon in mons_to_freeze: + self.freeze_mon(mon) + self.log('waiting for {delay} secs to unfreeze mons'.format( + delay=self.freeze_mon_duration)) + time.sleep(self.freeze_mon_duration) + for mon in mons_to_freeze: + self.unfreeze_mon(mon) + self.manager.wait_for_mon_quorum_size(len(mons)) for m in mons: s = self.manager.get_mon_status(m) -- 2.39.5