quorum. Setting it to false however would allow the
task to run with as many as just one single monitor.
(default: True)
+ freeze_mon_probability: how often to freeze the mon instead of killing it,
+ in % (default: 0)
+ freeze_mon_duration: how many seconds to freeze the mon (default: 15)
scrub Scrub after each iteration (default: True)
Note: if 'store-thrash' is set to True, then 'maintain-quorum' must also
- mon_thrash:
revive_delay: 20
thrash_delay: 1
- store_thrash: true
- store_thrash_probability: 40
+ thrash_store: true
+ thrash_store_probability: 40
seed: 31337
maintain_quorum: true
thrash_many: true
self.scrub = self.config.get('scrub', True)
+ self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+ self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
+
assert self.max_killable() > 0, \
'Unable to kill at least one monitor with the current config.'
def should_thrash_store(self):
if not self.store_thrash:
return False
- return self.rng.randrange(0,101) >= self.store_thrash_probability
+ return self.rng.randrange(0,101) < self.store_thrash_probability
def thrash_store(self, mon):
addr = self.ctx.ceph.conf['mon.%s' % mon]['mon addr']
'error forcing store sync on mon.{id}:\n{ret}'.format(
id=mon,ret=out)
+ def should_freeze_mon(self):
+ return self.rng.randrange(0,101) < self.freeze_mon_probability
+
+ def freeze_mon(self, mon):
+ log.info('Sending STOP to mon %s', mon)
+ self.manager.signal_mon(mon, 19) # STOP
+
+ def unfreeze_mon(self, mon):
+ log.info('Sending CONT to mon %s', mon)
+ self.manager.signal_mon(mon, 18) # CONT
+
def kill_mon(self, mon):
self.log('killing mon.{id}'.format(id=mon))
self.manager.kill_mon(mon)
def do_thrash(self):
self.log('start thrashing')
self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
- 'thrash many: {tm}, maintain quorum: {mq} '\
- 'store thrash: {st}, probability: {stp}'.format(
+ 'thrash many: {tm}, maintain quorum: {mq} '\
+ 'store thrash: {st}, probability: {stp} '\
+ 'freeze mon: prob {fp} duration {fd}'.format(
s=self.random_seed,r=self.revive_delay,t=self.thrash_delay,
tm=self.thrash_many, mq=self.maintain_quorum,
- st=self.store_thrash,stp=self.store_thrash_probability))
+ st=self.store_thrash,stp=self.store_thrash_probability,
+ fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
+ ))
while not self.stopping:
mons = _get_mons(self.ctx)
mons_to_kill = self.rng.sample(mons, kill_up_to)
self.log('monitors to thrash: {m}'.format(m=mons_to_kill))
+ mons_to_freeze = []
+ for mon in mons:
+ if mon in mons_to_kill:
+ continue
+ if self.should_freeze_mon():
+ mons_to_freeze.append(mon)
+ self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))
+
for mon in mons_to_kill:
self.log('thrashing mon.{m}'.format(m=mon))
self.kill_mon(mon)
+ if mons_to_freeze:
+ for mon in mons_to_freeze:
+ self.freeze_mon(mon)
+ self.log('waiting for {delay} secs to unfreeze mons'.format(
+ delay=self.freeze_mon_duration))
+ time.sleep(self.freeze_mon_duration)
+ for mon in mons_to_freeze:
+ self.unfreeze_mon(mon)
+
if self.maintain_quorum:
self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill))
for m in mons:
for mon in mons_to_kill:
self.revive_mon(mon)
+ # do more freezes
+ if mons_to_freeze:
+ for mon in mons_to_freeze:
+ self.freeze_mon(mon)
+ self.log('waiting for {delay} secs to unfreeze mons'.format(
+ delay=self.freeze_mon_duration))
+ time.sleep(self.freeze_mon_duration)
+ for mon in mons_to_freeze:
+ self.unfreeze_mon(mon)
+
self.manager.wait_for_mon_quorum_size(len(mons))
for m in mons:
s = self.manager.get_mon_status(m)