From: Sage Weil Date: Tue, 27 Jun 2017 18:57:53 +0000 (-0400) Subject: qa/tasks/mon_clock_skew_check: vastly simplify X-Git-Tag: v12.1.1~58^2~28 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b25ddafa169e918a83706189e746f4a94a25c654;p=ceph.git qa/tasks/mon_clock_skew_check: vastly simplify Signed-off-by: Sage Weil --- diff --git a/qa/tasks/mon_clock_skew_check.py b/qa/tasks/mon_clock_skew_check.py index 891e6ec484e..547339f79a1 100644 --- a/qa/tasks/mon_clock_skew_check.py +++ b/qa/tasks/mon_clock_skew_check.py @@ -13,43 +13,19 @@ log = logging.getLogger(__name__) class ClockSkewCheck: """ - Periodically check if there are any clock skews among the monitors in the - quorum. By default, assume no skews are supposed to exist; that can be - changed using the 'expect-skew' option. If 'fail-on-skew' is set to false, - then we will always succeed and only report skews if any are found. - - This class does not spawn a thread. It assumes that, if that is indeed - wanted, it should be done by a third party (for instance, the task using - this class). We intend it as such in order to reuse this class if need be. + Check if there are any clock skews among the monitors in the + quorum. This task accepts the following options: - interval amount of seconds to wait in-between checks. (default: 30.0) - max-skew maximum skew, in seconds, that is considered tolerable before - issuing a warning. (default: 0.05) + interval amount of seconds to wait before check. (default: 30.0) expect-skew 'true' or 'false', to indicate whether to expect a skew during the run or not. If 'true', the test will fail if no skew is found, and succeed if a skew is indeed found; if 'false', it's the other way around. (default: false) - never-fail Don't fail the run if a skew is detected and we weren't - expecting it, or if no skew is detected and we were expecting - it. (default: False) - - at-least-once Runs at least once, even if we are told to stop. - (default: True) - at-least-once-timeout If we were told to stop but we are attempting to - run at least once, timeout after this many seconds. - (default: 600) - - Example: - Expect a skew higher than 0.05 seconds, but only report it without - failing the teuthology run. - mon_clock_skew_check: - interval: 30 - max-skew: 0.05 - expect_skew: true - never-fail: true + expect-skew: true """ def __init__(self, ctx, manager, config, logger): @@ -63,181 +39,15 @@ class ClockSkewCheck: if self.config is None: self.config = dict() - self.check_interval = float(self.config.get('interval', 30.0)) - - first_mon = teuthology.get_first_mon(ctx, config) - remote = ctx.cluster.only(first_mon).remotes.keys()[0] - proc = remote.run( - args=[ - 'sudo', - 'ceph-mon', - '-i', first_mon[4:], - '--show-config-value', 'mon_clock_drift_allowed' - ], stdout=StringIO(), wait=True - ) - self.max_skew = self.config.get('max-skew', float(proc.stdout.getvalue())) - - self.expect_skew = self.config.get('expect-skew', False) - self.never_fail = self.config.get('never-fail', False) - self.at_least_once = self.config.get('at-least-once', True) - self.at_least_once_timeout = self.config.get('at-least-once-timeout', 600.0) - - def info(self, x): - """ - locally define logger for info messages - """ - self.logger.info(x) - - def warn(self, x): - """ - locally define logger for warnings - """ - self.logger.warn(x) - - def debug(self, x): - """ - locally define logger for debug messages - """ - self.logger.info(x) - self.logger.debug(x) - - def finish(self): - """ - Break out of the do_check loop. - """ - self.stopping = True - - def sleep_interval(self): - """ - If a sleep interval is set, sleep for that amount of time. - """ - if self.check_interval > 0.0: - self.debug('sleeping for {s} seconds'.format( - s=self.check_interval)) - time.sleep(self.check_interval) - - def print_skews(self, skews): - """ - Display skew values. - """ - total = len(skews) - if total > 0: - self.info('---------- found {n} skews ----------'.format(n=total)) - for mon_id, values in skews.iteritems(): - self.info('mon.{id}: {v}'.format(id=mon_id, v=values)) - self.info('-------------------------------------') - else: - self.info('---------- no skews were found ----------') - - def do_check(self): - """ - Clock skew checker. Loops until finish() is called. - """ - self.info('start checking for clock skews') - skews = dict() - ran_once = False - - started_on = None - - while not self.stopping or (self.at_least_once and not ran_once): - - if self.at_least_once and not ran_once and self.stopping: - if started_on is None: - self.info('kicking-off timeout (if any)') - started_on = time.time() - elif self.at_least_once_timeout > 0.0: - assert time.time() - started_on < self.at_least_once_timeout, \ - 'failed to obtain a timecheck before timeout expired' - - quorum_size = len(teuthology.get_mon_names(self.ctx)) - self.manager.wait_for_mon_quorum_size(quorum_size) - - health = self.manager.get_mon_health(True) - timechecks = health['timechecks'] - - clean_check = False - if timechecks['round_status'] == 'finished': - assert (timechecks['round'] % 2) == 0, \ - 'timecheck marked as finished but round ' \ - 'disagrees (r {r})'.format( - r=timechecks['round']) - clean_check = True - else: - assert timechecks['round_status'] == 'on-going', \ - 'timecheck status expected \'on-going\' ' \ - 'but found \'{s}\' instead'.format( - s=timechecks['round_status']) - if 'mons' in timechecks.keys() and len(timechecks['mons']) > 1: - self.info('round still on-going, but there are available reports') - else: - self.info('no timechecks available just yet') - self.sleep_interval() - continue - - assert len(timechecks['mons']) > 1, \ - 'there are not enough reported timechecks; ' \ - 'expected > 1 found {n}'.format(n=len(timechecks['mons'])) - - for check in timechecks['mons']: - mon_skew = float(check['skew']) - mon_health = check['health'] - mon_id = check['name'] - if abs(mon_skew) > self.max_skew: - assert mon_health == 'HEALTH_WARN', \ - 'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format( - id=mon_id,health=mon_health,s=abs(mon_skew),ms=self.max_skew) - - log_str = 'mon.{id} with skew {s} > max {ms}'.format( - id=mon_id,s=abs(mon_skew),ms=self.max_skew) - - """ add to skew list """ - details = check['details'] - skews[mon_id] = {'skew': mon_skew, 'details': details} - - if self.expect_skew: - self.info('expected skew: {str}'.format(str=log_str)) - else: - self.warn('unexpected skew: {str}'.format(str=log_str)) - - if clean_check or (self.expect_skew and len(skews) > 0): - ran_once = True - self.print_skews(skews) - self.sleep_interval() - - total = len(skews) - self.print_skews(skews) - - error_str = '' - found_error = False - - if self.expect_skew: - if total == 0: - error_str = 'We were expecting a skew, but none was found!' - found_error = True - else: - if total > 0: - error_str = 'We were not expecting a skew, but we did find it!' - found_error = True - - if found_error: - self.info(error_str) - if not self.never_fail: - assert False, error_str - -@contextlib.contextmanager def task(ctx, config): - """ - Use clas ClockSkewCheck to check for clock skews on the monitors. - This task will spawn a thread running ClockSkewCheck's do_check(). - - All the configuration will be directly handled by ClockSkewCheck, - so please refer to the class documentation for further information. - """ if config is None: config = {} assert isinstance(config, dict), \ 'mon_clock_skew_check task only accepts a dict for configuration' + interval = float(config.get('interval', 30.0)) + expect_skew = config.get('expect-skew', False) + log.info('Beginning mon_clock_skew_check...') first_mon = teuthology.get_first_mon(ctx, config) (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() @@ -247,15 +57,20 @@ def task(ctx, config): logger=log.getChild('ceph_manager'), ) - skew_check = ClockSkewCheck(ctx, - manager, config, - logger=log.getChild('mon_clock_skew_check')) - skew_check_thread = gevent.spawn(skew_check.do_check) - try: - yield - finally: - log.info('joining mon_clock_skew_check') - skew_check.finish() - skew_check_thread.get() - + quorum_size = len(teuthology.get_mon_names(ctx)) + manager.wait_for_mon_quorum_size(quorum_size) + + # wait a bit + log.info('sleeping for {s} seconds'.format( + s=interval)) + time.sleep(interval) + + health = manager.get_mon_health(True) + log.info('got health %s' % health) + if expect_skew: + if 'MON_CLOCK_SKEW' not in health['checks']: + raise RuntimeError('expected MON_CLOCK_SKEW but got none') + else: + if 'MON_CLOCK_SKEW' in health['checks']: + raise RuntimeError('got MON_CLOCK_SKEW but expected none')