From: Sage Weil Date: Fri, 12 Oct 2018 13:38:05 +0000 (-0500) Subject: mgr/devicehealth: warn based on life_expectancy_max X-Git-Tag: v14.0.1~42^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=11bae94c436af7d7b635822fc34daa328b42be74;p=ceph.git mgr/devicehealth: warn based on life_expectancy_max The failure interval needs to be sufficiently precise that it establishes an approximate upper bound on the device life expectancy. Also, deal with the fact that the max value may be '0.000000' instead of ''. Signed-off-by: Sage Weil --- diff --git a/src/pybind/mgr/devicehealth/module.py b/src/pybind/mgr/devicehealth/module.py index 6f069137881a..579a9bf3295a 100644 --- a/src/pybind/mgr/devicehealth/module.py +++ b/src/pybind/mgr/devicehealth/module.py @@ -425,21 +425,24 @@ class Module(MgrModule): assert osdmap is not None for dev in devs['devices']: devid = dev['devid'] - if 'life_expectancy_min' not in dev: + if 'life_expectancy_max' not in dev: continue # ignore devices that are not consumed by any daemons if not dev['daemons']: continue + if not dev['life_expectancy_max'] or \ + dev['life_expectancy_max'] == '0.000000': + continue # life_expectancy_(min/max) is in the format of: # '%Y-%m-%d %H:%M:%S.%f', e.g.: # '2019-01-20 21:12:12.000000' - life_expectancy_min = datetime.strptime( - dev['life_expectancy_min'], + life_expectancy_max = datetime.strptime( + dev['life_expectancy_max'], '%Y-%m-%d %H:%M:%S.%f') - self.log.debug('device %s expectancy min %s', dev, - life_expectancy_min) + self.log.debug('device %s expectancy max %s', dev, + life_expectancy_max) - if life_expectancy_min - now <= mark_out_threshold_td: + if life_expectancy_max - now <= mark_out_threshold_td: if self.self_heal: # dev['daemons'] == ["osd.0","osd.1","osd.2"] if dev['daemons']: @@ -448,11 +451,11 @@ class Module(MgrModule): osd_ids = map(lambda x: x[4:], osds) for _id in osd_ids: if self.is_osd_in(osdmap, _id): - osds_in[_id] = life_expectancy_min + 
osds_in[_id] = life_expectancy_max else: osds_out[_id] = 1 - if life_expectancy_min - now <= warn_threshold_td: + if life_expectancy_max - now <= warn_threshold_td: # device can appear in more than one location in case # of SCSI multipath device_locations = map(lambda x: x['host'] + ':' + x['dev'], @@ -462,11 +465,8 @@ class Module(MgrModule): % (dev['devid'], ','.join(device_locations), ','.join(dev.get('daemons', ['none'])), - dev['life_expectancy_min'], + dev['life_expectancy_max'], dev.get('life_expectancy_max', 'unknown'))) - # TODO: by default, dev['life_expectancy_max'] == '0.000000', - # so dev.get('life_expectancy_max', 'unknown') - # above should be altered. # OSD might be marked 'out' (which means it has no # data), however PGs are still attached to it.