-
"""
Device health monitoring
"""
DEFAULTS = {
'enable_monitoring': str(True),
'scrape_frequency': str(86400),
- 'retention_period': str(86400*14),
+ 'retention_period': str(86400 * 14),
'pool_name': 'device_health_metrics',
'mark_out_threshold': str(86400*14),
'warn_threshold': str(86400*14*2),
DEVICE_HEALTH_TOOMANY: 'Too many daemons are expected to fail soon',
}
+
class Module(MgrModule):
OPTIONS = [
- { 'name': 'enable_monitoring' },
- { 'name': 'scrape_frequency' },
- { 'name': 'pool_name' },
- { 'name': 'retention_period' },
- { 'name': 'mark_out_threshold' },
- { 'name': 'warn_threshold' },
- { 'name': 'self_heal' },
+ {'name': 'enable_monitoring'},
+ {'name': 'scrape_frequency'},
+ {'name': 'pool_name'},
+ {'name': 'retention_period'},
+ {'name': 'mark_out_threshold'},
+ {'name': 'warn_threshold'},
+ {'name': 'self_heal'},
]
COMMANDS = [
{
"cmd": "device scrape-daemon-health-metrics "
"name=who,type=CephString",
- "desc": "Scrape and store device health metrics for a given daemon",
+ "desc": "Scrape and store device health metrics "
+ "for a given daemon",
"perm": "r"
},
{
- "cmd": "device scrape-health-metrics name=devid,type=CephString,req=False",
+ "cmd": "device scrape-health-metrics "
+ "name=devid,type=CephString,req=False",
"desc": "Scrape and store health metrics",
"perm": "r"
},
{
- "cmd": "device show-health-metrics name=devid,type=CephString name=sample,type=CephString,req=False",
+ "cmd": "device show-health-metrics "
+ "name=devid,type=CephString "
+ "name=sample,type=CephString,req=False",
"desc": "Show stored device metrics for the device",
"perm": "r"
},
self.run = True
self.event = Event()
- def handle_command(self, inbuf, cmd):
+ def handle_command(self, _, cmd):
self.log.error("handle_command")
if cmd['prefix'] == 'device query-daemon-health-metrics':
who = cmd.get('who', '')
if who[0:4] != 'osd.':
- return (-errno.EINVAL, '', 'not a valid <osd.NNN> id')
+ return -errno.EINVAL, '', 'not a valid <osd.NNN> id'
osd_id = who[4:]
result = CommandResult('')
self.send_command(result, 'osd', osd_id, json.dumps({
'format': 'json',
}), '')
r, outb, outs = result.wait()
- return (r, outb, outs)
+ return r, outb, outs
elif cmd['prefix'] == 'device scrape-daemon-health-metrics':
who = cmd.get('who', '')
if who[0:4] != 'osd.':
- return (-errno.EINVAL, '', 'not a valid <osd.NNN> id')
- id = int(who[4:])
- return self.scrape_osd(id)
+ return -errno.EINVAL, '', 'not a valid <osd.NNN> id'
+ osd_id = int(who[4:])
+ return self.scrape_osd(osd_id)
elif cmd['prefix'] == 'device scrape-health-metrics':
if 'devid' in cmd:
return self.scrape_device(cmd['devid'])
- return self.scrape_all();
+ return self.scrape_all()
elif cmd['prefix'] == 'device show-health-metrics':
return self.show_device_metrics(cmd['devid'], cmd.get('sample'))
elif cmd['prefix'] == 'device check-health':
assert before != after
def refresh_config(self):
- self.enable_monitoring = self.get_config('enable_monitoring', '') is not '' or 'false'
+ self.enable_monitoring = self.get_config('enable_monitoring',
+ '') is not '' or 'false'
for opt, value in iteritems(DEFAULTS):
setattr(self, opt, self.get_config(opt) or value)
assert r == 0
ioctx = self.rados.open_ioctx(self.pool_name)
- return (ioctx)
+ return ioctx
def scrape_osd(self, osd_id):
ioctx = self.open_connection()
def scrape_device(self, devid):
r = self.get("device " + devid)
if not r or 'device' not in r.keys():
- return (-errno.ENOENT, '', 'device ' + devid + ' not found')
+ return -errno.ENOENT, '', 'device ' + devid + ' not found'
daemons = r['device'].get('daemons', [])
osds = [int(r[4:]) for r in daemons if r.startswith('osd.')]
if not osds:
return (-errno.EAGAIN, '',
- 'device ' + devid + ' not claimed by any active OSD daemons')
+ 'device ' + devid + ' not claimed by any active '
+ 'OSD daemons')
osd_id = osds[0]
ioctx = self.open_connection()
raw_smart_data = self.do_scrape_osd(osd_id, ioctx, devid=devid)
erase = []
try:
with rados.ReadOpCtx() as op:
- iter, ret = ioctx.get_omap_keys(op, "", 500) # fixme
+ iter, ret = ioctx.get_omap_keys(op, "", 500) # fixme
assert ret == 0
ioctx.operate_read_op(op, devid)
for key, _ in list(iter):
# verify device exists
r = self.get("device " + devid)
if not r or 'device' not in r.keys():
- return (-errno.ENOENT, '', 'device ' + devid + ' not found')
+ return -errno.ENOENT, '', 'device ' + devid + ' not found'
# fetch metrics
ioctx = self.open_connection()
res = {}
with rados.ReadOpCtx() as op:
- iter, ret = ioctx.get_omap_vals(op, "", sample or '', 500) # fixme
+ omap_iter, ret = ioctx.get_omap_vals(op, "", sample or '', 500) # fixme
assert ret == 0
try:
ioctx.operate_read_op(op, devid)
- for key, value in list(iter):
+ for key, value in list(omap_iter):
if sample and key != sample:
break
try:
if r != 0:
self.log.warn('Could not set osd.%s primary-affinity, r: [%s], outs: [%s]' % (osd_id, r, outb, outs))
-
def extract_smart_features(self, raw):
# FIXME: extract and normalize raw smartctl --json output and
# generate a dict of the fields we care about.