From 49adbc44df8ca8fa9d1e42986e8cd43558dc8688 Mon Sep 17 00:00:00 2001 From: Paul Cuzner Date: Sat, 19 Sep 2020 16:29:46 +1200 Subject: [PATCH] cephadm: check content from list_daemons is valid The response from list_daemons is now checked and the main loop bypasses dead threads to prevent repeated errors being added to the errors field within the health item. Signed-off-by: Paul Cuzner --- src/cephadm/cephadm | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm index a329fee19d659..b9b08b0d8124b 100755 --- a/src/cephadm/cephadm +++ b/src/cephadm/cephadm @@ -5436,7 +5436,7 @@ class CephadmDaemon(): try: data = json.loads(facts.dump()) except json.decoder.JSONDecodeError: - errors.append("host-facts received invalid JSON") + errors.append("host-facts provided invalid JSON") logger.warning(errors[-1]) data = {} with self.cephadm_cache_lock: @@ -5479,7 +5479,7 @@ class CephadmDaemon(): try: data = json.loads(stdout) except json.decoder.JSONDecodeError: - errors.append("ceph-volume thread received bad json data") + errors.append("ceph-volume thread provided bad json data") logger.warning(errors[-1]) data = [] else: @@ -5511,14 +5511,20 @@ class CephadmDaemon(): logger.debug("executing list-daemons scrape") errors = [] s_time = time.time() - ld = list_daemons() + + # list daemons should ideally be invoked with a fsid + data = list_daemons() + if not isinstance(data, list): + errors.append("list-daemons didn't supply a list?") + logger.warning(errors[-1]) + data = [] elapsed = time.time() - s_time with self.cephadm_cache_lock: self.cephadm_cache['daemons'] = { "scrape_timestamp": s_time, "scrape_duration_secs": elapsed, "scrape_errors": errors, - "data": ld, + "data": data, } logger.debug(f"completed list-daemons scrape - {elapsed}s") @@ -5588,14 +5594,16 @@ class CephadmDaemon(): if ctr >= CephadmDaemon.thread_check_interval: ctr = 0 for worker in self.workers: + if 
self.cephadm_cache['health']['tasks'][worker.name] == 'inactive': + continue if not worker.is_alive(): logger.warning(f"{worker.name} thread not running") stop_time = datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S") with self.cephadm_cache_lock: - # update health in the cache + # update health status in the cache self.cephadm_cache['health']['tasks'][worker.name] = "inactive" self.cephadm_cache['health']['errors'].append(f"{worker.name} stopped at {stop_time}") - + time.sleep(CephadmDaemon.loop_delay) ctr += CephadmDaemon.loop_delay -- 2.39.5