From: Sage Weil Date: Thu, 15 Jul 2021 20:17:23 +0000 (-0500) Subject: mon: allow a MON_DOWN grace period after cluster mkfs X-Git-Tag: v17.1.0~1377^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=eee041f2f070b88b01d45c04624872681dd158be;p=ceph.git mon: allow a MON_DOWN grace period after cluster mkfs During teuthology tests, the initial cluster bootstrap often starts up the mons but doesn't include all mons in the initial quorum, due to mon startup misalignment and random delays. Provide a short grace period where we will not raise a MON_DOWN alert even though the quorum is not complete. Fixes: https://tracker.ceph.com/issues/43584 Signed-off-by: Sage Weil --- diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 535dc65e26a..1ec9309397b 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -39,6 +39,13 @@ options: default: 5 services: - mon +- name: mon_down_mkfs_grace + type: secs + level: advanced + desc: Period in seconds that the cluster may have a mon down after cluster creation + default: 1_min + services: + - mon - name: mon_mgr_beacon_grace type: secs level: advanced diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index bf8cf394bd9..3adbdc3de59 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -797,7 +797,9 @@ void HealthMonitor::check_for_mon_down(health_check_map_t *checks) { int max = mon.monmap->size(); int actual = mon.get_quorum().size(); - if (actual < max) { + const auto now = ceph::real_clock::now(); + if (actual < max && + now > mon.monmap->created.to_real_time() + g_conf().get_val<std::chrono::seconds>("mon_down_mkfs_grace")) { ostringstream ss; ss << (max-actual) << "/" << max << " mons down, quorum " << mon.get_quorum_names();