From eee041f2f070b88b01d45c04624872681dd158be Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 15 Jul 2021 15:17:23 -0500 Subject: [PATCH] mon: allow a MON_DOWN grace period after cluster mkfs During teuthology tests, the initial cluster bootstrap often starts up the mons but doesn't include all mons in the initial quorum, due to mon startup misalignment and random delays. Provide a short grace period where we will not raise a MON_DOWN alert even though the quorum is not complete. Fixes: https://tracker.ceph.com/issues/43584 Signed-off-by: Sage Weil --- src/common/options/mon.yaml.in | 7 +++++++ src/mon/HealthMonitor.cc | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 535dc65e26a2e..1ec9309397ba5 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -39,6 +39,13 @@ options: default: 5 services: - mon +- name: mon_down_mkfs_grace + type: secs + level: advanced + desc: Period in seconds that the cluster may have a mon down after cluster creation + default: 1_min + services: + - mon - name: mon_mgr_beacon_grace type: secs level: advanced diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index bf8cf394bd9a6..3adbdc3de59f9 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -797,7 +797,9 @@ void HealthMonitor::check_for_mon_down(health_check_map_t *checks) { int max = mon.monmap->size(); int actual = mon.get_quorum().size(); - if (actual < max) { + const auto now = ceph::real_clock::now(); + if (actual < max && + now > mon.monmap->created.to_real_time() + g_conf().get_val("mon_down_mkfs_grace")) { ostringstream ss; ss << (max-actual) << "/" << max << " mons down, quorum " << mon.get_quorum_names(); -- 2.39.5