From: Ilya Dryomov Date: Mon, 19 Jun 2023 17:53:39 +0000 (+0200) Subject: mon/MonClient: resurrect original client_mount_timeout handling X-Git-Tag: v16.2.15~64^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6ba265ad1dd6d5ea7cc9a33e35dd3c605a5d188b;p=ceph.git mon/MonClient: resurrect original client_mount_timeout handling While reducing a "waiting for config" timeout from 30 seconds to 3 (mon_client_hunt_interval default) and instead introducing 10 retries, commit 3c2b30e4c5dd ("mon/MonClient: apply timeout while fetching config") also subjected authenticate() to these retries. However, authenticate() is going by client_mount_timeout which defaults to 5 minutes. As a result, when the monitors are unreachable or there are other connectivity issues, we end up taking 50 minutes to return ETIMEDOUT from rados_connect(). Fixes: https://tracker.ceph.com/issues/61733 Signed-off-by: Ilya Dryomov (cherry picked from commit 6d2b449afd47c36df37a335eed62bd7510808c37) --- diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc index a9fba82883b9..a27c7ae009f2 100644 --- a/src/mon/MonClient.cc +++ b/src/mon/MonClient.cc @@ -155,11 +155,8 @@ int MonClient::get_monmap_and_config() if (r < 0) { return r; } - r = authenticate(std::chrono::duration<double>(cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout")).count()); - if (r == -ETIMEDOUT) { - shutdown(); - continue; - } + r = authenticate( + cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout").count()); if (r < 0) { break; } diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc index 9c052a5d91c0..2f21669d0b56 100644 --- a/src/test/librados/misc.cc +++ b/src/test/librados/misc.cc @@ -85,6 +85,26 @@ TEST(LibRadosMiscConnectFailure, ConnectFailure) { rados_shutdown(cluster); } +TEST(LibRadosMiscConnectFailure, ConnectTimeout) { + rados_t cluster; + + ASSERT_EQ(0, rados_create(&cluster, NULL)); + ASSERT_EQ(0, rados_conf_set(cluster, "mon_host", "255.0.1.2:3456")); + ASSERT_EQ(0, rados_conf_set(cluster, "key", 
+ "AQAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAA==")); + ASSERT_EQ(0, rados_conf_set(cluster, "client_mount_timeout", "2s")); + + utime_t start = ceph_clock_now(); + ASSERT_EQ(-ETIMEDOUT, rados_connect(cluster)); + utime_t end = ceph_clock_now(); + + utime_t dur = end - start; + ASSERT_GE(dur, utime_t(2, 0)); + ASSERT_LT(dur, utime_t(4, 0)); + + rados_shutdown(cluster); +} + TEST(LibRadosMiscPool, PoolCreationRace) { rados_t cluster_a, cluster_b;