git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/MonClient: resurrect original client_mount_timeout handling
author: Ilya Dryomov <idryomov@gmail.com>
Mon, 19 Jun 2023 17:53:39 +0000 (19:53 +0200)
committer: Ilya Dryomov <idryomov@gmail.com>
Wed, 19 Jul 2023 09:46:43 +0000 (11:46 +0200)
While reducing the "waiting for config" timeout from 30 seconds to 3
(the mon_client_hunt_interval default) and introducing 10 retries in
its place, commit 3c2b30e4c5dd ("mon/MonClient: apply timeout while
fetching config") also subjected authenticate() to these retries.
However, authenticate() is governed by client_mount_timeout, which
defaults to 5 minutes.  As a result, when the monitors are unreachable
or there are other connectivity issues, we end up taking 50 minutes
(10 retries x 5 minutes) to return ETIMEDOUT from rados_connect().

Stop retrying when authenticate() times out, so that ETIMEDOUT is
returned as soon as client_mount_timeout expires.

Fixes: https://tracker.ceph.com/issues/61733
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
(cherry picked from commit 6d2b449afd47c36df37a335eed62bd7510808c37)

src/mon/MonClient.cc
src/test/librados/misc.cc

index a9fba82883b9ef63facdc68d47e2405ddd56b68e..a27c7ae009f2a855b8e33fd793653ca944eaf475 100644 (file)
@@ -155,11 +155,8 @@ int MonClient::get_monmap_and_config()
     if (r < 0) {
       return r;
     }
-    r = authenticate(std::chrono::duration<double>(cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout")).count());
-    if (r == -ETIMEDOUT) {
-      shutdown();
-      continue;
-    }
+    r = authenticate(
+      cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout").count());
     if (r < 0) {
       break;
     }
index 9c052a5d91c04ea4d012c055f53de76dfe8802a9..2f21669d0b5609b6b07fe991b47c0fb8da90bbff 100644 (file)
@@ -85,6 +85,26 @@ TEST(LibRadosMiscConnectFailure, ConnectFailure) {
   rados_shutdown(cluster);
 }
 
+TEST(LibRadosMiscConnectFailure, ConnectTimeout) {
+  rados_t cluster;
+
+  ASSERT_EQ(0, rados_create(&cluster, NULL));
+  ASSERT_EQ(0, rados_conf_set(cluster, "mon_host", "255.0.1.2:3456"));
+  ASSERT_EQ(0, rados_conf_set(cluster, "key",
+                              "AQAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAA=="));
+  ASSERT_EQ(0, rados_conf_set(cluster, "client_mount_timeout", "2s"));
+
+  utime_t start = ceph_clock_now();
+  ASSERT_EQ(-ETIMEDOUT, rados_connect(cluster));
+  utime_t end = ceph_clock_now();
+
+  utime_t dur = end - start;
+  ASSERT_GE(dur, utime_t(2, 0));
+  ASSERT_LT(dur, utime_t(4, 0));
+
+  rados_shutdown(cluster);
+}
+
 TEST(LibRadosMiscPool, PoolCreationRace) {
   rados_t cluster_a, cluster_b;