From: Oguzhan Ozmen Date: Tue, 3 Mar 2026 01:39:19 +0000 (+0000) Subject: rgw: make CONN_STATUS_EXPIRE_SECS a cfg option X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2d8061cf7545eac0a9f9e0a5e29362e4baa7873d;p=ceph.git rgw: make CONN_STATUS_EXPIRE_SECS a cfg option Introduce a new radosgw option 'rgw_rest_conn_ip_fail_timeout_secs' to be able to set the constant CONN_STATUS_EXPIRE_SECS dynamically. Signed-off-by: Oguzhan Ozmen --- diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in index 2d1c4cb777f..701f45759f7 100644 --- a/src/common/options/rgw.yaml.in +++ b/src/common/options/rgw.yaml.in @@ -2080,6 +2080,24 @@ options: default: false services: - rgw + see_also: + - rgw_rest_conn_ip_fail_timeout_secs + with_legacy: true +- name: rgw_rest_conn_ip_fail_timeout_secs + desc: IP failure tracking timeout (requires rgw_rest_conn_connect_to_resolved_ips=true) + type: uint + level: advanced + long_desc: When rgw_rest_conn_connect_to_resolved_ips is enabled, RGW tracks + per-IP connection failures by remembering the timestamp of the most recent + failure. This option controls how long (in seconds) an IP address remains + marked as "failed" before RGW considers it eligible for retry. + After this timeout expires, the IP will be tried again in the normal + round-robin rotation. 
+ default: 2 + services: + - rgw + see_also: + - rgw_rest_conn_connect_to_resolved_ips with_legacy: true - name: rgw_obj_stripe_size type: size diff --git a/src/rgw/rgw_rest_conn.cc b/src/rgw/rgw_rest_conn.cc index 412a33d95a1..6d27faa27c5 100644 --- a/src/rgw/rgw_rest_conn.cc +++ b/src/rgw/rgw_rest_conn.cc @@ -167,7 +167,7 @@ void RGWRESTConn::populate_connect_to(RGWEndpoint& endpoint, ResolvedEndpoint& r return; } - static constexpr uint32_t CONN_STATUS_EXPIRE_SECS = 2; + const auto ip_fail_timeout = cct->_conf->rgw_rest_conn_ip_fail_timeout_secs; const size_t num_ips = resolved_endpoint.resolved_ips.size(); // Round-robin through IPs, skipping any that are marked down @@ -182,7 +182,7 @@ void RGWRESTConn::populate_connect_to(RGWEndpoint& endpoint, ResolvedEndpoint& r } auto diff = ceph::to_seconds(ceph::real_clock::now() - last_fail); - if (diff >= CONN_STATUS_EXPIRE_SECS) { + if (diff >= ip_fail_timeout) { // Failure expired, mark IP as up and use it ip_status.mark_up(); ldout(cct, 5) << "IP " << ip_status.connect_to << " failure expired, marking up" << dendl; @@ -204,7 +204,7 @@ int RGWRESTConn::get_endpoint(RGWEndpoint& endpoint) return -EINVAL; } - static constexpr uint32_t CONN_STATUS_EXPIRE_SECS = 2; + const auto ip_fail_timeout = cct->_conf->rgw_rest_conn_ip_fail_timeout_secs; auto now = ceph::real_clock::now(); // Helper to check if an endpoint has at least one available IP @@ -217,7 +217,7 @@ int RGWRESTConn::get_endpoint(RGWEndpoint& endpoint) // Fast path: if no recent failures at endpoint level, all IPs are available const auto& ep_last_fail = res_ep.last_failure_time.load(); if (ceph::real_clock::is_zero(ep_last_fail) || - ceph::to_seconds(now - ep_last_fail) >= CONN_STATUS_EXPIRE_SECS) { + ceph::to_seconds(now - ep_last_fail) >= ip_fail_timeout) { return true; } @@ -228,7 +228,7 @@ int RGWRESTConn::get_endpoint(RGWEndpoint& endpoint) return true; // This IP is up } auto diff = ceph::to_seconds(now - last_fail); - if (diff >= 
CONN_STATUS_EXPIRE_SECS) { + if (diff >= ip_fail_timeout) { return true; // This IP's failure has expired } } diff --git a/src/rgw/rgw_rest_conn.h b/src/rgw/rgw_rest_conn.h index 66c849f1a1e..325a9ab7a1d 100644 --- a/src/rgw/rgw_rest_conn.h +++ b/src/rgw/rgw_rest_conn.h @@ -69,7 +69,7 @@ inline param_vec_t make_param_list(const std::map *pp) * ResolvedIP - Per-IP connection status tracking. * * Each resolved IP address has its own failure status. An IP is considered - * "down" if last_failure is non-zero and less than CONN_STATUS_EXPIRE_SECS old. + * "down" if last_failure is non-zero and less than rgw_rest_conn_ip_fail_timeout_secs seconds old. * After the timeout, the IP becomes eligible for retry. */ struct ResolvedIP {