From: liangmingyuan <liangmingyuan@baidu.com>
Date: Mon, 5 Aug 2024 07:30:33 +0000 (+0800)
Subject: rgw/beast: optimize accept handling when an error occurs while listening
X-Git-Tag: v19.2.1~120^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F60244%2Fhead;p=ceph.git

rgw/beast: optimize accept handling when an error occurs while listening

It is not appropriate to stop accepting sockets whenever any error occurs
in a previous socket listen or accept. Doing so causes radosgw to stop
working after a single occasional failure. For example, a "Too many open
files" error may occur at high IOPS (or just after a reshard, when the
number of open sockets may increase because operations are blocked).

Signed-off-by: Mingyuan Liang <liangmingyuan@baidu.com>
(cherry picked from commit a7090783cf820045633c66ca04926cb3f2b5a4aa)
---

diff --git a/src/rgw/rgw_asio_frontend.cc b/src/rgw/rgw_asio_frontend.cc
index 86029e7f634e0..b8bcff3a06fa3 100644
--- a/src/rgw/rgw_asio_frontend.cc
+++ b/src/rgw/rgw_asio_frontend.cc
@@ -67,6 +67,44 @@ auto make_stack_allocator() {
   return boost::context::protected_fixedsize_stack{512*1024};
 }
 
+static constexpr std::chrono::milliseconds BACKOFF_MAX_WAIT(5000);  // upper bound (5000ms) for the accept backoff delay
+
+class RGWAsioBackoff {  // growing retry delay (doubles per sleep, capped) driven by an asio timer
+  using Clock = ceph::coarse_mono_clock;
+  using Timer = boost::asio::basic_waitable_timer<Clock>;
+  Timer timer;  // timer that backoff_sleep() arms and waits on
+
+  ceph::timespan cur_wait;  // current delay; starts at 1ms and is restored to 1ms by reset()
+  void update_wait_time();  // doubles cur_wait while below BACKOFF_MAX_WAIT, then clamps to it
+public:
+  explicit RGWAsioBackoff(boost::asio::io_context& context) :
+                          timer(context),
+                          cur_wait(std::chrono::milliseconds(1)) {
+  }
+
+  void backoff_sleep(boost::asio::yield_context yield);  // grow the delay, then wait for it on the timer
+  void reset() {  // drop back to the initial 1ms delay
+    cur_wait = std::chrono::milliseconds(1);
+  }
+};
+
+void RGWAsioBackoff::update_wait_time()  // advance cur_wait to the next backoff step
+{
+  if (cur_wait < BACKOFF_MAX_WAIT) {  // only grow while still below the cap
+    cur_wait = cur_wait * 2;
+  }
+  if (cur_wait > BACKOFF_MAX_WAIT) {  // doubling can overshoot the cap, so clamp
+    cur_wait = BACKOFF_MAX_WAIT;
+  }
+}
+
+void RGWAsioBackoff::backoff_sleep(boost::asio::yield_context yield)  // wait out the next backoff interval using the given yield context
+{
+  update_wait_time();  // delay is grown before waiting, so the first sleep after reset() uses 2ms, not 1ms
+  timer.expires_after(cur_wait);
+  timer.async_wait(yield);  // NOTE(review): no error_code is bound to yield; a failed or cancelled wait presumably throws - confirm
+}
+
 using namespace std;
 
 template <typename Stream>
@@ -440,6 +478,7 @@ class AsioFrontend {
 
   std::atomic<bool> going_down{false};
 
+  RGWAsioBackoff backoff;
   CephContext* ctx() const { return cct.get(); }
   std::optional<dmc::ClientCounters> client_counters;
   std::unique_ptr<dmc::ClientConfig> client_config;
@@ -452,7 +491,8 @@ class AsioFrontend {
 	       dmc::SchedulerCtx& sched_ctx,
 	       boost::asio::io_context& context)
     : env(env), conf(conf), context(context),
-      pause_mutex(context.get_executor())
+      pause_mutex(context.get_executor()),
+      backoff(context)
   {
     auto sched_t = dmc::get_scheduler_t(ctx());
     switch(sched_t){
@@ -1024,9 +1064,19 @@ void AsioFrontend::accept(Listener& l, boost::asio::yield_context yield)
       return;
     } else if (ec) {
       ldout(ctx(), 1) << "accept failed: " << ec.message() << dendl;
+      if (ec == boost::system::errc::too_many_files_open ||
+          ec == boost::system::errc::too_many_files_open_in_system ||
+          ec == boost::system::errc::no_buffer_space ||
+          ec == boost::system::errc::not_enough_memory) {
+        // always retry accept() if we hit a resource limit
+        backoff.backoff_sleep(yield);
+        continue;
+      }
+      ldout(ctx(), 0) << "accept stopped due to error: " << ec.message() << dendl;
       return;
     }
 
+    backoff.reset();
     on_accept(l, std::move(l.socket));
   }
 }