]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
src/msg: fix high CPU consumption of msgr worker thread
author: zhangjianwei <zhangjianwei2_yewu@cmss.chinamobile.com>
Fri, 15 Sep 2023 05:50:57 +0000 (13:50 +0800)
committer: zhangjianwei2 <zhangjianwei2_yewu@cmss.chinamobile.com>
Fri, 19 Apr 2024 08:55:02 +0000 (16:55 +0800)
problem analysis:
- std::multimap<clock_type::time_point, TimeEvent> time_events
  - time precision is nanoseconds
  - in EventCenter::process_events function
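    - end_time > now : the comparison is done at nanosecond precision
    - std::chrono::duration_cast<std::chrono::microseconds>(end_time - now) :
      - but the difference is truncated to whole microseconds
    - so when the difference is less than 1 microsecond, timeout_microseconds = 0
      - epoll_wait(..., 0) returns immediately without sleeping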
    - rados bench count : 6000
      - Proportion of 0 events processed
        - 41898337 / 44796903 = 93.52%
        - CPU usage of a single OSD msgr worker thread reaches 100%

solution:
- because the epoll_wait timeout has millisecond granularity
- add ms_time_events_min_wait_interval
  to control the minimum time to wait for time_events
- so the default value is aligned to 1000 microseconds (1 millisecond)
- rados bench count : 6000
  - Proportion of 0 events processed
    - 424466 / 4489181 = 9.45%
  - CPU usage of a single OSD msgr worker thread drops to 30~40%

issue: https://tracker.ceph.com/issues/62512
co-author: yanghonggang <yanghonggang_yewu@cmss.chinamobile.com>
Signed-off-by: zhangjianwei <zhangjianwei2_yewu@cmss.chinamobile.com>
src/common/options/global.yaml.in
src/msg/async/Event.cc
src/msg/async/ProtocolV1.cc
src/msg/async/ProtocolV2.cc

index fba6d72455ec4538078df13eb47a2df26fdc4bc3..cbdb1185da64d446cb7b7905bfe18e2d9c7eced7 100644 (file)
@@ -1276,6 +1276,23 @@ options:
   desc: Inject a network congestions that stuck with N times operations
   default: 0
   with_legacy: true
+- name: ms_time_events_min_wait_interval
+  type: uint
+  level: dev
+  desc: In microseconds, msgr-worker's time_events min wait time for epoll_wait timeout
+  default: 1000
+  min: 0
+  max: 60000000
+  with_legacy: true
+- name: ms_client_throttle_retry_time_interval
+  type: uint
+  level: dev
+  desc: In microseconds, user client, the time interval between the next retry
+        when the throttle get_or_fail.
+  default: 5000
+  min: 1000
+  max: 60000000
+  with_legacy: true
 - name: ms_blackhole_osd
   type: bool
   level: dev
index 926fdcdb1cc3dd4374515b00270c226180ade473..08e117ea54a79a47338b89eb908293c28c99cca3 100644 (file)
@@ -404,6 +404,8 @@ int EventCenter::process_events(unsigned timeout_microseconds,  ceph::timespan *
 
     if (end_time > now) {
       timeout_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(end_time - now).count();
+      timeout_microseconds = std::max<unsigned>(timeout_microseconds,
+                                                cct->_conf->ms_time_events_min_wait_interval);
     } else {
       timeout_microseconds = 0;
     }
index b45ad8ca5155f37dcadbe2ef7b1dffa4b5122deb..0ddd267926d98774a68db9f24967233cec64f05a 100644 (file)
@@ -677,7 +677,7 @@ CtPtr ProtocolV1::throttle_message() {
       // short time, so we can wait a ms.
       if (connection->register_time_events.empty()) {
         connection->register_time_events.insert(
-            connection->center->create_time_event(1000,
+            connection->center->create_time_event(cct->_conf->ms_client_throttle_retry_time_interval,
                                                   connection->wakeup_handler));
       }
       return nullptr;
@@ -710,7 +710,8 @@ CtPtr ProtocolV1::throttle_bytes() {
         if (connection->register_time_events.empty()) {
           connection->register_time_events.insert(
               connection->center->create_time_event(
-                  1000, connection->wakeup_handler));
+                          cct->_conf->ms_client_throttle_retry_time_interval,
+                          connection->wakeup_handler));
         }
         return nullptr;
       }
@@ -737,7 +738,7 @@ CtPtr ProtocolV1::throttle_dispatch_queue() {
       // short time, so we can wait a ms.
       if (connection->register_time_events.empty()) {
         connection->register_time_events.insert(
-            connection->center->create_time_event(1000,
+            connection->center->create_time_event(cct->_conf->ms_client_throttle_retry_time_interval,
                                                   connection->wakeup_handler));
       }
       return nullptr;
index 08426b796b88b16c9e0142a7dc7d58d2d8d071f7..7a459363a0edcdf534e6cc2e5e54a2ca857b6db4 100644 (file)
@@ -1552,7 +1552,7 @@ CtPtr ProtocolV2::throttle_message() {
       // short time, so we can wait a ms.
       if (connection->register_time_events.empty()) {
         connection->register_time_events.insert(
-            connection->center->create_time_event(1000,
+            connection->center->create_time_event(cct->_conf->ms_client_throttle_retry_time_interval,
                                                   connection->wakeup_handler));
       }
       return nullptr;
@@ -1584,7 +1584,8 @@ CtPtr ProtocolV2::throttle_bytes() {
         if (connection->register_time_events.empty()) {
           connection->register_time_events.insert(
               connection->center->create_time_event(
-                  1000, connection->wakeup_handler));
+                        cct->_conf->ms_client_throttle_retry_time_interval,
+                        connection->wakeup_handler));
         }
         return nullptr;
       }
@@ -1612,7 +1613,7 @@ CtPtr ProtocolV2::throttle_dispatch_queue() {
       // short time, so we can wait a ms.
       if (connection->register_time_events.empty()) {
         connection->register_time_events.insert(
-            connection->center->create_time_event(1000,
+            connection->center->create_time_event(cct->_conf->ms_client_throttle_retry_time_interval,
                                                   connection->wakeup_handler));
       }
       return nullptr;