]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/osd/osd_operation: fix dump_historic_slow_ops command works
authorjunxiang Mu <1948535941@qq.com>
Fri, 19 Apr 2024 02:57:08 +0000 (22:57 -0400)
committerjunxiang Mu <1948535941@qq.com>
Thu, 30 May 2024 02:26:13 +0000 (22:26 -0400)
* Use two separate lists to store normal ops and slow ops, respectively

Fixes: https://tracker.ceph.com/issues/65531
Signed-off-by: junxiang Mu <1948535941@qq.com>
src/crimson/osd/osd_operation.cc
src/crimson/osd/osd_operation.h

index 920fdc11480469ab0a27e8dba62b708105923c14..6cf80e60c6b415c42d386e916447525521ee61ec 100644 (file)
@@ -33,18 +33,11 @@ void OSDOperationRegistry::do_stop()
                     /* add_ref= */ false
                   };
                 });
-  last_of_recents = std::end(historic_registry);
   // to_ref_down is going off
 }
 
 OSDOperationRegistry::OSDOperationRegistry()
-  : OperationRegistryT(seastar::this_shard_id())
-{
-  constexpr auto historic_reg_index =
-    static_cast<size_t>(OperationTypeCode::historic_client_request);
-  auto& historic_registry = get_registry<historic_reg_index>();
-  last_of_recents = std::begin(historic_registry);
-}
+  : OperationRegistryT(seastar::this_shard_id()) {}
 
 static auto get_duration(const ClientRequest& client_request)
 {
@@ -55,51 +48,46 @@ static auto get_duration(const ClientRequest& client_request)
 
 void OSDOperationRegistry::put_historic(const ClientRequest& op)
 {
+  using crimson::common::local_conf;
   // unlink the op from the client request registry. this is a part of
   // the re-link procedure. finally it will be in historic registry.
   constexpr auto client_reg_index =
     static_cast<size_t>(OperationTypeCode::client_request);
   constexpr auto historic_reg_index =
     static_cast<size_t>(OperationTypeCode::historic_client_request);
+  constexpr auto slow_historic_reg_index = 
+    static_cast<size_t>(OperationTypeCode::historic_slow_client_request);
+
   auto& client_registry = get_registry<client_reg_index>();
-  auto& historic_registry = get_registry<historic_reg_index>();
-  historic_registry.splice(std::end(historic_registry),
-                          client_registry,
-                          client_registry.iterator_to(op));
+  // slow ops should put into historic_slow_client_request list
+  // we only save the newest op
+  if (get_duration(op) > local_conf()->osd_op_complaint_time) {
+    auto& slow_historic_registry = get_registry<slow_historic_reg_index>();
+    slow_historic_registry.splice(std::end(slow_historic_registry),
+          client_registry,
+          client_registry.iterator_to(op));
+
+    if (num_slow_ops >= local_conf()->osd_op_history_slow_op_size) {
+      slow_historic_registry.pop_front();
+    } else {
+      ++num_slow_ops;
+    }
+  } else {
+    auto& historic_registry = get_registry<historic_reg_index>();
+    historic_registry.splice(std::end(historic_registry),
+          client_registry,
+          client_registry.iterator_to(op));
+
+    if (num_recent_ops >= local_conf()->osd_op_history_size) {
+      historic_registry.pop_front();
+    } else {
+      ++num_recent_ops;
+    }
+  }
+
   ClientRequest::ICRef(
     &op, /* add_ref= */true
   ).detach(); // yes, "leak" it for now!
-
-  // check whether the history size limit is not exceeded; if so, then
-  // purge the oldest op.
-  // NOTE: Operation uses the auto-unlink feature of boost::intrusive.
-  // NOTE: the cleaning happens in OSDOperationRegistry::do_stop()
-  using crimson::common::local_conf;
-  if (num_recent_ops >= local_conf()->osd_op_history_size) {
-    ++last_of_recents;
-    ++num_slow_ops;
-  } else {
-    ++num_recent_ops;
-  }
-  if (num_slow_ops > local_conf()->osd_op_history_slow_op_size) {
-    // we're interested in keeping slowest ops. if the slow op history
-    // is disabled, the list will have only one element, so the full-blown
-    // search will boil down into `.front()`.
-    const auto fastest_historic_iter = std::min_element(
-      std::cbegin(historic_registry), last_of_recents,
-      [] (const auto& lop, const auto& rop) {
-        const auto& lclient_request = static_cast<const ClientRequest&>(lop);
-        const auto& rclient_request = static_cast<const ClientRequest&>(rop);
-       return get_duration(lclient_request) < get_duration(rclient_request);
-    });
-    assert(fastest_historic_iter != std::end(historic_registry));
-    const auto& fastest_historic_op =
-      static_cast<const ClientRequest&>(*fastest_historic_iter);
-    historic_registry.erase(fastest_historic_iter);
-    // clear a previously "leaked" op
-    ClientRequest::ICRef(&fastest_historic_op, /* add_ref= */false);
-    --num_slow_ops;
-  }
 }
 
 size_t OSDOperationRegistry::dump_historic_client_requests(ceph::Formatter* f) const
@@ -125,33 +113,20 @@ size_t OSDOperationRegistry::dump_historic_client_requests(ceph::Formatter* f) c
 
 size_t OSDOperationRegistry::dump_slowest_historic_client_requests(ceph::Formatter* f) const
 {
-  const auto& historic_client_registry =
-    get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>();
+  const auto& slow_historic_client_registry =
+    get_registry<static_cast<size_t>(OperationTypeCode::historic_slow_client_request)>(); //ClientRequest::type)>();
   f->open_object_section("op_history");
-  f->dump_int("size", historic_client_registry.size());
+  f->dump_int("size", slow_historic_client_registry.size());
   // TODO: f->dump_int("duration", history_duration.load());
   // the intrusive list is configured to not store the size
-  std::multimap<utime_t,
-               const ClientRequest*,
-               std::greater<utime_t>> sorted_slowest_ops;
-  // iterating over the entire registry as a slow op could be also
-  // in the "recently added" part.
-  std::transform(std::begin(historic_client_registry),
-                std::end(historic_client_registry),
-                std::inserter(sorted_slowest_ops, std::end(sorted_slowest_ops)),
-                [] (const Operation& op) {
-                  const auto& cop = static_cast<const ClientRequest&>(op);
-                  return std::make_pair(get_duration(cop), &cop);
-                });
-  f->open_array_section("ops");
-  using crimson::common::local_conf;
   size_t ops_count = 0;
-  for (auto it = std::begin(sorted_slowest_ops);
-       ops_count < local_conf()->osd_op_history_slow_op_size
-          && it != std::end(sorted_slowest_ops);
-       ++it, ++ops_count)
   {
-    it->second->dump(f);
+    f->open_array_section("ops");
+    for (const auto& op : slow_historic_client_registry) {
+      op.dump(f);
+      ++ops_count;
+    }
+    f->close_section();
   }
   f->close_section();
   return ops_count;
index 1064a5c8e03e1cfef1a19691b64d08378527708b..b80945e258d2e6468afe7b2d6582a4e81c6ccffb 100644 (file)
@@ -50,6 +50,7 @@ enum class OperationTypeCode {
   background_recovery_sub,
   internal_client_request,
   historic_client_request,
+  historic_slow_client_request,
   logmissing_request,
   logmissing_request_reply,
   snaptrim_event,
@@ -72,6 +73,7 @@ static constexpr const char* const OP_NAMES[] = {
   "background_recovery_sub",
   "internal_client_request",
   "historic_client_request",
+  "historic_slow_client_request",
   "logmissing_request",
   "logmissing_request_reply",
   "snaptrim_event",
@@ -230,7 +232,6 @@ struct OSDOperationRegistry : OperationRegistryT<
   size_t dump_slowest_historic_client_requests(ceph::Formatter* f) const;
 
 private:
-  op_list::const_iterator last_of_recents;
   size_t num_recent_ops = 0;
   size_t num_slow_ops = 0;
 };