]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/osd: add support for slowest historic op tracking 46382/head
authorRadosław Zarzyński <rzarzyns@redhat.com>
Fri, 27 May 2022 17:31:18 +0000 (19:31 +0200)
committerRadosław Zarzyński <rzarzyns@redhat.com>
Mon, 30 May 2022 14:37:19 +0000 (16:37 +0200)
Signed-off-by: Radosław Zarzyński <rzarzyns@redhat.com>
src/crimson/admin/osd_admin.cc
src/crimson/admin/osd_admin.h
src/crimson/common/operation.h
src/crimson/osd/osd.cc
src/crimson/osd/osd_operation.cc
src/crimson/osd/osd_operation.h
src/crimson/osd/osd_operations/client_request.h

index eba4c4c1769c14c002a1091f0230ac2631ab7129..24c901b287df8aa912ab42508c7859e45b1f9039 100644 (file)
@@ -483,4 +483,29 @@ private:
 template std::unique_ptr<AdminSocketHook>
 make_asok_hook<DumpHistoricOpsHook>(const crimson::osd::OSDOperationRegistry& op_registry);
 
+
+class DumpSlowestHistoricOpsHook : public AdminSocketHook {
+public:
+  explicit DumpSlowestHistoricOpsHook(const crimson::osd::OSDOperationRegistry& op_registry) :
+    AdminSocketHook{"dump_historic_slow_ops", "", "show slowest recent ops"},
+    op_registry(op_registry)
+  {}
+  seastar::future<tell_result_t> call(const cmdmap_t&,
+                                     std::string_view format,
+                                     ceph::bufferlist&& input) const final
+  {
+    logger().warn("{}", __func__);
+    unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+    f->open_object_section("historic_slow_ops");
+    op_registry.dump_slowest_historic_client_requests(f.get());
+    f->close_section();
+    f->dump_int("num_ops", 0);
+    return seastar::make_ready_future<tell_result_t>(std::move(f));
+  }
+private:
+  const crimson::osd::OSDOperationRegistry& op_registry;
+};
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<DumpSlowestHistoricOpsHook>(const crimson::osd::OSDOperationRegistry& op_registry);
+
 } // namespace crimson::admin
index 5ae6187c8e1dd1c0c6f280a25cabeb0a90de14d4..b3e0adc2a91cf5eb7f8da3a60f304249c7622092 100644 (file)
@@ -19,6 +19,7 @@ class OsdStatusHook;
 class SendBeaconHook;
 class DumpInFlightOpsHook;
 class DumpHistoricOpsHook;
+class DumpSlowestHistoricOpsHook;
 
 template<class Hook, class... Args>
 std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... args);
index 90e7d157fa8ef044dfb5b87de0f5505f0713a1c5..60b5d59e28dd8e09cbef3735784bb5783d27331d 100644 (file)
@@ -120,6 +120,10 @@ struct TimeEvent : Event<T> {
   void dump(ceph::Formatter *f) const {
     detail::dump_time_event(typeid(T).name(), internal_backend.timestamp, f);
   }
+
+  auto get_timestamp() const {
+    return internal_backend.timestamp;
+  }
 };
 
 
index fe16c203195df864d57c8ef898588e16faf574a0..4db655e81e959e513130b232d7993f58ea265cea 100644 (file)
@@ -551,6 +551,8 @@ seastar::future<> OSD::start_asok_admin()
       std::as_const(get_shard_services().registry)));
     asok->register_command(make_asok_hook<DumpHistoricOpsHook>(
       std::as_const(get_shard_services().registry)));
+    asok->register_command(make_asok_hook<DumpSlowestHistoricOpsHook>(
+      std::as_const(get_shard_services().registry)));
   });
 }
 
index afc06ae3798489f6046ae10fb0fdae488891d845..c2f6908c82511217f9497ec0c8cd19b8886781b8 100644 (file)
@@ -32,10 +32,27 @@ void OSDOperationRegistry::do_stop()
                     /* add_ref= */ false
                   };
                 });
+  last_of_recents = std::end(historic_registry);
   // to_ref_down is going off
 }
 
-void OSDOperationRegistry::put_historic(const ClientRequest& op) {
+OSDOperationRegistry::OSDOperationRegistry()
+{
+  constexpr auto historic_reg_index =
+    static_cast<size_t>(OperationTypeCode::historic_client_request);
+  auto& historic_registry = get_registry<historic_reg_index>();
+  last_of_recents = std::begin(historic_registry);
+}
+
+static auto get_duration(const ClientRequest& client_request)
+{
+  // TODO: consider enhancing `CompletionEvent` with computing duration
+  // once -- when it's enetered.
+  return client_request.get_completed() - client_request.get_started();
+}
+
+void OSDOperationRegistry::put_historic(const ClientRequest& op)
+{
   // unlink the op from the client request registry. this is a part of
   // the re-link procedure. finally it will be in historic registry.
   constexpr auto client_reg_index =
@@ -44,7 +61,6 @@ void OSDOperationRegistry::put_historic(const ClientRequest& op) {
     static_cast<size_t>(OperationTypeCode::historic_client_request);
   auto& client_registry = get_registry<client_reg_index>();
   auto& historic_registry = get_registry<historic_reg_index>();
-
   historic_registry.splice(std::end(historic_registry),
                           client_registry,
                           client_registry.iterator_to(op));
@@ -57,11 +73,29 @@ void OSDOperationRegistry::put_historic(const ClientRequest& op) {
   // NOTE: Operation uses the auto-unlink feature of boost::intrusive.
   // NOTE: the cleaning happens in OSDOperationRegistry::do_stop()
   using crimson::common::local_conf;
-  if (historic_registry.size() > local_conf()->osd_op_history_size) {
-    const auto& oldest_historic_op =
-      static_cast<const ClientRequest&>(historic_registry.front());
+  if (num_recent_ops >= local_conf()->osd_op_history_size) {
+    ++last_of_recents;
+    ++num_slow_ops;
+  } else {
+    ++num_recent_ops;
+  }
+  if (num_slow_ops > local_conf()->osd_op_history_slow_op_size) {
+    // we're interested in keeping slowest ops. if the slow op history
+    // is disabled, the list will have only one element, so the full-blown
+    // search will boil down into `.front()`.
+    const auto fastest_historic_iter = std::min_element(
+      std::cbegin(historic_registry), last_of_recents,
+      [] (const auto& lop, const auto& rop) {
+        const auto& lclient_request = static_cast<const ClientRequest&>(lop);
+        const auto& rclient_request = static_cast<const ClientRequest&>(rop);
+       return get_duration(lclient_request) < get_duration(rclient_request);
+    });
+    assert(fastest_historic_iter != std::end(historic_registry));
+    const auto& fastest_historic_op =
+      static_cast<const ClientRequest&>(*fastest_historic_iter);
     // clear a previously "leaked" op
-    ClientRequest::ICRef(&oldest_historic_op, /* add_ref= */false);
+    ClientRequest::ICRef(&fastest_historic_op, /* add_ref= */false);
+    --num_slow_ops;
   }
 }
 
@@ -97,6 +131,40 @@ size_t OSDOperationRegistry::dump_historic_client_requests(ceph::Formatter* f) c
   return ops_count;
 }
 
+size_t OSDOperationRegistry::dump_slowest_historic_client_requests(ceph::Formatter* f) const
+{
+  const auto& historic_client_registry =
+    get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>();
+  f->open_object_section("op_history");
+  f->dump_int("size", historic_client_registry.size());
+  // TODO: f->dump_int("duration", history_duration.load());
+  // the intrusive list is configured to not store the size
+  std::multimap<utime_t,
+               const ClientRequest*,
+               std::greater<utime_t>> sorted_slowest_ops;
+  // iterating over the entire registry as a slow op could be also
+  // in the "recently added" part.
+  std::transform(std::begin(historic_client_registry),
+                std::end(historic_client_registry),
+                std::inserter(sorted_slowest_ops, std::end(sorted_slowest_ops)),
+                [] (const Operation& op) {
+                  const auto& cop = static_cast<const ClientRequest&>(op);
+                  return std::make_pair(get_duration(cop), &cop);
+                });
+  f->open_array_section("ops");
+  using crimson::common::local_conf;
+  size_t ops_count = 0;
+  for (auto it = std::begin(sorted_slowest_ops);
+       ops_count < local_conf()->osd_op_history_slow_op_size
+          && it != std::end(sorted_slowest_ops);
+       ++it, ++ops_count)
+  {
+    it->second->dump(f);
+  }
+  f->close_section();
+  return ops_count;
+}
+
 OperationThrottler::OperationThrottler(ConfigProxy &conf)
   : scheduler(crimson::osd::scheduler::make_scheduler(conf))
 {
index a9cd48cae124ddda51674cc9bf2cd40b9ea890de..2f9a69d39e6ae5464e60e8245223711a13109bc1 100644 (file)
@@ -104,6 +104,7 @@ class TrackableOperationT : public OperationT<T> {
     return static_cast<const T*>(this);
   }
 
+protected:
   template<class EventT>
   decltype(auto) get_event() {
     // all out derivates are supposed to define the list of tracking
@@ -112,7 +113,11 @@ class TrackableOperationT : public OperationT<T> {
     return std::get<EventT>(that()->tracking_events);
   }
 
-protected:
+  template<class EventT>
+  decltype(auto) get_event() const {
+    return std::get<EventT>(that()->tracking_events);
+  }
+
   using OperationT<T>::OperationT;
 
   struct StartEvent : TimeEvent<StartEvent> {};
@@ -170,13 +175,20 @@ protected:
 struct OSDOperationRegistry : OperationRegistryT<
   static_cast<size_t>(OperationTypeCode::last_op)
 > {
+  OSDOperationRegistry();
+
   void do_stop() override;
 
   void put_historic(const class ClientRequest& op);
 
   size_t dump_client_requests(ceph::Formatter* f) const;
   size_t dump_historic_client_requests(ceph::Formatter* f) const;
+  size_t dump_slowest_historic_client_requests(ceph::Formatter* f) const;
 
+private:
+  op_list::const_iterator last_of_recents;
+  size_t num_recent_ops = 0;
+  size_t num_slow_ops = 0;
 };
 /**
  * Throttles set of currently running operations
index 15827ad9ebd8cabea08d34ed677e8709eb9731cd..82ccbe5f38f3bc44fc4b69fdb711638be4b454ea 100644 (file)
@@ -155,6 +155,14 @@ public:
 
   friend class LttngBackend;
   friend class HistoricBackend;
+
+  auto get_started() const {
+    return get_event<StartEvent>().get_timestamp();
+  };
+
+  auto get_completed() const {
+    return get_event<CompletionEvent>().get_timestamp();
+  };
 };
 
 }