git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
exporter: handle exceptions gracefully 55773/head
author Divyansh Kamboj <dkamboj@redhat.com>
Tue, 30 Apr 2024 07:44:10 +0000 (13:14 +0530)
committer Divyansh Kamboj <dkamboj@redhat.com>
Wed, 8 May 2024 12:05:32 +0000 (17:35 +0530)
The ceph exporter crashes because it fails to handle exceptions in
dump_asok_metrics(). Add try and catch blocks to handle the exceptions gracefully.

Signed-off-by: Divyansh Kamboj <dkamboj@redhat.com>
src/exporter/DaemonMetricCollector.cc
src/exporter/DaemonMetricCollector.h

index dda0899e18637db2cbb206daa63564ccfe71e4ee..dccb247fb74e6522d4cc3423dec48ca9e78818f2 100644 (file)
@@ -84,6 +84,66 @@ std::string boost_string_to_std(boost::json::string js) {
 
 std::string quote(std::string value) { return "\"" + value + "\""; }
 
+void DaemonMetricCollector::parse_asok_metrics(
+    std::string &counter_dump_response, std::string &counter_schema_response,
+    int64_t prio_limit, const std::string &daemon_name) {
+  json_object counter_dump =
+      boost::json::parse(counter_dump_response).as_object();
+  json_object counter_schema =
+      boost::json::parse(counter_schema_response).as_object();
+
+  for (auto &perf_group_item : counter_schema) {
+    std::string perf_group = {perf_group_item.key().begin(),
+                              perf_group_item.key().end()};
+    json_array perf_group_schema_array = perf_group_item.value().as_array();
+    json_array perf_group_dump_array = counter_dump[perf_group].as_array();
+    for (auto schema_itr = perf_group_schema_array.begin(),
+              dump_itr = perf_group_dump_array.begin();
+         schema_itr != perf_group_schema_array.end() &&
+         dump_itr != perf_group_dump_array.end();
+         ++schema_itr, ++dump_itr) {
+      auto counters = schema_itr->at("counters").as_object();
+      auto counters_labels = schema_itr->at("labels").as_object();
+      auto counters_values = dump_itr->at("counters").as_object();
+      labels_t labels;
+
+      for (auto &label : counters_labels) {
+        std::string label_key = {label.key().begin(), label.key().end()};
+        labels[label_key] = quote(label.value().as_string().c_str());
+      }
+      for (auto &counter : counters) {
+        json_object counter_group = counter.value().as_object();
+        if (counter_group["priority"].as_int64() < prio_limit) {
+          continue;
+        }
+        std::string counter_name_init = {counter.key().begin(),
+                                         counter.key().end()};
+        std::string counter_name = perf_group + "_" + counter_name_init;
+        promethize(counter_name);
+
+        auto extra_labels = get_extra_labels(daemon_name);
+        if (extra_labels.empty()) {
+          dout(1) << "Unable to parse instance_id from daemon_name: "
+                  << daemon_name << dendl;
+          continue;
+        }
+        labels.insert(extra_labels.begin(), extra_labels.end());
+
+        // For now this is only required for rgw multi-site metrics
+        auto multisite_labels_and_name = add_fixed_name_metrics(counter_name);
+        if (!multisite_labels_and_name.first.empty()) {
+          labels.insert(multisite_labels_and_name.first.begin(),
+                        multisite_labels_and_name.first.end());
+          counter_name = multisite_labels_and_name.second;
+        }
+        auto perf_values = counters_values.at(counter_name_init);
+        dump_asok_metric(counter_group, perf_values, counter_name, labels);
+      }
+    }
+  }
+}
+
+
 void DaemonMetricCollector::dump_asok_metrics(bool sort_metrics, int64_t counter_prio,
                                               bool sockClientsPing, std::string &dump_response,
                                               std::string &schema_response,
@@ -125,71 +185,36 @@ void DaemonMetricCollector::dump_asok_metrics(bool sort_metrics, int64_t counter
       continue;
     }
 
-    json_object counter_dump = boost::json::parse(counter_dump_response).as_object();
-    json_object counter_schema = boost::json::parse(counter_schema_response).as_object();
-
-    for (auto &perf_group_item : counter_schema) {
-      std::string perf_group = {perf_group_item.key().begin(),
-                                perf_group_item.key().end()};
-      json_array perf_group_schema_array = perf_group_item.value().as_array();
-      json_array perf_group_dump_array = counter_dump[perf_group].as_array();
-      for (auto schema_itr = perf_group_schema_array.begin(),
-                dump_itr = perf_group_dump_array.begin();
-           schema_itr != perf_group_schema_array.end() &&
-           dump_itr != perf_group_dump_array.end();
-           ++schema_itr, ++dump_itr) {
-        auto counters = schema_itr->at("counters").as_object();
-        auto counters_labels = schema_itr->at("labels").as_object();
-        auto counters_values = dump_itr->at("counters").as_object();
-        labels_t labels;
-
-        for (auto &label: counters_labels) {
-          std::string label_key = {label.key().begin(), label.key().end()};
-          labels[label_key] = quote(label.value().as_string().c_str());
-        }
-        for (auto &counter : counters) {
-          json_object counter_group = counter.value().as_object();
-          if (counter_group["priority"].as_int64() < prio_limit) {
-            continue;
-          }
-          std::string counter_name_init =  {counter.key().begin(), counter.key().end()};
-          std::string counter_name = perf_group + "_" + counter_name_init;
-          promethize(counter_name);
-
-          auto extra_labels = get_extra_labels(daemon_name);
-          if (extra_labels.empty()) {
-            dout(1) << "Unable to parse instance_id from daemon_name: " << daemon_name << dendl;
-            continue;
-          }
-          labels.insert(extra_labels.begin(), extra_labels.end());
-
-          // For now this is only required for rgw multi-site metrics
-          auto multisite_labels_and_name = add_fixed_name_metrics(counter_name);
-          if (!multisite_labels_and_name.first.empty()) {
-            labels.insert(multisite_labels_and_name.first.begin(), multisite_labels_and_name.first.end());
-            counter_name = multisite_labels_and_name.second;
-          }
-          auto perf_values = counters_values.at(counter_name_init);
-          dump_asok_metric(counter_group, perf_values, counter_name, labels);
-        }
-      }
-    }
-    std::string config_show = !config_show_response ? "" :
+    try {
+      std::string config_show = !config_show_response ? "" :
         asok_request(sock_client, "config show", daemon_name);
-    if (config_show.size() == 0) {
+      if (config_show.size() == 0) {
+        failures++;
+        continue;
+      }
+      json_object pid_file_json = boost::json::parse(config_show).as_object();
+      std::string pid_path =
+          boost_string_to_std(pid_file_json["pid_file"].as_string());
+      std::string pid_str = read_file_to_string(pid_path);
+      if (!pid_path.size()) {
+        dout(1) << "pid path is empty; process metrics won't be fetched for: "
+                << daemon_name << dendl;
+      }
+      if (!pid_str.empty()) {
+        daemon_pids.push_back({daemon_name, std::stoi(pid_str)});
+      }
+      parse_asok_metrics(counter_dump_response, counter_schema_response,
+                         prio_limit, daemon_name);
+    } catch (const std::invalid_argument &e) {
       failures++;
+      dout(1) << "failed to handle " << daemon_name << ": " << e.what()
+              << dendl;
+      continue;
+    } catch (const std::runtime_error &e) {
+      failures++;
+      dout(1) << "failed to parse json for " << daemon_name << ": " << e.what()
+              << dendl;
       continue;
-    }
-    json_object pid_file_json = boost::json::parse(config_show).as_object();
-    std::string pid_path =
-        boost_string_to_std(pid_file_json["pid_file"].as_string());
-    std::string pid_str = read_file_to_string(pid_path);
-    if (!pid_path.size()) {
-      dout(1) << "pid path is empty; process metrics won't be fetched for: "
-              << daemon_name << dendl;
-    }
-    if (!pid_str.empty()) {
-      daemon_pids.push_back({daemon_name, std::stoi(pid_str)});
     }
   }
   dout(10) << "Perf counters retrieved for " << clients.size() - failures << "/"
index 2dcdc9ce231e19fa06afa22aa852cc0f28f0b9d8..d2e929b4d670fbe56ab24fa3ff1947e438b55b58 100644 (file)
@@ -52,6 +52,9 @@ private:
   void dump_asok_metric(boost::json::object perf_info,
                         boost::json::value perf_values, std::string name,
                         labels_t labels);
+  void parse_asok_metrics(std::string &counter_dump_response,
+                          std::string &counter_schema_response,
+                          int64_t prio_limit, const std::string &daemon_name);
   void get_process_metrics(std::vector<std::pair<std::string, int>> daemon_pids);
   std::string asok_request(AdminSocketClient &asok, std::string command, std::string daemon_name);
 };