]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: add 'ceph [tell|daemon] osd.id smart' 19342/head
authorYaarit Hatuka <yaarithatuka@gmail.com>
Tue, 5 Dec 2017 14:25:00 +0000 (09:25 -0500)
committerYaarit Hatuka <yaarithatuka@gmail.com>
Thu, 18 Jan 2018 13:54:51 +0000 (08:54 -0500)
Also added 'ceph daemon osd.id list_devices', which prints the OSD's devices
to stdout. 'ceph [tell|daemon] osd.id smart' probes the OSD's devices
for SMART data and prints it to stdout in JSON format. It assumes that smartctl
supports the '--json' option.

Signed-off-by: Yaarit Hatuka <yaarithatuka@gmail.com>
src/common/options.cc
src/osd/OSD.cc
src/osd/OSD.h
src/pybind/mgr/smart/__init__.py [new file with mode: 0644]
src/pybind/mgr/smart/module.py [new file with mode: 0644]

index b286457b9b35f18d44f1c8ad1dd6990ce05e4ab8..ee73aa106b9fbfee6e1f736b3e32c21f9df27f02 100644 (file)
@@ -1632,6 +1632,10 @@ std::vector<Option> get_global_options() {
     .set_min(2)
     .set_description("Number of striping periods to zero head of MDS journal write position"),
 
+    // timeout that OSD::probe_smart() reads and hands to probe_smart_device()
+    Option("osd_smart_report_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Timeout (in seconds) for smartctl to run"),
+
     Option("osd_check_max_object_name_len_on_startup", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(true)
     .set_description(""),
index bf613d58d2602c61fe13cf565684843cb7b4aac1..afa979e93a0fb00b2815a52b4a55eaa85b3fd01c 100644 (file)
@@ -46,6 +46,7 @@
 #include "common/version.h"
 #include "common/io_priority.h"
 #include "common/pick_address.h"
+#include "common/SubProcess.h"
 
 #include "os/ObjectStore.h"
 #ifdef HAVE_LIBFUSE
 #include "common/config.h"
 #include "common/EventTrace.h"
 
+#include "json_spirit/json_spirit_reader.h"
+#include "json_spirit/json_spirit_writer.h"
+
 #ifdef WITH_LTTNG
 #define TRACEPOINT_DEFINE
 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
@@ -2349,6 +2353,19 @@ will start to track new ops received afterwards.";
     set<int> poollist = get_mapped_pools();
     f->dump_stream("pool_list") << poollist;
     f->close_section();
+  } else if (admin_command == "smart") {
+    probe_smart(ss);
+  } else if (admin_command == "list_devices") {
+    set<string> devnames;
+    store->get_devices(&devnames);
+    f->open_object_section("list_devices");
+    for (auto dev : devnames) {
+      f->dump_string("device", "/dev/" + dev);
+      if (dev.find("dm-") == 0) {
+       continue;
+      }
+    }
+    f->close_section();
   } else {
     assert(0 == "broken asok registration");
   }
@@ -2881,6 +2898,18 @@ void OSD::final_init()
                                      asok_hook,
                                      "dump pools whose PG(s) are mapped to this OSD.");
 
+  assert(r == 0);
+
+  r = admin_socket->register_command("smart", "smart",
+                                     asok_hook,
+                                     "probe OSD devices for SMART data.");
+
+  assert(r == 0);
+
+  r = admin_socket->register_command("list_devices", "list_devices",
+                                     asok_hook,
+                                     "list OSD devices.");
+
   test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
   // Note: pools are CephString instead of CephPoolname because
   // these commands traditionally support both pool names and numbers
@@ -3367,6 +3396,8 @@ int OSD::shutdown()
   cct->get_admin_socket()->unregister_command("dump_pgstate_history");
   cct->get_admin_socket()->unregister_command("compact");
   cct->get_admin_socket()->unregister_command("get_mapped_pools");
+  cct->get_admin_socket()->unregister_command("smart");
+  cct->get_admin_socket()->unregister_command("list_devices");
   delete asok_hook;
   asok_hook = NULL;
 
@@ -6043,6 +6074,9 @@ COMMAND("compact",
         "compact object store's omap. "
         "WARNING: Compaction probably slows your requests",
         "osd", "rw", "cli,rest")
+COMMAND("smart",
+        "runs smartctl on this osd devices.  ",
+        "osd", "rw", "cli,rest")
 };
 
 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
@@ -6457,6 +6491,10 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
     ss << "compacted omap in " << duration << " seconds";
   }
 
+  else if (prefix == "smart") {
+    probe_smart(ds);
+  }
+
   else {
     ss << "unrecognized command! " << cmd;
     r = -EINVAL;
@@ -6475,6 +6513,75 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
   }
 }
 
+void OSD::probe_smart(ostream& ss)
+{
+  set<string> devnames;
+  store->get_devices(&devnames);
+  uint64_t smart_timeout = cct->_conf->get_val<uint64_t>("osd_smart_report_timeout");
+  std::string result;
+
+  json_spirit::mObject json_map; // == typedef std::map<std::string, mValue> mObject;
+  json_spirit::mValue smart_json;
+
+  for (auto dev : devnames) {
+      // smartctl works only on physical devices; filter out any logical device
+      if (dev.find("dm-") == 0) {
+         continue;
+      }
+
+      if (probe_smart_device(("/dev/" + dev).c_str(), smart_timeout, &result)) {
+         derr << "probe_smart_device failed for /dev/" << dev << ", continuing to next device"<< dendl;
+         continue;
+      }
+
+      // TODO: change to read_or_throw?
+      if (!json_spirit::read(result, smart_json)) {
+         derr << "smartctl JSON output of /dev/" + dev + " is invalid" << dendl;
+      }
+      else { //json is valid, assigning
+         json_map[dev] = smart_json;
+      }
+      // no need to result.clear() or clear smart_json
+  }
+  json_spirit::write(json_map, ss, json_spirit::pretty_print);
+}
+
+int OSD::probe_smart_device(const char *device, int timeout, std::string *result)
+{
+  // when using --json, smartctl will report its errors in JSON format to stdout 
+  SubProcessTimed smartctl("sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE, timeout);
+  smartctl.add_cmd_args(
+      "smartctl",
+      "-a",
+      //"-x",
+      "--json",
+      device,
+      NULL);
+
+  int ret = smartctl.spawn();
+  if (ret != 0) {
+    derr << "failed run smartctl: " << smartctl.err() << dendl;
+    return ret;
+  }
+
+  bufferlist output;
+  ret = output.read_fd(smartctl.get_stdout(), 100*1024);
+  if (ret < 0) {
+    derr << "failed read from smartctl: " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  derr << "smartctl output is: " << output.c_str() << dendl;
+  *result = output.c_str(); 
+
+  if (smartctl.join() != 0) {
+    derr << smartctl.err() << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
 bool OSD::heartbeat_dispatch(Message *m)
 {
   dout(30) << "heartbeat_dispatch " << m << dendl;
index 07eb73a81a0ce2af614d58c8915ddfd8bdfeef1d..e18fc10f275c855a99045b9ba8262e938afde7bd 100644 (file)
@@ -2349,6 +2349,9 @@ private:
 
   float get_osd_recovery_sleep();
 
+  void probe_smart(ostream& ss);
+  int probe_smart_device(const char *device, int timeout, std::string *result);
+
 public:
   static int peek_meta(ObjectStore *store, string& magic,
                       uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami);
diff --git a/src/pybind/mgr/smart/__init__.py b/src/pybind/mgr/smart/__init__.py
new file mode 100644 (file)
index 0000000..79f5b86
--- /dev/null
@@ -0,0 +1,2 @@
+
+from module import *  # NOQA
diff --git a/src/pybind/mgr/smart/module.py b/src/pybind/mgr/smart/module.py
new file mode 100644 (file)
index 0000000..8b2d54c
--- /dev/null
@@ -0,0 +1,35 @@
+
+"""
+Pulling smart data from OSD
+"""
+
+import json
+from mgr_module import MgrModule, CommandResult
+
+
+class Module(MgrModule):
+    COMMANDS = [
+        {
+            "cmd": "osd smart get "
+                   "name=osd_id,type=CephString,req=true",
+            "desc": "Get smart data for osd.id",
+            "perm": "r"
+        },
+    ]
+
+    def handle_command(self, cmd):
+        self.log.error("handle_command")
+
+        if cmd['prefix'] == 'osd smart get':
+            result = CommandResult('')
+            self.send_command(result, 'osd', cmd['osd_id'], json.dumps({
+                'prefix': 'smart',
+                'format': 'json',
+            }), '')
+            r, outb, outs = result.wait()
+            return (r, outb, outs)
+
+        else:
+            # mgr should respect our self.COMMANDS and not call us for
+            # any prefix we don't advertise
+            raise NotImplementedError(cmd['prefix'])