]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: OSD device smart data include additional nvme data 25672/head
authorhsiang41 <rick.chen@prophetstor.com>
Fri, 28 Dec 2018 09:07:32 +0000 (17:07 +0800)
committerSage Weil <sage@redhat.com>
Fri, 4 Jan 2019 00:10:26 +0000 (18:10 -0600)
Add additional nvme data to the device health data. This uses the nvme tool
with the command syntax "nvme <vendor> smart-log-add --json <dev>". The nvme
json output is appended to the device smart data under the key
"nvme_smart_health_information_add_log".

- made run_smartctl static/private
- changed get_metrics to take a const string, not c str

Signed-off-by: Rick Chen <rick.chen@prophetstor.com>
Signed-off-by: Sage Weil <sage@redhat.com>
src/common/blkdev.cc
src/common/blkdev.h
src/mon/Monitor.cc
src/osd/OSD.cc
sudoers.d/ceph-osd-smartctl

index 0ed7740b903a83c1c7856e4145f51915ef697d3f..bc5be07059fc17d33628fbf5462ad51ac4c06b13 100644 (file)
@@ -522,32 +522,121 @@ std::string get_device_id(const std::string& devname,
   return device_id;
 }
 
-int block_device_get_metrics(const char *device, int timeout,
-                            json_spirit::mValue *result)
+static std::string get_device_vendor(const std::string& devname)
 {
-  std::string s;
-  if (int r = block_device_run_smartctl(device, timeout, &s); r != 0) {
-    s = "{\"error\": \"smartctl failed\", \"dev\": \"";
-    s += device;
-    s += "\", \"smartctl_error_code\": " + stringify(r);
-    s += "\", \"smartctl_output\": \"" + s;
-    s += + "\"}";
+  struct udev_device *dev;
+  static struct udev *udev;
+  const char *data;
+
+  udev = udev_new();
+  if (!udev) {
+    return {};
   }
-  if (json_spirit::read(s, *result)) {
-    return 0;
+  dev = udev_device_new_from_subsystem_sysname(udev, "block", devname.c_str());
+  if (!dev) {
+    udev_unref(udev);
+    return {};
   }
-  s = "{\"error\": \"smartctl returned invalid JSON\", \"dev\": \"";
-  s += device;
-  s += "\"}";
-  if (json_spirit::read(s, *result)) {
-    return 0;
+
+  std::string id_vendor, id_model;
+  data = udev_device_get_property_value(dev, "ID_VENDOR");
+  if (data) {
+    id_vendor = data;
+  }
+  data = udev_device_get_property_value(dev, "ID_MODEL");
+  if (data) {
+    id_model = data;
+  }
+  udev_device_unref(dev);
+  udev_unref(udev);
+
+  std::transform(id_vendor.begin(), id_vendor.end(), id_vendor.begin(),
+                ::tolower);
+  std::transform(id_model.begin(), id_model.end(), id_model.begin(),
+                ::tolower);
+
+  if (id_vendor.size()) {
+    return id_vendor;
+  }
+  if (id_model.size()) {
+    int pos = id_model.find(" ");
+    if (pos > 0) {
+      return id_model.substr(0, pos);
+    } else {
+      return id_model;
+    }
+  }
+
+  std::string vendor, model;
+  char buf[1024] = {0};
+  BlkDev blkdev(devname);
+  if (!blkdev.vendor(buf, sizeof(buf))) {
+    vendor = buf;
   }
-  return -EINVAL;
+  if (!blkdev.model(buf, sizeof(buf))) {
+    model = buf;
+  }
+  if (vendor.size()) {
+    return vendor;
+  }
+  if (model.size()) {
+     int pos = model.find(" ");
+    if (pos > 0) {
+      return model.substr(0, pos);
+    } else {
+      return model;
+    }
+  }
+
+  return {};
 }
 
-int block_device_run_smartctl(const char *device, int timeout,
-                             std::string *result)
+static int block_device_run_vendor_nvme(
+  const string& devname, const string& vendor, int timeout,
+  std::string *result)
+{
+  string device = "/dev/" + devname;
+
+  SubProcessTimed nvmecli(
+    "sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE,
+    timeout);
+  nvmecli.add_cmd_args(
+    "nvme",
+    vendor.c_str(),
+    "smart-log-add",
+    "--json",
+    device.c_str(),
+    NULL);
+  int ret = nvmecli.spawn();
+  if (ret != 0) {
+    *result = std::string("error spawning nvme command: ") + nvmecli.err();
+    return ret;
+  }
+
+  bufferlist output;
+  ret = output.read_fd(nvmecli.get_stdout(), 100*1024);
+  if (ret < 0) {
+    bufferlist err;
+    err.read_fd(nvmecli.get_stderr(), 100 * 1024);
+    *result = std::string("failed to execute nvme: ") + err.to_str();
+  } else {
+    ret = 0;
+    *result = output.to_str();
+  }
+
+  if (nvmecli.join() != 0) {
+    *result = std::string("nvme returned an error: ") + nvmecli.err();
+    return -EINVAL;
+  }
+
+  return ret;
+}
+
+static int block_device_run_smartctl(const string& devname, int timeout,
+                                    std::string *result)
 {
+  string device = "/dev/" + devname;
+
   // when using --json, smartctl will report its errors in JSON format to stdout 
   SubProcessTimed smartctl(
     "sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE,
@@ -557,7 +646,7 @@ int block_device_run_smartctl(const char *device, int timeout,
     "-a",
     //"-x",
     "--json",
-    device,
+    device.c_str(),
     NULL);
 
   int ret = smartctl.spawn();
@@ -583,6 +672,53 @@ int block_device_run_smartctl(const char *device, int timeout,
   return ret;
 }
 
+int block_device_get_metrics(const string& devname, int timeout,
+                            json_spirit::mValue *result)
+{
+  std::string s;
+
+  // smartctl
+  if (int r = block_device_run_smartctl(devname, timeout, &s);
+      r != 0) {
+    s = "{\"error\": \"smartctl failed\", \"dev\": \"/dev/";
+    s += devname;
+    s += "\", \"smartctl_error_code\": " + stringify(r);
+    s += "\", \"smartctl_output\": \"" + s;
+    s += + "\"}";
+  }
+  if (!json_spirit::read(s, *result)) {
+    s = "{\"error\": \"smartctl returned invalid JSON\", \"dev\": \"/dev/";
+    s += devname;
+    s += "\"}";
+  }
+  if (!json_spirit::read(s, *result)) {
+    return -EINVAL;
+  }
+
+  json_spirit::mObject& base = result->get_obj();
+  string vendor = get_device_vendor(devname);
+  if (vendor.size()) {
+    base["nvme_vendor"] = vendor;
+    s.clear();
+    json_spirit::mValue nvme_json;
+    if (int r = block_device_run_vendor_nvme(devname, vendor, timeout, &s);
+       r == 0) {
+      if (json_spirit::read(s, nvme_json) != 0) {
+       base["nvme_smart_health_information_add_log"] = nvme_json;
+      } else {
+       base["nvme_smart_health_information_add_log_error"] = "bad json output: "
+         + s;
+      }
+    } else {
+      base["nvme_smart_health_information_add_log_error_code"] = r;
+      base["nvme_smart_health_information_add_log_error"] = s;
+    }
+  } else {
+    base["nvme_vendor"] = "unknown";
+  }
+
+  return 0;
+}
 
 #elif defined(__APPLE__)
 #include <sys/disk.h>
@@ -865,6 +1001,12 @@ int block_device_run_smartctl(const char *device, int timeout,
   return -EOPNOTSUPP;  
 }
 
+int block_device_run_nvme(const char *device, const char *vendor, int timeout,
+             std::string *result)
+{
+  return -EOPNOTSUPP;
+}
+
 static int block_device_devname(int fd, char *devname, size_t max)
 {
   struct fiodgname_arg arg;
@@ -1000,4 +1142,10 @@ int block_device_run_smartctl(const char *device, int timeout,
   return -EOPNOTSUPP;
 }
 
+int block_device_run_nvme(const char *device, const char *vendor, int timeout,
+            std::string *result)
+{
+  return -EOPNOTSUPP;
+}
+
 #endif
index 7dc3e422fc3c6bbb40ddc41e8061f17b1f5d5889..ec745f1926450a7bc8a6c5e054f6b5008818e72b 100644 (file)
@@ -24,9 +24,7 @@ extern int get_device_by_path(const char *path, char* partition, char* device, s
 extern std::string get_device_id(const std::string& devname,
                                 std::string *err=0);
 extern void get_dm_parents(const std::string& dev, std::set<std::string> *ls);
-extern int block_device_run_smartctl(const char *device, int timeout,
-                                    std::string *result);
-extern int block_device_get_metrics(const char *device, int timeout,
+extern int block_device_get_metrics(const string& devname, int timeout,
                                    json_spirit::mValue *result);
 
 // do everything to translate a device to the raw physical devices that
index 468e44baced3b4426aa3eddc184b238f8ce1faf2..2947a81a7f5af016a3d9d12d98f237da9c44a796 100644 (file)
@@ -3660,7 +3660,7 @@ void Monitor::handle_command(MonOpRequestRef op)
        continue;
       }
       json_spirit::mValue smart_json;
-      if (block_device_get_metrics(("/dev/" + devname).c_str(), smart_timeout,
+      if (block_device_get_metrics(devname, smart_timeout,
                                   &smart_json)) {
        dout(10) << "block_device_get_metrics failed for /dev/" << devname
                 << dendl;
index db30fb3686c3188de557e293d3314a71cbb25db3..d116b1c736172d10491ec41461fdde7d0ac4c4cb 100644 (file)
@@ -6635,7 +6635,7 @@ void OSD::probe_smart(const string& only_devid, ostream& ss)
     }
 
     json_spirit::mValue smart_json;
-    if (block_device_get_metrics(("/dev/" + dev).c_str(), smart_timeout,
+    if (block_device_get_metrics(dev, smart_timeout,
                                 &smart_json)) {
       dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
       continue;
index a5cdb536398e91ccdadf1adca7957af30dc908fc..ba788d6c0be4f1bca11c4e498d6fb4b8da6882ec 100644 (file)
@@ -1,3 +1,4 @@
 ## allow ceph-osd (which runs as user ceph) to collect device health metrics
 
 ceph ALL=NOPASSWD: /usr/sbin/smartctl -a --json /dev/*
+ceph ALL=NOPASSWD: /usr/sbin/nvme * smart-log-add --json /dev/*