From: hsiang41 Date: Fri, 28 Dec 2018 09:07:32 +0000 (+0800) Subject: osd: OSD device smart data include additional nvme data X-Git-Tag: v14.1.0~499^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2da890117c130d5d6b288cefea98f06c35f6ad76;p=ceph.git osd: OSD device smart data include additional nvme data Add nvme addition data into the deveh health data. That use nvme tool and command syntax "nvme smart-log-add -json". The nvme json output append in the dev smart "nvme_smart_health_information_add_log". - made run_smartctl static/private - changed get_metrics to take a const string, not c str Signed-off-by: Rick Chen Signed-off-by: Sage Weil --- diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc index 0ed7740b903a..bc5be07059fc 100644 --- a/src/common/blkdev.cc +++ b/src/common/blkdev.cc @@ -522,32 +522,121 @@ std::string get_device_id(const std::string& devname, return device_id; } -int block_device_get_metrics(const char *device, int timeout, - json_spirit::mValue *result) +static std::string get_device_vendor(const std::string& devname) { - std::string s; - if (int r = block_device_run_smartctl(device, timeout, &s); r != 0) { - s = "{\"error\": \"smartctl failed\", \"dev\": \""; - s += device; - s += "\", \"smartctl_error_code\": " + stringify(r); - s += "\", \"smartctl_output\": \"" + s; - s += + "\"}"; + struct udev_device *dev; + static struct udev *udev; + const char *data; + + udev = udev_new(); + if (!udev) { + return {}; } - if (json_spirit::read(s, *result)) { - return 0; + dev = udev_device_new_from_subsystem_sysname(udev, "block", devname.c_str()); + if (!dev) { + udev_unref(udev); + return {}; } - s = "{\"error\": \"smartctl returned invalid JSON\", \"dev\": \""; - s += device; - s += "\"}"; - if (json_spirit::read(s, *result)) { - return 0; + + std::string id_vendor, id_model; + data = udev_device_get_property_value(dev, "ID_VENDOR"); + if (data) { + id_vendor = data; + } + data = udev_device_get_property_value(dev, "ID_MODEL"); + if (data) { + id_model = data; + } + udev_device_unref(dev); + udev_unref(udev); + + std::transform(id_vendor.begin(), id_vendor.end(), id_vendor.begin(), + ::tolower); + std::transform(id_model.begin(), id_model.end(), id_model.begin(), + ::tolower); + + if (id_vendor.size()) { + return id_vendor; + } + if (id_model.size()) { + int pos = id_model.find(" "); + if (pos > 0) { + return id_model.substr(0, pos); + } else { + return id_model; + } + } + + std::string vendor, model; + char buf[1024] = {0}; + BlkDev blkdev(devname); + if (!blkdev.vendor(buf, sizeof(buf))) { + vendor = buf; } - return -EINVAL; + if (!blkdev.model(buf, sizeof(buf))) { + model = buf; + } + if (vendor.size()) { + return vendor; + } + if (model.size()) { + int pos = model.find(" "); + if (pos > 0) { + return model.substr(0, pos); + } else { + return model; + } + } + + return {}; } -int block_device_run_smartctl(const char *device, int timeout, - std::string *result) +static int block_device_run_vendor_nvme( + const string& devname, const string& vendor, int timeout, + std::string *result) +{ + string device = "/dev/" + devname; + + SubProcessTimed nvmecli( + "sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE, + timeout); + nvmecli.add_cmd_args( + "nvme", + vendor.c_str(), + "smart-log-add", + "--json", + device.c_str(), + NULL); + int ret = nvmecli.spawn(); + if (ret != 0) { + *result = std::string("error spawning nvme command: ") + nvmecli.err(); + return ret; + } + + bufferlist output; + ret = output.read_fd(nvmecli.get_stdout(), 100*1024); + if (ret < 0) { + bufferlist err; + err.read_fd(nvmecli.get_stderr(), 100 * 1024); + *result = std::string("failed to execute nvme: ") + err.to_str(); + } else { + ret = 0; + *result = output.to_str(); + } + + if (nvmecli.join() != 0) { + *result = std::string("nvme returned an error: ") + nvmecli.err(); + return -EINVAL; + } + + return ret; +} + +static int block_device_run_smartctl(const string& devname, int timeout, + std::string *result) { + string device = "/dev/" + devname; + // when using --json, smartctl will report its errors in JSON format to stdout SubProcessTimed smartctl( "sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE, @@ -557,7 +646,7 @@ int block_device_run_smartctl(const char *device, int timeout, "-a", //"-x", "--json", - device, + device.c_str(), NULL); int ret = smartctl.spawn(); @@ -583,6 +672,53 @@ int block_device_run_smartctl(const char *device, int timeout, return ret; } +int block_device_get_metrics(const string& devname, int timeout, + json_spirit::mValue *result) +{ + std::string s; + + // smartctl + if (int r = block_device_run_smartctl(devname, timeout, &s); + r != 0) { + s = "{\"error\": \"smartctl failed\", \"dev\": \"/dev/"; + s += devname; + s += "\", \"smartctl_error_code\": " + stringify(r); + s += "\", \"smartctl_output\": \"" + s; + s += + "\"}"; + } + if (!json_spirit::read(s, *result)) { + s = "{\"error\": \"smartctl returned invalid JSON\", \"dev\": \"/dev/"; + s += devname; + s += "\"}"; + } + if (!json_spirit::read(s, *result)) { + return -EINVAL; + } + + json_spirit::mObject& base = result->get_obj(); + string vendor = get_device_vendor(devname); + if (vendor.size()) { + base["nvme_vendor"] = vendor; + s.clear(); + json_spirit::mValue nvme_json; + if (int r = block_device_run_vendor_nvme(devname, vendor, timeout, &s); + r == 0) { + if (json_spirit::read(s, nvme_json) != 0) { + base["nvme_smart_health_information_add_log"] = nvme_json; + } else { + base["nvme_smart_health_information_add_log_error"] = "bad json output: " + + s; + } + } else { + base["nvme_smart_health_information_add_log_error_code"] = r; + base["nvme_smart_health_information_add_log_error"] = s; + } + } else { + base["nvme_vendor"] = "unknown"; + } + + return 0; +} #elif defined(__APPLE__) #include @@ -865,6 +1001,12 @@ int block_device_run_smartctl(const char *device, int timeout, return -EOPNOTSUPP; } +int block_device_run_nvme(const char *device, const char *vendor, int timeout, + std::string *result) +{ + return -EOPNOTSUPP; +} + static int block_device_devname(int fd, char *devname, size_t max) { struct fiodgname_arg arg; @@ -1000,4 +1142,10 @@ int block_device_run_smartctl(const char *device, int timeout, return -EOPNOTSUPP; } +int block_device_run_nvme(const char *device, const char *vendor, int timeout, + std::string *result) +{ + return -EOPNOTSUPP; +} + #endif diff --git a/src/common/blkdev.h b/src/common/blkdev.h index 7dc3e422fc3c..ec745f192645 100644 --- a/src/common/blkdev.h +++ b/src/common/blkdev.h @@ -24,9 +24,7 @@ extern int get_device_by_path(const char *path, char* partition, char* device, s extern std::string get_device_id(const std::string& devname, std::string *err=0); extern void get_dm_parents(const std::string& dev, std::set *ls); -extern int block_device_run_smartctl(const char *device, int timeout, - std::string *result); -extern int block_device_get_metrics(const char *device, int timeout, +extern int block_device_get_metrics(const string& devname, int timeout, json_spirit::mValue *result); // do everything to translate a device to the raw physical devices that diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 468e44baced3..2947a81a7f5a 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3660,7 +3660,7 @@ void Monitor::handle_command(MonOpRequestRef op) continue; } json_spirit::mValue smart_json; - if (block_device_get_metrics(("/dev/" + devname).c_str(), smart_timeout, + if (block_device_get_metrics(devname, smart_timeout, &smart_json)) { dout(10) << "block_device_get_metrics failed for /dev/" << devname << dendl; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index db30fb3686c3..d116b1c73617 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6635,7 +6635,7 @@ void OSD::probe_smart(const string& only_devid, ostream& ss) } json_spirit::mValue smart_json; - if (block_device_get_metrics(("/dev/" + dev).c_str(), smart_timeout, + if (block_device_get_metrics(dev, smart_timeout, &smart_json)) { dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl; continue; diff --git a/sudoers.d/ceph-osd-smartctl b/sudoers.d/ceph-osd-smartctl index a5cdb536398e..ba788d6c0be4 100644 --- a/sudoers.d/ceph-osd-smartctl +++ b/sudoers.d/ceph-osd-smartctl @@ -1,3 +1,4 @@ ## allow ceph-osd (which runs as user ceph) to collect device health metrics ceph ALL=NOPASSWD: /usr/sbin/smartctl -a --json /dev/* +ceph ALL=NOPASSWD: /usr/sbin/nvme * smart-log-add --json /dev/*