From 924e18f468cbe6d4b93de8cf989b71059a9c5747 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 Sep 2019 18:06:27 +0200 Subject: [PATCH] krbd: avoid udev netlink socket overrun Because the event(s) we are interested in can be deliveled while we are still in the kernel finishing map or unmap, we start listening for udev events before going into the kernel. However, if (un)mapping takes its time, udev netlink socket can be fairly easily overrun -- the filtering is done on the listener side, so we get to process everything, not just rbd events. If any of the events of interest get dropped (ENOBUFS), we hang in poll(). Go into the kernel in a separate thread and leave the main thread to run the event loop. The return value is communicated to the reactor though a pipe. Fixes: https://tracker.ceph.com/issues/41404 Signed-off-by: Ilya Dryomov (cherry picked from commit 5444a1111523bc100bea60958b2671674f6208ac) --- src/krbd.cc | 175 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 128 insertions(+), 47 deletions(-) diff --git a/src/krbd.cc b/src/krbd.cc index baae7e724962d..41eea2e74fb17 100644 --- a/src/krbd.cc +++ b/src/krbd.cc @@ -23,7 +23,9 @@ #include #include #include +#include #include +#include #include "auth/KeyRing.h" #include "common/errno.h" @@ -33,6 +35,7 @@ #include "common/safe_io.h" #include "common/secret.h" #include "common/TextTable.h" +#include "common/Thread.h" #include "include/ceph_assert.h" #include "include/stringify.h" #include "include/krbd.h" @@ -225,31 +228,69 @@ static int build_map_buf(CephContext *cct, const krbd_spec& spec, return 0; } +/* + * Return: + * - didn't map + * <0 or udev error, true> - mapped + */ template -static int wait_for_mapping(udev_monitor *mon, F udev_device_handler) +static std::pair wait_for_mapping(int sysfs_r_fd, udev_monitor *mon, + F udev_device_handler) { - struct pollfd fds[1]; + struct pollfd fds[2]; + int sysfs_r = INT_MAX, udev_r = INT_MAX; + int r; - fds[0].fd = udev_monitor_get_fd(mon); + fds[0].fd = sysfs_r_fd; fds[0].events = POLLIN; + fds[1].fd = udev_monitor_get_fd(mon); + fds[1].events = POLLIN; for (;;) { - if (poll(fds, 1, -1) < 0) { + if (poll(fds, 2, -1) < 0) { ceph_abort_msgf("poll failed: %d", -errno); } - for (;;) { - struct udev_device *dev; + if (fds[0].revents) { + r = safe_read_exact(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r)); + if (r < 0) { + ceph_abort_msgf("safe_read_exact failed: %d", r); + } + if (sysfs_r < 0) { + return std::make_pair(sysfs_r, false); + } + if (udev_r != INT_MAX) { + ceph_assert(!sysfs_r); + return std::make_pair(udev_r, true); + } + fds[0].fd = -1; + } - dev = udev_monitor_receive_device(mon); - if (!dev) { - if (errno != EINTR && errno != EAGAIN) { - return -errno; + if (fds[1].revents) { + for (;;) { + struct udev_device *dev; + + dev = udev_monitor_receive_device(mon); + if (!dev) { + if (errno != EINTR && errno != EAGAIN) { + udev_r = -errno; + if (sysfs_r != INT_MAX) { + ceph_assert(!sysfs_r); + return std::make_pair(udev_r, true); + } + fds[1].fd = -1; + } + break; + } + if (udev_device_handler(dev)) { + udev_r = 0; + if (sysfs_r != INT_MAX) { + ceph_assert(!sysfs_r); + return std::make_pair(udev_r, true); + } + fds[1].fd = -1; + break; } - break; - } - if (udev_device_handler(dev)) { - return 0; } } } @@ -330,6 +371,9 @@ static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf, string *pname) { struct udev_monitor *mon; + std::thread mapper; + bool mapped; + int fds[2]; int r; mon = udev_monitor_new_from_netlink(udev, "udev"); @@ -348,18 +392,34 @@ static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf, if (r < 0) goto out_mon; - r = sysfs_write_rbd_add(buf); - if (r < 0) { - cerr << "rbd: sysfs write failed" << std::endl; + if (pipe2(fds, O_NONBLOCK) < 0) { + r = -errno; goto out_mon; } - r = wait_for_mapping(mon, UdevMapHandler(&spec, pname)); + mapper = make_named_thread("mapper", [&buf, sysfs_r_fd = fds[1]]() { + int sysfs_r = sysfs_write_rbd_add(buf); + int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r)); + if (r < 0) { + ceph_abort_msgf("safe_write failed: %d", r); + } + }); + + std::tie(r, mapped) = wait_for_mapping(fds[0], mon, + UdevMapHandler(&spec, pname)); if (r < 0) { - cerr << "rbd: wait failed" << std::endl; - goto out_mon; + if (!mapped) { + std::cerr << "rbd: sysfs write failed" << std::endl; + } else { + std::cerr << "rbd: udev wait failed" << std::endl; + /* TODO: fall back to enumeration */ + } } + mapper.join(); + close(fds[0]); + close(fds[1]); + out_mon: udev_monitor_unref(mon); return r; @@ -623,6 +683,9 @@ private: static int do_unmap(struct udev *udev, dev_t devno, const string& buf) { struct udev_monitor *mon; + std::thread unmapper; + bool unmapped; + int fds[2]; int r; mon = udev_monitor_new_from_netlink(udev, "udev"); @@ -637,40 +700,58 @@ static int do_unmap(struct udev *udev, dev_t devno, const string& buf) if (r < 0) goto out_mon; - /* - * On final device close(), kernel sends a block change event, in - * response to which udev apparently runs blkid on the device. This - * makes unmap fail with EBUSY, if issued right after final close(). - * Try to circumvent this with a retry before turning to udev. - */ - for (int tries = 0; ; tries++) { - r = sysfs_write_rbd_remove(buf); - if (r >= 0) { - break; - } else if (r == -EBUSY && tries < 2) { - if (!tries) { - usleep(250 * 1000); + if (pipe2(fds, O_NONBLOCK) < 0) { + r = -errno; + goto out_mon; + } + + unmapper = make_named_thread("unmapper", [&buf, sysfs_r_fd = fds[1]]() { + /* + * On final device close(), kernel sends a block change event, in + * response to which udev apparently runs blkid on the device. This + * makes unmap fail with EBUSY, if issued right after final close(). + * Try to circumvent this with a retry before turning to udev. + */ + for (int tries = 0; ; tries++) { + int sysfs_r = sysfs_write_rbd_remove(buf); + if (sysfs_r == -EBUSY && tries < 2) { + if (!tries) { + usleep(250 * 1000); + } else { + /* + * libudev does not provide the "wait until the queue is empty" + * API or the sufficient amount of primitives to build it from. + */ + std::string err = run_cmd("udevadm", "settle", "--timeout", "10", + (char *)NULL); + if (!err.empty()) + std::cerr << "rbd: " << err << std::endl; + } } else { - /* - * libudev does not provide the "wait until the queue is empty" - * API or the sufficient amount of primitives to build it from. - */ - string err = run_cmd("udevadm", "settle", "--timeout", "10", (char*)NULL); - if (!err.empty()) - cerr << "rbd: " << err << std::endl; + int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r)); + if (r < 0) { + ceph_abort_msgf("safe_write failed: %d", r); + } + break; } - } else { - cerr << "rbd: sysfs write failed" << std::endl; - goto out_mon; } - } + }); - r = wait_for_mapping(mon, UdevUnmapHandler(devno)); + std::tie(r, unmapped) = wait_for_mapping(fds[0], mon, + UdevUnmapHandler(devno)); if (r < 0) { - cerr << "rbd: wait failed" << std::endl; - goto out_mon; + if (!unmapped) { + std::cerr << "rbd: sysfs write failed" << std::endl; + } else { + std::cerr << "rbd: udev wait failed: " << cpp_strerror(r) << std::endl; + r = 0; + } } + unmapper.join(); + close(fds[0]); + close(fds[1]); + out_mon: udev_monitor_unref(mon); return r; -- 2.39.5