From: Ilya Dryomov Date: Mon, 7 Oct 2019 13:32:39 +0000 (+0200) Subject: krbd: retry on transient errors from udev_enumerate_scan_devices() X-Git-Tag: v12.2.13~28^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d145fe15cef905ba1d75d6b900f87b1145d7f887;p=ceph.git krbd: retry on transient errors from udev_enumerate_scan_devices() udev_enumerate_scan_devices() doesn't handle disappearing devices well. If called while some devices are being removed, it sometimes propagates ENOENT and ENODEV errors encountered while operating on directory entries in /sys that no longer exist. Some of these errors are suppressed, but this isn't reliable and varies across versions. In particular, systemd 239 suppresses ENODEV from sd_device_new_from_syspath() but doesn't suppress ENODEV from sd_device_get_devnum(). In systemd 243 the call to sd_device_get_devnum() has been moved, but it still leaks ENOENT from sd_device_get_is_initialized() (referring to the body of the FOREACH_DIRENT_ALL loop in enumerator_scan_dir_and_add_devices()). Assume that all ENOENT and ENODEV errors are transient and retry the call to udev_enumerate_scan_devices(). Don't limit the number of retries, but log each one. 
Fixes: https://tracker.ceph.com/issues/41036 Signed-off-by: Ilya Dryomov (cherry picked from commit e5921ef4a89f497a0bff6510fce0bb5c242d6172) --- diff --git a/src/krbd.cc b/src/krbd.cc index c35f15ac454..a2230b4a2a6 100644 --- a/src/krbd.cc +++ b/src/krbd.cc @@ -434,6 +434,7 @@ static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid) struct udev_device *dev; int r; +retry: enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; @@ -455,8 +456,14 @@ static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid) } r = udev_enumerate_scan_devices(enm); - if (r < 0) + if (r < 0) { + if (r == -ENOENT || r == -ENODEV) { + std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; + udev_enumerate_unref(enm); + goto retry; + } goto out_enm; + } l = udev_enumerate_get_list_entry(enm); if (!l) { @@ -492,6 +499,7 @@ static int spec_to_devno_and_krbd_id(struct udev *udev, const char *pool, string err; int r; +retry: enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; @@ -513,8 +521,14 @@ static int spec_to_devno_and_krbd_id(struct udev *udev, const char *pool, goto out_enm; r = udev_enumerate_scan_devices(enm); - if (r < 0) + if (r < 0) { + if (r == -ENOENT || r == -ENODEV) { + std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; + udev_enumerate_unref(enm); + goto retry; + } goto out_enm; + } l = udev_enumerate_get_list_entry(enm); if (!l) { @@ -771,6 +785,7 @@ static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl) bool have_output = false; int r; +retry: enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; @@ -780,8 +795,14 @@ static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl) goto out_enm; r = udev_enumerate_scan_devices(enm); - if (r < 0) + if (r < 0) { + if (r == -ENOENT || r == -ENODEV) { + std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; + udev_enumerate_unref(enm); + goto retry; + } goto out_enm; + } udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm)) { 
struct udev_device *dev;