From: Ilya Dryomov Date: Mon, 7 Oct 2019 13:32:39 +0000 (+0200) Subject: krbd: retry on transient errors from udev_enumerate_scan_devices() X-Git-Tag: v13.2.7~7^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=342447e90e32fbe46ce738f55d39ad3f70f5dff5;p=ceph.git krbd: retry on transient errors from udev_enumerate_scan_devices() udev_enumerate_scan_devices() doesn't handle disappearing devices well. If called while some devices are being removed, it sometimes propagates ENOENT and ENODEV errors encountered while operating on directory entries in /sys that no longer exist. Some of these errors are suppressed, but this isn't reliable and varies across versions. In particular, systemd 239 suppresses ENODEV from sd_device_new_from_syspath() but doesn't suppress ENODEV from sd_device_get_devnum(). In systemd 243 the call to sd_device_get_devnum() has been moved, but it still leaks ENOENT from sd_device_get_is_initialized() (referring to the body of the FOREACH_DIRENT_ALL loop in enumerator_scan_dir_and_add_devices()). Assume that all ENOENT and ENODEV errors are transient and retry the call to udev_enumerate_scan_devices(). Don't limit the number of retries, but log each retry. 
Fixes: https://tracker.ceph.com/issues/41036 Signed-off-by: Ilya Dryomov (cherry picked from commit e5921ef4a89f497a0bff6510fce0bb5c242d6172) Conflicts: src/krbd.cc [ rbd namespaces not in mimic ] --- diff --git a/src/krbd.cc b/src/krbd.cc index 4fd4ced4416..c87980f26f1 100644 --- a/src/krbd.cc +++ b/src/krbd.cc @@ -434,6 +434,7 @@ static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid) struct udev_device *dev; int r; +retry: enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; @@ -455,8 +456,14 @@ static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid) } r = udev_enumerate_scan_devices(enm); - if (r < 0) + if (r < 0) { + if (r == -ENOENT || r == -ENODEV) { + std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; + udev_enumerate_unref(enm); + goto retry; + } goto out_enm; + } l = udev_enumerate_get_list_entry(enm); if (!l) { @@ -520,13 +527,20 @@ static int spec_to_devno_and_krbd_id(struct udev *udev, const char *pool, string err; int r; +retry: enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; r = enumerate_devices(enm, pool, image, snap); - if (r < 0) + if (r < 0) { + if (r == -ENOENT || r == -ENODEV) { + std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; + udev_enumerate_unref(enm); + goto retry; + } goto out_enm; + } l = udev_enumerate_get_list_entry(enm); if (!l) { @@ -783,6 +797,7 @@ static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl) bool have_output = false; int r; +retry: enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; @@ -792,8 +807,14 @@ static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl) goto out_enm; r = udev_enumerate_scan_devices(enm); - if (r < 0) + if (r < 0) { + if (r == -ENOENT || r == -ENODEV) { + std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; + udev_enumerate_unref(enm); + goto retry; + } goto out_enm; + } udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm)) { struct udev_device *dev; @@ -849,13 
+870,20 @@ static int is_mapped_image(struct udev *udev, const char *pool, if (strcmp(snap, "") == 0) snap = "-"; +retry: enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; r = enumerate_devices(enm, pool, image, snap); - if (r < 0) + if (r < 0) { + if (r == -ENOENT || r == -ENODEV) { + std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; + udev_enumerate_unref(enm); + goto retry; + } goto out_enm; + } l = udev_enumerate_get_list_entry(enm); if (l) {