From: Ilya Dryomov Date: Mon, 24 Aug 2020 17:01:46 +0000 (+0200) Subject: krbd: optionally skip waiting for udev events X-Git-Tag: v17.0.0~1057^2~1 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=2e657d134a1af489dcf623d207be259ce3dda9bc;p=ceph.git krbd: optionally skip waiting for udev events Add support for noudev option to allow mapping and unmapping images from a privileged container in a non-initial network namespace (e.g. when using Multus CNI). Fixes: https://tracker.ceph.com/issues/47128 Signed-off-by: Ilya Dryomov --- diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst index 52a684fcdfe7a..b81b6a7087d1f 100644 --- a/doc/man/8/rbd.rst +++ b/doc/man/8/rbd.rst @@ -843,12 +843,25 @@ Per mapping (block device) `rbd device map` options: backend that the data is incompressible, disabling compression in aggressive mode (since 5.8). +* udev - Wait for udev device manager to finish executing all matching + "add" rules and release the device before exiting (default). This option + is not passed to the kernel. + +* noudev - Don't wait for udev device manager. When enabled, the device may + not be fully usable immediately on exit. + `rbd device unmap` options: * force - Force the unmapping of a block device that is open (since 4.9). The driver will wait for running requests to complete and then unmap; requests sent to the driver after initiating the unmap will be failed. +* udev - Wait for udev device manager to finish executing all matching + "remove" rules and clean up after the device before exiting (default). + This option is not passed to the kernel. + +* noudev - Don't wait for udev device manager. + Examples ======== diff --git a/src/include/krbd.h b/src/include/krbd.h index 2c63bc7ea54aa..977d45fe2e360 100644 --- a/src/include/krbd.h +++ b/src/include/krbd.h @@ -15,13 +15,46 @@ #include "rados/librados.h" +/* + * Don't wait for udev add uevents in krbd_map() and udev remove + * uevents in krbd_unmap*(). Instead, make do with the respective + * kernel uevents and return as soon as they are received. + * + * systemd-udevd sends out udev uevents after it finishes processing + * the respective kernel uevents, which mostly boils down to executing + * all matching udev rules. With this flag set, on return from + * krbd_map() systemd-udevd may still be poking at the device: it + * may still be open with tools such as blkid and various ioctls to + * be run against it, none of the persistent symlinks to the device + * node may be there, etc. udev used to be responsible for creating + * the device node as well, but that has been handled by devtmpfs in + * the kernel for many years now, so the device node (as returned + * through @pdevnode) is guaranteed to be there. + * + * If set, krbd_map() and krbd_unmap*() can be invoked from any + * network namespace that is owned by the initial user namespace + * (which is a formality because things like loading kernel modules + * and creating block devices are not namespaced and require global + * privileges, i.e. capabilities in the initial user namespace). + * Otherwise, krbd_map() and krbd_unmap*() must be invoked from + * the initial network namespace. + * + * If set, krbd_unmap*() doesn't attempt to settle the udev queue + * before retrying unmap for the last time. Some EBUSY errors due + * to systemd-udevd poking at the device at the time krbd_unmap*() + * is invoked that are otherwise covered by the retry logic may be + * returned. + */ +#define KRBD_CTX_F_NOUDEV (1U << 0) + #ifdef __cplusplus extern "C" { #endif struct krbd_ctx; -int krbd_create_from_context(rados_config_t cct, struct krbd_ctx **pctx); +int krbd_create_from_context(rados_config_t cct, uint32_t flags, + struct krbd_ctx **pctx); void krbd_destroy(struct krbd_ctx *ctx); int krbd_map(struct krbd_ctx *ctx, diff --git a/src/krbd.cc b/src/krbd.cc index 0019f6f5fec80..0d463dea52e92 100644 --- a/src/krbd.cc +++ b/src/krbd.cc @@ -66,6 +66,7 @@ DEFINE_UDEV_UPTR(device) /* udev_device_uptr */ struct krbd_ctx { CephContext *cct; struct udev *udev; + uint32_t flags; /* KRBD_CTX_F_* */ }; struct krbd_spec { @@ -380,14 +381,47 @@ private: std::string *m_pdevnode; }; -static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf, +static const char *get_event_source(const krbd_ctx *ctx) +{ + if (ctx->flags & KRBD_CTX_F_NOUDEV) { + /* + * For block devices (unlike network interfaces, they don't + * carry any namespace tags), the kernel broadcasts uevents + * into all network namespaces that are owned by the initial + * user namespace. This restriction is new in 4.18: starting + * with 2.6.35 and through 4.17 the kernel broadcast uevents + * into all network namespaces, period. + * + * However, when invoked from a non-initial user namespace, + * udev_monitor_receive_device() has always ignored both kernel + * and udev uevents by virtue of requiring SCM_CREDENTIALS and + * checking that ucred->uid == 0. When UIDs and GIDs are sent to + * a process in a user namespace, they are translated according + * to that process's UID and GID mappings and, unless root in the + * user namespace is mapped to the global root, that check fails. + * Normally they show up as 65534(nobody) because the global root + * is not mapped. + */ + return "kernel"; + } + + /* + * Like most netlink messages, udev uevents don't cross network + * namespace boundaries and are therefore confined to the initial + * network namespace. + */ + return "udev"; +} + +static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf, string *pname) { bool mapped; int fds[2]; int r; - udev_monitor_uptr mon(udev_monitor_new_from_netlink(udev, "udev")); + udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev, + get_event_source(ctx))); if (!mon) return -ENOMEM; @@ -473,7 +507,7 @@ static int map_image(struct krbd_ctx *ctx, const krbd_spec& spec, if (r < 0) return r; - return do_map(ctx->udev, spec, buf, pname); + return do_map(ctx, spec, buf, pname); } static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid) @@ -681,13 +715,14 @@ private: dev_t m_devno; }; -static int do_unmap(struct udev *udev, dev_t devno, const string& buf) +static int do_unmap(krbd_ctx *ctx, dev_t devno, const string& buf) { bool unmapped; int fds[2]; int r; - udev_monitor_uptr mon(udev_monitor_new_from_netlink(udev, "udev")); + udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev, + get_event_source(ctx))); if (!mon) return -ENOMEM; @@ -710,7 +745,8 @@ static int do_unmap(struct udev *udev, dev_t devno, const string& buf) if (pipe2(fds, O_NONBLOCK) < 0) return -errno; - auto unmapper = make_named_thread("unmapper", [&buf, sysfs_r_fd = fds[1]]() { + auto unmapper = make_named_thread( + "unmapper", [&buf, sysfs_r_fd = fds[1], flags = ctx->flags]() { /* * On final device close(), kernel sends a block change event, in * response to which udev apparently runs blkid on the device. This @@ -722,7 +758,7 @@ static int do_unmap(struct udev *udev, dev_t devno, const string& buf) if (sysfs_r == -EBUSY && tries < 2) { if (!tries) { usleep(250 * 1000); - } else { + } else if (!(flags & KRBD_CTX_F_NOUDEV)) { /* * libudev does not provide the "wait until the queue is empty" * API or the sufficient amount of primitives to build it from. @@ -805,7 +841,7 @@ static int unmap_image(struct krbd_ctx *ctx, const char *devnode, } append_unmap_options(&buf, options); - return do_unmap(ctx->udev, wholedevno, buf); + return do_unmap(ctx, wholedevno, buf); } static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec, @@ -837,7 +873,7 @@ static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec, } append_unmap_options(&buf, options); - return do_unmap(ctx->udev, devno, buf); + return do_unmap(ctx, devno, buf); } static bool dump_one_image(Formatter *f, TextTable *tbl, @@ -955,7 +991,7 @@ static int is_mapped_image(struct udev *udev, const krbd_spec& spec, return 0; /* not mapped */ } -extern "C" int krbd_create_from_context(rados_config_t cct, +extern "C" int krbd_create_from_context(rados_config_t cct, uint32_t flags, struct krbd_ctx **pctx) { struct krbd_ctx *ctx = new struct krbd_ctx(); @@ -966,6 +1002,7 @@ extern "C" int krbd_create_from_context(rados_config_t cct, delete ctx; return -ENOMEM; } + ctx->flags = flags; *pctx = ctx; return 0; diff --git a/src/test/librbd/fsx.cc b/src/test/librbd/fsx.cc index bbdc86e667bb1..4a60149ec720f 100644 --- a/src/test/librbd/fsx.cc +++ b/src/test/librbd/fsx.cc @@ -1954,7 +1954,7 @@ create_image() goto failed_shutdown; } #if defined(WITH_KRBD) - r = krbd_create_from_context(rados_cct(cluster), &krbd); + r = krbd_create_from_context(rados_cct(cluster), 0, &krbd); if (r < 0) { simple_err("Could not create libkrbd handle", r); goto failed_shutdown; diff --git a/src/tools/rbd/action/Kernel.cc b/src/tools/rbd/action/Kernel.cc index cccd7932f4966..dcbca088ff75b 100644 --- a/src/tools/rbd/action/Kernel.cc +++ b/src/tools/rbd/action/Kernel.cc @@ -196,6 +196,8 @@ static int parse_map_options(const std::string &options_string, if (put_map_option_value("compression_hint", value_char, map_option_compression_hint_cb, map_options)) return -EINVAL; + } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) { + put_map_option("udev", this_char, map_options); } else { std::cerr << "rbd: unknown map option '" << this_char << "'" << std::endl; return -EINVAL; @@ -223,6 +225,8 @@ static int parse_unmap_options(const std::string &options_string, if (!strcmp(this_char, "force")) { put_map_option("force", this_char, unmap_options); + } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) { + put_map_option("udev", this_char, unmap_options); } else { std::cerr << "rbd: unknown unmap option '" << this_char << "'" << std::endl; @@ -238,7 +242,7 @@ static int do_kernel_list(Formatter *f) { struct krbd_ctx *krbd; int r; - r = krbd_create_from_context(g_ceph_context, &krbd); + r = krbd_create_from_context(g_ceph_context, 0, &krbd); if (r < 0) return r; @@ -363,20 +367,21 @@ static int do_kernel_map(const char *poolname, const char *nspace_name, #if defined(WITH_KRBD) struct krbd_ctx *krbd; std::ostringstream oss; + uint32_t flags = 0; char *devnode; int r; - r = krbd_create_from_context(g_ceph_context, &krbd); - if (r < 0) - return r; - - for (std::map::iterator it = map_options.begin(); - it != map_options.end(); ) { + for (auto it = map_options.begin(); it != map_options.end(); ) { // for compatibility with < 3.7 kernels, assume that rw is on by // default and omit it even if it was specified by the user // (see ceph.git commit fb0f1986449b) if (it->first == "rw" && it->second == "rw") { it = map_options.erase(it); + } else if (it->first == "udev") { + if (it->second == "noudev") { + flags |= KRBD_CTX_F_NOUDEV; + } + it = map_options.erase(it); } else { if (it != map_options.begin()) oss << ","; @@ -385,6 +390,10 @@ static int do_kernel_map(const char *poolname, const char *nspace_name, } } + r = krbd_create_from_context(g_ceph_context, flags, &krbd); + if (r < 0) + return r; + r = krbd_is_mapped(krbd, poolname, nspace_name, imgname, snapname, &devnode); if (r < 0) { std::cerr << "rbd: warning: can't get image map information: " @@ -421,18 +430,27 @@ static int do_kernel_unmap(const char *dev, const char *poolname, #if defined(WITH_KRBD) struct krbd_ctx *krbd; std::ostringstream oss; + uint32_t flags = 0; int r; - r = krbd_create_from_context(g_ceph_context, &krbd); + for (auto it = unmap_options.begin(); it != unmap_options.end(); ) { + if (it->first == "udev") { + if (it->second == "noudev") { + flags |= KRBD_CTX_F_NOUDEV; + } + it = unmap_options.erase(it); + } else { + if (it != unmap_options.begin()) + oss << ","; + oss << it->second; + ++it; + } + } + + r = krbd_create_from_context(g_ceph_context, flags, &krbd); if (r < 0) return r; - for (auto it = unmap_options.cbegin(); it != unmap_options.cend(); ++it) { - if (it != unmap_options.cbegin()) - oss << ","; - oss << it->second; - } - if (dev) r = krbd_unmap(krbd, dev, oss.str().c_str()); else