From: Ilya Dryomov Date: Mon, 24 Aug 2020 17:01:46 +0000 (+0200) Subject: krbd: optionally skip waiting for udev events X-Git-Tag: v14.2.12~80^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6b5a85102272ef5f73261b41a985c851e3000293;p=ceph.git krbd: optionally skip waiting for udev events Add support for noudev option to allow mapping and unmapping images from a privileged container in a non-initial network namespace (e.g. when using Multus CNI). Fixes: https://tracker.ceph.com/issues/47128 Signed-off-by: Ilya Dryomov (cherry picked from commit 2e657d134a1af489dcf623d207be259ce3dda9bc) Conflicts: doc/man/8/rbd.rst [ crush_location, read_from_replica and compression_hint map options not in nautilus ] src/krbd.cc [ commits 08bf0b628803 ("krbd: do away with explicit memory management") and 1e67e240f4dd ("krbd: misc cleanups") not in nautilus ] src/tools/rbd/action/Kernel.cc [ commits 34f539d8af33 ("rbd: delay parsing of default kernel map options") and da4ffd834fb8 ("rbd: rename some MapOptions instances to unmap_options") not in nautilus ] --- diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst index 45c3315a824..8ae247f77fb 100644 --- a/doc/man/8/rbd.rst +++ b/doc/man/8/rbd.rst @@ -764,12 +764,25 @@ Per mapping (block device) `rbd device map` options: solid-state drives). For filestore with filestore_punch_hole = false, the recommended setting is image object size (typically 4M). +* udev - Wait for udev device manager to finish executing all matching + "add" rules and release the device before exiting (default). This option + is not passed to the kernel. + +* noudev - Don't wait for udev device manager. When enabled, the device may + not be fully usable immediately on exit. + `rbd device unmap` options: * force - Force the unmapping of a block device that is open (since 4.9). The driver will wait for running requests to complete and then unmap; requests sent to the driver after initiating the unmap will be failed. +* udev - Wait for udev device manager to finish executing all matching + "remove" rules and clean up after the device before exiting (default). + This option is not passed to the kernel. + +* noudev - Don't wait for udev device manager. + Examples ======== diff --git a/src/include/krbd.h b/src/include/krbd.h index 2c63bc7ea54..977d45fe2e3 100644 --- a/src/include/krbd.h +++ b/src/include/krbd.h @@ -15,13 +15,46 @@ #include "rados/librados.h" +/* + * Don't wait for udev add uevents in krbd_map() and udev remove + * uevents in krbd_unmap*(). Instead, make do with the respective + * kernel uevents and return as soon as they are received. + * + * systemd-udevd sends out udev uevents after it finishes processing + * the respective kernel uevents, which mostly boils down to executing + * all matching udev rules. With this flag set, on return from + * krbd_map() systemd-udevd may still be poking at the device: it + * may still be open with tools such as blkid and various ioctls to + * be run against it, none of the persistent symlinks to the device + * node may be there, etc. udev used to be responsible for creating + * the device node as well, but that has been handled by devtmpfs in + * the kernel for many years now, so the device node (as returned + * through @pdevnode) is guaranteed to be there. + * + * If set, krbd_map() and krbd_unmap*() can be invoked from any + * network namespace that is owned by the initial user namespace + * (which is a formality because things like loading kernel modules + * and creating block devices are not namespaced and require global + * privileges, i.e. capabilities in the initial user namespace). + * Otherwise, krbd_map() and krbd_unmap*() must be invoked from + * the initial network namespace. + * + * If set, krbd_unmap*() doesn't attempt to settle the udev queue + * before retrying unmap for the last time. Some EBUSY errors due + * to systemd-udevd poking at the device at the time krbd_unmap*() + * is invoked that are otherwise covered by the retry logic may be + * returned. + */ +#define KRBD_CTX_F_NOUDEV (1U << 0) + #ifdef __cplusplus extern "C" { #endif struct krbd_ctx; -int krbd_create_from_context(rados_config_t cct, struct krbd_ctx **pctx); +int krbd_create_from_context(rados_config_t cct, uint32_t flags, + struct krbd_ctx **pctx); void krbd_destroy(struct krbd_ctx *ctx); int krbd_map(struct krbd_ctx *ctx, diff --git a/src/krbd.cc b/src/krbd.cc index b9cfeaf574d..2a1b2c98c0d 100644 --- a/src/krbd.cc +++ b/src/krbd.cc @@ -49,6 +49,7 @@ static const int UDEV_BUF_SIZE = 1 << 20; /* doubled to 2M (SO_RCVBUFFORCE) */ struct krbd_ctx { CephContext *cct; struct udev *udev; + uint32_t flags; /* KRBD_CTX_F_* */ }; static const std::string SNAP_HEAD_NAME("-"); @@ -369,7 +370,39 @@ private: std::string *m_pdevnode; }; -static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf, +static const char *get_event_source(const krbd_ctx *ctx) +{ + if (ctx->flags & KRBD_CTX_F_NOUDEV) { + /* + * For block devices (unlike network interfaces, they don't + * carry any namespace tags), the kernel broadcasts uevents + * into all network namespaces that are owned by the initial + * user namespace. This restriction is new in 4.18: starting + * with 2.6.35 and through 4.17 the kernel broadcast uevents + * into all network namespaces, period. + * + * However, when invoked from a non-initial user namespace, + * udev_monitor_receive_device() has always ignored both kernel + * and udev uevents by virtue of requiring SCM_CREDENTIALS and + * checking that ucred->uid == 0. When UIDs and GIDs are sent to + * a process in a user namespace, they are translated according + * to that process's UID and GID mappings and, unless root in the + * user namespace is mapped to the global root, that check fails. + * Normally they show up as 65534(nobody) because the global root + * is not mapped. + */ + return "kernel"; + } + + /* + * Like most netlink messages, udev uevents don't cross network + * namespace boundaries and are therefore confined to the initial + * network namespace. + */ + return "udev"; +} + +static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf, string *pname) { struct udev_monitor *mon; @@ -378,7 +411,7 @@ static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf, int fds[2]; int r; - mon = udev_monitor_new_from_netlink(udev, "udev"); + mon = udev_monitor_new_from_netlink(ctx->udev, get_event_source(ctx)); if (!mon) return -ENOMEM; @@ -464,7 +497,7 @@ static int map_image(struct krbd_ctx *ctx, const krbd_spec& spec, } } - return do_map(ctx->udev, spec, buf, pname); + return do_map(ctx, spec, buf, pname); } static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid) @@ -703,7 +736,7 @@ private: dev_t m_devno; }; -static int do_unmap(struct udev *udev, dev_t devno, const string& buf) +static int do_unmap(krbd_ctx *ctx, dev_t devno, const string& buf) { struct udev_monitor *mon; std::thread unmapper; @@ -711,7 +744,7 @@ static int do_unmap(struct udev *udev, dev_t devno, const string& buf) int fds[2]; int r; - mon = udev_monitor_new_from_netlink(udev, "udev"); + mon = udev_monitor_new_from_netlink(ctx->udev, get_event_source(ctx)); if (!mon) return -ENOMEM; @@ -735,7 +768,8 @@ static int do_unmap(struct udev *udev, dev_t devno, const string& buf) goto out_mon; } - unmapper = make_named_thread("unmapper", [&buf, sysfs_r_fd = fds[1]]() { + unmapper = make_named_thread( + "unmapper", [&buf, sysfs_r_fd = fds[1], flags = ctx->flags]() { /* * On final device close(), kernel sends a block change event, in * response to which udev apparently runs blkid on the device. This @@ -747,7 +781,7 @@ static int do_unmap(struct udev *udev, dev_t devno, const string& buf) if (sysfs_r == -EBUSY && tries < 2) { if (!tries) { usleep(250 * 1000); - } else { + } else if (!(flags & KRBD_CTX_F_NOUDEV)) { /* * libudev does not provide the "wait until the queue is empty" * API or the sufficient amount of primitives to build it from. @@ -832,7 +866,7 @@ static int unmap_image(struct krbd_ctx *ctx, const char *devnode, } } - return do_unmap(ctx->udev, wholedevno, build_unmap_buf(id, options)); + return do_unmap(ctx, wholedevno, build_unmap_buf(id, options)); } static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec, @@ -863,7 +897,7 @@ static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec, } } - return do_unmap(ctx->udev, devno, build_unmap_buf(id, options)); + return do_unmap(ctx, devno, build_unmap_buf(id, options)); } static bool dump_one_image(Formatter *f, TextTable *tbl, @@ -997,7 +1031,7 @@ out_enm: return r; } -extern "C" int krbd_create_from_context(rados_config_t cct, +extern "C" int krbd_create_from_context(rados_config_t cct, uint32_t flags, struct krbd_ctx **pctx) { struct krbd_ctx *ctx = new struct krbd_ctx(); @@ -1008,6 +1042,7 @@ extern "C" int krbd_create_from_context(rados_config_t cct, delete ctx; return -ENOMEM; } + ctx->flags = flags; *pctx = ctx; return 0; diff --git a/src/test/librbd/fsx.cc b/src/test/librbd/fsx.cc index 56f8e6c60ca..15a9f0c995b 100644 --- a/src/test/librbd/fsx.cc +++ b/src/test/librbd/fsx.cc @@ -1953,7 +1953,7 @@ create_image() goto failed_shutdown; } #if defined(WITH_KRBD) - r = krbd_create_from_context(rados_cct(cluster), &krbd); + r = krbd_create_from_context(rados_cct(cluster), 0, &krbd); if (r < 0) { simple_err("Could not create libkrbd handle", r); goto failed_shutdown; diff --git a/src/tools/rbd/action/Kernel.cc b/src/tools/rbd/action/Kernel.cc index 3ebb5bc1de4..bac0bda10f1 100644 --- a/src/tools/rbd/action/Kernel.cc +++ b/src/tools/rbd/action/Kernel.cc @@ -153,6 +153,8 @@ static int parse_map_options(const std::string &options_string) } else if (!strcmp(this_char, "alloc_size")) { if (put_map_option_value("alloc_size", value_char, map_option_int_cb)) return -EINVAL; + } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) { + put_map_option("udev", this_char); } else { std::cerr << "rbd: unknown map option '" << this_char << "'" << std::endl; return -EINVAL; @@ -179,6 +181,8 @@ static int parse_unmap_options(const std::string &options_string) if (!strcmp(this_char, "force")) { put_map_option("force", this_char); + } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) { + put_map_option("udev", this_char); } else { std::cerr << "rbd: unknown unmap option '" << this_char << "'" << std::endl; return -EINVAL; @@ -193,7 +197,7 @@ static int do_kernel_list(Formatter *f) { struct krbd_ctx *krbd; int r; - r = krbd_create_from_context(g_ceph_context, &krbd); + r = krbd_create_from_context(g_ceph_context, 0, &krbd); if (r < 0) return r; @@ -317,20 +321,21 @@ static int do_kernel_map(const char *poolname, const char *nspace_name, #if defined(WITH_KRBD) struct krbd_ctx *krbd; std::ostringstream oss; + uint32_t flags = 0; char *devnode; int r; - r = krbd_create_from_context(g_ceph_context, &krbd); - if (r < 0) - return r; - - for (std::map::iterator it = map_options.begin(); - it != map_options.end(); ) { + for (auto it = map_options.begin(); it != map_options.end(); ) { // for compatibility with < 3.7 kernels, assume that rw is on by // default and omit it even if it was specified by the user // (see ceph.git commit fb0f1986449b) if (it->first == "rw" && it->second == "rw") { it = map_options.erase(it); + } else if (it->first == "udev") { + if (it->second == "noudev") { + flags |= KRBD_CTX_F_NOUDEV; + } + it = map_options.erase(it); } else { if (it != map_options.begin()) oss << ","; @@ -339,6 +344,10 @@ static int do_kernel_map(const char *poolname, const char *nspace_name, } } + r = krbd_create_from_context(g_ceph_context, flags, &krbd); + if (r < 0) + return r; + r = krbd_is_mapped(krbd, poolname, nspace_name, imgname, snapname, &devnode); if (r < 0) { std::cerr << "rbd: warning: can't get image map information: " @@ -375,18 +384,27 @@ static int do_kernel_unmap(const char *dev, const char *poolname, #if defined(WITH_KRBD) struct krbd_ctx *krbd; std::ostringstream oss; + uint32_t flags = 0; int r; - r = krbd_create_from_context(g_ceph_context, &krbd); + for (auto it = map_options.begin(); it != map_options.end(); ) { + if (it->first == "udev") { + if (it->second == "noudev") { + flags |= KRBD_CTX_F_NOUDEV; + } + it = map_options.erase(it); + } else { + if (it != map_options.begin()) + oss << ","; + oss << it->second; + ++it; + } + } + + r = krbd_create_from_context(g_ceph_context, flags, &krbd); if (r < 0) return r; - for (auto it = map_options.cbegin(); it != map_options.cend(); ++it) { - if (it != map_options.cbegin()) - oss << ","; - oss << it->second; - } - if (dev) r = krbd_unmap(krbd, dev, oss.str().c_str()); else