backend that the data is incompressible, disabling compression in aggressive
mode (since 5.8).
+* udev - Wait for udev device manager to finish executing all matching
+ "add" rules and release the device before exiting (default). This option
+ is not passed to the kernel.
+
+* noudev - Don't wait for udev device manager. When this option is
+ specified, the device may not be fully usable immediately after the
+ command exits.
+
`rbd device unmap` options:
* force - Force the unmapping of a block device that is open (since 4.9). The
driver will wait for running requests to complete and then unmap; requests
sent to the driver after initiating the unmap will be failed.
+* udev - Wait for udev device manager to finish executing all matching
+ "remove" rules and clean up after the device before exiting (default).
+ This option is not passed to the kernel.
+
+* noudev - Don't wait for udev device manager.
+
Examples
========
#include "rados/librados.h"
+/*
+ * Don't wait for udev add uevents in krbd_map() and udev remove
+ * uevents in krbd_unmap*(). Instead, make do with the respective
+ * kernel uevents and return as soon as they are received.
+ *
+ * systemd-udevd sends out udev uevents after it finishes processing
+ * the respective kernel uevents, which mostly boils down to executing
+ * all matching udev rules. With this flag set, systemd-udevd may
+ * still be poking at the device on return from krbd_map(): the
+ * device may still be held open by tools such as blkid, various
+ * ioctls may still be run against it, the persistent symlinks to
+ * the device node may not be there yet, etc. udev used to be
+ * responsible for creating the device node as well, but that has
+ * been handled by devtmpfs in the kernel for many years now, so the
+ * device node (as returned through @pdevnode) is guaranteed to be
+ * there.
+ *
+ * If set, krbd_map() and krbd_unmap*() can be invoked from any
+ * network namespace that is owned by the initial user namespace
+ * (which is a formality because things like loading kernel modules
+ * and creating block devices are not namespaced and require global
+ * privileges, i.e. capabilities in the initial user namespace).
+ * Otherwise, krbd_map() and krbd_unmap*() must be invoked from
+ * the initial network namespace.
+ *
+ * If set, krbd_unmap*() doesn't attempt to settle the udev queue
+ * before retrying unmap for the last time. As a result, some EBUSY
+ * errors caused by systemd-udevd poking at the device at the time
+ * krbd_unmap*() is invoked, which would otherwise be covered by the
+ * retry logic, may be returned.
+ */
+#define KRBD_CTX_F_NOUDEV (1U << 0)
+
#ifdef __cplusplus
extern "C" {
#endif
struct krbd_ctx;
-int krbd_create_from_context(rados_config_t cct, struct krbd_ctx **pctx);
+int krbd_create_from_context(rados_config_t cct, uint32_t flags,
+ struct krbd_ctx **pctx);
void krbd_destroy(struct krbd_ctx *ctx);
int krbd_map(struct krbd_ctx *ctx,
struct krbd_ctx {
CephContext *cct;
struct udev *udev; /* used to create the uevent monitors in do_map() and do_unmap() */
+ uint32_t flags; /* KRBD_CTX_F_*, as passed to krbd_create_from_context() */
};
struct krbd_spec {
std::string *m_pdevnode;
};
-static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf,
+static const char *get_event_source(const krbd_ctx *ctx)
+{
+ if (ctx->flags & KRBD_CTX_F_NOUDEV) {
+ /*
+ * KRBD_CTX_F_NOUDEV is set: name the "kernel" event source, so
+ * that the monitors created in do_map() and do_unmap() receive
+ * kernel uevents rather than udev uevents.
+ *
+ * For block devices (unlike network interfaces, they don't
+ * carry any namespace tags), the kernel broadcasts uevents
+ * into all network namespaces that are owned by the initial
+ * user namespace. This restriction is new in 4.18: starting
+ * with 2.6.35 and through 4.17 the kernel broadcast uevents
+ * into all network namespaces, period.
+ *
+ * However, when invoked from a non-initial user namespace,
+ * udev_monitor_receive_device() has always ignored both kernel
+ * and udev uevents by virtue of requiring SCM_CREDENTIALS and
+ * checking that ucred->uid == 0. When UIDs and GIDs are sent to
+ * a process in a user namespace, they are translated according
+ * to that process's UID and GID mappings and, unless root in the
+ * user namespace is mapped to the global root, that check fails.
+ * Normally they show up as 65534(nobody) because the global root
+ * is not mapped.
+ */
+ return "kernel";
+ }
+
+ /*
+ * Default (KRBD_CTX_F_NOUDEV is not set): name the "udev" event
+ * source, so that the monitors created in do_map() and do_unmap()
+ * receive udev uevents.
+ *
+ * Like most netlink messages, udev uevents don't cross network
+ * namespace boundaries and are therefore confined to the initial
+ * network namespace.
+ */
+ return "udev";
+}
+
+static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf,
string *pname)
{
bool mapped;
int fds[2];
int r;
- udev_monitor_uptr mon(udev_monitor_new_from_netlink(udev, "udev"));
+ udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
+ get_event_source(ctx)));
if (!mon)
return -ENOMEM;
if (r < 0)
return r;
- return do_map(ctx->udev, spec, buf, pname);
+ return do_map(ctx, spec, buf, pname);
}
static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
dev_t m_devno;
};
-static int do_unmap(struct udev *udev, dev_t devno, const string& buf)
+static int do_unmap(krbd_ctx *ctx, dev_t devno, const string& buf)
{
bool unmapped;
int fds[2];
int r;
- udev_monitor_uptr mon(udev_monitor_new_from_netlink(udev, "udev"));
+ udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
+ get_event_source(ctx)));
if (!mon)
return -ENOMEM;
if (pipe2(fds, O_NONBLOCK) < 0)
return -errno;
- auto unmapper = make_named_thread("unmapper", [&buf, sysfs_r_fd = fds[1]]() {
+ auto unmapper = make_named_thread(
+ "unmapper", [&buf, sysfs_r_fd = fds[1], flags = ctx->flags]() {
/*
* On final device close(), kernel sends a block change event, in
* response to which udev apparently runs blkid on the device. This
if (sysfs_r == -EBUSY && tries < 2) {
if (!tries) {
usleep(250 * 1000);
- } else {
+ } else if (!(flags & KRBD_CTX_F_NOUDEV)) {
/*
* libudev does not provide the "wait until the queue is empty"
* API or the sufficient amount of primitives to build it from.
}
append_unmap_options(&buf, options);
- return do_unmap(ctx->udev, wholedevno, buf);
+ return do_unmap(ctx, wholedevno, buf);
}
static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec,
}
append_unmap_options(&buf, options);
- return do_unmap(ctx->udev, devno, buf);
+ return do_unmap(ctx, devno, buf);
}
static bool dump_one_image(Formatter *f, TextTable *tbl,
return 0; /* not mapped */
}
-extern "C" int krbd_create_from_context(rados_config_t cct,
+extern "C" int krbd_create_from_context(rados_config_t cct, uint32_t flags,
struct krbd_ctx **pctx)
{
struct krbd_ctx *ctx = new struct krbd_ctx();
delete ctx;
return -ENOMEM;
}
+ ctx->flags = flags;
*pctx = ctx;
return 0;
goto failed_shutdown;
}
#if defined(WITH_KRBD)
- r = krbd_create_from_context(rados_cct(cluster), &krbd);
+ r = krbd_create_from_context(rados_cct(cluster), 0, &krbd);
if (r < 0) {
simple_err("Could not create libkrbd handle", r);
goto failed_shutdown;
if (put_map_option_value("compression_hint", value_char,
map_option_compression_hint_cb, map_options))
return -EINVAL;
+ } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) {
+ put_map_option("udev", this_char, map_options);
} else {
std::cerr << "rbd: unknown map option '" << this_char << "'" << std::endl;
return -EINVAL;
if (!strcmp(this_char, "force")) {
put_map_option("force", this_char, unmap_options);
+ } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) {
+ put_map_option("udev", this_char, unmap_options);
} else {
std::cerr << "rbd: unknown unmap option '" << this_char << "'"
<< std::endl;
struct krbd_ctx *krbd;
int r;
- r = krbd_create_from_context(g_ceph_context, &krbd);
+ r = krbd_create_from_context(g_ceph_context, 0, &krbd);
if (r < 0)
return r;
#if defined(WITH_KRBD)
struct krbd_ctx *krbd;
std::ostringstream oss;
+ uint32_t flags = 0;
char *devnode;
int r;
- r = krbd_create_from_context(g_ceph_context, &krbd);
- if (r < 0)
- return r;
-
- for (std::map<std::string, std::string>::iterator it = map_options.begin();
- it != map_options.end(); ) {
+ for (auto it = map_options.begin(); it != map_options.end(); ) {
// for compatibility with < 3.7 kernels, assume that rw is on by
// default and omit it even if it was specified by the user
// (see ceph.git commit fb0f1986449b)
if (it->first == "rw" && it->second == "rw") {
it = map_options.erase(it);
+ } else if (it->first == "udev") {
+ if (it->second == "noudev") {
+ flags |= KRBD_CTX_F_NOUDEV;
+ }
+ it = map_options.erase(it);
} else {
if (it != map_options.begin())
oss << ",";
}
}
+ r = krbd_create_from_context(g_ceph_context, flags, &krbd);
+ if (r < 0)
+ return r;
+
r = krbd_is_mapped(krbd, poolname, nspace_name, imgname, snapname, &devnode);
if (r < 0) {
std::cerr << "rbd: warning: can't get image map information: "
#if defined(WITH_KRBD)
struct krbd_ctx *krbd;
std::ostringstream oss;
+ uint32_t flags = 0;
int r;
- r = krbd_create_from_context(g_ceph_context, &krbd);
+ for (auto it = unmap_options.begin(); it != unmap_options.end(); ) {
+ if (it->first == "udev") {
+ if (it->second == "noudev") {
+ flags |= KRBD_CTX_F_NOUDEV;
+ }
+ it = unmap_options.erase(it);
+ } else {
+ if (it != unmap_options.begin())
+ oss << ",";
+ oss << it->second;
+ ++it;
+ }
+ }
+
+ r = krbd_create_from_context(g_ceph_context, flags, &krbd);
if (r < 0)
return r;
- for (auto it = unmap_options.cbegin(); it != unmap_options.cend(); ++it) {
- if (it != unmap_options.cbegin())
- oss << ",";
- oss << it->second;
- }
-
if (dev)
r = krbd_unmap(krbd, dev, oss.str().c_str());
else