From f6854ac65d2a838e2f523979e341136e5f201b5c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 Feb 2021 16:47:17 +0100 Subject: [PATCH] krbd: make sure the device node is accessible after the mapping We have always assumed this to be the case and users' scripts and orchestration tools have grown to depend on this. Let's add some enforcement, prompted by [1]: "I am running my Kubernetes worker node inside of an LXC container which doesn't benefit from the device node created by the kernel, so I'm using udev to create the /dev/rbd* device nodes inside of the LXC container." which, through the unfortunate interaction with ceph-csi rbd plugin, results in data loss for "volumeMode: Filesystem" PVs because it ends up recreating the filesystem every time the PV is attached to the pod: "When deleting the pod and re-creating it, I can see that the RBD image is indeed being reformatted. This seems to be because when blkid is being run to check if the image is formatted, the /dev/rbd* device has not yet been created by udev. By the time the code gets down to running mkfs, the device is there and the damage is done." [1] https://github.com/ceph/ceph-csi/issues/1820 Fixes: https://tracker.ceph.com/issues/49410 Signed-off-by: Ilya Dryomov --- src/krbd.cc | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/src/krbd.cc b/src/krbd.cc index 5e011d7a9a0..88a50b38d54 100644 --- a/src/krbd.cc +++ b/src/krbd.cc @@ -340,8 +340,9 @@ static std::pair wait_for_mapping(int sysfs_r_fd, udev_monitor *mon, class UdevMapHandler { public: - UdevMapHandler(const krbd_spec *spec, std::string *pdevnode) : - m_spec(spec), m_pdevnode(pdevnode) {} + UdevMapHandler(const krbd_spec *spec, std::string *pdevnode, + std::string *majnum, std::string *minnum) : + m_spec(spec), m_pdevnode(pdevnode), m_majnum(majnum), m_minnum(minnum) {} /* * Catch /sys/devices/rbd// and wait for the corresponding @@ -374,13 +375,14 @@ public: if (m_bus_dev && !m_block_devs.empty()) { for (const auto& p : m_block_devs) { if (udev_device_get_devnode(p.get()) == m_devnode) { - ceph_assert(!strcmp( - udev_device_get_sysattr_value(m_bus_dev.get(), "major"), - udev_device_get_property_value(p.get(), "MAJOR"))); - ceph_assert(!have_minor_attr() || !strcmp( - udev_device_get_sysattr_value(m_bus_dev.get(), "minor"), - udev_device_get_property_value(p.get(), "MINOR"))); *m_pdevnode = std::move(m_devnode); + *m_majnum = udev_device_get_property_value(p.get(), "MAJOR"); + *m_minnum = udev_device_get_property_value(p.get(), "MINOR"); + ceph_assert(*m_majnum == udev_device_get_sysattr_value( + m_bus_dev.get(), "major")); + ceph_assert(!have_minor_attr() || + *m_minnum == udev_device_get_sysattr_value( + m_bus_dev.get(), "minor")); return true; } } @@ -395,6 +397,8 @@ private: std::string m_devnode; const krbd_spec *m_spec; std::string *m_pdevnode; + std::string *m_majnum; + std::string *m_minnum; }; static const char *get_event_source(const krbd_ctx *ctx) @@ -432,6 +436,8 @@ static const char *get_event_source(const krbd_ctx *ctx) static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf, string *pname) { + std::string majnum, minnum; + struct stat sb; bool mapped; int fds[2]; int r; @@ -474,7 +480,8 @@ static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf, }); std::tie(r, mapped) = wait_for_mapping(fds[0], mon.get(), - UdevMapHandler(&spec, pname)); + UdevMapHandler(&spec, pname, &majnum, + &minnum)); if (r < 0) { if (!mapped) { std::cerr << "rbd: sysfs write failed" << std::endl; @@ -487,6 +494,30 @@ static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf, mapper.join(); close(fds[0]); close(fds[1]); + + /* + * Make sure our device node is there. This is intended to help + * diagnose environments where "rbd map" is run from a container with + * a private /dev and some external mechanism (e.g. udev) is used to + * add the device to the container asynchronously, possibly seconds + * after "rbd map" successfully exits. These setups are very fragile + * and in some cases can even lead to data loss, depending on higher + * level logic and orchestration layers involved. + */ + if (stat(pname->c_str(), &sb) < 0 || !S_ISBLK(sb.st_mode)) { + std::cerr << "rbd: mapping succeeded but " << *pname + << " is not accessible, is host /dev mounted?" << std::endl; + return -EINVAL; + } + if (stringify(major(sb.st_rdev)) != majnum || + stringify(minor(sb.st_rdev)) != minnum) { + std::cerr << "rbd: mapping succeeded but " << *pname + << " (" << major(sb.st_rdev) << ":" << minor(sb.st_rdev) + << ") does not match expected " << majnum << ":" << minnum + << std::endl; + return -EINVAL; + } + return r; } -- 2.39.5