From 5dc01efd97109fcb7fc3ded5ffbe502c8e7439b2 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Tue, 29 Nov 2022 15:38:08 +0000 Subject: [PATCH] blk/kernel: Add O_EXCL for block devices Change behaviour when the target file is a block device ("mknod name b major minor"). Append the O_EXCL flag for the first open of the block device. The problem is that if 2 different files for the same block device are created, it is possible to ::flock each of them in 2 separate processes. In some container cases, when we recreate the bluestore osd dir with the ceph-bluestore-tool prime-osd command, we can end up with completely different files. Opening with O_EXCL is immune to that. Signed-off-by: Adam Kupczyk --- src/blk/kernel/KernelDevice.cc | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc index d9c1e529c071c..e15b727e0a640 100644 --- a/src/blk/kernel/KernelDevice.cc +++ b/src/blk/kernel/KernelDevice.cc @@ -133,8 +133,25 @@ int KernelDevice::open(const string& p) int r = 0, i = 0; dout(1) << __func__ << " path " << path << dendl; + struct stat statbuf; + bool is_block; + if (::stat(path.c_str(), &statbuf) != 0) { + r = -errno; // stat() only returns -1 on failure; the actual cause is in errno + derr << __func__ << " stat got: " << cpp_strerror(r) << dendl; + goto out_fail; + } + is_block = (statbuf.st_mode & S_IFMT) == S_IFBLK; for (i = 0; i < WRITE_LIFE_MAX; i++) { - int fd = ::open(path.c_str(), O_RDWR | O_DIRECT); + int flags = 0; + if (lock_exclusive && is_block && (i == 0)) { + // If opening block device use O_EXCL flag. It gives us best protection, + // as no other process can overwrite the data for as long as we are running. + // For block devices ::flock is not enough, + // since 2 different inodes with same major/minor can be locked. + // Exclusion by O_EXCL works in containers too. 
+ flags |= O_EXCL; + } + int fd = ::open(path.c_str(), O_RDWR | O_DIRECT | flags); if (fd < 0) { r = -errno; break; @@ -187,6 +204,10 @@ int KernelDevice::open(const string& p) } if (lock_exclusive) { + // We need to keep soft locking (via flock()) because O_EXCL does not work for regular files. + // This is as good as we can get. Other processes can still overwrite the data, + // but at least we are protected from mounting same device twice in ceph processes. + // We also apply soft locking for block devices, as it populates /proc/locks. (see lslocks) r = _lock(); if (r < 0) { derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) -- 2.39.5