From: Adam Kupczyk <akupczyk@redhat.com> Date: Wed, 2 Feb 2022 19:28:14 +0000 (+0100) Subject: os/bluestore/bluefs: Make volume selector operations atomic X-Git-Tag: v16.2.15~163^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=25cc264547e1e427c0924e854868a8461a23b531;p=ceph.git os/bluestore/bluefs: Make volume selector operations atomic Make all RocksDBBlueFSVolumeSelector files/extents/size tracking atomic. It used to be synchronized by BlueFS global lock. Now, in Fine Grain Locking era, it is necessary to prevent corruption. Fixes: https://tracker.ceph.com/issues/53906 Signed-off-by: Adam Kupczyk <akupczyk@redhat.com> (cherry picked from commit 372bda350966624d5081635e659f7c46947980c2) --- diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 023a6b9d52d58..e02c385c3d7df 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -3758,11 +3758,11 @@ class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector }; // add +1 row for corresponding per-device totals // add +1 column for per-level actual (taken from file size) total - typedef matrix_2d<uint64_t, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t; + typedef matrix_2d<std::atomic<uint64_t>, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t; per_level_per_dev_usage_t per_level_per_dev_usage; // file count per level, add +1 to keep total file count - uint64_t per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 }; + std::atomic<uint64_t> per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 }; // Note: maximum per-device totals below might be smaller than corresponding // perf counters by up to a single alloc unit (1M) due to superblock extent. 
@@ -3835,27 +3835,27 @@ public: for (auto& p : fnode.extents) { auto& cur = per_level_per_dev_usage.at(p.bdev, pos); auto& max = per_level_per_dev_max.at(p.bdev, pos); - cur += p.length; - if (cur > max) { - max = cur; + uint64_t v = cur.fetch_add(p.length) + p.length; + while (v > max) { + max.exchange(v); } { //update per-device totals auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); - cur += p.length; - if (cur > max) { - max = cur; - } + uint64_t v = cur.fetch_add(p.length) + p.length; + while (v > max) { + max.exchange(v); + } } } { //update per-level actual totals auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos); - cur += fnode.size; - if (cur > max) { - max = cur; + uint64_t v = cur.fetch_add(fnode.size) + fnode.size; + while (v > max) { + max.exchange(v); } } ++per_level_files[pos]; @@ -3884,26 +3884,26 @@ public: ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0); --per_level_files[LEVEL_MAX - LEVEL_FIRST]; } - void add_usage(void* hint, uint64_t fsize) override { + void add_usage(void* hint, uint64_t size_more) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; //update per-level actual totals auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos); - cur += fsize; - if (cur > max) { - max = cur; + uint64_t v = cur.fetch_add(size_more) + size_more; + while (v > max) { + max.exchange(v); } } - void sub_usage(void* hint, uint64_t fsize) override { + void sub_usage(void* hint, uint64_t size_less) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; //update per-level actual totals auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); - ceph_assert(cur >= fsize); - per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos) -= fsize; + ceph_assert(cur >= size_less); + cur -= size_less; } uint8_t select_prefer_bdev(void* h) override;