From 02d907dde78cddb0889f65f4d76793fd1c407315 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Mon, 7 Oct 2024 11:45:41 +0000 Subject: [PATCH] os/bluestore: Make truncate() drop unused allocations Now when truncate() drops unused allocations. Modified Close() in BlueRocksEnv to unconditionally call truncate. Fixes: https://tracker.ceph.com/issues/68488 Signed-off-by: Adam Kupczyk (cherry picked from commit 9fc65f160cd3764a68fb3697d067c358761fc837) Conflicts: src/os/bluestore/BlueFS.cc, trivial --- src/os/bluestore/BlueFS.cc | 65 +++++++++++++++++++++++++------- src/os/bluestore/BlueRocksEnv.cc | 14 ++----- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 60d07bfb51ca0..cb81a99308b04 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -3660,15 +3660,16 @@ uint64_t BlueFS::_flush_special(FileWriter *h) int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ { std::lock_guard hl(h->lock); + auto& fnode = h->file->fnode; dout(10) << __func__ << " 0x" << std::hex << offset << std::dec - << " file " << h->file->fnode << dendl; + << " file " << fnode << dendl; if (h->file->deleted) { dout(10) << __func__ << " deleted, no-op" << dendl; return 0; } // we never truncate internal log files - ceph_assert(h->file->fnode.ino > 1); + ceph_assert(fnode.ino > 1); // truncate off unflushed data? if (h->pos < offset && @@ -3682,20 +3683,58 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ if (r < 0) return r; } - if (offset == h->file->fnode.size) { - return 0; // no-op! - } - if (offset > h->file->fnode.size) { + if (offset > fnode.size) { ceph_abort_msg("truncate up not supported"); } - ceph_assert(h->file->fnode.size >= offset); + ceph_assert(offset <= fnode.size); _flush_bdev(h); - - std::lock_guard ll(log.lock); - vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset); - h->file->fnode.size = offset; - h->file->is_dirty = true; - log.t.op_file_update_inc(h->file->fnode); + { + std::lock_guard ll(log.lock); + std::lock_guard dl(dirty.lock); + bool changed_extents = false; + vselector->sub_usage(h->file->vselector_hint, fnode); + uint64_t x_off = 0; + auto p = fnode.seek(offset, &x_off); + uint64_t cut_off = + (p == fnode.extents.end()) ? 0 : p2roundup(x_off, alloc_size[p->bdev]); + uint64_t new_allocated; + if (0 == cut_off) { + // whole pextent to remove + changed_extents = true; + new_allocated = offset; + } else if (cut_off < p->length) { + dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off); + new_allocated = (offset - x_off) + cut_off; + p->length = cut_off; + changed_extents = true; + ++p; + } else { + ceph_assert(cut_off >= p->length); + new_allocated = (offset - x_off) + p->length; + // just leave it here + ++p; + } + while (p != fnode.extents.end()) { + dirty.pending_release[p->bdev].insert(p->offset, p->length); + p = fnode.extents.erase(p); + changed_extents = true; + } + if (changed_extents) { + fnode.size = offset; + fnode.allocated = new_allocated; + fnode.reset_delta(); + log.t.op_file_update(fnode); + // sad, but is_dirty must be set to signal flushing of the log + h->file->is_dirty = true; + } else { + if (offset != fnode.size) { + fnode.size = offset; + //skipping log.t.op_file_update_inc, it will be done by flush() + h->file->is_dirty = true; + } + } + vselector->add_usage(h->file->vselector_hint, fnode); + } return 0; } diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc index 68040af428280..7cbe0a1d12146 100644 --- a/src/os/bluestore/BlueRocksEnv.cc +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -221,18 +221,12 @@ class BlueRocksWritableFile : public rocksdb::WritableFile { } rocksdb::Status Close() override { - fs->fsync(h); - // mimic posix env, here. shrug. - size_t block_size; - size_t last_allocated_block; - GetPreallocationStatus(&block_size, &last_allocated_block); - if (last_allocated_block > 0) { - int r = fs->truncate(h, h->pos); - if (r < 0) - return err_to_status(r); + int r = fs->truncate(h, h->pos); + if (r < 0) { + return err_to_status(r); } - + fs->fsync(h); return rocksdb::Status::OK(); } -- 2.39.5