From 9ae9e641580198c0ec520a0fa4364bf5b75a0734 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 1 Dec 2016 14:36:37 -0500 Subject: [PATCH] os/bluestore/BlueFS: pre-extend file size for WAL (.log) files When rocksdb has log recycling on (this is required!), it will do robust checksums on log records and change playback behavior to tolerate trailing garbage in the log file. This normally allows it to overwrite previous log files, but it can also let us overwrite arbitrary garbage on the device. If we allocate some new space for a .log file (already indicated by the WRITER_WAL hint), extend the size immediately so that each subsequent append doesn't have to (unless/until we do another allocation). This is safe as long as rocksdb recycling is enabled (which it is by default). This is faster because we don't have to flush the bluefs log on every log append during the period after startup before rocksdb starts recycling log files. Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/os/bluestore/BlueFS.cc | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index b9b7b57c1beac..5695823ef315c 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -982,6 +982,7 @@ OPTION(bluefs_min_flush_size, OPT_U64, 65536) // ignore flush until its this bi OPTION(bluefs_compact_log_sync, OPT_BOOL, false) // sync or async log compaction? 
OPTION(bluefs_buffered_io, OPT_BOOL, false) OPTION(bluefs_allocator, OPT_STR, "bitmap") // stupid | bitmap +OPTION(bluefs_preextend_wal_files, OPT_BOOL, true) // this *requires* that rocksdb has recycling enabled OPTION(bluestore_bluefs, OPT_BOOL, true) OPTION(bluestore_bluefs_env_mirror, OPT_BOOL, false) // mirror to normal Env for debug diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 130fb16bfe6f7..3706d2d8e2542 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1444,6 +1444,17 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) << dendl; return r; } + if (g_conf->bluefs_preextend_wal_files && + h->writer_type == WRITER_WAL) { + // NOTE: this *requires* that rocksdb also has log recycling + // enabled and is therefore doing robust CRCs on the log + // records. otherwise, we will fail to replay the rocksdb log + // properly due to garbage on the device. + h->file->fnode.size = h->file->fnode.get_allocated(); + dout(10) << __func__ << " extending WAL size to 0x" << std::hex + << h->file->fnode.size << std::dec << " to include allocated" + << dendl; + } must_dirty = true; } if (h->file->fnode.size < offset + length) { -- 2.39.5