From 2eb096a5be8e00abfa0c104c07339191d9df3737 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Wed, 6 May 2015 12:04:37 +0000 Subject: [PATCH] FileJournal: Remove CEPH_PAGE_SIZE assumptions Ceph currently assumes a FileJournal block_size equal to a multiple of CEPH_PAGE_SIZE. For x86, this will always be 4KB, which matches the sector size of Advanced Format drives and works quite well. Other architectures, such as ARM and PowerPC, can have a much larger CEPH_PAGE_SIZE == PAGE_SIZE (64KB). Unfortunately, a block_size of 64KB can lead to a significant increase in the amount of disk activity required by the journal over one with a block_size of 4KB (especially when carrying out a lot of 4KB writes). This patch removes the assumption from the FileJournal that the block_size should always be at least CEPH_PAGE_SIZE, and replaces it with the assumption that it should be at least CEPH_MINIMUM_BLOCK_SIZE == 4KB. For PAGE_SIZE values of 4KB (for x86, say), this patch shouldn't lead to any change in behaviour. For a 64KB PAGE_SIZE where the FileJournal is hosted on a block device directly, the block_size will go down from 64KB to 4KB with this patch. Where the FileJournal is hosted on an ext4 filesystem, we will see a similar reduction in block_size. If the FileJournal is hosted on an XFS or btrfs filesystem, then we won't see any reduction in the block_size (because both XFS and btrfs return the PAGE_CACHE_SIZE as the minimum I/O block size in fstat). 
Signed-off-by: Steve Capper --- src/os/FileJournal.cc | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc index c6bb6f2c0755d..3ae6bd08cfd99 100644 --- a/src/os/FileJournal.cc +++ b/src/os/FileJournal.cc @@ -40,6 +40,7 @@ #define dout_prefix *_dout << "journal " const static int64_t ONE_MEG(1 << 20); +const static int CEPH_MINIMUM_BLOCK_SIZE(4096); int FileJournal::_open(bool forwrite, bool create) { @@ -154,8 +155,7 @@ int FileJournal::_open_block_device() << dendl; max_size = bdev_sz; - /* block devices have to write in blocks of CEPH_PAGE_SIZE */ - block_size = CEPH_PAGE_SIZE; + block_size = CEPH_MINIMUM_BLOCK_SIZE; if (g_conf->journal_discard) { discard = block_device_support_discard(fn.c_str()); @@ -295,7 +295,7 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, else { max_size = oldsize; } - block_size = MAX(blksize, (blksize_t)CEPH_PAGE_SIZE); + block_size = MAX(blksize, (blksize_t)CEPH_MINIMUM_BLOCK_SIZE); if (create && g_conf->journal_zero_on_create) { derr << "FileJournal::_open_file : zeroing journal" << dendl; @@ -505,9 +505,9 @@ int FileJournal::open(uint64_t fs_op_seq) << block_size << " (required for direct_io journal mode)" << dendl; return -EINVAL; } - if ((header.alignment % CEPH_PAGE_SIZE) && directio) { - dout(0) << "open journal alignment " << header.alignment << " is not multiple of page size " << CEPH_PAGE_SIZE - << " (required for direct_io journal mode)" << dendl; + if ((header.alignment % CEPH_MINIMUM_BLOCK_SIZE) && directio) { + dout(0) << "open journal alignment " << header.alignment << " is not multiple of minimum block size " + << CEPH_MINIMUM_BLOCK_SIZE << " (required for direct_io journal mode)" << dendl; return -EINVAL; } @@ -952,15 +952,15 @@ int FileJournal::prepare_single_write(bufferlist& bl, off64_t& queue_pos, uint64 void FileJournal::align_bl(off64_t pos, bufferlist& bl) { // make sure list segments are page aligned - if 
(directio && (!bl.is_page_aligned() || - !bl.is_n_page_sized())) { - bl.rebuild_page_aligned(); + if (directio && (!bl.is_aligned(block_size) || + !bl.is_n_align_sized(CEPH_MINIMUM_BLOCK_SIZE))) { + bl.rebuild_aligned(CEPH_MINIMUM_BLOCK_SIZE); dout(10) << __func__ << " total memcopy: " << bl.get_memcopy_count() << dendl; - if ((bl.length() & ~CEPH_PAGE_MASK) != 0 || - (pos & ~CEPH_PAGE_MASK) != 0) + if ((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0 || + (pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0) dout(0) << "rebuild_page_aligned failed, " << bl << dendl; - assert((bl.length() & ~CEPH_PAGE_MASK) == 0); - assert((pos & ~CEPH_PAGE_MASK) == 0); + assert((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0); + assert((pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0); } } -- 2.47.3