From 3b5a0ea4c6697c1b48a6863c528dbf9ae31fd433 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 14 Oct 2021 12:45:20 +0800 Subject: [PATCH] mds: add truncate size handling support for fscrypt The kclient will only send truncate requests with the modified and encrypted last block contents when the new size is smaller and is not aligned to CEPH_FSCRYPT_BLOCK_SIZE, which is 4KB for now. If the Fx caps are issued and the new size is larger, the kclient will buffer the truncate. Otherwise it will send truncate requests without the last block filled. When fscrypt is enabled and the file is truncated to a smaller size, both the old size and the new size in the truncate request will always be rounded up to CEPH_FSCRYPT_BLOCK_SIZE, which is 4KB for now, in the kclient. For example, when truncating a file from 3KB to 2KB, the MDS will always get old_size == new_size == 4KB. So we need to check whether last block data was passed together with the truncate request to determine whether this is a truncate to a smaller size. The kclient will send its 'change_attr' along with the truncate request, and the MDS will compare it with the one in the CInode just after the MDS successfully xlocks the CInode's filelock. If they are different, it means some clients have possibly updated the file or had dirty caps just before the MDS xlocked the CInode's filelock. We will let the kclient retry by returning a -EAGAIN errno. Then the MDS will write the last block to the OSD and then truncate the size as normal. Currently the last block contents are journaled together with the projected inode only; they are cleared after the truncate finishes and are not persisted together with the CInode:inode_t in the metadata pool. 
Signed-off-by: Xiubo Li --- src/mds/MDCache.cc | 132 ++++++++++++++++++++++++++++++++++--- src/mds/MDCache.h | 2 + src/mds/Server.cc | 39 ++++++++++- src/mds/events/EMetaBlob.h | 7 +- src/mds/fscrypt.h | 41 ++++++++++++ src/mds/mdstypes.h | 17 ++++- 6 files changed, 221 insertions(+), 17 deletions(-) create mode 100644 src/mds/fscrypt.h diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 0ddba6482ce..15f46653869 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -63,6 +63,7 @@ #include "events/ESessions.h" #include "InoTable.h" +#include "fscrypt.h" #include "common/Timer.h" @@ -6415,6 +6416,22 @@ void MDCache::truncate_inode(CInode *in, LogSegment *ls) _truncate_inode(in, ls); } +struct C_IO_MDC_TruncateWriteFinish : public MDCacheIOContext { + CInode *in; + LogSegment *ls; + uint32_t block_size; + C_IO_MDC_TruncateWriteFinish(MDCache *c, CInode *i, LogSegment *l, uint32_t bs) : + MDCacheIOContext(c, false), in(i), ls(l), block_size(bs) { + } + void finish(int r) override { + ceph_assert(r == 0 || r == -CEPHFS_ENOENT); + mdcache->truncate_inode_write_finish(in, ls, block_size); + } + void print(ostream& out) const override { + out << "file_truncate_write(" << in->ino() << ")"; + } +}; + struct C_IO_MDC_TruncateFinish : public MDCacheIOContext { CInode *in; LogSegment *ls; @@ -6434,13 +6451,16 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls) { const auto& pi = in->get_inode(); dout(10) << "_truncate_inode " - << pi->truncate_from << " -> " << pi->truncate_size - << " on " << *in << dendl; + << pi->truncate_from << " -> " << pi->truncate_size + << " fscrypt last block length is " << pi->fscrypt_last_block.length() + << " on " << *in << dendl; ceph_assert(pi->is_truncating()); ceph_assert(pi->truncate_size < (1ULL << 63)); ceph_assert(pi->truncate_from < (1ULL << 63)); - ceph_assert(pi->truncate_size < pi->truncate_from); + ceph_assert(pi->truncate_size < pi->truncate_from || + (pi->truncate_size == pi->truncate_from && + 
pi->fscrypt_last_block.length())); SnapRealm *realm = in->find_snaprealm(); @@ -6454,13 +6474,62 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls) snapc = &nullsnap; ceph_assert(in->last == CEPH_NOSNAP); } - dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl; + dout(10) << "_truncate_inode snapc " << snapc << " on " << *in + << " fscrypt_last_block length is " << pi->fscrypt_last_block.length() + << dendl; auto layout = pi->layout; - filer.truncate(in->ino(), &layout, *snapc, - pi->truncate_size, pi->truncate_from-pi->truncate_size, - pi->truncate_seq, ceph::real_time::min(), 0, - new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls), - mds->finisher)); + struct ceph_fscrypt_last_block_header header; + memset(&header, 0, sizeof(header)); + bufferlist data; + if (pi->fscrypt_last_block.length()) { + auto bl = pi->fscrypt_last_block.cbegin(); + DECODE_START(1, bl); + decode(header.change_attr, bl); + decode(header.file_offset, bl); + decode(header.block_size, bl); + + /* + * The block_size will be in unit of KB, so if the last block is not + * located in a file hole, the struct_len should be larger than the + * header.block_size. + */ + if (struct_len > header.block_size) { + bl.copy(header.block_size, data); + } + DECODE_FINISH(bl); + } + + if (data.length()) { + dout(10) << "_truncate_inode write on inode " << *in << " change_attr: " + << header.change_attr << " offset: " << header.file_offset << " blen: " + << header.block_size << dendl; + filer.write(in->ino(), &layout, *snapc, header.file_offset, header.block_size, + data, ceph::real_time::min(), 0, + new C_OnFinisher(new C_IO_MDC_TruncateWriteFinish(this, in, ls, + header.block_size), + mds->finisher)); + } else { // located in file hole. + uint64_t length = pi->truncate_from - pi->truncate_size; + + /* + * When the fscrypt is enabled the truncate_from and truncate_size + * possibly equal and both are aligned up to header.block_size. 
In + * this case we will always request a larger length to make sure the + * OSD won't miss truncating the last object. + */ + if (pi->fscrypt_last_block.length()) { + dout(10) << "_truncate_inode truncate on inode " << *in << " hits a hole!" << dendl; + length += header.block_size; + } + ceph_assert(length); + + dout(10) << "_truncate_inode truncate on inode " << *in << dendl; + filer.truncate(in->ino(), &layout, *snapc, pi->truncate_size, length, + pi->truncate_seq, ceph::real_time::min(), 0, + new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls), + mds->finisher)); + } + } struct C_MDC_TruncateLogged : public MDCacheLogContext { @@ -6473,6 +6542,50 @@ struct C_MDC_TruncateLogged : public MDCacheLogContext { } }; +void MDCache::truncate_inode_write_finish(CInode *in, LogSegment *ls, + uint32_t block_size) +{ + const auto& pi = in->get_inode(); + dout(10) << "_truncate_inode_write " + << pi->truncate_from << " -> " << pi->truncate_size + << " on " << *in << dendl; + + ceph_assert(pi->is_truncating()); + ceph_assert(pi->truncate_size < (1ULL << 63)); + ceph_assert(pi->truncate_from < (1ULL << 63)); + ceph_assert(pi->truncate_size < pi->truncate_from || + (pi->truncate_size == pi->truncate_from && + pi->fscrypt_last_block.length())); + + + SnapRealm *realm = in->find_snaprealm(); + SnapContext nullsnap; + const SnapContext *snapc; + if (realm) { + dout(10) << " realm " << *realm << dendl; + snapc = &realm->get_snap_context(); + } else { + dout(10) << " NO realm, using null context" << dendl; + snapc = &nullsnap; + ceph_assert(in->last == CEPH_NOSNAP); + } + dout(10) << "_truncate_inode_write snapc " << snapc << " on " << *in + << " fscrypt_last_block length is " << pi->fscrypt_last_block.length() + << dendl; + auto layout = pi->layout; + /* + * When the fscrypt is enabled the truncate_from and truncate_size + * possibly equal and both are aligned up to header.block_size. 
In + * this case we will always request a larger length to make sure the + * OSD won't miss truncating the last object. + */ + uint64_t length = pi->truncate_from - pi->truncate_size + block_size; + filer.truncate(in->ino(), &layout, *snapc, pi->truncate_size, length, + pi->truncate_seq, ceph::real_time::min(), 0, + new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls), + mds->finisher)); +} + void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls) { dout(10) << "truncate_inode_finish " << *in << dendl; @@ -6489,6 +6602,7 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls) pi.inode->version = in->pre_dirty(); pi.inode->truncate_from = 0; pi.inode->truncate_pending--; + pi.inode->fscrypt_last_block = bufferlist(); EUpdate *le = new EUpdate(mds->mdlog, "truncate finish"); mds->mdlog->start_entry(le); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index c0487afd7d5..af86663d93e 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -734,6 +734,8 @@ class MDCache { void truncate_inode(CInode *in, LogSegment *ls); void _truncate_inode(CInode *in, LogSegment *ls); void truncate_inode_finish(CInode *in, LogSegment *ls); + void truncate_inode_write_finish(CInode *in, LogSegment *ls, + uint32_t block_size); void truncate_inode_logged(CInode *in, MutationRef& mut); void add_recovered_truncate(CInode *in, LogSegment *ls); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 628241d754d..61bd2a55d78 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -50,6 +50,7 @@ #include "common/perf_counters.h" #include "include/compat.h" #include "osd/OSDMap.h" +#include "fscrypt.h" #include @@ -5082,7 +5083,15 @@ void Server::handle_client_setattr(MDRequestRef& mdr) bool truncating_smaller = false; if (mask & CEPH_SETATTR_SIZE) { - truncating_smaller = req->head.args.setattr.size < old_size; + if (req->get_data().length() > + sizeof(struct ceph_fscrypt_last_block_header) + fscrypt_last_block_max_size) { + dout(10) << __func__ << ": the last 
block size is too large" << dendl; + respond_to_request(mdr, -CEPHFS_EINVAL); + return; + } + + truncating_smaller = req->head.args.setattr.size < old_size || + (req->head.args.setattr.size == old_size && req->get_data().length()); if (truncating_smaller && pip->is_truncating()) { dout(10) << " waiting for pending truncate from " << pip->truncate_from << " to " << pip->truncate_size << " to complete on " << *cur << dendl; @@ -5091,6 +5100,32 @@ void Server::handle_client_setattr(MDRequestRef& mdr) cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr)); return; } + + if (truncating_smaller && req->get_data().length()) { + struct ceph_fscrypt_last_block_header header; + memset(&header, 0, sizeof(header)); + auto bl = req->get_data().cbegin(); + DECODE_START(1, bl); + decode(header.change_attr, bl); + DECODE_FINISH(bl); + + dout(20) << __func__ << " mdr->retry:" << mdr->retry + << " header.change_attr: " << header.change_attr + << " header.file_offset: " << header.file_offset + << " header.block_size: " << header.block_size + << dendl; + + if (header.change_attr != pip->change_attr) { + dout(5) << __func__ << ": header.change_attr:" << header.change_attr + << " != current change_attr:" << pip->change_attr + << ", let client retry it!" << dendl; + // flush the journal to make sure the clients will get the lasted + // change_attr as possible for the next retry + mds->mdlog->flush(); + respond_to_request(mdr, -CEPHFS_EAGAIN); + return; + } + } } bool changed_ranges = false; @@ -5125,7 +5160,7 @@ void Server::handle_client_setattr(MDRequestRef& mdr) pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point. 
if (mask & CEPH_SETATTR_SIZE) { if (truncating_smaller) { - pi.inode->truncate(old_size, req->head.args.setattr.size); + pi.inode->truncate(old_size, req->head.args.setattr.size, req->get_data()); le->metablob.add_truncate_start(cur->ino()); } else { pi.inode->size = req->head.args.setattr.size; diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index a766504f5f4..6e4a8ea5ef0 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -470,9 +470,10 @@ private: sr->encode(snapbl); lump.nfull++; - lump.add_dfull(dn->get_name(), dn->get_alternate_name(), dn->first, dn->last, dn->get_projected_version(), - pi, in->dirfragtree, in->get_projected_xattrs(), in->symlink, - in->oldest_snap, snapbl, state, in->get_old_inodes()); + lump.add_dfull(dn->get_name(), dn->get_alternate_name(), dn->first, dn->last, + dn->get_projected_version(), pi, in->dirfragtree, + in->get_projected_xattrs(), in->symlink, in->oldest_snap, snapbl, + state, in->get_old_inodes()); // make note of where this inode was last journaled in->last_journaled = event_seq; diff --git a/src/mds/fscrypt.h b/src/mds/fscrypt.h new file mode 100644 index 00000000000..c3817a3df15 --- /dev/null +++ b/src/mds/fscrypt.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPHFS_FSCRYPT_H +#define CEPHFS_FSCRYPT_H + +struct ceph_fscrypt_last_block_header { + __u8 ver; + __u8 compat; + + /* If the last block is located in a file hole the length + * will be sizeof(i_version + file_offset + block_size), + * or will plus to extra BLOCK SIZE. 
+ */ + uint32_t data_len; + + /* inode change attr version */ + uint64_t change_attr; + + /* + * For a file hole, this will be 0, or it will be the offset from + * which will write the last block + */ + uint64_t file_offset; + + /* It should always be the fscrypt block size */ + uint32_t block_size; +}; + +#endif diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 8749ef094bd..d2bebb04f2a 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -469,8 +469,12 @@ struct inode_t { bool is_file() const { return (mode & S_IFMT) == S_IFREG; } bool is_truncating() const { return (truncate_pending > 0); } + void truncate(uint64_t old_size, uint64_t new_size, const bufferlist &fbl) { + truncate(old_size, new_size); + fscrypt_last_block = fbl; + } void truncate(uint64_t old_size, uint64_t new_size) { - ceph_assert(new_size < old_size); + ceph_assert(new_size <= old_size); if (old_size > max_size_ever) max_size_ever = old_size; truncate_from = old_size; @@ -627,6 +631,8 @@ struct inode_t { std::vector fscrypt_auth; std::vector fscrypt_file; + bufferlist fscrypt_last_block; + private: bool older_is_consistent(const inode_t &other) const; }; @@ -635,7 +641,7 @@ private: template class Allocator> void inode_t::encode(ceph::buffer::list &bl, uint64_t features) const { - ENCODE_START(18, 6, bl); + ENCODE_START(19, 6, bl); encode(ino, bl); encode(rdev, bl); @@ -693,13 +699,14 @@ void inode_t::encode(ceph::buffer::list &bl, uint64_t features) const encode(!fscrypt_auth.empty(), bl); encode(fscrypt_auth, bl); encode(fscrypt_file, bl); + encode(fscrypt_last_block, bl); ENCODE_FINISH(bl); } template class Allocator> void inode_t::decode(ceph::buffer::list::const_iterator &p) { - DECODE_START_LEGACY_COMPAT_LEN(18, 6, 6, p); + DECODE_START_LEGACY_COMPAT_LEN(19, 6, 6, p); decode(ino, p); decode(rdev, p); @@ -806,6 +813,10 @@ void inode_t::decode(ceph::buffer::list::const_iterator &p) decode(fscrypt_auth, p); decode(fscrypt_file, p); } + + if (struct_v >= 19) { + 
decode(fscrypt_last_block, p); + } DECODE_FINISH(p); } -- 2.39.5