From f4f359e5bdecb4733371e545b21828960be0de2e Mon Sep 17 00:00:00 2001 From: Dhairya Parmar Date: Fri, 12 Jul 2024 20:50:47 +0530 Subject: [PATCH] client: restrict bufferlist to total write size In linux, systems calls like write() anyways don't allow writes > 2GiB, the total write size passed to the Client::_write is clamped to INT_MAX but bufferlist is not handled. This edge case is patched here. bug that arises due to buffer list beyond INT_MAX stalls async i/o due to: unknown file: Failure C++ exception with description "End of buffer [buffer:2]" thrown in the test body. 2024-05-28T16:17:06.854+0530 7f9a5d24c9c0 2 client.4311 unmount 2024-05-28T16:17:06.854+0530 7f9a5d24c9c0 2 client.4311 unmounting which results in disconnected inode and cap leaks: 2024-05-28T16:17:11.855+0530 7f9a5d24c9c0 1 client.4311 dump_inode: DISCONNECTED inode 0x10000000001 #0x10000000001 ref 3 0x10000000001.head(faked_ino=0 nref=3 ll_ref=0 cap_refs={4=0,1024=1,4096=1,8192=2} open={3=0} mode=100666 size=0/4294967296 nlink=1 btime=2024-05-28T16:17:03.387546+0530 mtime=2024-05-28T16:17:03.387546+0530 ctime=2024-05-28T16:17:03.387546+0530 change_attr=0 caps=pAsxLsXsxFsxcrwb(0=pAsxLsXsxFsxcrwb) objectset[0x10000000001 ts 0/0 objects 1 dirty_or_tx 0] 0x7f9a2c009530) 2024-05-28T16:17:11.855+0530 7f9a5d24c9c0 2 client.4311 cache still has 0+1 items, waiting (for caps to release?) This commit changes the way Client::_write accepts data. So, now instead of accepting ptr, iovec array and iovcnt, the helper now accepts a bufferlist which should be contructed by the caller itself. The reason behind this change is simple - to declutter the API. For more context checkout this conversation https://github.com/ceph/ceph/pull/58564#discussion_r2000226752 Fixes: https://tracker.ceph.com/issues/66245 Signed-off-by: Dhairya Parmar --- src/client/Client.cc | 49 +++++++++++++++++++++++++------------------- src/client/Client.h | 6 +++--- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 00b85a8e746..bde79ea2bf7 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -11305,7 +11305,9 @@ int Client::write(int fd, const char *buf, loff_t size, loff_t offset) #endif /* We can't return bytes written larger than INT_MAX, clamp size to that */ size = std::min(size, (loff_t)INT_MAX); - int r = _write(fh, offset, size, buf, NULL, false); + bufferlist bl; + bl.append(buf, size); + int r = _write(fh, offset, size, std::move(bl)); ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl; return r; } @@ -11330,7 +11332,7 @@ int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov, if(iovcnt < 0) { return -CEPHFS_EINVAL; } - loff_t totallen = 0; + size_t totallen = 0; for (int i = 0; i < iovcnt; i++) { totallen += iov[i].iov_len; } @@ -11340,12 +11342,29 @@ int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov, * 32-bit signed integers. Clamp the I/O sizes in those functions so that * we don't do I/Os larger than the values we can return. */ + bufferlist data; if (clamp_to_int) { - totallen = std::min(totallen, (loff_t)INT_MAX); + totallen = std::min(totallen, (size_t)INT_MAX); + size_t total_appended = 0; + for (int i = 0; i < iovcnt; i++) { + if (iov[i].iov_len > 0) { + if (total_appended + iov[i].iov_len >= totallen) { + data.append((const char *)iov[i].iov_base, totallen - total_appended); + break; + } else { + data.append((const char *)iov[i].iov_base, iov[i].iov_len); + total_appended += iov[i].iov_len; + } + } + } + } else { + for (int i = 0; i < iovcnt; i++) { + data.append((const char *)iov[i].iov_base, iov[i].iov_len); + } } if (write) { - int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt, onfinish, do_fsync, syncdataonly); + int64_t w = _write(fh, offset, totallen, std::move(data), onfinish, do_fsync, syncdataonly); ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl; return w; } else { @@ -11529,9 +11548,8 @@ bool Client::C_Write_Finisher::try_complete() return false; } -int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, - const struct iovec *iov, int iovcnt, Context *onfinish, - bool do_fsync, bool syncdataonly) +int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, bufferlist bl, + Context *onfinish, bool do_fsync, bool syncdataonly) { ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); @@ -11601,19 +11619,6 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, ceph_assert(in->inline_version > 0); } - // copy into fresh buffer (since our write may be resub, async) - bufferlist bl; - if (buf) { - if (size > 0) - bl.append(buf, size); - } else if (iov){ - for (int i = 0; i < iovcnt; i++) { - if (iov[i].iov_len > 0) { - bl.append((const char *)iov[i].iov_base, iov[i].iov_len); - } - } - } - int want, have; if (f->mode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; @@ -16066,7 +16071,9 @@ int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data) tout(cct) << off << std::endl; tout(cct) << len << std::endl; - int r = _write(fh, off, len, data, NULL, 0); + bufferlist bl; + bl.append(data, len); + int r = _write(fh, off, len, std::move(bl)); ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r << dendl; return r; diff --git a/src/client/Client.h b/src/client/Client.h index d5108c12262..b4817f3a1c3 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -1715,9 +1715,9 @@ private: void do_readahead(Fh *f, Inode *in, uint64_t off, uint64_t len); int64_t _write_success(Fh *fh, utime_t start, uint64_t fpos, int64_t offset, uint64_t size, Inode *in); - int64_t _write(Fh *fh, int64_t offset, uint64_t size, const char *buf, - const struct iovec *iov, int iovcnt, Context *onfinish = nullptr, - bool do_fsync = false, bool syncdataonly = false); + int64_t _write(Fh *fh, int64_t offset, uint64_t size, bufferlist bl, + Context *onfinish = nullptr, bool do_fsync = false, + bool syncdataonly = false); int64_t _preadv_pwritev_locked(Fh *fh, const struct iovec *iov, int iovcnt, int64_t offset, bool write, bool clamp_to_int, -- 2.39.5