]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
libcephfs: expose zerocopy in libcephfs
author Frank S. Filz <ffilzlnx@mindspring.com>
Tue, 18 Mar 2025 12:50:26 +0000 (12:50 +0000)
committer Igor Golikov <igolikov@ibm.com>
Tue, 18 Mar 2025 13:01:12 +0000 (13:01 +0000)
Zero copy can be used both with and without non-blocking I/O.

Signed-off-by: Igor Golikov <igolikov@ibm.com>
Fixes: https://tracker.ceph.com/issues/69919
src/include/cephfs/libcephfs.h
src/libcephfs.cc

index 4fc975801faf96c0a37dd0e7e5e6814b6ffa91a3..3054c1ddb07b64b0ff075810952e0a126c907a9f 100644 (file)
@@ -120,15 +120,19 @@ struct ceph_snapdiff_entry_t {
 
 struct ceph_ll_io_info {
   void (*callback) (struct ceph_ll_io_info *cb_info);
+  void (*release) (void *); // set by a successful zero-copy read; frees its buffers
+  void *release_data; // argument the caller must pass to release()
   void *priv; // private for caller
   struct Fh *fh;
-  const struct iovec *iov;
+  struct iovec *iov; // non-const: a zero-copy read replaces it with its own iovecs
   int iovcnt;
+  int iovmax; // maximum iovcnt allowed for a zero-copy read, 0 for no limit
   int64_t off;
   int64_t result;
   bool write;
   bool fsync;
   bool syncdataonly;
+  bool zerocopy; // request zero-copy I/O; reads then return buffers via iov
 };
 
 /* setattr mask bits (up to an int in size) */
@@ -1958,6 +1962,8 @@ int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh,
                      const struct iovec *iov, int iovcnt, int64_t off);
 int64_t ceph_ll_writev(struct ceph_mount_info *cmount, struct Fh *fh,
                       const struct iovec *iov, int iovcnt, int64_t off);
+int64_t ceph_ll_readv_writev(struct ceph_mount_info *cmount,
+                         struct ceph_ll_io_info *io_info);
 int64_t ceph_ll_nonblocking_readv_writev(struct ceph_mount_info *cmount,
                                         struct ceph_ll_io_info *io_info);
 int ceph_ll_close(struct ceph_mount_info *cmount, struct Fh* filehandle);
index ce1b6ccd0b05af277b067167f802f5b38f09b72d..7eeb0761abbfaf8db14592ee17dfa012a3472003 100644 (file)
@@ -2080,31 +2080,157 @@ extern "C" int64_t ceph_ll_writev(class ceph_mount_info *cmount,
   return (cmount->get_client()->ll_writev(fh, iov, iovcnt, off));
 }
 
+struct ceph_ll_readv_writev_buffer {
+  ceph_ll_readv_writev_buffer() : iov() {};
+  ~ceph_ll_readv_writev_buffer() {
+    delete iov;
+  }
+  int prepare_iovs(int iovmax) {
+    iovcnt = bl.get_num_buffers();
+    if (iovmax != 0 && iovcnt > iovmax) {
+      // TODO: refine the interface to accomodate for the buffer count exceeding
+      //       the desired maximum iovec length.
+      return -EINVAL;
+    }
+    iov = new struct iovec [iovcnt];
+    auto pb = std::cbegin(bl.buffers());
+    for (auto n = 0; n < iovcnt; n++) {
+      iov[n].iov_base = (void*)(pb->c_str());
+      iov[n].iov_len = pb->length();
+      ++pb;
+    }
+    return 0;
+  }
+  bool bigbuffer;
+  bufferlist bl;
+  int iovcnt;
+  struct iovec *iov;
+};
+
+extern "C"  void LL_onfinish_release(void *);
+
 class LL_Onfinish : public Context {
 public:
   LL_Onfinish(struct ceph_ll_io_info *io_info)
-    : io_info(io_info) {}
-  bufferlist bl;
+    : buf(), io_info(io_info) {}
+  struct ceph_ll_readv_writev_buffer buf;
 private:
+  // no ownership on io_info. should be released by the caller
   struct ceph_ll_io_info *io_info;
+  void complete(int r) override {
+    finish(r); // fills in io_info->result which may be different than r
+    if (io_info->result < 0 || io_info->write || !io_info->zerocopy) {
+      // this is an error, write, or a non-zerocopy read, delete...
+      // for a succesful zero-copy read, we need to keep this object until the
+      // caller is done with the buffers so it can use this as a hook to release
+      // them.
+      delete this;
+    }
+  }
   void finish(int r) override {
     if (!io_info->write && r > 0) {
-      copy_bufferlist_to_iovec(io_info->iov, io_info->iovcnt, &bl, r);
+      if (io_info->zerocopy) {
+        int r2 = buf.prepare_iovs(io_info->iovmax);
+
+        if (r2 < 0) {
+          io_info->result = r;
+          io_info->callback(io_info);
+          return;
+        }
+        
+        io_info->iovcnt = buf.iovcnt;
+        io_info->iov = buf.iov;
+        // This is a zero-copy read, set up for returning zero copy buffer
+        io_info->release = LL_onfinish_release;
+        io_info->release_data = this;
+      } else {
+        copy_bufferlist_to_iovec(io_info->iov, io_info->iovcnt, &buf.bl, r);
+      }
     }
     io_info->result = r;
     io_info->callback(io_info);
   }
 };
 
+extern "C" void LL_onfinish_release(void *release_data)
+{
+  // Release hook for a zero-copy read: the caller invokes it once it is done
+  // with the returned buffers. Deleting the completion object also cleans up
+  // the ceph_ll_readv_writev_buffer it contains.
+  Context *ctx = static_cast<Context *>(release_data);
+  delete ctx;
+}
+
 extern "C" int64_t ceph_ll_nonblocking_readv_writev(class ceph_mount_info *cmount,
                                                     struct ceph_ll_io_info *io_info)
 {
+  // a zero copy read MUST provide a length 1 iovec,
+  // where the buffer is a nullptr and the length is the requested read length
+  if (!io_info->write && io_info->zerocopy && io_info->iovcnt != 1) {
+    return -EINVAL;
+  }
+
   LL_Onfinish *onfinish = new LL_Onfinish(io_info);
 
+  // Note the above instantiates a ceph_ll_readv_writev_buffer which will be
+  // used by a successful zero-copy read. The caller will then call
+  // LL_onfinish_release when done with the read buffers.
+
   return (cmount->get_client()->ll_preadv_pwritev(
                         io_info->fh, io_info->iov, io_info->iovcnt,
-                       io_info->off, io_info->write, onfinish, &onfinish->bl,
-                       io_info->fsync, io_info->syncdataonly));
+                       io_info->off, io_info->write, onfinish, &onfinish->buf.bl,
+                       io_info->fsync, io_info->syncdataonly, io_info->zerocopy));
+}
+
+extern "C" void ceph_ll_readv_writev_release(void *release_data)
+{
+  // Release hook for a zero-copy read: the caller invokes it once it is done
+  // with the returned buffers, and the buffer object is destroyed here.
+  auto *zc_buf = static_cast<ceph_ll_readv_writev_buffer *>(release_data);
+
+  delete zc_buf;
+}
+
+extern "C" int64_t ceph_ll_readv_writev(class ceph_mount_info *cmount,
+                                    struct ceph_ll_io_info *io_info)
+{
+  ceph_ll_readv_writev_buffer *buf = new ceph_ll_readv_writev_buffer;
+
+  // a zero copy read MUST provide a length 1 iovec,
+  // where the buffer is a nullptr and the length is the requested read length
+  if (!io_info->write && io_info->zerocopy && io_info->iovcnt != 1) {
+    return -EINVAL;
+  }
+
+  io_info->result = (cmount->get_client()->ll_preadv_pwritev(
+                       io_info->fh, io_info->iov, io_info->iovcnt,
+                       io_info->off, io_info->write, nullptr, &buf->bl,
+                       io_info->fsync, io_info->syncdataonly, io_info->zerocopy));
+
+  if (!io_info->write && io_info->result > 0) {
+    if (io_info->zerocopy) {
+      buf->prepare_iovs(io_info->iovmax);
+      io_info->iovcnt = buf->iovcnt;
+      io_info->iov = buf->iov;
+      // This is a zero-copy read, set up for returning zero copy buffer
+      io_info->release = ceph_ll_readv_writev_release;
+      // the caller is responsible to call the io_info->release(io_info->release_data)
+      // when ready
+      io_info->release_data = buf;
+    } else {
+      copy_bufferlist_to_iovec(io_info->iov, io_info->iovcnt, &buf->bl,
+                               io_info->result);
+    }
+  }
+
+  // Note caller of a successful zero-copy read will end up calling
+  // ceph_ll_readv_writev_release to release the buffers, otherwise clean it
+  // up now since it is no longer needed.
+  if (io_info->write || io_info->result <= 0 || !io_info->zerocopy)
+    delete buf;
+
+  io_info->callback(io_info);
+  return io_info->result;
 }
 
 extern "C" int ceph_ll_close(class ceph_mount_info *cmount, Fh* fh)