LIBCEPHFS: Expose zerocopy in libcephfs (zero, 58228/head)
Author:     Frank S. Filz <ffilzlnx@mindspring.com>
AuthorDate: Fri, 14 Jun 2024 00:06:01 +0000 (17:06 -0700)
Commit:     Frank S. Filz <ffilzlnx@mindspring.com>
CommitDate: Thu, 21 Nov 2024 22:39:24 +0000 (14:39 -0800)
Zero copy can be used both with and without non-blocking I/O.

Signed-off-by: Frank S. Filz <ffilzlnx@mindspring.com>
src/include/cephfs/libcephfs.h
src/libcephfs.cc

diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
index ba0b76e072b57e1d79a1559ffe6fdc60e17877ab..0e323cf642c238f0fd0dd63ec92b634e4ecb3c65 100644
--- a/src/include/cephfs/libcephfs.h
+++ b/src/include/cephfs/libcephfs.h
@@ -120,15 +120,19 @@ struct ceph_snapdiff_entry_t {
 
 struct ceph_ll_io_info {
   void (*callback) (struct ceph_ll_io_info *cb_info);
+  void (*release) (void *);
+  void *release_data;
   void *priv; // private for caller
   struct Fh *fh;
-  const struct iovec *iov;
+  struct iovec *iov;
   int iovcnt;
+  int iovmax; // maximum iovcnt allowed, 0 for no limit
   int64_t off;
   int64_t result;
   bool write;
   bool fsync;
   bool syncdataonly;
+  bool zerocopy;
 };
 
 /* setattr mask bits (up to an int in size) */
@@ -1947,6 +1951,8 @@ int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh,
                      const struct iovec *iov, int iovcnt, int64_t off);
 int64_t ceph_ll_writev(struct ceph_mount_info *cmount, struct Fh *fh,
                       const struct iovec *iov, int iovcnt, int64_t off);
+void ceph_ll_readv_writev(struct ceph_mount_info *cmount,
+                         struct ceph_ll_io_info *io_info);
 int64_t ceph_ll_nonblocking_readv_writev(struct ceph_mount_info *cmount,
                                         struct ceph_ll_io_info *io_info);
 int ceph_ll_close(struct ceph_mount_info *cmount, struct Fh* filehandle);
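
A minimal, hedged sketch of how a caller might drive the new synchronous entry point for a zero-copy read. It assumes the requested length is conveyed through a single caller iovec whose iov_base is left NULL (the synchronous path asserts iovcnt == 1 for zero-copy reads), and consume() is a hypothetical placeholder for whatever the application does with the returned data; mount setup and error handling are omitted.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/uio.h>
#include <cephfs/libcephfs.h>

/* Hypothetical consumer of the library-owned buffers. */
void consume(const void *data, size_t len);

static void read_done(struct ceph_ll_io_info *cb_info)
{
  /* ceph_ll_readv_writev() invokes the callback before it returns, so there
   * is nothing extra to do in the synchronous case. */
}

static int64_t zerocopy_read(struct ceph_mount_info *cmount, struct Fh *fh,
                             size_t len, int64_t off)
{
  /* One iovec giving the requested length; no data buffer is supplied
   * because the library hands back its own buffers on success. */
  struct iovec iov = { .iov_base = NULL, .iov_len = len };
  struct ceph_ll_io_info io_info = {
    .callback = read_done,
    .fh       = fh,
    .iov      = &iov,
    .iovcnt   = 1,    /* single iovec for a zero-copy read */
    .iovmax   = 0,    /* no limit on how many iovecs may be returned */
    .off      = off,
    .write    = false,
    .zerocopy = true,
  };

  ceph_ll_readv_writev(cmount, &io_info);

  if (io_info.result > 0) {
    /* io_info.iov/io_info.iovcnt now describe the library-owned buffers. */
    for (int i = 0; i < io_info.iovcnt; i++)
      consume(io_info.iov[i].iov_base, io_info.iov[i].iov_len);
    /* Hand the buffers back once the data has been consumed. */
    io_info.release(io_info.release_data);
  }
  return io_info.result;
}
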
diff --git a/src/libcephfs.cc b/src/libcephfs.cc
index 7eea6665f614594ec129a5ee3b7e5f3adfaf1b8c..120b998cd9877bf5ebea9b0fc2313e9273686fd5 100644
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -2019,31 +2019,145 @@ extern "C" int64_t ceph_ll_writev(class ceph_mount_info *cmount,
   return (cmount->get_client()->ll_writev(fh, iov, iovcnt, off));
 }
 
+struct ceph_ll_readv_writev_buffer {
+  ceph_ll_readv_writev_buffer() : iovcnt(0), iov(nullptr) {}
+  ~ceph_ll_readv_writev_buffer() {
+    delete[] iov;
+  }
+  int prepare_iovs(int iovmax) {
+    iovcnt = bl.get_num_buffers();
+    if (iovmax != 0 && iovcnt > iovmax) {
+      // TODO: refine the interface to accommodate a buffer count that exceeds
+      //       the desired maximum iovec length.
+      return -CEPHFS_EINVAL;
+    }
+    iov = new struct iovec [iovcnt];
+    auto pb = std::cbegin(bl.buffers());
+    for (auto n = 0; n < iovcnt; n++) {
+      iov[n].iov_base = (void*)(pb->c_str());
+      iov[n].iov_len = pb->length();
+      ++pb;
+    }
+    return 0;
+  }
+  bool bigbuffer;
+  bufferlist bl;
+  int iovcnt;
+  struct iovec *iov;
+};
+
+extern "C"  void LL_onfinish_release(void *);
+
 class LL_Onfinish : public Context {
 public:
   LL_Onfinish(struct ceph_ll_io_info *io_info)
-    : io_info(io_info) {}
-  bufferlist bl;
+    : buf(), io_info(io_info) {}
+  struct ceph_ll_readv_writev_buffer buf;
 private:
   struct ceph_ll_io_info *io_info;
+  void complete(int r) override {
+    finish(r); // fills in io_info->result, which may be different from r
+    if (io_info->result < 0 || io_info->write || !io_info->zerocopy) {
+      // This is an error, a write, or a non-zerocopy read, so delete now.
+      // For a successful zero-copy read, we need to keep this object until the
+      // caller is done with the buffers, since it serves as the hook to release
+      // them.
+      delete this;
+    }
+  }
   void finish(int r) override {
     if (!io_info->write && r > 0) {
-      copy_bufferlist_to_iovec(io_info->iov, io_info->iovcnt, &bl, r);
+      if (io_info->zerocopy) {
+        int r2 = buf.prepare_iovs(io_info->iovmax);
+
+        if (r2 < 0) {
+          // Could not map the zero-copy buffers into iovecs; report the error.
+          io_info->result = r2;
+          io_info->callback(io_info);
+          return;
+        }
+
+        io_info->iovcnt = buf.iovcnt;
+        io_info->iov = buf.iov;
+        // This is a zero-copy read; set up to return the zero-copy buffers.
+        io_info->release = LL_onfinish_release;
+        io_info->release_data = this;
+      } else {
+        copy_bufferlist_to_iovec(io_info->iov, io_info->iovcnt, &buf.bl, r);
+      }
     }
     io_info->result = r;
     io_info->callback(io_info);
   }
 };
 
+extern "C" void LL_onfinish_release(void *release_data)
+{
+  Context *onfinish = (Context *) release_data;
+
+  // Clean up this object (and the included ceph_ll_readv_writev_buffer) when
+  // the caller is done. This is only called for a zero-copy read.
+  delete onfinish;
+}
+
 extern "C" int64_t ceph_ll_nonblocking_readv_writev(class ceph_mount_info *cmount,
                                                    struct ceph_ll_io_info *io_info)
 {
   LL_Onfinish *onfinish = new LL_Onfinish(io_info);
 
+  // Note the above instantiates a ceph_ll_readv_writev_buffer which will be
+  // used by a successful zero-copy read. The caller will then call
+  // LL_onfinish_release when done with the read buffers.
+
   return (cmount->get_client()->ll_preadv_pwritev(
                        io_info->fh, io_info->iov, io_info->iovcnt,
-                       io_info->off, io_info->write, onfinish, &onfinish->bl,
-                       io_info->fsync, io_info->syncdataonly));
+                       io_info->off, io_info->write, onfinish, &onfinish->buf.bl,
+                       io_info->fsync, io_info->syncdataonly, io_info->zerocopy));
+}
+
+extern "C" void ceph_ll_readv_writev_release(void *release_data)
+{
+  ceph_ll_readv_writev_buffer *buf = (ceph_ll_readv_writev_buffer *) release_data;
+
+  // Clean up the buffer when the caller is done. This is only called for a
+  // zero-copy read.
+  delete buf;
+}
+
+extern "C" void ceph_ll_readv_writev(class ceph_mount_info *cmount,
+                                    struct ceph_ll_io_info *io_info)
+{
+  ceph_ll_readv_writev_buffer *buf = new ceph_ll_readv_writev_buffer;
+
+  if (!io_info->write && io_info->zerocopy) {
+    ceph_assert(io_info->iovcnt == 1);
+  }
+
+  io_info->result = (cmount->get_client()->ll_preadv_pwritev(
+                       io_info->fh, io_info->iov, io_info->iovcnt,
+                       io_info->off, io_info->write, nullptr, &buf->bl,
+                       io_info->fsync, io_info->syncdataonly, io_info->zerocopy));
+
+  if (!io_info->write && io_info->result > 0) {
+    if (io_info->zerocopy) {
+      int r2 = buf->prepare_iovs(io_info->iovmax);
+
+      if (r2 < 0) {
+        // Could not map the zero-copy buffers into iovecs; report the error.
+        io_info->result = r2;
+      } else {
+        io_info->iovcnt = buf->iovcnt;
+        io_info->iov = buf->iov;
+        // This is a zero-copy read; set up to return the zero-copy buffers.
+        io_info->release = ceph_ll_readv_writev_release;
+        io_info->release_data = buf;
+      }
+    } else {
+      copy_bufferlist_to_iovec(io_info->iov, io_info->iovcnt, &buf->bl,
+                               io_info->result);
+    }
+  }
+
+  // Note that the caller of a successful zero-copy read will end up calling
+  // ceph_ll_readv_writev_release to release the buffers; otherwise clean the
+  // buffer up now since it is no longer needed.
+  if (io_info->write || io_info->result <= 0 || !io_info->zerocopy)
+    delete buf;
+
+  io_info->callback(io_info);
 }
 
 extern "C" int ceph_ll_close(class ceph_mount_info *cmount, Fh* fh)