]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph osd: add support for new op writesame
authorMike Christie <mchristi@redhat.com>
Wed, 6 Apr 2016 17:13:14 +0000 (12:13 -0500)
committerDavid Disseldorp <ddiss@suse.de>
Mon, 25 Apr 2016 13:07:57 +0000 (15:07 +0200)
This adds a new ceph request, writesame, that repeatedly writes a buffer
of writesame.data_length bytes, starting at writesame.offset, until
writesame.length bytes have been written.

This command maps to SCSI's WRITE SAME request, so users like LIO+rbd
can pass this to the OSD. Right now, it only saves having to transfer
writesame.length bytes over the network, but future versions will be
able to fully offload it by passing it directly to the FS/devices if they
support it.

v2:
- Fix tabs/spaces to match the coding style.
- Allow zero write length. Check for invalid data lengths.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Reviewed-by: David Disseldorp <ddiss@suse.de>
src/include/rados.h
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h
src/tracing/osd.tp

index c58277fdc6c08eb8648bf688d3173f64e6939ca6..1856230acf52cac32a8bd42f519d4d31a5308e20 100644 (file)
@@ -256,6 +256,9 @@ extern const char *ceph_osd_state_name(int s);
        f(CACHE_PIN,    __CEPH_OSD_OP(WR, DATA, 36),    "cache-pin")        \
        f(CACHE_UNPIN,  __CEPH_OSD_OP(WR, DATA, 37),    "cache-unpin")      \
                                                                            \
+       /* ESX/SCSI */                                                      \
+       f(WRITESAME,    __CEPH_OSD_OP(WR, DATA, 38),    "write-same")       \
+                                                                           \
        /** multi **/                                                       \
        f(CLONERANGE,   __CEPH_OSD_OP(WR, MULTI, 1),    "clonerange")       \
        f(ASSERT_SRC_VERSION, __CEPH_OSD_OP(RD, MULTI, 2), "assert-src-version") \
@@ -533,6 +536,11 @@ struct ceph_osd_op {
                        __le64 expected_object_size;
                        __le64 expected_write_size;
                } __attribute__ ((packed)) alloc_hint;
+               struct {
+                       __le64 offset;
+                       __le64 length;
+                       __le64 data_length;
+               } __attribute__ ((packed)) writesame;
        };
        __le32 payload_len;
 } __attribute__ ((packed));
index fc202b6f9e4c269224ad8d337cf556602d673965..60cb614d57d7174bf487548c887744028b8d89b1 100644 (file)
@@ -3701,6 +3701,40 @@ int ReplicatedPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
   }
 }
 
+int ReplicatedPG::do_writesame(OpContext *ctx, OSDOp& osd_op)  // Handle a WRITESAME op by expanding it into a single plain WRITE op.
+{  // Returns 0 for a zero-length request, -EINVAL for inconsistent lengths, otherwise the do_osd_ops() result.
+  ceph_osd_op& op = osd_op.op;
+  vector<OSDOp> write_ops(1);  // the one synthetic op that is handed to do_osd_ops() below
+  OSDOp& write_op = write_ops[0];
+  uint64_t write_length = op.writesame.length;  // bytes still to be filled with copies of the pattern
+  int result = 0;
+
+  if (!write_length)  // zero-length request: nothing to write, report success
+    return 0;
+
+  if (!op.writesame.data_length || write_length % op.writesame.data_length)  // pattern must be non-empty and divide the total length evenly
+    return -EINVAL;
+
+  if (op.writesame.data_length != osd_op.indata.length()) {  // declared pattern size must equal the payload actually received
+    derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
+    return -EINVAL;
+  }
+
+  while (write_length) {  // reaches zero exactly because length is a multiple of data_length (checked above)
+    write_op.indata.append(osd_op.indata.c_str(), op.writesame.data_length);  // NOTE(review): the whole expanded payload is built in memory here; no upper bound on writesame.length is checked in this function - confirm callers/limits
+    write_length -= op.writesame.data_length;
+  }
+
+  write_op.op.op = CEPH_OSD_OP_WRITE;  // the expanded buffer is submitted as an ordinary extent write
+  write_op.op.extent.offset = op.writesame.offset;
+  write_op.op.extent.length = op.writesame.length;
+  result = do_osd_ops(ctx, write_ops);
+  if (result < 0)  // only logs; the error code is still returned to the caller below
+    derr << "do_writesame do_osd_ops failed " << result << dendl;
+
+  return result;
+}
+
 // ========================================================================
 // low level osd ops
 
@@ -5085,6 +5119,12 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       }
       break;
 
+    case CEPH_OSD_OP_WRITESAME:
+      ++ctx->num_write;
+      tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
+      result = do_writesame(ctx, osd_op);
+      break;
+
     case CEPH_OSD_OP_ROLLBACK :
       ++ctx->num_write;
       tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
index 660855611c9684df2eba3edcf9dabbeac6691395..fc5e5d466731923f23c6873781f6cfd064ce2ff8 100644 (file)
@@ -1412,6 +1412,8 @@ protected:
   int do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr);
   int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr);
 
+  int do_writesame(OpContext *ctx, OSDOp& osd_op);
+
   bool pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata);
   int get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter);
 
index 7a2ffd98118307f3c98158b6265e7407065e3f88..36ffa7e85f7dcc471ef76091b294ab7503be647c 100644 (file)
@@ -381,6 +381,24 @@ TRACEPOINT_EVENT(osd, do_osd_op_pre_writefull,
     )
 )
 
+TRACEPOINT_EVENT(osd, do_osd_op_pre_writesame,
+    TP_ARGS(  // tracepoint fired from the CEPH_OSD_OP_WRITESAME case in do_osd_ops, before do_writesame() is called
+        const char*, oid,  // call site passes soid.oid.name.c_str()
+        uint64_t, snap,  // call site passes soid.snap.val
+        uint64_t, osize,  // call site passes oi.size
+        uint64_t, offset,  // op.writesame.offset
+        uint64_t, length,  // op.writesame.length
+        uint64_t, data_length),  // op.writesame.data_length
+    TP_FIELDS(
+        ctf_string(oid, oid)
+        ctf_integer(uint64_t, snap, snap)
+        ctf_integer(uint64_t, osize, osize)
+        ctf_integer(uint64_t, offset, offset)
+        ctf_integer(uint64_t, length, length)
+        ctf_integer(uint64_t, data_length, data_length)
+    )
+)
+
 TRACEPOINT_EVENT(osd, do_osd_op_pre_rollback,
     TP_ARGS(
         const char*, oid,