From fe9a27d31d0dad60446faae32a009f5554bca956 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 20 Aug 2008 09:19:28 -0700 Subject: [PATCH] osd: clean up osd opcodes, introduce WRITEFULL --- src/include/ceph_fs.h | 73 ++++++++++++++++++++++----- src/messages/MOSDOp.h | 34 +------------ src/messages/MOSDOpReply.h | 2 +- src/messages/MOSDSubOp.h | 2 +- src/messages/MOSDSubOpReply.h | 2 +- src/osd/ReplicatedPG.cc | 92 ++++++++++++----------------------- 6 files changed, 99 insertions(+), 106 deletions(-) diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 0ea0f88bdc71d..0fe459e8167ee 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -97,7 +97,7 @@ struct ceph_timespec { * to feed encoded frags as values into frag_contains_value. */ static inline __u32 frag_make(__u32 b, __u32 v) { - return (b << 24) | + return (b << 24) | (v & (0xffffffu << (24-b)) & 0xffffffu); } static inline __u32 frag_bits(__u32 f) { return f >> 24; } @@ -534,7 +534,7 @@ struct ceph_mds_session_head { /* client_request */ /* - * mds ops. + * mds ops. * & 0x1000 -> write op * & 0x10000 -> follow symlink (e.g. stat(), not lstat()). & & 0x100000 -> use weird ino/path trace @@ -552,7 +552,7 @@ enum { CEPH_MDS_OP_LSETXATTR = 0x01104, CEPH_MDS_OP_LRMXATTR = 0x01105, CEPH_MDS_OP_LSETLAYOUT= 0x01106, - + CEPH_MDS_OP_STAT = 0x10100, CEPH_MDS_OP_UTIME = 0x11101, CEPH_MDS_OP_CHMOD = 0x11102, @@ -914,16 +914,19 @@ struct ceph_mds_snap_realm { * osd ops */ enum { + /* read */ CEPH_OSD_OP_READ = 1, CEPH_OSD_OP_STAT = 2, - CEPH_OSD_OP_REPLICATE = 3, - CEPH_OSD_OP_UNREPLICATE = 4, - CEPH_OSD_OP_WRNOOP = 10, - CEPH_OSD_OP_WRITE = 11, - CEPH_OSD_OP_DELETE = 12, + + /* modify */ + CEPH_OSD_OP_WRNOOP = 10, /* write no-op (i.e. sync) */ + CEPH_OSD_OP_WRITE = 11, /* write extent */ + CEPH_OSD_OP_DELETE = 12, /* delete object */ CEPH_OSD_OP_TRUNCATE = 13, - CEPH_OSD_OP_ZERO = 14, + CEPH_OSD_OP_ZERO = 14, /* zero extent */ + CEPH_OSD_OP_WRITEFULL = 15, /* write complete object */ + /* lock */ CEPH_OSD_OP_WRLOCK = 20, CEPH_OSD_OP_WRUNLOCK = 21, CEPH_OSD_OP_RDLOCK = 22, @@ -931,13 +934,61 @@ enum { CEPH_OSD_OP_UPLOCK = 24, CEPH_OSD_OP_DNLOCK = 25, + /* subop */ CEPH_OSD_OP_PULL = 30, CEPH_OSD_OP_PUSH = 31, - CEPH_OSD_OP_BALANCEREADS = 101, - CEPH_OSD_OP_UNBALANCEREADS = 102 + CEPH_OSD_OP_BALANCEREADS = 40, + CEPH_OSD_OP_UNBALANCEREADS = 41 }; +static inline int ceph_osd_op_is_read(int op) +{ + return op < 10; +} +static inline int ceph_osd_op_is_modify(int op) +{ + return op >= 10 && op < 20; +} +static inline int ceph_osd_op_is_lock(int op) +{ + return op >= 20 && op < 30; +} +static inline int ceph_osd_op_is_subop(int op) +{ + return op >= 30 && op < 40; +} + +static inline const char* ceph_osd_op_name(int op) +{ + switch (op) { + case CEPH_OSD_OP_READ: return "read"; + case CEPH_OSD_OP_STAT: return "stat"; + + case CEPH_OSD_OP_WRNOOP: return "wrnoop"; + case CEPH_OSD_OP_WRITE: return "write"; + case CEPH_OSD_OP_DELETE: return "delete"; + case CEPH_OSD_OP_TRUNCATE: return "truncate"; + case CEPH_OSD_OP_ZERO: return "zero"; + case CEPH_OSD_OP_WRITEFULL: return "writefull"; + + case CEPH_OSD_OP_WRLOCK: return "wrlock"; + case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; + case CEPH_OSD_OP_RDLOCK: return "rdlock"; + case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; + case CEPH_OSD_OP_UPLOCK: return "uplock"; + case CEPH_OSD_OP_DNLOCK: return "dnlock"; + + case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; + case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; + + case CEPH_OSD_OP_PULL: return "pull"; + case CEPH_OSD_OP_PUSH: return "push"; + default: return ""; + } +} + + /* * osd op flags */ diff --git a/src/messages/MOSDOp.h b/src/messages/MOSDOp.h index 9c2b68609b683..c05b62e59a230 100644 --- a/src/messages/MOSDOp.h +++ b/src/messages/MOSDOp.h @@ -30,34 +30,6 @@ #define EINCLOCKED 100 class MOSDOp : public Message { -public: - static const char* get_opname(int op) { - switch (op) { - case CEPH_OSD_OP_READ: return "read"; - case CEPH_OSD_OP_STAT: return "stat"; - - case CEPH_OSD_OP_WRNOOP: return "wrnoop"; - case CEPH_OSD_OP_WRITE: return "write"; - case CEPH_OSD_OP_ZERO: return "zero"; - case CEPH_OSD_OP_DELETE: return "delete"; - case CEPH_OSD_OP_TRUNCATE: return "truncate"; - case CEPH_OSD_OP_WRLOCK: return "wrlock"; - case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; - case CEPH_OSD_OP_RDLOCK: return "rdlock"; - case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; - case CEPH_OSD_OP_UPLOCK: return "uplock"; - case CEPH_OSD_OP_DNLOCK: return "dnlock"; - - case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; - case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - - case CEPH_OSD_OP_PULL: return "pull"; - case CEPH_OSD_OP_PUSH: return "push"; - default: assert(0); - } - return 0; - } - private: ceph_osd_request_head head; vector snaps; @@ -84,9 +56,7 @@ public: const int get_op() { return head.op; } void set_op(int o) { head.op = o; } - bool is_read() { - return get_op() < 10; - } + bool is_read() { return ceph_osd_op_is_read(get_op()); } loff_t get_length() const { return head.length; } loff_t get_offset() const { return head.offset; } @@ -161,7 +131,7 @@ public: const char *get_type_name() { return "osd_op"; } void print(ostream& out) { out << "osd_op(" << get_reqid() - << " " << get_opname(get_op()) + << " " << ceph_osd_op_name(get_op()) << " " << head.oid; if (get_length()) out << " " << get_offset() << "~" << get_length(); out << " " << pg_t(head.layout.ol_pgid); diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h index 1ad1356fcedd2..eac34b7de1d9c 100644 --- a/src/messages/MOSDOpReply.h +++ b/src/messages/MOSDOpReply.h @@ -88,7 +88,7 @@ public: void print(ostream& out) { out << "osd_op_reply(" << get_tid() - << " " << MOSDOp::get_opname(get_op()) + << " " << ceph_osd_op_name(get_op()) << " " << head.oid; if (get_length()) out << " " << get_offset() << "~" << get_length(); if (get_op() >= 10) { diff --git a/src/messages/MOSDSubOp.h b/src/messages/MOSDSubOp.h index 20906f43e5f56..9f26fd0d39a41 100644 --- a/src/messages/MOSDSubOp.h +++ b/src/messages/MOSDSubOp.h @@ -116,7 +116,7 @@ public: const char *get_type_name() { return "osd_sub_op"; } void print(ostream& out) { out << "osd_sub_op(" << reqid - << " " << MOSDOp::get_opname(op) + << " " << ceph_osd_op_name(op) << " " << poid << " v " << version << " snapset=" << snapset << " snapc=" << snapc; diff --git a/src/messages/MOSDSubOpReply.h b/src/messages/MOSDSubOpReply.h index c06d6ead14ef4..67eab56011104 100644 --- a/src/messages/MOSDSubOpReply.h +++ b/src/messages/MOSDSubOpReply.h @@ -126,7 +126,7 @@ public: void print(ostream& out) { out << "osd_sub_op_reply(" << reqid - << " " << MOSDOp::get_opname(op) + << " " << ceph_osd_op_name(op) << " " << poid; if (length) out << " " << offset << "~" << length; if (op >= 10) { diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index ce3e869b3b456..e6ab7172411fa 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -363,34 +363,10 @@ void ReplicatedPG::do_op(MOSDOp *op) osd->logger->inc("op"); - switch (op->get_op()) { - - // reads - case CEPH_OSD_OP_READ: - case CEPH_OSD_OP_STAT: + if (ceph_osd_op_is_read(op->get_op())) op_read(op); - break; - - // writes - case CEPH_OSD_OP_WRNOOP: - case CEPH_OSD_OP_WRITE: - case CEPH_OSD_OP_ZERO: - case CEPH_OSD_OP_DELETE: - case CEPH_OSD_OP_TRUNCATE: - case CEPH_OSD_OP_WRLOCK: - case CEPH_OSD_OP_WRUNLOCK: - case CEPH_OSD_OP_RDLOCK: - case CEPH_OSD_OP_RDUNLOCK: - case CEPH_OSD_OP_UPLOCK: - case CEPH_OSD_OP_DNLOCK: - case CEPH_OSD_OP_BALANCEREADS: - case CEPH_OSD_OP_UNBALANCEREADS: + else op_modify(op); - break; - - default: - assert(0); - } } @@ -401,7 +377,6 @@ void ReplicatedPG::do_sub_op(MOSDSubOp *op) osd->logger->inc("subop"); switch (op->op) { - // rep stuff case CEPH_OSD_OP_PULL: sub_op_pull(op); @@ -409,26 +384,13 @@ void ReplicatedPG::do_sub_op(MOSDSubOp *op) case CEPH_OSD_OP_PUSH: sub_op_push(op); break; - - // writes - case CEPH_OSD_OP_WRNOOP: - case CEPH_OSD_OP_WRITE: - case CEPH_OSD_OP_ZERO: - case CEPH_OSD_OP_DELETE: - case CEPH_OSD_OP_TRUNCATE: - case CEPH_OSD_OP_WRLOCK: - case CEPH_OSD_OP_WRUNLOCK: - case CEPH_OSD_OP_RDLOCK: - case CEPH_OSD_OP_RDUNLOCK: - case CEPH_OSD_OP_UPLOCK: - case CEPH_OSD_OP_DNLOCK: - case CEPH_OSD_OP_BALANCEREADS: - case CEPH_OSD_OP_UNBALANCEREADS: - sub_op_modify(op); - break; - + default: - assert(0); + if (ceph_osd_op_is_modify(op->op) || + ceph_osd_op_is_lock(op->op)) + sub_op_modify(op); + else + assert(0); } } @@ -648,7 +610,7 @@ void ReplicatedPG::op_read(MOSDOp *op) object_t oid = op->get_oid(); pobject_t poid(info.pgid.pool(), 0, oid); - dout(10) << "op_read " << MOSDOp::get_opname(op->get_op()) + dout(10) << "op_read " << ceph_osd_op_name(op->get_op()) << " " << oid << " " << op->get_offset() << "~" << op->get_length() << dendl; @@ -829,10 +791,7 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t if (snapset.head_exists && // head exists snapc.snaps.size() && // there are snaps snapc.snaps[0] > snapset.seq && // existing object is old - (op == CEPH_OSD_OP_WRITE || // is a (non-lock) modification - op == CEPH_OSD_OP_ZERO || - op == CEPH_OSD_OP_TRUNCATE || - op == CEPH_OSD_OP_DELETE)) { + ceph_osd_op_is_modify(op)) { // is a (non-lock) modification // clone pobject_t coid = poid; coid.oid.snap = snapc.seq; @@ -936,7 +895,6 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t bufferlist nbl; nbl.claim(bl); // give buffers to store; we keep *op in memory for a long time! t.write(info.pgid.to_coll(), poid, offset, length, nbl); - if (inc_lock) t.setattr(info.pgid.to_coll(), poid, "inc_lock", &inc_lock, sizeof(inc_lock)); snapset.head_exists = true; interval_set<__u64> ch; ch.insert(offset, length); @@ -945,10 +903,21 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t } break; + case CEPH_OSD_OP_WRITEFULL: + { // write full object + assert(bl.length() == length); + bufferlist nbl; + nbl.claim(bl); // give buffers to store; we keep *op in memory for a long time! + t.truncate(info.pgid.to_coll(), poid, 0); + t.write(info.pgid.to_coll(), poid, offset, length, nbl); + snapset.head_exists = true; + snapset.head_overlap.clear(); + } + break; + case CEPH_OSD_OP_ZERO: { // zero t.zero(info.pgid.to_coll(), poid, offset, length); - if (inc_lock) t.setattr(info.pgid.to_coll(), poid, "inc_lock", &inc_lock, sizeof(inc_lock)); interval_set<__u64> ch; ch.insert(offset, length); ch.intersection_of(snapset.head_overlap); @@ -959,7 +928,6 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t case CEPH_OSD_OP_TRUNCATE: { // truncate t.truncate(info.pgid.to_coll(), poid, length); - if (inc_lock) t.setattr(info.pgid.to_coll(), poid, "inc_lock", &inc_lock, sizeof(inc_lock)); interval_set<__u64> keep; keep.insert(0, length); snapset.head_overlap.intersection_of(keep); @@ -979,7 +947,9 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t // object collection, version if (op != CEPH_OSD_OP_DELETE) { - // object version + if (inc_lock && ceph_osd_op_is_modify(op)) + t.setattr(info.pgid.to_coll(), poid, "inc_lock", &inc_lock, sizeof(inc_lock)); + t.setattr(info.pgid.to_coll(), poid, "version", &at_version, sizeof(at_version)); bufferlist snapsetbl; @@ -1332,10 +1302,11 @@ void ReplicatedPG::op_modify(MOSDOp *op) int whoami = osd->get_nodeid(); pobject_t poid(info.pgid.pool(), 0, op->get_oid()); - const char *opname = MOSDOp::get_opname(op->get_op()); + const char *opname = ceph_osd_op_name(op->get_op()); // make sure it looks ok - if (op->get_op() == CEPH_OSD_OP_WRITE && + if ((op->get_op() == CEPH_OSD_OP_WRITE || + op->get_op() == CEPH_OSD_OP_WRITEFULL) && op->get_length() != op->get_data().length()) { dout(0) << "op_modify got bad write, claimed length " << op->get_length() << " != payload length " << op->get_data().length() @@ -1396,7 +1367,7 @@ void ReplicatedPG::op_modify(MOSDOp *op) dout(3) << "op_modify " << opname << " dup op " << op->get_reqid() << ", doing WRNOOP" << dendl; op->set_op(CEPH_OSD_OP_WRNOOP); - opname = MOSDOp::get_opname(op->get_op()); + opname = ceph_osd_op_name(op->get_op()); } @@ -1458,7 +1429,8 @@ void ReplicatedPG::op_modify(MOSDOp *op) } } - if (op->get_op() == CEPH_OSD_OP_WRITE) { + if (op->get_op() == CEPH_OSD_OP_WRITE || + op->get_op() == CEPH_OSD_OP_WRITEFULL) { osd->logger->inc("c_wr"); osd->logger->inc("c_wrb", op->get_length()); } @@ -1499,7 +1471,7 @@ void ReplicatedPG::op_modify(MOSDOp *op) void ReplicatedPG::sub_op_modify(MOSDSubOp *op) { pobject_t poid = op->poid; - const char *opname = MOSDOp::get_opname(op->op); + const char *opname = ceph_osd_op_name(op->op); dout(10) << "sub_op_modify " << opname << " " << poid -- 2.39.5