From c9746ff62e79e0bcd4251efa0d6875558d213ed7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 24 Sep 2019 14:31:31 -0500 Subject: [PATCH] osd/PrimaryLogPG: include op_returns in pg log [dup] records, reply If the op has the RETURNVEC flag set, - Record the per-op return value and overall return code in the pg log and dup records. - Include the same in the actual MOSDOpReply back to the client. Signed-off-by: Sage Weil --- src/messages/MOSDOpReply.h | 11 ++---- src/osd/PGLog.cc | 1 + src/osd/PrimaryLogPG.cc | 75 +++++++++++++++++++++++--------------- src/osd/PrimaryLogPG.h | 5 ++- 4 files changed, 53 insertions(+), 39 deletions(-) diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h index c5ada193dcb..2428c7ac839 100644 --- a/src/messages/MOSDOpReply.h +++ b/src/messages/MOSDOpReply.h @@ -145,13 +145,10 @@ public: retry_attempt = req->get_retry_attempt(); do_redirect = false; - // zero out ops payload_len and possibly out data - for (unsigned i = 0; i < ops.size(); i++) { - if (ignore_out_data && - (ceph_osd_op_mode_modify(ops[i].op.op) || - ceph_osd_op_mode_cache(ops[i].op.op))) { - // WIP: we will soon support some limited payload here - ceph_assert(ops[i].outdata.length() == 0); + // zero out data? + if (ignore_out_data) { + for (unsigned i = 0; i < ops.size(); i++) { + ops[i].outdata.clear(); } } } diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index 0189390c1f5..377c9c8a726 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -85,6 +85,7 @@ void PGLog::IndexedLog::trim( auto it = e.extra_reqid_return_codes.find(idx); if (it != e.extra_reqid_return_codes.end()) { return_code = it->second; + // FIXME: we aren't setting op_returns for these extra_reqids } } ++idx; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index ff7b1c0dc02..082adc52159 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -2087,13 +2087,14 @@ void PrimaryLogPG::do_op(OpRequestRef& op) if (r) { dout(20) << __func__ << " returned an error: " << r << dendl; - close_op_ctx(ctx); if (op->may_write() && get_osdmap()->require_osd_release >= ceph_release_t::kraken) { - record_write_error(op, oid, nullptr, r); + record_write_error(op, oid, nullptr, r, + ctx->op->allows_returnvec() ? ctx : nullptr); } else { osd->reply_op_error(op, r); } + close_op_ctx(ctx); return; } @@ -2456,7 +2457,8 @@ void PrimaryLogPG::finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r, } void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid, - MOSDOpReply *orig_reply, int r) + MOSDOpReply *orig_reply, int r, + OpContext *ctx_for_op_returns) { dout(20) << __func__ << " r=" << r << dendl; ceph_assert(op->may_write()); @@ -2465,6 +2467,10 @@ void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid, entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid, get_next_version(), eversion_t(), 0, reqid, utime_t(), r)); + if (ctx_for_op_returns) { + entries.back().set_op_returns(*ctx_for_op_returns->ops); + dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl; + } struct OnComplete { PrimaryLogPG *pg; @@ -3720,27 +3726,25 @@ void PrimaryLogPG::execute_ctx(OpContext *ctx) return; } - bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0; - // prepare the reply - ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, - successful_write); - dout(20) << __func__ << " alloc reply " << ctx->reply << dendl; - - // Write operations aren't allowed to return a data payload because - // we can't do so reliably. If the client has to resend the request - // and it has already been applied, we will return 0 with no - // payload. Non-deterministic behavior is no good. However, it is - // possible to construct an operation that does a read, does a guard - // check (e.g., CMPXATTR), and then a write. Then we either succeed - // with the write, or return a CMPXATTR and the read value. - if (successful_write) { - // WIP: we will soon support a >=0 result code here. - if (result) { - derr << " non-zero write result code " << result << dendl; - ceph_assert(result == 0); + bool ignore_out_data = false; + if (!ctx->op_t->empty() && + op->may_write() && + result >= 0) { + // successful update + if (ctx->op->allows_returnvec()) { + // enforce reasonable bound on the return buffer sizes +#warning write me + } else { + // legacy behavior -- zero result and return data etc. + ignore_out_data = true; + result = 0; } } - ctx->reply->set_result(result); + + // prepare the reply + ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0, + ignore_out_data); + dout(20) << __func__ << " alloc reply " << ctx->reply << dendl; // read or error? if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) { @@ -3788,7 +3792,6 @@ void PrimaryLogPG::execute_ctx(OpContext *ctx) MOSDOpReply *reply = ctx->reply; ctx->reply = nullptr; reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0); - close_op_ctx(ctx); if (result == -ENOENT) { reply->set_enoent_reply_versions(info.last_update, @@ -3796,7 +3799,9 @@ void PrimaryLogPG::execute_ctx(OpContext *ctx) } reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); // append to pg log for dup detection - don't save buffers for now - record_write_error(op, soid, reply, result); + record_write_error(op, soid, reply, result, + ctx->op->allows_returnvec() ? ctx : nullptr); + close_op_ctx(ctx); return; } @@ -8294,12 +8299,13 @@ int PrimaryLogPG::prepare_transaction(OpContext *ctx) finish_ctx(ctx, ctx->new_obs.exists ? pg_log_entry_t::MODIFY : - pg_log_entry_t::DELETE); + pg_log_entry_t::DELETE, + result); return result; } -void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type) +void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result) { const hobject_t& soid = ctx->obs->oi.soid; dout(20) << __func__ << " " << soid << " " << ctx @@ -8357,10 +8363,19 @@ void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type) } // append to log - ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version, - ctx->obs->oi.version, - ctx->user_at_version, ctx->reqid, - ctx->mtime, 0)); + ctx->log.push_back( + pg_log_entry_t(log_op_type, soid, ctx->at_version, + ctx->obs->oi.version, + ctx->user_at_version, ctx->reqid, + ctx->mtime, + (ctx->op && ctx->op->allows_returnvec()) ? result : 0)); + if (ctx->op && ctx->op->allows_returnvec()) { + // also the per-op values + ctx->log.back().set_op_returns(*ctx->ops); + dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns + << dendl; + } + ctx->log.back().clean_regions = ctx->clean_regions; dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl; diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index b0e111d9378..36ff68e0b3a 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -1174,7 +1174,7 @@ protected: const hobject_t& head, const hobject_t& coid, object_info_t *poi); void execute_ctx(OpContext *ctx); - void finish_ctx(OpContext *ctx, int log_op_type); + void finish_ctx(OpContext *ctx, int log_op_type, int result=0); void reply_ctx(OpContext *ctx, int err); void reply_ctx(OpContext *ctx, int err, eversion_t v, version_t uv); void make_writeable(OpContext *ctx); @@ -1502,7 +1502,8 @@ public: ThreadPool::TPHandle &handle) override; void do_op(OpRequestRef& op); void record_write_error(OpRequestRef op, const hobject_t &soid, - MOSDOpReply *orig_reply, int r); + MOSDOpReply *orig_reply, int r, + OpContext *ctx_for_op_returns=nullptr); void do_pg_op(OpRequestRef op); void do_scan( OpRequestRef op, -- 2.39.5