From b1a38263ba2b5d69ed33ed13ad978fc10cb9798c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 13 Feb 2017 16:36:37 -0500 Subject: [PATCH] osd: use MOSDScrubReserve instead of MOSDSubOp for scrub reservations This is the last MOSDSubOp user. Note that while the next step is to move to AsyncReserver internally, this isn't quite yet possible since AsyncReserve "blocks" indefinitely so we wouldn't generate a REJECT. Changing how we schdule scrubs internally will take a bit more work. Signed-off-by: Sage Weil --- src/messages/MOSDScrubReserve.h | 93 +++++++++++++++++++ src/messages/MRecoveryReserve.h | 10 +-- src/msg/Message.cc | 4 + src/msg/Message.h | 2 +- src/osd/OSD.cc | 1 + src/osd/OSD.h | 1 + src/osd/PG.cc | 155 +++++++++++++++++++------------- src/osd/PG.h | 8 +- src/osd/PrimaryLogPG.cc | 38 +++++++- 9 files changed, 240 insertions(+), 72 deletions(-) create mode 100644 src/messages/MOSDScrubReserve.h diff --git a/src/messages/MOSDScrubReserve.h b/src/messages/MOSDScrubReserve.h new file mode 100644 index 00000000000..6eb39967a94 --- /dev/null +++ b/src/messages/MOSDScrubReserve.h @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MOSDSCRUBRESERVE_H +#define CEPH_MOSDSCRUBRESERVE_H + +#include "MOSDFastDispatchOp.h" + +class MOSDScrubReserve : public MOSDFastDispatchOp { + static const int HEAD_VERSION = 1; + static const int COMPAT_VERSION = 1; +public: + spg_t pgid; + epoch_t map_epoch; + enum { + REQUEST = 0, + GRANT = 1, + RELEASE = 2, + REJECT = 3, + }; + int32_t type; + pg_shard_t from; + + epoch_t get_map_epoch() const override { + return map_epoch; + } + spg_t get_spg() const override { + return pgid; + } + + MOSDScrubReserve() + : MOSDFastDispatchOp(MSG_OSD_SCRUB_RESERVE, HEAD_VERSION, COMPAT_VERSION), + map_epoch(0), type(-1) {} + MOSDScrubReserve(spg_t pgid, + epoch_t map_epoch, + int type, + pg_shard_t from) + : MOSDFastDispatchOp(MSG_OSD_SCRUB_RESERVE, HEAD_VERSION, COMPAT_VERSION), + pgid(pgid), map_epoch(map_epoch), + type(type), from(from) {} + + const char *get_type_name() const { + return "MOSDScrubReserve"; + } + + void print(ostream& out) const { + out << "MOSDScrubReserve(" << pgid << " "; + switch (type) { + case REQUEST: + out << "REQUEST "; + break; + case GRANT: + out << "GRANT "; + break; + case REJECT: + out << "REJECT "; + break; + case RELEASE: + out << "RELEASE "; + break; + } + out << "e" << map_epoch << ")"; + return; + } + + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(pgid, p); + ::decode(map_epoch, p); + ::decode(type, p); + ::decode(from, p); + } + + void encode_payload(uint64_t features) { + ::encode(pgid, payload); + ::encode(map_epoch, payload); + ::encode(type, payload); + ::encode(from, payload); + } +}; + +#endif diff --git a/src/messages/MRecoveryReserve.h b/src/messages/MRecoveryReserve.h index 66cce149af6..c0e975004d8 100644 --- a/src/messages/MRecoveryReserve.h +++ b/src/messages/MRecoveryReserve.h @@ -45,19 +45,19 @@ public: } void print(ostream& out) const override { - out << "MRecoveryReserve "; + out << "MRecoveryReserve(" << pgid; switch (type) { case REQUEST: - out << "REQUEST "; + out << " REQUEST"; break; case GRANT: - out << "GRANT "; + out << " GRANT"; break; case RELEASE: - out << "RELEASE "; + out << " RELEASE"; break; } - out << " pgid: " << pgid << ", query_epoch: " << query_epoch; + out << " e" << query_epoch << ")"; return; } diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 39b0575ff65..e19fd017b1c 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -81,6 +81,7 @@ using namespace std; #include "messages/MOSDPGCreate.h" #include "messages/MOSDPGTrim.h" #include "messages/MOSDScrub.h" +#include "messages/MOSDScrubReserve.h" #include "messages/MOSDRepScrub.h" #include "messages/MOSDRepScrubMap.h" #include "messages/MOSDPGScan.h" @@ -511,6 +512,9 @@ Message *decode_message(CephContext *cct, int crcflags, case MSG_OSD_SCRUB: m = new MOSDScrub; break; + case MSG_OSD_SCRUB_RESERVE: + m = new MOSDScrubReserve; + break; case MSG_REMOVE_SNAPS: m = new MRemoveSnaps; break; diff --git a/src/msg/Message.h b/src/msg/Message.h index f5e5acc6866..67c1edce82d 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -88,7 +88,7 @@ #define MSG_REMOVE_SNAPS 90 #define MSG_OSD_SCRUB 91 -//#define MSG_OSD_PG_MISSING 92 // obsolete +#define MSG_OSD_SCRUB_RESERVE 92 // previous PG_MISSING #define MSG_OSD_REP_SCRUB 93 #define MSG_OSD_PG_SCAN 94 diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 4e3afa6e174..a7ce5da7bb1 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -100,6 +100,7 @@ #include "messages/MOSDAlive.h" #include "messages/MOSDScrub.h" +#include "messages/MOSDScrubReserve.h" #include "messages/MOSDRepScrub.h" #include "messages/MMonCommand.h" diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 2f14e049f6d..33f54765e73 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -2336,6 +2336,7 @@ protected: case MSG_OSD_EC_WRITE_REPLY: case MSG_OSD_EC_READ: case MSG_OSD_EC_READ_REPLY: + case MSG_OSD_SCRUB_RESERVE: case MSG_OSD_REP_SCRUB: case MSG_OSD_REP_SCRUBMAP: case MSG_OSD_PG_UPDATE_LOG_MISSING: diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 477e00e9c37..a017e8b0045 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -49,7 +49,7 @@ #include "messages/MOSDPGUpdateLogMissing.h" #include "messages/MOSDPGUpdateLogMissingReply.h" #include "messages/MOSDBackoff.h" - +#include "messages/MOSDScrubReserve.h" #include "messages/MOSDSubOp.h" #include "messages/MOSDRepOp.h" #include "messages/MOSDSubOpReply.h" @@ -3724,67 +3724,75 @@ void PG::_request_scrub_map( replica.osd, repscrubop, get_osdmap()->get_epoch()); } -void PG::sub_op_scrub_reserve(OpRequestRef op) +void PG::handle_scrub_reserve_request(OpRequestRef op) { - const MOSDSubOp *m = static_cast(op->get_req()); - assert(m->get_type() == MSG_OSD_SUBOP); - dout(7) << "sub_op_scrub_reserve" << dendl; - + dout(7) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); if (scrubber.reserved) { - dout(10) << "Ignoring reserve request: Already reserved" << dendl; + dout(10) << __func__ << " ignoring reserve request: Already reserved" + << dendl; return; } - - op->mark_started(); - scrubber.reserved = osd->inc_scrubs_pending(); - - MOSDSubOpReply *reply = new MOSDSubOpReply( - m, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); - ::encode(scrubber.reserved, reply->get_data()); - osd->send_message_osd_cluster(reply, m->get_connection()); + if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) { + const MOSDScrubReserve *m = + static_cast(op->get_req()); + Message *reply = new MOSDScrubReserve( + spg_t(info.pgid.pgid, primary.shard), + m->map_epoch, + scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, + pg_whoami); + osd->send_message_osd_cluster(reply, op->get_req()->get_connection()); + } else { + // for jewel compat only + const MOSDSubOp *req = static_cast(op->get_req()); + assert(req->get_type() == MSG_OSD_SUBOP); + MOSDSubOpReply *reply = new MOSDSubOpReply( + req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); + ::encode(scrubber.reserved, reply->get_data()); + osd->send_message_osd_cluster(reply, op->get_req()->get_connection()); + } } -void PG::sub_op_scrub_reserve_reply(OpRequestRef op) +void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) { - const MOSDSubOpReply *reply = static_cast(op->get_req()); - assert(reply->get_type() == MSG_OSD_SUBOPREPLY); - dout(7) << "sub_op_scrub_reserve_reply" << dendl; - + dout(7) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); if (!scrubber.reserved) { dout(10) << "ignoring obsolete scrub reserve reply" << dendl; return; } + if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) { + dout(10) << " already had osd." << from << " reserved" << dendl; + } else { + dout(10) << " osd." << from << " scrub reserve = success" << dendl; + scrubber.reserved_peers.insert(from); + sched_scrub(); + } +} +void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) +{ + dout(7) << __func__ << " " << *op->get_req() << dendl; op->mark_started(); - - pg_shard_t from = reply->from; - bufferlist::iterator p = const_cast(reply->get_data()).begin(); - bool reserved; - ::decode(reserved, p); - + if (!scrubber.reserved) { + dout(10) << "ignoring obsolete scrub reserve reply" << dendl; + return; + } if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) { dout(10) << " already had osd." << from << " reserved" << dendl; } else { - if (reserved) { - dout(10) << " osd." << from << " scrub reserve = success" << dendl; - scrubber.reserved_peers.insert(from); - } else { - /* One decline stops this pg from being scheduled for scrubbing. */ - dout(10) << " osd." << from << " scrub reserve = fail" << dendl; - scrubber.reserve_failed = true; - } + /* One decline stops this pg from being scheduled for scrubbing. */ + dout(10) << " osd." << from << " scrub reserve = fail" << dendl; + scrubber.reserve_failed = true; sched_scrub(); } } -void PG::sub_op_scrub_unreserve(OpRequestRef op) +void PG::handle_scrub_reserve_release(OpRequestRef op) { - assert(op->get_req()->get_type() == MSG_OSD_SUBOP); - dout(7) << "sub_op_scrub_unreserve" << dendl; - + dout(7) << __func__ << " " << *op->get_req() << dendl; op->mark_started(); - clear_scrub_reserved(); } @@ -3828,17 +3836,27 @@ void PG::scrub_reserve_replicas() ++i) { if (*i == pg_whoami) continue; dout(10) << "scrub requesting reserve from osd." << *i << dendl; - vector scrub(1); - scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE; - hobject_t poid; - eversion_t v; - osd_reqid_t reqid; - MOSDSubOp *subop = new MOSDSubOp( - reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0, - get_osdmap()->get_epoch(), osd->get_tid(), v); - subop->ops = scrub; - osd->send_message_osd_cluster( - i->osd, subop, get_osdmap()->get_epoch()); + if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) { + osd->send_message_osd_cluster( + i->osd, + new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), + get_osdmap()->get_epoch(), + MOSDScrubReserve::REQUEST, pg_whoami), + get_osdmap()->get_epoch()); + } else { + // for jewel compat only + vector scrub(1); + scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE; + hobject_t poid; + eversion_t v; + osd_reqid_t reqid; + MOSDSubOp *subop = new MOSDSubOp( + reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0, + get_osdmap()->get_epoch(), osd->get_tid(), v); + subop->ops = scrub; + osd->send_message_osd_cluster( + i->osd, subop, get_osdmap()->get_epoch()); + } } } @@ -3850,16 +3868,26 @@ void PG::scrub_unreserve_replicas() ++i) { if (*i == pg_whoami) continue; dout(10) << "scrub requesting unreserve from osd." << *i << dendl; - vector scrub(1); - scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE; - hobject_t poid; - eversion_t v; - osd_reqid_t reqid; - MOSDSubOp *subop = new MOSDSubOp( - reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0, - get_osdmap()->get_epoch(), osd->get_tid(), v); - subop->ops = scrub; - osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch()); + if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) { + osd->send_message_osd_cluster( + i->osd, + new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), + get_osdmap()->get_epoch(), + MOSDScrubReserve::RELEASE, pg_whoami), + get_osdmap()->get_epoch()); + } else { + // for jewel compat only + vector scrub(1); + scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE; + hobject_t poid; + eversion_t v; + osd_reqid_t reqid; + MOSDSubOp *subop = new MOSDSubOp( + reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0, + get_osdmap()->get_epoch(), osd->get_tid(), v); + subop->ops = scrub; + osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch()); + } } } @@ -5545,6 +5573,8 @@ bool PG::can_discard_request(OpRequestRef& op) return can_discard_replica_op(op); case MSG_OSD_REP_SCRUB: return can_discard_replica_op(op); + case MSG_OSD_SCRUB_RESERVE: + return can_discard_replica_op(op); case MSG_OSD_REP_SCRUBMAP: return can_discard_replica_op(op); case MSG_OSD_PG_UPDATE_LOG_MISSING: @@ -5651,6 +5681,11 @@ bool PG::op_must_wait_for_map(epoch_t cur_epoch, OpRequestRef& op) cur_epoch, static_cast(op->get_req())->map_epoch); + case MSG_OSD_SCRUB_RESERVE: + return !have_same_or_newer_map( + cur_epoch, + static_cast(op->get_req())->map_epoch); + case MSG_OSD_REP_SCRUBMAP: return !have_same_or_newer_map( cur_epoch, diff --git a/src/osd/PG.h b/src/osd/PG.h index 3aa4776785a..853ef96fa84 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1334,9 +1334,11 @@ public: ThreadPool::TPHandle &handle); void do_replica_scrub_map(OpRequestRef op); void sub_op_scrub_map(OpRequestRef op); - void sub_op_scrub_reserve(OpRequestRef op); - void sub_op_scrub_reserve_reply(OpRequestRef op); - void sub_op_scrub_unreserve(OpRequestRef op); + + void handle_scrub_reserve_request(OpRequestRef op); + void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from); + void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from); + void handle_scrub_reserve_release(OpRequestRef op); void reject_reservation(); void schedule_backfill_full_retry(); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 1481b4104bb..75a5eea675d 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -41,6 +41,7 @@ #include "messages/MOSDPGUpdateLogMissing.h" #include "messages/MOSDPGUpdateLogMissingReply.h" #include "messages/MCommandReply.h" +#include "messages/MOSDScrubReserve.h" #include "mds/inode_backtrace.h" // Ugh #include "common/EventTrace.h" @@ -1685,6 +1686,27 @@ void PrimaryLogPG::do_request( do_backfill_remove(op); break; + case MSG_OSD_SCRUB_RESERVE: + { + const MOSDScrubReserve *m = + static_cast(op->get_req()); + switch (m->type) { + case MOSDScrubReserve::REQUEST: + handle_scrub_reserve_request(op); + break; + case MOSDScrubReserve::GRANT: + handle_scrub_reserve_grant(op, m->from); + break; + case MOSDScrubReserve::REJECT: + handle_scrub_reserve_reject(op, m->from); + break; + case MOSDScrubReserve::RELEASE: + handle_scrub_reserve_release(op); + break; + } + } + break; + case MSG_OSD_REP_SCRUB: replica_scrub(op, handle); break; @@ -3255,10 +3277,10 @@ void PrimaryLogPG::do_sub_op(OpRequestRef op) sub_op_remove(op); return; case CEPH_OSD_OP_SCRUB_RESERVE: - sub_op_scrub_reserve(op); + handle_scrub_reserve_request(op); return; case CEPH_OSD_OP_SCRUB_UNRESERVE: - sub_op_scrub_unreserve(op); + handle_scrub_reserve_release(op); return; case CEPH_OSD_OP_SCRUB_MAP: sub_op_scrub_map(op); @@ -3275,7 +3297,17 @@ void PrimaryLogPG::do_sub_op_reply(OpRequestRef op) const OSDOp& first = r->ops[0]; switch (first.op.op) { case CEPH_OSD_OP_SCRUB_RESERVE: - sub_op_scrub_reserve_reply(op); + { + pg_shard_t from = r->from; + bufferlist::iterator p = const_cast(r->get_data()).begin(); + bool reserved; + ::decode(reserved, p); + if (reserved) { + handle_scrub_reserve_grant(op, from); + } else { + handle_scrub_reserve_reject(op, from); + } + } return; } } -- 2.39.5