From: Sage Weil Date: Wed, 1 Feb 2017 22:30:32 +0000 (-0500) Subject: osd/PrimaryLogPG: PG-wide backoffs X-Git-Tag: v12.0.1~441^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a3d50f87d8c27162b5bbde89ec92a44ef53f1127;p=ceph.git osd/PrimaryLogPG: PG-wide backoffs Issue at top of do_request. Release on activation or peering interval change. Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index ac27b480d2a9..6af9f43d7173 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -842,6 +842,8 @@ OPTION(osd_command_max_records, OPT_INT, 256) OPTION(osd_max_pg_blocked_by, OPT_U32, 16) // max peer osds to report that are blocking our progress OPTION(osd_op_log_threshold, OPT_INT, 5) // how many op log messages to show in one go OPTION(osd_verify_sparse_read_holes, OPT_BOOL, false) // read fiemap-reported holes and verify they are zeros +OPTION(osd_peering_aggressive_backoff, OPT_BOOL, false) // issue aggressive client backoff during peering +OPTION(osd_debug_crash_on_ignored_backoff, OPT_BOOL, false) // crash osd if client ignores a backoff; useful for debugging OPTION(osd_debug_drop_ping_probability, OPT_DOUBLE, 0) OPTION(osd_debug_drop_ping_duration, OPT_INT, 0) OPTION(osd_debug_drop_op_probability, OPT_DOUBLE, 0) // probability of stalling/dropping a client op diff --git a/src/osd/PG.cc b/src/osd/PG.cc index d8c7ab93582d..e0000c30abd1 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -973,6 +973,8 @@ void PG::clear_primary_state() missing_loc.clear(); + release_pg_backoffs(); + pg_log.reset_recovery_pointers(); scrubber.reserved_peers.clear(); @@ -1836,6 +1838,7 @@ void PG::activate(ObjectStore::Transaction& t, } state_set(PG_STATE_ACTIVATING); + release_pg_backoffs(); } if (is_primary()) { projected_last_update = info.last_update; @@ -5543,6 +5546,8 @@ bool PG::can_discard_request(OpRequestRef& op) switch (op->get_req()->get_type()) { case CEPH_MSG_OSD_OP: return 
can_discard_op(op); + case CEPH_MSG_OSD_BACKOFF: + return false; // never discard case MSG_OSD_SUBOP: return can_discard_replica_op(op); case MSG_OSD_REPOP: @@ -5591,6 +5596,9 @@ bool PG::op_must_wait_for_map(epoch_t cur_epoch, OpRequestRef& op) cur_epoch, static_cast(op->get_req())->get_map_epoch()); + case CEPH_MSG_OSD_BACKOFF: + return false; // we don't care about maps + case MSG_OSD_SUBOP: return !have_same_or_newer_map( cur_epoch, diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 98d7559890f0..727b537c5726 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1627,6 +1627,47 @@ void PrimaryLogPG::do_request( if (can_discard_request(op)) { return; } + + // pg-wide backoffs + Message *m = op->get_req(); + if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) { + SessionRef session((Session *)m->get_connection()->get_priv()); + if (!session) + return; // drop it. + session->put(); // get_priv takes a ref, and so does the SessionRef + + if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) { + Backoff *b = session->have_backoff(info.pgid.pgid.get_hobj_start()); + if (b) { + dout(10) << " have backoff " << *b << " " << *m << dendl; + assert(!b->is_acked() || !g_conf->osd_debug_crash_on_ignored_backoff); + return; + } + + bool backoff = + is_down() || + is_incomplete() || + (!is_active() && is_peered()); + if (g_conf->osd_peering_aggressive_backoff && !backoff) { + if (is_peering()) { + backoff = true; + } + } + if (backoff) { + add_pg_backoff(session); + return; + } + } + // pg backoff acks at pg-level + if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) { + MOSDBackoff *ba = static_cast<MOSDBackoff *>(m); + if (ba->begin != ba->end) { + handle_backoff(op); + return; + } + } + } + if (flushes_in_progress > 0) { dout(20) << flushes_in_progress << " flushes_in_progress pending "