From: Samuel Just
Date: Fri, 29 Mar 2019 22:36:55 +0000 (-0700)
Subject: osd/: move the backfill space reservation back into PG
X-Git-Tag: v15.1.0~2774^2~54
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=cabe5b52ccc08d9401967baaab244f540b56aba1;p=ceph-ci.git

osd/: move the backfill space reservation back into PG

PeeringState really only needs an interface for requesting it.

Signed-off-by: Samuel Just
---

diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 52661e59300..06d2da98647 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4082,34 +4082,96 @@ void PG::handle_scrub_reserve_release(OpRequestRef op)
   clear_scrub_reserved();
 }
 
+// Compute pending backfill data
+static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
+{
+  lgeneric_dout(cct, 20) << __func__ << " Adjust local usage "
+                         << (local_bytes >> 10) << "KiB"
+                         << " primary usage " << (bf_bytes >> 10)
+                         << "KiB" << dendl;
+
+  return std::max((int64_t)0, bf_bytes - local_bytes);
+}
+
+
 // We can zero the value of primary num_bytes as just an atomic.
 // However, setting above zero reserves space for backfill and requires
 // the OSDService::stat_lock which protects all OSD usage
-void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
-  ceph_assert(osd->stat_lock.is_locked_by_me());
-  primary_num_bytes.store(primary);
-  local_num_bytes.store(local);
-  return;
+bool PG::try_reserve_recovery_space(
+  int64_t primary_bytes, int64_t local_bytes) {
+  // Use tentative_backfill_full() to make sure enough
+  // space is available to handle target bytes from primary.
+
+  // TODO: If we passed num_objects from primary we could account for
+  // an estimate of the metadata overhead.
+
+  // TODO: If we had compressed_allocated and compressed_original from primary
+  // we could compute compression ratio and adjust accordingly.
+
+  // XXX: There is no way to get omap overhead and this would only apply
+  // to whatever possibly different partition that is storing the database.
+
+  // update_osd_stat() from heartbeat will do this on a new
+  // statfs using ps->primary_bytes.
+  uint64_t pending_adjustment = 0;
+  if (primary_bytes) {
+    // For an erasure coded pool, overestimate by a full stripe per object
+    // because we don't know how each object is rounded to the nearest stripe
+    if (pool.info.is_erasure()) {
+      primary_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+      primary_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+        info.stats.stats.sum.num_objects;
+      local_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+      local_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+        info.stats.stats.sum.num_objects;
+    }
+    pending_adjustment = pending_backfill(
+      cct,
+      primary_bytes,
+      local_bytes);
+    dout(10) << __func__ << " primary_bytes " << (primary_bytes >> 10)
+             << "KiB"
+             << " local " << (local_bytes >> 10) << "KiB"
+             << " pending_adjustment " << (pending_adjustment >> 10) << "KiB"
+             << dendl;
+  }
+
+  // This lock protects not only the stats OSDService but also setting the
+  // pg primary_bytes.  That's why we don't immediately unlock
+  Mutex::Locker l(osd->stat_lock);
+  osd_stat_t cur_stat = osd->osd_stat;
+  if (cct->_conf->osd_debug_reject_backfill_probability > 0 &&
+      (rand()%1000 < (cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
+    dout(10) << "backfill reservation rejected: failure injection"
+             << dendl;
+    return false;
+  } else if (!cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
+             osd->tentative_backfill_full(this, pending_adjustment, cur_stat)) {
+    dout(10) << "backfill reservation rejected: backfill full"
+             << dendl;
+    return false;
+  } else {
+    // Don't reserve space if skipped reservation check, this is used
+    // to test the other backfill full check AND in case a corruption
+    // of num_bytes requires ignoring that value and trying the
+    // backfill anyway.
+    if (primary_bytes &&
+        !cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) {
+      primary_num_bytes.store(primary_bytes);
+      local_num_bytes.store(local_bytes);
+    } else {
+      unreserve_recovery_space();
+    }
+    return true;
+  }
 }
 
-void PG::clear_reserved_num_bytes() {
+void PG::unreserve_recovery_space() {
   primary_num_bytes.store(0);
   local_num_bytes.store(0);
   return;
 }
 
-void PG::reject_reservation()
-{
-  clear_reserved_num_bytes();
-  osd->send_message_osd_cluster(
-    primary.osd,
-    new MBackfillReserve(
-      MBackfillReserve::REJECT,
-      spg_t(info.pgid.pgid, primary.shard),
-      get_osdmap_epoch()),
-    get_osdmap_epoch());
-}
-
 void PG::clear_scrub_reserved()
 {
   scrubber.reserved_peers.clear();
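The is_erasure() branch of try_reserve_recovery_space() above divides the logical byte count by the number of data chunks to get one shard's share, then pads by a full stripe chunk per object because each object's rounding to a stripe boundary is unknown. A minimal stand-alone sketch of the same arithmetic, with made-up inputs (k=4 data chunks, a 64 KiB stripe chunk, 1000 objects; illustrative values, not from this commit):

#include <cstdint>
#include <iostream>

// Same adjustment as the is_erasure() branch above, in isolation.
int64_t ec_adjust(int64_t bytes, int64_t data_chunks,
                  int64_t stripe_chunk_size, int64_t num_objects)
{
  bytes /= data_chunks;                      // one shard's share of the bytes
  bytes += stripe_chunk_size * num_objects;  // overestimate: one chunk per object
  return bytes;
}

int main()
{
  int64_t primary_bytes = 10LL << 30;        // primary reports 10 GiB
  int64_t adjusted = ec_adjust(primary_bytes, 4, 64 << 10, 1000);
  std::cout << (adjusted >> 20) << " MiB\n"; // 2560 MiB share + 62.5 MiB pad = 2622 MiB
  return 0;
}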
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 76188e151c5..785a742436a 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -789,8 +789,8 @@ public:
     return primary_num_bytes.load() > 0;
   }
 
-  void set_reserved_num_bytes(int64_t primary, int64_t local);
-  void clear_reserved_num_bytes();
+  bool try_reserve_recovery_space(int64_t primary, int64_t local) override;
+  void unreserve_recovery_space() override;
 
   // If num_bytes are inconsistent and local_num- goes negative
   // it's ok, because it would then be ignored.
@@ -1429,8 +1429,6 @@ protected:
   void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
   void handle_scrub_reserve_release(OpRequestRef op);
 
-  void reject_reservation();
-
   // -- recovery state --
 
   struct QueuePeeringEvt : Context {
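The pair of declarations above carries a small contract: try_reserve_recovery_space() either records the reservation and returns true, or leaves nothing reserved and returns false; unreserve_recovery_space() zeroes the accounting and is safe to call on any exit path. A minimal stand-alone sketch of that contract, assuming a hypothetical capacity field in place of the real tentative_backfill_full() check (only the two method names come from this commit):

#include <atomic>
#include <cstdint>

// Hypothetical holder type; only the two method names below are from the commit.
struct RecoverySpaceHolder {
  std::atomic<int64_t> primary_num_bytes{0};
  std::atomic<int64_t> local_num_bytes{0};
  int64_t capacity;  // stand-in for the free-space state the real check consults

  explicit RecoverySpaceHolder(int64_t cap) : capacity(cap) {}

  // Either reserve and return true, or leave nothing reserved and return false.
  bool try_reserve_recovery_space(int64_t primary, int64_t local) {
    if (primary - local > capacity)  // simplified fullness check
      return false;
    primary_num_bytes.store(primary);
    local_num_bytes.store(local);
    return true;
  }

  // Idempotent: safe on every exit path of the state machine.
  void unreserve_recovery_space() {
    primary_num_bytes.store(0);
    local_num_bytes.store(0);
  }
};

int main()
{
  RecoverySpaceHolder h(1LL << 30);                         // 1 GiB free
  if (h.try_reserve_recovery_space(3LL << 30, 1LL << 30))   // 2 GiB pending: rejected
    h.unreserve_recovery_space();
  return 0;
}

The override specifiers compile because the same two signatures are declared as pure virtuals on the listener interface (see the PeeringState.h hunk below).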
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index 4829edd7143..4d3451f18aa 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -1115,6 +1115,20 @@ bool PeeringState::all_unfound_are_queried_or_lost(
 }
 
+void PeeringState::reject_reservation()
+{
+  pl->unreserve_recovery_space();
+  pl->send_cluster_message(
+    primary.osd,
+    new MBackfillReserve(
+      MBackfillReserve::REJECT,
+      spg_t(info.pgid.pgid, primary.shard),
+      get_osdmap_epoch()),
+    get_osdmap_epoch());
+}
+
+
+
 /*------------ Peering State Machine----------------*/
 #undef dout_prefix
 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
@@ -1860,7 +1874,7 @@ boost::statechart::result
 PeeringState::RepNotRecovering::react(const RejectRemoteReservation &evt)
 {
   DECLARE_LOCALS
-  pg->reject_reservation();
+  ps->reject_reservation();
   post_event(RemoteReservationRejected());
   return discard_event();
 }
@@ -1900,7 +1914,7 @@ PeeringState::RepWaitRecoveryReserved::react(
   const RemoteReservationCanceled &evt)
 {
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
   return transit<RepNotRecovering>();
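The new PeeringState::reject_reservation() above keeps the protocol reply (the REJECT message to the primary) in peering code while routing space accounting through the pl listener, which is the point of the commit. A stand-alone sketch of that split with hypothetical simplified types; only unreserve_recovery_space() and reject_reservation() are names from this commit:

#include <iostream>

struct Listener {
  virtual void unreserve_recovery_space() = 0;
  virtual void send_reject_to_primary() = 0;  // stands in for send_cluster_message()
  virtual ~Listener() = default;
};

struct PrintListener : Listener {
  void unreserve_recovery_space() override { std::cout << "unreserved\n"; }
  void send_reject_to_primary() override   { std::cout << "REJECT sent\n"; }
};

struct PeeringCore {
  Listener *pl;
  void reject_reservation() {
    pl->unreserve_recovery_space();  // accounting lives behind the listener
    pl->send_reject_to_primary();    // protocol reply stays with peering logic
  }
};

int main()
{
  PrintListener l;
  PeeringCore core{&l};
  core.reject_reservation();
  return 0;
}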
@@ -1922,77 +1936,16 @@ PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
   context< PeeringMachine >().log_enter(state_name);
 }
 
-// Compute pending backfill data
-static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
-{
-  lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
-                         << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
-  return std::max((int64_t)0, bf_bytes - local_bytes);
-}
-
 boost::statechart::result
 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
 {
   DECLARE_LOCALS
 
-  // Use tentative_bacfill_full() to make sure enough
-  // space is available to handle target bytes from primary.
-
-  // TODO: If we passed num_objects from primary we could account for
-  // an estimate of the metadata overhead.
-
-  // TODO: If we had compressed_allocated and compressed_original from primary
-  // we could compute compression ratio and adjust accordingly.
-
-  // XXX: There is no way to get omap overhead and this would only apply
-  // to whatever possibly different partition that is storing the database.
-  // update_osd_stat() from heartbeat will do this on a new
-  // statfs using ps->primary_num_bytes.
-  uint64_t pending_adjustment = 0;
-  int64_t primary_num_bytes = evt.primary_num_bytes;
-  int64_t local_num_bytes = evt.local_num_bytes;
-  if (primary_num_bytes) {
-    // For erasure coded pool overestimate by a full stripe per object
-    // because we don't know how each objected rounded to the nearest stripe
-    if (ps->pool.info.is_erasure()) {
-      primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
-      primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * ps->info.stats.stats.sum.num_objects;
-      local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
-      local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * ps->info.stats.stats.sum.num_objects;
-    }
-    pending_adjustment = pending_backfill(
-      context< PeeringMachine >().cct,
-      primary_num_bytes,
-      local_num_bytes);
-    psdout(10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
-               << " local " << (local_num_bytes >> 10) << "KiB"
-               << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
-               << dendl;
-  }
-  // This lock protects not only the stats OSDService but also setting the pg primary_num_bytes
-  // That's why we don't immediately unlock
-  Mutex::Locker l(pg->osd->stat_lock);
-  osd_stat_t cur_stat = pg->osd->osd_stat;
-  if (ps->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
-      (rand()%1000 < (ps->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
-    psdout(10) << "backfill reservation rejected: failure injection"
-               << dendl;
-    post_event(RejectRemoteReservation());
-  } else if (!ps->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
-             pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
-    psdout(10) << "backfill reservation rejected: backfill full"
-               << dendl;
+  if (!pl->try_reserve_recovery_space(
+        evt.primary_num_bytes, evt.local_num_bytes)) {
     post_event(RejectRemoteReservation());
   } else {
-    // Don't reserve space if skipped reservation check, this is used
-    // to test the other backfill full check AND in case a corruption
-    // of num_bytes requires ignoring that value and trying the
-    // backfill anyway.
-    if (primary_num_bytes && !ps->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
-      pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
-    else
-      pg->clear_reserved_num_bytes();
     // Use un-ec-adjusted bytes for stats.
     ps->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
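The block removed above is the logic that moved into PG::try_reserve_recovery_space(). Its core is the clamp in pending_backfill(): only the bytes the primary holds beyond what is already stored locally count against free space, and stale stats can never produce a negative reservation. A stand-alone restatement with made-up sizes:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Same clamp as pending_backfill() above, minus the logging.
static int64_t pending_backfill(int64_t bf_bytes, int64_t local_bytes)
{
  return std::max<int64_t>(0, bf_bytes - local_bytes);
}

int main()
{
  // Replica already holds 6 GiB of an 8 GiB PG: 2 GiB still pending.
  assert(pending_backfill(8LL << 30, 6LL << 30) == (2LL << 30));
  // Local copy larger than the primary's (stale stats): clamped to zero,
  // never a negative reservation.
  assert(pending_backfill(4LL << 30, 6LL << 30) == 0);
  return 0;
}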
@@ -2072,7 +2025,7 @@ PeeringState::RepWaitBackfillReserved::react(
   const RejectRemoteReservation &evt)
 {
   DECLARE_LOCALS
-  pg->reject_reservation();
+  ps->reject_reservation();
   post_event(RemoteReservationRejected());
   return discard_event();
 }
@@ -2082,7 +2035,7 @@ PeeringState::RepWaitBackfillReserved::react(
   const RemoteReservationRejected &evt)
 {
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
   return transit<RepNotRecovering>();
@@ -2093,7 +2046,7 @@ PeeringState::RepWaitBackfillReserved::react(
   const RemoteReservationCanceled &evt)
 {
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
   return transit<RepNotRecovering>();
@@ -2113,7 +2066,7 @@ PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
 {
   DECLARE_LOCALS
 
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->send_cluster_message(
     ps->primary.osd,
     new MRecoveryReserve(
@@ -2130,7 +2083,7 @@ PeeringState::RepRecovering::react(const BackfillTooFull &)
 {
   DECLARE_LOCALS
 
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->send_cluster_message(
     ps->primary.osd,
     new MBackfillReserve(
@@ -2147,7 +2100,7 @@ PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
 {
   DECLARE_LOCALS
 
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->send_cluster_message(
     ps->primary.osd,
     new MBackfillReserve(
@@ -2162,7 +2115,7 @@ void PeeringState::RepRecovering::exit()
 {
   context< PeeringMachine >().log_exit(state_name, enter_time);
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
 
   utime_t dur = ceph_clock_now() - enter_time;
@@ -2894,7 +2847,7 @@ void PeeringState::ReplicaActive::exit()
 {
   context< PeeringMachine >().log_exit(state_name, enter_time);
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
 
   utime_t dur = ceph_clock_now() - enter_time;
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
index b89f1f6780a..18e30ac8fa1 100644
--- a/src/osd/PeeringState.h
+++ b/src/osd/PeeringState.h
@@ -131,6 +131,11 @@ public:
   virtual void on_backfill_canceled() = 0;
   virtual void on_recovery_reserved() = 0;
 
+  // recovery space accounting
+  virtual bool try_reserve_recovery_space(
+    int64_t primary_num_bytes, int64_t local_num_bytes) = 0;
+  virtual void unreserve_recovery_space() = 0;
+
   virtual epoch_t oldest_stored_osdmap() = 0;
   virtual LogChannel &get_clog() = 0;
 
@@ -1249,6 +1254,8 @@ public:
   bool adjust_need_up_thru(const OSDMapRef osdmap);
   PastIntervals::PriorSet build_prior();
 
+  void reject_reservation();
+
 public:
   PeeringState(
     CephContext *cct,
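For reference, the osd_debug_reject_backfill_probability gate retained in PG::try_reserve_recovery_space() (first hunk above) reduces to a plain probability check: a uniform draw in [0,1000) compared against p*1000. A stand-alone sketch; the helper name is hypothetical, and rand() is kept only to mirror the committed code:

#include <cstdlib>
#include <iostream>

// Reject with probability `probability` (0.0 disables injection entirely).
static bool inject_reject(double probability)
{
  return probability > 0 &&
    (rand() % 1000 < probability * 1000.0);
}

int main()
{
  srand(42);
  int rejected = 0;
  for (int i = 0; i < 100000; ++i)
    rejected += inject_reject(0.25);  // expect roughly 25% rejections
  std::cout << rejected << " rejected of 100000\n";
  return 0;
}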