From: Ronen Friedman Date: Thu, 12 Nov 2020 08:39:31 +0000 (+0200) Subject: osd: scrub refactoring: declaring the interfaces exposed by a scrubber object X-Git-Tag: v16.1.0~270^2~9 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d492c7b109c36469d62dab36faf715a6fb8b717a;p=ceph.git osd: scrub refactoring: declaring the interfaces exposed by a scrubber object The interfaces exposed by the scrubber, to be used by: 1 - the PG 2 - the scrubbing state-machine (a Scrubber's object) +-------------------+ | | | PG +--------------+ | | | +-------------------+ | v +---ScrubPgIF----+ +----------+----------------+-----+ | | | Scrubber | | | | | +----------+---------------+------+ +---------------+ ^ ScrubMachineListener | +-------------------+ | | | | | Scrubber FSM +------------+ | | +-------------------+ Signed-off-by: Ronen Friedman --- diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 6558158843b0..b6e1474b324e 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -202,6 +202,8 @@ WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard) WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard) std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs); +using HobjToShardSetMapping = std::map<hobject_t, std::set<pg_shard_t>>; + class IsPGRecoverablePredicate { public: /** @@ -1107,6 +1109,24 @@ public: return true; } + template<typename T> + T value_or(key_t key, const T& default_value) const { + auto i = opts.find(key); + if (i == opts.end()) { + return default_value; + } + return boost::get<T>(i->second); + } + + template<typename T> + T value_or(key_t key, T&& default_value) const { + auto i = opts.find(key); + if (i == opts.end()) { + return std::move(default_value); + } + return boost::get<T>(i->second); + } + const value_t& get(key_t key) const; bool unset(key_t key); @@ -6001,6 +6021,8 @@ struct PushOp { WRITE_CLASS_ENCODER_FEATURES(PushOp) std::ostream& operator<<(std::ostream& out, const PushOp &op); +enum class scrub_level_t : bool { shallow = false, deep = true }; +enum class 
scrub_type_t : bool { not_repair = false, do_repair = true }; /* * summarize pg contents for purposes of a scrub @@ -6538,5 +6560,9 @@ public: const ceph::buffer::list& xattr_data) const override; }; +// alias name for this structure: +using missing_map_t = std::map<hobject_t, + std::pair<std::optional<uint32_t>, + std::optional<uint32_t>>>; #endif diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h new file mode 100644 index 000000000000..cfaca4b10f25 --- /dev/null +++ b/src/osd/scrub_machine_lstnr.h @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +/** + * \file the PgScrubber interface used by the scrub FSM + */ +#include "common/version.h" +#include "include/Context.h" + +#include "osd_types.h" + +namespace Scrub { + +/// used when PgScrubber is called by the scrub-machine, to tell the FSM +/// how to continue +enum class FsmNext { do_discard, next_chunk, goto_notactive }; + +/// the interface exposed by the PgScrubber into its internal +/// preemption_data object +struct preemption_t { + + virtual ~preemption_t(){}; + + [[nodiscard]] virtual bool is_preemptable() const = 0; + + [[nodiscard]] virtual bool was_preempted() const = 0; + + virtual void adjust_parameters() = 0; + + /** + * Try to preempt the scrub. + * 'true' (i.e. - preempted) if: + * preemptable && not already preempted + */ + virtual bool do_preempt() = 0; + + /** + * disables preemptions. 
+ * Returns 'true' if we were already preempted + */ + virtual bool disable_and_test() = 0; +}; + +} // namespace Scrub + +struct ScrubMachineListener { + + virtual ~ScrubMachineListener(){}; + + virtual bool select_range() = 0; + + /// walk the log to find the latest update that affects our chunk + virtual eversion_t search_log_for_updates() const = 0; + + virtual eversion_t get_last_update_applied() const = 0; + + virtual void requeue_waiting() const = 0; + + virtual int pending_active_pushes() const = 0; + + virtual int build_primary_map_chunk() = 0; + + virtual int build_replica_map_chunk() = 0; + + virtual void scrub_compare_maps() = 0; + + virtual void on_init() = 0; + + virtual void on_replica_init() = 0; + + virtual void replica_handling_done() = 0; + + /// the version of 'scrub_clear_state()' that does not try to invoke FSM services + /// (thus can be called from FSM reactions) + virtual void clear_pgscrub_state(bool keep_repair_state) = 0; + + virtual void add_delayed_scheduling() = 0; + + /** + * @returns have we asked at least one replica? + * 'false' means we are configured with no replicas, and + * should expect no maps to arrive. 
+ */ + virtual bool get_replicas_maps(bool replica_can_preempt) = 0; + + virtual Scrub::FsmNext on_digest_updates() = 0; + + virtual void send_replica_map(bool was_preempted) = 0; + + virtual void replica_update_start_epoch() = 0; + + [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0; + + virtual void set_subset_last_update(eversion_t e) = 0; + + [[nodiscard]] virtual bool was_epoch_changed() const = 0; + + virtual Scrub::preemption_t* get_preemptor() = 0; + + /** + * a "technical" collection of the steps performed once all + * rep maps are available: + * - the maps are compared + * - the scrub region markers (start_ & end_) are advanced + * - callbacks and ops that were pending are free to run + */ + virtual void maps_compare_n_cleanup() = 0; + + /** + * order the PgScrubber to initiate the process of reserving replicas' scrub + * resources. + */ + virtual void reserve_replicas() = 0; + + virtual void unreserve_replicas() = 0; + + /** + * the FSM interface into the "are we waiting for maps, either our own or from + * replicas" state. 
+ * The FSM can only: + * - mark the local map as available, and + * - query status + */ + virtual void mark_local_map_ready() = 0; + + [[nodiscard]] virtual bool are_all_maps_available() const = 0; + + /// a log/debug interface + virtual std::string dump_awaited_maps() const = 0; +}; diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h new file mode 100644 index 000000000000..d736319af1be --- /dev/null +++ b/src/osd/scrubber_common.h @@ -0,0 +1,257 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include "common/scrub_types.h" +#include "include/types.h" +#include "os/ObjectStore.h" + +#include "OpRequest.h" + +namespace ceph { +class Formatter; +} + +namespace Scrub { + +/// high/low OP priority +enum class scrub_prio_t : bool { low_priority = false, high_priority = true }; + +} // namespace Scrub + + +/** + * Flags affecting the scheduling and behaviour of the *next* scrub. + * + * we hold two of these flag collections: one + * for the next scrub, and one frozen at initiation (i.e. in pg::queue_scrub()) + */ +struct requested_scrub_t { + + // flags to indicate explicitly requested scrubs (by admin): + // bool must_scrub, must_deep_scrub, must_repair, need_auto; + + /** + * 'must_scrub' is set by an admin command (or by need_auto). + * Affects the priority of the scrubbing, and the sleep periods + * during the scrub. + */ + bool must_scrub{false}; + + /** + * scrub must not be aborted. + * Set for explicitly requested scrubs, and for scrubs originated by the pairing + * process with the 'repair' flag set (in the RequestScrub event). + * + * Will be copied into the 'required' scrub flag upon scrub start. + */ + bool req_scrub{false}; + + /** + * Set from: + * - scrub_requested() with need_auto param set, which only happens in + * - scrub_finish() - if deep_scrub_on_error is set, and we have errors + * + * If set, will prevent the OSD from casually postponing our scrub. 
When scrubbing + * starts, will cause must_scrub, must_deep_scrub and auto_repair to be set. + */ + bool need_auto{false}; + + /** + * Set for scrub-after-recovery just before we initiate the recovery deep scrub, + * or if scrub_requested() was called with either need_auto or repair. + * Affects PG_STATE_DEEP_SCRUB. + */ + bool must_deep_scrub{false}; + + /** + * (An intermediary flag used by pg::sched_scrub() on the first time + * a planned scrub has all its resources). Determines whether the next + * repair/scrub will be 'deep'. + * + * Note: 'dumped' by PgScrubber::dump() and such. In reality, being a + * temporary that is set and reset by the same operation, will never + * appear externally to be set + */ + bool time_for_deep{false}; + + bool deep_scrub_on_error{false}; + + /** + * If set, we should see must_deep_scrub and must_repair set, too + * + * - 'must_repair' is checked by the OSD when scheduling the scrubs. + * - also checked & cleared at pg::queue_scrub() + */ + bool must_repair{false}; + + /* + * the value of auto_repair is determined in sched_scrub() (once per scrub. previous + * value is not remembered). Set if + * - allowed by configuration and backend, and + * - must_scrub is not set (i.e. - this is a periodic scrub), + * - time_for_deep was just set + */ + bool auto_repair{false}; + + /** + * indicating that we are scrubbing post repair to verify everything is fixed. + * Otherwise - PG_STATE_FAILED_REPAIR will be asserted. 
+ */ + bool check_repair{false}; +}; + +ostream& operator<<(ostream& out, const requested_scrub_t& sf); + +/** + * The interface used by the PG when requesting scrub-related info or services + */ +struct ScrubPgIF { + + virtual ~ScrubPgIF(){}; + + friend ostream& operator<<(ostream& out, const ScrubPgIF& s) { return s.show(out); } + + virtual ostream& show(ostream& out) const = 0; + + // --------------- triggering state-machine events: + + virtual void send_start_scrub() = 0; + + virtual void send_start_after_repair() = 0; + + virtual void send_scrub_resched() = 0; + + virtual void replica_scrub_resched(epoch_t epoch_queued) = 0; + + virtual void active_pushes_notification() = 0; + + virtual void update_applied_notification(epoch_t epoch_queued) = 0; + + virtual void digest_update_notification() = 0; + + virtual void send_scrub_unblock() = 0; + + virtual void send_replica_maps_ready() = 0; + + virtual void send_replica_pushes_upd() = 0; + + // -------------------------------------------------- + + virtual void reset_epoch(epoch_t epoch_queued) = 0; + + [[nodiscard]] virtual bool are_callbacks_pending() + const = 0; // currently only used for an assert + + /** + * the scrubber is marked 'active': + * - for the primary: when all replica OSDs grant us the requested resources + * - for replicas: upon receiving the scrub request from the primary + */ + [[nodiscard]] virtual bool is_scrub_active() const = 0; + + /// are we waiting for resource reservation grants from our replicas? 
+ [[nodiscard]] virtual bool is_reserving() const = 0; + + /// handle a message carrying a replica map + virtual void map_from_replica(OpRequestRef op) = 0; + + virtual void replica_scrub_op(OpRequestRef op) = 0; + + virtual void replica_scrub(epoch_t epoch_queued) = 0; + + virtual void set_op_parameters(requested_scrub_t&) = 0; + + virtual void scrub_clear_state(bool keep_repair_state = false) = 0; + + virtual void handle_query_state(ceph::Formatter* f) = 0; + + virtual void dump(ceph::Formatter* f) const = 0; + + /** + * we allow some number of preemptions of the scrub, which means we do + * not block. Then we start to block. Once we start blocking, we do + * not stop until the scrub range is completed. + */ + virtual bool write_blocked_by_scrub(const hobject_t& soid) = 0; + + /// true if the given range intersects the scrub interval in any way + virtual bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) = 0; + + /// the op priority, taken from the primary's request message + virtual Scrub::scrub_prio_t replica_op_priority() const = 0; + + /// the priority of the on-going scrub (used when requeuing events) + virtual unsigned int scrub_requeue_priority( + Scrub::scrub_prio_t with_priority) const = 0; + virtual unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const = 0; + + virtual void add_callback(Context* context) = 0; + + /// should we requeue blocked ops? 
+ [[nodiscard]] virtual bool should_requeue_blocked_ops( + eversion_t last_recovery_applied) const = 0; + + /// add to scrub statistics, but only if the soid is below the scrub start + virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) = 0; + + /** + * the version of 'scrub_clear_state()' that does not try to invoke FSM services + * (thus can be called from FSM reactions) + */ + virtual void clear_pgscrub_state(bool keep_repair_state) = 0; + + /** + * triggers the 'RemotesReserved' (all replicas granted scrub resources) + * state-machine event + */ + virtual void send_remotes_reserved() = 0; + + /** + * triggers the 'ReservationFailure' (at least one replica denied us the requested + * resources) state-machine event + */ + virtual void send_reservation_failure() = 0; + + virtual void cleanup_store(ObjectStore::Transaction* t) = 0; + + virtual bool get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const = 0; + + // --------------- reservations ----------------------------------- + + /** + * message all replicas with a request to "unreserve" scrub + */ + virtual void unreserve_replicas() = 0; + + /** + * clear both local and OSD-managed resource reservation flags + * (note: no replica un/reservation messages are involved!) + */ + virtual void clear_scrub_reservations() = 0; + + /** + * Reserve local scrub resources (managed by the OSD) + * + * Fails if OSD's local-scrubs budget was exhausted + * \returns were local resources reserved? 
+ */ + virtual bool reserve_local() = 0; + + // on the replica: + virtual void handle_scrub_reserve_request(OpRequestRef op) = 0; + virtual void handle_scrub_reserve_release(OpRequestRef op) = 0; + + // and on the primary: + virtual void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) = 0; + virtual void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) = 0; + + virtual void reg_next_scrub(const requested_scrub_t& request_flags) = 0; + virtual void unreg_next_scrub() = 0; + virtual void scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) = 0; +};