From: Joao Eduardo Luis Date: Mon, 20 Apr 2015 13:39:29 +0000 (+0100) Subject: mon: Monitor: rework scrub (2) X-Git-Tag: v9.0.3~18^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0a19fd4a2686d07cce632d20d91cf419e41dada6;p=ceph.git mon: Monitor: rework scrub (2) Instead of hanging while scrubbing all the keys in the store, scrub just a few keys each time and keep scrubbing until we have gone through all the keys in the store. This may miss a few keys if proposals are committed in-between scrub rounds, but we'll get them eventually (if they are not trimmed first). Fixes: #11773 Signed-off-by: Joao Eduardo Luis --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index f91cf472465b..ee08e0ea78b6 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -244,7 +244,7 @@ OPTION(mon_data_avail_crit, OPT_INT, 5) OPTION(mon_data_avail_warn, OPT_INT, 30) OPTION(mon_data_size_warn, OPT_U64, 15*1024*1024*1024) // issue a warning when the monitor's data store goes over 15GB (in bytes) OPTION(mon_scrub_interval, OPT_INT, 3600*24) // once a day -OPTION(mon_scrub_max_keys, OPT_INT, -1) // max number of keys to scrub each time +OPTION(mon_scrub_max_keys, OPT_INT, 100) // max number of keys to scrub each time OPTION(mon_config_key_max_entry_size, OPT_INT, 4096) // max num bytes per config-key entry OPTION(mon_sync_timeout, OPT_DOUBLE, 60.0) OPTION(mon_sync_max_payload_size, OPT_U32, 1048576) // max size for a sync chunk payload (say, 1MB) diff --git a/src/messages/MMonScrub.h b/src/messages/MMonScrub.h index b16728bcdd55..9bfa61a70f86 100644 --- a/src/messages/MMonScrub.h +++ b/src/messages/MMonScrub.h @@ -18,7 +18,7 @@ class MMonScrub : public Message { - static const int HEAD_VERSION = 1; + static const int HEAD_VERSION = 2; static const int COMPAT_VERSION = 1; public: @@ -38,14 +38,17 @@ public: op_type_t op; version_t version; ScrubResult result; + int32_t num_keys; + pair key; MMonScrub() - : 
Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION) + : Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION), + num_keys(-1) { } - MMonScrub(op_type_t op, version_t v) + MMonScrub(op_type_t op, version_t v, int32_t num_keys) : Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION), - op(op), version(v) + op(op), version(v), num_keys(num_keys) { } const char *get_type_name() const { return "mon_scrub"; } @@ -55,6 +58,8 @@ public: out << " v " << version; if (op == OP_RESULT) out << " " << result; + out << " num_keys " << num_keys; + out << " key (" << key << ")"; out << ")"; } @@ -63,6 +68,8 @@ public: ::encode(o, payload); ::encode(version, payload); ::encode(result, payload); + ::encode(num_keys, payload); + ::encode(key, payload); } void decode_payload() { @@ -72,6 +79,10 @@ public: op = (op_type_t)o; ::decode(version, p); ::decode(result, p); + if (header.version >= 2) { + ::decode(num_keys, p); + ::decode(key, p); + } } }; diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 1284b385930b..862d92b3c896 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2711,7 +2711,7 @@ void Monitor::handle_command(MMonCommand *m) if (prefix == "scrub") { wait_for_paxos_write(); if (is_leader()) { - int r = scrub(); + int r = scrub_start(); reply_command(m, r, "", rdata, 0); } else if (is_peon()) { forward_request_leader(m); @@ -4194,7 +4194,7 @@ void Monitor::handle_mon_get_map(MMonGetMap *m) // ---------------------------------------------- // scrub -int Monitor::scrub() +int Monitor::scrub_start() { dout(10) << __func__ << dendl; assert(is_leader()); @@ -4211,28 +4211,43 @@ int Monitor::scrub() scrub_result.clear(); scrub_version = paxos->get_version(); + scrub_state.reset(new ScrubState); + + scrub(); + return 0; +} + +int Monitor::scrub() +{ + assert(is_leader()); + assert(scrub_state); + + // scrub all keys if we're the only monitor in the quorum + int32_t num_keys = + (quorum.size() == 1 ? 
-1 : cct->_conf->mon_scrub_max_keys); for (set::iterator p = quorum.begin(); p != quorum.end(); ++p) { if (*p == rank) continue; - MMonScrub *r = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version); + MMonScrub *r = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version, + num_keys); + r->key = scrub_state->last_key; messenger->send_message(r, monmap->get_inst(*p)); } - scrub_state.reset(new ScrubState); - // scrub my keys - pair start; - bool r = _scrub(&scrub_result[rank], start, cct->_conf->mon_scrub_max_keys); - assert(!r); + bool r = _scrub(&scrub_result[rank], + &scrub_state->last_key, + &num_keys); - scrub_state.reset(); + scrub_state->finished = !r; - if (scrub_result.size() == quorum.size()) + if (quorum.size() == 1) { + assert(scrub_state->finished == true); scrub_finish(); - + } return 0; } @@ -4246,9 +4261,13 @@ void Monitor::handle_scrub(MMonScrub *m) break; if (m->version != paxos->get_version()) break; - MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT, m->version); - pair start; - _scrub(&reply->result, start, cct->_conf->mon_scrub_max_keys); + + MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT, + m->version, + m->num_keys); + + reply->key = m->key; + _scrub(&reply->result, &reply->key, &reply->num_keys); m->get_connection()->send_message(reply); } break; @@ -4263,8 +4282,14 @@ void Monitor::handle_scrub(MMonScrub *m) assert(scrub_result.count(from) == 0); scrub_result[from] = m->result; - if (scrub_result.size() == quorum.size()) - scrub_finish(); + if (scrub_result.size() == quorum.size()) { + scrub_check_results(); + scrub_result.clear(); + if (scrub_state->finished) + scrub_finish(); + else + scrub(); + } } break; } @@ -4272,40 +4297,58 @@ void Monitor::handle_scrub(MMonScrub *m) } bool Monitor::_scrub(ScrubResult *r, - pair &start, - int num_keys) + pair *start, + int *num_keys) { + assert(r != NULL); + assert(start != NULL); + assert(num_keys != NULL); + set prefixes = get_sync_targets_names(); prefixes.erase("paxos"); // exclude paxos, as 
this one may have extra states for proposals, etc. - dout(10) << __func__ << " prefixes " << prefixes << dendl; + dout(10) << __func__ << " start (" << *start << ")" + << " num_keys " << *num_keys << dendl; - MonitorDBStore::Synchronizer it = store->get_synchronizer(start, prefixes); + MonitorDBStore::Synchronizer it = store->get_synchronizer(*start, prefixes); int scrubbed_keys = 0; + pair last_key; while (it->has_next_chunk()) { - if (num_keys > 0 && scrubbed_keys == num_keys) + if (*num_keys > 0 && scrubbed_keys == *num_keys) break; pair k = it->get_next_key(); + if (prefixes.count(k.first) == 0) + continue; + bufferlist bl; store->get(k.first, k.second, bl); - dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes crc " << bl.crc32c(0) << dendl; + uint32_t key_crc = bl.crc32c(0); + dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes" + << " crc " << key_crc << dendl; r->prefix_keys[k.first]++; if (r->prefix_crc.count(k.first) == 0) r->prefix_crc[k.first] = 0; r->prefix_crc[k.first] = bl.crc32c(r->prefix_crc[k.first]); + + ++scrubbed_keys; + last_key = k; } - if (scrub_state) // leader - scrub_state->last_key = it->get_last_key(); + dout(20) << __func__ << " last_key (" << last_key << ")" + << " scrubbed_keys " << scrubbed_keys + << " has_next " << it->has_next_chunk() << dendl; + + *start = last_key; + *num_keys = scrubbed_keys; return it->has_next_chunk(); } -void Monitor::scrub_finish() +void Monitor::scrub_check_results() { dout(10) << __func__ << dendl; @@ -4326,7 +4369,11 @@ void Monitor::scrub_finish() } if (!errors) clog->info() << "scrub ok on " << quorum << ": " << mine << "\n"; +} +void Monitor::scrub_finish() +{ + dout(10) << __func__ << dendl; scrub_reset(); scrub_event_start(); } diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index 18317990e57a..eff10467087b 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -242,11 +242,13 @@ private: * * Verify all mons are storing identical content */ + int 
scrub_start(); int scrub(); void handle_scrub(MMonScrub *m); bool _scrub(ScrubResult *r, - pair &start, - int num_keys); + pair *start, + int *num_keys); + void scrub_check_results(); void scrub_finish(); void scrub_reset(); @@ -254,7 +256,7 @@ private: Monitor *mon; C_Scrub(Monitor *m) : mon(m) { } void finish(int r) { - mon->scrub(); + mon->scrub_start(); } }; Context *scrub_event; ///< periodic event to trigger scrub (leader) @@ -263,8 +265,9 @@ private: struct ScrubState { pair last_key; ///< last scrubbed key + bool finished; - ScrubState() { } + ScrubState() : finished(false) { } virtual ~ScrubState() { } }; ceph::shared_ptr scrub_state; ///< keeps track of current scrub