From: Matan Breizman Date: Wed, 12 Jul 2023 12:33:58 +0000 (+0000) Subject: osd/osd_types: Introduce OSDSuperblock::maps X-Git-Tag: v19.0.0~199^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9215996586ba79302c07e5cf586ec2add1ac159e;p=ceph.git osd/osd_types: Introduce OSDSuperblock::maps replace oldest/newest_map members with an interval_set in order to support the tracking of an osdmap range gap. Map gap example: ``` 2023-08-28T18:21:05.452+0000 7f409bce2640 3 osd.4 84 handle_osd_map epochs [85,86], i have 84, src has [1,86] 2023-08-28T18:26:25.829+0000 7fcfea0c1640 3 osd.4 86 handle_osd_map epochs [208,208], i have 86, src has [208,400] 2023-08-28T18:26:25.829+0000 7fcfea0c1640 10 osd.4 86 superblock cluster_osdmap_trim_lower_bound new epoch is: 208 2023-08-28T18:26:25.829+0000 7fcfea0c1640 10 osd.4 86 handle_osd_map osd map gap [16~71,208~1] 2023-08-28T18:26:25.829+0000 7fcfea0c1640 3 osd.4 86 handle_osd_map epochs [209,248], i have 208, src has [208,400] 2023-08-28T18:26:25.833+0000 7fcfea0c1640 10 osd.4 86 handle_osd_map osd map gap [31~56,208~4] 2023-08-28T18:26:25.941+0000 7fcfea0c1640 3 osd.4 211 handle_osd_map epochs [212,248], i have 211, src has [208,400] 2023-08-28T18:26:25.945+0000 7fcfea0c1640 10 osd.4 211 handle_osd_map osd map gap [46~41,208~41] 2023-08-28T18:26:25.949+0000 7fcfea0c1640 3 osd.4 211 handle_osd_map epochs [209,248], i have 248, src has [208,400] 2023-08-28T18:26:25.949+0000 7fcfea0c1640 3 osd.4 211 handle_osd_map epochs [212,251], i have 248, src has [208,400] 2023-08-28T18:26:25.953+0000 7fcfea0c1640 10 osd.4 211 handle_osd_map osd map gap [61~26,208~44] 2023-08-28T18:26:26.073+0000 7fcfea0c1640 3 osd.4 251 handle_osd_map epochs [249,288], i have 251, src has [208,400] 2023-08-28T18:26:26.081+0000 7fcfea0c1640 10 osd.4 251 handle_osd_map osd map gap [76~11,208~48] 2023-08-28T18:26:26.081+0000 7fcfea0c1640 3 osd.4 251 handle_osd_map epochs [252,291], i have 255, src has [208,400] ``` Full example: 
https://gist.github.com/Matan-B/9b0eed8daee3bd6c3216bd3b6d11e8fb Fixes: https://tracker.ceph.com/issues/61962 Signed-off-by: Matan Breizman --- diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index cfe4f54ab2e5..ccb7435332b3 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -708,19 +708,17 @@ void OSD::dump_status(Formatter* f) const f->dump_stream("osd_fsid") << superblock.osd_fsid; f->dump_unsigned("whoami", superblock.whoami); f->dump_string("state", pg_shard_manager.get_osd_state_string()); - f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_stream("maps") << superblock.maps; f->dump_unsigned("cluster_osdmap_trim_lower_bound", superblock.cluster_osdmap_trim_lower_bound); - f->dump_unsigned("newest_map", superblock.newest_map); f->dump_unsigned("num_pgs", pg_shard_manager.get_num_pgs()); } void OSD::print(std::ostream& out) const { out << "{osd." << superblock.whoami << " " - << superblock.osd_fsid << " [" << superblock.oldest_map - << "," << superblock.newest_map << "] " - << "tlb:" << superblock.cluster_osdmap_trim_lower_bound + << superblock.osd_fsid << " maps " << superblock.maps + << " tlb:" << superblock.cluster_osdmap_trim_lower_bound << " pgs:" << pg_shard_manager.get_num_pgs() << "}"; } @@ -934,16 +932,16 @@ seastar::future<> OSD::_handle_osd_map(Ref<MOSDMap> m) const auto first = m->get_first(); const auto last = m->get_last(); logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]", - first, last, superblock.newest_map, + first, last, superblock.get_newest_map(), m->cluster_osdmap_trim_lower_bound, m->newest_map); // make sure there is something new, here, before we bother flushing // the queues and such - if (last <= superblock.newest_map) { + if (last <= superblock.get_newest_map()) { return seastar::now(); } // missing some? 
bool skip_maps = false; - epoch_t start = superblock.newest_map + 1; + epoch_t start = superblock.get_newest_map() + 1; if (first > start) { logger().info("handle_osd_map message skips epochs {}..{}", start, first - 1); @@ -967,10 +965,13 @@ seastar::future<> OSD::_handle_osd_map(Ref<MOSDMap> m) return pg_shard_manager.store_maps(t, start, m).then([=, this, &t] { // even if this map isn't from a mon, we may have satisfied our subscription monc->sub_got("osdmap", last); - if (!superblock.oldest_map || skip_maps) { - superblock.oldest_map = first; + + if (!superblock.maps.empty()) { + // TODO: support osdmap trimming + // See: <tracker.ceph.com/issues/61962> } - superblock.newest_map = last; + + superblock.insert_osdmap_epochs(first, last); superblock.current_epoch = last; // note in the superblock that we were clean thru the prior epoch diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index a6431305d806..404f28d7d7f3 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -711,16 +711,16 @@ seastar::future<> OSDSingletonState::send_incremental_map( { logger().info("{}: first osdmap: {} " "superblock's oldest map: {}", - __func__, first, superblock.oldest_map); - if (first >= superblock.oldest_map) { + __func__, first, superblock.get_oldest_map()); + if (first >= superblock.get_oldest_map()) { return load_map_bls( - first, superblock.newest_map + first, superblock.get_newest_map() ).then([this, &conn, first](auto&& bls) { auto m = crimson::make_message<MOSDMap>( monc.get_fsid(), osdmap->get_encoding_features()); m->cluster_osdmap_trim_lower_bound = first; - m->newest_map = superblock.newest_map; + m->newest_map = superblock.get_newest_map(); m->maps = std::move(bls); return conn.send(std::move(m)); }); @@ -736,8 +736,8 @@ seastar::future<> OSDSingletonState::send_incremental_map( * See: OSD::handle_osd_map for how classic updates the * cluster's trim lower bound. 
*/ - m->cluster_osdmap_trim_lower_bound = superblock.oldest_map; - m->newest_map = superblock.newest_map; + m->cluster_osdmap_trim_lower_bound = superblock.get_oldest_map(); + m->newest_map = superblock.get_newest_map(); m->maps.emplace(osdmap->get_epoch(), std::move(bl)); return conn.send(std::move(m)); }); diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 4e3f862b77eb..116e93680c6f 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3656,7 +3656,7 @@ bool OSDMonitor::prepare_boot(MonOpRequestRef op) } // fresh osd? - if (m->sb.newest_map == 0 && osdmap.exists(from)) { + if (m->sb.get_newest_map() == 0 && osdmap.exists(from)) { const osd_info_t& i = osdmap.get_info(from); if (i.up_from > i.lost_at) { dout(10) << " fresh osd; marking lost_at too" << dendl; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 83c49a08a48d..93f5ca238fab 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1380,7 +1380,7 @@ MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to, MOSDMap *m = new MOSDMap(monc->get_fsid(), osdmap->get_encoding_features()); m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound; - m->newest_map = sblock.newest_map; + m->newest_map = sblock.get_newest_map(); int max = cct->_conf->osd_map_message_max; ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes; @@ -1459,12 +1459,12 @@ void OSDService::send_incremental_map(epoch_t since, Connection *con, MOSDMap *m = NULL; while (!m) { OSDSuperblock sblock(get_superblock()); - if (since < sblock.oldest_map) { + if (since < sblock.get_oldest_map()) { // just send latest full map MOSDMap *m = new MOSDMap(monc->get_fsid(), osdmap->get_encoding_features()); m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound; - m->newest_map = sblock.newest_map; + m->newest_map = sblock.get_newest_map(); get_map_bl(to, m->maps[to]); send_map(m, con); return; @@ -1650,7 +1650,7 @@ void OSDService::handle_misdirected_op(PG *pg, OpRequestRef 
op) * splitting. The simplest thing is to detect such cases here and drop * them without an error (the client will resend anyway). */ - ceph_assert(m->get_map_epoch() <= superblock.newest_map); + ceph_assert(m->get_map_epoch() <= superblock.get_newest_map()); OSDMapRef opmap = try_get_map(m->get_map_epoch()); if (!opmap) { dout(7) << __func__ << ": " << *pg << " no longer have map for " @@ -2705,10 +2705,9 @@ void OSD::asok_command( f->dump_stream("osd_fsid") << superblock.osd_fsid; f->dump_unsigned("whoami", superblock.whoami); f->dump_string("state", get_state_name(get_state())); - f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_stream("maps") << superblock.maps; f->dump_unsigned("cluster_osdmap_trim_lower_bound", superblock.cluster_osdmap_trim_lower_bound); - f->dump_unsigned("newest_map", superblock.newest_map); f->dump_unsigned("num_pgs", num_pgs); f->close_section(); } else if (prefix == "flush_journal") { @@ -3763,7 +3762,7 @@ int OSD::init() dout(5) << "Upgrading superblock adding: " << diff << dendl; if (!superblock.cluster_osdmap_trim_lower_bound) { - superblock.cluster_osdmap_trim_lower_bound = superblock.oldest_map; + superblock.cluster_osdmap_trim_lower_bound = superblock.get_oldest_map(); } ObjectStore::Transaction t; @@ -6277,7 +6276,7 @@ void OSD::tick_without_osd_lock() if (max_waiting_epoch > get_osdmap()->get_epoch()) { dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch << ", requesting new map" << dendl; - osdmap_subscribe(superblock.newest_map + 1, false); + osdmap_subscribe(superblock.get_newest_map() + 1, false); } } @@ -6638,8 +6637,7 @@ void OSD::start_boot() } dout(1) << __func__ << dendl; set_state(STATE_PREBOOT); - dout(10) << "start_boot - have maps " << superblock.oldest_map - << ".." 
<< superblock.newest_map << dendl; + dout(10) << "start_boot - have maps " << superblock.maps << dendl; monc->get_version("osdmap", CB_OSD_GetVersion(this)); } @@ -7952,20 +7950,20 @@ void OSD::trim_maps(epoch_t oldest, bool skip_maps) */ epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound()); dout(20) << __func__ << ": min=" << min << " oldest_map=" - << superblock.oldest_map << " skip_maps=" << skip_maps + << superblock.get_oldest_map() << " skip_maps=" << skip_maps << dendl; - if (min <= superblock.oldest_map) + if (min <= superblock.get_oldest_map()) return; // Trim from the superblock's oldest_map up to `min`. // Break if we have exceeded the txn target size. // If skip_maps is true, we will trim up `min` unconditionally. ObjectStore::Transaction t; - while (superblock.oldest_map < min) { - dout(20) << " removing old osdmap epoch " << superblock.oldest_map << dendl; - t.remove(coll_t::meta(), get_osdmap_pobject_name(superblock.oldest_map)); - t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(superblock.oldest_map)); - ++superblock.oldest_map; + while (superblock.get_oldest_map() < min) { + dout(20) << " removing old osdmap epoch " << superblock.get_oldest_map() << dendl; + t.remove(coll_t::meta(), get_osdmap_pobject_name(superblock.get_oldest_map())); + t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(superblock.get_oldest_map())); + superblock.maps.erase(superblock.get_oldest_map()); if (t.get_num_ops() > cct->_conf->osd_target_transaction_size) { service.publish_superblock(superblock); write_superblock(cct, superblock, t); @@ -8057,15 +8055,15 @@ void OSD::handle_osd_map(MOSDMap *m) epoch_t first = m->get_first(); epoch_t last = m->get_last(); dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have " - << superblock.newest_map + << superblock.get_newest_map() << ", src has [" << m->cluster_osdmap_trim_lower_bound << "," << m->newest_map << "]" << dendl; 
logger->inc(l_osd_map); logger->inc(l_osd_mape, last - first + 1); - if (first <= superblock.newest_map) - logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1); + if (first <= superblock.get_newest_map()) + logger->inc(l_osd_mape_dup, superblock.get_newest_map() - first + 1); if (superblock.cluster_osdmap_trim_lower_bound < m->cluster_osdmap_trim_lower_bound) { @@ -8074,12 +8072,12 @@ void OSD::handle_osd_map(MOSDMap *m) dout(10) << " superblock cluster_osdmap_trim_lower_bound new epoch is: " << superblock.cluster_osdmap_trim_lower_bound << dendl; ceph_assert( - superblock.cluster_osdmap_trim_lower_bound >= superblock.oldest_map); + superblock.cluster_osdmap_trim_lower_bound >= superblock.get_oldest_map()); } // make sure there is something new, here, before we bother flushing // the queues and such - if (last <= superblock.newest_map) { + if (last <= superblock.get_newest_map()) { dout(10) << " no new maps here, dropping" << dendl; m->put(); return; @@ -8087,11 +8085,11 @@ void OSD::handle_osd_map(MOSDMap *m) // missing some? bool skip_maps = false; - if (first > superblock.newest_map + 1) { + if (first > superblock.get_newest_map() + 1) { dout(10) << "handle_osd_map message skips epochs " - << superblock.newest_map + 1 << ".." << (first-1) << dendl; - if (m->cluster_osdmap_trim_lower_bound <= superblock.newest_map + 1) { - osdmap_subscribe(superblock.newest_map + 1, false); + << superblock.get_newest_map() + 1 << ".." 
<< (first-1) << dendl; + if (m->cluster_osdmap_trim_lower_bound <= superblock.get_newest_map() + 1) { + osdmap_subscribe(superblock.get_newest_map() + 1, false); m->put(); return; } @@ -8116,7 +8114,7 @@ void OSD::handle_osd_map(MOSDMap *m) map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps; // store new maps: queue for disk and put in the osdmap cache - epoch_t start = std::max(superblock.newest_map + 1, first); + epoch_t start = std::max(superblock.get_newest_map() + 1, first); for (epoch_t e = start; e <= last; e++) { if (txn_size >= t.get_num_bytes()) { derr << __func__ << " transaction size overflowed" << dendl; @@ -8227,14 +8225,11 @@ void OSD::handle_osd_map(MOSDMap *m) rerequest_full_maps(); } - if (superblock.oldest_map) { + if (!superblock.maps.empty()) { trim_maps(m->cluster_osdmap_trim_lower_bound, skip_maps); - pg_num_history.prune(superblock.oldest_map); + pg_num_history.prune(superblock.get_oldest_map()); } - - if (!superblock.oldest_map || skip_maps) - superblock.oldest_map = first; - superblock.newest_map = last; + superblock.insert_osdmap_epochs(first, last); superblock.current_epoch = last; // note in the superblock that we were clean thru the prior epoch @@ -8360,7 +8355,7 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m) for (epoch_t cur = first; cur <= last; cur++) { dout(10) << " advance to epoch " << cur << " (<= last " << last - << " <= newest_map " << superblock.newest_map + << " <= newest_map " << superblock.get_newest_map() << ")" << dendl; OSDMapRef newmap = get_map(cur); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 664d8a287406..948abeaafc8a 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -5705,12 +5705,12 @@ void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& void OSDSuperblock::encode(ceph::buffer::list &bl) const { - ENCODE_START(10, 5, bl); + ENCODE_START(11, 5, bl); encode(cluster_fsid, bl); encode(whoami, bl); 
encode(current_epoch, bl); - encode(oldest_map, bl); - encode(newest_map, bl); + 
encode((epoch_t)0, bl); // oldest_map + encode((epoch_t)0, bl); // newest_map encode(weight, bl); compat_features.encode(bl); encode(clean_thru, bl); @@ -5721,12 +5721,13 @@ void OSDSuperblock::encode(ceph::buffer::list &bl) const encode(purged_snaps_last, bl); encode(last_purged_snaps_scrub, bl); encode(cluster_osdmap_trim_lower_bound, bl); + encode(maps, bl); ENCODE_FINISH(bl); } void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(10, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(11, 5, 5, bl); if (struct_v < 3) { string magic; decode(magic, bl); @@ -5734,6 +5735,7 @@ void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl) decode(cluster_fsid, bl); decode(whoami, bl); decode(current_epoch, bl); + epoch_t oldest_map, newest_map; decode(oldest_map, bl); decode(newest_map, bl); decode(weight, bl); @@ -5765,6 +5767,11 @@ void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl) } else { cluster_osdmap_trim_lower_bound = 0; } + if (struct_v >= 11) { + decode(maps, bl); + } else { + insert_osdmap_epochs(oldest_map, newest_map); + } DECODE_FINISH(bl); } @@ -5774,8 +5781,6 @@ void OSDSuperblock::dump(Formatter *f) const f->dump_stream("osd_fsid") << osd_fsid; f->dump_int("whoami", whoami); f->dump_int("current_epoch", current_epoch); - f->dump_int("oldest_map", oldest_map); - f->dump_int("newest_map", newest_map); f->dump_float("weight", weight); f->open_object_section("compat"); compat_features.dump(f); @@ -5786,6 +5791,7 @@ void OSDSuperblock::dump(Formatter *f) const f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub; f->dump_int("cluster_osdmap_trim_lower_bound", cluster_osdmap_trim_lower_bound); + f->dump_stream("maps") << maps; } void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o) @@ -5796,8 +5802,7 @@ void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o) z.osd_fsid.parse("02020202-0202-0202-0202-020202020202"); z.whoami = 3; z.current_epoch = 4; - 
z.oldest_map = 5; - z.newest_map 
= 9; + z.insert_osdmap_epochs(5, 9); z.mounted = 8; z.clean_thru = 7; o.push_back(new OSDSuperblock(z)); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 16955ef5ef4d..8b86b0a36356 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -5454,7 +5454,31 @@ public: uuid_d cluster_fsid, osd_fsid; int32_t whoami = -1; // my role in this fs. epoch_t current_epoch = 0; // most recent epoch - epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have. + interval_set<epoch_t> maps; // oldest/newest maps we have. + + epoch_t get_oldest_map() const { + if (!maps.empty()) { + return maps.range_start(); + } + return 0; + } + + epoch_t get_newest_map() const { + if (!maps.empty()) { + // maps stores [oldest_map, newest_map) (exclusive) + return maps.range_end() - 1; + } + return 0; + } + + void insert_osdmap_epochs(epoch_t first, epoch_t last) { + ceph_assert(std::cmp_less_equal(first, last)); + interval_set<epoch_t> message_epochs; + message_epochs.insert(first, last - first + 1); + maps.union_of(message_epochs); + ceph_assert(last == get_newest_map()); + } + double weight = 0.0; CompatSet compat_features; @@ -5481,7 +5505,7 @@ inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb) << " osd." 
<< sb.whoami << " " << sb.osd_fsid << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map << "]" + << " maps " << sb.maps << " lci=[" << sb.mounted << "," << sb.clean_thru << "]" << " tlb=" << sb.cluster_osdmap_trim_lower_bound << ")"; diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index df6d1f85c39f..19a445824834 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -1630,9 +1630,9 @@ int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms, return -EINVAL; } - if (ms.osdmap.get_epoch() < sb.oldest_map) { + if (ms.osdmap.get_epoch() < sb.get_oldest_map()) { cerr << "PG export's map " << ms.osdmap.get_epoch() - << " is older than OSD's oldest_map " << sb.oldest_map << std::endl; + << " is older than OSD's oldest_map " << sb.get_oldest_map() << std::endl; if (!force) { cerr << " pass --force to proceed anyway (with incomplete PastIntervals)" << std::endl; diff --git a/src/tools/rebuild_mondb.cc b/src/tools/rebuild_mondb.cc index 17e4dadcfdd4..033f63aad22a 100644 --- a/src/tools/rebuild_mondb.cc +++ b/src/tools/rebuild_mondb.cc @@ -216,7 +216,7 @@ int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms) // osdmap starts at 1. if we have a "0" first_committed, then there is nothing // to trim. and "1 osdmaps trimmed" in the output message is misleading. so // let's make it an exception. - for (auto e = first_committed; first_committed && e < sb.oldest_map; e++) { + for (auto e = first_committed; first_committed && e < sb.get_oldest_map(); e++) { t->erase(prefix, e); t->erase(prefix, ms.combine_strings("full", e)); ntrimmed++; @@ -225,7 +225,7 @@ int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms) // because PaxosService::put_last_committed() set it to last_committed, if it // is zero. 
which breaks OSDMonitor::update_from_paxos(), in which we believe // that latest_full should always be greater than last_committed. - if (first_committed == 0 && sb.oldest_map < sb.newest_map) { + if (first_committed == 0 && sb.get_oldest_map() < sb.get_newest_map()) { first_committed = 1; } else if (ntrimmed) { first_committed += ntrimmed; @@ -240,8 +240,8 @@ int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms) auto ch = fs.open_collection(coll_t::meta()); OSDMap osdmap; - for (auto e = std::max(last_committed+1, sb.oldest_map); - e <= sb.newest_map; e++) { + for (auto e = std::max(last_committed+1, sb.get_oldest_map()); + e <= sb.get_newest_map(); e++) { bool have_crc = false; uint32_t crc = -1; uint64_t features = 0;