Replace the superblock's oldest_map/newest_map members with an interval_set
in order to support tracking a gap in the range of osdmaps held by the OSD.
Map gap example:
```
2023-08-28T18:21:05.452+0000
7f409bce2640 3 osd.4 84 handle_osd_map epochs [85,86], i have 84, src has [1,86]
2023-08-28T18:26:25.829+0000
7fcfea0c1640 3 osd.4 86 handle_osd_map epochs [208,208], i have 86, src has [208,400]
2023-08-28T18:26:25.829+0000
7fcfea0c1640 10 osd.4 86 superblock cluster_osdmap_trim_lower_bound new epoch is: 208
2023-08-28T18:26:25.829+0000
7fcfea0c1640 10 osd.4 86 handle_osd_map osd map gap [16~71,208~1]
2023-08-28T18:26:25.829+0000
7fcfea0c1640 3 osd.4 86 handle_osd_map epochs [209,248], i have 208, src has [208,400]
2023-08-28T18:26:25.833+0000
7fcfea0c1640 10 osd.4 86 handle_osd_map osd map gap [31~56,208~4]
2023-08-28T18:26:25.941+0000
7fcfea0c1640 3 osd.4 211 handle_osd_map epochs [212,248], i have 211, src has [208,400]
2023-08-28T18:26:25.945+0000
7fcfea0c1640 10 osd.4 211 handle_osd_map osd map gap [46~41,208~41]
2023-08-28T18:26:25.949+0000
7fcfea0c1640 3 osd.4 211 handle_osd_map epochs [209,248], i have 248, src has [208,400]
2023-08-28T18:26:25.949+0000
7fcfea0c1640 3 osd.4 211 handle_osd_map epochs [212,251], i have 248, src has [208,400]
2023-08-28T18:26:25.953+0000
7fcfea0c1640 10 osd.4 211 handle_osd_map osd map gap [61~26,208~44]
2023-08-28T18:26:26.073+0000
7fcfea0c1640 3 osd.4 251 handle_osd_map epochs [249,288], i have 251, src has [208,400]
2023-08-28T18:26:26.081+0000
7fcfea0c1640 10 osd.4 251 handle_osd_map osd map gap [76~11,208~48]
2023-08-28T18:26:26.081+0000
7fcfea0c1640 3 osd.4 251 handle_osd_map epochs [252,291], i have 255, src has [208,400]
```
Full example: https://gist.github.com/Matan-B/9b0eed8daee3bd6c3216bd3b6d11e8fb
Fixes: https://tracker.ceph.com/issues/61962
Signed-off-by: Matan Breizman <mbreizma@redhat.com>
f->dump_stream("osd_fsid") << superblock.osd_fsid;
f->dump_unsigned("whoami", superblock.whoami);
f->dump_string("state", pg_shard_manager.get_osd_state_string());
- f->dump_unsigned("oldest_map", superblock.oldest_map);
+ f->dump_stream("maps") << superblock.maps;
f->dump_unsigned("cluster_osdmap_trim_lower_bound",
superblock.cluster_osdmap_trim_lower_bound);
- f->dump_unsigned("newest_map", superblock.newest_map);
f->dump_unsigned("num_pgs", pg_shard_manager.get_num_pgs());
}
// Stream a one-line, brace-wrapped summary of this OSD to `out`: the OSD id
// (whoami), the OSD fsid, the superblock's `maps` interval set (which replaces
// the former "[oldest_map,newest_map]" pair), the cluster osdmap trim lower
// bound ("tlb") and the number of PGs.
void OSD::print(std::ostream& out) const
{
out << "{osd." << superblock.whoami << " "
- << superblock.osd_fsid << " [" << superblock.oldest_map
- << "," << superblock.newest_map << "] "
- << "tlb:" << superblock.cluster_osdmap_trim_lower_bound
+ << superblock.osd_fsid << " maps " << superblock.maps
+ << " tlb:" << superblock.cluster_osdmap_trim_lower_bound
<< " pgs:" << pg_shard_manager.get_num_pgs()
<< "}";
}
const auto first = m->get_first();
const auto last = m->get_last();
logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]",
- first, last, superblock.newest_map,
+ first, last, superblock.get_newest_map(),
m->cluster_osdmap_trim_lower_bound, m->newest_map);
// make sure there is something new, here, before we bother flushing
// the queues and such
- if (last <= superblock.newest_map) {
+ if (last <= superblock.get_newest_map()) {
return seastar::now();
}
// missing some?
bool skip_maps = false;
- epoch_t start = superblock.newest_map + 1;
+ epoch_t start = superblock.get_newest_map() + 1;
if (first > start) {
logger().info("handle_osd_map message skips epochs {}..{}",
start, first - 1);
return pg_shard_manager.store_maps(t, start, m).then([=, this, &t] {
// even if this map isn't from a mon, we may have satisfied our subscription
monc->sub_got("osdmap", last);
- if (!superblock.oldest_map || skip_maps) {
- superblock.oldest_map = first;
+
+ if (!superblock.maps.empty()) {
+ // TODO: support osdmap trimming
+ // See: <tracker>
}
- superblock.newest_map = last;
+
+ superblock.insert_osdmap_epochs(first, last);
superblock.current_epoch = last;
// note in the superblock that we were clean thru the prior epoch
{
logger().info("{}: first osdmap: {} "
"superblock's oldest map: {}",
- __func__, first, superblock.oldest_map);
- if (first >= superblock.oldest_map) {
+ __func__, first, superblock.get_oldest_map());
+ if (first >= superblock.get_oldest_map()) {
return load_map_bls(
- first, superblock.newest_map
+ first, superblock.get_newest_map()
).then([this, &conn, first](auto&& bls) {
auto m = crimson::make_message<MOSDMap>(
monc.get_fsid(),
osdmap->get_encoding_features());
m->cluster_osdmap_trim_lower_bound = first;
- m->newest_map = superblock.newest_map;
+ m->newest_map = superblock.get_newest_map();
m->maps = std::move(bls);
return conn.send(std::move(m));
});
* See: OSD::handle_osd_map for how classic updates the
* cluster's trim lower bound.
*/
- m->cluster_osdmap_trim_lower_bound = superblock.oldest_map;
- m->newest_map = superblock.newest_map;
+ m->cluster_osdmap_trim_lower_bound = superblock.get_oldest_map();
+ m->newest_map = superblock.get_newest_map();
m->maps.emplace(osdmap->get_epoch(), std::move(bl));
return conn.send(std::move(m));
});
}
// fresh osd?
- if (m->sb.newest_map == 0 && osdmap.exists(from)) {
+ if (m->sb.get_newest_map() == 0 && osdmap.exists(from)) {
const osd_info_t& i = osdmap.get_info(from);
if (i.up_from > i.lost_at) {
dout(10) << " fresh osd; marking lost_at too" << dendl;
MOSDMap *m = new MOSDMap(monc->get_fsid(),
osdmap->get_encoding_features());
m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound;
- m->newest_map = sblock.newest_map;
+ m->newest_map = sblock.get_newest_map();
int max = cct->_conf->osd_map_message_max;
ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
MOSDMap *m = NULL;
while (!m) {
OSDSuperblock sblock(get_superblock());
- if (since < sblock.oldest_map) {
+ if (since < sblock.get_oldest_map()) {
// just send latest full map
MOSDMap *m = new MOSDMap(monc->get_fsid(),
osdmap->get_encoding_features());
m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound;
- m->newest_map = sblock.newest_map;
+ m->newest_map = sblock.get_newest_map();
get_map_bl(to, m->maps[to]);
send_map(m, con);
return;
* splitting. The simplest thing is to detect such cases here and drop
* them without an error (the client will resend anyway).
*/
- ceph_assert(m->get_map_epoch() <= superblock.newest_map);
+ ceph_assert(m->get_map_epoch() <= superblock.get_newest_map());
OSDMapRef opmap = try_get_map(m->get_map_epoch());
if (!opmap) {
dout(7) << __func__ << ": " << *pg << " no longer have map for "
f->dump_stream("osd_fsid") << superblock.osd_fsid;
f->dump_unsigned("whoami", superblock.whoami);
f->dump_string("state", get_state_name(get_state()));
- f->dump_unsigned("oldest_map", superblock.oldest_map);
+ f->dump_stream("maps") << superblock.maps;
f->dump_unsigned("cluster_osdmap_trim_lower_bound",
superblock.cluster_osdmap_trim_lower_bound);
- f->dump_unsigned("newest_map", superblock.newest_map);
f->dump_unsigned("num_pgs", num_pgs);
f->close_section();
} else if (prefix == "flush_journal") {
dout(5) << "Upgrading superblock adding: " << diff << dendl;
if (!superblock.cluster_osdmap_trim_lower_bound) {
- superblock.cluster_osdmap_trim_lower_bound = superblock.oldest_map;
+ superblock.cluster_osdmap_trim_lower_bound = superblock.get_oldest_map();
}
ObjectStore::Transaction t;
if (max_waiting_epoch > get_osdmap()->get_epoch()) {
dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
<< ", requesting new map" << dendl;
- osdmap_subscribe(superblock.newest_map + 1, false);
+ osdmap_subscribe(superblock.get_newest_map() + 1, false);
}
}
}
dout(1) << __func__ << dendl;
set_state(STATE_PREBOOT);
- dout(10) << "start_boot - have maps " << superblock.oldest_map
- << ".." << superblock.newest_map << dendl;
+ dout(10) << "start_boot - have maps " << superblock.maps << dendl;
monc->get_version("osdmap", CB_OSD_GetVersion(this));
}
*/
epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
dout(20) << __func__ << ": min=" << min << " oldest_map="
- << superblock.oldest_map << " skip_maps=" << skip_maps
+ << superblock.get_oldest_map() << " skip_maps=" << skip_maps
<< dendl;
- if (min <= superblock.oldest_map)
+ if (min <= superblock.get_oldest_map())
return;
// Trim from the superblock's oldest_map up to `min`.
// Break if we have exceeded the txn target size.
// If skip_maps is true, we will trim up `min` unconditionally.
ObjectStore::Transaction t;
- while (superblock.oldest_map < min) {
- dout(20) << " removing old osdmap epoch " << superblock.oldest_map << dendl;
- t.remove(coll_t::meta(), get_osdmap_pobject_name(superblock.oldest_map));
- t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(superblock.oldest_map));
- ++superblock.oldest_map;
+ while (superblock.get_oldest_map() < min) {
+ // Queue removal of the osdmap and inc-osdmap objects of the oldest tracked
+ // epoch, then erase that epoch from superblock.maps so the loop advances.
+ dout(20) << " removing old osdmap epoch " << superblock.get_oldest_map() << dendl;
+ t.remove(coll_t::meta(), get_osdmap_pobject_name(superblock.get_oldest_map()));
+ t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(superblock.get_oldest_map()));
+ superblock.maps.erase(superblock.get_oldest_map());
if (t.get_num_ops() > cct->_conf->osd_target_transaction_size) {
service.publish_superblock(superblock);
write_superblock(cct, superblock, t);
epoch_t first = m->get_first();
epoch_t last = m->get_last();
dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
- << superblock.newest_map
+ << superblock.get_newest_map()
<< ", src has [" << m->cluster_osdmap_trim_lower_bound
<< "," << m->newest_map << "]"
<< dendl;
logger->inc(l_osd_map);
logger->inc(l_osd_mape, last - first + 1);
- if (first <= superblock.newest_map)
- logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
+ if (first <= superblock.get_newest_map())
+ logger->inc(l_osd_mape_dup, superblock.get_newest_map() - first + 1);
if (superblock.cluster_osdmap_trim_lower_bound <
m->cluster_osdmap_trim_lower_bound) {
dout(10) << " superblock cluster_osdmap_trim_lower_bound new epoch is: "
<< superblock.cluster_osdmap_trim_lower_bound << dendl;
ceph_assert(
- superblock.cluster_osdmap_trim_lower_bound >= superblock.oldest_map);
+ superblock.cluster_osdmap_trim_lower_bound >= superblock.get_oldest_map());
}
// make sure there is something new, here, before we bother flushing
// the queues and such
- if (last <= superblock.newest_map) {
+ if (last <= superblock.get_newest_map()) {
dout(10) << " no new maps here, dropping" << dendl;
m->put();
return;
// missing some?
bool skip_maps = false;
- if (first > superblock.newest_map + 1) {
+ if (first > superblock.get_newest_map() + 1) {
dout(10) << "handle_osd_map message skips epochs "
- << superblock.newest_map + 1 << ".." << (first-1) << dendl;
- if (m->cluster_osdmap_trim_lower_bound <= superblock.newest_map + 1) {
- osdmap_subscribe(superblock.newest_map + 1, false);
+ << superblock.get_newest_map() + 1 << ".." << (first-1) << dendl;
+ if (m->cluster_osdmap_trim_lower_bound <= superblock.get_newest_map() + 1) {
+ osdmap_subscribe(superblock.get_newest_map() + 1, false);
m->put();
return;
}
map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
// store new maps: queue for disk and put in the osdmap cache
- epoch_t start = std::max(superblock.newest_map + 1, first);
+ epoch_t start = std::max(superblock.get_newest_map() + 1, first);
for (epoch_t e = start; e <= last; e++) {
if (txn_size >= t.get_num_bytes()) {
derr << __func__ << " transaction size overflowed" << dendl;
rerequest_full_maps();
}
- if (superblock.oldest_map) {
+ if (!superblock.maps.empty()) {
trim_maps(m->cluster_osdmap_trim_lower_bound, skip_maps);
- pg_num_history.prune(superblock.oldest_map);
+ pg_num_history.prune(superblock.get_oldest_map());
}
-
- if (!superblock.oldest_map || skip_maps)
- superblock.oldest_map = first;
- superblock.newest_map = last;
+ superblock.insert_osdmap_epochs(first, last);
superblock.current_epoch = last;
// note in the superblock that we were clean thru the prior epoch
for (epoch_t cur = first; cur <= last; cur++) {
dout(10) << " advance to epoch " << cur
<< " (<= last " << last
- << " <= newest_map " << superblock.newest_map
+ << " <= newest_map " << superblock.get_newest_map()
<< ")" << dendl;
OSDMapRef newmap = get_map(cur);
void OSDSuperblock::encode(ceph::buffer::list &bl) const
{
- ENCODE_START(10, 5, bl);
+ ENCODE_START(11, 5, bl);
encode(cluster_fsid, bl);
encode(whoami, bl);
encode(current_epoch, bl);
- encode(oldest_map, bl);
- encode(newest_map, bl);
+ encode((epoch_t)0, bl); // oldest_map
+ encode((epoch_t)0, bl); // newest_map
encode(weight, bl);
compat_features.encode(bl);
encode(clean_thru, bl);
encode(purged_snaps_last, bl);
encode(last_purged_snaps_scrub, bl);
encode(cluster_osdmap_trim_lower_bound, bl);
+ encode(maps, bl);
ENCODE_FINISH(bl);
}
void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(10, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(11, 5, 5, bl);
if (struct_v < 3) {
string magic;
decode(magic, bl);
decode(cluster_fsid, bl);
decode(whoami, bl);
decode(current_epoch, bl);
+ epoch_t oldest_map, newest_map;
decode(oldest_map, bl);
decode(newest_map, bl);
decode(weight, bl);
} else {
cluster_osdmap_trim_lower_bound = 0;
}
+ if (struct_v >= 11) {
+ decode(maps, bl);
+ } else {
+ insert_osdmap_epochs(oldest_map, newest_map);
+ }
DECODE_FINISH(bl);
}
f->dump_stream("osd_fsid") << osd_fsid;
f->dump_int("whoami", whoami);
f->dump_int("current_epoch", current_epoch);
- f->dump_int("oldest_map", oldest_map);
- f->dump_int("newest_map", newest_map);
f->dump_float("weight", weight);
f->open_object_section("compat");
compat_features.dump(f);
f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
f->dump_int("cluster_osdmap_trim_lower_bound",
cluster_osdmap_trim_lower_bound);
+ f->dump_stream("maps") << maps;
}
void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
z.osd_fsid.parse("02020202-0202-0202-0202-020202020202");
z.whoami = 3;
z.current_epoch = 4;
- z.oldest_map = 5;
- z.newest_map = 9;
+ z.insert_osdmap_epochs(5, 9);
z.mounted = 8;
z.clean_thru = 7;
o.push_back(new OSDSuperblock(z));
uuid_d cluster_fsid, osd_fsid;
int32_t whoami = -1; // my role in this fs.
epoch_t current_epoch = 0; // most recent epoch
- epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have.
+ interval_set<epoch_t> maps; // epochs of the osdmaps we have; may be non-contiguous (range gap).
+
+ epoch_t get_oldest_map() const { // start of the range covered by `maps`, or 0 when `maps` is empty
+ if (!maps.empty()) {
+ return maps.range_start();
+ }
+ return 0;
+ }
+
+ epoch_t get_newest_map() const { // last epoch covered by `maps`, or 0 when `maps` is empty
+ if (!maps.empty()) {
+ // range_end() is exclusive (one past the newest epoch held), hence the -1
+ return maps.range_end() - 1;
+ }
+ return 0;
+ }
+
+ void insert_osdmap_epochs(epoch_t first, epoch_t last) { // record the inclusive epoch range [first, last] in `maps`
+ ceph_assert(std::cmp_less_equal(first, last)); // an inverted range is a caller bug
+ interval_set<epoch_t> message_epochs;
+ message_epochs.insert(first, last - first + 1); // presumably (start, length), so `last` itself is included
+ maps.union_of(message_epochs); // merge into what we already track (overlap handling is up to union_of — TODO confirm)
+ ceph_assert(last == get_newest_map()); // callers may only extend up to a new newest epoch
+ }
+
double weight = 0.0;
CompatSet compat_features;
<< " osd." << sb.whoami
<< " " << sb.osd_fsid
<< " e" << sb.current_epoch
- << " [" << sb.oldest_map << "," << sb.newest_map << "]"
+ << " maps " << sb.maps
<< " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
<< " tlb=" << sb.cluster_osdmap_trim_lower_bound
<< ")";
return -EINVAL;
}
- if (ms.osdmap.get_epoch() < sb.oldest_map) {
+ if (ms.osdmap.get_epoch() < sb.get_oldest_map()) {
cerr << "PG export's map " << ms.osdmap.get_epoch()
- << " is older than OSD's oldest_map " << sb.oldest_map << std::endl;
+ << " is older than OSD's oldest_map " << sb.get_oldest_map() << std::endl;
if (!force) {
cerr << " pass --force to proceed anyway (with incomplete PastIntervals)"
<< std::endl;
// osdmap starts at 1. if we have a "0" first_committed, then there is nothing
// to trim. and "1 osdmaps trimmed" in the output message is misleading. so
// let's make it an exception.
- for (auto e = first_committed; first_committed && e < sb.oldest_map; e++) {
+ for (auto e = first_committed; first_committed && e < sb.get_oldest_map(); e++) {
t->erase(prefix, e);
t->erase(prefix, ms.combine_strings("full", e));
ntrimmed++;
// because PaxosService::put_last_committed() set it to last_committed, if it
// is zero. which breaks OSDMonitor::update_from_paxos(), in which we believe
// that latest_full should always be greater than last_committed.
- if (first_committed == 0 && sb.oldest_map < sb.newest_map) {
+ if (first_committed == 0 && sb.get_oldest_map() < sb.get_newest_map()) {
first_committed = 1;
} else if (ntrimmed) {
first_committed += ntrimmed;
auto ch = fs.open_collection(coll_t::meta());
OSDMap osdmap;
- for (auto e = std::max(last_committed+1, sb.oldest_map);
- e <= sb.newest_map; e++) {
+ for (auto e = std::max(last_committed+1, sb.get_oldest_map());
+ e <= sb.get_newest_map(); e++) {
bool have_crc = false;
uint32_t crc = -1;
uint64_t features = 0;