From 400c27da9da71a5737535dc2118b95b2a0a5156c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Dec 2011 11:38:12 -0800 Subject: [PATCH] osd: track backfill with last_backfill, not interval_set<> We always fill from the bottom up anyway. Using an hobject_t also gives us a precise bound. It also makes things conceptually simpler: last_complete and last_backfill each bound one of the two dimensions of updatedness. Signed-off-by: Sage Weil --- src/messages/MOSDPGBackfill.h | 12 ++-- src/osd/PG.cc | 5 +- src/osd/PG.h | 24 +++++--- src/osd/ReplicatedPG.cc | 108 +++++++++++++++++----------------- 4 files changed, 81 insertions(+), 68 deletions(-) diff --git a/src/messages/MOSDPGBackfill.h b/src/messages/MOSDPGBackfill.h index 14b19c5e48e8b..917e1ae8db9c8 100644 --- a/src/messages/MOSDPGBackfill.h +++ b/src/messages/MOSDPGBackfill.h @@ -37,7 +37,8 @@ public: __u32 op; epoch_t map_epoch, query_epoch; pg_t pgid; - interval_set incomplete; + hobject_t last_backfill; + eversion_t last_complete; virtual void decode_payload(CephContext *cct) { bufferlist::iterator p = payload.begin(); @@ -45,7 +46,8 @@ public: ::decode(map_epoch, p); ::decode(query_epoch, p); ::decode(pgid, p); - ::decode(incomplete, p); + ::decode(last_backfill, p); + ::decode(last_complete, p); } virtual void encode_payload(CephContext *cct) { @@ -53,7 +55,8 @@ public: ::encode(map_epoch, payload); ::encode(query_epoch, payload); ::encode(pgid, payload); - ::encode(incomplete, payload); + ::encode(last_backfill, payload); + ::encode(last_complete, payload); } MOSDPGBackfill() : Message(MSG_OSD_PG_BACKFILL) {} @@ -72,7 +75,8 @@ public: out << "pg_backfill(" << get_op_name(op) << " " << pgid << " e " << map_epoch << "/" << query_epoch - << " incomp " << std::hex << incomplete << std::dec + << " lc " << last_complete + << " lb " << last_backfill << ")"; } }; diff --git a/src/osd/PG.cc b/src/osd/PG.cc index dd9416aa85ef0..f02e1fee83682 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -818,6 +818,9 @@ void 
PG::clear_primary_state() stray_purged.clear(); might_have_unfound.clear(); + backfill_target = -1; + peer_backfill_info = BackfillInterval(); + last_update_ondisk = eversion_t(); snap_trimq.clear(); @@ -2033,7 +2036,7 @@ void PG::read_state(ObjectStore *store) write_info(t); store->apply_transaction(t); - info.incomplete.insert(0, 1ull << 32); + info.last_backfill = hobject_t(); } // log any weirdness diff --git a/src/osd/PG.h b/src/osd/PG.h index 91550b621f930..149c087a0ca82 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -175,7 +175,7 @@ public: eversion_t log_tail; // oldest log entry. - interval_set incomplete; // incomplete hash ranges prior to last_complete + hobject_t last_backfill; // objects >= this and < last_complete may be missing interval_set purged_snaps; @@ -248,13 +248,18 @@ public: } } history; - Info() {} - Info(pg_t p) : pgid(p) { } + Info() + : last_backfill(hobject_t::get_max()) + { } + Info(pg_t p) + : pgid(p), + last_backfill(hobject_t::get_max()) + { } bool is_empty() const { return last_update.version == 0; } bool dne() const { return history.epoch_created == 0; } - bool is_incomplete() const { return !incomplete.empty(); } + bool is_incomplete() const { return last_backfill != hobject_t::get_max(); } void encode(bufferlist &bl) const { __u8 v = 25; @@ -264,7 +269,7 @@ public: ::encode(last_update, bl); ::encode(last_complete, bl); ::encode(log_tail, bl); - ::encode(incomplete, bl); + ::encode(last_backfill, bl); ::encode(stats, bl); history.encode(bl); ::encode(purged_snaps, bl); @@ -288,7 +293,7 @@ public: ::decode(log_backlog, bl); } if (v >= 24) - ::decode(incomplete, bl); + ::decode(last_backfill, bl); ::decode(stats, bl); history.decode(bl); if (v >= 22) @@ -1381,7 +1386,8 @@ protected: }; BackfillInterval backfill_info; - map peer_backfill_info; + int backfill_target; + BackfillInterval peer_backfill_info; epoch_t last_peering_reset; @@ -1758,8 +1764,8 @@ inline ostream& operator<<(ostream& out, const PG::Info& pgi) if 
(pgi.last_complete != pgi.last_update) out << " lc " << pgi.last_complete; out << " (" << pgi.log_tail << "," << pgi.last_update << "]"; - if (!pgi.incomplete.empty()) - out << " incomp " << std::hex << pgi.incomplete << std::dec; + if (pgi.is_incomplete()) + out << " lb " << pgi.last_backfill; } //out << " c " << pgi.epoch_created; out << " n=" << pgi.stats.stats.sum.num_objects; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 63bab14fa6a00..ff81722f85dbc 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -858,7 +858,8 @@ void ReplicatedPG::do_scan(MOSDPGScan *m) case MOSDPGScan::OP_SCAN_DIGEST: { int from = m->get_source().num(); - BackfillInterval& bi = peer_backfill_info[from]; + assert(from == backfill_target); + BackfillInterval& bi = peer_backfill_info; bi.begin = m->begin; bi.end = m->end; bufferlist::iterator p = m->get_data().begin(); @@ -877,37 +878,34 @@ void ReplicatedPG::do_backfill(MOSDPGBackfill *m) dout(10) << "do_backfill " << *m << dendl; switch (m->op) { - case MOSDPGBackfill::OP_BACKFILL_PROGRESS: + case MOSDPGBackfill::OP_BACKFILL_FINISH: { - assert(get_role() < 0); - - info.incomplete = m->incomplete; - - ObjectStore::Transaction *t = new ObjectStore::Transaction; - write_info(*t); - int tr = osd->store->queue_transaction(&osr, t); - assert(tr == 0); + MOSDPGBackfill *reply = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, + get_osdmap()->get_epoch(), m->query_epoch, + info.pgid); + osd->cluster_messenger->send_message(reply, m->get_connection()); } - break; + // fall-thru - case MOSDPGBackfill::OP_BACKFILL_FINISH: + case MOSDPGBackfill::OP_BACKFILL_PROGRESS: { assert(get_role() < 0); - info.last_complete = info.last_update; - info.incomplete.clear(); - - ObjectStore::Transaction *t = new ObjectStore::Transaction; + + info.last_update = m->last_complete; + info.last_complete = m->last_complete; + info.last_backfill = m->last_backfill; + log.clear(); log.head = info.last_update; log.tail = 
info.last_update; + + info.log_tail = log.tail; + + ObjectStore::Transaction *t = new ObjectStore::Transaction; write_log(*t); write_info(*t); int tr = osd->store->queue_transaction(&osr, t); assert(tr == 0); - - MOSDPGBackfill *reply = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, - get_osdmap()->get_epoch(), m->query_epoch, info.pgid); - osd->cluster_messenger->send_message(reply, m->get_connection()); } break; @@ -5264,14 +5262,18 @@ int ReplicatedPG::recover_backfill(int max) dout(10) << "recover_backfill (" << max << ")" << dendl; assert(!backfill.empty()); - // initially just backfill one peer at a time. FIXME. - int peer = *backfill.begin(); - Info& pinfo = peer_info[peer]; - BackfillInterval& pbi = peer_backfill_info[peer]; - - dout(10) << " peer osd." << peer << " " << pinfo - << " interval " << pbi.begin << "-" << pbi.end << " " << pbi.objects.size() << " objects" << dendl; + // backfill one peer at a time. + if (backfill_target < 0) { + backfill_target = *backfill.begin(); + dout(10) << " chose backfill target osd." << backfill_target << dendl; + } + Info& pinfo = peer_info[backfill_target]; + BackfillInterval& pbi = peer_backfill_info; + dout(10) << " peer osd." << backfill_target << " " << pinfo + << " interval " << pbi.begin << "-" << pbi.end + << " " << pbi.objects.size() << " objects" << dendl; + // does the pg exist yet on the peer? if (pinfo.dne()) { // ok, we know they have no objects. @@ -5280,14 +5282,14 @@ int ReplicatedPG::recover_backfill(int max) // fill in pinfo pinfo.last_update = info.last_update; pinfo.log_tail = info.last_update; - pinfo.incomplete.insert(0, 0x100000000ull); + pinfo.last_backfill = hobject_t(); pinfo.history = info.history; - dout(10) << " peer osd." << peer << " pg dne; setting info to " << pinfo << dendl; + dout(10) << " peer osd." 
<< backfill_target << " pg dne; setting info to " << pinfo << dendl; // create pg on remote MOSDPGInfo *mp = new MOSDPGInfo(get_osdmap()->get_epoch()); mp->pg_info.push_back(pinfo); - osd->cluster_messenger->send_message(mp, get_osdmap()->get_cluster_inst(peer)); + osd->cluster_messenger->send_message(mp, get_osdmap()->get_cluster_inst(backfill_target)); } int ops = 0; @@ -5307,7 +5309,7 @@ int ReplicatedPG::recover_backfill(int max) epoch_t e = get_osdmap()->get_epoch(); MOSDPGScan *m = new MOSDPGScan(MOSDPGScan::OP_SCAN_GET_DIGEST, e, e, info.pgid, pbi.end, hobject_t()); - osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer)); + osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(backfill_target)); start_recovery_op(pbi.end); ops++; break; @@ -5322,11 +5324,13 @@ int ReplicatedPG::recover_backfill(int max) if (pbi.begin != hobject_t::get_max()) { pbi.begin = hobject_t::get_max(); - pinfo.incomplete.clear(); + pinfo.last_backfill = hobject_t::get_max(); epoch_t e = get_osdmap()->get_epoch(); MOSDPGBackfill *m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH, e, e, info.pgid); - osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer)); + m->last_complete = info.last_update; + m->last_backfill = hobject_t::get_max(); + osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(backfill_target)); start_recovery_op(hobject_t::get_max()); ops++; } @@ -5339,7 +5343,7 @@ int ReplicatedPG::recover_backfill(int max) assert(pf < backfill_info.end); dout(20) << " removing peer " << pf << " <= local end " << backfill_info.end << dendl; - send_remove_op(pf, pv, peer); + send_remove_op(pf, pv, backfill_target); pbi.pop_front(); continue; } @@ -5350,8 +5354,8 @@ int ReplicatedPG::recover_backfill(int max) if (pbi.empty()) { assert(pbi.at_end()); dout(20) << " pushing local " << my_first << " " << backfill_info.objects.begin()->second - << " to peer osd." 
<< peer << dendl; - push_backfill_object(my_first, mv, peer); + << " to peer osd." << backfill_target << dendl; + push_backfill_object(my_first, mv, backfill_target); backfill_info.pop_front(); pbi.begin = my_first; ++ops; @@ -5363,14 +5367,14 @@ int ReplicatedPG::recover_backfill(int max) if (peer_first < my_first) { dout(20) << " removing peer " << peer_first << " <= local " << my_first << dendl; - send_remove_op(peer_first, pv, peer); + send_remove_op(peer_first, pv, backfill_target); pbi.pop_front(); } else if (peer_first == my_first) { if (pv == mv) { dout(20) << " keeping peer " << peer_first << " " << pv << dendl; } else { dout(20) << " replacing peer " << peer_first << " with local " << mv << dendl; - push_backfill_object(my_first, mv, peer); + push_backfill_object(my_first, mv, backfill_target); ++ops; } pbi.pop_front(); @@ -5378,27 +5382,23 @@ int ReplicatedPG::recover_backfill(int max) } else { // peer_first > my_first dout(20) << " pushing local " << my_first << " " << mv - << " to peer osd." << peer << dendl; - push_backfill_object(my_first, mv, peer); + << " to peer osd." << backfill_target << dendl; + push_backfill_object(my_first, mv, backfill_target); backfill_info.pop_front(); ++ops; } } - if (!pinfo.incomplete.empty()) { - hobject_t b; - b.set_filestore_key(pinfo.incomplete.range_start()); - dout(20) << " b " << b << " pbi.begin " << pbi.begin << " " << pinfo << dendl; - if (b < pbi.begin) { - pinfo.incomplete.erase(b.get_filestore_key(), pbi.begin.get_filestore_key() - b.get_filestore_key()); - dout(10) << " peer osd." 
<< peer << " info.incomplete now " - << std::hex << pinfo.incomplete << std::dec << dendl; - - epoch_t e = get_osdmap()->get_epoch(); - MOSDPGBackfill *m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_PROGRESS, e, e, info.pgid); - m->incomplete = pinfo.incomplete; - osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer)); - } + if (pinfo.last_backfill < pbi.begin) { + pinfo.last_backfill = pbi.begin; + + dout(10) << " peer osd." << backfill_target << " info.last_backfill now " << pinfo.last_backfill << dendl; + + epoch_t e = get_osdmap()->get_epoch(); + MOSDPGBackfill *m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_PROGRESS, e, e, info.pgid); + m->last_backfill = pinfo.last_backfill; + m->last_complete = info.last_update; + osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(backfill_target)); } return ops; } -- 2.39.5