From e1006d76c6cc7380e3093a9acdec9666d0d95361 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 6 Dec 2011 15:27:39 -0800 Subject: [PATCH] osd: more backfill changes Always ship log for updates to backfill targets to preserve the repgather ordering. Fix up recover_backfill() bounds. Re-scan the local collect on every pass in case there were concurrent modifications. (This could be optimized.) Signed-off-by: Sage Weil --- src/osd/PG.cc | 97 ++++++++++++++++++++------------------- src/osd/PG.h | 16 ++++++- src/osd/ReplicatedPG.cc | 96 +++++++++++++++++++++++--------------- src/osd/ReplicatedPG.h | 2 + src/test/test_backfill.sh | 1 + 5 files changed, 125 insertions(+), 87 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 03b64cbcd1a6f..0188d4a04c6c4 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -863,7 +863,8 @@ void PG::clear_primary_state() might_have_unfound.clear(); backfill_target = -1; - peer_backfill_info = BackfillInterval(); + backfill_info.clear(); + peer_backfill_info.clear(); last_update_ondisk = eversion_t(); @@ -892,7 +893,7 @@ void PG::calc_acting(int& newest_update_osd_id, vector& want) const all_info[osd->whoami] = info; for (map::iterator p = all_info.begin(); p != all_info.end(); ++p) { - dout(10) << "choose_acting osd." << p->first << " " << p->second << dendl; + dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl; } // find osd with newest last_update. if there are multiples, prefer @@ -907,36 +908,35 @@ void PG::calc_acting(int& newest_update_osd_id, vector& want) const continue; } // prefer longer tail, if it will bring another peer in log contiguity - if (p->second.log_tail < newest_update_osd->second.log_tail) { - bool worse = false; - for (map::iterator q = all_info.begin(); q != all_info.end(); ++q) { - if (q->second.is_incomplete()) - continue; // don't care about log contiguity - if (q->second.last_update < newest_update_osd->second.log_tail && - q->second.last_update >= p->second.log_tail) { - dout(10) << "choose_acting prefer osd." << p->first - << " because it brings osd." << q->first << " into log contiguity" << dendl; - newest_update_osd = p; - continue; - } - if (q->second.last_update < p->second.log_tail && - q->second.last_update >= newest_update_osd->second.log_tail) { - worse = true; - break; - } - } - if (worse) + bool worse = false; + for (map::iterator q = all_info.begin(); q != all_info.end(); ++q) { + if (q->second.is_incomplete()) + continue; // don't care about log contiguity + if (q->second.last_update < newest_update_osd->second.log_tail && + q->second.last_update >= p->second.log_tail) { + dout(10) << "calc_acting prefer osd." << p->first + << " because it brings osd." << q->first << " into log contiguity" << dendl; + newest_update_osd = p; continue; + } + if (q->second.last_update < p->second.log_tail && + q->second.last_update >= newest_update_osd->second.log_tail) { + worse = true; + break; + } } + if (worse) + continue; + // prefer current primary (usually the caller), all things being equal if (p->first == acting[0]) { - dout(10) << "choose_acting prefer osd." << p->first + dout(10) << "calc_acting prefer osd." << p->first << " because it is current primary" << dendl; newest_update_osd = p; continue; } } - dout(10) << "choose_acting newest update on osd." << newest_update_osd->first + dout(10) << "calc_acting newest update on osd." << newest_update_osd->first << " with " << newest_update_osd->second << dendl; newest_update_osd_id = newest_update_osd->first; @@ -955,7 +955,7 @@ void PG::calc_acting(int& newest_update_osd_id, vector& want) const if (p->second.is_incomplete()) continue; if (primary->second.is_incomplete()) { - dout(10) << "choose_acting prefer osd." << p->first << " because not incomplete" << dendl; + dout(10) << "calc_acting prefer osd." << p->first << " because not incomplete" << dendl; primary = p; continue; } @@ -963,7 +963,7 @@ void PG::calc_acting(int& newest_update_osd_id, vector& want) const if (p->second.last_update < newest_update_osd->second.log_tail) continue; if (primary->second.last_update < newest_update_osd->second.log_tail) { - dout(10) << "choose_acting prefer osd." << p->first + dout(10) << "calc_acting prefer osd." << p->first << " because log contiguous with newest osd." << newest_update_osd->first << dendl; primary = p; continue; @@ -974,7 +974,7 @@ void PG::calc_acting(int& newest_update_osd_id, vector& want) const if (!q->second.is_incomplete() && q->second.last_update < primary->second.log_tail && q->second.last_update >= p->second.log_tail) { - dout(10) << "choose_acting prefer osd." << p->first + dout(10) << "calc_acting prefer osd." << p->first << " because it brings osd." << q->first << " into log contiguity" << dendl; primary = p; continue; @@ -985,14 +985,15 @@ void PG::calc_acting(int& newest_update_osd_id, vector& want) const if (primary->second.is_incomplete() || primary->second.last_update < newest_update_osd->second.log_tail) { - dout(10) << "choose_acting no acceptable primary, reverting to up " << up << dendl; + dout(10) << "calc_acting no acceptable primary, reverting to up " << up << dendl; want = up; return; } - dout(10) << "choose_acting primary is osd." << primary->first + dout(10) << "calc_acting primary is osd." << primary->first << " with " << primary->second << dendl; want.push_back(primary->first); + unsigned usable = 1; // select replicas that have log contiguity with primary for (vector::const_iterator i = up.begin(); @@ -1008,6 +1009,7 @@ void PG::calc_acting(int& newest_update_osd_id, vector& want) const want.push_back(*i); } else { want.push_back(*i); + usable++; dout(10) << " osd." << *i << " (up) accepted " << cur_info << dendl; } } @@ -1015,7 +1017,7 @@ void PG::calc_acting(int& newest_update_osd_id, vector& want) const for (map::const_iterator i = all_info.begin(); i != all_info.end(); ++i) { - if (want.size() >= get_osdmap()->get_pg_size(info.pgid)) + if (usable >= get_osdmap()->get_pg_size(info.pgid)) break; // skip up osds we already considered above @@ -1163,7 +1165,7 @@ void PG::activate(ObjectStore::Transaction& t, list& tfin, } last_update_applied = info.last_update; - assert(info.last_complete >= log.tail && !info.is_incomplete()); + assert(info.last_complete >= log.tail); need_up_thru = false; @@ -1227,14 +1229,13 @@ void PG::activate(ObjectStore::Transaction& t, list& tfin, assert(peer_info.count(peer)); PG::Info& pi = peer_info[peer]; - MOSDPGLog *m = 0; - dout(10) << "activate peer osd." << peer << " " << pi << dendl; if (log.tail > pi.last_update) { // reset, backfill pi.last_update = info.last_update; pi.last_complete = info.last_complete; + pi.log_tail = info.last_update; pi.last_backfill = hobject_t(); pi.history = info.history; @@ -1246,12 +1247,14 @@ void PG::activate(ObjectStore::Transaction& t, list& tfin, (*activator_map)[peer] = new MOSDPGInfo(get_osdmap()->get_epoch()); (*activator_map)[peer]->pg_info.push_back(pi); } else { - m = new MOSDPGLog(get_osdmap()->get_epoch(), pi); - osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer)); + MOSDPGInfo *mp = new MOSDPGInfo(get_osdmap()->get_epoch()); + mp->pg_info.push_back(pi); + osd->cluster_messenger->send_message(mp, get_osdmap()->get_cluster_inst(peer)); } continue; } + MOSDPGLog *m = 0; if (pi.last_update == info.last_update) { // empty log if (!pi.is_empty() && activator_map) { @@ -4025,22 +4028,22 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt) boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt) { PG *pg = context< RecoveryMachine >().pg; - dout(10) << "got info from osd." << infoevt.from << dendl; + dout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl; - if (pg->is_replica()) { - assert(pg->log.tail <= pg->info.last_complete); - assert(pg->log.head == pg->info.last_update); - post_event(Activate()); - } else { - // pg creation for backfill - dout(10) << "updating info to " << infoevt.info << dendl; + if (infoevt.info.last_update != pg->info.last_update) { + dout(10) << " reset for backfill" << dendl; pg->info = infoevt.info; - - ObjectStore::Transaction *t = new ObjectStore::Transaction; - pg->write_info(*t); - int tr = pg->osd->store->queue_transaction(&pg->osr, t); - assert(tr == 0); + assert(pg->info.log_tail == pg->info.last_update); + assert(pg->info.last_backfill == hobject_t()); + pg->log.clear(); + pg->log.head = pg->info.last_update; + pg->log.tail = pg->info.last_update; } + + assert(pg->log.tail <= pg->info.last_complete); + assert(pg->log.head == pg->info.last_update); + + post_event(Activate()); return discard_event(); } diff --git a/src/osd/PG.h b/src/osd/PG.h index 02d6887fcf350..b8dfe7c5eaf1d 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1366,6 +1366,11 @@ public: protected: + /* + * peer_info -- projected (updates _before_ replicas ack) + * peer_missing -- committed (updates _after_ replicas ack) + */ + bool need_up_thru; set stray_set; // non-acting osds that have PG data. eversion_t oldest_update; // acting: lowest (valid) last_update in active set @@ -1385,15 +1390,22 @@ protected: struct BackfillInterval { // info about a backfill interval on a peer map objects; - hobject_t begin, end; + hobject_t begin; + hobject_t end; + /// clear content + void clear() { + objects.clear(); + begin = end = hobject_t(); + } + /// true if there are no objects in this interval bool empty() { return objects.empty(); } /// true if interval extends to the end of the range - bool at_end() { + bool extends_to_end() { return end == hobject_t::get_max(); } diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 72ece8cedb0dd..2d3b0bd08e24b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -800,6 +800,9 @@ void ReplicatedPG::do_sub_op(MOSDSubOp *op) case CEPH_OSD_OP_PUSH: sub_op_push(op); return; + case CEPH_OSD_OP_DELETE: + sub_op_remove(op); + return; case CEPH_OSD_OP_SCRUB_RESERVE: sub_op_scrub_reserve(op); return; @@ -2951,7 +2954,8 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now, for (unsigned i=1; iwaitfor_ack.insert(peer); repop->waitfor_disk.insert(peer); @@ -2975,7 +2979,14 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now, wr->set_data(repop->ctx->op->get_data()); // _copy_ bufferlist } else { // ship resulting transaction, log entries, and pg_stats - ::encode(repop->ctx->op_t, wr->get_data()); + if (soid > pinfo.last_backfill) { + dout(10) << "issue_repop shipping empty opt to osd." << peer << ", object beyond last_backfill " + << pinfo.last_backfill << dendl; + ObjectStore::Transaction t; + ::encode(t, wr->get_data()); + } else { + ::encode(repop->ctx->op_t, wr->get_data()); + } ::encode(repop->ctx->log, wr->logbl); wr->pg_stats = info.stats; } @@ -2984,10 +2995,9 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now, osd->cluster_messenger->send_message(wr, get_osdmap()->get_cluster_inst(peer)); // keep peer_info up to date - Info &in = peer_info[peer]; - in.last_update = ctx->at_version; - if (in.last_complete == old_last_update) - in.last_update = ctx->at_version; + if (pinfo.last_complete == pinfo.last_update) + pinfo.last_update = ctx->at_version; + pinfo.last_update = ctx->at_version; } } @@ -4100,8 +4110,7 @@ void ReplicatedPG::sub_op_push_reply(MOSDSubOpReply *reply) pi->data_subset_pushing, pi->clone_subsets); } else { // done! - if (peer_missing[peer].is_missing(soid)) // so that we ignore backfill; imprecise! - peer_missing[peer].got(soid, pi->version); + peer_missing[peer].got(soid, pi->version); pushing[soid].erase(peer); pi = NULL; @@ -4613,6 +4622,17 @@ void ReplicatedPG::_failed_push(MOSDSubOp *op) op->put(); } +void ReplicatedPG::sub_op_remove(MOSDSubOp *op) +{ + dout(7) << "sub_op_remove " << op->poid << dendl; + + ObjectStore::Transaction *t = new ObjectStore::Transaction; + remove_object_with_snap_hardlinks(*t, op->poid); + int r = osd->store->queue_transaction(&osr, t); + assert(r == 0); + + op->put(); +} eversion_t ReplicatedPG::pick_newest_available(const hobject_t& oid) @@ -5273,31 +5293,23 @@ int ReplicatedPG::recover_backfill(int max) Info& pinfo = peer_info[backfill_target]; BackfillInterval& pbi = peer_backfill_info; - dout(10) << " peer osd." << backfill_target << " " << pinfo + hobject_t pos = pbi.begin; + + dout(10) << " peer osd." << backfill_target + << " pos " << pos + << " info " << pinfo << " interval " << pbi.begin << "-" << pbi.end << " " << pbi.objects.size() << " objects" << dendl; - // does the pg exist yet on the peer? - if (pinfo.dne()) { - // ok, we know they have no objects. - pbi.end = hobject_t::get_max(); - - // fill in pinfo - pinfo.last_update = info.last_update; - pinfo.log_tail = info.last_update; - pinfo.last_backfill = hobject_t(); - pinfo.history = info.history; - dout(10) << " peer osd." << backfill_target << " pg dne; setting info to " << pinfo << dendl; - - // create pg on remote - MOSDPGInfo *mp = new MOSDPGInfo(get_osdmap()->get_epoch()); - mp->pg_info.push_back(pinfo); - osd->cluster_messenger->send_message(mp, get_osdmap()->get_cluster_inst(backfill_target)); - } + // re-scan our local interval to cope with recent changes + dout(10) << " rescanning local backfill_info from " << pos << dendl; + backfill_info.clear(); + osr.flush(); + scan_range(pos, 10, 20, &backfill_info); int ops = 0; while (ops < max) { - if (!backfill_info.at_end() && (backfill_info.end <= pbi.begin || + if (!backfill_info.extends_to_end() && (backfill_info.end <= pbi.begin || backfill_info.empty())) { osr.flush(); scan_range(backfill_info.end, 10, 20, &backfill_info); @@ -5307,7 +5319,7 @@ int ReplicatedPG::recover_backfill(int max) << " " << backfill_info.objects << dendl; dout(20) << " peer backfill " << pbi.begin << "-" << pbi.end << " " << pbi.objects << dendl; - if (!pbi.at_end() && (pbi.end <= backfill_info.begin || + if (!pbi.extends_to_end() && (pbi.end <= backfill_info.begin || pbi.empty())) { epoch_t e = get_osdmap()->get_epoch(); MOSDPGScan *m = new MOSDPGScan(MOSDPGScan::OP_SCAN_GET_DIGEST, e, e, info.pgid, @@ -5320,9 +5332,9 @@ int ReplicatedPG::recover_backfill(int max) if (backfill_info.empty()) { // this only happens when we reach the end of the collection. - assert(backfill_info.at_end()); + assert(backfill_info.extends_to_end()); if (pbi.empty()) { - assert(pbi.at_end()); + assert(pbi.extends_to_end()); dout(10) << " reached end for both local and peer" << dendl; if (pbi.begin != hobject_t::get_max()) { pbi.begin = hobject_t::get_max(); @@ -5354,10 +5366,10 @@ int ReplicatedPG::recover_backfill(int max) eversion_t mv = backfill_info.objects.begin()->second; if (pbi.empty()) { - assert(pbi.at_end()); + assert(pbi.extends_to_end()); dout(20) << " pushing local " << my_first << " " << backfill_info.objects.begin()->second << " to peer osd." << backfill_target << dendl; - push_backfill_object(my_first, mv, backfill_target); + push_backfill_object(my_first, mv, eversion_t(), backfill_target); backfill_info.pop_front(); pbi.begin = my_first; ++ops; @@ -5371,21 +5383,25 @@ int ReplicatedPG::recover_backfill(int max) dout(20) << " removing peer " << peer_first << " <= local " << my_first << dendl; send_remove_op(peer_first, pv, backfill_target); pbi.pop_front(); + if (pbi.begin < backfill_info.begin) + pbi.begin = backfill_info.begin; } else if (peer_first == my_first) { if (pv == mv) { dout(20) << " keeping peer " << peer_first << " " << pv << dendl; } else { dout(20) << " replacing peer " << peer_first << " with local " << mv << dendl; - push_backfill_object(my_first, mv, backfill_target); + push_backfill_object(my_first, mv, pv, backfill_target); ++ops; } pbi.pop_front(); backfill_info.pop_front(); + if (pbi.begin < backfill_info.begin) + pbi.begin = backfill_info.begin; } else { // peer_first > my_first dout(20) << " pushing local " << my_first << " " << mv << " to peer osd." << backfill_target << dendl; - push_backfill_object(my_first, mv, backfill_target); + push_backfill_object(my_first, mv, eversion_t(), backfill_target); backfill_info.pop_front(); ++ops; } @@ -5394,21 +5410,24 @@ int ReplicatedPG::recover_backfill(int max) hobject_t bound = pbi.begin; bound.back_up_to_bounding_key(); if (pinfo.last_backfill < bound) { - pinfo.last_backfill = bound; - dout(10) << " peer osd." << backfill_target << " info.last_backfill now " << pinfo.last_backfill << dendl; epoch_t e = get_osdmap()->get_epoch(); MOSDPGBackfill *m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_PROGRESS, e, e, info.pgid); - m->last_backfill = pinfo.last_backfill; + m->last_backfill = bound; osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(backfill_target)); } return ops; } -void ReplicatedPG::push_backfill_object(hobject_t oid, eversion_t v, int peer) +void ReplicatedPG::push_backfill_object(hobject_t oid, eversion_t v, eversion_t have, int peer) { dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." << peer << dendl; + + // object is now below the waterline; mark it missing. + peer_info[peer].last_backfill = oid; + peer_missing[peer].add(oid, v, have); + start_recovery_op(oid); ObjectContext *obc = get_object_context(oid, OLOC_BLANK, false); obc->ondisk_read_lock(); @@ -5422,6 +5441,7 @@ void ReplicatedPG::scan_range(hobject_t begin, int min, int max, BackfillInterva assert(is_locked()); dout(10) << "scan_range from " << begin << dendl; bi->begin = begin; + bi->objects.clear(); // for good measure vector ls; ls.reserve(max); diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 4d5db261a89b5..bb1a0d2b73608 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -707,6 +707,8 @@ protected: } }; + void sub_op_remove(MOSDSubOp *op); + void sub_op_modify(MOSDSubOp *op); void sub_op_modify_applied(RepModify *rm); void sub_op_modify_commit(RepModify *rm); diff --git a/src/test/test_backfill.sh b/src/test/test_backfill.sh index 75f2fa12214e1..365f457459cf6 100755 --- a/src/test/test_backfill.sh +++ b/src/test/test_backfill.sh @@ -4,3 +4,4 @@ CEPH_NUM_OSD=3 ./vstart.sh -d -n -x ./rados -p data bench 15 write -b 4096 ./ceph osd out 0 +./rados -p data bench 600 write -b 4096 -- 2.39.5