From: Sage Weil Date: Tue, 9 Dec 2008 21:33:33 +0000 (-0800) Subject: osd: 'pg repair <pgid>' to repair an inconsistent pg using replicas X-Git-Tag: v0.6~123 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ac6e86a21c5241f1d3ae642930fdf9784adc2efd;p=ceph.git osd: 'pg repair <pgid>' to repair an inconsistent pg using replicas --- diff --git a/src/messages/MOSDScrub.h b/src/messages/MOSDScrub.h index 8768be423b5b9..2d871afd3ae5f 100644 --- a/src/messages/MOSDScrub.h +++ b/src/messages/MOSDScrub.h @@ -25,32 +25,38 @@ struct MOSDScrub : public Message { ceph_fsid fsid; vector scrub_pgs; + bool repair; MOSDScrub() {} MOSDScrub(ceph_fsid& f) : Message(MSG_OSD_SCRUB), fsid(f) {} - MOSDScrub(ceph_fsid& f, vector& pgs) : + MOSDScrub(ceph_fsid& f, vector& pgs, bool r) : Message(MSG_OSD_SCRUB), - fsid(f), scrub_pgs(pgs) {} + fsid(f), scrub_pgs(pgs), repair(r) {} const char *get_type_name() { return "scrub"; } void print(ostream& out) { out << "scrub("; if (scrub_pgs.empty()) - out << "osd)"; + out << "osd"; else - out << scrub_pgs << ")"; + out << scrub_pgs; + if (repair) + out << " repair"; + out << ")"; } void encode_payload() { ::encode(fsid, payload); ::encode(scrub_pgs, payload); + ::encode(repair, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); ::decode(fsid, p); ::decode(scrub_pgs, p); + ::decode(repair, p); } }; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 9a4f37e0cfd5a..617098dca0778 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -481,7 +481,8 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m) if (osdmap.is_up(from) && osdmap.get_inst(from) == m->get_orig_source_inst()) { // yup. 
- dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst() << dendl; + dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst() + << " == " << osdmap.get_inst(from) << dendl; _booted(m); return true; } @@ -676,13 +677,13 @@ void OSDMonitor::send_to_waiting() void OSDMonitor::send_latest(entity_inst_t who, epoch_t start) { if (paxos->is_readable()) { - dout(5) << "send_latest to " << who << " now" << dendl; + dout(5) << "send_latest to " << who << " start " << start << " now" << dendl; if (start == 0) send_full(who); else send_incremental(who, start); } else { - dout(5) << "send_latest to " << who << " later" << dendl; + dout(5) << "send_latest to " << who << " start " << start << " later" << dendl; waiting_for_map[who] = start; } } diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index bed58889bb4ad..f0b16c7cbc2fe 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -587,7 +587,7 @@ bool PGMonitor::preprocess_command(MMonCommand *m) } else ss << "invalid pgid '" << m->cmd[2] << "'"; } - else if (m->cmd[1] == "scrub" && m->cmd.size() == 3) { + else if ((m->cmd[1] == "scrub" || m->cmd[1] == "repair") && m->cmd.size() == 3) { pg_t pgid; r = -EINVAL; if (pgid.parse(m->cmd[2].c_str())) { @@ -597,7 +597,8 @@ bool PGMonitor::preprocess_command(MMonCommand *m) if (mon->osdmon()->osdmap.is_up(osd)) { vector pgs(1); pgs[0] = pgid; - mon->messenger->send_message(new MOSDScrub(mon->monmap->fsid, pgs), + mon->messenger->send_message(new MOSDScrub(mon->monmap->fsid, pgs, + m->cmd[1] == "repair"), mon->osdmon()->osdmap.get_inst(osd)); ss << "instructing pg " << pgid << " on osd" << osd << " to scrub"; r = 0; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 1d0821f409650..b941147a50f18 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1632,8 +1632,12 @@ void OSD::handle_scrub(MOSDScrub *m) p != pg_map.end(); p++) { PG *pg = p->second; - if (pg->is_primary() && !pg->is_scrubbing()) - scrub_wq.queue(pg); + if (pg->is_primary()) { + if 
(m->repair) + pg->state_set(PG_STATE_REPAIR); + if (!pg->is_scrubbing()) + scrub_wq.queue(pg); + } } } else { for (vector::iterator p = m->scrub_pgs.begin(); @@ -1641,8 +1645,12 @@ void OSD::handle_scrub(MOSDScrub *m) p++) if (pg_map.count(*p)) { PG *pg = pg_map[*p]; - if (pg->is_primary() && !pg->is_scrubbing()) - scrub_wq.queue(pg); + if (pg->is_primary()) { + if (m->repair) + pg->state_set(PG_STATE_REPAIR); + if (!pg->is_scrubbing()) + scrub_wq.queue(pg); + } } } diff --git a/src/osd/PG.cc b/src/osd/PG.cc index cdd8aa7af66fb..e014454738a33 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1174,6 +1174,18 @@ void PG::peer(ObjectStore::Transaction& t, } +void PG::init_recovery_pointers() +{ + dout(10) << "init_recovery_pointers" << dendl; + log.complete_to = log.log.begin(); + while (log.complete_to->version < info.last_complete) + log.complete_to++; + assert(log.complete_to != log.log.end()); + + if (is_primary()) + log.requested_to = log.complete_to; +} + void PG::activate(ObjectStore::Transaction& t, map *activator_map) { @@ -1228,16 +1240,11 @@ void PG::activate(ObjectStore::Transaction& t, } else { dout(10) << "activate - not complete, " << missing << dendl; - log.complete_to = log.log.begin(); - while (log.complete_to->version < info.last_complete) - log.complete_to++; - assert(log.complete_to != log.log.end()); - dout(10) << "activate - complete_to = " << log.complete_to->version << dendl; + init_recovery_pointers(); + dout(10) << "activate - complete_to = " << log.complete_to->version << dendl; if (is_primary()) { - // start recovery dout(10) << "activate - starting recovery" << dendl; - log.requested_to = log.complete_to; osd->queue_for_recovery(this); } } @@ -1403,6 +1410,12 @@ void PG::_finish_recovery(Context *c) finish_sync_event = 0; purge_strays(); update_stats(); + + if (state_test(PG_STATE_INCONSISTENT)) { + dout(10) << "_finish_recovery requeueing for scrub" << dendl; + osd->scrub_wq.queue(this); + } + } else { dout(10) << "_finish_recovery -- 
stale" << dendl; } @@ -1886,12 +1899,34 @@ void PG::build_scrub_map(ScrubMap &map) +void PG::repair_object(ScrubMap::object *po, int bad_peer, int ok_peer) +{ + eversion_t v; + po->attrs["version"].copy_out(0, sizeof(v), (char *)&v); + if (bad_peer != acting[0]) { + peer_missing[bad_peer].add(po->poid.oid, v, eversion_t()); + } else { + missing.add(po->poid.oid, v, eversion_t()); + missing_loc[po->poid.oid].insert(ok_peer); + + // primary recovery is log driven + if (v < info.last_complete) { + info.last_complete = v; + init_recovery_pointers(); + } + } + uptodate_set.erase(bad_peer); + osd->queue_for_recovery(this); +} + void PG::scrub() { stringstream ss; ScrubMap scrubmap; int errors = 0; + bool repair = state_test(PG_STATE_REPAIR); + osd->map_lock.get_read(); lock(); @@ -1983,28 +2018,36 @@ void PG::scrub() while (1) { ScrubMap::object *po = 0; - bool missing = false; + int pi = -1; + bool anymissing = false; for (unsigned i=0; iobjects.end()) { - missing = true; + anymissing = true; continue; } - if (!po) + if (!po) { po = &(*p[i]); + pi = i; + } else if (po->poid != p[i]->poid) { - missing = true; - if (po->poid > p[i]->poid) + anymissing = true; + if (po->poid > p[i]->poid) { po = &(*p[i]); + pi = i; + } } } if (!po) break; - if (missing) { + if (anymissing) { for (unsigned i=0; iobjects.end() || po->poid != p[i]->poid) { ss << info.pgid << " scrub osd" << acting[i] << " missing " << po->poid; osd->get_logclient()->log(LOG_ERROR, ss); num_missing++; + + if (repair) + repair_object(po, acting[i], acting[pi]); } else p[i]++; } @@ -2012,23 +2055,21 @@ void PG::scrub() } // compare - dout(10) << " po is " << (void*)po << dendl; - dout(10) << " po is " << po->poid << dendl; - bool ok = true; for (unsigned i=1; isize != p[i]->size) { ss << info.pgid << " scrub osd" << acting[i] << " " << po->poid << " size " << p[i]->size << " != " << po->size; osd->get_logclient()->log(LOG_ERROR, ss); - ok = false; + peerok = ok = false; num_bad++; } if (po->attrs.size() != 
p[i]->attrs.size()) { ss << info.pgid << " scrub osd" << acting[i] << " " << po->poid << " attr count " << p[i]->attrs.size() << " != " << po->attrs.size(); osd->get_logclient()->log(LOG_ERROR, ss); - ok = false; + peerok = ok = false; num_bad++; } for (map::iterator q = po->attrs.begin(); q != po->attrs.end(); q++) { @@ -2037,17 +2078,20 @@ void PG::scrub() ss << info.pgid << " scrub osd" << acting[i] << " " << po->poid << " attr " << q->first << " value mismatch"; osd->get_logclient()->log(LOG_ERROR, ss); - ok = false; + peerok = ok = false; num_bad++; } } else { ss << info.pgid << " scrub osd" << acting[i] << " " << po->poid << " attr " << q->first << " missing"; osd->get_logclient()->log(LOG_ERROR, ss); - ok = false; + peerok = ok = false; num_bad++; } } + + if (!peerok && repair) + repair_object(po, acting[i], acting[pi]); } if (ok) @@ -2061,6 +2105,9 @@ void PG::scrub() if (num_missing || num_bad) { ss << info.pgid << " scrub " << num_missing << " missing, " << num_bad << " bad objects"; osd->get_logclient()->log(LOG_ERROR, ss); + state_set(PG_STATE_INCONSISTENT); + if (repair) + state_clear(PG_STATE_CLEAN); } } @@ -2093,6 +2140,9 @@ void PG::scrub() ss << info.pgid << " scrub " << errors << " errors"; osd->get_logclient()->log(errors ? 
LOG_ERROR:LOG_INFO, ss); + if (!errors && repair) + state_clear(PG_STATE_INCONSISTENT); + state_clear(PG_STATE_REPAIR); // finish up info.stats.last_scrub = info.last_update; diff --git a/src/osd/PG.h b/src/osd/PG.h index 58f9424f7c886..4949e57602ed9 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -705,6 +705,8 @@ public: virtual void cancel_recovery() = 0; virtual int start_recovery_ops(int max) = 0; + void init_recovery_pointers(); + void purge_strays(); @@ -724,6 +726,7 @@ public: // -- scrub -- map peer_scrub_map; + void repair_object(ScrubMap::object *po, int bad_peer, int ok_peer); void scrub(); void build_scrub_map(ScrubMap &map); virtual int _scrub(ScrubMap &map) { return 0; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 6addc1129cabb..973f54d23f175 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -344,20 +344,20 @@ inline ostream& operator<<(ostream& out, const osd_stat_t& s) { /* * pg states */ -#define PG_STATE_CREATING 1 // creating -#define PG_STATE_ACTIVE 2 // i am active. (primary: replicas too) -#define PG_STATE_CLEAN 4 // peers are complete, clean of stray replicas. -#define PG_STATE_CRASHED 8 // all replicas went down, clients needs to replay -#define PG_STATE_DOWN 16 // a needed replica is down, PG offline -#define PG_STATE_REPLAY 32 // crashed, waiting for replay -#define PG_STATE_STRAY 64 // i must notify the primary i exist. -#define PG_STATE_SPLITTING 128 // i am splitting -#define PG_STATE_SCRUBBING 256 // scrubbing -#define PG_STATE_SCRUBQ 512 // queued for scrub -#define PG_STATE_DEGRADED 1024 // pg membership not complete -#define PG_STATE_INCONSISTENT 2048 // pg replicas are inconsistent (but shouldn't be) -#define PG_STATE_REPAIR 2048 // pg should repair on next scrub -#define PG_STATE_PEERING 4096 +#define PG_STATE_CREATING (1<<0) // creating +#define PG_STATE_ACTIVE (1<<1) // i am active. (primary: replicas too) +#define PG_STATE_CLEAN (1<<2) // peers are complete, clean of stray replicas. 
+#define PG_STATE_CRASHED (1<<3) // all replicas went down, clients needs to replay +#define PG_STATE_DOWN (1<<4) // a needed replica is down, PG offline +#define PG_STATE_REPLAY (1<<5) // crashed, waiting for replay +#define PG_STATE_STRAY (1<<6) // i must notify the primary i exist. +#define PG_STATE_SPLITTING (1<<7) // i am splitting +#define PG_STATE_SCRUBBING (1<<8) // scrubbing +#define PG_STATE_SCRUBQ (1<<9) // queued for scrub +#define PG_STATE_DEGRADED (1<<10) // pg membership not complete +#define PG_STATE_INCONSISTENT (1<<11) // pg replicas are inconsistent (but shouldn't be) +#define PG_STATE_PEERING (1<<12) // pg is (re)peering +#define PG_STATE_REPAIR (1<<13) // pg should repair on next scrub static inline std::string pg_state_string(int state) { std::string st;