// Message carrying a scrub request: a cluster fsid, a list of PGs, and a
// `repair` flag. An empty scrub_pgs list is printed as "osd"; when `repair`
// is set, print() appends " repair".
struct MOSDScrub : public Message {
ceph_fsid fsid;
vector<pg_t> scrub_pgs;
+ bool repair;
// Every constructor initializes `repair`: encode_payload() and print() read
// it unconditionally, so it must never be left indeterminate.
- MOSDScrub() {}
+ MOSDScrub() : repair(false) {}
MOSDScrub(ceph_fsid& f) :
Message(MSG_OSD_SCRUB),
- fsid(f) {}
+ fsid(f), repair(false) {}
- MOSDScrub(ceph_fsid& f, vector<pg_t>& pgs) :
+ MOSDScrub(ceph_fsid& f, vector<pg_t>& pgs, bool r) :
Message(MSG_OSD_SCRUB),
- fsid(f), scrub_pgs(pgs) {}
+ fsid(f), scrub_pgs(pgs), repair(r) {}
const char *get_type_name() { return "scrub"; }
// Prints "scrub(osd)" or "scrub(<pgs>)", with " repair" before the closing
// parenthesis when the repair flag is set.
void print(ostream& out) {
out << "scrub(";
if (scrub_pgs.empty())
- out << "osd)";
+ out << "osd";
else
- out << scrub_pgs << ")";
+ out << scrub_pgs;
+ if (repair)
+ out << " repair";
+ out << ")";
}
// Payload layout: fsid, scrub_pgs, repair. decode_payload() reads the
// fields in the same order they are written here.
void encode_payload() {
::encode(fsid, payload);
::encode(scrub_pgs, payload);
+ ::encode(repair, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
::decode(fsid, p);
::decode(scrub_pgs, p);
+ ::decode(repair, p);
}
};
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_orig_source_inst()) {
// yup.
- dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst() << dendl;
+ dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
+ << " == " << osdmap.get_inst(from) << dendl;
_booted(m);
return true;
}
// Answer a map request from `who`, or record it if paxos is not readable.
// When readable: start == 0 calls send_full(who); any other value calls
// send_incremental(who, start). The debug output now includes `start`.
void OSDMonitor::send_latest(entity_inst_t who, epoch_t start)
{
if (paxos->is_readable()) {
- dout(5) << "send_latest to " << who << " now" << dendl;
+ dout(5) << "send_latest to " << who << " start " << start << " now" << dendl;
if (start == 0)
send_full(who);
else
send_incremental(who, start);
} else {
- dout(5) << "send_latest to " << who << " later" << dendl;
+ dout(5) << "send_latest to " << who << " start " << start << " later" << dendl;
// Not readable: store the requested start epoch keyed by requester
// (presumably serviced once paxos becomes readable -- confirm at the consumer).
waiting_for_map[who] = start;
}
}
} else
ss << "invalid pgid '" << m->cmd[2] << "'";
}
- else if (m->cmd[1] == "scrub" && m->cmd.size() == 3) {
+ else if ((m->cmd[1] == "scrub" || m->cmd[1] == "repair") && m->cmd.size() == 3) {
pg_t pgid;
r = -EINVAL;
if (pgid.parse(m->cmd[2].c_str())) {
if (mon->osdmon()->osdmap.is_up(osd)) {
vector<pg_t> pgs(1);
pgs[0] = pgid;
- mon->messenger->send_message(new MOSDScrub(mon->monmap->fsid, pgs),
+ mon->messenger->send_message(new MOSDScrub(mon->monmap->fsid, pgs,
+ m->cmd[1] == "repair"),
mon->osdmon()->osdmap.get_inst(osd));
ss << "instructing pg " << pgid << " on osd" << osd << " to scrub";
r = 0;
p != pg_map.end();
p++) {
PG *pg = p->second;
- if (pg->is_primary() && !pg->is_scrubbing())
- scrub_wq.queue(pg);
+ if (pg->is_primary()) {
+ if (m->repair)
+ pg->state_set(PG_STATE_REPAIR);
+ if (!pg->is_scrubbing())
+ scrub_wq.queue(pg);
+ }
}
} else {
for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
p++)
if (pg_map.count(*p)) {
PG *pg = pg_map[*p];
- if (pg->is_primary() && !pg->is_scrubbing())
- scrub_wq.queue(pg);
+ if (pg->is_primary()) {
+ if (m->repair)
+ pg->state_set(PG_STATE_REPAIR);
+ if (!pg->is_scrubbing())
+ scrub_wq.queue(pg);
+ }
}
}
}
// Position the log recovery cursors from info.last_complete.
// log.complete_to is advanced to the first log entry whose version is not
// older than info.last_complete; on the primary, log.requested_to is set to
// the same position. Such an entry must exist (asserted).
+void PG::init_recovery_pointers()
+{
+ dout(10) << "init_recovery_pointers" << dendl;
+ log.complete_to = log.log.begin();
// Check for end() before dereferencing, so an empty log (or one whose
// entries are all older than last_complete) trips the assert below instead
// of reading through a past-the-end iterator.
+ while (log.complete_to != log.log.end() &&
+ log.complete_to->version < info.last_complete)
+ log.complete_to++;
+ assert(log.complete_to != log.log.end());
+
+ if (is_primary())
+ log.requested_to = log.complete_to;
+}
+
void PG::activate(ObjectStore::Transaction& t,
map<int, MOSDPGInfo*> *activator_map)
{
} else {
dout(10) << "activate - not complete, " << missing << dendl;
- log.complete_to = log.log.begin();
- while (log.complete_to->version < info.last_complete)
- log.complete_to++;
- assert(log.complete_to != log.log.end());
- dout(10) << "activate - complete_to = " << log.complete_to->version << dendl;
+ init_recovery_pointers();
+ dout(10) << "activate - complete_to = " << log.complete_to->version << dendl;
if (is_primary()) {
- // start recovery
dout(10) << "activate - starting recovery" << dendl;
- log.requested_to = log.complete_to;
osd->queue_for_recovery(this);
}
}
finish_sync_event = 0;
purge_strays();
update_stats();
+
+ if (state_test(PG_STATE_INCONSISTENT)) {
+ dout(10) << "_finish_recovery requeueing for scrub" << dendl;
+ osd->scrub_wq.queue(this);
+ }
+
} else {
dout(10) << "_finish_recovery -- stale" << dendl;
}
// Record that one peer's copy of a scrubbed object needs to be recovered.
//   po       - scrub-map entry for the object; its "version" attr supplies
//              the version recorded as missing
//   bad_peer - osd whose copy is bad or absent
//   ok_peer  - osd inserted into missing_loc as a source when the bad copy
//              belongs to acting[0]
// The bad peer is dropped from uptodate_set and the PG is queued for recovery.
+void PG::repair_object(ScrubMap::object *po, int bad_peer, int ok_peer)
+{
+ eversion_t obj_version;
+ po->attrs["version"].copy_out(0, sizeof(obj_version), (char *)&obj_version);
+
+ const bool bad_is_first_acting = (bad_peer == acting[0]);
+ if (bad_is_first_acting) {
+ missing.add(po->poid.oid, obj_version, eversion_t());
+ missing_loc[po->poid.oid].insert(ok_peer);
+
+ // primary recovery is log driven
+ if (obj_version < info.last_complete) {
+ info.last_complete = obj_version;
+ init_recovery_pointers();
+ }
+ } else {
+ peer_missing[bad_peer].add(po->poid.oid, obj_version, eversion_t());
+ }
+
+ uptodate_set.erase(bad_peer);
+ osd->queue_for_recovery(this);
+}
+
void PG::scrub()
{
stringstream ss;
ScrubMap scrubmap;
int errors = 0;
+ bool repair = state_test(PG_STATE_REPAIR);
+
osd->map_lock.get_read();
lock();
while (1) {
ScrubMap::object *po = 0;
- bool missing = false;
+ int pi = -1;
+ bool anymissing = false;
for (unsigned i=0; i<acting.size(); i++) {
if (p[i] == m[i]->objects.end()) {
- missing = true;
+ anymissing = true;
continue;
}
- if (!po)
+ if (!po) {
po = &(*p[i]);
+ pi = i;
+ }
else if (po->poid != p[i]->poid) {
- missing = true;
- if (po->poid > p[i]->poid)
+ anymissing = true;
+ if (po->poid > p[i]->poid) {
po = &(*p[i]);
+ pi = i;
+ }
}
}
if (!po)
break;
- if (missing) {
+ if (anymissing) {
for (unsigned i=0; i<acting.size(); i++) {
if (p[i] == m[i]->objects.end() || po->poid != p[i]->poid) {
ss << info.pgid << " scrub osd" << acting[i] << " missing " << po->poid;
osd->get_logclient()->log(LOG_ERROR, ss);
num_missing++;
+
+ if (repair)
+ repair_object(po, acting[i], acting[pi]);
} else
p[i]++;
}
}
// compare
- dout(10) << " po is " << (void*)po << dendl;
- dout(10) << " po is " << po->poid << dendl;
-
bool ok = true;
for (unsigned i=1; i<acting.size(); i++) {
+ bool peerok = true;
if (po->size != p[i]->size) {
ss << info.pgid << " scrub osd" << acting[i] << " " << po->poid
<< " size " << p[i]->size << " != " << po->size;
osd->get_logclient()->log(LOG_ERROR, ss);
- ok = false;
+ peerok = ok = false;
num_bad++;
}
if (po->attrs.size() != p[i]->attrs.size()) {
ss << info.pgid << " scrub osd" << acting[i] << " " << po->poid
<< " attr count " << p[i]->attrs.size() << " != " << po->attrs.size();
osd->get_logclient()->log(LOG_ERROR, ss);
- ok = false;
+ peerok = ok = false;
num_bad++;
}
for (map<nstring,bufferptr>::iterator q = po->attrs.begin(); q != po->attrs.end(); q++) {
ss << info.pgid << " scrub osd" << acting[i] << " " << po->poid
<< " attr " << q->first << " value mismatch";
osd->get_logclient()->log(LOG_ERROR, ss);
- ok = false;
+ peerok = ok = false;
num_bad++;
}
} else {
ss << info.pgid << " scrub osd" << acting[i] << " " << po->poid
<< " attr " << q->first << " missing";
osd->get_logclient()->log(LOG_ERROR, ss);
- ok = false;
+ peerok = ok = false;
num_bad++;
}
}
+
+ if (!peerok && repair)
+ repair_object(po, acting[i], acting[pi]);
}
if (ok)
if (num_missing || num_bad) {
ss << info.pgid << " scrub " << num_missing << " missing, " << num_bad << " bad objects";
osd->get_logclient()->log(LOG_ERROR, ss);
+ state_set(PG_STATE_INCONSISTENT);
+ if (repair)
+ state_clear(PG_STATE_CLEAN);
}
}
ss << info.pgid << " scrub " << errors << " errors";
osd->get_logclient()->log(errors ? LOG_ERROR:LOG_INFO, ss);
+ if (!errors && repair)
+ state_clear(PG_STATE_INCONSISTENT);
+ state_clear(PG_STATE_REPAIR);
// finish up
info.stats.last_scrub = info.last_update;
virtual void cancel_recovery() = 0;
virtual int start_recovery_ops(int max) = 0;
// Sets log.complete_to from info.last_complete and, on the primary,
// log.requested_to to the same position.
+ void init_recovery_pointers();
+
void purge_strays();
// -- scrub --
map<int,ScrubMap> peer_scrub_map;
// Marks the object in `po` as missing on bad_peer (ok_peer is recorded as a
// source when bad_peer is acting[0]) and queues the PG for recovery.
+ void repair_object(ScrubMap::object *po, int bad_peer, int ok_peer);
void scrub();
void build_scrub_map(ScrubMap &map);
virtual int _scrub(ScrubMap &map) { return 0; }
/*
* pg states
*
* Each state is a single distinct bit. The removed decimal list gave
* PG_STATE_INCONSISTENT and PG_STATE_REPAIR the same value (2048); the
* shift-based list gives every state its own bit.
*/
-#define PG_STATE_CREATING 1 // creating
-#define PG_STATE_ACTIVE 2 // i am active. (primary: replicas too)
-#define PG_STATE_CLEAN 4 // peers are complete, clean of stray replicas.
-#define PG_STATE_CRASHED 8 // all replicas went down, clients needs to replay
-#define PG_STATE_DOWN 16 // a needed replica is down, PG offline
-#define PG_STATE_REPLAY 32 // crashed, waiting for replay
-#define PG_STATE_STRAY 64 // i must notify the primary i exist.
-#define PG_STATE_SPLITTING 128 // i am splitting
-#define PG_STATE_SCRUBBING 256 // scrubbing
-#define PG_STATE_SCRUBQ 512 // queued for scrub
-#define PG_STATE_DEGRADED 1024 // pg membership not complete
-#define PG_STATE_INCONSISTENT 2048 // pg replicas are inconsistent (but shouldn't be)
-#define PG_STATE_REPAIR 2048 // pg should repair on next scrub
-#define PG_STATE_PEERING 4096
+#define PG_STATE_CREATING (1<<0) // creating
+#define PG_STATE_ACTIVE (1<<1) // i am active. (primary: replicas too)
+#define PG_STATE_CLEAN (1<<2) // peers are complete, clean of stray replicas.
+#define PG_STATE_CRASHED (1<<3) // all replicas went down, clients need to replay
+#define PG_STATE_DOWN (1<<4) // a needed replica is down, PG offline
+#define PG_STATE_REPLAY (1<<5) // crashed, waiting for replay
+#define PG_STATE_STRAY (1<<6) // i must notify the primary i exist.
+#define PG_STATE_SPLITTING (1<<7) // i am splitting
+#define PG_STATE_SCRUBBING (1<<8) // scrubbing
+#define PG_STATE_SCRUBQ (1<<9) // queued for scrub
+#define PG_STATE_DEGRADED (1<<10) // pg membership not complete
+#define PG_STATE_INCONSISTENT (1<<11) // pg replicas are inconsistent (but shouldn't be)
+#define PG_STATE_PEERING (1<<12) // pg is (re)peering
+#define PG_STATE_REPAIR (1<<13) // pg should repair on next scrub
static inline std::string pg_state_string(int state) {
std::string st;