- osdmon needs to lower-bound old osdmap versions it keeps around?
osd
+- pg split should be a work queue
- pg split needs to fix up pg stats. this is tricky with the clone overlap business...
-- how does an admin intervene when a pg needs to repeer despite a dead osd?
- generalize ack semantics? or just change ack from memory to journal? memory/journal/disk...
- rdlocks
- optimize remove wrt recovery pushes
* whenever the wire protocol changes. try to keep this string length
* constant.
*/
-#define CEPH_BANNER "ceph 010\n"
+#define CEPH_BANNER "ceph 011\n"
#define CEPH_BANNER_MAX_LEN 30
/*
#define CEPH_MON_PROTOCOL 2
#define CEPH_CLIENT_PROTOCOL 1
-#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v004"
-#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v003"
+#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v005"
+#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v004"
/*
* types in this file are defined as little-endian, and are
return true;
}
}
- else if (m->cmd[1] == "down" && m->cmd.size() > 2) {
+ else if (m->cmd[1] == "down" && m->cmd.size() == 3) {
long osd = strtol(m->cmd[2].c_str(), 0, 10);
if (osdmap.is_down(osd)) {
ss << "osd" << osd << " is already down";
return true;
}
}
- else if (m->cmd[1] == "out" && m->cmd.size() > 2) {
+ else if (m->cmd[1] == "out" && m->cmd.size() == 3) {
long osd = strtol(m->cmd[2].c_str(), 0, 10);
if (osdmap.is_out(osd)) {
ss << "osd" << osd << " is already out";
return true;
}
}
- else if (m->cmd[1] == "in" && m->cmd.size() > 2) {
+ else if (m->cmd[1] == "in" && m->cmd.size() == 3) {
long osd = strtol(m->cmd[2].c_str(), 0, 10);
if (osdmap.is_in(osd)) {
ss << "osd" << osd << " is already in";
return true;
}
}
- else if (m->cmd[1] == "reweight" && m->cmd.size() > 3) {
+ else if (m->cmd[1] == "reweight" && m->cmd.size() == 4) {
long osd = strtol(m->cmd[2].c_str(), 0, 10);
float w = strtof(m->cmd[3].c_str(), 0);
long ww = (int)((float)CEPH_OSD_IN*w);
return true;
}
}
+ else if (m->cmd[1] == "lost" && m->cmd.size() >= 3) {
+ long osd = strtol(m->cmd[2].c_str(), 0, 10);
+ if (m->cmd.size() < 4 ||
+ m->cmd[3] != "--yes-i-really-mean-it") {
+ ss << "are you SURE? this might mean real, permanent data loss. pass --yes-i-really-mean-it if you really do.";
+ }
+ else if (!osdmap.exists(osd) || !osdmap.is_down(osd)) {
+ ss << "osd" << osd << " is not down or doesn't exist";
+ } else {
+ epoch_t e = osdmap.get_info(osd).down_at;
+ pending_inc.new_lost[osd] = e;
+ ss << "marked osd lost in epoch " << e;
+ getline(ss, rs);
+ paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
+ return true;
+ }
+ }
else {
ss << "unknown command " << m->cmd[1];
}
* _finished_, or during which the osd cleanly shut down. when
* possible, we push this forward to the epoch the osd was eventually
* marked down.
+ *
+ * the lost_at is used to allow build_prior to proceed without waiting
+ * for an osd to recover. In certain cases, progress may be blocked
+ * because an osd is down that may contain updates (i.e., a pg may have
+ * gone rw during an interval). If the osd can't be brought online, we
+ * can force things to proceed knowing that we _might_ be losing some
+ * acked writes. If the osd comes back to life later, that's fine too,
+ * but those writes will still be lost (the divergent objects will be
+ * thrown out).
*/
struct osd_info_t {
epoch_t last_clean_first; // last interval that ended with a clean osd shutdown
epoch_t up_from; // epoch osd marked up
epoch_t up_thru; // lower bound on actual osd death (if > up_from)
epoch_t down_at; // upper bound on actual osd death (if > up_from)
+ epoch_t lost_at; // last epoch we decided data was "lost"
osd_info_t() : last_clean_first(0), last_clean_last(0),
- up_from(0), up_thru(0), down_at(0) {}
+ up_from(0), up_thru(0), down_at(0), lost_at(0) {}
void encode(bufferlist& bl) const {
::encode(last_clean_first, bl);
::encode(last_clean_last, bl);
::encode(up_from, bl);
::encode(up_thru, bl);
::encode(down_at, bl);
+ ::encode(lost_at, bl);
}
void decode(bufferlist::iterator& bl) {
::decode(last_clean_first, bl);
::decode(up_from, bl);
::decode(up_thru, bl);
::decode(down_at, bl);
+ ::decode(lost_at, bl);
}
};
WRITE_CLASS_ENCODER(osd_info_t)
map<int32_t,uint32_t> new_weight;
map<int32_t,epoch_t> new_up_thru;
map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval;
+ map<int32_t,epoch_t> new_lost;
map<pg_t,uint32_t> new_pg_swap_primary;
list<pg_t> old_pg_swap_primary;
// extended
::encode(new_up_thru, bl);
::encode(new_last_clean_interval, bl);
+ ::encode(new_lost, bl);
::encode(new_pg_swap_primary, bl);
::encode(old_pg_swap_primary, bl);
::encode(new_max_snap, bl);
// extended
::decode(new_up_thru, p);
::decode(new_last_clean_interval, p);
+ ::decode(new_lost, p);
::decode(new_pg_swap_primary, p);
::decode(old_pg_swap_primary, p);
::decode(new_max_snap, p);
osd_info[i->first].last_clean_first = i->second.first;
osd_info[i->first].last_clean_last = i->second.second;
}
+ for (map<int32_t,epoch_t>::iterator p = inc.new_lost.begin(); p != inc.new_lost.end(); p++)
+ osd_info[p->first].lost_at = p->second;
// pg swap
for (map<pg_t,uint32_t>::iterator i = inc.new_pg_swap_primary.begin();
OSDMap *lastmap = osd->get_map(interval.last);
int crashed = 0;
+ int need_down = 0;
bool any_survived = false;
for (unsigned i=0; i<interval.acting.size(); i++) {
const osd_info_t& pinfo = osd->osdmap->get_info(interval.acting[i]);
if (interval.first <= info.history.last_epoch_started &&
interval.last >= info.history.last_epoch_started)
any_up_now = true;
+ } else if (pinfo.lost_at > interval.first) {
+ dout(10) << "build_prior prior osd" << interval.acting[i]
+ << " is down, but marked lost at " << pinfo.lost_at << dendl;
+ prior_set_down.insert(interval.acting[i]);
} else {
dout(10) << "build_prior prior osd" << interval.acting[i]
<< " is down, must notify mon" << dendl;
must_notify_mon = true;
+ need_down++;
prior_set_down.insert(interval.acting[i]);
}
}
// if nobody survived this interval, and we may have gone rw,
// then we need to wait for one of those osds to recover to
// ensure that we haven't lost any information.
- if (!any_survived && interval.maybe_went_rw) {
+ if (!any_survived && need_down && interval.maybe_went_rw) {
// fixme: how do we identify a "clean" shutdown anyway?
- dout(10) << "build_prior possibly went active+rw, no survivors, including" << dendl;
+ dout(10) << "build_prior " << need_down
+ << " osds possibly went active+rw, no survivors, including" << dendl;
for (unsigned i=0; i<interval.acting.size(); i++)
if (osd->osdmap->is_down(interval.acting[i])) {
prior_set.insert(interval.acting[i]);
prior_set_down.erase(interval.acting[i]);
+ state_set(PG_STATE_DOWN);
}
some_down = true;
}
}
- if (info.history.last_epoch_started < info.history.same_since &&
- !any_up_now) {
- dout(10) << "build_prior no osds are up from the last epoch started, PG is down for now." << dendl;
- state_set(PG_STATE_DOWN);
- }
-
dout(10) << "build_prior = " << prior_set
<< " down = " << prior_set_down << " ..."
<< (is_crashed() ? " crashed":"")