-> prior_set should be <A,B>, bc B may have independently applied updates.
ideas:
- - B can't activate, until N peers know that we have epoch 2
- - monitor can adjust failed_epoch to 1 if ALL peers in 2 confirm they didn't see B with 2
- - bah!
- can't activate pg when lone OSD without informing the monitor of alive_thru
- add alive_thru map to osdmap. any lone pg will not have activated if the osd is now down, and alive_thru does not include the given epoch.
3:
4: A C -> prior_set can be <A,C>, bc C would carry any epoch 2 updates
+
+1: A B
+2: C D .. can't have gone active
+
+1: A B
+2: B C B in prior_set, will
+
+
-> so: we need at least 1 osd from each epoch, IFF we make store sync on osdmap boundaries.
-> so, use calc_priors_during in build_prior, then make recovery code check for is_up
osd_stat_refresh_interval: .5,
+ osd_min_pg_size_without_alive: 2, // smallest pg we allow to activate without telling the monitor
+
osd_pg_bits: 6, // bits per osd
osd_object_layout: CEPH_OBJECT_LAYOUT_HASHINO,//LINEAR,//HASHINO,
osd_pg_layout: CEPH_PG_LAYOUT_CRUSH,//LINEAR,//CRUSH,
bool osd_exclusive_caching;
double osd_stat_refresh_interval;
+ int osd_min_pg_size_without_alive;
+
int osd_pg_bits;
int osd_object_layout;
int osd_pg_layout;
ceph_decode_copy(p, map->osd_state, map->max_osd);
*p += 4; /* skip length field (should match max) */
ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
- *p += 4; /* skip length field (should match max) */
- *p += map->max_osd * sizeof(u32); /* skip osd_alive_thru */
+
+ *p += sizeof(u32) + map->max_osd * sizeof(u32); /* osd_up_from */
+ *p += sizeof(u32) + map->max_osd * sizeof(u32); /* osd_up_thru */
/* pg primary swapping */
ceph_decode_32_safe(p, end, len, bad);
case MSG_OSD_BOOT:
case MSG_OSD_IN:
case MSG_OSD_OUT:
+ case MSG_OSD_ALIVE:
osdmon->dispatch(m);
break;
int from = m->get_source().num();
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_source_inst() &&
- osdmap.get_alive_thru(from) >= m->map_epoch) {
+ osdmap.get_up_thru(from) >= m->map_epoch) {
// yup.
dout(7) << "preprocess_alive e" << m->map_epoch << " dup from " << m->get_source_inst() << dendl;
_alive(m);
int from = m->get_source().num();
dout(7) << "prepare_alive e" << m->map_epoch << " from " << m->get_source_inst() << dendl;
- pending_inc.new_alive_thru[from] = m->map_epoch;
+ pending_inc.new_up_thru[from] = m->map_epoch;
paxos->wait_for_commit(new C_Alive(this,m ));
return true;
}
#include "messages/MPingAck.h"
#include "messages/MOSDBoot.h"
+#include "messages/MOSDAlive.h"
#include "messages/MOSDIn.h"
#include "messages/MOSDOut.h"
#include "messages/MOSDFailure.h"
case MSG_OSD_BOOT:
m = new MOSDBoot();
break;
+ case MSG_OSD_ALIVE:
+ m = new MOSDAlive();
+ break;
case MSG_OSD_IN:
m = new MOSDIn();
break;
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGCreate.h"
+#include "messages/MOSDAlive.h"
+
#include "messages/MPGStats.h"
#include "common/Logger.h"
memset(&my_stat, 0, sizeof(my_stat));
+ last_sent_alive = 0;
+
stat_ops = 0;
stat_qlen = 0;
stat_rd_ops = stat_rd_ops_shed_in = stat_rd_ops_shed_out = 0;
}
+void OSD::send_alive(epoch_t need)
+{
+ if (need > last_sent_alive) {
+ last_sent_alive = osdmap->get_epoch();
+ /* AAHHH FIXME, may need to retry */
+ int mon = monmap->pick_mon();
+ messenger->send_message(new MOSDAlive(osdmap->get_epoch()),
+ monmap->get_inst(mon));
+ }
+}
+
// -------------------------------------
void OSD::_refresh_my_stat(utime_t now)
};
+ // -- alive --
+ epoch_t last_sent_alive;
+ void send_alive(epoch_t need);
+
// -- stats --
DecayCounter stat_oprate;
int stat_ops; // ops since last heartbeat
map<int32_t,entity_addr_t> new_up;
map<int32_t,uint8_t> new_down;
map<int32_t,uint32_t> new_offload;
- map<int32_t,epoch_t> new_alive_thru;
+ map<int32_t,epoch_t> new_up_thru;
map<pg_t,uint32_t> new_pg_swap_primary;
list<pg_t> old_pg_swap_primary;
::encode(new_up, bl);
::encode(new_down, bl);
::encode(new_offload, bl);
- ::encode(new_alive_thru, bl);
+ ::encode(new_up_thru, bl);
::encode(new_pg_swap_primary, bl);
::encode(old_pg_swap_primary, bl);
}
::decode(new_up, p);
::decode(new_down, p);
::decode(new_offload, p);
- ::decode(new_alive_thru, p);
+ ::decode(new_up_thru, p);
::decode(new_pg_swap_primary, p);
::decode(old_pg_swap_primary, p);
}
int32_t max_osd;
vector<uint8_t> osd_state;
vector<entity_addr_t> osd_addr;
- vector<epoch_t> osd_alive_thru; // lower bound on _actual_ osd death. bumped by osd before activating pgs with no replicas.
+ vector<epoch_t> osd_up_from; // when it went up
+ vector<epoch_t> osd_up_thru; // lower bound on _actual_ osd death. bumped by osd before activating pgs with no replicas.
map<pg_t,uint32_t> pg_swap_primary; // force new osd to be pg primary (if already a member)
public:
int o = max_osd;
max_osd = m;
osd_state.resize(m);
- osd_alive_thru.resize(m);
+ osd_up_from.resize(m);
+ osd_up_thru.resize(m);
for (; o<max_osd; o++) {
osd_state[o] = 0;
- osd_alive_thru[o] = 0;
+ osd_up_from[o] = 0;
+ osd_up_thru[o] = 0;
}
osd_addr.resize(m);
}
return false;
}
- epoch_t get_alive_thru(int osd) {
+ epoch_t get_up_from(int osd) {
assert(exists(osd));
- return osd_alive_thru[osd];
+ return osd_up_from[osd];
+ }
+ epoch_t get_up_thru(int osd) {
+ assert(exists(osd));
+ return osd_up_thru[osd];
}
int get_any_up_osd() {
i++)
crush.set_offload(i->first, i->second);
- for (map<int32_t,epoch_t>::iterator i = inc.new_alive_thru.begin();
- i != inc.new_alive_thru.end();
+ for (map<int32_t,epoch_t>::iterator i = inc.new_up_thru.begin();
+ i != inc.new_up_thru.end();
i++)
- osd_alive_thru[i->first] = i->second;
+ osd_up_thru[i->first] = i->second;
for (map<int32_t,entity_addr_t>::iterator i = inc.new_up.begin();
i != inc.new_up.end();
i++) {
osd_state[i->first] |= CEPH_OSD_UP;
osd_addr[i->first] = i->second;
+ osd_up_from[i->first] = epoch;
//cout << "epoch " << epoch << " up osd" << i->first << " at " << i->second << endl;
}
::encode(max_osd, blist);
::encode(osd_state, blist);
::encode(osd_addr, blist);
- ::encode(osd_alive_thru, blist);
+ ::encode(osd_up_from, blist);
+ ::encode(osd_up_thru, blist);
::encode(pg_swap_primary, blist);
bufferlist cbl;
::decode(max_osd, p);
::decode(osd_state, p);
::decode(osd_addr, p);
- ::decode(osd_alive_thru, p);
+ ::decode(osd_up_from, p);
+ ::decode(osd_up_thru, p);
::decode(pg_swap_primary, p);
bufferlist cbl;
/******* PG ***********/
void PG::build_prior()
{
+
+ // FIXME: roll crashed logic into this function too!
+
+ /*
+ * We have to be careful to gracefully deal with situations like
+ * so. Say we have a power outage or something that takes out both
+ * OSDs, but the monitor doesn't mark them down in the same epoch.
+ * The history may look like
+ *
+ * 1: A B
+ * 2: B
+ * 3: let's say B dies for good, too (say, from the power spike)
+ * 4: A
+ *
+ * which makes it look like B may have applied updates to the PG
+ * that we need in order to proceed. This sucks...
+ *
+ * To minimize the risk of this happening, we CANNOT go active if
+ * any OSDs in the prior set are down until we send an MOSDAlive to
+ * the monitor such that the OSDMap sets osd_alive_thru to an epoch.
+ * Then, we have something like
+ *
+ * 1: A B
+ * 2: B alive_thru[B]=0
+ * 3:
+ * 4: A
+ *
+ * -> we can ignore B, bc it couldn't have gone active (alive_thru
+ * still 0).
+ *
+ * or,
+ *
+ * 1: A B
+ * 2: B alive_thru[B]=0
+ * 3: B alive_thru[B]=0
+ * 4: B alive_thru[B]=2
+ * 5: B alive_thru[B]=2
+ * 6:
+ * 7: A
+ *
+ * -> we must wait for B, bc it was alive through 2, and could have
+ updated the pg.
+ *
+ * If B is really dead, then an administrator can manually set
+ * alive_thru[b] < 2 to recover with possibly out-of-date pg
+ * content.
+ */
+
// build prior set.
prior_set.clear();
- // current
+ // current nodes, of course.
for (unsigned i=1; i<acting.size(); i++)
prior_set.insert(acting[i]);
- // and prior map(s), if OSDs are still up
- for (epoch_t epoch = MAX(1, last_epoch_started_any);
- epoch < osd->osdmap->get_epoch();
- epoch++) {
- OSDMap omap;
- osd->get_map(epoch, omap);
+ // and prior PG mappings. move backwards in time.
+ bool some_down = false;
+
+ must_notify_mon = false;
+
+ // for each acting set, we need to know same_since and last_epoch
+ epoch_t first_epoch = info.history.same_since;
+ epoch_t last_epoch = first_epoch - 1;
+ epoch_t stop = MAX(1, last_epoch_started_any);
+
+ dout(10) << "build_prior considering interval " << first_epoch << " down to " << stop << dendl;
+ OSDMap *nextmap = new OSDMap;
+ osd->get_map(last_epoch, *nextmap);
+
+ for (; last_epoch >= stop; last_epoch = first_epoch-1) {
+ OSDMap *lastmap = nextmap;
+ assert(last_epoch == lastmap->get_epoch());
vector<int> acting;
- omap.pg_to_acting_osds(get_pgid(), acting);
+ lastmap->pg_to_acting_osds(get_pgid(), acting);
+
+ // calc first_epoch, first_map
+ nextmap = new OSDMap;
+ for (first_epoch = last_epoch; first_epoch > stop; first_epoch--) {
+ osd->get_map(first_epoch-1, *nextmap);
+ vector<int> t;
+ nextmap->pg_to_acting_osds(get_pgid(), t);
+ if (t != acting)
+ break;
+ }
+
+ if (acting.empty()) {
+ dout(20) << "build_prior epochs " << first_epoch << "-" << last_epoch << " empty" << dendl;
+ continue;
+ }
+
+ bool maybe_went_active =
+ lastmap->get_up_thru(acting[0]) >= first_epoch &&
+ lastmap->get_up_from(acting[0]) < first_epoch;
+
+ dout(10) << "build_prior epochs " << first_epoch << "-" << last_epoch << " " << acting
+ << " - primary osd" << acting[0]
+ << " up [" << lastmap->get_up_from(acting[0]) << ", " << lastmap->get_up_thru(acting[0]) << "]"
+ << " -> " << maybe_went_active
+ << dendl;
for (unsigned i=0; i<acting.size(); i++) {
- dout(10) << "build prior considering epoch " << epoch << " osd" << acting[i] << dendl;
- if (osd->osdmap->is_up(acting[i]) && // is up now
- acting[i] != osd->whoami) // and is not me
- prior_set.insert(acting[i]);
+ if (osd->osdmap->is_up(acting[i])) { // is up now
+ if (acting[i] != osd->whoami) // and is not me
+ prior_set.insert(acting[i]);
+ } else {
+ dout(10) << "build_prior prior osd" << acting[i] << " is down, must notify mon" << dendl;
+ must_notify_mon = true;
+
+ if (i == 0) {
+ if (maybe_went_active) {
+ dout(10) << "build_prior prior primary osd" << acting[i] << " possibly went active epoch "
+ << (lastmap->get_up_thru(acting[i]) + 1) << dendl;
+ some_down = true;
+ prior_set.insert(acting[i]);
+ }
+ }
+ }
}
}
- dout(10) << "build_prior built " << prior_set << dendl;
+ dout(10) << "build_prior = " << prior_set
+ << (some_down ? " some_down":"")
+ << (must_notify_mon ? " must_notify_mon":"")
+ << dendl;
}
void PG::adjust_prior()
assert(missing.num_lost() == 0);
assert(info.last_complete >= log.bottom || log.backlog);
+ // -- do need to notify the monitor?
+ if (must_notify_mon) {
+ if (osd->osdmap->get_up_thru(osd->whoami) < info.history.same_since) {
+ dout(10) << "up_thru " << osd->osdmap->get_up_thru(osd->whoami)
+ << " < same_since " << info.history.same_since
+ << ", must notify monitor" << dendl;
+ osd->send_alive(info.history.same_since);
+ return;
+ } else {
+ dout(10) << "up_thru " << osd->osdmap->get_up_thru(osd->whoami)
+ << " >= same_since " << info.history.same_since
+ << ", all is well" << dendl;
+ }
+ }
// -- crash recovery?
if (is_crashed()) {
bool have_master_log;
protected:
set<int> prior_set; // current+prior OSDs, as defined by last_epoch_started_any.
+ bool must_notify_mon;
set<int> stray_set; // non-acting osds that have PG data.
set<int> uptodate_set; // current OSDs that are uptodate
eversion_t oldest_update; // lowest (valid) last_update in active set
state(0),
last_epoch_started_any(0),
have_master_log(true),
+ must_notify_mon(false),
stat_num_bytes(0), stat_num_blocks(0)
{ }
virtual ~PG() { }
$CEPH_BIN/osdmaptool --clobber --createsimple .ceph_monmap 4 --print .ceph_osdmap
$CEPH_BIN/cmonctl osd setmap -i .ceph_osdmap
-for osd in 0 1 2 3
+for osd in 0 1 #2 3
do
$CEPH_BIN/cosd --mkfs_for_osd $osd dev/osd$osd # initialize empty object store
$CEPH_BIN/cosd $ARGS dev/osd$osd --debug_ms 1 --debug_osd 20 --debug_fakestore 10 #--debug_osd 40