#include "osd/osd_types.h"
class MOSDPGInfo : public Message {
+ static const int HEAD_VERSION = 2;
+ static const int COMPAT_VERSION = 1;
+
epoch_t epoch;
public:
- vector<pg_info_t> pg_info;
+ vector<pair<pg_info_t,pg_interval_map_t> > pg_list;
epoch_t get_epoch() { return epoch; }
- MOSDPGInfo() : Message(MSG_OSD_PG_INFO) {}
- MOSDPGInfo(version_t mv) :
- Message(MSG_OSD_PG_INFO),
- epoch(mv) { }
+ MOSDPGInfo()
+ : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION) {}
+ MOSDPGInfo(version_t mv)
+ : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION),
+ epoch(mv) { }
private:
~MOSDPGInfo() {}
public:
const char *get_type_name() const { return "pg_info"; }
void print(ostream& out) const {
- out << "pg_info(" << pg_info.size() << " pgs e" << epoch << ")";
+ out << "pg_info(" << pg_list.size() << " pgs e" << epoch << ":";
+
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::const_iterator i = pg_list.begin();
+ i != pg_list.end();
+ ++i) {
+ if (i != pg_list.begin())
+ out << ",";
+ out << i->first.pgid;
+ if (i->second.size())
+ out << "(" << i->second.size() << ")";
+ }
+
+ out << ")";
}
void encode_payload(uint64_t features) {
::encode(epoch, payload);
- ::encode(pg_info, payload);
+
+ // v1 was vector<pg_info_t>
+ __u32 n = pg_list.size();
+ ::encode(n, payload);
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = pg_list.begin();
+ p != pg_list.end();
+ p++)
+ ::encode(p->first, payload);
+
+ // v2 needs the pg_interval_map_t for each record
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = pg_list.begin();
+ p != pg_list.end();
+ p++)
+ ::encode(p->second, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
::decode(epoch, p);
- ::decode(pg_info, p);
+
+ // decode pg_info_t portion of the vector
+ __u32 n;
+ ::decode(n, p);
+ pg_list.resize(n);
+ for (unsigned i=0; i<n; i++)
+ ::decode(pg_list[i].first, p);
+
+ if (header.version >= 2) {
+ // get the pg_interval_map_t portion
+ for (unsigned i=0; i<n; i++)
+ ::decode(pg_list[i].second, p);
+ }
}
};
class MOSDPGLog : public Message {
- static const int HEAD_VERSION = 2;
+ static const int HEAD_VERSION = 3;
+ static const int COMPAT_VERSION = 2;
epoch_t epoch;
/// query_epoch is the epoch of the query being responded to, or
pg_info_t info;
pg_log_t log;
pg_missing_t missing;
+ pg_interval_map_t past_intervals;
epoch_t get_epoch() { return epoch; }
pg_t get_pgid() { return info.pgid; }
epoch_t get_query_epoch() { return query_epoch; }
- MOSDPGLog() : Message(MSG_OSD_PG_LOG, HEAD_VERSION) { }
+ MOSDPGLog() : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION) { }
MOSDPGLog(version_t mv, pg_info_t& i)
- : Message(MSG_OSD_PG_LOG, HEAD_VERSION),
+ : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION),
epoch(mv), query_epoch(mv), info(i) { }
MOSDPGLog(version_t mv, pg_info_t& i, epoch_t query_epoch)
- : Message(MSG_OSD_PG_LOG, HEAD_VERSION),
+ : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION),
epoch(mv), query_epoch(query_epoch), info(i) { }
private:
::encode(log, payload);
::encode(missing, payload);
::encode(query_epoch, payload);
+ ::encode(past_intervals, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
if (header.version >= 2) {
::decode(query_epoch, p);
}
+ if (header.version >= 3) {
+ ::decode(past_intervals, p);
+ }
}
};
PG *OSD::_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock,
int role, vector<int>& up, vector<int>& acting, pg_history_t history,
- pg_interval_map_t *pim,
+ pg_interval_map_t& pi,
ObjectStore::Transaction& t)
{
assert(osd_lock.is_locked());
history.last_epoch_started = history.epoch_created - 1;
}
- pg->init(role, up, acting, history, pim, &t);
+ pg->init(role, up, acting, history, pi, &t);
dout(7) << "_create_lock_pg " << *pg << dendl;
return pg;
* look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
* hasn't changed since the given epoch and we are the primary.
*/
-PG *OSD::get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& created,
- bool primary, pg_interval_map_t *ppi,
+PG *OSD::get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi,
+ epoch_t epoch, int from, int& created, bool primary,
ObjectStore::Transaction **pt,
C_Contexts **pfin)
{
// ok, create PG locally using provided Info and History
*pt = new ObjectStore::Transaction;
*pfin = new C_Contexts(g_ceph_context);
- pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, ppi, **pt);
+ pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, pi, **pt);
created++;
dout(10) << *pg << " is new" << dendl;
history.epoch_created = history.same_up_since =
history.same_interval_since = history.same_primary_since =
osdmap->get_epoch();
+ pg_interval_map_t pi;
PG *pg = _create_lock_pg(*q, true, true,
- parent->get_role(), parent->up, parent->acting, history, NULL, t);
+ parent->get_role(), parent->up, parent->acting, history, pi, t);
children[*q] = pg;
dout(10) << " child " << *pg << dendl;
}
if (can_create_pg(pgid)) {
ObjectStore::Transaction *t = new ObjectStore::Transaction;
C_Contexts *fin = new C_Contexts(g_ceph_context);
-
+ pg_interval_map_t pi;
PG *pg = _create_lock_pg(pgid, true, false,
- 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, NULL,
+ 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, pi,
*t);
creating_pgs.erase(pgid);
for (map<int,MOSDPGInfo*>::iterator p = info_map.begin();
p != info_map.end();
++p) {
- for (vector<pg_info_t>::iterator i = p->second->pg_info.begin();
- i != p->second->pg_info.end();
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator i = p->second->pg_list.begin();
+ i != p->second->pg_list.end();
++i) {
dout(20) << "Sending info " << *i << " to osd." << p->first << dendl;
}
ObjectStore::Transaction *t;
C_Contexts *fin;
- pg = get_or_create_pg(it->first, m->get_epoch(), from, created, true, &it->second, &t, &fin);
+ pg = get_or_create_pg(it->first, it->second, m->get_epoch(), from, created, true, &t, &fin);
if (!pg)
continue;
int created = 0;
ObjectStore::Transaction *t;
C_Contexts *fin;
- PG *pg = get_or_create_pg(m->info, m->get_epoch(),
- from, created, false, NULL, &t, &fin);
+ PG *pg = get_or_create_pg(m->info, m->past_intervals, m->get_epoch(),
+ from, created, false, &t, &fin);
if (!pg) {
return;
}
int created = 0;
- for (vector<pg_info_t>::iterator p = m->pg_info.begin();
- p != m->pg_info.end();
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = m->pg_list.begin();
+ p != m->pg_list.end();
++p) {
ObjectStore::Transaction *t = 0;
C_Contexts *fin = 0;
- PG *pg = get_or_create_pg(*p, m->get_epoch(),
- from, created, false, NULL, &t, &fin);
+ PG *pg = get_or_create_pg(p->first, p->second, m->get_epoch(),
+ from, created, false, &t, &fin);
if (!pg)
continue;
PG::RecoveryCtx rctx(0, &info_map, 0, &fin->contexts, t);
- pg->handle_info(from, *p, &rctx);
+ pg->handle_info(from, p->first, &rctx);
int tr = store->queue_transaction(&pg->osr, t, new ObjectStore::C_DeleteTransaction(t), fin);
assert(!tr);
PG *_open_lock_pg(pg_t pg, bool no_lockdep_check=false, bool hold_map_lock=false);
PG *_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock,
int role, vector<int>& up, vector<int>& acting, pg_history_t history,
- pg_interval_map_t *pim,
- ObjectStore::Transaction& t);
+ pg_interval_map_t& pi, ObjectStore::Transaction& t);
PG *lookup_lock_raw_pg(pg_t pgid);
- PG *get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& pcreated, bool primary,
- pg_interval_map_t *ppi,
+ PG *get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi,
+ epoch_t epoch, int from, int& pcreated, bool primary,
ObjectStore::Transaction **pt,
C_Contexts **pfin);
dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
if (activator_map->count(peer) == 0)
(*activator_map)[peer] = new MOSDPGInfo(get_osdmap()->get_epoch());
- (*activator_map)[peer]->pg_info.push_back(info);
+ (*activator_map)[peer]->pg_list.push_back(make_pair(info, past_intervals));
} else {
dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
m = new MOSDPGLog(get_osdmap()->get_epoch(), info);
m->log.copy_after(log, pi.last_update);
}
+ // share past_intervals if we are creating the pg on the replica
+ if (pi.dne())
+ m->past_intervals = past_intervals;
+
if (pi.last_backfill != hobject_t::get_max())
state_set(PG_STATE_BACKFILL);
else
active++;
-
// update local version of peer's missing list!
if (m && pi.last_backfill != hobject_t()) {
for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
MOSDPGInfo *m = new MOSDPGInfo(e);
pg_info_t i = info;
i.history.last_epoch_started = e;
- m->pg_info.push_back(i);
+ m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
osd->cluster_messenger->send_message(m, primary);
}
unlock();
* @param newup up set
* @param newacting acting set
* @param history pg history
+ * @param pi past_intervals
* @param t transaction to write out our new state in
*/
void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t& history,
- pg_interval_map_t *pim,
+ pg_interval_map_t& pi,
ObjectStore::Transaction *t)
{
dout(10) << "init role " << role << " up " << newup << " acting " << newacting
- << " history " << history << dendl;
+ << " history " << history
+ << " " << pi.size() << " past_intervals"
+ << dendl;
set_role(role);
acting = newacting;
up = newup;
info.history = history;
+ past_intervals.swap(pi);
info.stats.up = up;
info.stats.acting = acting;
osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
- if (pim) {
- past_intervals.swap(*pim);
- dout(10) << "init got " << past_intervals.size() << " past intervals" << dendl;
- }
-
write_info(*t);
write_log(*t);
}
for (unsigned i=1; i<acting.size(); i++) {
int peer = acting[i];
MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
- m->pg_info.push_back(info);
+ m->pg_list.push_back(make_pair(info, pg_interval_map_t()));
osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer));
}
}
bool is_empty() const { return info.last_update == eversion_t(0,0); }
void init(int role, vector<int>& up, vector<int>& acting, pg_history_t& history,
- pg_interval_map_t *pim, ObjectStore::Transaction *t);
+ pg_interval_map_t& pim, ObjectStore::Transaction *t);
// pg on-disk state
void do_pending_flush();
// tell replicas
for (unsigned i=1; i<acting.size(); i++) {
MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
- m->pg_info.push_back(info);
+ m->pg_list.push_back(make_pair(info, pg_interval_map_t()));
osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(acting[i]));
}
}