dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
bufferlist bl;
- epoch_t map_epoch = PG::peek_map_epoch(store, coll_t(pgid), service.infos_oid, &bl);
+ epoch_t map_epoch = PG::peek_map_epoch(store, pgid, service.infos_oid, &bl);
PG *pg = _open_lock_pg(map_epoch == 0 ? osdmap : service.get_map(map_epoch), pgid);
// there can be no waiters here, so we don't call wake_pg_waiters
return;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
- PGLog::clear_info_log(
- pg->info.pgid,
- OSD::make_infos_oid(),
- pg->log_oid,
- t);
+ PGLog::clear_info_log(pg->info.pgid, t);
for (list<coll_t>::iterator i = colls_to_remove.begin();
i != colls_to_remove.end();
static coll_t META_COLL("meta");
+// prefix pgmeta_oid keys with _ so that PGLog::read_log() can
+// easily skip them
+const string infover_key("_infover"); // encoded __u8 info struct version
+const string info_key("_info"); // encoded pg_info_t (written with purged_snaps swapped out, see _write_info)
+const string biginfo_key("_biginfo"); // past_intervals, snap_collections and purged_snaps, in that order
+const string epoch_key("_epoch"); // encoded epoch_t handed to _write_info
+
+
template <class T>
static ostream& _prefix(std::ostream *_dout, T *t)
{
info_struct_v(0),
coll(p), pg_log(cct),
pgmeta_oid(p.make_pgmeta_oid()),
- log_oid(OSD::make_pg_log_oid(p)),
missing_loc(this),
recovery_item(this), scrub_item(this), scrub_finalize_item(this), snap_trim_item(this), stat_queue_item(this),
recovery_ops_active(0),
}
// Bring this PG's on-disk metadata up to struct version 8.
//
// PGs older than v7 first go through _upgrade_v7(). The 7 -> 8 step is then
// queued on one local transaction: the legacy log and biginfo objects in
// META_COLL and the collection's "info" attr are removed, the pgmeta object
// is created with the current struct version under infover_key, and info,
// big info and log are rewritten via write_if_dirty(). Asserts if applying
// the transaction fails.
void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
+{
+ assert(info_struct_v <= 8);
+ ObjectStore::Transaction t;
+
+ if (info_struct_v < 7) {
+ _upgrade_v7(store, snapcolls);
+ }
+
+ // 7 -> 8: move info and log out of META_COLL / the "info" attr and onto pgmeta_oid
+ pg_log.mark_log_for_rewrite(); // presumably makes write_if_dirty() below emit the full log -- confirm in PGLog
+ hobject_t log_oid(OSD::make_pg_log_oid(pg_id));
+ hobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
+ t.remove(META_COLL, log_oid);
+ t.remove(META_COLL, biginfo_oid);
+ t.collection_rmattr(coll, "info");
+
+ t.touch(coll, pgmeta_oid);
+ map<string,bufferlist> v;
+ __u8 ver = cur_struct_v;
+ ::encode(ver, v[infover_key]);
+ t.omap_setkeys(coll, pgmeta_oid, v);
+
+ dirty_info = true; // force both info and big info to be written
+ dirty_big_info = true;
+ write_if_dirty(t);
+
+ int r = store->apply_transaction(t);
+ if (r != 0) {
+ derr << __func__ << ": apply_transaction returned "
+ << cpp_strerror(r) << dendl;
+ assert(0);
+ }
+ assert(r == 0); // redundant with the assert(0) above whenever asserts are enabled
+}
+
+void PG::_upgrade_v7(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
{
unsigned removed = 0;
for (interval_set<snapid_t>::const_iterator i = snapcolls.begin();
}
objects.clear();
}
- ObjectStore::Transaction t;
snap_collections.clear();
- dirty_info = true;
- write_if_dirty(t);
- int r = store->apply_transaction(t);
- if (r != 0) {
- derr << __func__ << ": apply_transaction returned "
- << cpp_strerror(r) << dendl;
- assert(0);
- }
- assert(r == 0);
}
// Queue onto t the omap updates that persist a PG's epoch and info and,
// when dirty_big_info is set, its "big" info.
//
// After this change all keys use the short epoch_key / info_key /
// biginfo_key names and are set on pgmeta_oid inside coll; the struct
// version attr handling (info_struct_v / force_ver) no longer lives here.
// info is only modified temporarily (purged_snaps is swapped out and back).
// Always returns 0.
int PG::_write_info(ObjectStore::Transaction& t, epoch_t epoch,
- pg_info_t &info, coll_t coll,
- map<epoch_t,pg_interval_t> &past_intervals,
- interval_set<snapid_t> &snap_collections,
- hobject_t &infos_oid,
- __u8 info_struct_v, bool dirty_big_info, bool force_ver)
+ pg_info_t &info, coll_t coll,
+ map<epoch_t,pg_interval_t> &past_intervals,
+ interval_set<snapid_t> &snap_collections,
+ ghobject_t &pgmeta_oid,
+ bool dirty_big_info)
{
// pg state
-
- if (info_struct_v > cur_struct_v)
- return -EINVAL;
-
- // Only need to write struct_v to attr when upgrading
- if (force_ver || info_struct_v < cur_struct_v) {
- bufferlist attrbl;
- info_struct_v = cur_struct_v;
- ::encode(info_struct_v, attrbl);
- t.collection_setattr(coll, "info", attrbl);
- dirty_big_info = true;
- }
+ map<string,bufferlist> v; // omap key -> encoded value, applied in one omap_setkeys below
// info. store purged_snaps separately.
interval_set<snapid_t> purged_snaps;
- map<string,bufferlist> v;
- ::encode(epoch, v[get_epoch_key(info.pgid)]);
+ ::encode(epoch, v[epoch_key]);
purged_snaps.swap(info.purged_snaps); // encode info with an empty purged_snaps...
- ::encode(info, v[get_info_key(info.pgid)]);
+ ::encode(info, v[info_key]);
purged_snaps.swap(info.purged_snaps); // ...then put the real set back
if (dirty_big_info) {
// potentially big stuff; only rewritten when the caller marks it dirty
- bufferlist& bigbl = v[get_biginfo_key(info.pgid)];
+ bufferlist& bigbl = v[biginfo_key];
::encode(past_intervals, bigbl);
::encode(snap_collections, bigbl);
::encode(info.purged_snaps, bigbl);
//dout(20) << "write_info bigbl " << bigbl.length() << dendl;
}
- t.omap_setkeys(META_COLL, infos_oid, v);
+ t.omap_setkeys(coll, pgmeta_oid, v);
return 0;
}
uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
t.collection_hint(coll, hint_type, hint);
}
+
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+ t.touch(coll, pgmeta_oid);
+ map<string,bufferlist> values;
+ __u8 struct_v = cur_struct_v;
+ ::encode(struct_v, values[infover_key]);
+ t.omap_setkeys(coll, pgmeta_oid, values);
}
void PG::write_info(ObjectStore::Transaction& t)
unstable_stats.clear();
int ret = _write_info(t, get_osdmap()->get_epoch(), info, coll,
- past_intervals, snap_collections, osd->infos_oid,
- info_struct_v, dirty_big_info);
+ past_intervals, snap_collections, pgmeta_oid,
+ dirty_big_info);
assert(ret == 0);
last_persisted_osdmap_ref = osdmap_ref;
dirty_big_info = false;
}
// Return the map epoch recorded on disk for pgid.
//
// The v8 layout is tried first: infover_key and epoch_key are read from the
// omap of the PG's pgmeta object in the PG collection. If that read returns
// -ENOENT the legacy layout is used instead: the struct version is decoded
// from the collection's "info" attr (read into *bl); versions below 5 yield
// 0, version 5 carries the epoch in the attr itself, and later versions keep
// it in the omap of legacy_infos_oid in META_COLL. Any other error asserts.
//
// bl must be non-NULL; it is only filled on the legacy path.
-epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid, bufferlist *bl)
+epoch_t PG::peek_map_epoch(ObjectStore *store,
+ spg_t pgid,
+ hobject_t &legacy_infos_oid,
+ bufferlist *bl)
{
- assert(bl);
- spg_t pgid;
- snapid_t snap;
- bool ok = coll.is_pg(pgid, snap);
- assert(ok);
- int r = store->collection_getattr(coll, "info", *bl);
- assert(r > 0);
- bufferlist::iterator bp = bl->begin();
- __u8 struct_v = 0;
- ::decode(struct_v, bp);
- if (struct_v < 5)
- return 0;
+ coll_t coll(pgid);
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
epoch_t cur_epoch = 0;
- if (struct_v < 6) {
+
+ assert(bl);
+ {
+ // validate collection name (the parsed pgid_temp / snap are discarded)
+ spg_t pgid_temp;
+ snapid_t snap;
+ bool ok = coll.is_pg(pgid_temp, snap);
+ assert(ok);
+ }
+
+ // try for v8
+ set<string> keys;
+ keys.insert(infover_key);
+ keys.insert(epoch_key);
+ map<string,bufferlist> values;
+ int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
+ if (r == 0) {
+ assert(values.size() == 2); // both requested keys must be present
+
+ // sanity check version
+ bufferlist::iterator bp = values[infover_key].begin();
+ __u8 struct_v = 0;
+ ::decode(struct_v, bp);
+ assert(struct_v >= 8);
+
+ // get epoch
+ bp = values[epoch_key].begin();
::decode(cur_epoch, bp);
- } else {
+ } else if (r == -ENOENT) {
+ // legacy: try v7 or older
+ r = store->collection_getattr(coll, "info", *bl);
+ assert(r > 0);
+ bufferlist::iterator bp = bl->begin();
+ __u8 struct_v = 0;
+ ::decode(struct_v, bp);
+ if (struct_v < 5)
+ return 0;
+ if (struct_v < 6) {
+ ::decode(cur_epoch, bp); // version 5: epoch follows the version byte in the attr
+ return cur_epoch;
+ }
+
// get epoch out of leveldb
- bufferlist tmpbl;
string ek = get_epoch_key(pgid);
- set<string> keys;
- keys.insert(get_epoch_key(pgid));
- map<string,bufferlist> values;
- store->omap_get_values(META_COLL, infos_oid, keys, &values);
+ keys.clear(); // reuse the containers from the v8 attempt
+ values.clear();
+ keys.insert(ek);
+ store->omap_get_values(META_COLL, legacy_infos_oid, keys, &values); // return code unchecked; the size assert below catches a miss
assert(values.size() == 1);
- tmpbl = values[ek];
- bufferlist::iterator p = tmpbl.begin();
+ bufferlist::iterator p = values[ek].begin();
::decode(cur_epoch, p);
+ } else {
+ assert(0 == "unable to open pg metadata");
}
return cur_epoch;
}
{
if (dirty_big_info || dirty_info)
write_info(t);
- pg_log.write_log(t, log_oid);
+ pg_log.write_log(t, coll, pgmeta_oid);
}
void PG::trim_peers()
}
dout(10) << "append_log adding " << keys.size() << " keys" << dendl;
- t.omap_setkeys(META_COLL, log_oid, keys);
+ t.omap_setkeys(coll, pgmeta_oid, keys);
pg_log.trim(&handler, trim_to, info);
}
int PG::read_info(
- ObjectStore *store, const coll_t &coll, bufferlist &bl,
+ ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals,
- hobject_t &biginfo_oid, hobject_t &infos_oid,
+ hobject_t &infos_oid,
interval_set<snapid_t> &snap_collections, __u8 &struct_v)
{
+ // try for v8 or later
+ set<string> keys;
+ keys.insert(infover_key);
+ keys.insert(info_key);
+ keys.insert(biginfo_key);
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+ map<string,bufferlist> values;
+ int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
+ if (r == 0) {
+ assert(values.size() == 3);
+
+ bufferlist::iterator p = values[infover_key].begin();
+ ::decode(struct_v, p);
+ assert(struct_v >= 8);
+
+ p = values[info_key].begin();
+ ::decode(info, p);
+
+ p = values[biginfo_key].begin();
+ ::decode(past_intervals, p);
+ ::decode(snap_collections, p);
+ ::decode(info.purged_snaps, p);
+ return 0;
+ }
+
+ // legacy (ver < 8)
bufferlist::iterator p = bl.begin();
bufferlist lbl;
::decode(struct_v, p);
} else {
if (struct_v < 6) {
+ hobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pgid));
int r = store->read(META_COLL, biginfo_oid, 0, 0, lbl);
if (r < 0)
return r;
// Load this PG's persisted info via read_info() and then its log via
// pg_log.read_log(). When read_log() returns true the log is marked for
// rewrite and written back out to the pgmeta object in a local transaction.
void PG::read_state(ObjectStore *store, bufferlist &bl)
{
- hobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
-
- int r = read_info(store, coll, bl, info, past_intervals, biginfo_oid,
- osd->infos_oid, snap_collections, info_struct_v);
+ int r = read_info(store, pg_id, coll, bl, info, past_intervals,
+ osd->infos_oid, snap_collections, info_struct_v);
assert(r >= 0);
ostringstream oss;
- if (pg_log.read_log(
- store, coll, META_COLL, log_oid, info,
- oss)) {
+ if (pg_log.read_log(store,
+ coll,
+ info_struct_v < 8 ? META_COLL : coll, // pre-v8 logs are read from META_COLL
+ info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid, // ...under the legacy per-PG log object
+ info, oss)) {
/* We don't want to leave the old format around in case the next log
* write happens to be an append_log()
*/
pg_log.mark_log_for_rewrite();
ObjectStore::Transaction t;
- t.remove(coll_t(), log_oid); // remove old version
- pg_log.write_log(t, log_oid);
+ t.remove(META_COLL, log_oid); // remove old version. NOTE(review): this patch drops the log_oid member and its ctor initializer, so log_oid looks undeclared here -- likely wants OSD::make_pg_log_oid(pg_id); confirm
+ pg_log.write_log(t, coll, pgmeta_oid);
int r = osd->store->apply_transaction(t);
assert(!r);
}
// pg state
pg_info_t info;
__u8 info_struct_v;
- static const __u8 cur_struct_v = 7;
+ static const __u8 cur_struct_v = 8;
// True when the loaded info_struct_v is older than version 8 (the value
// cur_struct_v is raised to by this change).
bool must_upgrade() {
- return info_struct_v < 7;
+ return info_struct_v < 8;
}
void upgrade(
ObjectStore *store,
const interval_set<snapid_t> &snapcolls);
+ void _upgrade_v7(ObjectStore *store, const interval_set<snapid_t> &snapcolls);
const coll_t coll;
PGLog pg_log;
return stringify(pgid) + "_epoch";
}
ghobject_t pgmeta_oid;
- hobject_t log_oid;
class MissingLoc {
map<hobject_t, pg_missing_t::item> needs_recovery_map;
pg_info_t &info, coll_t coll,
map<epoch_t,pg_interval_t> &past_intervals,
interval_set<snapid_t> &snap_collections,
- hobject_t &infos_oid,
- __u8 info_struct_v, bool dirty_big_info, bool force_ver = false);
+ ghobject_t &pgmeta_oid,
+ bool dirty_big_info);
void write_if_dirty(ObjectStore::Transaction& t);
eversion_t get_next_version() const {
std::string get_corrupt_pg_log_name() const;
static int read_info(
- ObjectStore *store, const coll_t &coll,
+ ObjectStore *store, spg_t pgid, const coll_t &coll,
bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals,
- hobject_t &biginfo_oid, hobject_t &infos_oid,
+ hobject_t &infos_oid,
interval_set<snapid_t> &snap_collections, __u8 &);
void read_state(ObjectStore *store, bufferlist &bl);
- static epoch_t peek_map_epoch(ObjectStore *store, coll_t coll,
- hobject_t &infos_oid, bufferlist *bl);
+ static epoch_t peek_map_epoch(ObjectStore *store, spg_t pgid,
+ hobject_t &legacy_infos_oid,
+ bufferlist *bl);
void update_snap_map(
vector<pg_log_entry_t> &log_entries,
ObjectStore::Transaction& t);
// Queue onto *t the removal of a PG's persisted info and log.
//
// This now amounts to removing the PG's pgmeta object from the PG's own
// collection; the per-key cleanup on infos_oid and the removal of log_oid in
// META_COLL are dropped.
// NOTE(review): legacy (pre-v8) objects in META_COLL are no longer touched
// here -- presumably PGs are always upgraded before this runs; confirm.
void PGLog::clear_info_log(
spg_t pgid,
- const hobject_t &infos_oid,
- const hobject_t &log_oid,
ObjectStore::Transaction *t) {
-
- set<string> keys_to_remove;
- keys_to_remove.insert(PG::get_epoch_key(pgid));
- keys_to_remove.insert(PG::get_biginfo_key(pgid));
- keys_to_remove.insert(PG::get_info_key(pgid));
-
- t->remove(META_COLL, log_oid);
- t->omap_rmkeys(META_COLL, infos_oid, keys_to_remove);
+ coll_t coll(pgid);
+ t->remove(coll, pgid.make_pgmeta_oid());
}
void PGLog::trim(
}
void PGLog::write_log(
- ObjectStore::Transaction& t, const ghobject_t &log_oid)
+ ObjectStore::Transaction& t, const coll_t& coll, const ghobject_t &log_oid)
{
if (is_dirty()) {
dout(10) << "write_log with: "
<< ", trimmed: " << trimmed
<< dendl;
_write_log(
- t, log, log_oid, divergent_priors,
+ t, log, coll, log_oid, divergent_priors,
dirty_to,
dirty_from,
writeout_from,
}
void PGLog::write_log(ObjectStore::Transaction& t, pg_log_t &log,
- const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors)
+ const coll_t& coll, const ghobject_t &log_oid,
+ map<eversion_t, hobject_t> &divergent_priors)
{
_write_log(
- t, log, log_oid,
+ t, log, coll, log_oid,
divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
set<eversion_t>(),
true, true, 0);
void PGLog::_write_log(
ObjectStore::Transaction& t, pg_log_t &log,
- const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
+ const coll_t& coll, const ghobject_t &log_oid,
+ map<eversion_t, hobject_t> &divergent_priors,
eversion_t dirty_to,
eversion_t dirty_from,
eversion_t writeout_from,
//dout(10) << "write_log, clearing up to " << dirty_to << dendl;
if (touch_log)
- t.touch(coll_t(), log_oid);
+ t.touch(coll, log_oid);
if (dirty_to != eversion_t()) {
t.omap_rmkeyrange(
- coll_t(), log_oid,
+ coll, log_oid,
eversion_t().get_key_name(), dirty_to.get_key_name());
clear_up_to(log_keys_debug, dirty_to.get_key_name());
}
if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
// dout(10) << "write_log, clearing from " << dirty_from << dendl;
t.omap_rmkeyrange(
- coll_t(), log_oid,
+ coll, log_oid,
dirty_from.get_key_name(), eversion_t::max().get_key_name());
clear_after(log_keys_debug, dirty_from.get_key_name());
}
::encode(log.rollback_info_trimmed_to, keys["rollback_info_trimmed_to"]);
if (!to_remove.empty())
- t.omap_rmkeys(META_COLL, log_oid, to_remove);
- t.omap_setkeys(META_COLL, log_oid, keys);
+ t.omap_rmkeys(coll, log_oid, to_remove);
+ t.omap_setkeys(coll, log_oid, keys);
}
bool PGLog::read_log(ObjectStore *store, coll_t pg_coll,
ObjectMap::ObjectMapIterator p = store->get_omap_iterator(log_coll, log_oid);
if (p) for (p->seek_to_first(); p->valid() ; p->next()) {
+ // non-log pgmeta_oid keys are prefixed with _; skip those
+ if (p->key()[0] == '_')
+ continue;
bufferlist bl = p->value();//Copy bufferlist before creating iterator
bufferlist::iterator bp = bl.begin();
if (p->key() == "divergent_priors") {
static void clear_info_log(
spg_t pgid,
- const hobject_t &infos_oid,
- const hobject_t &log_oid,
ObjectStore::Transaction *t);
void trim(
pg_info_t &info, LogEntryHandler *rollbacker,
bool &dirty_info, bool &dirty_big_info);
- void write_log(ObjectStore::Transaction& t, const ghobject_t &log_oid);
+ void write_log(ObjectStore::Transaction& t, const coll_t& coll,
+ const ghobject_t &log_oid);
static void write_log(ObjectStore::Transaction& t, pg_log_t &log,
+ const coll_t& coll,
const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors);
static void _write_log(
ObjectStore::Transaction& t, pg_log_t &log,
- const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
+ const coll_t& coll, const ghobject_t &log_oid,
+ map<eversion_t, hobject_t> &divergent_priors,
eversion_t dirty_to,
eversion_t dirty_from,
eversion_t writeout_from,
};
hobject_t infos_oid = OSD::make_infos_oid();
-hobject_t biginfo_oid, log_oid;
+ghobject_t log_oid;
+hobject_t biginfo_oid;
int file_fd = fd_none;
bool debug = false;
exit(1);
}
-int get_log(ObjectStore *fs, coll_t coll, spg_t pgid, const pg_info_t &info,
+int get_log(ObjectStore *fs, __u8 struct_ver,
+ coll_t coll, spg_t pgid, const pg_info_t &info,
PGLog::IndexedLog &log, pg_missing_t &missing)
{
map<eversion_t, hobject_t> divergent_priors;
try {
ostringstream oss;
- PGLog::read_log(fs, coll, META_COLL, log_oid, info, divergent_priors, log, missing, oss);
+ assert(struct_ver > 0);
+ PGLog::read_log(fs, coll,
+ struct_ver >= 8 ? coll : META_COLL,
+ struct_ver >= 8 ? pgid.make_pgmeta_oid() : log_oid,
+ info, divergent_priors, log, missing, oss);
if (debug && oss.str().size())
cerr << oss.str() << std::endl;
}
return ENOENT;
}
- cout << "remove " << META_COLL << " " << log_oid.oid << std::endl;
+ cout << "remove " << META_COLL << " " << log_oid.hobj.oid << std::endl;
rmt->remove(META_COLL, log_oid);
cout << "remove " << META_COLL << " " << biginfo_oid.oid << std::endl;
rmt->remove(META_COLL, biginfo_oid);
//Empty for this
interval_set<snapid_t> snap_collections; // obsolete
coll_t coll(info.pgid);
-
+ ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid());
int ret = PG::_write_info(t, epoch,
info, coll,
past_intervals,
snap_collections,
- infos_oid,
- struct_ver,
- true, true);
+ pgmeta_oid,
+ true);
if (ret < 0) ret = -ret;
if (ret) cerr << "Failed to write info" << std::endl;
return ret;
}
-void write_log(ObjectStore::Transaction &t, pg_log_t &log)
-{
- map<eversion_t, hobject_t> divergent_priors;
- PGLog::write_log(t, log, log_oid, divergent_priors);
-}
-
// Queue a PG's info and log onto t.
//
// write_info() runs first and its nonzero result is returned unchanged on
// failure. Otherwise the log is written with PGLog::write_log() to the PG's
// pgmeta object in the PG's own collection, using an empty divergent_priors
// map, and 0 is returned.
int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
pg_log_t &log, __u8 struct_ver, map<epoch_t,pg_interval_t> &past_intervals)
{
int ret = write_info(t, epoch, info, struct_ver, past_intervals);
- if (ret) return ret;
- write_log(t, log);
+ if (ret)
+ return ret;
+ map<eversion_t, hobject_t> divergent_priors; // intentionally empty
+ coll_t coll(info.pgid);
+ PGLog::write_log(t, log, coll, info.pgid.make_pgmeta_oid(), divergent_priors);
return 0;
}
cerr << "Exporting " << pgid << std::endl;
- int ret = get_log(fs, coll, pgid, info, log, missing);
+ int ret = get_log(fs, struct_ver, coll, pgid, info, log, missing);
if (ret > 0)
return ret;
return 1;
}
+ ghobject_t pgmeta_oid = pgid.make_pgmeta_oid();
log_oid = OSD::make_pg_log_oid(pgid);
biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
}
bufferlist bl;
- map_epoch = PG::peek_map_epoch(fs, coll, infos_oid, &bl);
+ map_epoch = PG::peek_map_epoch(fs, pgid, infos_oid, &bl);
if (debug)
cerr << "map_epoch " << map_epoch << std::endl;
pg_info_t info(pgid);
map<epoch_t,pg_interval_t> past_intervals;
- hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
interval_set<snapid_t> snap_collections;
-
__u8 struct_ver;
- r = PG::read_info(fs, coll, bl, info, past_intervals, biginfo_oid,
+ r = PG::read_info(fs, pgid, coll, bl, info, past_intervals,
infos_oid, snap_collections, struct_ver);
if (r < 0) {
cerr << "read_info error " << cpp_strerror(-r) << std::endl;
} else if (op == "log") {
PGLog::IndexedLog log;
pg_missing_t missing;
- ret = get_log(fs, coll, pgid, info, log, missing);
+ ret = get_log(fs, struct_ver, coll, pgid, info, log, missing);
if (ret > 0)
goto out;