ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
ceph_osd_feature_incompat);
}
osd(osd),
whoami(osd->whoami), store(osd->store), clog(osd->clog),
pg_recovery_stats(osd->pg_recovery_stats),
+ infos_oid(sobject_t("infos", CEPH_NOSNAP)),
+ biginfos_oid(sobject_t("biginfos", CEPH_NOSNAP)),
cluster_messenger(osd->cluster_messenger),
client_messenger(osd->client_messenger),
logger(osd->logger),
dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
bufferlist bl;
- epoch_t map_epoch = PG::peek_map_epoch(store, coll_t(pgid), &bl);
+ epoch_t map_epoch = PG::peek_map_epoch(store, coll_t(pgid), service.infos_oid, &bl);
PG *pg = _open_lock_pg(map_epoch == 0 ? osdmap : service.get_map(map_epoch), pgid);
}
dout(10) << "load_pgs done" << dendl;
+ // make sure info objects exist
+ if (!store->exists(coll_t::META_COLL, service.infos_oid) ||
+ !store->exists(coll_t::META_COLL, service.biginfos_oid)) {
+ dout(10) << "load_pgs creating/touching infos, biginfos objects" << dendl;
+ ObjectStore::Transaction t;
+ t.touch(coll_t::META_COLL, service.infos_oid);
+ t.touch(coll_t::META_COLL, service.biginfos_oid);
+ store->apply_transaction(t);
+ }
+
build_past_intervals_parallel();
}
const hobject_t& ioid) :
osd(o), osdmap_ref(curmap), pool(_pool),
_lock("PG::_lock"),
- ref(0), deleting(false), dirty_info(false), dirty_log(false),
+ ref(0), deleting(false), dirty_info(false), dirty_big_info(false), dirty_log(false),
info(p), coll(p), log_oid(loid), biginfo_oid(ioid),
recovery_item(this), scrub_item(this), scrub_finalize_item(this), snap_trim_item(this), stat_queue_item(this),
recovery_ops_active(0),
_lock.Lock(no_lockdep);
// if we have unrecorded dirty state with the lock dropped, there is a bug
assert(!dirty_info);
+ assert(!dirty_big_info);
assert(!dirty_log);
dout(30) << "lock" << dendl;
_lock.Lock(no_lockdep);
// if we have unrecorded dirty state with the lock dropped, there is a bug
assert(!dirty_info);
+ assert(!dirty_big_info);
assert(!dirty_log);
dout(30) << "lock_with_map_lock_held" << dendl;
merge_old_entry(t, *d);
dirty_info = true;
+ dirty_big_info = true;
dirty_log = true;
}
if (changed) {
dirty_info = true;
+ dirty_big_info = true;
dirty_log = true;
}
}
// record our work.
dirty_info = true;
+ dirty_big_info = true;
}
/*
return;
dout(10) << __func__ << ": trimming " << pif->second << dendl;
past_intervals.erase(pif++);
+ dirty_big_info = true;
}
}
// write pg info, log
dirty_info = true;
+ dirty_big_info = true; // maybe
dirty_log = true;
// clean up stray objects
_split_into(child_pgid, child, split_bits);
child->dirty_info = true;
+ child->dirty_big_info = true;
child->dirty_log = true;
dirty_info = true;
+ dirty_big_info = true;
dirty_log = true;
}
+// Queue this PG's persistent info onto transaction t and clear the dirty
+// flags. Up to three pieces are written:
+//  - the "info" attr on the PG collection, holding only struct_v, and only
+//    when the stored version is older than cur_struct_v;
+//  - the "<pgid>_epoch" and "<pgid>_info" omap keys on osd->infos_oid;
+//  - when dirty_big_info is set, the "<pgid>_info" omap key on
+//    osd->biginfos_oid (past_intervals, snap_collections, purged_snaps).
void PG::write_info(ObjectStore::Transaction& t)
{
// pg state
- bufferlist infobl;
- __u8 struct_v = 5;
- ::encode(struct_v, infobl);
- ::encode(get_osdmap()->get_epoch(), infobl);
- t.collection_setattr(coll, "info", infobl);
+ __u8 cur_struct_v = 6;
+
+ // NOTE(review): info_struct_v is filled in via read_state(); it is not in
+ // the PG constructor initializer list shown in this patch -- confirm it has
+ // a defined value for a PG that was never loaded from disk.
+ assert(info_struct_v <= cur_struct_v);
+
+ // Only need to write struct_v to attr when upgrading
+ if (info_struct_v < cur_struct_v) {
+ bufferlist attrbl;
+ info_struct_v = cur_struct_v;
+ ::encode(info_struct_v, attrbl);
+ t.collection_setattr(coll, "info", attrbl);
+ }
+
+ // info. store purged_snaps separately.
+ interval_set<snapid_t> purged_snaps;
+ map<string,bufferlist> v;
+ string k = stringify(info.pgid) + string("_info");
+ string ek = stringify(info.pgid) + string("_epoch");
+ ::encode(get_osdmap()->get_epoch(), v[ek]);
+ // Swap purged_snaps out so info is encoded with the local (default
+ // constructed) set in its place, then swap the real set back in.
+ purged_snaps.swap(info.purged_snaps);
+ ::encode(info, v[k]);
+ purged_snaps.swap(info.purged_snaps);
+
+ t.omap_setkeys(coll_t::META_COLL, osd->infos_oid, v);
- // potentially big stuff
- bufferlist bigbl;
- ::encode(past_intervals, bigbl);
- ::encode(snap_collections, bigbl);
- ::encode(info, bigbl);
- dout(20) << "write_info bigbl " << bigbl.length() << dendl;
- t.truncate(coll_t::META_COLL, biginfo_oid, 0);
- t.write(coll_t::META_COLL, biginfo_oid, 0, bigbl.length(), bigbl);
+ // The big blob is rewritten only when flagged dirty; it reuses key k but
+ // lives on the biginfos object. The encode order here (past_intervals,
+ // snap_collections, purged_snaps) is the order read_info decodes for v6.
+ if (dirty_big_info) {
+ // potentially big stuff
+ v.clear();
+ bufferlist& bigbl = v[k];
+ ::encode(past_intervals, bigbl);
+ ::encode(snap_collections, bigbl);
+ ::encode(info.purged_snaps, bigbl);
+ dout(20) << "write_info bigbl " << bigbl.length() << dendl;
+ t.omap_setkeys(coll_t::META_COLL, osd->biginfos_oid, v);
+ }
dirty_info = false;
+ dirty_big_info = false;
}
+// Return the osdmap epoch recorded for the PG stored in collection coll.
+// The "info" collection attr is read into *bl and its leading struct_v
+// selects where the epoch lives:
+//  - struct_v < 5:  returns 0 (the load_pgs caller then uses the current map);
+//  - struct_v == 5: the epoch follows struct_v inside the attr;
+//  - struct_v >= 6: the epoch is stored under the "<pgid>_epoch" omap key of
+//    infos_oid in the meta collection.
+// coll must be a PG collection and bl must be non-null (both asserted).
-epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl)
+epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid, bufferlist *bl)
{
assert(bl);
+ pg_t pgid;
+ snapid_t snap;
+ bool ok = coll.is_pg(pgid, snap);
+ assert(ok);
store->collection_getattr(coll, "info", *bl);
bufferlist::iterator bp = bl->begin();
__u8 struct_v = 0;
+ // struct_v has to be read from the attr before it is tested; without this
+ // it stays 0 and every PG is reported as having no recorded epoch.
+ ::decode(struct_v, bp);
if (struct_v < 5)
return 0;
epoch_t cur_epoch = 0;
- ::decode(cur_epoch, bp);
+ if (struct_v < 6) {
+ ::decode(cur_epoch, bp);
+ } else {
+ // get epoch out of leveldb
+ bufferlist tmpbl;
+ string ek = stringify(pgid) + string("_epoch");
+ set<string> keys;
+ keys.insert(ek);
+ map<string,bufferlist> values;
+ store->omap_get_values(coll_t::META_COLL, infos_oid, keys, &values);
+ assert(values.size() == 1);
+ tmpbl = values[ek];
+ bufferlist::iterator p = tmpbl.begin();
+ ::decode(cur_epoch, p);
+ }
return cur_epoch;
}
}
+// Load a PG's persisted info. The leading struct_v of bl (presumably the
+// encoded "info" collection attr -- confirm against the caller) is decoded
+// into the struct_v out-parameter and selects the on-disk layout. For
+// struct_v >= 6 the info is read from the "<pgid>_info" omap key of infos_oid
+// and the big info (past_intervals, snap_collections, purged_snaps) from the
+// same key of biginfos_oid. Returns 0 on success, or the negative
+// store->read() result on the pre-v6 biginfo path.
+// NOTE(review): not every line of this function is visible in this hunk; the
+// comments cover only the lines shown.
int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl,
- pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals,
- hobject_t &biginfo_oid, interval_set<snapid_t> &snap_collections)
+ pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, hobject_t &biginfo_oid,
+ hobject_t &infos_oid, hobject_t &biginfos_oid, interval_set<snapid_t> &snap_collections,
+ __u8 &struct_v)
{
bufferlist::iterator p = bl.begin();
- __u8 struct_v;
+ // Scratch buffer for secondary reads, so bl itself is no longer cleared.
+ bufferlist lbl;
// info
::decode(struct_v, p);
::decode(past_intervals, p);
// snap_collections
- bl.clear();
- store->collection_getattr(coll, "snap_collections", bl);
- p = bl.begin();
+ store->collection_getattr(coll, "snap_collections", lbl);
+ p = lbl.begin();
::decode(struct_v, p);
} else {
- bl.clear();
- int r = store->read(coll_t::META_COLL, biginfo_oid, 0, 0, bl);
- if (r < 0)
- return r;
- p = bl.begin();
- ::decode(past_intervals, p);
+ if (struct_v < 6) {
+ int r = store->read(coll_t::META_COLL, biginfo_oid, 0, 0, lbl);
+ if (r < 0)
+ return r;
+ p = lbl.begin();
+ ::decode(past_intervals, p);
+ } else {
+ // get info out of leveldb
+ // NOTE(review): info.pgid is used to build the key before info is
+ // decoded below -- presumably the caller has already set it; verify.
+ string k = stringify(info.pgid) + string("_info");
+ set<string> keys;
+ keys.insert(k);
+ map<string,bufferlist> values;
+ store->omap_get_values(coll_t::META_COLL, infos_oid, keys, &values);
+ assert(values.size() == 1);
+ lbl = values[k];
+ p = lbl.begin();
+ ::decode(info, p);
+
+ // biginfo
+ values.clear();
+ store->omap_get_values(coll_t::META_COLL, biginfos_oid, keys, &values);
+ assert(values.size() == 1);
+ lbl = values[k];
+ p = lbl.begin();
+ ::decode(past_intervals, p);
+ }
}
if (struct_v < 3) {
}
} else {
::decode(snap_collections, p);
- if (struct_v >= 4)
+ if (struct_v >= 4 && struct_v < 6)
::decode(info, p);
+ // For v6 the blob holds only purged_snaps here, not a full pg_info_t.
+ else if (struct_v >= 6)
+ ::decode(info.purged_snaps, p);
}
return 0;
}
void PG::read_state(ObjectStore *store, bufferlist &bl)
{
int r = read_info(store, coll, bl, info, past_intervals, biginfo_oid,
- snap_collections);
+ osd->infos_oid, osd->biginfos_oid, snap_collections, info_struct_v);
assert(r >= 0);
try {
if (!lastmap) {
dout(10) << " no lastmap" << dendl;
dirty_info = true;
+ dirty_big_info = true;
} else {
bool new_interval = pg_interval_t::check_new_interval(
oldacting, newacting,
if (new_interval) {
dout(10) << " noting past " << past_intervals.rbegin()->second << dendl;
dirty_info = true;
+ dirty_big_info = true;
}
}
adjust_local_snaps();
}
dirty_info = true;
+ dirty_big_info = true;
}
}
pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
dout(10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
pg->dirty_info = true;
+ pg->dirty_big_info = true;
}
pg->check_recovery_sources(pg->get_osdmap());
pg->info = msg->info;
pg->reg_next_scrub();
pg->dirty_info = true;
+ pg->dirty_big_info = true; // maybe.
pg->dirty_log = true;
pg->log.claim_log(msg->log);
pg->missing.clear();
+ // Release _lock. dirty_info, dirty_big_info and dirty_log must all be clear
+ // by this point (asserted below).
void unlock() {
//generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
assert(!dirty_info);
+ assert(!dirty_big_info);
assert(!dirty_log);
_lock.Unlock();
}
}
- bool dirty_info, dirty_log;
+ bool dirty_info, dirty_big_info, dirty_log;
public:
// pg state
pg_info_t info;
+ __u8 info_struct_v;
const coll_t coll;
IndexedLog log;
hobject_t log_oid;
std::string get_corrupt_pg_log_name() const;
static int read_info(ObjectStore *store, const coll_t coll,
- bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals,
- hobject_t &biginfo_oid, interval_set<snapid_t> &snap_collections);
+ bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, hobject_t &biginfo_oid,
+ hobject_t &infos_oid, hobject_t &biginfos_oid, interval_set<snapid_t> &snap_collections, __u8 &);
void read_state(ObjectStore *store, bufferlist &bl);
- static epoch_t peek_map_epoch(ObjectStore *store,
- coll_t coll, bufferlist *bl);
+ static epoch_t peek_map_epoch(ObjectStore *store, coll_t coll,
+ hobject_t &infos_oid, bufferlist *bl);
coll_t make_snap_collection(ObjectStore::Transaction& t, snapid_t sn);
void update_snap_collections(vector<pg_log_entry_t> &log_entries,
ObjectStore::Transaction& t);