From 73c6e4cca7d8265e1e478e83d97a638cc7fa6a24 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 29 Oct 2010 15:28:37 -0700 Subject: [PATCH] osd: write potentially large pg info to object, not xattr [format change] Write past_intervals and snap_collections to a separate object instead of an attr on the collection directory. This avoids exceeding the underlying filesystem xattr limits during thrashing recovery. The struct_v on the small info xattr is used to indicate the format of the biginfo object. Also fixed pg deletion to clean out log and biginfo objects. Added incompat format flag. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 6 +++++- src/osd/OSD.h | 8 ++++++++ src/osd/PG.cc | 39 +++++++++++++++++++++++---------------- src/osd/PG.h | 5 +++-- src/osd/ReplicatedPG.h | 4 ++-- src/osd/osd_types.h | 1 + 6 files changed, 42 insertions(+), 21 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f40e804490975..1a238bc98d242 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -108,6 +108,7 @@ const struct CompatSet::Feature ceph_osd_feature_compat[] = { }; const struct CompatSet::Feature ceph_osd_feature_incompat[] = { CEPH_OSD_FEATURE_INCOMPAT_BASE, + CEPH_OSD_FEATURE_INCOMPAT_PGINFO, END_FEATURE }; const struct CompatSet::Feature ceph_osd_feature_ro_compat[] = { @@ -871,8 +872,9 @@ PG *OSD::_open_lock_pg(pg_t pgid, bool no_lockdep_check) // create PG *pg; sobject_t logoid = make_pg_log_oid(pgid); + sobject_t infooid = make_pg_biginfo_oid(pgid); if (osdmap->get_pg_type(pgid) == CEPH_PG_TYPE_REP) - pg = new ReplicatedPG(this, pool, pgid, logoid); + pg = new ReplicatedPG(this, pool, pgid, logoid, infooid); //else if (pgid.is_raid4()) //pg = new RAID4PG(this, pgid); else @@ -4138,6 +4140,8 @@ void OSD::_remove_pg(PG *pg) dout(10) << "_remove_pg " << pgid << " removing final" << dendl; { + rmt->remove(coll_t::META_COLL, pg->log_oid); + rmt->remove(coll_t::META_COLL, pg->biginfo_oid); rmt->remove_collection(coll_t(pgid)); int tr = 
store->queue_transaction(NULL, rmt); assert(tr == 0); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 93f1d0263c0b7..971db37df7314 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -160,6 +160,14 @@ public: return sobject_t(object_t(s.c_str()), 0); } + sobject_t make_pg_biginfo_oid(pg_t pg) { + stringstream ss; + ss << "pginfo_" << pg; + string s; + getline(ss, s); + return sobject_t(object_t(s.c_str()), 0); + } + private: // -- superblock -- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index c759d21de4541..1bd0bdf8793e8 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2012,20 +2012,19 @@ void PG::write_info(ObjectStore::Transaction& t) { // pg state bufferlist infobl; - __u8 struct_v = 1; + __u8 struct_v = 2; ::encode(struct_v, infobl); ::encode(info, infobl); - ::encode(past_intervals, infobl); dout(20) << "write_info info " << infobl.length() << dendl; t.collection_setattr(coll, "info", infobl); - // local state - bufferlist snapbl; - struct_v = 1; - ::encode(struct_v, snapbl); - ::encode(snap_collections, snapbl); - dout(20) << "write_info snap " << snapbl.length() << dendl; - t.collection_setattr(coll, "snap_collections", snapbl); + // potentially big stuff + bufferlist bigbl; + ::encode(past_intervals, bigbl); + ::encode(snap_collections, bigbl); + dout(20) << "write_info bigbl " << bigbl.length() << dendl; + t.truncate(coll_t::META_COLL, biginfo_oid, 0); + t.write(coll_t::META_COLL, biginfo_oid, 0, bigbl.length(), bigbl); dirty_info = false; } @@ -2441,14 +2440,22 @@ void PG::read_state(ObjectStore *store) p = bl.begin(); ::decode(struct_v, p); ::decode(info, p); - ::decode(past_intervals, p); + if (struct_v < 2) { + ::decode(past_intervals, p); - // snap_collections - bl.clear(); - store->collection_getattr(coll, "snap_collections", bl); - p = bl.begin(); - ::decode(struct_v, p); - ::decode(snap_collections, p); + // snap_collections + bl.clear(); + store->collection_getattr(coll, "snap_collections", bl); + p = bl.begin(); + ::decode(struct_v, p); + 
::decode(snap_collections, p); + } else { + bl.clear(); + store->read(coll_t::META_COLL, biginfo_oid, 0, 0, bl); + p = bl.begin(); + ::decode(past_intervals, p); + ::decode(snap_collections, p); + } try { read_log(store); diff --git a/src/osd/PG.h b/src/osd/PG.h index 257185ec9c160..f4cb65651e6f7 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -670,6 +670,7 @@ public: const coll_t coll; IndexedLog log; sobject_t log_oid; + sobject_t biginfo_oid; OndiskLog ondisklog; Missing missing; map > missing_loc; @@ -853,11 +854,11 @@ public: public: - PG(OSD *o, PGPool *_pool, pg_t p, const sobject_t& oid) : + PG(OSD *o, PGPool *_pool, pg_t p, const sobject_t& loid, const sobject_t& ioid) : osd(o), pool(_pool), _lock("PG::_lock"), ref(0), deleting(false), dirty_info(false), dirty_log(false), - info(p), coll(p), log_oid(oid), + info(p), coll(p), log_oid(loid), biginfo_oid(ioid), recovery_item(this), backlog_item(this), scrub_item(this), snap_trim_item(this), remove_item(this), stat_queue_item(this), recovery_ops_active(0), generate_backlog_epoch(0), diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 065ee2c9b646b..c57bc0b1160b9 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -587,8 +587,8 @@ protected: int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr); public: - ReplicatedPG(OSD *o, PGPool *_pool, pg_t p, const sobject_t& oid) : - PG(o, _pool, p, oid) + ReplicatedPG(OSD *o, PGPool *_pool, pg_t p, const sobject_t& oid, const sobject_t& ioid) : + PG(o, _pool, p, oid, ioid) { } ~ReplicatedPG() {} diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 706e647a92391..e0081499a5752 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -32,6 +32,7 @@ #define CEPH_OSD_FULL_RATIO .95 #define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)") +#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object") /* id 2: must not reuse INCOMPAT_BASE's id 1, or the pginfo format change is indistinguishable from the base feature */ /* osdreqid_t - caller name + incarnation# + tid to unique identify this
request -- 2.39.5