git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: write potentially large pg info to object, not xattr [format change]
author: Sage Weil <sage@newdream.net>
Fri, 29 Oct 2010 22:28:37 +0000 (15:28 -0700)
committer: Sage Weil <sage@newdream.net>
Fri, 29 Oct 2010 22:29:16 +0000 (15:29 -0700)
Write past_intervals and snap_collections to a separate object instead of
an attr on the collection directory.  This avoids exceeding the underlying
filesystem xattr limits during thrashing recovery.  The struct_v on the
small info xattr is used to indicate the format of the biginfo object.

Also fixed pg deletion to clean out log and biginfo objects.

Added incompat format flag.

Signed-off-by: Sage Weil <sage@newdream.net>
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/ReplicatedPG.h
src/osd/osd_types.h

index f40e80449097520d2fb3269c0aeca34c1975820a..1a238bc98d2420dd93a4ab742d15c86022847087 100644 (file)
@@ -108,6 +108,7 @@ const struct CompatSet::Feature ceph_osd_feature_compat[] = {
 };
 const struct CompatSet::Feature ceph_osd_feature_incompat[] = {
   CEPH_OSD_FEATURE_INCOMPAT_BASE,
+  CEPH_OSD_FEATURE_INCOMPAT_PGINFO,
   END_FEATURE
 };
 const struct CompatSet::Feature ceph_osd_feature_ro_compat[] = {
@@ -871,8 +872,9 @@ PG *OSD::_open_lock_pg(pg_t pgid, bool no_lockdep_check)
   // create
   PG *pg;
   sobject_t logoid = make_pg_log_oid(pgid);
+  sobject_t infooid = make_pg_biginfo_oid(pgid);
   if (osdmap->get_pg_type(pgid) == CEPH_PG_TYPE_REP)
-    pg = new ReplicatedPG(this, pool, pgid, logoid);
+    pg = new ReplicatedPG(this, pool, pgid, logoid, infooid);
   //else if (pgid.is_raid4())
   //pg = new RAID4PG(this, pgid);
   else 
@@ -4138,6 +4140,8 @@ void OSD::_remove_pg(PG *pg)
   dout(10) << "_remove_pg " << pgid << " removing final" << dendl;
 
   {
+    rmt->remove(coll_t::META_COLL, pg->log_oid);
+    rmt->remove(coll_t::META_COLL, pg->biginfo_oid);
     rmt->remove_collection(coll_t(pgid));
     int tr = store->queue_transaction(NULL, rmt);
     assert(tr == 0);
index 93f1d0263c0b71a335a033d9c70bc37d7ec9c80a..971db37df73149254a8a1af5407ed96056d7cf7d 100644 (file)
@@ -160,6 +160,14 @@ public:
     return sobject_t(object_t(s.c_str()), 0);
   }
   
+  sobject_t make_pg_biginfo_oid(pg_t pg) {  // names the object "pginfo_<pg>"
+    stringstream name_stream;
+    name_stream << "pginfo_" << pg;
+    string oid_name;
+    getline(name_stream, oid_name);
+    return sobject_t(object_t(oid_name.c_str()), 0);
+  }
+  
 
 private:
   // -- superblock --
index c759d21de4541e3d40d3d402648eaa0cd804ff2d..1bd0bdf8793e89c05dfa9d240210457d15c00784 100644 (file)
@@ -2012,20 +2012,19 @@ void PG::write_info(ObjectStore::Transaction& t)
 {
   // pg state
   bufferlist infobl;
-  __u8 struct_v = 1;
+  __u8 struct_v = 2;
   ::encode(struct_v, infobl);
   ::encode(info, infobl);
-  ::encode(past_intervals, infobl);
   dout(20) << "write_info info " << infobl.length() << dendl;
   t.collection_setattr(coll, "info", infobl);
  
-  // local state
-  bufferlist snapbl;
-  struct_v = 1;
-  ::encode(struct_v, snapbl);
-  ::encode(snap_collections, snapbl);
-  dout(20) << "write_info snap " << snapbl.length() << dendl;
-  t.collection_setattr(coll, "snap_collections", snapbl);
+  // potentially big stuff
+  bufferlist bigbl;
+  ::encode(past_intervals, bigbl);
+  ::encode(snap_collections, bigbl);
+  dout(20) << "write_info bigbl " << bigbl.length() << dendl;
+  t.truncate(coll_t::META_COLL, biginfo_oid, 0);
+  t.write(coll_t::META_COLL, biginfo_oid, 0, bigbl.length(), bigbl);
 
   dirty_info = false;
 }
@@ -2441,14 +2440,22 @@ void PG::read_state(ObjectStore *store)
   p = bl.begin();
   ::decode(struct_v, p);
   ::decode(info, p);
-  ::decode(past_intervals, p);
+  if (struct_v < 2) {
+    ::decode(past_intervals, p);
   
-  // snap_collections
-  bl.clear();
-  store->collection_getattr(coll, "snap_collections", bl);
-  p = bl.begin();
-  ::decode(struct_v, p);
-  ::decode(snap_collections, p);
+    // snap_collections
+    bl.clear();
+    store->collection_getattr(coll, "snap_collections", bl);
+    p = bl.begin();
+    ::decode(struct_v, p);
+    ::decode(snap_collections, p);
+  } else {
+    bl.clear();
+    store->read(coll_t::META_COLL, biginfo_oid, 0, 0, bl);
+    p = bl.begin();
+    ::decode(past_intervals, p);
+    ::decode(snap_collections, p);
+  }
 
   try {
     read_log(store);
index 257185ec9c1603eb3e0e1d0b4fc5169539ba9ad6..f4cb65651e6f7bc5f2fca29592482df6a9da20f4 100644 (file)
@@ -670,6 +670,7 @@ public:
   const coll_t coll;
   IndexedLog  log;
   sobject_t    log_oid;
+  sobject_t    biginfo_oid;
   OndiskLog   ondisklog;
   Missing     missing;
   map<sobject_t, set<int> > missing_loc;
@@ -853,11 +854,11 @@ public:
 
 
  public:  
-  PG(OSD *o, PGPool *_pool, pg_t p, const sobject_t& oid) : 
+  PG(OSD *o, PGPool *_pool, pg_t p, const sobject_t& loid, const sobject_t& ioid) : 
     osd(o), pool(_pool),
     _lock("PG::_lock"),
     ref(0), deleting(false), dirty_info(false), dirty_log(false),
-    info(p), coll(p), log_oid(oid),
+    info(p), coll(p), log_oid(loid), biginfo_oid(ioid),
     recovery_item(this), backlog_item(this), scrub_item(this), snap_trim_item(this), remove_item(this), stat_queue_item(this),
     recovery_ops_active(0),
     generate_backlog_epoch(0),
index 065ee2c9b646b88b1e30e0d049ce05e70325595e..c57bc0b1160b9a2963bc708196c913c6366ca767 100644 (file)
@@ -587,8 +587,8 @@ protected:
   int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr);
 
 public:
-  ReplicatedPG(OSD *o, PGPool *_pool, pg_t p, const sobject_t& oid) : 
-    PG(o, _pool, p, oid)
+  ReplicatedPG(OSD *o, PGPool *_pool, pg_t p, const sobject_t& oid, const sobject_t& ioid) : 
+    PG(o, _pool, p, oid, ioid)
   { }
   ~ReplicatedPG() {}
 
index 706e647a923912424fa9cef7478259032f3eec29..e0081499a5752857c9ce9482d096340e3d10350b 100644 (file)
@@ -32,6 +32,7 @@
 #define CEPH_OSD_FULL_RATIO .95
 
 #define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
+#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object") /* id must differ from INCOMPAT_BASE (1) */
 
 
 /* osdreqid_t - caller name + incarnation# + tid to unique identify this request