git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/PG: store pg_info_t in leveldb (omap), purged_snaps separately
author David Zafman <david.zafman@inktank.com>
Mon, 11 Feb 2013 20:20:23 +0000 (12:20 -0800)
committer David Zafman <david.zafman@inktank.com>
Wed, 13 Feb 2013 01:18:25 +0000 (17:18 -0800)
Separate the purged_snaps portion of pg_info_t (the one that gets big).

Feature #3891: osd: move purged_snaps out of info

Add a separate dirty_big_info flag so that we only update the pginfo
"biginfo" file if that state changes.  This lets us avoid the cost in the
general case, like a regular PG write.

Add LEVELDBINFO feature

Put info, biginfo in leveldb
Move epoch to omap

Feature #3892: osd: move pg info into leveldb

Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Just <sam.just@inktank.com>
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/osd_types.h
src/tools/ceph-filestore-dump.cc

index fe85532db0b2ee7d4f56ad690941b1d5df86b678..243eccf8048b3358eb9e301b6923c0c6e915bbac 100644 (file)
@@ -139,6 +139,7 @@ static CompatSet get_osd_compat_set() {
   ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
   ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
   ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
   return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
 }
@@ -147,6 +148,8 @@ OSDService::OSDService(OSD *osd) :
   osd(osd),
   whoami(osd->whoami), store(osd->store), clog(osd->clog),
   pg_recovery_stats(osd->pg_recovery_stats),
+  infos_oid(sobject_t("infos", CEPH_NOSNAP)),
+  biginfos_oid(sobject_t("biginfos", CEPH_NOSNAP)),
   cluster_messenger(osd->cluster_messenger),
   client_messenger(osd->client_messenger),
   logger(osd->logger),
@@ -1587,7 +1590,7 @@ void OSD::load_pgs()
 
     dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
     bufferlist bl;
-    epoch_t map_epoch = PG::peek_map_epoch(store, coll_t(pgid), &bl);
+    epoch_t map_epoch = PG::peek_map_epoch(store, coll_t(pgid), service.infos_oid, &bl);
 
     PG *pg = _open_lock_pg(map_epoch == 0 ? osdmap : service.get_map(map_epoch), pgid);
 
@@ -1619,6 +1622,16 @@ void OSD::load_pgs()
   }
   dout(10) << "load_pgs done" << dendl;
 
+  // make sure info objects exist
+  if (!store->exists(coll_t::META_COLL, service.infos_oid) ||
+      !store->exists(coll_t::META_COLL, service.biginfos_oid)) {
+    dout(10) << "load_pgs creating/touching infos, biginfos objects" << dendl;
+    ObjectStore::Transaction t;
+    t.touch(coll_t::META_COLL, service.infos_oid);
+    t.touch(coll_t::META_COLL, service.biginfos_oid);
+    store->apply_transaction(t);
+  }
+  
   build_past_intervals_parallel();
 }
 
index 015694b8074b82a068a71ab53940489b20fada51..c116d4b912a741f6053b63ba901d05f2014b2a47 100644 (file)
@@ -169,6 +169,7 @@ public:
   ObjectStore *&store;
   LogClient &clog;
   PGRecoveryStats &pg_recovery_stats;
+  hobject_t infos_oid, biginfos_oid;
 private:
   Messenger *&cluster_messenger;
   Messenger *&client_messenger;
index 70a08bc388c68c6d436d38e5730748ef5538d46b..bf172d690f5642ec010037128e26a5e88f36a2cf 100644 (file)
@@ -63,7 +63,7 @@ PG::PG(OSDService *o, OSDMapRef curmap,
        const hobject_t& ioid) :
   osd(o), osdmap_ref(curmap), pool(_pool),
   _lock("PG::_lock"),
-  ref(0), deleting(false), dirty_info(false), dirty_log(false),
+  ref(0), deleting(false), dirty_info(false), dirty_big_info(false), dirty_log(false),
   info(p), coll(p), log_oid(loid), biginfo_oid(ioid),
   recovery_item(this), scrub_item(this), scrub_finalize_item(this), snap_trim_item(this), stat_queue_item(this),
   recovery_ops_active(0),
@@ -97,6 +97,7 @@ void PG::lock(bool no_lockdep)
   _lock.Lock(no_lockdep);
   // if we have unrecorded dirty state with the lock dropped, there is a bug
   assert(!dirty_info);
+  assert(!dirty_big_info);
   assert(!dirty_log);
 
   dout(30) << "lock" << dendl;
@@ -107,6 +108,7 @@ void PG::lock_with_map_lock_held(bool no_lockdep)
   _lock.Lock(no_lockdep);
   // if we have unrecorded dirty state with the lock dropped, there is a bug
   assert(!dirty_info);
+  assert(!dirty_big_info);
   assert(!dirty_log);
 
   dout(30) << "lock_with_map_lock_held" << dendl;
@@ -462,6 +464,7 @@ void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
     merge_old_entry(t, *d);
 
   dirty_info = true;
+  dirty_big_info = true;
   dirty_log = true;
 }
 
@@ -597,6 +600,7 @@ void PG::merge_log(ObjectStore::Transaction& t,
 
   if (changed) {
     dirty_info = true;
+    dirty_big_info = true;
     dirty_log = true;
   }
 }
@@ -881,6 +885,7 @@ void PG::generate_past_intervals()
 
   // record our work.
   dirty_info = true;
+  dirty_big_info = true;
 }
 
 /*
@@ -897,6 +902,7 @@ void PG::trim_past_intervals()
       return;
     dout(10) << __func__ << ": trimming " << pif->second << dendl;
     past_intervals.erase(pif++);
+    dirty_big_info = true;
   }
 }
 
@@ -1409,6 +1415,7 @@ void PG::activate(ObjectStore::Transaction& t,
 
   // write pg info, log
   dirty_info = true;
+  dirty_big_info = true; // maybe
   dirty_log = true;
 
   // clean up stray objects
@@ -2061,8 +2068,10 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
   _split_into(child_pgid, child, split_bits);
 
   child->dirty_info = true;
+  child->dirty_big_info = true;
   child->dirty_log = true;
   dirty_info = true;
+  dirty_big_info = true;
   dirty_log = true;
 }
 
@@ -2314,27 +2323,52 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t
 void PG::write_info(ObjectStore::Transaction& t)
 {
   // pg state
-  bufferlist infobl;
-  __u8 struct_v = 5;
-  ::encode(struct_v, infobl);
-  ::encode(get_osdmap()->get_epoch(), infobl);
-  t.collection_setattr(coll, "info", infobl);
+  __u8 cur_struct_v = 6;
+
+  assert(info_struct_v <= cur_struct_v);
+
+  // Only need to write struct_v to attr when upgrading
+  if (info_struct_v < cur_struct_v) {
+    bufferlist attrbl;
+    info_struct_v = cur_struct_v;
+    ::encode(info_struct_v, attrbl);
+    t.collection_setattr(coll, "info", attrbl);
+  }
+
+  // info.  store purged_snaps separately.
+  interval_set<snapid_t> purged_snaps;
+  map<string,bufferlist> v;
+  string k = stringify(info.pgid) + string("_info");
+  string ek = stringify(info.pgid) + string("_epoch");
+  ::encode(get_osdmap()->get_epoch(), v[ek]);
+  purged_snaps.swap(info.purged_snaps);
+  ::encode(info, v[k]);
+  purged_snaps.swap(info.purged_snaps);
+
+  t.omap_setkeys(coll_t::META_COLL, osd->infos_oid, v);
  
-  // potentially big stuff
-  bufferlist bigbl;
-  ::encode(past_intervals, bigbl);
-  ::encode(snap_collections, bigbl);
-  ::encode(info, bigbl);
-  dout(20) << "write_info bigbl " << bigbl.length() << dendl;
-  t.truncate(coll_t::META_COLL, biginfo_oid, 0);
-  t.write(coll_t::META_COLL, biginfo_oid, 0, bigbl.length(), bigbl);
+  if (dirty_big_info) {
+    // potentially big stuff
+    v.clear();
+    bufferlist& bigbl = v[k];
+    ::encode(past_intervals, bigbl);
+    ::encode(snap_collections, bigbl);
+    ::encode(info.purged_snaps, bigbl);
+    dout(20) << "write_info bigbl " << bigbl.length() << dendl;
+    t.omap_setkeys(coll_t::META_COLL, osd->biginfos_oid, v);
+  }
 
   dirty_info = false;
+  dirty_big_info = false;
 }
 
-epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl)
+epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid, bufferlist *bl)
 {
   assert(bl);
+  pg_t pgid;
+  snapid_t snap;
+  bool ok = coll.is_pg(pgid, snap);
+  assert(ok);
   store->collection_getattr(coll, "info", *bl);
   bufferlist::iterator bp = bl->begin();
   __u8 struct_v = 0;
@@ -2342,7 +2376,21 @@ epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl)
   if (struct_v < 5)
     return 0;
   epoch_t cur_epoch = 0;
-  ::decode(cur_epoch, bp);
+  if (struct_v < 6) {
+    ::decode(cur_epoch, bp);
+  } else {
+    // get epoch out of leveldb
+    bufferlist tmpbl;
+    string ek = stringify(pgid) + string("_epoch");
+    set<string> keys;
+    keys.insert(ek);
+    map<string,bufferlist> values;
+    store->omap_get_values(coll_t::META_COLL, infos_oid, keys, &values);
+    assert(values.size() == 1);
+    tmpbl = values[ek];
+    bufferlist::iterator p = tmpbl.begin();
+    ::decode(cur_epoch, p);
+  }
   return cur_epoch;
 }
 
@@ -2596,11 +2644,12 @@ std::string PG::get_corrupt_pg_log_name() const
 }
 
 int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl,
-  pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals,
-  hobject_t &biginfo_oid, interval_set<snapid_t>  &snap_collections)
+  pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, hobject_t &biginfo_oid,
+  hobject_t &infos_oid, hobject_t &biginfos_oid, interval_set<snapid_t>  &snap_collections,
+  __u8 &struct_v)
 {
   bufferlist::iterator p = bl.begin();
-  __u8 struct_v;
+  bufferlist lbl;
 
   // info
   ::decode(struct_v, p);
@@ -2610,17 +2659,36 @@ int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl,
     ::decode(past_intervals, p);
   
     // snap_collections
-    bl.clear();
-    store->collection_getattr(coll, "snap_collections", bl);
-    p = bl.begin();
+    store->collection_getattr(coll, "snap_collections", lbl);
+    p = lbl.begin();
     ::decode(struct_v, p);
   } else {
-    bl.clear();
-    int r = store->read(coll_t::META_COLL, biginfo_oid, 0, 0, bl);
-    if (r < 0)
-       return r;
-    p = bl.begin();
-    ::decode(past_intervals, p);
+    if (struct_v < 6) {
+      int r = store->read(coll_t::META_COLL, biginfo_oid, 0, 0, lbl);
+      if (r < 0)
+        return r;
+      p = lbl.begin();
+      ::decode(past_intervals, p);
+    } else {
+      // get info out of leveldb
+      string k = stringify(info.pgid) + string("_info");
+      set<string> keys;
+      keys.insert(k);
+      map<string,bufferlist> values;
+      store->omap_get_values(coll_t::META_COLL, infos_oid, keys, &values);
+      assert(values.size() == 1);
+      lbl = values[k];
+      p = lbl.begin();
+      ::decode(info, p);
+
+      // biginfo
+      values.clear();
+      store->omap_get_values(coll_t::META_COLL, biginfos_oid, keys, &values);
+      assert(values.size() == 1);
+      lbl = values[k];
+      p = lbl.begin();
+      ::decode(past_intervals, p);
+    }
   }
 
   if (struct_v < 3) {
@@ -2634,8 +2702,10 @@ int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl,
     }
   } else {
     ::decode(snap_collections, p);
-    if (struct_v >= 4)
+    if (struct_v >= 4 && struct_v < 6)
       ::decode(info, p);
+    else if (struct_v >= 6)
+      ::decode(info.purged_snaps, p);
   }
   return 0;
 }
@@ -2643,7 +2713,7 @@ int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl,
 void PG::read_state(ObjectStore *store, bufferlist &bl)
 {
   int r = read_info(store, coll, bl, info, past_intervals, biginfo_oid,
-    snap_collections);
+    osd->infos_oid, osd->biginfos_oid, snap_collections, info_struct_v);
   assert(r >= 0);
 
   try {
@@ -4564,6 +4634,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
   if (!lastmap) {
     dout(10) << " no lastmap" << dendl;
     dirty_info = true;
+    dirty_big_info = true;
   } else {
     bool new_interval = pg_interval_t::check_new_interval(
       oldacting, newacting,
@@ -4575,6 +4646,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
     if (new_interval) {
       dout(10) << " noting past " << past_intervals.rbegin()->second << dendl;
       dirty_info = true;
+      dirty_big_info = true;
     }
   }
 
@@ -4689,6 +4761,7 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
       adjust_local_snaps();
     }
     dirty_info = true;
+    dirty_big_info = true;
   }
 }
 
@@ -6054,6 +6127,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
     pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
     dout(10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
     pg->dirty_info = true;
+    pg->dirty_big_info = true;
   }
   pg->check_recovery_sources(pg->get_osdmap());
 
@@ -6371,6 +6445,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
     pg->info = msg->info;
     pg->reg_next_scrub();
     pg->dirty_info = true;
+    pg->dirty_big_info = true;  // maybe.
     pg->dirty_log = true;
     pg->log.claim_log(msg->log);
     pg->missing.clear();
index 7e5fc58f06735527fe836969946416774d18dcbe..bd45f8b5270f61e0945e0848387a6ed8ea239b2f 100644 (file)
@@ -377,6 +377,7 @@ public:
   void unlock() {
     //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
     assert(!dirty_info);
+    assert(!dirty_big_info);
     assert(!dirty_log);
     _lock.Unlock();
   }
@@ -415,11 +416,12 @@ public:
   }
 
 
-  bool dirty_info, dirty_log;
+  bool dirty_info, dirty_big_info, dirty_log;
 
 public:
   // pg state
   pg_info_t        info;
+  __u8 info_struct_v;
   const coll_t coll;
   IndexedLog  log;
   hobject_t    log_oid;
@@ -1784,11 +1786,11 @@ public:
 
   std::string get_corrupt_pg_log_name() const;
   static int read_info(ObjectStore *store, const coll_t coll,
-    bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals,
-    hobject_t &biginfo_oid, interval_set<snapid_t>  &snap_collections);
+    bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, hobject_t &biginfo_oid,
+    hobject_t &infos_oid, hobject_t &biginfos_oid, interval_set<snapid_t>  &snap_collections, __u8 &);
   void read_state(ObjectStore *store, bufferlist &bl);
-  static epoch_t peek_map_epoch(ObjectStore *store,
-                               coll_t coll, bufferlist *bl);
+  static epoch_t peek_map_epoch(ObjectStore *store, coll_t coll,
+                               hobject_t &infos_oid, bufferlist *bl);
   coll_t make_snap_collection(ObjectStore::Transaction& t, snapid_t sn);
   void update_snap_collections(vector<pg_log_entry_t> &log_entries,
                               ObjectStore::Transaction& t);
index 4d8789755a89f986963d3943587d5f5d57b65f01..66818bc07570b70b1b014a4dd7343710a7ce98d0 100644 (file)
@@ -38,6 +38,7 @@
 #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES  CompatSet::Feature(5, "categories")
 #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL  CompatSet::Feature(6, "hobjectpool")
 #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
+#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
 
 
 typedef hobject_t collection_list_handle_t;
index d7f5f1773ee8b774812738ea7ede47884d14f1de..612ddee235e6039cb13ca1f2c3138ea23315d879 100644 (file)
@@ -204,24 +204,30 @@ int main(int argc, char **argv)
       continue;
     }
 
+    //XXX: This needs OSD function to generate
+    hobject_t biginfos_oid(sobject_t("biginfos", CEPH_NOSNAP));
+    hobject_t infos_oid(sobject_t("infos", CEPH_NOSNAP));
     bufferlist bl;
-    epoch_t map_epoch = PG::peek_map_epoch(fs, coll, &bl);
+    epoch_t map_epoch = PG::peek_map_epoch(fs, coll, infos_oid, &bl);
     (void)map_epoch;
 
     found = true;
 
-    pg_info_t info;
+    pg_info_t info(pgid);
     map<epoch_t,pg_interval_t> past_intervals;
     hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
     interval_set<snapid_t> snap_collections;
 
+    __u8 struct_v;
     int r = PG::read_info(fs, coll, bl, info, past_intervals, biginfo_oid,
-      snap_collections);
+      infos_oid, biginfos_oid, snap_collections, struct_v);
     if (r < 0) {
       cerr << "read_info error " << cpp_strerror(-r) << std::endl;
       ret = 1;
       continue;
     }
+    if (vm.count("debug"))
+      cout << "struct_v " << (int)struct_v << std::endl;
 
     if (type == "info") {
       formatter->open_object_section("info");