git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
OSD: store current pg epoch in info and load at that epoch
author: Samuel Just <sam.just@inktank.com>
Thu, 6 Dec 2012 01:05:38 +0000 (17:05 -0800)
committer: Samuel Just <sam.just@inktank.com>
Fri, 7 Dec 2012 06:53:07 +0000 (22:53 -0800)
Prior to split, the epoch at which a PG was loaded did not matter.
With split, however, it's crucial that a PG go through advance_pg()
for the map causing the split.  During operation, a PG's epoch lags
the OSD superblock epoch.  If the OSD dies after the OSD epoch passes
the split but before the PG epoch passes the split, the PG will be
reloaded at the OSD epoch and won't see the split operation.
From that point on, the PG collection might contain objects
which should have been split into a child PG.

Signed-off-by: Samuel Just <sam.just@inktank.com>
src/osd/OSD.cc
src/osd/PG.cc
src/osd/PG.h

index b4df253120f6218843a8d4d3bced10bbfdb88c72..58894c8609de644c24377f5ee6cfdb9f3e2617bd 100644 (file)
@@ -1512,10 +1512,21 @@ void OSD::load_pgs()
       continue;
     }
 
-    PG *pg = _open_lock_pg(osdmap, pgid);
+    bufferlist bl;
+    epoch_t map_epoch = PG::peek_map_epoch(store, *it, &bl);
+
+    PG *pg = _open_lock_pg(map_epoch == 0 ? osdmap : service.get_map(map_epoch), pgid);
 
     // read pg state, log
-    pg->read_state(store);
+    pg->read_state(store, bl);
+
+    set<pg_t> split_pgs;
+    if (osdmap->have_pg_pool(pg->info.pgid.pool()) &&
+       pg->info.pgid.is_split(pg->get_osdmap()->get_pg_num(pg->info.pgid.pool()),
+                              osdmap->get_pg_num(pg->info.pgid.pool()),
+                              &split_pgs)) {
+      service.start_split(split_pgs);
+    }
 
     service.reg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp);
 
index 11f43ccdb67dabe16455f8a9bc468d9bee68230e..49d12ea35ef142b9b87a16662605346cd30fe2d0 100644 (file)
@@ -2294,8 +2294,9 @@ void PG::write_info(ObjectStore::Transaction& t)
 {
   // pg state
   bufferlist infobl;
-  __u8 struct_v = 4;
+  __u8 struct_v = 5;
   ::encode(struct_v, infobl);
+  ::encode(get_osdmap()->get_epoch(), infobl);
   t.collection_setattr(coll, "info", infobl);
  
   // potentially big stuff
@@ -2310,6 +2311,20 @@ void PG::write_info(ObjectStore::Transaction& t)
   dirty_info = false;
 }
 
+epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl)
+{
+  assert(bl);
+  store->collection_getattr(coll, "info", *bl);
+  bufferlist::iterator bp = bl->begin();
+  __u8 struct_v = 0;
+  ::decode(struct_v, bp);
+  if (struct_v < 5)
+    return 0;
+  epoch_t cur_epoch = 0;
+  ::decode(cur_epoch, bp);
+  return cur_epoch;
+}
+
 void PG::write_log(ObjectStore::Transaction& t)
 {
   dout(10) << "write_log" << dendl;
@@ -2756,15 +2771,12 @@ std::string PG::get_corrupt_pg_log_name() const
   return buf;
 }
 
-void PG::read_state(ObjectStore *store)
+void PG::read_state(ObjectStore *store, bufferlist &bl)
 {
-  bufferlist bl;
-  bufferlist::iterator p;
+  bufferlist::iterator p = bl.begin();
   __u8 struct_v;
 
   // info
-  store->collection_getattr(coll, "info", bl);
-  p = bl.begin();
   ::decode(struct_v, p);
   if (struct_v < 4)
     ::decode(info, p);
index f0e57eb120f926065c0c1ddb1b21498dd3e3461a..2cf1173203dd5e8a5fa8d53cf07cfaa9a228501b 100644 (file)
@@ -1709,7 +1709,9 @@ public:
   void trim_peers();
 
   std::string get_corrupt_pg_log_name() const;
-  void read_state(ObjectStore *store);
+  void read_state(ObjectStore *store, bufferlist &bl);
+  static epoch_t peek_map_epoch(ObjectStore *store,
+                               coll_t coll, bufferlist *bl);
   coll_t make_snap_collection(ObjectStore::Transaction& t, snapid_t sn);
   void update_snap_collections(vector<pg_log_entry_t> &log_entries,
                               ObjectStore::Transaction& t);