git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: replace version attr with oi (object_info_t) (disk format change)
author Sage Weil <sage@newdream.net>
Tue, 27 Jan 2009 19:20:37 +0000 (11:20 -0800)
committer Sage Weil <sage@newdream.net>
Thu, 29 Jan 2009 18:17:45 +0000 (10:17 -0800)
The new object_info_t includes the last reqid and the mtime.  This makes
backlog log entries include a real reqid, which makes the client_reqs
IndexedLog map meaningful.  Otherwise, as soon as we 'unindex' the first
backlog item, the remaining ops with reqid unknown0.0:0 are no longer in the map.

This revs the osd ondisk format.

src/osd/OSD.cc
src/osd/PG.cc
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h
src/osd/osd_types.h

index 0cf20e0bac3df88575351923e6d1e6e081bf1ee5..bed71efc9facb4ad6c9ca0b8c6901659341f64cc 100644 (file)
@@ -2492,14 +2492,14 @@ void OSD::split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction
       PG *child = children[pgid];
       assert(child);
       bufferlist bv;
-      store->getattr(parentid.to_coll(), poid, "version", bv);
-      eversion_t v(bv);
+      store->getattr(parentid.to_coll(), poid, "oi", bv);
+      object_info_t oi(bv);
 
-      if (v > child->info.last_update) {
-       child->info.last_update = v;
-       dout(25) << "        tagging pg with v " << v << "  > " << child->info.last_update << dendl;
+      if (oi.version > child->info.last_update) {
+       child->info.last_update = oi.version;
+       dout(25) << "        tagging pg with v " << oi.version << "  > " << child->info.last_update << dendl;
       } else {
-       dout(25) << "    not tagging pg with v " << v << " <= " << child->info.last_update << dendl;
+       dout(25) << "    not tagging pg with v " << oi.version << " <= " << child->info.last_update << dendl;
       }
       t.collection_add(pgid.to_coll(), parentid.to_coll(), poid);
       t.collection_remove(parentid.to_coll(), poid);
index e0a5238c49f62659d0027cd766c5604cd2db26ab..9352b1cce0759b25a692792fc8fe7f742e137fde 100644 (file)
@@ -541,14 +541,14 @@ bool PG::build_backlog_map(map<eversion_t,Log::Entry>& omap)
     Log::Entry e;
     e.oid = it->oid;
     bufferlist bv;
-    osd->store->getattr(info.pgid.to_coll(), poid, "version", bv);
-    e.version.decode(bv);
+    osd->store->getattr(info.pgid.to_coll(), poid, "oi", bv);
+    object_info_t oi(bv);
+    e.version = oi.version;
+    e.prior_version = oi.prior_version;
+    e.reqid = oi.last_reqid;
     if (poid.oid.snap && poid.oid.snap < CEPH_NOSNAP) {
       e.op = Log::Entry::CLONE;
       osd->store->getattr(info.pgid.to_coll(), poid, "snaps", e.snaps);
-      bufferlist bfv;
-      osd->store->getattr(info.pgid.to_coll(), poid, "from_version", bfv);
-      e.prior_version.decode(bfv);
     } else {
       e.op = Log::Entry::BACKLOG;           // FIXME if/when we do smarter op codes!
     }
@@ -1786,13 +1786,13 @@ void PG::read_log(ObjectStore *store)
       
       pobject_t poid(info.pgid.pool(), 0, i->oid);
       bufferlist bv;
-      int r = osd->store->getattr(info.pgid.to_coll(), poid, "version", bv);
-      eversion_t v;
+      int r = osd->store->getattr(info.pgid.to_coll(), poid, "oi", bv);
+      object_info_t oi;
       if (r >= 0)
-       v = eversion_t(bv);
-      if (r < 0 || v < i->version) {
+       oi.decode(bv);
+      if (r < 0 || oi.version < i->version) {
        dout(15) << "read_log  missing " << *i << dendl;
-       missing.add(i->oid, i->version, v);
+       missing.add(i->oid, i->version, oi.version);
       }
     }
   }
@@ -1993,11 +1993,13 @@ void PG::build_scrub_map(ScrubMap &map)
 void PG::repair_object(ScrubMap::object *po, int bad_peer, int ok_peer)
 {
   eversion_t v;
-  po->attrs["version"].copy_out(0, sizeof(v), (char *)&v);
+  bufferlist bv;
+  bv.push_back(po->attrs["oi"]);
+  object_info_t oi(bv);
   if (bad_peer != acting[0]) {
-    peer_missing[bad_peer].add(po->poid.oid, v, eversion_t());
+    peer_missing[bad_peer].add(po->poid.oid, oi.version, eversion_t());
   } else {
-    missing.add(po->poid.oid, v, eversion_t());
+    missing.add(po->poid.oid, oi.version, eversion_t());
     missing_loc[po->poid.oid].insert(ok_peer);
     log.last_requested = object_t();
   }
index 044034e9c91c6c4d633bef76d8d01d12593f3d74..7da7b95f41718cbcd362e311ac9f1f9893956733 100644 (file)
@@ -825,16 +825,19 @@ void ReplicatedPG::op_read(MOSDOp *op)
 
 void ReplicatedPG::_make_clone(ObjectStore::Transaction& t,
                               pobject_t head, pobject_t coid,
-                              eversion_t ov, eversion_t v, bufferlist& snapsbl)
+                              eversion_t ov, eversion_t v, osd_reqid_t& reqid, bufferlist& snapsbl)
 {
   t.clone(info.pgid.to_coll(), head, coid);
   t.setattr(info.pgid.to_coll(), coid, "snaps", snapsbl);
-  bufferlist bv(sizeof(v));
-  ::encode(v, bv);
-  t.setattr(info.pgid.to_coll(), coid, "version", bv);
-  bufferlist bov(sizeof(v));
-  ::encode(ov, bov);
-  t.setattr(info.pgid.to_coll(), coid, "from_version", bov);
+
+  object_info_t pi;
+  pi.version = v;
+  pi.prior_version = ov;
+  pi.last_reqid = reqid;
+  pi.mtime = g_clock.now();
+  bufferlist bv(sizeof(pi));
+  ::encode(pi, bv);
+  t.setattr(info.pgid.to_coll(), coid, "oi", bv);
 }
 
 void ReplicatedPG::prepare_clone(ObjectStore::Transaction& t, bufferlist& logbl, osd_reqid_t reqid, pg_stat_t& stats,
@@ -871,7 +874,7 @@ void ReplicatedPG::prepare_clone(ObjectStore::Transaction& t, bufferlist& logbl,
     ::encode(snaps, snapsbl);
     
     // prepare clone
-    _make_clone(t, poid, coid, old_version, at_version, snapsbl);
+    _make_clone(t, poid, coid, old_version, at_version, reqid, snapsbl);
     
     // add to snap bound collections
     coll_t fc = make_snap_collection(t, snaps[0]);
@@ -1150,7 +1153,8 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req
 void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t reqid,
                                       pobject_t poid,
                                       vector<ceph_osd_op>& ops, bufferlist& bl,
-                                      bool& exists, __u64& size, eversion_t& version,
+                                      bool& exists, __u64& size,
+                                      object_info_t& oi,
                                       eversion_t at_version,
                                       SnapSet& snapset, SnapContext& snapc,
                                       __u32 inc_lock, eversion_t trim_to)
@@ -1159,7 +1163,7 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t
   eversion_t log_version = at_version;
   assert(!ops.empty());
   
-  eversion_t old_version = version;
+  eversion_t old_version = oi.version;
 
   // apply ops
   bool did_snap = false;
@@ -1178,7 +1182,7 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t
   }
 
   // finish.
-  version = at_version;
+  oi.version = at_version;
   if (exists) {
     if (inc_lock) {
       bufferlist b(sizeof(inc_lock));
@@ -1186,9 +1190,13 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t
       t.setattr(info.pgid.to_coll(), poid, "inc_lock", b);
     }
 
-    bufferlist bv(sizeof(at_version));
-    ::encode(at_version, bv);
-    t.setattr(info.pgid.to_coll(), poid, "version", bv);
+    oi.version = at_version;
+    oi.prior_version = old_version;
+    oi.last_reqid = reqid;
+    oi.mtime = g_clock.now();
+    bufferlist bv(sizeof(oi));
+    ::encode(oi, bv);
+    t.setattr(info.pgid.to_coll(), poid, "oi", bv);
 
     bufferlist snapsetbl;
     ::encode(snapset, snapsetbl);
@@ -1399,7 +1407,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, int dest, utime_t now)
                                repop->rep_tid, repop->op->get_inc_lock(), repop->at_version);
   wr->old_exists = repop->pinfo->exists;
   wr->old_size = repop->pinfo->size;
-  wr->old_version = repop->pinfo->version;
+  wr->old_version = repop->pinfo->oi.version;
   wr->snapset = repop->pinfo->snapset;
   wr->snapc = repop->snapc;
   wr->get_data() = repop->op->get_data();   // _copy_ bufferlist
@@ -1502,9 +1510,9 @@ ReplicatedPG::ProjectedObjectInfo *ReplicatedPG::get_projected_object(pobject_t
     pinfo->size = st.st_size;
     
     bufferlist bv;
-    r = osd->store->getattr(info.pgid.to_coll(), poid, "version", bv);
+    r = osd->store->getattr(info.pgid.to_coll(), poid, "oi", bv);
     assert(r >= 0);
-    pinfo->version.decode(bv);
+    pinfo->oi.decode(bv);
     
     if (poid.oid.snap == CEPH_NOSNAP) {
       bufferlist bl;
@@ -1636,7 +1644,7 @@ void ReplicatedPG::op_modify(MOSDOp *op)
 
   dout(10) << "op_modify " << opname 
            << " " << poid.oid 
-           << " ov " << pinfo->version << " av " << at_version 
+           << " ov " << pinfo->oi.version << " av " << at_version 
           << " snapc " << snapc
           << " snapset " << pinfo->snapset
            << dendl;  
@@ -1686,7 +1694,7 @@ void ReplicatedPG::op_modify(MOSDOp *op)
   if (!noop) {
     // log and update later.
     prepare_transaction(repop->t, op->get_reqid(), poid, op->ops, op->get_data(),
-                       pinfo->exists, pinfo->size, pinfo->version, at_version,
+                       pinfo->exists, pinfo->size, pinfo->oi, at_version,
                        pinfo->snapset, snapc,
                        op->get_inc_lock(), trim_to);
   }
@@ -1799,9 +1807,11 @@ void ReplicatedPG::sub_op_modify(MOSDSubOp *op)
   osd->logger->inc("r_wrb", op->get_data().length());
   
   if (!op->noop) {
+    object_info_t oi;
+    oi.version = op->old_version;
     prepare_transaction(t, op->reqid,
                        op->poid, op->ops, op->get_data(),
-                       op->old_exists, op->old_size, op->old_version, op->version,
+                       op->old_exists, op->old_size, oi, op->version,
                        op->snapset, op->snapc,
                        op->inc_lock, op->pg_trim_to);
   }
@@ -2079,19 +2089,17 @@ void ReplicatedPG::push_to_replica(pobject_t poid, int peer)
 
   // are we doing a clone on the replica?
   if (poid.oid.snap && poid.oid.snap < CEPH_NOSNAP) {  
-    bufferlist bv, bfv;
-    int r = osd->store->getattr(info.pgid.to_coll(), poid, "version", bv);
-    assert(r >= 0);
-    r = osd->store->getattr(info.pgid.to_coll(), poid, "from_version", bfv);
+    bufferlist bv;
+    int r = osd->store->getattr(info.pgid.to_coll(), poid, "oi", bv);
     assert(r >= 0);
-    eversion_t version(bv), from_version(bfv);
+    object_info_t oi(bv);
     
     pobject_t head = poid;
     head.oid.snap = CEPH_NOSNAP;
     if (peer_missing[peer].is_missing(head.oid) &&
-       peer_missing[peer].have_old(head.oid) == from_version) {
+       peer_missing[peer].have_old(head.oid) == oi.prior_version) {
       dout(10) << "push_to_replica osd" << peer << " has correct old " << head
-              << " v" << from_version 
+              << " v" << oi.prior_version 
               << ", pushing " << poid << " attrs as a clone op" << dendl;
       interval_set<__u64> data_subset;
       map<pobject_t, interval_set<__u64> > clone_subsets;
@@ -2173,12 +2181,12 @@ void ReplicatedPG::push(pobject_t poid, int peer,
     size = bl.length();
   }
   bufferlist bv;
-  osd->store->getattr(info.pgid.to_coll(), poid, "version", bv);
-  eversion_t v(bv);
+  osd->store->getattr(info.pgid.to_coll(), poid, "oi", bv);
+  object_info_t oi(bv);
   osd->store->getattrs(info.pgid.to_coll(), poid, attrset);
 
   // ok
-  dout(7) << "push " << poid << " v " << v 
+  dout(7) << "push " << poid << " v " << oi.version 
          << " size " << size
          << " subset " << data_subset
           << " data " << bl.length()
@@ -2195,7 +2203,7 @@ void ReplicatedPG::push(pobject_t poid, int peer,
   push[0].offset = 0;
   push[0].length = size;
   MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, poid, push, false, 0,
-                                  osd->osdmap->get_epoch(), osd->get_tid(), 0, v);
+                                  osd->osdmap->get_epoch(), osd->get_tid(), 0, oi.version);
   subop->data_subset.swap(data_subset);
   subop->clone_subsets.swap(clone_subsets);
   subop->set_data(bl);   // note: claims bl, set length above here!
@@ -2203,7 +2211,7 @@ void ReplicatedPG::push(pobject_t poid, int peer,
   osd->messenger->send_message(subop, osd->osdmap->get_inst(peer));
   
   if (is_primary()) {
-    peer_missing[peer].got(poid.oid, v);
+    peer_missing[peer].got(poid.oid, oi.version);
     pushing[poid.oid].insert(peer);
   }
 }
@@ -2640,7 +2648,7 @@ int ReplicatedPG::recover_primary(int max)
                     << " to " << poid << " v" << latest->version
                     << " snaps " << latest->snaps << dendl;
            ObjectStore::Transaction t;
-           _make_clone(t, head, poid, latest->prior_version, latest->version,
+           _make_clone(t, head, poid, latest->prior_version, latest->version, latest->reqid,
                        latest->snaps);
            osd->store->apply_transaction(t);
            missing.got(latest->oid, latest->version);
@@ -2851,13 +2859,14 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap)
     stat.num_objects++;
 
     // basic checks.
-    eversion_t v;
-    if (p->attrs.count("version") == 0) {
-      dout(0) << "scrub no 'version' attr on " << poid << dendl;
+    if (p->attrs.count("oi") == 0) {
+      dout(0) << "scrub no 'oi' attr on " << poid << dendl;
       errors++;
       continue;
     }
-    p->attrs["version"].copy_out(0, sizeof(v), (char *)&v);
+    bufferlist bv;
+    bv.push_back(p->attrs["oi"]);
+    object_info_t oi(bv);
 
     stat.num_bytes += p->size;
     stat.num_kb += SHIFT_ROUND_UP(p->size, 10);
index f7debb9d4e6b439bb51c00467c7eb8c2bd5eec22..24fd92099a53b8926a16c1415561b5f5bc788b1f 100644 (file)
@@ -37,7 +37,8 @@ public:
     SnapSet snapset;
     bool exists;
     __u64 size;
-    eversion_t version;
+
+    object_info_t oi;
     
     ProjectedObjectInfo() : ref(0), exists(false), size(0) {}
   };
@@ -178,7 +179,7 @@ protected:
 
   void _make_clone(ObjectStore::Transaction& t,
                   pobject_t head, pobject_t coid,
-                  eversion_t ov, eversion_t v, bufferlist& snaps);
+                  eversion_t ov, eversion_t v, osd_reqid_t& reqid, bufferlist& snaps);
   void prepare_clone(ObjectStore::Transaction& t, bufferlist& logbl, osd_reqid_t reqid, pg_stat_t& st,
                     pobject_t poid, loff_t old_size,
                     eversion_t old_version, eversion_t& at_version,
@@ -191,7 +192,7 @@ protected:
   void prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t reqid,
                           pobject_t poid, 
                           vector<ceph_osd_op>& ops, bufferlist& bl,
-                          bool& exists, __u64& size, eversion_t& version,
+                          bool& exists, __u64& size, object_info_t& oi,
                           eversion_t at_version,
                           SnapSet& snapset, SnapContext& snapc,
                           __u32 inc_lock, eversion_t trim_to);
index 586e9a10ed9843693c8ce593c3be46a7eaac98df..ba09ed835ec236753d5d4a58680034ee431e93e0 100644 (file)
@@ -25,7 +25,7 @@
 
 
 
-#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v008"
+#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v009"
 
 
 
@@ -280,6 +280,37 @@ inline ostream& operator<<(ostream& out, const eversion_t e) {
 
 
 
+struct object_info_t {
+  eversion_t version, prior_version;
+  osd_reqid_t last_reqid;
+  utime_t mtime;
+
+  void encode(bufferlist& bl) const {
+    ::encode(version, bl);
+    ::encode(prior_version, bl);
+    ::encode(last_reqid, bl);
+    ::encode(mtime, bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    ::decode(version, bl);
+    ::decode(prior_version, bl);
+    ::decode(last_reqid, bl);
+    ::decode(mtime, bl);
+  }
+  void decode(bufferlist& bl) {
+    bufferlist::iterator p = bl.begin();
+    decode(p);
+  }
+
+  object_info_t() {}
+  object_info_t(bufferlist& bl) {
+    decode(bl);
+  }
+};
+WRITE_CLASS_ENCODER(object_info_t)
+
+
+
 /** osd_stat
  * aggregate stats for an osd
  */