git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
ReplicatedPG: enforce write ordering on rollback
author Samuel Just <sjust@redhat.com>
Mon, 27 Jul 2015 20:12:25 +0000 (13:12 -0700)
committer Samuel Just <sjust@redhat.com>
Tue, 18 Aug 2015 18:25:24 +0000 (11:25 -0700)
Previously, rollback ops could reorder w.r.t. other writes due to waiting
on degraded snaps other than head.  To fix that, we'll introduce a new
map tracking objects blocked on degraded snaps.  A particular object can
only be blocked on one snap at a time (subsequent writes won't get far
enough to add another entry).

It might have been possible to use the blocked_by machinery for this, but
it requires that the object have an extant obc, which we may not
have for a missing object.  Also, that machinery exists primarily to
support clone_range, which I hope to remove soon.

Signed-off-by: Samuel Just <sjust@redhat.com>
src/osd/PG.h
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h

index 444a8c1a3330b4ce4d28a96df5a9e86275b75e95..7c658e95ea81bbc697c9f11bbfa492d95b3904d6 100644 (file)
@@ -747,6 +747,7 @@ protected:
   map<hobject_t, list<OpRequestRef> > waiting_for_unreadable_object,
                             waiting_for_degraded_object,
                             waiting_for_blocked_object;
+  map<hobject_t, snapid_t> objects_blocked_on_degraded_snap;
   // Callbacks should assume pg (and nothing else) is locked
   map<hobject_t, list<Context*> > callbacks_for_degraded_object;
 
index 7d68fe069e5e2c4fd062b96855cf22f2c3a004be..9f02f7efde96be86c41289863e7f9901e6269b02 100644 (file)
@@ -371,22 +371,20 @@ bool ReplicatedPG::is_missing_object(const hobject_t& soid) const
   return pg_log.get_missing().missing.count(soid);
 }
 
-void ReplicatedPG::wait_for_unreadable_object(
-  const hobject_t& soid, OpRequestRef op)
+void ReplicatedPG::maybe_kick_recovery(
+  const hobject_t &soid)
 {
-  assert(is_unreadable_object(soid));
-
   eversion_t v;
-  bool needs_recovery = missing_loc.needs_recovery(soid, &v);
-  assert(needs_recovery);
+  if (!missing_loc.needs_recovery(soid, &v))
+    return;
 
   map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
   if (p != recovering.end()) {
-    dout(7) << "missing " << soid << " v " << v << ", already recovering." << dendl;
+    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
   } else if (missing_loc.is_unfound(soid)) {
-    dout(7) << "missing " << soid << " v " << v << ", is unfound." << dendl;
+    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
   } else {
-    dout(7) << "missing " << soid << " v " << v << ", recovering." << dendl;
+    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
     PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
     if (is_missing_object(soid)) {
       recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
@@ -395,6 +393,14 @@ void ReplicatedPG::wait_for_unreadable_object(
     }
     pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
   }
+}
+
+void ReplicatedPG::wait_for_unreadable_object(
+  const hobject_t& soid, OpRequestRef op)
+{
+  assert(is_unreadable_object(soid));
+
+  maybe_kick_recovery(soid);
   waiting_for_unreadable_object[soid].push_back(op);
   op->mark_delayed("waiting for missing object");
 }
@@ -434,43 +440,22 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
 {
   assert(is_degraded_or_backfilling_object(soid));
 
-  // we don't have it (yet).
-  if (recovering.count(soid)) {
-    dout(7) << "degraded "
-           << soid 
-           << ", already recovering"
-           << dendl;
-  } else if (missing_loc.is_unfound(soid)) {
-    dout(7) << "degraded "
-           << soid
-           << ", still unfound, waiting"
-           << dendl;
-  } else {
-    dout(7) << "degraded " 
-           << soid 
-           << ", recovering"
-           << dendl;
-    eversion_t v;
-    assert(!actingbackfill.empty());
-    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-        i != actingbackfill.end();
-        ++i) {
-      if (*i == get_primary()) continue;
-      pg_shard_t peer = *i;
-      if (peer_missing.count(peer) &&
-         peer_missing[peer].missing.count(soid)) {
-       v = peer_missing[peer].missing[soid].need;
-       break;
-      }
-    }
-    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
-    prep_object_replica_pushes(soid, v, h);
-    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
-  }
+  maybe_kick_recovery(soid);
   waiting_for_degraded_object[soid].push_back(op);
   op->mark_delayed("waiting for degraded object");
 }
 
+void ReplicatedPG::block_write_on_degraded_snap(
+  const hobject_t& snap, OpRequestRef op)
+{
+  dout(20) << __func__ << ": blocking object " << snap.get_head()
+          << " on degraded snap " << snap << dendl;
+  // otherwise, we'd have blocked in do_op
+  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
+  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
+  wait_for_degraded_object(snap, op);
+}
+
 bool ReplicatedPG::maybe_await_blocked_snapset(
   const hobject_t &hoid,
   OpRequestRef op)
@@ -1428,6 +1413,16 @@ void ReplicatedPG::do_op(OpRequestRef& op)
     return;
   }
 
+  // blocked on snap?
+  map<hobject_t, snapid_t>::iterator blocked_iter =
+    objects_blocked_on_degraded_snap.find(head);
+  if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
+    hobject_t to_wait_on(head);
+    to_wait_on.snap = blocked_iter->second;
+    wait_for_degraded_object(to_wait_on, op);
+    return;
+  }
+
   // missing snapdir?
   hobject_t snapdir(m->get_oid(), m->get_object_locator().key,
                    CEPH_SNAPDIR, m->get_pg().ps(), info.pgid.pool(),
@@ -5493,7 +5488,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
     assert(is_missing_object(missing_oid));
     dout(20) << "_rollback_to attempted to roll back to a missing object "
             << missing_oid << " (requested snapid: ) " << snapid << dendl;
-    wait_for_unreadable_object(missing_oid, ctx->op);
+    block_write_on_degraded_snap(missing_oid, ctx->op);
     return ret;
   }
   if (maybe_handle_cache(ctx->op, true, rollback_to, ret, missing_oid, true)) {
@@ -5520,7 +5515,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
     if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
       dout(20) << "_rollback_to attempted to roll back to a degraded object "
               << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
-      wait_for_degraded_object(rollback_to_sobject, ctx->op);
+      block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
       ret = -EAGAIN;
     } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
       // rolling back to the head; we just need to clone it.
@@ -8635,6 +8630,11 @@ void ReplicatedPG::finish_degraded_object(const hobject_t& oid)
       (*i)->complete(0);
     }
   }
+  map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
+    oid.get_head());
+  if (i != objects_blocked_on_degraded_snap.end() &&
+      i->second == oid.snap)
+    objects_blocked_on_degraded_snap.erase(i);
 }
 
 void ReplicatedPG::_committed_pushed_object(
@@ -9232,6 +9232,9 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
   // NOTE: we actually assert that all currently live references are dead
   // by the time the flush for the next interval completes.
   object_contexts.clear();
+
+  // should have been cleared above by finishing all of the degraded objects
+  assert(objects_blocked_on_degraded_snap.empty());
 }
 
 void ReplicatedPG::on_role_change()
index 81f5130fa86d2d5c0b7da9e85596e871c140c137..0f4924abfd4783e1c1eddd14050fed2e712d364f 100644 (file)
@@ -1577,11 +1577,13 @@ public:
     return is_missing_object(oid) ||
       !missing_loc.readable_with_acting(oid, actingset);
   }
+  void maybe_kick_recovery(const hobject_t &soid);
   void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op);
   void wait_for_all_missing(OpRequestRef op);
 
   bool is_degraded_or_backfilling_object(const hobject_t& oid);
   void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op);
+  void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op);
 
   bool maybe_await_blocked_snapset(const hobject_t &soid, OpRequestRef op);
   void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);