git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Interim working version with backfill reserve state changes
author David Zafman <david.zafman@inktank.com>
Fri, 8 Nov 2013 03:45:46 +0000 (19:45 -0800)
committer David Zafman <david.zafman@inktank.com>
Thu, 9 Jan 2014 00:33:58 +0000 (16:33 -0800)
Signed-off-by: David Zafman <david.zafman@inktank.com>
src/osd/PG.cc
src/osd/PG.h
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h

index 66392037270112a18202695e4ba2e10985a3a334..35806a00b9f334f6a12e3ff679dd885bcd7f0228 100644 (file)
@@ -1029,9 +1029,9 @@ bool PG::choose_acting(int& newest_update_osd)
       compat_mode = false;
   }
 
-  // For now we only backfill 1 at a time as before
-  if (!backfill.empty())
-    backfill.resize(1);
+  if (compat_mode && !backfill.empty()) {
+      backfill.resize(1);
+  }
 
   // This might cause a problem if min_size is large
   // and we need to backfill more than 1 osd.  Older
@@ -4747,8 +4747,8 @@ ostream& operator<<(ostream& out, const PG& pg)
     }
   }
 
-  if (pg.get_backfill_target() >= 0)
-    out << " bft=" << pg.get_backfill_target();
+  if (!pg.backfill_targets.empty())
+    out << " bft=" << pg.backfill_targets;
 
   if (pg.last_complete_ondisk != pg.info.last_complete)
     out << " lcod " << pg.last_complete_ondisk;
@@ -5454,6 +5454,7 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
   PG *pg = context< RecoveryMachine >().pg;
   pg->backfill_reserved = true;
   pg->osd->queue_for_recovery(pg);
+  pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
   pg->state_clear(PG_STATE_BACKFILL_WAIT);
   pg->state_set(PG_STATE_BACKFILL);
 }
@@ -5486,27 +5487,43 @@ void PG::RecoveryState::Backfilling::exit()
 
 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
   : my_base(ctx),
-    NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteBackfillReserved")
+    NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteBackfillReserved"),
+    backfill_osd_it(context< Active >().sorted_backfill_set.begin())
 {
   context< RecoveryMachine >().log_enter(state_name);
   PG *pg = context< RecoveryMachine >().pg;
   pg->state_set(PG_STATE_BACKFILL_WAIT);
-  ConnectionRef con = pg->osd->get_con_osd_cluster(
-    pg->get_backfill_target(), pg->get_osdmap()->get_epoch());
-  if (con) {
-    if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
-      unsigned priority = pg->is_degraded() ? OSDService::BACKFILL_HIGH
+  post_event(RemoteBackfillReserved());
+}
+
+boost::statechart::result
+PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+
+  if (backfill_osd_it != context< Active >().sorted_backfill_set.end()) {
+    //The primary never backfills itself
+    assert(*backfill_osd_it != pg->osd->whoami);
+    ConnectionRef con = pg->osd->get_con_osd_cluster(*backfill_osd_it, pg->get_osdmap()->get_epoch());
+    if (con) {
+      if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
+        unsigned priority = pg->is_degraded() ? OSDService::BACKFILL_HIGH
          : OSDService::BACKFILL_LOW;
-      pg->osd->send_message_osd_cluster(
-        new MBackfillReserve(
+        pg->osd->send_message_osd_cluster(
+          new MBackfillReserve(
          MBackfillReserve::REQUEST,
          pg->info.pgid,
          pg->get_osdmap()->get_epoch(), priority),
        con.get());
-    } else {
-      post_event(RemoteBackfillReserved());
+      } else {
+        post_event(RemoteBackfillReserved());
+      }
     }
+    ++backfill_osd_it;
+  } else {
+    post_event(AllBackfillsReserved());
   }
+  return discard_event();
 }
 
 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
@@ -5517,14 +5534,6 @@ void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
   pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
 }
 
-boost::statechart::result
-PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
-{
-  PG *pg = context< RecoveryMachine >().pg;
-  pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
-  return transit<Backfilling>();
-}
-
 boost::statechart::result
 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
 {
@@ -5817,7 +5826,6 @@ void PG::RecoveryState::Recovering::release_reservations()
 {
   PG *pg = context< RecoveryMachine >().pg;
   assert(!pg->pg_log.get_missing().have_missing());
-  pg->state_clear(PG_STATE_RECOVERING);
 
   // release remote reservations
   for (set<int>::const_iterator i = context< Active >().sorted_acting_set.begin();
@@ -5933,6 +5941,8 @@ PG::RecoveryState::Active::Active(my_context ctx)
     NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active"),
     sorted_acting_set(context< RecoveryMachine >().pg->actingbackfill.begin(),
                       context< RecoveryMachine >().pg->actingbackfill.end()),
+    sorted_backfill_set(context< RecoveryMachine >().pg->backfill_targets.begin(),
+                      context< RecoveryMachine >().pg->backfill_targets.end()),
     all_replicas_activated(false)
 {
   context< RecoveryMachine >().log_enter(state_name);
index 4047dd4b3fcd9e3fb9e0b35ce2e903886de0a158..ae827dd430e09bcb4a302023a53679e1da91d3be 100644 (file)
@@ -507,20 +507,13 @@ protected:
   
   BackfillInterval backfill_info;
   BackfillInterval peer_backfill_info;
-  vector<int> backfill_targets;
   bool backfill_reserved;
   bool backfill_reserving;
 
   friend class OSD;
 
 public:
-  // Compatibility with single backfill target code
-  int get_backfill_target() const {
-    int backfill_target = -1;
-    if (backfill_targets.size() > 0)
-      backfill_target = backfill_targets[0];
-    return backfill_target;
-  }
+  vector<int> backfill_targets;
 
 protected:
 
@@ -1083,8 +1076,8 @@ public:
   TrivialEvent(LocalRecoveryReserved)
   TrivialEvent(RemoteRecoveryReserved)
   TrivialEvent(AllRemotesReserved)
+  TrivialEvent(AllBackfillsReserved)
   TrivialEvent(Recovering)
-  TrivialEvent(WaitRemoteBackfillReserved)
   TrivialEvent(GoClean)
 
   TrivialEvent(AllReplicasActivated)
@@ -1310,6 +1303,7 @@ public:
       void exit();
 
       const set<int> sorted_acting_set;
+      const set<int> sorted_backfill_set;
       bool all_replicas_activated;
 
       typedef boost::mpl::list <
@@ -1368,8 +1362,10 @@ public:
     struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
       typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
-       boost::statechart::custom_reaction< RemoteReservationRejected >
+       boost::statechart::custom_reaction< RemoteReservationRejected >,
+       boost::statechart::transition< AllBackfillsReserved, Backfilling >
        > reactions;
+      set<int>::const_iterator backfill_osd_it;
       WaitRemoteBackfillReserved(my_context ctx);
       void exit();
       boost::statechart::result react(const RemoteBackfillReserved& evt);
index ffa29a24c04ad9aa17f4b6228a83acb0fd77069e..64d655336d4d42cad29b7cc3f52e2e5cb088fd98 100644 (file)
@@ -274,7 +274,8 @@ void ReplicatedPG::on_peer_recover(
   publish_stats_to_osd();
   // done!
   peer_missing[peer].got(soid, recovery_info.version);
-  if (peer == get_backfill_target() && backfills_in_flight.count(soid)) {
+  if (!backfill_targets.empty() && peer == backfill_targets[0]
+    && backfills_in_flight.count(soid)) {
     map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
     assert(i != recovering.end());
     list<OpRequestRef> requeue_list;
@@ -364,7 +365,7 @@ bool ReplicatedPG::is_degraded_object(const hobject_t& soid)
 
     // Object is degraded if after last_backfill AND
     // we are backfilling it
-    if (peer == get_backfill_target() &&
+    if (!backfill_targets.empty() && peer == backfill_targets[0] &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
@@ -1168,10 +1169,9 @@ void ReplicatedPG::do_op(OpRequestRef op)
   // The last_backfill_started is used as the backfill line since
   // that determines the boundary for writes.
   pg_info_t *backfill_target_info = NULL;
-  int backfill_target = get_backfill_target();
   bool before_backfill = false;
-  if (backfill_target >= 0) {
-    backfill_target_info = &peer_info[backfill_target];
+  if (!backfill_targets.empty()) {
+    backfill_target_info = &peer_info[backfill_targets[0]];
     before_backfill = obc->obs.oi.soid <= last_backfill_started;
   }
 
@@ -1842,7 +1842,8 @@ void ReplicatedPG::do_scan(
   case MOSDPGScan::OP_SCAN_DIGEST:
     {
       int from = m->get_source().num();
-      assert(from == get_backfill_target());
+      //XXX: Check that from is in backfill_targets vector
+      //assert(from == get_backfill_target());
       BackfillInterval& bi = peer_backfill_info;
       bi.begin = m->begin;
       bi.end = m->end;
@@ -4599,9 +4600,8 @@ void ReplicatedPG::finish_ctx(OpContext *ctx)
   ctx->obc->ssc->snapset = ctx->new_snapset;
   info.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category);
 
-  int backfill_target = get_backfill_target();
-  if (backfill_target >= 0) {
-    pg_info_t& pinfo = peer_info[backfill_target];
+  if (!backfill_targets.empty()) {
+    pg_info_t& pinfo = peer_info[backfill_targets[0]];
     if (soid <= pinfo.last_backfill)
       pinfo.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category);
     else if (soid <= last_backfill_started)
@@ -5635,9 +5635,8 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
       assert(0 == "broken implementation, do not use");
     }
 
-    int backfill_target = get_backfill_target();
     // ship resulting transaction, log entries, and pg_stats
-    if (peer == backfill_target && soid > last_backfill_started &&
+    if (!backfill_targets.empty() && peer == backfill_targets[0] && soid > last_backfill_started &&
         // only skip normal (not temp pool=-1) objects
        soid.pool == (int64_t)info.pgid.pool()) {
       dout(10) << "issue_repop shipping empty opt to osd." << peer
@@ -5652,7 +5651,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
 
     ::encode(repop->ctx->log, wr->logbl);
 
-    if (backfill_target >= 0 && backfill_target == peer)
+    if (!backfill_targets.empty() && backfill_targets[0] == peer)
       wr->pg_stats = pinfo.stats;  // reflects backfill progress
     else
       wr->pg_stats = info.stats;
@@ -7821,7 +7820,7 @@ eversion_t ReplicatedPG::pick_newest_available(const hobject_t& oid)
   for (unsigned i=1; i<actingbackfill.size(); ++i) {
     int peer = actingbackfill[i];
     if (!peer_missing[peer].is_missing(oid)) {
-      assert(peer == get_backfill_target());
+      assert(!backfill_targets.empty() && peer == backfill_targets[0]);
       continue;
     }
     eversion_t h = peer_missing[peer].missing[oid].have;
@@ -8108,12 +8107,11 @@ void ReplicatedPG::on_shutdown()
 // For now only care about a single backfill at a time
 void ReplicatedPG::on_activate()
 {
-  int backfill_target = get_backfill_target();
-  if (backfill_target != -1) {
-    last_backfill_started = peer_info[backfill_target].last_backfill;
+  if (!backfill_targets.empty()) {
+    last_backfill_started = peer_info[backfill_targets[0]].last_backfill;
     assert(!last_backfill_started.is_max());
-    dout(10) << " chose backfill target osd." << backfill_target
-            << " from " << last_backfill_started << dendl;
+    dout(10) << " chose backfill target osd." << backfill_targets[0]
+          << " from " << last_backfill_started << dendl;
   }
 
   hit_set_setup();
@@ -8340,10 +8338,9 @@ bool ReplicatedPG::start_recovery_ops(
     work_in_progress = true;
 
   bool deferred_backfill = false;
-  int backfill_target = get_backfill_target();
   if (recovering.empty() &&
       state_test(PG_STATE_BACKFILL) &&
-      backfill_target >= 0 && started < max &&
+      !backfill_targets.empty() && started < max &&
       missing.num_missing() == 0 &&
       !waiting_on_backfill) {
     if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
@@ -8757,10 +8754,10 @@ int ReplicatedPG::recover_backfill(
   ThreadPool::TPHandle &handle, bool *work_started)
 {
   dout(10) << "recover_backfill (" << max << ")" << dendl;
-  int backfill_target = get_backfill_target();
-  assert(backfill_target >= 0);
+  assert(!backfill_targets.empty());
 
-  pg_info_t& pinfo = peer_info[backfill_target];
+  //XXX: Look through backfill_targets
+  pg_info_t& pinfo = peer_info[backfill_targets[0]];
   BackfillInterval& pbi = peer_backfill_info;
 
   // Initialize from prior backfill state
@@ -8769,7 +8766,7 @@ int ReplicatedPG::recover_backfill(
     backfill_info.reset(pinfo.last_backfill);
   }
 
-  dout(10) << " peer osd." << backfill_target
+  dout(10) << " peer osd." << backfill_targets[0]
           << " last_backfill_started " << last_backfill_started
           << " info " << pinfo
           << " interval " << pbi.begin << "-" << pbi.end
@@ -8807,11 +8804,11 @@ int ReplicatedPG::recover_backfill(
 
     if (pbi.begin <= backfill_info.begin &&
        !pbi.extends_to_end() && pbi.empty()) {
-      dout(10) << " scanning peer osd." << backfill_target << " from " << pbi.end << dendl;
+      dout(10) << " scanning peer osd." << backfill_targets[0] << " from " << pbi.end << dendl;
       epoch_t e = get_osdmap()->get_epoch();
       MOSDPGScan *m = new MOSDPGScan(MOSDPGScan::OP_SCAN_GET_DIGEST, e, e, info.pgid,
                                     pbi.end, hobject_t());
-      osd->send_message_osd_cluster(backfill_target, m, get_osdmap()->get_epoch());
+      osd->send_message_osd_cluster(backfill_targets[0], m, get_osdmap()->get_epoch());
       waiting_on_backfill = true;
       start_recovery_op(pbi.end);
       ops++;
@@ -8874,7 +8871,7 @@ int ReplicatedPG::recover_backfill(
       if (obc->get_backfill_read()) {
        dout(20) << " pushing local " << backfill_info.begin << " "
                 << backfill_info.objects.begin()->second
-                << " to peer osd." << backfill_target << dendl;
+                << " to peer osd." << backfill_targets[0] << dendl;
        to_push[backfill_info.begin] =
          boost::make_tuple(
            backfill_info.objects.begin()->second,
@@ -8908,7 +8905,7 @@ int ReplicatedPG::recover_backfill(
     handle.reset_tp_timeout();
 
     // ordered before any subsequent updates
-    send_remove_op(i->first, i->second, backfill_target);
+    send_remove_op(i->first, i->second, backfill_targets[0]);
 
     pending_backfill_updates[i->first]; // add empty stat!
   }
@@ -8922,7 +8919,7 @@ int ReplicatedPG::recover_backfill(
     handle.reset_tp_timeout();
     prep_backfill_object_push(
       i->first, i->second.get<0>(), i->second.get<1>(), i->second.get<2>(),
-      backfill_target, h);
+      backfill_targets[0], h);
   }
   pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
 
@@ -8982,7 +8979,7 @@ int ReplicatedPG::recover_backfill(
     }
     m->last_backfill = pinfo.last_backfill;
     m->stats = pinfo.stats;
-    osd->send_message_osd_cluster(backfill_target, m, get_osdmap()->get_epoch());
+    osd->send_message_osd_cluster(backfill_targets[0], m, get_osdmap()->get_epoch());
   }
 
   dout(10) << " peer num_objects now " << pinfo.stats.stats.sum.num_objects
@@ -8999,9 +8996,10 @@ void ReplicatedPG::prep_backfill_object_push(
   PGBackend::RecoveryHandle *h)
 {
   dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." << peer << dendl;
+  assert(!backfill_targets.empty());
 
   backfills_in_flight.insert(oid);
-  map<int, pg_missing_t>::iterator bpm = peer_missing.find(get_backfill_target());
+  map<int, pg_missing_t>::iterator bpm = peer_missing.find(backfill_targets[0]);
   assert(bpm != peer_missing.end());
   bpm->second.add(oid, eversion_t(), eversion_t());
 
index d42031b3af00ab8bdd7ecd837c2e5d043bb8cb1b..a9631128aec9f731df5410c9b0ae008a55731442 100644 (file)
@@ -697,7 +697,11 @@ protected:
   map<hobject_t, pg_stat_t> pending_backfill_updates;
 
   void dump_recovery_info(Formatter *f) const {
-    f->dump_int("backfill_target", get_backfill_target());
+    f->open_array_section("backfill_targets");
+    for (vector<int>::const_iterator p = backfill_targets.begin();
+        p != backfill_targets.end(); ++p)
+      f->dump_int("osd", *p);
+    f->close_section();
     f->dump_int("waiting_on_backfill", waiting_on_backfill);
     f->dump_stream("last_backfill_started") << last_backfill_started;
     {