From: Sage Weil <sage@redhat.com>
Date: Sun, 17 Sep 2017 22:29:16 +0000 (-0500)
Subject: osd/PG: move more recovery logic into PG
X-Git-Tag: v13.0.1~634^2~31
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6493f8ec5a1bc610e82edc4a73d0aac2862dfd2d;p=ceph.git

osd/PG: move more recovery logic into PG

I suspect we eventually want to move create_context and
dispatch_context into OSDService (if they aren't there already) and
move even more of this logic into PG.

Signed-off-by: Sage Weil <sage@redhat.com>
---

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 342051a1c6a..de71ab9855f 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -8859,55 +8859,19 @@ void OSD::do_recovery(
     dout(20) << "  active was " << service.recovery_oids[pg->pg_id] << dendl;
 #endif
 
-    bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
+    bool wip = pg->start_recovery_ops(reserved_pushes, handle, &started);
     dout(10) << "do_recovery started " << started << "/" << reserved_pushes 
 	     << " on " << *pg << dendl;
 
     // If no recovery op is started, don't bother to manipulate the RecoveryCtx
-    if (!started && (more || !pg->have_unfound())) {
+    if (!started && (wip || !pg->have_unfound())) {
       goto out;
     }
 
     PG::RecoveryCtx rctx = create_context();
     rctx.handle = &handle;
-
-    /*
-     * if we couldn't start any recovery ops and things are still
-     * unfound, see if we can discover more missing object locations.
-     * It may be that our initial locations were bad and we errored
-     * out while trying to pull.
-     */
-    if (!more && pg->have_unfound()) {
-      pg->discover_all_missing(*rctx.query_map);
-      if (rctx.query_map->empty()) {
-	string action;
-        if (pg->state_test(PG_STATE_BACKFILLING)) {
-	  auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
-	    queued,
-	    queued,
-	    PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
-	  pg->queue_peering_event(evt);
-	  action = "in backfill";
-        } else if (pg->state_test(PG_STATE_RECOVERING)) {
-	  auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
-	    queued,
-	    queued,
-	    PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
-	  pg->queue_peering_event(evt);
-	  action = "in recovery";
-	} else {
-	  action = "already out of recovery/backfill";
-	}
-	dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
-      } else {
-	dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
-	pg->queue_recovery();
-      }
-    }
-
-    pg->write_if_dirty(*rctx.transaction);
-    OSDMapRef curmap = pg->get_osdmap();
-    dispatch_context(rctx, pg, curmap);
+    pg->stuck_on_unfound(queued, wip, &rctx);
+    dispatch_context(rctx, pg, pg->get_osdmap());
   }
 
  out:
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 36058c439b6..05779f5eaf3 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -5722,6 +5722,46 @@ void PG::queue_query(epoch_t msg_epoch,
 					 MQuery(from, q, query_epoch))));
 }
 
+void PG::stuck_on_unfound(epoch_t queued, bool wip, RecoveryCtx *rctx)
+{
+  /*
+   * If we couldn't start any recovery ops (!wip) and things are still
+   * unfound, see if we can discover more missing object locations.
+   * It may be that our initial locations were bad and we errored
+   * out while trying to pull.
+   */
+  if (!wip && have_unfound()) {
+    discover_all_missing(*rctx->query_map);
+    if (rctx->query_map->empty()) {  // query_map still empty after discovery
+      string action;
+      if (state_test(PG_STATE_BACKFILLING)) {
+	auto evt = PG::CephPeeringEvtRef(
+	  new PG::CephPeeringEvt(
+	    queued,
+	    queued,
+	    PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
+	queue_peering_event(evt);
+	action = "in backfill";
+      } else if (state_test(PG_STATE_RECOVERING)) {
+	auto evt = PG::CephPeeringEvtRef(
+	  new PG::CephPeeringEvt(
+	    queued,
+	    queued,
+	    PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
+	queue_peering_event(evt);
+	action = "in recovery";
+      } else {  // neither BACKFILLING nor RECOVERING: nothing is queued, only logged
+	action = "already out of recovery/backfill";
+      }
+      dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
+    } else {  // discovery filled query_map: requeue recovery for this pg
+      dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
+      queue_recovery();
+    }
+  }
+  write_if_dirty(*rctx->transaction);  // runs on every call, not only on the unfound path
+}
+
 void PG::handle_advance_map(
   OSDMapRef osdmap, OSDMapRef lastmap,
   vector<int>& newup, int up_primary,
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 4c64a0e6001..fe4571477ca 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -410,6 +410,18 @@ public:
 
   void handle_pg_trim(epoch_t epoch, int from, shard_id_t shard, eversion_t trim_to);
 
+  /**
+   * @param ops_begun returns how many recovery ops the function started
+   * @returns true if any useful work was accomplished; false otherwise
+   */
+  virtual bool start_recovery_ops(
+    uint64_t max,  // NOTE(review): presumably the cap on ops to start; OSD::do_recovery passes reserved_pushes -- confirm
+    ThreadPool::TPHandle &handle,
+    uint64_t *ops_begun) = 0;
+
+  // follow-up to start_recovery_ops() that needs a RecoveryCtx; OSD::do_recovery passes that call's result as wip
+  void stuck_on_unfound(epoch_t queued, bool wip, RecoveryCtx *rctx);
+
   virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
 
   void dump_pgstate_history(Formatter *f);
@@ -1342,15 +1354,6 @@ protected:
 
   virtual void check_local() = 0;
 
-  /**
-   * @param ops_begun returns how many recovery ops the function started
-   * @returns true if any useful work was accomplished; false otherwise
-   */
-  virtual bool start_recovery_ops(
-    uint64_t max,
-    ThreadPool::TPHandle &handle,
-    uint64_t *ops_begun) = 0;
-
   void purge_strays();
 
   void update_heartbeat_peers();