osd/PG: move more recovery logic into PG

author Sage Weil <sage@redhat.com>

Sun, 17 Sep 2017 22:29:16 +0000 (17:29 -0500)

committer Sage Weil <sage@redhat.com>

Fri, 6 Oct 2017 18:08:18 +0000 (13:08 -0500)
author Sage Weil <sage@redhat.com>
Sun, 17 Sep 2017 22:29:16 +0000 (17:29 -0500)
committer Sage Weil <sage@redhat.com>
Fri, 6 Oct 2017 18:08:18 +0000 (13:08 -0500)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index 342051a1c6a0498afdbe8dc7c23c863c6d7b6cba..de71ab9855f96f25afac50f01bdd9cbadb97017e 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -8859,55 +8859,19 @@ void OSD::do_recovery(
      dout(20) << "  active was " << service.recovery_oids[pg->pg_id] << dendl;
  #endif
  
-    bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
+    bool wip = pg->start_recovery_ops(reserved_pushes, handle, &started);
      dout(10) << "do_recovery started " << started << "/" << reserved_pushes 
              << " on " << *pg << dendl;
  
      // If no recovery op is started, don't bother to manipulate the RecoveryCtx
-    if (!started && (more || !pg->have_unfound())) {
+    if (!started && (wip || !pg->have_unfound())) {
        goto out;
      }
  
      PG::RecoveryCtx rctx = create_context();
      rctx.handle = &handle;
-
-    /*
-     * if we couldn't start any recovery ops and things are still
-     * unfound, see if we can discover more missing object locations.
-     * It may be that our initial locations were bad and we errored
-     * out while trying to pull.
-     */
-    if (!more && pg->have_unfound()) {
-      pg->discover_all_missing(*rctx.query_map);
-      if (rctx.query_map->empty()) {
-       string action;
-        if (pg->state_test(PG_STATE_BACKFILLING)) {
-         auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
-           queued,
-           queued,
-           PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
-         pg->queue_peering_event(evt);
-         action = "in backfill";
-        } else if (pg->state_test(PG_STATE_RECOVERING)) {
-         auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
-           queued,
-           queued,
-           PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
-         pg->queue_peering_event(evt);
-         action = "in recovery";
-       } else {
-         action = "already out of recovery/backfill";
-       }
-       dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
-      } else {
-       dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
-       pg->queue_recovery();
-      }
-    }
-
-    pg->write_if_dirty(*rctx.transaction);
-    OSDMapRef curmap = pg->get_osdmap();
-    dispatch_context(rctx, pg, curmap);
+    pg->stuck_on_unfound(queued, wip, &rctx);
+    dispatch_context(rctx, pg, pg->get_osdmap());
    }
  
   out:
diff --git a/src/osd/PG.cc b/src/osd/PG.cc

index 36058c439b6412e055158479d65760a6ff43f460..05779f5eaf366dce61e1a28de11e89f5d9f43f71 100644 (file)
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -5722,6 +5722,46 @@ void PG::queue_query(epoch_t msg_epoch,
                                          MQuery(from, q, query_epoch))));
  }
  
+void PG::stuck_on_unfound(epoch_t queued, bool wip, RecoveryCtx *rctx)
+{
+  /*
+   * If we couldn't start any recovery ops (!wip) and objects are still
+   * unfound, try to discover more missing object locations: our initial
+   * locations may have been bad and we errored out while trying to pull.
+   * If the query map is still empty, defer backfill/recovery; else requeue.
+   */
+  if (!wip && have_unfound()) {
+    discover_all_missing(*rctx->query_map);
+    if (rctx->query_map->empty()) {
+      string action;
+      if (state_test(PG_STATE_BACKFILLING)) {
+       auto evt = PG::CephPeeringEvtRef(
+         new PG::CephPeeringEvt(
+           queued,
+           queued,
+           PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
+       queue_peering_event(evt);
+       action = "in backfill";
+      } else if (state_test(PG_STATE_RECOVERING)) {
+       auto evt = PG::CephPeeringEvtRef(
+         new PG::CephPeeringEvt(
+           queued,
+           queued,
+           PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
+       queue_peering_event(evt);
+       action = "in recovery";
+      } else {
+       action = "already out of recovery/backfill";
+      }
+      dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
+    } else {
+      dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
+      queue_recovery();
+    }
+  }
+  write_if_dirty(*rctx->transaction);  // unconditional: runs even if the branch above is skipped
+}
+
  void PG::handle_advance_map(
    OSDMapRef osdmap, OSDMapRef lastmap,
    vector<int>& newup, int up_primary,
diff --git a/src/osd/PG.h b/src/osd/PG.h

index 4c64a0e60010531c51c0d1c5de20c4e623a63d2b..fe4571477cabaeed459fa3d3d09939a09dd6b60c 100644 (file)
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -410,6 +410,18 @@ public:
  
    void handle_pg_trim(epoch_t epoch, int from, shard_id_t shard, eversion_t trim_to);
  
+  /**
+   * @param ops_begun returns how many recovery ops the function started
+   * @returns true if any useful work was accomplished; false otherwise
+   */
+  virtual bool start_recovery_ops(
+    uint64_t max,
+    ThreadPool::TPHandle &handle,
+    uint64_t *ops_begun) = 0;
+
+  // follow-up to start_recovery_ops(): handles still-unfound objects, using a RecoveryCtx
+  void stuck_on_unfound(epoch_t queued, bool wip, RecoveryCtx *rctx);
+
    virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
  
    void dump_pgstate_history(Formatter *f);
@@ -1342,15 +1354,6 @@ protected:
  
    virtual void check_local() = 0;
  
-  /**
-   * @param ops_begun returns how many recovery ops the function started
-   * @returns true if any useful work was accomplished; false otherwise
-   */
-  virtual bool start_recovery_ops(
-    uint64_t max,
-    ThreadPool::TPHandle &handle,
-    uint64_t *ops_begun) = 0;
-
    void purge_strays();
  
    void update_heartbeat_peers();
author	Sage Weil <sage@redhat.com>
	Sun, 17 Sep 2017 22:29:16 +0000 (17:29 -0500)
committer	Sage Weil <sage@redhat.com>
	Fri, 6 Oct 2017 18:08:18 +0000 (13:08 -0500)
src/osd/OSD.cc		patch \| blob \| history
src/osd/PG.cc		patch \| blob \| history
src/osd/PG.h		patch \| blob \| history