From 27e14504f6c4b7a8e06b038fc396c40e8e6c1456 Mon Sep 17 00:00:00 2001
From: David Zafman <dzafman@redhat.com>
Date: Wed, 5 Apr 2017 14:09:18 -0700
Subject: [PATCH] osd: Add PG state and flag for too full for recovery

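Add a new recovery state machine state, NotRecovering, and a new PG
state flag, PG_STATE_RECOVERY_TOOFULL (reported as "recovery_toofull").

Also add the osd_recovery_retry_interval option and
PG::schedule_recovery_full_retry(), which uses it to schedule a delayed
DoRecovery event.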

Signed-off-by: David Zafman <dzafman@redhat.com>
---
 .../osd_internals/recovery_reservation.rst    |  1 +
 src/common/config_opts.h                      |  3 ++
 src/mon/PGMonitor.cc                          |  3 ++
 src/osd/OSD.cc                                |  1 +
 src/osd/OSD.h                                 |  1 +
 src/osd/PG.cc                                 | 31 +++++++++++++++++++
 src/osd/PG.h                                  |  9 ++++++
 src/osd/osd_types.cc                          |  4 +++
 src/osd/osd_types.h                           |  1 +
 9 files changed, 54 insertions(+)

diff --git a/doc/dev/osd_internals/recovery_reservation.rst b/doc/dev/osd_internals/recovery_reservation.rst
index cabea04cc73bb..4ab03192fe554 100644
--- a/doc/dev/osd_internals/recovery_reservation.rst
+++ b/doc/dev/osd_internals/recovery_reservation.rst
@@ -62,6 +62,7 @@ to the monitor. The state chart can set:
 
  - recovery_wait: waiting for local/remote reservations
  - recovering: recovering
+ - recovery_toofull: recovery stopped, OSD(s) above full ratio
  - backfill_wait: waiting for remote backfill reservations
  - backfilling: backfilling
  - backfill_toofull: backfill stopped, OSD(s) above backfillfull ratio
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 9eed4d485e21c..c1630a44c010b 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -630,6 +630,9 @@ OPTION(osd_min_recovery_priority, OPT_INT, 0)
 // Seconds to wait before retrying refused backfills
 OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0)
 
+// Seconds to wait before retrying refused recovery
+OPTION(osd_recovery_retry_interval, OPT_DOUBLE, 30.0)
+
 // max agent flush ops
 OPTION(osd_agent_max_ops, OPT_INT, 4)
 OPTION(osd_agent_max_low_ops, OPT_INT, 2)
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 6669ffcd4b358..477dabce4e0dc 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1316,6 +1316,8 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
       note["backfilling"] += p->second;
     if (p->first & PG_STATE_BACKFILL_TOOFULL)
       note["backfill_toofull"] += p->second;
+    if (p->first & PG_STATE_RECOVERY_TOOFULL)
+      note["recovery_toofull"] += p->second;
   }
 
   ceph::unordered_map<pg_t, pg_stat_t> stuck_pgs;
@@ -1403,6 +1405,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
 	                        PG_STATE_REPAIR |
 	                        PG_STATE_RECOVERING |
 	                        PG_STATE_RECOVERY_WAIT |
+	                        PG_STATE_RECOVERY_TOOFULL |
 	                        PG_STATE_INCOMPLETE |
 	                        PG_STATE_BACKFILL_WAIT |
 	                        PG_STATE_BACKFILL |
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 2b2aa97beb0b1..e39330e092379 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2910,6 +2910,7 @@ void OSD::create_recoverystate_perf()
   rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
   rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
   rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
+  rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
 
   recoverystate_perf = rs_perf.create_perf_counters();
   cct->get_perfcounters_collection()->add(recoverystate_perf);
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 9429640a9b51f..f6afebd4df08b 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -202,6 +202,7 @@ enum {
   rs_down_latency,
   rs_getmissing_latency,
   rs_waitupthru_latency,
+  rs_notrecovering_latency,
   rs_last,
 };
 
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index e7e21c6130209..576ec836dc552 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -3817,6 +3817,16 @@ void PG::schedule_backfill_full_retry()
       RequestBackfill()));
 }
 
+void PG::schedule_recovery_full_retry()  // schedule a delayed DoRecovery event for this PG
+{
+  Mutex::Locker lock(osd->recovery_request_lock);  // taken before touching recovery_request_timer
+  osd->recovery_request_timer.add_event_after(
+    cct->_conf->osd_recovery_retry_interval,  // delay comes from the osd_recovery_retry_interval option
+    new QueuePeeringEvt<DoRecovery>(
+      this, get_osdmap()->get_epoch(),  // callback carries this PG and the current osdmap epoch
+      DoRecovery()));
+}
+
 void PG::clear_scrub_reserved()
 {
   scrubber.reserved_peers.clear();
@@ -5237,6 +5247,7 @@ void PG::start_peering_interval(
   state_clear(PG_STATE_PEERED);
   state_clear(PG_STATE_DOWN);
   state_clear(PG_STATE_RECOVERY_WAIT);
+  state_clear(PG_STATE_RECOVERY_TOOFULL);
   state_clear(PG_STATE_RECOVERING);
 
   peer_purged.clear();
@@ -6488,6 +6499,24 @@ void PG::RecoveryState::NotBackfilling::exit()
   pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
 }
 
+/*----NotRecovering------*/
+PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)  // runs on entry to NotRecovering
+  : my_base(ctx),
+    NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/NotRecovering")
+{
+  context< RecoveryMachine >().log_enter(state_name);  // log entry under the state name given above
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->publish_stats_to_osd();  // only action on entry; no PG state bits are set or cleared here
+}
+
+void PG::RecoveryState::NotRecovering::exit()  // runs on leaving NotRecovering: log and record time spent
+{
+  context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now() - enter_time;  // time elapsed since enter_time
+  pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);  // feed dur into the rs_notrecovering_latency counter
+}
+
 /*---RepNotRecovering----*/
 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
   : my_base(ctx),
@@ -6737,6 +6766,7 @@ PG::RecoveryState::Recovering::Recovering(my_context ctx)
 
   PG *pg = context< RecoveryMachine >().pg;
   pg->state_clear(PG_STATE_RECOVERY_WAIT);
+  pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
   pg->state_set(PG_STATE_RECOVERING);
   pg->publish_stats_to_osd();
   pg->queue_recovery();
@@ -7185,6 +7215,7 @@ void PG::RecoveryState::Active::exit()
   pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
   pg->state_clear(PG_STATE_BACKFILL_WAIT);
   pg->state_clear(PG_STATE_RECOVERY_WAIT);
+  pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
   utime_t dur = ceph_clock_now() - enter_time;
   pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
   pg->agent_stop();
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 4763859d66b9a..6ac48fd6cbfaf 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1340,6 +1340,7 @@ public:
 
   void reject_reservation();
   void schedule_backfill_full_retry();
+  void schedule_recovery_full_retry();
 
   // -- recovery state --
 
@@ -1850,6 +1851,14 @@ public:
       boost::statechart::result react(const RemoteReservationRejected& evt);
     };
 
+    struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {  // state nested in Active
+      typedef boost::mpl::list<
+	boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >  // DoRecovery moves to WaitLocalRecoveryReserved
+	> reactions;
+      explicit NotRecovering(my_context ctx);  // entry action
+      void exit();  // exit action
+    };
+
     struct RepNotRecovering;
     struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
       explicit ReplicaActive(my_context ctx);
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 2cce17e029fc1..145a0d76831a4 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -789,6 +789,8 @@ std::string pg_state_string(int state)
     oss << "clean+";
   if (state & PG_STATE_RECOVERY_WAIT)
     oss << "recovery_wait+";
+  if (state & PG_STATE_RECOVERY_TOOFULL)
+    oss << "recovery_toofull+";
   if (state & PG_STATE_RECOVERING)
     oss << "recovering+";
   if (state & PG_STATE_DOWN)
@@ -869,6 +871,8 @@ int pg_string_state(const std::string& state)
     type = PG_STATE_BACKFILL_TOOFULL;
   else if (state == "recovery_wait")
     type = PG_STATE_RECOVERY_WAIT;
+  else if (state == "recovery_toofull")
+    type = PG_STATE_RECOVERY_TOOFULL;
   else if (state == "undersized")
     type = PG_STATE_UNDERSIZED;
   else if (state == "activating")
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 0d2297582f566..1c4e4c65a6ceb 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -971,6 +971,7 @@ inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
 #define PG_STATE_PEERED        (1<<25) // peered, cannot go active, can recover
 #define PG_STATE_SNAPTRIM      (1<<26) // trimming snaps
 #define PG_STATE_SNAPTRIM_WAIT (1<<27) // queued to trim snaps
+#define PG_STATE_RECOVERY_TOOFULL (1<<28) // recovery can't proceed: too full
 
 std::string pg_state_string(int state);
 std::string pg_vector_string(const vector<int32_t> &a);
-- 
2.39.5