From 8ffefecf23ace10a4cc9c52bac8236e78902ea3a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@redhat.com>
Date: Wed, 23 Sep 2015 10:25:30 -0400
Subject: [PATCH] osd: do full check in do_op

1. The current pool_last_map_marked_full tracking is buggy.

2. We need to recheck this each time we consider the op, not just when
   it is received off the wire.  Otherwise, we might get a message,
   queue it for some reason, get a map indicating the cluster or pool
   is full, and then requeue and process the op instead of discarding
   it.

3. For now, silently drop ops when the failsafe check fails.  This
   will lead to stalled client IO.  This needs a more robust fix.

Signed-off-by: Sage Weil <sage@redhat.com>
---
 src/osd/OSD.cc          | 16 ----------------
 src/osd/ReplicatedPG.cc | 19 +++++++++++++++++++
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index f8611fff16e21..25343ed6a36fe 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -8082,26 +8082,10 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap)
   pg_t _pgid = m->get_pg();
   int64_t pool = _pgid.pool();
   if (op->may_write()) {
-    // full?
-    if ((service.check_failsafe_full() ||
-         osdmap->test_flag(CEPH_OSDMAP_FULL) ||
-         m->get_map_epoch() < superblock.last_map_marked_full) &&
-        !m->get_source().is_mds()) {  // FIXME: we'll exclude mds writes for now.
-      // Drop the request, since the client will retry when the full
-      // flag is unset.
-      return;
-    }
     const pg_pool_t *pi = osdmap->get_pg_pool(pool);
     if (!pi) {
       return;
     }
 
-    // pool is full ?
-    map<int64_t,epoch_t> &pool_last_map_marked_full = superblock.pool_last_map_marked_full;
-    if ((pi->has_flag(pg_pool_t::FLAG_FULL) ||
-         (pool_last_map_marked_full.count(pool) && (m->get_map_epoch() < pool_last_map_marked_full[pool]))) && !m->get_source().is_mds()) {
-      return;
-    }
-
     // invalid?
     if (m->get_snapid() != CEPH_NOSNAP) {
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index d4e2b6dbe0d56..51279ba52615b 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1496,6 +1496,25 @@ void ReplicatedPG::do_op(OpRequestRef& op)
     return;
   }
 
+  // discard due to cluster full transition? (we discard any op that
+  // originates before the cluster or pool is marked full; the client
+  // will resend after the full flag is removed or if they expect the
+  // op to succeed despite being full).  The exception is FULL_FORCE
+  // ops, which there is no reason to discard because they bypass all
+  // full checks anyway.
+  // FIXME: we exclude mds writes for now.
+  if (!(m->get_source().is_mds() || m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
+      info.history.last_epoch_marked_full > m->get_map_epoch()) {
+    dout(10) << __func__ << " discarding op sent before full " << m << " "
+             << *m << dendl;
+    return;
+  }
+  if (osd->check_failsafe_full()) {
+    dout(10) << __func__ << " fail-safe full check failed, dropping request"
+             << dendl;
+    return;
+  }
+
   // order this op as a write?
   bool write_ordered =
     op->may_write() ||
-- 
2.39.5
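
The ordering problem in point (2) of the commit message can be reproduced
in isolation with the sketch below.  It is illustrative only: Op, PGState,
and should_discard are hypothetical stand-ins invented for the example,
not the real OSD/ReplicatedPG types, and their fields only loosely mirror
info.history.last_epoch_marked_full, CEPH_OSD_FLAG_FULL_FORCE, and
osd->check_failsafe_full().  The point is that the check runs on every
dequeue, so a full-flag transition that happens while the op sits in a
queue is not missed.

#include <cstdint>
#include <deque>
#include <iostream>

using epoch_t = uint32_t;

// Hypothetical stand-in for an incoming client op.
struct Op {
  epoch_t map_epoch;   // epoch the client had when it sent the op
  bool full_force;     // analogous to CEPH_OSD_FLAG_FULL_FORCE
  bool from_mds;       // mds writes are excluded for now (see FIXME above)
};

// Hypothetical stand-in for the state consulted by the check.
struct PGState {
  epoch_t last_epoch_marked_full = 0;  // cf. info.history.last_epoch_marked_full
  bool failsafe_full = false;          // cf. osd->check_failsafe_full()
};

// The check the patch moves into do_op(): evaluated every time the op is
// dequeued and (re)considered, not only when it arrives off the wire.
bool should_discard(const Op& op, const PGState& pg) {
  if (!(op.from_mds || op.full_force) &&
      pg.last_epoch_marked_full > op.map_epoch) {
    return true;  // client will resend once the full flag clears
  }
  if (pg.failsafe_full) {
    return true;  // silently drop; known to stall client IO (point 3)
  }
  return false;
}

int main() {
  PGState pg;
  std::deque<Op> queue;

  // Op received at epoch 10, then queued for some reason.
  queue.push_back({/*map_epoch=*/10, /*full_force=*/false, /*from_mds=*/false});

  // While it sits in the queue, a new map marks the pool full at epoch 11.
  pg.last_epoch_marked_full = 11;

  // On requeue/dequeue the check runs again and the stale op is discarded,
  // instead of being processed as the old receive-time-only check allowed.
  Op op = queue.front();
  queue.pop_front();
  std::cout << (should_discard(op, pg) ? "discard" : "process") << "\n";
  return 0;
}

An op carrying the force flag (full_force = true here) would pass straight
through, which matches the rationale in the added comment: there is no
reason to discard ops that bypass all full checks anyway.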