]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: do not forget pg_stat acks which failed to send
author huangjun <huangjun@xsky.com>
Tue, 1 Aug 2017 12:24:19 +0000 (12:24 +0000)
committer huangjun <huangjun@xsky.com>
Tue, 1 Aug 2017 12:24:19 +0000 (12:24 +0000)
  If the osd gets a network error when sending pg_stats, it will
  resend the pg_stats with tid+1, so the former tid will remain
  in outstanding_pg_stats. In osd tick(), if the size of
  outstanding_pg_stats is greater than osd_mon_report_max_in_flight
  (default: 2), the osd will refuse to send pg_stats, which blocks
  pg states from changing.
  Eventually this causes qa tests like resolve_stuck_peering.py to fail.

Signed-off-by: huangjun <huangjun@xsky.com>
src/osd/OSD.cc

index 33ee4182d185c9beff945794eba0ca7109f547f6..3efa72d452e63c02a6278822c227e2e06876c1e3 100644 (file)
@@ -6058,8 +6058,9 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
        stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
   dout(20) << __func__ << "  timeout now " << stats_ack_timeout << dendl;
 
-  if (ack->get_tid() > pg_stat_tid_flushed) {
-    pg_stat_tid_flushed = ack->get_tid();
+  const uint64_t ack_tid = ack->get_tid();
+  if (ack_tid > pg_stat_tid_flushed) {
+    pg_stat_tid_flushed = ack_tid;
     pg_stat_queue_cond.Signal();
   }
 
@@ -6090,7 +6091,16 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
     }
   }
 
-  outstanding_pg_stats.erase(ack->get_tid());
+  // Also drop any earlier pg-stats tids that are still outstanding;
+  // this happens if they were not sent successfully.
+  for (auto tid = outstanding_pg_stats.cbegin();
+        tid != outstanding_pg_stats.cend(); ) {
+    if (*tid <= ack_tid) {
+      tid = outstanding_pg_stats.erase(tid);
+    } else {
+      break;
+    }
+  }
   dout(20) << __func__ << "  still pending: " << outstanding_pg_stats << dendl;
 
   pg_stat_queue_lock.Unlock();