From: huangjun Date: Tue, 1 Aug 2017 12:24:19 +0000 (+0000) Subject: osd: do not forget pg_stat acks which failed to send X-Git-Tag: v13.0.0~154^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=edc737874d03bc2c012088816d8cfe4726d6dbc7;p=ceph.git osd: do not forget pg_stat acks which failed to send If the OSD gets a network error while sending pg_stats, it resends the pg_stats with tid+1, so the former tid remains in outstanding_pg_stats. In OSD tick(), if the size of outstanding_pg_stats exceeds osd_mon_report_max_in_flight (default: 2), the OSD refuses to send pg_stats, which blocks PG states from changing. Eventually this fails QA tests such as resolve_stuck_peering.py. Signed-off-by: huangjun --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 33ee4182d18..3efa72d452e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6058,8 +6058,9 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack) stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay); dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl; - if (ack->get_tid() > pg_stat_tid_flushed) { - pg_stat_tid_flushed = ack->get_tid(); + const uint64_t ack_tid = ack->get_tid(); + if (ack_tid > pg_stat_tid_flushed) { + pg_stat_tid_flushed = ack_tid; pg_stat_queue_cond.Signal(); } @@ -6090,7 +6091,16 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack) } } - outstanding_pg_stats.erase(ack->get_tid()); + // if there are earlier pg-stats not yet acked, + // this happens if they are not sent successfully. + for (auto tid = outstanding_pg_stats.cbegin(); + tid != outstanding_pg_stats.cend(); ) { + if (*tid <= ack_tid) { + tid = outstanding_pg_stats.erase(tid); + } else { + break; + } + } dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl; pg_stat_queue_lock.Unlock();