From: Sage Weil Date: Fri, 12 Feb 2010 22:45:02 +0000 (-0800) Subject: osd: fix recovery requeue race X-Git-Tag: v0.19~27 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ce464a5a0af670fba24ba37e9d44fa5e7112165a;p=ceph.git osd: fix recovery requeue race If a recovery op finished right as another recovery op was begin started, we could get into start_recovery_ops() and get max = 0 and not start anything. Since the PG wasn't being requeued for later, it would never recover. So, requeue if we race and get max == 0. --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 18164a6635c7..de52001de8b4 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3861,27 +3861,35 @@ bool OSD::_recover_now() void OSD::do_recovery(PG *pg) { - pg->lock(); - + // see how many we should try to start. note that this is a bit racy. + recovery_wq.lock(); int max = g_conf.osd_recovery_max_active - recovery_ops_active; - - dout(10) << "do_recovery starting " << max - << " (" << recovery_ops_active << "/" << g_conf.osd_recovery_max_active << " rops) on " - << *pg << dendl; + recovery_wq.unlock(); + if (max == 0) { + dout(10) << "do_recovery raced and failed to start anything; requeuing " << *pg << dendl; + recovery_wq.queue(pg); + } else { + + pg->lock(); + + dout(10) << "do_recovery starting " << max + << " (" << recovery_ops_active << "/" << g_conf.osd_recovery_max_active << " rops) on " + << *pg << dendl; #ifdef DEBUG_RECOVERY_OIDS - dout(20) << " active was " << recovery_oids << dendl; + dout(20) << " active was " << recovery_oids << dendl; #endif - - int started = pg->start_recovery_ops(max); - - dout(10) << "do_recovery started " << started - << " (" << recovery_ops_active << "/" << g_conf.osd_recovery_max_active << " rops) on " - << *pg << dendl; - - if (started < max) - pg->recovery_item.remove_myself(); - - pg->unlock(); + + int started = pg->start_recovery_ops(max); + + dout(10) << "do_recovery started " << started + << " (" << recovery_ops_active << "/" << g_conf.osd_recovery_max_active << " rops) on " + << *pg << dendl; + + if (started < max) + pg->recovery_item.remove_myself(); + + pg->unlock(); + } pg->put(); }