]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Disable heartbeat timeout until a non-future workitem can be processed
authorSridhar Seshasayee <sseshasa@redhat.com>
Tue, 25 May 2021 14:09:33 +0000 (19:39 +0530)
committerSridhar Seshasayee <sseshasa@redhat.com>
Tue, 3 Aug 2021 05:54:36 +0000 (11:24 +0530)
There could be rare instances when employing the mclock scheduler where a
worker thread for a shard may not get an immediate work item to process.
Such items are designated as future work items. In such cases, the
_process() loop waits until the time indicated by the scheduler to attempt
a dequeue from the scheduler queue again. It may so happen that if there
are multiple threads per shard, a thread may not get an immediate item for
a long time. This time could exceed the heartbeat timeout for the thread
and result in hearbeat timeouts reported for the osd in question. To
prevent this, the heartbeat timeouts for the thread is disabled before
waiting for an item and enabled once the wait period is over.

Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
(cherry picked from commit 9a95492b66341f7351e80f0386b4439f713debc6)

src/osd/OSD.cc

index 525304726b2d4a70d41514006df6d2d2c60b2c21..f16195bb38eca31a232397fa2e6c784a50b258c5 100644 (file)
@@ -10842,12 +10842,17 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
       std::unique_lock wait_lock{sdata->sdata_wait_lock};
       auto future_time = ceph::real_clock::from_double(*when_ready);
       dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
+      // Disable heartbeat timeout until we find a non-future work item to process.
+      osd->cct->get_heartbeat_map()->clear_timeout(hb);
       sdata->shard_lock.unlock();
       ++sdata->waiting_threads;
       sdata->sdata_cond.wait_until(wait_lock, future_time);
       --sdata->waiting_threads;
       wait_lock.unlock();
       sdata->shard_lock.lock();
+      // Reapply default wq timeouts
+      osd->cct->get_heartbeat_map()->reset_timeout(hb,
+        timeout_interval, suicide_interval);
     }
   } // while