From: Jianpeng Ma Date: Thu, 5 May 2016 10:14:40 +0000 (+0800) Subject: common/HeartbeatMap: print stack info of unhealth thread. X-Git-Tag: v11.0.0~467^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c90ccbf227c9b822a6ffb4bfba9716a978d2b34f;p=ceph.git common/HeartbeatMap: print stack info of unhealth thread. We use the heartbeat map to detect whether a thread is healthy. If a thread exceeds its suicide timeout, we abort the OSD with an assert. Currently, however, the assert fires in the heartbeat map's checking thread rather than in the unhealthy thread. To get useful information, send SIGABRT to the unhealthy thread before asserting, so that the stack info of the unhealthy thread is dumped. Signed-off-by: Jianpeng Ma --- diff --git a/src/common/HeartbeatMap.cc b/src/common/HeartbeatMap.cc index 51b7aa796e1..80d2e67684f 100644 --- a/src/common/HeartbeatMap.cc +++ b/src/common/HeartbeatMap.cc @@ -44,7 +44,7 @@ HeartbeatMap::~HeartbeatMap() assert(m_workers.empty()); } -heartbeat_handle_d *HeartbeatMap::add_worker(const string& name) +heartbeat_handle_d *HeartbeatMap::add_worker(const string& name, pthread_t thread_id) { m_rwlock.get_write(); ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl; @@ -55,6 +55,7 @@ heartbeat_handle_d *HeartbeatMap::add_worker(const string& name) "heartbeat_handle_d suicide_timeout"); m_workers.push_front(h); h->list_item = m_workers.begin(); + h->thread_id = thread_id; m_rwlock.put_write(); return h; } @@ -83,6 +84,8 @@ bool HeartbeatMap::_check(const heartbeat_handle_d *h, const char *who, time_t n if (was && was < now) { ldout(m_cct, 1) << who << " '" << h->name << "'" << " had suicide timed out after " << h->suicide_grace << dendl; + pthread_kill(h->thread_id, SIGABRT); + sleep(1); assert(0 == "hit suicide timeout"); } return healthy; diff --git a/src/common/HeartbeatMap.h b/src/common/HeartbeatMap.h index 8ab5f648a03..11efa9dd56c 100644 --- a/src/common/HeartbeatMap.h +++ b/src/common/HeartbeatMap.h @@ -42,19 +42,20 @@ namespace ceph { struct heartbeat_handle_d { const std::string name; + pthread_t thread_id; atomic_t timeout, 
suicide_timeout; time_t grace, suicide_grace; std::list<heartbeat_handle_d*>::iterator list_item; explicit heartbeat_handle_d(const std::string& n) - : name(n), grace(0), suicide_grace(0) + : name(n), thread_id(0), grace(0), suicide_grace(0) { } }; class HeartbeatMap { public: // register/unregister - heartbeat_handle_d *add_worker(const std::string& name); + heartbeat_handle_d *add_worker(const std::string& name, pthread_t thread_id); void remove_worker(const heartbeat_handle_d *h); // reset the timeout so that it expects another touch within grace amount of time diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc index 9d99fd1cbfd..0a0b601833a 100644 --- a/src/common/WorkQueue.cc +++ b/src/common/WorkQueue.cc @@ -95,7 +95,7 @@ void ThreadPool::worker(WorkThread *wt) std::stringstream ss; ss << name << " thread " << (void*)pthread_self(); - heartbeat_handle_d *hb = cct->get_heartbeat_map()->add_worker(ss.str()); + heartbeat_handle_d *hb = cct->get_heartbeat_map()->add_worker(ss.str(), pthread_self()); while (!_stop) { @@ -298,7 +298,7 @@ void ShardedThreadPool::shardedthreadpool_worker(uint32_t thread_index) std::stringstream ss; ss << name << " thread " << (void*)pthread_self(); - heartbeat_handle_d *hb = cct->get_heartbeat_map()->add_worker(ss.str()); + heartbeat_handle_d *hb = cct->get_heartbeat_map()->add_worker(ss.str(), pthread_self()); while (!stop_threads.read()) { if(pause_threads.read()) { diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 188750a195d..7ac68138859 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -72,7 +72,7 @@ MDSRank::MDSRank( suicide_hook(suicide_hook_), standby_replaying(false) { - hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank"); + hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self()); finisher = new Finisher(msgr->cct); diff --git a/src/test/heartbeat_map.cc b/src/test/heartbeat_map.cc index 9ba134afa22..41e2dc38d50 100644 --- a/src/test/heartbeat_map.cc +++ 
b/src/test/heartbeat_map.cc @@ -22,7 +22,7 @@ using namespace ceph; TEST(HeartbeatMap, Healthy) { HeartbeatMap hm(g_ceph_context); - heartbeat_handle_d *h = hm.add_worker("one"); + heartbeat_handle_d *h = hm.add_worker("one", pthread_self()); hm.reset_timeout(h, 9, 18); bool healthy = hm.is_healthy(); @@ -33,7 +33,7 @@ TEST(HeartbeatMap, Healthy) { TEST(HeartbeatMap, Unhealth) { HeartbeatMap hm(g_ceph_context); - heartbeat_handle_d *h = hm.add_worker("one"); + heartbeat_handle_d *h = hm.add_worker("one", pthread_self()); hm.reset_timeout(h, 1, 3); sleep(2);