From 75332450e3a0d23a051062dbbfda7a59a701e618 Mon Sep 17 00:00:00 2001 From: Dongmao Zhang Date: Wed, 29 Oct 2014 11:25:43 +0800 Subject: [PATCH] Fix rados_shutdown hang forever when using radosstriper Dear list, I have met this when I was using radosstriper C API. My program is roughly like this: rados_striper_aio_write rados_aio_flush rados_aio_wait_for_safe rados_aio_release rados_striper_destroy rados_ioctx_destroy rados_shutdown /Hangs here/ In most time, this works well, But the programm occasionally hangs forever. Output of gstack: Thread 1 (Thread 0x7fe0afba0760 (LWP 18509)): 0 0x000000330f20822d in pthread_join () from /lib64/libpthread.so.0 1 0x000000347566cea2 in Thread::join(void**) () from /usr/lib64/librados.so.2 2 0x00000034755ac535 in librados::RadosClient::shutdown() () from /usr/lib64/librados.so.2 3 0x0000003475592269 in rados_shutdown () from /usr/lib64/librados.so.2 4 0x0000000000402349 in main () Thread 4 (Thread 0x7fe0ab14d700 (LWP 18541)): 0 0x000000330f20e264 in __lll_lock_wait () from /lib64/libpthread.so.0 1 0x000000330f209508 in _L_lock_854 () from /lib64/libpthread.so.0 2 0x000000330f2093d7 in pthread_mutex_lock () from /lib64/libpthread.so.0 3 0x0000003475633af1 in Mutex::Lock(bool) () from /usr/lib64/librados.so.2 4 0x00000034755abd37 in librados::RadosClient::put() () from /usr/lib64/librados.so.2 5 0x0000003475592501 in librados::Rados::shutdown() () from /usr/lib64/librados.so.2 6 0x00007fe0afbba9f7 in libradosstriper::RadosStriperImpl::CompletionData::~CompletionData() () from /usr/lib64/libradosstriper.so.1 7 0x00007fe0afbbaad9 in libradosstriper::RadosStriperImpl::WriteCompletionData::~WriteCompletionData() () from /usr/lib64/libradosstriper.so.1 8 0x00007fe0afbc1d75 in RefCountedObject::put() () from /usr/lib64/libradosstriper.so.1 9 0x00007fe0afbc224d in libradosstriper::MultiAioCompletionImpl::safe_request(long) () from /usr/lib64/libradosstriper.so.1 10 0x00000034755c5ce8 in librados::C_AioSafe::finish(int) () from /usr/lib64/librados.so.2 11 0x00000034755a0e89 in Context::complete(int) () from /usr/lib64/librados.so.2 12 0x000000347564d4c8 in Finisher::finisher_thread_entry() () from /usr/lib64/librados.so.2 13 0x000000330f2079d1 in start_thread () from /lib64/libpthread.so.0 14 0x000000330eae886d in clone () from /lib64/libc.so.6 It is obvious that librados::Rados::shutdown is not a thread-safe function here. It will hang forever. The culprit of this is when CompletionData is released, it will first notify "rados_aio_wait_for_safe" to continue, and CompletionData will call put() to release other data. But if the main thread(Thread 1 here) runs fast enough, rados_striper_destroy will be executed before other thread(Thread 4 here)'s releasing refcnf. In this situation, main thread runs Rados::shutdown() while other thread runs Rados::shutdown() in the same time. My suggestion is to let RadosStriperImpl::aio_flush to block until all the CompletionData has been released. This makes sure other thread will never call rados_shutdown. --- src/libradosstriper/RadosStriperImpl.cc | 13 +++++++++++-- src/libradosstriper/RadosStriperImpl.h | 18 +++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/libradosstriper/RadosStriperImpl.cc b/src/libradosstriper/RadosStriperImpl.cc index 798fca3fcf6e1..297bfb9d3d19f 100644 --- a/src/libradosstriper/RadosStriperImpl.cc +++ b/src/libradosstriper/RadosStriperImpl.cc @@ -203,7 +203,7 @@ libradosstriper::RadosStriperImpl::RadosExclusiveLock::~RadosExclusiveLock() { ///////////////////////// constructor ///////////////////////////// libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl) : - m_refCnt(0), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl), + m_refCnt(0),lock("RadosStriper Refcont", false, false), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl), m_layout(g_default_file_layout) {} ///////////////////////// layout ///////////////////////////// @@ -513,8 +513,17 @@ int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid, int libradosstriper::RadosStriperImpl::aio_flush() { + int ret; // pass to the rados level - return m_ioCtx.aio_flush(); + ret = m_ioCtx.aio_flush(); + if (ret < 0) + return ret; + //wait all CompletionData are released + lock.Lock(); + while (m_refCnt > 1) + cond.Wait(lock); + lock.Unlock(); + return ret; } ///////////////////////// stat and deletion ///////////////////////////// diff --git a/src/libradosstriper/RadosStriperImpl.h b/src/libradosstriper/RadosStriperImpl.h index c1496b93be616..7862f0596ee57 100644 --- a/src/libradosstriper/RadosStriperImpl.h +++ b/src/libradosstriper/RadosStriperImpl.h @@ -192,10 +192,19 @@ struct libradosstriper::RadosStriperImpl { // reference counting void get() { - m_refCnt.inc(); + lock.Lock(); + m_refCnt ++ ; + lock.Unlock(); } void put() { - if (m_refCnt.dec() == 0) + bool deleteme = false; + lock.Lock(); + m_refCnt --; + if (m_refCnt == 0) + deleteme = true; + cond.Signal(); + lock.Unlock(); + if (deleteme) delete this; } @@ -312,7 +321,10 @@ struct libradosstriper::RadosStriperImpl { } // reference counting - atomic_t m_refCnt; + Cond cond; + int m_refCnt; + Mutex lock; + // Context librados::Rados m_radosCluster; -- 2.39.5