From: Radoslaw Zarzynski
Date: Mon, 10 Feb 2020 20:40:06 +0000 (+0100)
Subject: osd: fix racy accesses to OSD::osdmap.
X-Git-Tag: v15.1.1~376^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=80da5f9a987c6a48b93f25228fdac85890013520;p=ceph.git

osd: fix racy accesses to OSD::osdmap.

According to cppreference.com [1]:

  "If multiple threads of execution access the same std::shared_ptr
  object without synchronization and any of those accesses uses
  a non-const member function of shared_ptr then a data race will
  occur (...)"

[1]: https://en.cppreference.com/w/cpp/memory/shared_ptr/atomic
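For illustration, a minimal standalone sketch of the pattern the quote
forbids (hypothetical code, not Ceph's; `global` plays the role of
`OSD::osdmap`):

```
// Hypothetical repro sketch -- not Ceph code. One thread copies from a
// shared shared_ptr while another assigns to it; per the rule quoted
// above this is a data race (undefined behavior), reported by Helgrind
// much like the OSD::osdmap races shown below.
#include <memory>
#include <thread>

std::shared_ptr<int> global = std::make_shared<int>(0);  // like OSD::osdmap

int main() {
  std::thread reader([] {
    for (int i = 0; i < 1000000; ++i) {
      std::shared_ptr<int> snap = global;  // copy ctor: bumps a refcount on
                                           // a control block that may be
                                           // swapped out concurrently
    }
  });
  std::thread writer([] {
    for (int i = 0; i < 1000000; ++i) {
      global = std::make_shared<int>(i);   // non-const operator=: replaces
                                           // the control block
    }
  });
  reader.join();
  writer.join();
}
```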
One of the coredumps showed the `shared_ptr`-typed `OSD::osdmap` with
healthy-looking content but a damaged control block:

```
[Current thread is 1 (Thread 0x7f7dcaf73700 (LWP 205295))]
(gdb) bt
#0  0x0000559cb81c3ea0 in ?? ()
#1  0x0000559c97675b27 in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x559cba0ec900) at /usr/include/c++/8/bits/shared_ptr_base.h:148
#2  std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x559cba0ec900) at /usr/include/c++/8/bits/shared_ptr_base.h:148
#3  0x0000559c975ef8aa in std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=, __in_chrg=) at /usr/include/c++/8/bits/shared_ptr_base.h:1167
#4  std::__shared_ptr::~__shared_ptr (this=, __in_chrg=) at /usr/include/c++/8/bits/shared_ptr_base.h:1167
#5  std::shared_ptr::~shared_ptr (this=, __in_chrg=) at /usr/include/c++/8/bits/shared_ptr.h:103
#6  OSD::create_context (this=) at /usr/src/debug/ceph-15.0.0-10071.g5b5a3a3.el8.x86_64/src/osd/OSD.cc:9053
#7  0x0000559c97655571 in OSD::dequeue_peering_evt (this=0x559ca22ac000, sdata=0x559ca2ef2900, pg=0x559cb4aa3400, evt=std::shared_ptr (use count 2, weak count 0) = {...}, handle=...)
    at /usr/src/debug/ceph-15.0.0-10071.g5b5a3a3.el8.x86_64/src/osd/OSD.cc:9665
#8  0x0000559c97886db6 in ceph::osd::scheduler::PGPeeringItem::run (this=, osd=, sdata=, pg=..., handle=...) at /usr/include/c++/8/ext/atomicity.h:96
#9  0x0000559c9764862f in ceph::osd::scheduler::OpSchedulerItem::run (handle=..., pg=..., sdata=, osd=, this=0x7f7dcaf703f0) at /usr/include/c++/8/bits/unique_ptr.h:342
#10 OSD::ShardedOpWQ::_process (this=, thread_index=, hb=) at /usr/src/debug/ceph-15.0.0-10071.g5b5a3a3.el8.x86_64/src/osd/OSD.cc:10677
#11 0x0000559c97c76094 in ShardedThreadPool::shardedthreadpool_worker (this=0x559ca22aca28, thread_index=14) at /usr/src/debug/ceph-15.0.0-10071.g5b5a3a3.el8.x86_64/src/common/WorkQueue.cc:311
#12 0x0000559c97c78cf4 in ShardedThreadPool::WorkThreadSharded::entry (this=) at /usr/src/debug/ceph-15.0.0-10071.g5b5a3a3.el8.x86_64/src/common/WorkQueue.h:706
#13 0x00007f7df17852de in start_thread () from /lib64/libpthread.so.0
#14 0x00007f7df052f133 in __libc_ifunc_impl_list () from /lib64/libc.so.6
#15 0x0000000000000000 in ?? ()
(gdb) frame 7
#7  0x0000559c97655571 in OSD::dequeue_peering_evt (this=0x559ca22ac000, sdata=0x559ca2ef2900, pg=0x559cb4aa3400, evt=std::shared_ptr (use count 2, weak count 0) = {...}, handle=...)
    at /usr/src/debug/ceph-15.0.0-10071.g5b5a3a3.el8.x86_64/src/osd/OSD.cc:9665
9665    in /usr/src/debug/ceph-15.0.0-10071.g5b5a3a3.el8.x86_64/src/osd/OSD.cc
(gdb) print osdmap
$24 = std::shared_ptr (expired, weak count 0) = {get() = 0x559cba028000}
(gdb) print *osdmap
# pretty sane OSDMap
(gdb) print sizeof(osdmap)
$26 = 16
(gdb) x/2a &osdmap
0x559ca22acef0: 0x559cba028000  0x559cba0ec900
(gdb) frame 2
#2  std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x559cba0ec900) at /usr/include/c++/8/bits/shared_ptr_base.h:148
148     /usr/include/c++/8/bits/shared_ptr_base.h: No such file or directory.
(gdb) disassemble
Dump of assembler code for function std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release():
...
   0x0000559c97675b1e <+62>:    mov    (%rdi),%rax
   0x0000559c97675b21 <+65>:    mov    %rdi,%rbx
   0x0000559c97675b24 <+68>:    callq  *0x10(%rax)
=> 0x0000559c97675b27 <+71>:    test   %rbp,%rbp
...
End of assembler dump.
(gdb) info registers rdi rbx rax
rdi            0x559cba0ec900      94131624790272
rbx            0x559cba0ec900      94131624790272
rax            0x559cba0ec8a0      94131624790176
(gdb) x/a 0x559cba0ec8a0 + 0x10
0x559cba0ec8b0: 0x559cb81c3ea0
(gdb) bt
#0  0x0000559cb81c3ea0 in ?? ()
...
(gdb) p $_siginfo._sifields._sigfault.si_addr
$27 = (void *) 0x559cb81c3ea0
```
Helgrind seems to agree:

```
==00:00:02:54.519 510301== Possible data race during write of size 8 at 0xF123930 by thread #90
==00:00:02:54.519 510301== Locks held: 2, at addresses 0xF122A58 0xF1239A8
==00:00:02:54.519 510301==    at 0x7218DD: operator= (shared_ptr_base.h:1078)
==00:00:02:54.519 510301==    by 0x7218DD: operator= (shared_ptr.h:103)
==00:00:02:54.519 510301==    by 0x7218DD: OSD::_committed_osd_maps(unsigned int, unsigned int, MOSDMap*) (OSD.cc:8116)
==00:00:02:54.519 510301==    by 0x7752CA: C_OnMapCommit::finish(int) (OSD.cc:7678)
==00:00:02:54.519 510301==    by 0x72A06C: Context::complete(int) (Context.h:77)
==00:00:02:54.519 510301==    by 0xD07F14: Finisher::finisher_thread_entry() (Finisher.cc:66)
==00:00:02:54.519 510301==    by 0xA7E1203: mythread_wrapper (hg_intercepts.c:389)
==00:00:02:54.519 510301==    by 0xC6182DD: start_thread (in /usr/lib64/libpthread-2.28.so)
==00:00:02:54.519 510301==    by 0xD8B34B2: clone (in /usr/lib64/libc-2.28.so)
==00:00:02:54.519 510301==
==00:00:02:54.519 510301== This conflicts with a previous read of size 8 by thread #117
==00:00:02:54.519 510301== Locks held: 1, at address 0x2123E9A0
==00:00:02:54.519 510301==    at 0x6B5842: __shared_ptr (shared_ptr_base.h:1165)
==00:00:02:54.519 510301==    by 0x6B5842: shared_ptr (shared_ptr.h:129)
==00:00:02:54.519 510301==    by 0x6B5842: get_osdmap (OSD.h:1700)
==00:00:02:54.519 510301==    by 0x6B5842: OSD::create_context() (OSD.cc:9053)
==00:00:02:54.519 510301==    by 0x71B570: OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr, ThreadPool::TPHandle&) (OSD.cc:9665)
==00:00:02:54.519 510301==    by 0x71B997: OSD::dequeue_delete(OSDShard*, PG*, unsigned int, ThreadPool::TPHandle&) (OSD.cc:9701)
==00:00:02:54.519 510301==    by 0x70E62E: run (OpSchedulerItem.h:148)
==00:00:02:54.519 510301==    by 0x70E62E: OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*) (OSD.cc:10677)
==00:00:02:54.519 510301==    by 0xD3C093: ShardedThreadPool::shardedthreadpool_worker(unsigned int) (WorkQueue.cc:311)
==00:00:02:54.519 510301==    by 0xD3ECF3: ShardedThreadPool::WorkThreadSharded::entry() (WorkQueue.h:706)
==00:00:02:54.519 510301==    by 0xA7E1203: mythread_wrapper (hg_intercepts.c:389)
==00:00:02:54.519 510301==    by 0xC6182DD: start_thread (in /usr/lib64/libpthread-2.28.so)
==00:00:02:54.519 510301== Address 0xf123930 is 3,824 bytes inside a block of size 10,296 alloc'd
==00:00:02:54.519 510301==    at 0xA7DC0C3: operator new[](unsigned long) (vg_replace_malloc.c:433)
==00:00:02:54.519 510301==    by 0x66F766: main (ceph_osd.cc:688)
==00:00:02:54.519 510301== Block was alloc'd by thread #1
```

Actually, there are plenty of similar issues reported, like:

```
==00:00:05:04.903 510301== Possible data race during read of size 8 at 0x1E3E0588 by thread #119
==00:00:05:04.903 510301== Locks held: 1, at address 0x1EAD41D0
==00:00:05:04.903 510301==    at 0x753165: clear (hashtable.h:2051)
==00:00:05:04.903 510301==    by 0x753165: std::_Hashtable, mempool::pool_allocator<(mempool::pool_index_t)15, std::pair >, std::__detail::_Select1st, std::equal_to, std::hash, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits >::~_Hashtable() (hashtable.h:1369)
==00:00:05:04.903 510301==    by 0x75331C: ~unordered_map (unordered_map.h:102)
==00:00:05:04.903 510301==    by 0x75331C: OSDMap::~OSDMap() (OSDMap.h:350)
==00:00:05:04.903 510301==    by 0x753606: operator() (shared_cache.hpp:100)
==00:00:05:04.903 510301==    by 0x753606: std::_Sp_counted_deleter::Cleanup, std::allocator, (__gnu_cxx::_Lock_policy)2>::_M_dispose() (shared_ptr_base.h:471)
==00:00:05:04.903 510301==    by 0x73BB26: _M_release (shared_ptr_base.h:155)
==00:00:05:04.903 510301==    by 0x73BB26: std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() (shared_ptr_base.h:148)
==00:00:05:04.903 510301==    by 0x6B58A9: ~__shared_count (shared_ptr_base.h:728)
==00:00:05:04.903 510301==    by 0x6B58A9: ~__shared_ptr (shared_ptr_base.h:1167)
==00:00:05:04.903 510301==    by 0x6B58A9: ~shared_ptr (shared_ptr.h:103)
==00:00:05:04.903 510301==    by 0x6B58A9: OSD::create_context() (OSD.cc:9053)
==00:00:05:04.903 510301==    by 0x71B570: OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr, ThreadPool::TPHandle&) (OSD.cc:9665)
==00:00:05:04.903 510301==    by 0x71B997: OSD::dequeue_delete(OSDShard*, PG*, unsigned int, ThreadPool::TPHandle&) (OSD.cc:9701)
==00:00:05:04.903 510301==    by 0x70E62E: run (OpSchedulerItem.h:148)
==00:00:05:04.903 510301==    by 0x70E62E: OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*) (OSD.cc:10677)
==00:00:05:04.903 510301==    by 0xD3C093: ShardedThreadPool::shardedthreadpool_worker(unsigned int) (WorkQueue.cc:311)
==00:00:05:04.903 510301==    by 0xD3ECF3: ShardedThreadPool::WorkThreadSharded::entry() (WorkQueue.h:706)
==00:00:05:04.903 510301==    by 0xA7E1203: mythread_wrapper (hg_intercepts.c:389)
==00:00:05:04.903 510301==    by 0xC6182DD: start_thread (in /usr/lib64/libpthread-2.28.so)
==00:00:05:04.903 510301==    by 0xD8B34B2: clone (in /usr/lib64/libc-2.28.so)
==00:00:05:04.903 510301==
==00:00:05:04.903 510301== This conflicts with a previous write of size 8 by thread #90
==00:00:05:04.903 510301== Locks held: 2, at addresses 0xF122A58 0xF1239A8
==00:00:05:04.903 510301==    at 0x7531E1: clear (hashtable.h:2054)
==00:00:05:04.903 510301==    by 0x7531E1: std::_Hashtable, mempool::pool_allocator<(mempool::pool_index_t)15, std::pair >, std::__detail::_Select1st, std::equal_to, std::hash, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits >::~_Hashtable() (hashtable.h:1369)
==00:00:05:04.903 510301==    by 0x75331C: ~unordered_map (unordered_map.h:102)
==00:00:05:04.903 510301==    by 0x75331C: OSDMap::~OSDMap() (OSDMap.h:350)
==00:00:05:04.903 510301==    by 0x753606: operator() (shared_cache.hpp:100)
==00:00:05:04.903 510301==    by 0x753606: std::_Sp_counted_deleter::Cleanup, std::allocator, (__gnu_cxx::_Lock_policy)2>::_M_dispose() (shared_ptr_base.h:471)
==00:00:05:04.903 510301==    by 0x73BB26: _M_release (shared_ptr_base.h:155)
==00:00:05:04.903 510301==    by 0x73BB26: std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() (shared_ptr_base.h:148)
==00:00:05:04.903 510301==    by 0x72191E: operator= (shared_ptr_base.h:747)
==00:00:05:04.903 510301==    by 0x72191E: operator= (shared_ptr_base.h:1078)
==00:00:05:04.903 510301==    by 0x72191E: operator= (shared_ptr.h:103)
==00:00:05:04.903 510301==    by 0x72191E: OSD::_committed_osd_maps(unsigned int, unsigned int, MOSDMap*) (OSD.cc:8116)
==00:00:05:04.903 510301==    by 0x7752CA: C_OnMapCommit::finish(int) (OSD.cc:7678)
==00:00:05:04.903 510301==    by 0x72A06C: Context::complete(int) (Context.h:77)
==00:00:05:04.903 510301==    by 0xD07F14: Finisher::finisher_thread_entry() (Finisher.cc:66)
==00:00:05:04.903 510301== Address 0x1e3e0588 is 872 bytes inside a block of size 1,208 alloc'd
==00:00:05:04.903 510301==    at 0xA7DC0C3: operator new[](unsigned long) (vg_replace_malloc.c:433)
==00:00:05:04.903 510301==    by 0x6C7C0C: OSDService::try_get_map(unsigned int) (OSD.cc:1606)
==00:00:05:04.903 510301==    by 0x7213BD: get_map (OSD.h:699)
==00:00:05:04.903 510301==    by 0x7213BD: get_map (OSD.h:1732)
==00:00:05:04.903 510301==    by 0x7213BD: OSD::_committed_osd_maps(unsigned int, unsigned int, MOSDMap*) (OSD.cc:8076)
==00:00:05:04.903 510301==    by 0x7752CA: C_OnMapCommit::finish(int) (OSD.cc:7678)
==00:00:05:04.903 510301==    by 0x72A06C: Context::complete(int) (Context.h:77)
==00:00:05:04.903 510301==    by 0xD07F14: Finisher::finisher_thread_entry() (Finisher.cc:66)
==00:00:05:04.903 510301==    by 0xA7E1203: mythread_wrapper (hg_intercepts.c:389)
==00:00:05:04.903 510301==    by 0xC6182DD: start_thread (in /usr/lib64/libpthread-2.28.so)
==00:00:05:04.903 510301==    by 0xD8B34B2: clone (in /usr/lib64/libc-2.28.so)
```

Signed-off-by: Radoslaw Zarzynski
---
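The change below funnels every access to the map through a pair of
accessors built on `std::atomic_load()`/`std::atomic_store()` (see the
`OSD.h` hunk at the end of the diff). A minimal sketch of that accessor
pattern, with hypothetical stand-in names (`MapRef`, `Owner::set_map()`/
`get_map()`; the real code uses `OSDMapRef` and `OSD::set_osdmap()`/
`get_osdmap()`):

```
// Sketch of the accessor pattern this patch introduces; all names other
// than the std:: calls are stand-ins, not Ceph identifiers.
#include <memory>

using MapRef = std::shared_ptr<const int>;  // stands in for OSDMapRef

class Owner {
  MapRef _map;  // only ever touched through the two accessors below

public:
  void set_map(MapRef m) {
    // Atomically publish the new map; safe against concurrent get_map().
    std::atomic_store(&_map, std::move(m));
  }
  MapRef get_map() const {
    // Atomically take a snapshot; the returned shared_ptr keeps the old
    // map alive even if set_map() installs a new one right afterwards.
    return std::atomic_load(&_map);
  }
};
```

Call sites then grab one snapshot (`const auto osdmap = get_osdmap();`)
and use it consistently instead of re-reading the member, which is what
most of the hunks below do.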
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 76113d1146869..c117d5c89b90c 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1891,7 +1891,7 @@ void OSDService::clear_sent_ready_to_merge()
   sent_ready_to_merge_source.clear();
 }
 
-void OSDService::prune_sent_ready_to_merge(OSDMapRef& osdmap)
+void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
 {
   std::lock_guard l(merge_lock);
   auto i = sent_ready_to_merge_source.begin();
@@ -2422,7 +2422,7 @@ void OSD::asok_command(
     }
     spg_t pcand;
     PGRef pg;
-    if (osdmap->get_primary_shard(pgid, &pcand) &&
+    if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
         (pg = _lookup_lock_pg(pcand))) {
       if (pg->is_primary()) {
         cmdmap_t new_cmdmap = cmdmap;
@@ -3276,6 +3276,7 @@ float OSD::get_osd_snap_trim_sleep()
 
 int OSD::init()
 {
+  OSDMapRef osdmap;
   CompatSet initial, diff;
   std::lock_guard lock(osd_lock);
   if (is_stopping())
@@ -3394,13 +3395,14 @@ int OSD::init()
   startup_time = ceph::mono_clock::now();
 
   // load up "current" osdmap
-  assert_warn(!osdmap);
-  if (osdmap) {
+  assert_warn(!get_osdmap());
+  if (get_osdmap()) {
     derr << "OSD::init: unable to read current osdmap" << dendl;
     r = -EINVAL;
     goto out;
   }
   osdmap = get_map(superblock.current_epoch);
+  set_osdmap(osdmap);
 
   // make sure we don't have legacy pgs deleting
   {
@@ -4152,9 +4154,9 @@ int OSD::shutdown()
   }
 
   // note unmount epoch
-  dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
+  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
   superblock.mounted = service.get_boot_epoch();
-  superblock.clean_thru = osdmap->get_epoch();
+  superblock.clean_thru = get_osdmap_epoch();
   ObjectStore::Transaction t;
   write_superblock(t);
   int r = store->queue_transaction(service.meta_ch, std::move(t));
@@ -4218,7 +4220,7 @@ int OSD::shutdown()
   osd_lock.unlock();
   {
     std::unique_lock l{map_lock};
-    osdmap = OSDMapRef();
+    set_osdmap(OSDMapRef());
   }
   for (auto s : shards) {
     std::lock_guard l(s->osdmap_lock);
@@ -4694,7 +4696,7 @@ void OSD::load_pgs()
     if (map_epoch > 0) {
       OSDMapRef pgosdmap = service.try_get_map(map_epoch);
       if (!pgosdmap) {
-        if (!osdmap->have_pg_pool(pgid.pool())) {
+        if (!get_osdmap()->have_pg_pool(pgid.pool())) {
          derr << __func__ << ": could not find map for epoch " << map_epoch
               << " on pg " << pgid << ", but the pool is not present in the "
               << "current map, so this is probably a result of bug 10617. "
@@ -4710,7 +4712,7 @@
       }
       pg = _make_pg(pgosdmap, pgid);
     } else {
-      pg = _make_pg(osdmap, pgid);
+      pg = _make_pg(get_osdmap(), pgid);
     }
     if (!pg) {
       recursive_remove_collection(cct, store, pgid, *it);
@@ -4908,7 +4910,7 @@ void OSD::resume_creating_pg()
   while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
     dout(20) << __func__ << " pg " << pg->first << dendl;
     vector acting;
-    osdmap->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
+    get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
     service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
     pg = pending_creates_from_osd.erase(pg);
     do_sub_pg_creates = true;
@@ -4926,7 +4928,7 @@
       do_renew_subs = true;
     }
   }
-  version_t start = osdmap->get_epoch() + 1;
+  version_t start = get_osdmap_epoch() + 1;
   if (have_pending_creates) {
     // don't miss any new osdmap deleting PGs
     if (monc->sub_want("osdmap", start, 0)) {
@@ -4968,7 +4970,7 @@ void OSD::build_initial_pg_history(
     pgid.pgid, &up, &up_primary, &acting, &acting_primary);
 
   ostringstream debug;
-  for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
+  for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
     OSDMapRef osdmap = service.get_map(e);
     int new_up_primary, new_acting_primary;
     vector new_up, new_acting;
@@ -5034,7 +5036,7 @@
   map::iterator i = heartbeat_peers.find(p);
   if (i == heartbeat_peers.end()) {
-    pair cons = service.get_con_osd_hb(p, osdmap->get_epoch());
+    pair cons = service.get_con_osd_hb(p, get_osdmap_epoch());
     if (!cons.first)
       return;
     assert(cons.second);
@@ -5064,7 +5066,7 @@
   } else {
     hi = &i->second;
   }
-  hi->epoch = osdmap->get_epoch();
+  hi->epoch = get_osdmap_epoch();
 }
 
 void OSD::_remove_heartbeat_peer(int n)
@@ -5124,7 +5126,7 @@
     _get_pgs(&pgs);
     for (auto& pg : pgs) {
       pg->with_heartbeat_peers([&](int peer) {
-        if (osdmap->is_up(peer)) {
+        if (get_osdmap()->is_up(peer)) {
           _add_heartbeat_peer(peer);
         }
       });
     }
@@ -5133,10 +5135,10 @@
 
   // include next and previous up osds to ensure we have a fully-connected set
   set want, extras;
-  const int next = osdmap->get_next_up_osd_after(whoami);
+  const int next = get_osdmap()->get_next_up_osd_after(whoami);
   if (next >= 0)
     want.insert(next);
-  int prev = osdmap->get_previous_up_osd_before(whoami);
+  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
   if (prev >= 0 && prev != next)
     want.insert(prev);
@@ -5145,7 +5147,7 @@
   auto min_down = cct->_conf.get_val("mon_osd_min_down_reporters");
   auto subtree = cct->_conf.get_val("mon_osd_reporter_subtree_level");
   auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
-  osdmap->get_random_up_osds_by_subtree(
+  get_osdmap()->get_random_up_osds_by_subtree(
     whoami, subtree, limit, want, &want);
 
   for (set::iterator p = want.begin(); p != want.end(); ++p) {
@@ -5157,13 +5159,13 @@
   // remove down peers; enumerate extras
   map::iterator p = heartbeat_peers.begin();
   while (p != heartbeat_peers.end()) {
-    if (!osdmap->is_up(p->first)) {
+    if (!get_osdmap()->is_up(p->first)) {
       int o = p->first;
       ++p;
       _remove_heartbeat_peer(o);
       continue;
     }
-    if (p->second.epoch < osdmap->get_epoch()) {
+    if (p->second.epoch < get_osdmap_epoch()) {
       extras.insert(p->first);
     }
     ++p;
@@ -5178,7 +5180,7 @@
       extras.insert(n);
       _add_heartbeat_peer(n);
     }
-    n = osdmap->get_next_up_osd_after(n);
+    n = get_osdmap()->get_next_up_osd_after(n);
     if (n == next)
       break;  // came full circle; stop
   }
@@ -5197,7 +5199,7 @@
   // clean up stale failure pending
   for (auto it = failure_pending.begin(); it != failure_pending.end();) {
     if (heartbeat_peers.count(it->first) == 0) {
-      send_still_alive(osdmap->get_epoch(), it->first, it->second.second);
+      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
       failure_pending.erase(it++);
     } else {
       it++;
     }
@@ -5740,7 +5742,7 @@
   if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
     last_mon_heartbeat = now;
     dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
-    osdmap_subscribe(osdmap->get_epoch() + 1, false);
+    osdmap_subscribe(get_osdmap_epoch() + 1, false);
   }
 }
@@ -5815,7 +5817,7 @@
     if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
       last_mon_heartbeat = now;
       dout(1) << __func__ << " checking mon for new map" << dendl;
-      osdmap_subscribe(osdmap->get_epoch() + 1, false);
+      osdmap_subscribe(get_osdmap_epoch() + 1, false);
     }
   }
@@ -6282,7 +6284,7 @@
   }
 
   const auto& monmap = monc->monmap;
-
+  const auto osdmap = get_osdmap();
   // if our map within recent history, try to add ourselves to the osdmap.
   if (osdmap->get_epoch() == 0) {
     derr << "waiting for initial osdmap" << dendl;
   }
@@ -6324,7 +6326,7 @@
          << dendl;
     l.unlock();
     for (auto shard : shards) {
-      shard->wait_min_pg_epoch(osdmap->get_epoch());
+      shard->wait_min_pg_epoch(get_osdmap_epoch());
     }
     l.lock();
   }
@@ -6396,7 +6398,7 @@
   set s;
   OSDMap::calc_state_set(state, s);
   dout(10) << __func__ << " want state " << s << dendl;
-  monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
+  monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
 }
 
 void OSD::start_waiting_for_healthy()
@@ -6406,7 +6408,7 @@
   last_heartbeat_resample = utime_t();
 
   // subscribe to osdmap updates, in case our peers really are known to be dead
-  osdmap_subscribe(osdmap->get_epoch() + 1, false);
+  osdmap_subscribe(get_osdmap_epoch() + 1, false);
 }
 
 bool OSD::_is_healthy()
@@ -6586,7 +6588,7 @@ void OSD::_collect_metadata(map *pm)
 void OSD::queue_want_up_thru(epoch_t want)
 {
   std::shared_lock map_locker{map_lock};
-  epoch_t cur = osdmap->get_up_thru(whoami);
+  epoch_t cur = get_osdmap()->get_up_thru(whoami);
   std::lock_guard report_locker(mon_report_lock);
   if (want > up_thru_wanted) {
     dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
@@ -6604,6 +6606,7 @@
 void OSD::send_alive()
 {
   ceph_assert(ceph_mutex_is_locked(mon_report_lock));
+  const auto osdmap = get_osdmap();
   if (!osdmap->exists(whoami))
     return;
   epoch_t up_thru = osdmap->get_up_thru(whoami);
@@ -6687,6 +6690,7 @@
   ceph_assert(ceph_mutex_is_locked(mon_report_lock));
   std::lock_guard l(heartbeat_lock);
   utime_t now = ceph_clock_now();
+  const auto osdmap = get_osdmap();
   while (!failure_queue.empty()) {
     int osd = failure_queue.begin()->first;
     if (!failure_pending.count(osd)) {
@@ -6719,7 +6723,7 @@
   while (it != failure_pending.end()) {
     dout(10) << __func__ << " canceling in-flight failure report for osd." << it->first << dendl;
-    send_still_alive(osdmap->get_epoch(), it->first, it->second.second);
+    send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
     failure_pending.erase(it++);
   }
 }
@@ -6736,7 +6740,7 @@
   MOSDBeacon* beacon = nullptr;
   {
     std::lock_guard l{min_last_epoch_clean_lock};
-    beacon = new MOSDBeacon(osdmap->get_epoch(),
+    beacon = new MOSDBeacon(get_osdmap_epoch(),
                             min_last_epoch_clean,
                             superblock.last_purged_snaps_scrub);
     beacon->pgs = min_last_epoch_clean_pgs;
@@ -7204,7 +7208,7 @@
   if (m->trace)
     op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
 
   // no map? starting up?
-  if (!osdmap) {
+  if (!get_osdmap()) {
     dout(7) << "no OSDMap, not booted" << dendl;
     logger->inc(l_osd_waiting_for_map);
     waiting_for_osdmap.push_back(op);
@@ -7240,7 +7244,7 @@ void OSD::handle_scrub(MOSDScrub *m)
     vector v;
     for (auto pgid : m->scrub_pgs) {
       spg_t pcand;
-      if (osdmap->get_primary_shard(pgid, &pcand) &&
+      if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
           std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
         v.push_back(pcand);
       }
@@ -7540,11 +7544,11 @@ MPGStats* OSD::collect_pg_stats()
   osd_stat_t cur_stat = service.get_osd_stat();
   cur_stat.os_perf_stat = store->get_cur_stats();
 
-  auto m = new MPGStats(monc->get_fsid(), osdmap->get_epoch());
+  auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
   m->osd_stat = cur_stat;
 
   std::lock_guard lec{min_last_epoch_clean_lock};
-  min_last_epoch_clean = osdmap->get_epoch();
+  min_last_epoch_clean = get_osdmap_epoch();
   min_last_epoch_clean_pgs.clear();
 
   std::set pool_set;
@@ -7639,7 +7643,7 @@ void OSD::wait_for_new_map(OpRequestRef op)
 {
   // ask?
   if (waiting_for_osdmap.empty()) {
-    osdmap_subscribe(osdmap->get_epoch() + 1, false);
+    osdmap_subscribe(get_osdmap_epoch() + 1, false);
   }
 
   logger->inc(l_osd_waiting_for_map);
@@ -7655,7 +7659,7 @@ void OSD::wait_for_new_map(OpRequestRef op)
 void OSD::note_down_osd(int peer)
 {
   ceph_assert(ceph_mutex_is_locked(osd_lock));
-  cluster_messenger->mark_down_addrs(osdmap->get_cluster_addrs(peer));
+  cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
 
   std::lock_guard l{heartbeat_lock};
   failure_queue.erase(peer);
@@ -7756,10 +7760,11 @@ void OSD::handle_osd_map(MOSDMap *m)
       osd_min = min;
     }
   }
+  epoch_t osdmap_epoch = get_osdmap_epoch();
   if (osd_min > 0 &&
-      osdmap->get_epoch() > max_lag &&
-      osdmap->get_epoch() - max_lag > osd_min) {
-    epoch_t need = osdmap->get_epoch() - max_lag;
+      osdmap_epoch > max_lag &&
+      osdmap_epoch - max_lag > osd_min) {
+    epoch_t need = osdmap_epoch - max_lag;
     dout(10) << __func__ << " waiting for pgs to catch up (need " << need
              << " max_lag " << max_lag << ")" << dendl;
     for (auto shard : shards) {
@@ -7908,7 +7913,7 @@
       inc.decode(p);
       if (o->apply_incremental(inc) < 0) {
-        derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
+        derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
         ceph_abort_msg("bad fsid");
       }
@@ -8069,6 +8074,7 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
   bool do_shutdown = false;
   bool do_restart = false;
   bool network_error = false;
+  OSDMapRef osdmap;
 
   // advance through the new maps
   for (epoch_t cur = first; cur <= last; cur++) {
@@ -8086,6 +8092,7 @@
       // kill connections to newly down osds
       bool waited_for_reservations = false;
       set old;
+      osdmap = get_osdmap();
       osdmap->get_all_osds(old);
       for (set::iterator p = old.begin(); p != old.end(); ++p) {
         if (*p != whoami &&
@@ -8117,7 +8124,8 @@
       }
     }
 
-    osdmap = newmap;
+    osdmap = std::move(newmap);
+    set_osdmap(osdmap);
     epoch_t up_epoch;
     epoch_t boot_epoch;
     service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
@@ -8314,6 +8322,7 @@
   // current memory location, and setting or clearing bits in integer
   // fields, and we are the only writer, this is not a problem.
 
+  const auto osdmap = get_osdmap();
   {
     Messenger::Policy p = client_messenger->get_default_policy();
     uint64_t mask;
@@ -8632,6 +8641,7 @@ bool OSD::advance_pg(
 void OSD::consume_map()
 {
   ceph_assert(ceph_mutex_is_locked(osd_lock));
+  auto osdmap = get_osdmap();
   dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
 
   /** make sure the cluster is speaking in SORTBITWISE, because we don't
@@ -8749,6 +8759,7 @@ void OSD::consume_map()
 void OSD::activate_map()
 {
   ceph_assert(ceph_mutex_is_locked(osd_lock));
+  auto osdmap = get_osdmap();
 
   dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
@@ -8821,7 +8832,7 @@ bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
   return true;
 }
 
-bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
+bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
                                      bool is_fast_dispatch)
 {
   int from = m->get_source().num();
@@ -8859,6 +8870,7 @@ bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
                                     bool is_fast_dispatch)
 {
   const Message *m = op->get_req();
+  const auto osdmap = get_osdmap();
   dout(15) << "require_same_or_newer_map " << epoch
            << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
@@ -8968,6 +8980,7 @@ void OSD::handle_pg_create(OpRequestRef op)
 
   op->mark_started();
 
+  const auto osdmap = get_osdmap();
   map::const_iterator ci = m->ctimes.begin();
   for (map::const_iterator p = m->mkpg.begin();
        p != m->mkpg.end();
@@ -10042,7 +10055,7 @@ epoch_t OSDShard::get_max_waiting_epoch()
 }
 
 void OSDShard::consume_map(
-  OSDMapRef& new_osdmap,
+  const OSDMapRef& new_osdmap,
   unsigned *pushes_to_free)
 {
   std::lock_guard l(shard_lock);
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index d0d96f52e37bd..8625bd899ee72 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -580,7 +580,7 @@ public:
   void send_ready_to_merge();
   void _send_ready_to_merge();
   void clear_sent_ready_to_merge();
-  void prune_sent_ready_to_merge(OSDMapRef& osdmap);
+  void prune_sent_ready_to_merge(const OSDMapRef& osdmap);
 
   // -- pg_temp --
 private:
@@ -1055,7 +1055,7 @@ struct OSDShard {
 
   /// push osdmap into shard
   void consume_map(
-    OSDMapRef& osdmap,
+    const OSDMapRef& osdmap,
     unsigned *pushes_to_free);
 
   void _wake_pg_slot(spg_t pgid, OSDShardPGSlot *slot);
@@ -1338,7 +1338,7 @@ private:
          i != sessions_to_check.end();
         sessions_to_check.erase(i++)) {
       std::lock_guard l{(*i)->session_dispatch_lock};
-      dispatch_session_waiting(*i, osdmap);
+      dispatch_session_waiting(*i, get_osdmap());
     }
   }
   void session_handle_reset(const ceph::ref_t& session) {
@@ -1695,11 +1695,17 @@ protected:
 
 protected:
   // -- osd map --
-  OSDMapRef       osdmap;
-  OSDMapRef get_osdmap() {
-    return osdmap;
+  // TODO: switch to std::atomic when C++20 will be available.
+  OSDMapRef _osdmap;
+  void set_osdmap(OSDMapRef osdmap) {
+    std::atomic_store(&_osdmap, osdmap);
+  }
+  OSDMapRef get_osdmap() const {
+    return std::atomic_load(&_osdmap);
   }
   epoch_t get_osdmap_epoch() const {
+    // XXX: performance?
+    auto osdmap = get_osdmap();
     return osdmap ? osdmap->get_epoch() : 0;
   }
 
@@ -1899,7 +1905,7 @@ protected:
    * address as in the given map.
    * @pre op was sent by an OSD using the cluster messenger
    */
-  bool require_same_peer_instance(const Message *m, OSDMapRef& map,
+  bool require_same_peer_instance(const Message *m, const OSDMapRef& map,
                                   bool is_fast_dispatch);
 
   bool require_same_or_newer_map(OpRequestRef& op, epoch_t e,
                                  bool is_fast_dispatch);
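For reference, the TODO in the hunk above points at C++20, where the
free-function pair can be replaced by the `std::atomic` specialization
for `shared_ptr`. A sketch, assuming a C++20 toolchain (stand-in element
type and names, not `OSDMap`/Ceph identifiers):

```
// C++20 sketch of what the TODO suggests: std::atomic<std::shared_ptr<T>>
// gives atomic load/store on the pointer without the easy-to-miss
// std::atomic_load()/std::atomic_store() free functions.
#include <atomic>
#include <memory>

std::atomic<std::shared_ptr<const int>> map_holder;

void publish(std::shared_ptr<const int> next) {
  map_holder.store(std::move(next));  // cf. set_osdmap()
}

std::shared_ptr<const int> snapshot() {
  return map_holder.load();           // cf. get_osdmap()
}
```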