From: Gabriel BenHanokh
Date: Mon, 7 Mar 2022 15:16:54 +0000 (+0200)
Subject: osd: Modify OSD Fast-Shutdown to work safely
X-Git-Tag: v18.0.0~1271^2~2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=9b2a64a5f6ea743b2a4f4c2dbd703248d88b2a96;p=ceph.git

osd: Modify OSD Fast-Shutdown to work safely

Quiesce all activities and destage allocations to disk before killing the OSD.

1) keep the old (unsafe) fast-shutdown when we are not using NCB (non null-manager())
2) skip service.prepare_to_stop() which can take as much as 10 seconds
3) skip debug options in fast-shutdown
4) set_state(STATE_STOPPING) which will stop accepting new tasks to this OSD
5) clear op_shardedwq queues; this is safe since we haven't started processing them
6) stop timer
7) drain osd_op_tp (no new items will be added)
8) now we can safely call umount, which will close_db/bluefs and destage allocations to disk
9) skip _shutdown_cache() when we are in the middle of a fast-shutdown
10) increase debug level on fast-shutdown
11) add option for bluestore_qfsck_on_mount to force scan on mount for all tests
12) disable fsck-on-umount when running fast-shutdown
13) add an option to increase debug level at fast-shutdown umount()
14) set a time limit to fast-shutdown
15) Bug-fix: BlueStore::pool_statfs must not access db after it was removed
16) Fix error message for qfsck (error was caused by PR https://github.com/ceph/ceph/pull/44563)
17) make shutdown-timeout configurable

Fixes: https://tracker.ceph.com/issues/53266
Signed-off-by: Gabriel Benhanokh
---
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 6d8d0736ffa92..ea19ddcc4d0c2 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -3266,6 +3266,13 @@ options: slow shutdown is primarilyy useful for doing memory leak checking with valgrind. default: true with_legacy: true +- name: osd_fast_shutdown_timeout + type: int + level: advanced + desc: timeout in seconds for osd fast-shutdown (0 is unlimited) + default: 15 + with_legacy: true + min: 0 - name: osd_fast_shutdown_notify_mon type: bool level: advanced @@ -4937,6 +4944,12 @@ options: This setting is used only when OSD is doing ``--mkfs``. Next runs of OSD retrieve sharding from disk.
default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P +- name: bluestore_qfsck_on_mount + type: bool + level: dev + desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state + default: true + with_legacy: true - name: bluestore_fsck_on_mount type: bool level: dev diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index d934d092919a2..44d67c26e88f9 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -288,7 +288,8 @@ public: virtual bool needs_journal() = 0; //< requires a journal virtual bool wants_journal() = 0; //< prefers a journal virtual bool allows_journal() = 0; //< allows a journal - + virtual void prepare_for_fast_shutdown() {} + virtual bool has_null_manager() { return false; } // return store min allocation size, if applicable virtual uint64_t get_min_alloc_size() const { return 0; diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index d1a0fe4897cc8..d9fddacd25712 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7565,9 +7565,16 @@ void BlueStore::set_cache_shards(unsigned num) } } +//--------------------------------------------- +bool BlueStore::has_null_manager() +{ + return (fm && fm->is_null_manager()); +} + int BlueStore::_mount() { dout(5) << __func__ << "NCB:: path " << path << dendl; + _kv_only = false; if (cct->_conf->bluestore_fsck_on_mount) { dout(5) << __func__ << "::NCB::calling fsck()" << dendl; @@ -7681,12 +7688,15 @@ int BlueStore::umount() #endif dout(20) << __func__ << " stopping kv thread" << dendl; _kv_stop(); - _shutdown_cache(); + // skip cache cleanup step on fast shutdown + if (likely(!m_fast_shutdown)) { + _shutdown_cache(); + } dout(20) << __func__ << " closing" << dendl; } - _close_db_and_around(); - if (cct->_conf->bluestore_fsck_on_umount) { + // disable fsck on fast-shutdown + if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) { int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep); if (rc < 0) return rc; @@ -10305,6 +10315,11 @@ int BlueStore::get_numa_node( return 0; } +void BlueStore::prepare_for_fast_shutdown() +{ + m_fast_shutdown = true; +} + int BlueStore::get_devices(set *ls) { if (bdev) { @@ -10432,7 +10447,8 @@ int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf, string key_prefix; _key_encode_u64(pool_id, &key_prefix); *out_per_pool_omap = per_pool_omap != OMAP_BULK; - if (*out_per_pool_omap) { + // stop calls after db was closed + if (*out_per_pool_omap && db) { auto prefix = per_pool_omap == OMAP_PER_POOL ? 
PREFIX_PERPOOL_OMAP : PREFIX_PERPG_OMAP; @@ -19025,15 +19041,6 @@ int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t return 0; } else { derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl; - std::cout << "===================================================================" << std::endl; - for (uint64_t i = 0; i < idx1; i++) { - std::cout << "arr1[" << i << "]<" << arr1[i].offset << "," << arr1[i].length << "> " << std::endl; - } - - std::cout << "===================================================================" << std::endl; - for (uint64_t i = 0; i < idx2; i++) { - std::cout << "arr2[" << i << "]<" << arr2[i].offset << "," << arr2[i].length << "> " << std::endl; - } return -1; } } @@ -19081,9 +19088,9 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() utime_t start = ceph_clock_now(); auto shutdown_cache = make_scope_guard([&] { - std::cout << "Allocation Recovery was completed in " << duration - << " seconds; insert_count=" << stats.insert_count - << "; extent_count=" << stats.extent_count << std::endl; + dout(1) << "Allocation Recovery was completed in " << duration + << " seconds; insert_count=" << stats.insert_count + << "; extent_count=" << stats.extent_count << dendl; _shutdown_cache(); _close_db_and_around(); }); @@ -19113,14 +19120,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() }; allocator->dump(count_entries); ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target); - if (ret != 0) { + if (ret == 0) { dout(5) << "Allocator drive - file integrity check OK" << dendl; } else { derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl; } } - std::cout << stats << std::endl; + dout(1) << stats << dendl; return ret; } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 72cfc2d076b79..0f804595ebb37 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2764,7 +2764,7 @@ public: private: int32_t ondisk_format = 0; ///< value detected on mount - + bool m_fast_shutdown = false; int _upgrade_super(); ///< upgrade (called during open_super) uint64_t _get_ondisk_reserved() const; void _prepare_ondisk_format_super(KeyValueDB::Transaction& t); @@ -2783,6 +2783,9 @@ public: bool wants_journal() override { return false; }; bool allows_journal() override { return false; }; + void prepare_for_fast_shutdown() override; + virtual bool has_null_manager(); + uint64_t get_min_alloc_size() const override { return min_alloc_size; } diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index b7a68225e0602..69b18ba03f34d 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4258,27 +4258,44 @@ PerfCounters* OSD::create_recoverystate_perf() int OSD::shutdown() { + // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here! 
+ //cct->_conf->osd_fast_shutdown = true; + + dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = " + << cct->_conf->osd_fast_shutdown + << ", null-fm = " << store->has_null_manager() << dendl; + + utime_t start_time_func = ceph_clock_now(); + if (cct->_conf->osd_fast_shutdown) { derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl; if (cct->_conf->osd_fast_shutdown_notify_mon) service.prepare_to_stop(); - cct->_log->flush(); - _exit(0); - } - if (!service.prepare_to_stop()) + // There is no state we need to keep when running in NULL-FM mode + if (!store->has_null_manager()) { + cct->_log->flush(); + _exit(0); + } + } else if (!service.prepare_to_stop()) { return 0; // already shutting down + } + osd_lock.lock(); if (is_stopping()) { osd_lock.unlock(); return 0; } - dout(0) << "shutdown" << dendl; + if (!cct->_conf->osd_fast_shutdown) { + dout(0) << "shutdown" << dendl; + } + + // don't accept new tasks for this OSD set_state(STATE_STOPPING); - // Debugging - if (cct->_conf.get_val("osd_debug_shutdown")) { + // Disable debugging during fast-shutdown + if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val("osd_debug_shutdown")) { cct->_conf.set_val("debug_osd", "100"); cct->_conf.set_val("debug_journal", "100"); cct->_conf.set_val("debug_filestore", "100"); @@ -4287,6 +4304,45 @@ int OSD::shutdown() cct->_conf.apply_changes(nullptr); } + if (cct->_conf->osd_fast_shutdown) { + // first, stop new tasks from being taken from op_shardedwq + // and clear all pending tasks + op_shardedwq.stop_for_fast_shutdown(); + + utime_t start_time_timer = ceph_clock_now(); + tick_timer.shutdown(); + { + std::lock_guard l(tick_timer_lock); + tick_timer_without_osd_lock.shutdown(); + } + + osd_lock.unlock(); + utime_t start_time_osd_drain = ceph_clock_now(); + + // then, wait on osd_op_tp to drain (TBD: should probably add a timeout) + osd_op_tp.drain(); + osd_op_tp.stop(); + + utime_t start_time_umount = ceph_clock_now(); + store->prepare_for_fast_shutdown(); + std::lock_guard lock(osd_lock); + // TBD: assert in allocator that nothing is being added + store->umount(); + + utime_t end_time = ceph_clock_now(); + if (cct->_conf->osd_fast_shutdown_timeout) { + ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout); + } + dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl; + dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl; + dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl; + dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl; + cct->_log->flush(); + + // now it is safe to exit + _exit(0); + } + // stop MgrClient earlier as it's more like an internal consumer of OSD mgrc.shutdown(); @@ -4448,6 +4504,9 @@ int OSD::shutdown() hb_front_server_messenger->shutdown(); hb_back_server_messenger->shutdown(); + utime_t duration = ceph_clock_now() - start_time_func; + dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl; + tracing::osd::tracer.shutdown(); return r; @@ -11072,6 +11131,11 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb) } void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) { + if (unlikely(m_fast_shutdown) ) { + // stop enqueueing when we are in the middle of a fast shutdown + return; + } + uint32_t shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size()); @@ -11102,6
+11166,11 @@ void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) { void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item) { + if (unlikely(m_fast_shutdown) ) { + // stop enqueueing when we are in the middle of a fast shutdown + return; + } + auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size()); auto& sdata = osd->shards[shard_index]; ceph_assert(sdata); @@ -11128,6 +11197,24 @@ void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item) sdata->sdata_cond.notify_one(); } +void OSD::ShardedOpWQ::stop_for_fast_shutdown() +{ + uint32_t shard_index = 0; + m_fast_shutdown = true; + + for (; shard_index < osd->num_shards; shard_index++) { + auto& sdata = osd->shards[shard_index]; + ceph_assert(sdata); + sdata->shard_lock.lock(); + int work_count = 0; + while(! sdata->scheduler->empty() ) { + auto work_item = sdata->scheduler->dequeue(); + work_count++; + } + sdata->shard_lock.unlock(); + } +} + namespace ceph::osd_cmds { int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 30d0b0b4aef07..2da5de10aa69b 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1592,7 +1592,7 @@ protected: : public ShardedThreadPool::ShardedWQ { OSD *osd; - + bool m_fast_shutdown = false; public: ShardedOpWQ(OSD *o, ceph::timespan ti, @@ -1610,6 +1610,8 @@ protected: /// try to do some work void _process(uint32_t thread_index, ceph::heartbeat_handle_d *hb) override; + void stop_for_fast_shutdown(); + /// enqueue a new item void _enqueue(OpSchedulerItem&& item) override;
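
The ordering the patch relies on (flag the work queue so nothing new is accepted and pending items are dropped, drain the worker threads, destage allocator state to disk, and keep the whole thing inside a time budget) can be illustrated outside of Ceph. Below is a minimal, self-contained C++ sketch of that sequence; it is not Ceph code, and WorkQueue, persist_allocations() and the 15-second budget are illustrative stand-ins for ShardedOpWQ, BlueStore::umount() and osd_fast_shutdown_timeout.

// Minimal standalone sketch of the fast-shutdown ordering (illustrative only, not Ceph code).
// Sequence: 1) flag the queue so nothing new is accepted and pending items are dropped,
//           2) drain and join the worker threads,
//           3) destage state to disk,
//           4) check the whole shutdown fit inside the time budget.
#include <atomic>
#include <cassert>
#include <chrono>
#include <deque>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

struct WorkQueue {                          // stand-in for OSD::ShardedOpWQ
  std::mutex lock;
  std::deque<int> items;
  std::atomic<bool> fast_shutdown{false};

  void enqueue(int item) {
    if (fast_shutdown.load()) {
      return;                               // mirrors _enqueue() bailing out once m_fast_shutdown is set
    }
    std::lock_guard<std::mutex> l(lock);
    items.push_back(item);
  }
  bool dequeue(int *out) {
    std::lock_guard<std::mutex> l(lock);
    if (items.empty()) {
      return false;
    }
    *out = items.front();
    items.pop_front();
    return true;
  }
  void stop_for_fast_shutdown() {           // mirrors stop_for_fast_shutdown(): set flag, clear queues
    fast_shutdown = true;
    std::lock_guard<std::mutex> l(lock);
    items.clear();
  }
};

void persist_allocations() {
  // stand-in for store->umount() destaging the allocation map; here we only simulate the delay
  std::this_thread::sleep_for(std::chrono::milliseconds(50));
}

int main() {
  using clock = std::chrono::steady_clock;
  const auto kShutdownTimeout = std::chrono::seconds(15);   // cf. osd_fast_shutdown_timeout default

  WorkQueue wq;
  std::atomic<bool> stop{false};
  std::vector<std::thread> workers;                         // stand-in for osd_op_tp
  for (int i = 0; i < 4; i++) {
    workers.emplace_back([&] {
      int item = 0;
      while (!stop.load()) {
        if (!wq.dequeue(&item)) {
          std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }
      }
    });
  }
  for (int i = 0; i < 100; i++) {
    wq.enqueue(i);
  }

  auto start = clock::now();
  wq.stop_for_fast_shutdown();   // 1) no new work is accepted, queued work is dropped
  stop = true;                   // 2) drain: workers finish their current item and exit
  for (auto &t : workers) {
    t.join();
  }
  persist_allocations();         // 3) only now is it safe to write the final allocation state
  auto elapsed = clock::now() - start;
  assert(elapsed < kShutdownTimeout);   // 4) cf. the ceph_assert on shutdown duration in the patch
  std::cout << "fast shutdown took "
            << std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count()
            << " ms" << std::endl;
  return 0;
}

The order is the point: persisting before the drain would let in-flight work mutate state that has already been written, which is what calling store->prepare_for_fast_shutdown() and store->umount() only after osd_op_tp.drain() avoids in the patch above.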