osd: Modify OSD Fast-Shutdown to work safely

author Gabriel BenHanokh <benhanokh@gmail.com>

Mon, 7 Mar 2022 15:16:54 +0000 (17:16 +0200)

committer Gabriel BenHanokh <benhanokh@gmail.com>

Mon, 7 Mar 2022 15:40:12 +0000 (17:40 +0200)
author Gabriel BenHanokh <benhanokh@gmail.com>
Mon, 7 Mar 2022 15:16:54 +0000 (17:16 +0200)
committer Gabriel BenHanokh <benhanokh@gmail.com>
Mon, 7 Mar 2022 15:40:12 +0000 (17:40 +0200)
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in

index 6d8d0736ffa92010cdbad87b122289476286a2a5..ea19ddcc4d0c2e3771da13e756de189152cd4aaa 100644 (file)
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -3266,6 +3266,13 @@ options:
      slow shutdown is primarilyy useful for doing memory leak checking with valgrind.
    default: true
    with_legacy: true
+- name: osd_fast_shutdown_timeout
+  type: int
+  level: advanced
+  desc: timeout in seconds for osd fast-shutdown (0 is unlimited)
+  default: 15
+  with_legacy: true
+  min: 0
  - name: osd_fast_shutdown_notify_mon
    type: bool
    level: advanced
@@ -4937,6 +4944,12 @@ options:
      This setting is used only when OSD is doing ``--mkfs``.
      Next runs of OSD retrieve sharding from disk.
    default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P
+- name: bluestore_qfsck_on_mount
+  type: bool
+  level: dev
+  desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state
+  default: true
+  with_legacy: true
  - name: bluestore_fsck_on_mount
    type: bool
    level: dev
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h

index d934d092919a27fb423221e7604b88c8f7a16369..44d67c26e88f9a48b58434eee2d82eabbd472aee 100644 (file)
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -288,7 +288,8 @@ public:
    virtual bool needs_journal() = 0;  //< requires a journal
    virtual bool wants_journal() = 0;  //< prefers a journal
    virtual bool allows_journal() = 0; //< allows a journal
-
+  virtual void prepare_for_fast_shutdown() {}
+  virtual bool has_null_manager() { return false; }
    // return store min allocation size, if applicable
    virtual uint64_t get_min_alloc_size() const {
      return 0;
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc

index d1a0fe4897cc887fd61d1f14bd51799890998ddb..d9fddacd2571247a41c8b768bcb3bbd00f99b0ad 100644 (file)
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -7565,9 +7565,16 @@ void BlueStore::set_cache_shards(unsigned num)
    }
  }
  
+// Returns true only when a freelist manager (fm) exists and it reports is_null_manager().
+bool BlueStore::has_null_manager()
+{
+  return (fm && fm->is_null_manager());
+}
+
  int BlueStore::_mount()
  {
    dout(5) << __func__ << "NCB:: path " << path << dendl;
+
    _kv_only = false;
    if (cct->_conf->bluestore_fsck_on_mount) {
      dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
@@ -7681,12 +7688,15 @@ int BlueStore::umount()
  #endif
      dout(20) << __func__ << " stopping kv thread" << dendl;
      _kv_stop();
-    _shutdown_cache();
+    // skip cache cleanup step on fast shutdown
+    if (likely(!m_fast_shutdown)) {
+      _shutdown_cache();
+    }
      dout(20) << __func__ << " closing" << dendl;
    }
-
    _close_db_and_around();
-  if (cct->_conf->bluestore_fsck_on_umount) {
+  // disable fsck on fast-shutdown
+  if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) {
      int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
      if (rc < 0)
        return rc;
@@ -10305,6 +10315,11 @@ int BlueStore::get_numa_node(
    return 0;
  }
  
+void BlueStore::prepare_for_fast_shutdown()
+{
+  m_fast_shutdown = true;  // umount() checks this flag to skip cache cleanup and fsck
+}
+
  int BlueStore::get_devices(set<string> *ls)
  {
    if (bdev) {
@@ -10432,7 +10447,8 @@ int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
    string key_prefix;
    _key_encode_u64(pool_id, &key_prefix);
    *out_per_pool_omap = per_pool_omap != OMAP_BULK;
-  if (*out_per_pool_omap) {
+  // stop calls after db was closed
+  if (*out_per_pool_omap && db) {
      auto prefix = per_pool_omap == OMAP_PER_POOL ?
        PREFIX_PERPOOL_OMAP :
        PREFIX_PERPG_OMAP;
@@ -19025,15 +19041,6 @@ int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t
      return 0;
    } else {
      derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
-    std::cout << "==================================================================="  << std::endl;
-    for (uint64_t i = 0; i < idx1; i++) {
-      std::cout << "arr1[" << i << "]<" << arr1[i].offset << "," << arr1[i].length << "> " << std::endl;
-    }
-
-    std::cout << "==================================================================="  << std::endl;
-    for (uint64_t i = 0; i < idx2; i++) {
-      std::cout << "arr2[" << i << "]<" << arr2[i].offset << "," << arr2[i].length << "> " << std::endl;
-    }
      return -1;
    }
  }
@@ -19081,9 +19088,9 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
    utime_t            start = ceph_clock_now();
  
    auto shutdown_cache = make_scope_guard([&] {
-    std::cout << "Allocation Recovery was completed in " << duration
-             << " seconds; insert_count=" << stats.insert_count
-             << "; extent_count=" << stats.extent_count << std::endl;
+    dout(1) << "Allocation Recovery was completed in " << duration
+           << " seconds; insert_count=" << stats.insert_count
+           << "; extent_count=" << stats.extent_count << dendl;
      _shutdown_cache();
      _close_db_and_around();
    });
@@ -19113,14 +19120,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
      };
      allocator->dump(count_entries);
      ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
-    if (ret != 0) {
+    if (ret == 0) {
        dout(5) << "Allocator drive - file integrity check OK" << dendl;
      } else {
        derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
      }
    }
  
-  std::cout << stats << std::endl;
+  dout(1) << stats << dendl;
    return ret;
  }
  
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h

index 72cfc2d076b79cbf1df8a30815e364333e051ffc..0f804595ebb37cbc96551a3f5ed008781c60baa3 100644 (file)
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -2764,7 +2764,7 @@ public:
  
  private:
    int32_t ondisk_format = 0;  ///< value detected on mount
-
+  bool    m_fast_shutdown = false;
    int _upgrade_super();  ///< upgrade (called during open_super)
    uint64_t _get_ondisk_reserved() const;
    void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
@@ -2783,6 +2783,9 @@ public:
    bool wants_journal() override { return false; };
    bool allows_journal() override { return false; };
  
+  void prepare_for_fast_shutdown() override;
+  bool has_null_manager() override;  ///< true when fm is set and fm->is_null_manager()
+
    uint64_t get_min_alloc_size() const override {
      return min_alloc_size;
    }
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index b7a68225e0602c54b3d58a4ffea59a248443617b..69b18ba03f34dac3ce404b24d9dd0fa285f2c611 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -4258,27 +4258,44 @@ PerfCounters* OSD::create_recoverystate_perf()
  
  int OSD::shutdown()
  {
+  // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
+  //cct->_conf->osd_fast_shutdown = true;
+
+  dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
+         << cct->_conf->osd_fast_shutdown
+         << ", null-fm = " << store->has_null_manager() << dendl;
+
+  utime_t  start_time_func = ceph_clock_now();
+
    if (cct->_conf->osd_fast_shutdown) {
      derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
      if (cct->_conf->osd_fast_shutdown_notify_mon)
        service.prepare_to_stop();
-    cct->_log->flush();
-    _exit(0);
-  }
  
-  if (!service.prepare_to_stop())
+    // Without a NULL-FM there is no state we need to keep -> exit immediately
+    if (!store->has_null_manager()) {
+      cct->_log->flush();
+      _exit(0);
+    }
+  } else if (!service.prepare_to_stop()) {
      return 0; // already shutting down
+  }
+
    osd_lock.lock();
    if (is_stopping()) {
      osd_lock.unlock();
      return 0;
    }
-  dout(0) << "shutdown" << dendl;
  
+  if (!cct->_conf->osd_fast_shutdown) {
+    dout(0) << "shutdown" << dendl;
+  }
+
+  // don't accept new tasks for this OSD
    set_state(STATE_STOPPING);
  
-  // Debugging
-  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
+  // Disabled debugging during fast-shutdown
+  if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
      cct->_conf.set_val("debug_osd", "100");
      cct->_conf.set_val("debug_journal", "100");
      cct->_conf.set_val("debug_filestore", "100");
@@ -4287,6 +4304,45 @@ int OSD::shutdown()
      cct->_conf.apply_changes(nullptr);
    }
  
+  if (cct->_conf->osd_fast_shutdown) {
+    // first, stop new tasks from being taken from op_shardedwq
+    // and clear all pending tasks
+    op_shardedwq.stop_for_fast_shutdown();
+
+    utime_t  start_time_timer = ceph_clock_now();
+    tick_timer.shutdown();
+    {
+      std::lock_guard l(tick_timer_lock);
+      tick_timer_without_osd_lock.shutdown();
+    }
+
+    osd_lock.unlock();
+    utime_t  start_time_osd_drain = ceph_clock_now();
+
+    // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
+    osd_op_tp.drain();
+    osd_op_tp.stop();
+
+    utime_t  start_time_umount = ceph_clock_now();
+    store->prepare_for_fast_shutdown();
+    std::lock_guard lock(osd_lock);
+    // TBD: assert in allocator that nothing is being added
+    store->umount();
+
+    utime_t end_time = ceph_clock_now();
+    if (cct->_conf->osd_fast_shutdown_timeout) {
+      ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
+    }
+    dout(0) <<"Fast Shutdown duration total     :" << end_time              - start_time_func       << " seconds" << dendl;
+    dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount     - start_time_osd_drain  << " seconds" << dendl;
+    dout(0) <<"Fast Shutdown duration umount    :" << end_time              - start_time_umount     << " seconds" << dendl;
+    dout(0) <<"Fast Shutdown duration timer     :" << start_time_osd_drain  - start_time_timer      << " seconds" << dendl;
+    cct->_log->flush();
+
+    // now it is safe to exit
+    _exit(0);
+  }
+
    // stop MgrClient earlier as it's more like an internal consumer of OSD
    mgrc.shutdown();
  
@@ -4448,6 +4504,9 @@ int OSD::shutdown()
    hb_front_server_messenger->shutdown();
    hb_back_server_messenger->shutdown();
  
+  utime_t duration = ceph_clock_now() - start_time_func;
+  dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;
+
    tracing::osd::tracer.shutdown();
  
    return r;
@@ -11072,6 +11131,11 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
  }
  
  void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
+  if (unlikely(m_fast_shutdown) ) {
+    // stop enqueuing when we are in the middle of a fast shutdown
+    return;
+  }
+
    uint32_t shard_index =
      item.get_ordering_token().hash_to_shard(osd->shards.size());
  
@@ -11102,6 +11166,11 @@ void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
  
  void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
  {
+  if (unlikely(m_fast_shutdown) ) {
+    // stop enqueuing when we are in the middle of a fast shutdown
+    return;
+  }
+
    auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
    auto& sdata = osd->shards[shard_index];
    ceph_assert(sdata);
@@ -11128,6 +11197,24 @@ void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
    sdata->sdata_cond.notify_one();
  }
  
+// Flag the queue as shutting down and discard every pending work item.
+void OSD::ShardedOpWQ::stop_for_fast_shutdown()
+{
+  // set first so _enqueue()/_enqueue_front() reject new items while we drain
+  m_fast_shutdown = true;
+
+  for (uint32_t shard_index = 0; shard_index < osd->num_shards; shard_index++) {
+    auto& sdata = osd->shards[shard_index];
+    ceph_assert(sdata);
+    // RAII guard: shard_lock is released at the end of each iteration
+    std::lock_guard l{sdata->shard_lock};
+    while (!sdata->scheduler->empty()) {
+      // dequeued items are intentionally dropped, not processed
+      sdata->scheduler->dequeue();
+    }
+  }
+}
+
  namespace ceph::osd_cmds {
  
  int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
diff --git a/src/osd/OSD.h b/src/osd/OSD.h

index 30d0b0b4aef07a78d284fec2a782fc074bdca1c0..2da5de10aa69bb426dff0287109d404b75c3fcde 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1592,7 +1592,7 @@ protected:
      : public ShardedThreadPool::ShardedWQ<OpSchedulerItem>
    {
      OSD *osd;
-
+    bool m_fast_shutdown = false;
    public:
      ShardedOpWQ(OSD *o,
                 ceph::timespan ti,
@@ -1610,6 +1610,8 @@ protected:
      /// try to do some work
      void _process(uint32_t thread_index, ceph::heartbeat_handle_d *hb) override;
  
+    void stop_for_fast_shutdown();
+
      /// enqueue a new item
      void _enqueue(OpSchedulerItem&& item) override;
author	Gabriel BenHanokh <benhanokh@gmail.com>
	Mon, 7 Mar 2022 15:16:54 +0000 (17:16 +0200)
committer	Gabriel BenHanokh <benhanokh@gmail.com>
	Mon, 7 Mar 2022 15:40:12 +0000 (17:40 +0200)
src/common/options/global.yaml.in		patch \| blob \| history
src/os/ObjectStore.h		patch \| blob \| history
src/os/bluestore/BlueStore.cc		patch \| blob \| history
src/os/bluestore/BlueStore.h		patch \| blob \| history
src/osd/OSD.cc		patch \| blob \| history
src/osd/OSD.h		patch \| blob \| history