git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
os/bluestore/KernelDevice: async discard has been implemented.
author Taeksang Kim <voidbag@gmail.com>
Wed, 31 May 2017 10:52:20 +0000 (19:52 +0900)
committer Taeksang Kim <voidbag@gmail.com>
Tue, 20 Feb 2018 15:00:52 +0000 (00:00 +0900)
Signed-off-by: Taeksang Kim <voidbag@gmail.com>
src/common/legacy_config_opts.h
src/common/options.cc
src/os/bluestore/BlockDevice.cc
src/os/bluestore/BlockDevice.h
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
src/os/bluestore/KernelDevice.cc
src/os/bluestore/KernelDevice.h

index 298721ef83bdec318dfcd9bcf3f5ccdd157525c7..80efe5a73bb0c698c95ac26989e727bb051f45d2 100644 (file)
@@ -938,6 +938,8 @@ OPTION(bdev_debug_aio_suicide_timeout, OPT_FLOAT)
 // NVMe driver is loaded while osd is running.
 OPTION(bdev_nvme_unbind_from_kernel, OPT_BOOL)
 OPTION(bdev_nvme_retry_count, OPT_INT) // -1 means by default which is 4
+OPTION(bdev_enable_discard, OPT_BOOL)
+OPTION(bdev_async_discard, OPT_BOOL)
 
 OPTION(objectstore_blackhole, OPT_BOOL)
 
index 628fc57269f6b1aca5ae8695fddae567683cfa26..64848ad53567a3a7595f938c74a987b6c664c270 100644 (file)
@@ -3394,6 +3394,14 @@ std::vector<Option> get_global_options() {
     .set_default(-1)
     .set_description(""),
 
+    Option("bdev_enable_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bdev_async_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
     Option("bluefs_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(1_M)
     .set_description(""),
index de8e00375bd75601c55bd033631d867694c416b2..3f91651990aec99e4b7198c4ce4a17b1f599187d 100644 (file)
@@ -84,7 +84,7 @@ void IOContext::release_running_aios()
 }
 
 BlockDevice *BlockDevice::create(CephContext* cct, const string& path,
-                                aio_callback_t cb, void *cbpriv)
+                                aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv)
 {
   string type = "kernel";
   char buf[PATH_MAX + 1];
@@ -117,7 +117,7 @@ BlockDevice *BlockDevice::create(CephContext* cct, const string& path,
 #endif
 #if defined(HAVE_LIBAIO)
   if (type == "kernel") {
-    return new KernelDevice(cct, cb, cbpriv);
+    return new KernelDevice(cct, cb, cbpriv, d_cb, d_cbpriv);
   }
 #endif
 #if defined(HAVE_SPDK)
index 90c02067eca408ab540e72e966e62b750d3e1969..8e69c4a7c395a23a6a3545accf556b88a2f840ee 100644 (file)
@@ -129,7 +129,7 @@ public:
   virtual ~BlockDevice() = default;
 
   static BlockDevice *create(
-    CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv);
+    CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
   virtual bool supported_bdev_label() { return true; }
   virtual bool is_rotational() { return rotational; }
 
@@ -179,6 +179,8 @@ public:
     bool buffered) = 0;
   virtual int flush() = 0;
   virtual int discard(uint64_t offset, uint64_t len) { return 0; }
+  virtual int queue_discard(interval_set<uint64_t> &to_release) { return -1; } // base-class default: queues nothing and returns -1; the callers in this change then release the extents themselves
+  virtual void discard_drain() { return; } // base-class default: no discard queue, so there is nothing to wait for
 
   void queue_reap_ioc(IOContext *ioc);
   void reap_ioc();
index 94a3b9044257b4144f9dc98869981482a47c8ab6..7aa8bb6b18ac85725c4eee7b60c3bac2ab7315cd 100644 (file)
@@ -24,6 +24,23 @@ MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
 
+static void wal_discard_cb(void *priv, void* priv2) { // discard callback that the BlueFS constructor stores in discard_cb[BDEV_WAL]
+  BlueFS *bluefs = static_cast<BlueFS*>(priv); // priv: the BlueFS* that add_block_device() passes to BlockDevice::create()
+  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); // priv2: pointer to the set of extents to release
+  bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp); // release them through the WAL device's allocator
+}
+
+static void db_discard_cb(void *priv, void* priv2) { // discard callback that the BlueFS constructor stores in discard_cb[BDEV_DB]
+  BlueFS *bluefs = static_cast<BlueFS*>(priv); // priv: the BlueFS* that add_block_device() passes to BlockDevice::create()
+  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); // priv2: pointer to the set of extents to release
+  bluefs->handle_discard(BlueFS::BDEV_DB, *tmp); // release them through the DB device's allocator
+}
+
+static void slow_discard_cb(void *priv, void* priv2) { // discard callback that the BlueFS constructor stores in discard_cb[BDEV_SLOW]
+  BlueFS *bluefs = static_cast<BlueFS*>(priv); // priv: the BlueFS* that add_block_device() passes to BlockDevice::create()
+  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); // priv2: pointer to the set of extents to release
+  bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp); // release them through the slow device's allocator
+}
 
 BlueFS::BlueFS(CephContext* cct)
   : cct(cct),
@@ -31,6 +48,9 @@ BlueFS::BlueFS(CephContext* cct)
     ioc(MAX_BDEV),
     block_all(MAX_BDEV)
 {
+  discard_cb[BDEV_WAL] = wal_discard_cb;
+  discard_cb[BDEV_DB] = db_discard_cb;
+  discard_cb[BDEV_SLOW] = slow_discard_cb;
 }
 
 BlueFS::~BlueFS()
@@ -133,7 +153,7 @@ int BlueFS::add_block_device(unsigned id, const string& path)
   dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
   assert(id < bdev.size());
   assert(bdev[id] == NULL);
-  BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL);
+  BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, discard_cb[id], static_cast<void*>(this));
   int r = b->open(path);
   if (r < 0) {
     delete b;
@@ -222,6 +242,13 @@ int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
   return 0;
 }
 
+void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release) // Releases the extents in to_release back to the allocator of device `id`; reached through the wal/db/slow_discard_cb callbacks.
+{
+  dout(10) << __func__ << " bdev " << id << dendl;
+  assert(alloc[id]); // the device must have an allocator
+  alloc[id]->release(to_release); // NOTE(review): BlueFS::lock is not taken here, yet KernelDevice invokes the callback from its discard thread -- confirm Allocator::release is safe to call concurrently
+}
+
 uint64_t BlueFS::get_fs_usage()
 {
   std::lock_guard<std::mutex> l(lock);
@@ -379,6 +406,11 @@ void BlueFS::_init_alloc()
 void BlueFS::_stop_alloc()
 {
   dout(20) << __func__ << dendl;
+  for (auto p : bdev) {
+    if (p)
+      p->discard_drain();
+  }
+
   for (auto p : alloc) {
     if (p != nullptr)  {
       p->shutdown();
@@ -1572,8 +1604,15 @@ int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
   for (unsigned i = 0; i < to_release.size(); ++i) {
     if (!to_release[i].empty()) {
       /* OK, now we have the guarantee alloc[i] won't be null. */
-      for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
-        bdev[i]->discard(p.get_start(), p.get_len());
+      int r = 0;
+      if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
+       r = bdev[i]->queue_discard(to_release[i]);
+       if (r == 0)
+         continue;
+      } else if (cct->_conf->bdev_enable_discard) {
+       for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
+         bdev[i]->discard(p.get_start(), p.get_len());
+       }
       }
       alloc[i]->release(to_release[i]);
     }
index 78f5e8b3274e902d596e8e1dfb2995171d417394..a176a5bb689ba26fd95f5080fb78f455b6466bb2 100644 (file)
@@ -254,6 +254,8 @@ private:
   vector<Allocator*> alloc;                   ///< allocators for bdevs
   vector<interval_set<uint64_t>> pending_release; ///< extents to release
 
+  BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev
+
   void _init_logger();
   void _shutdown_logger();
   void _update_logger_stats();
@@ -405,6 +407,9 @@ public:
   int reclaim_blocks(unsigned bdev, uint64_t want,
                     PExtentVector *extents);
 
+  // handler for discard event
+  void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
+
   void flush(FileWriter *h) {
     std::lock_guard<std::mutex> l(lock);
     _flush(h, false);
index 2d0d8ecdc72623187f8c29a93a5046d519af0ebd..f696bd1102362e9cb0e617fc726d7f795b211b1b 100644 (file)
@@ -3609,6 +3609,20 @@ static void aio_cb(void *priv, void *priv2)
   c->aio_finish(store);
 }
 
+static void discard_cb(void *priv, void *priv2) // discard callback that _open_bdev() passes to BlockDevice::create()
+{
+  BlueStore *store = static_cast<BlueStore*>(priv); // priv: the BlueStore* supplied in _open_bdev()
+  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); // priv2: pointer to the set of extents to release
+  store->handle_discard(*tmp);
+}
+
+void BlueStore::handle_discard(interval_set<uint64_t>& to_release) // Releases the extents in to_release back to the BlueStore allocator; reached through discard_cb.
+{
+  dout(10) << __func__ << dendl;
+  assert(alloc); // _close_alloc() calls bdev->discard_drain() before it deletes alloc
+  alloc->release(to_release);
+}
+
 BlueStore::BlueStore(CephContext *cct, const string& path)
   : ObjectStore(cct, path),
     throttle_bytes(cct, "bluestore_throttle_bytes",
@@ -4296,7 +4310,7 @@ int BlueStore::_open_bdev(bool create)
 {
   assert(bdev == NULL);
   string p = path + "/block";
-  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
+  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
   int r = bdev->open(p);
   if (r < 0)
     goto fail;
@@ -4487,6 +4501,9 @@ int BlueStore::_open_alloc()
 
 void BlueStore::_close_alloc()
 {
+  assert(bdev);
+  bdev->discard_drain();
+
   assert(alloc);
   alloc->shutdown();
   delete alloc;
@@ -8286,15 +8303,25 @@ void BlueStore::_txc_release_alloc(TransContext *txc)
 {
   // it's expected we're called with lazy_release_lock already taken!
   if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
-    for (interval_set<uint64_t>::iterator p = txc->released.begin();
-        p != txc->released.end();
-        ++p) {
-      bdev->discard(p.get_start(), p.get_len());
+    int r = 0;
+    if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
+      r = bdev->queue_discard(txc->released);
+      if (r == 0) {
+       dout(10) << __func__ << "(queued) " << txc << " " << std::hex
+                << txc->released << std::dec << dendl;
+       goto out;
+      }
+    } else if (cct->_conf->bdev_enable_discard) {
+      for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
+         bdev->discard(p.get_start(), p.get_len());
+      }
     }
-    dout(10) << __func__ << " " << txc << " " << std::hex
+    dout(10) << __func__ << "(sync) " << txc << " " << std::hex
              << txc->released << std::dec << dendl;
     alloc->release(txc->released);
   }
+
+out:
   txc->allocated.clear();
   txc->released.clear();
 }
index 9bdb12c17116b8be3823f63be77756905e9a1c44..a7de90a96034b7311f5884c1c729024a9072fa04 100644 (file)
@@ -130,6 +130,9 @@ public:
   void handle_conf_change(const struct md_config_t *conf,
                                   const std::set<std::string> &changed) override;
 
+  //handler for discard event
+  void handle_discard(interval_set<uint64_t>& to_release);
+
   void _set_csum();
   void _set_compression();
   void _set_throttle_params();
index 68f5664245eb0c223a6ed99f2b6680033d914516..30cea5c6f0613f772b246e66dc0abb23dbeaa26a 100644 (file)
 #undef dout_prefix
 #define dout_prefix *_dout << "bdev(" << this << " " << path << ") "
 
-KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
+KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv) // d_cb / d_cbpriv: discard callback and its first argument, invoked by _discard_thread() after each batch
   : BlockDevice(cct, cb, cbpriv),
     fd_direct(-1),
     fd_buffered(-1),
     fs(NULL), aio(false), dio(false),
     debug_lock("KernelDevice::debug_lock"),
     aio_queue(cct->_conf->bdev_aio_max_queue_depth),
+    discard_callback(d_cb),
+    discard_callback_priv(d_cbpriv),
     aio_stop(false),
+    discard_started(false), // set to true by _discard_thread() once it is running
+    discard_stop(false), // set to true by _discard_stop() to ask the thread to exit
     aio_thread(this),
+    discard_thread(this), // thread object whose entry() runs _discard_thread()
     injecting_crash(0)
 {
 }
@@ -145,6 +150,7 @@ int KernelDevice::open(const string& p)
   if (r < 0) {
     goto out_fail;
   }
+  _discard_start();
 
   fs = FS::create_by_fd(fd_direct);
   assert(fs);
@@ -187,6 +193,7 @@ void KernelDevice::close()
 {
   dout(1) << __func__ << dendl;
   _aio_stop();
+  _discard_stop();
 
   assert(fs);
   delete fs;
@@ -342,6 +349,40 @@ void KernelDevice::_aio_stop()
   }
 }
 
+int KernelDevice::_discard_start() // Starts the background discard thread; called from open().
+{
+    discard_thread.create("bstore_discard"); // "bstore_discard" is the thread name; this function itself does not consult bdev_enable_discard / bdev_async_discard
+    return 0; // always 0
+}
+
+void KernelDevice::_discard_stop() // Asks the discard thread to exit and joins it; called from close().
+{
+  dout(10) << __func__ << dendl;
+  {
+    std::unique_lock<std::mutex> l(discard_lock);
+    while (!discard_started) { // wait until _discard_thread() has set discard_started before requesting the stop
+      discard_cond.wait(l);
+    }
+    discard_stop = true;
+    discard_cond.notify_all(); // wake the thread if it is sleeping on an empty queue
+  }
+  discard_thread.join(); // the thread checks discard_stop only when its queue is empty, so queued extents are processed first
+  {
+    std::lock_guard<std::mutex> l(discard_lock);
+    discard_stop = false; // reset the flag -- presumably so the device can be opened again; confirm
+  }
+  dout(10) << __func__ << " stopped" << dendl;
+}
+
+void KernelDevice::discard_drain() // Blocks the caller until no extents are queued and no discard batch is in progress.
+{
+  dout(10) << __func__ << dendl;
+  std::unique_lock<std::mutex> l(discard_lock);
+  while (!discard_queued.empty() || discard_running) { // both are only modified with discard_lock held
+    discard_cond.wait(l); // _discard_thread() calls notify_all() each time it finds its queue empty
+  }
+}
+
 void KernelDevice::_aio_thread()
 {
   dout(10) << __func__ << " start" << dendl;
@@ -440,6 +481,54 @@ void KernelDevice::_aio_thread()
   dout(10) << __func__ << " end" << dendl;
 }
 
+void KernelDevice::_discard_thread() // Body of the discard thread: repeatedly takes everything queued, discards it, and reports it through discard_callback.
+{
+  std::unique_lock<std::mutex> l(discard_lock);
+  assert(!discard_started);
+  discard_started = true;
+  discard_cond.notify_all(); // lets _discard_stop() proceed if it is waiting for the thread to start
+  while (true) {
+    assert(discard_finishing.empty());
+    if (discard_queued.empty()) {
+      if (discard_stop) // exit is honoured only when nothing is queued
+       break;
+      dout(20) << __func__ << " sleep" << dendl;
+      discard_cond.notify_all(); // for the thread trying to drain...
+      discard_cond.wait(l);
+      dout(20) << __func__ << " wake" << dendl;
+    } else {
+      discard_finishing.swap(discard_queued); // take the whole batch; discard_queued is now empty and can accept new extents
+      discard_running = true; // keeps discard_drain() waiting while the batch is processed
+      l.unlock(); // the discards and the callback run without discard_lock held
+      dout(20) << __func__ << " finishing" << dendl;
+      for (auto p = discard_finishing.begin();p != discard_finishing.end(); ++p) {
+       discard(p.get_start(), p.get_len()); // NOTE(review): the int result of discard() is ignored; the extents go to the callback either way
+      }
+
+      discard_callback(discard_callback_priv, static_cast<void*>(&discard_finishing)); // the callbacks in this change hand the extents to an allocator's release()
+      discard_finishing.clear();
+      l.lock();
+      discard_running = false;
+    }
+  }
+  dout(10) << __func__ << " finish" << dendl;
+  discard_started = false; // still under discard_lock
+}
+
+int KernelDevice::queue_discard(interval_set<uint64_t> &to_release) // Queues extents for the discard thread. Returns 0 when queued (or nothing to do), -1 when not queued.
+{
+  if (rotational) // rotational devices are never queued; on -1 the callers in this change release the extents directly
+    return -1;
+
+  if (to_release.empty()) // nothing to queue; reported as success
+    return 0;
+
+  std::lock_guard<std::mutex> l(discard_lock);
+  discard_queued.insert(to_release); // add to the pending set under discard_lock
+  discard_cond.notify_all(); // wake _discard_thread() if it is sleeping
+  return 0;
+}
+
 void KernelDevice::_aio_log_start(
   IOContext *ioc,
   uint64_t offset,
index e31978a0af9c03d7c4c4d8e2c8cfd4456d00b9b5..791c1e8fbf6079ed4218c35ddbe4813b134cbd1d 100644 (file)
@@ -38,7 +38,17 @@ class KernelDevice : public BlockDevice {
   std::mutex flush_mutex;
 
   aio_queue_t aio_queue;
+  aio_callback_t discard_callback;
+  void *discard_callback_priv;
   bool aio_stop;
+  bool discard_started;
+  bool discard_stop;
+
+  std::mutex discard_lock;
+  std::condition_variable discard_cond;
+  bool discard_running = false;
+  interval_set<uint64_t> discard_queued;
+  interval_set<uint64_t> discard_finishing;
 
   struct AioCompletionThread : public Thread {
     KernelDevice *bdev;
@@ -49,12 +59,27 @@ class KernelDevice : public BlockDevice {
     }
   } aio_thread;
 
+  struct DiscardThread : public Thread { // Thread subclass whose entry point runs the owning device's _discard_thread()
+    KernelDevice *bdev; // device this thread works for
+    explicit DiscardThread(KernelDevice *b) : bdev(b) {}
+    void *entry() override {
+      bdev->_discard_thread(); // returns only after the loop in _discard_thread() exits
+      return NULL;
+    }
+  } discard_thread; // the member instance; created with the name "bstore_discard" in _discard_start()
+
   std::atomic_int injecting_crash;
 
   void _aio_thread();
+  void _discard_thread();
+  int queue_discard(interval_set<uint64_t> &to_release) override;
+
   int _aio_start();
   void _aio_stop();
 
+  int _discard_start();
+  void _discard_stop();
+
   void _aio_log_start(IOContext *ioc, uint64_t offset, uint64_t length);
   void _aio_log_finish(IOContext *ioc, uint64_t offset, uint64_t length);
 
@@ -73,9 +98,10 @@ class KernelDevice : public BlockDevice {
   void debug_aio_unlink(aio_t& aio);
 
 public:
-  KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv);
+  KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
 
   void aio_submit(IOContext *ioc) override;
+  void discard_drain() override;
 
   int collect_metadata(const std::string& prefix, map<std::string,std::string> *pm) const override;
   int get_devname(std::string *s) override {