// NVMe driver is loaded while osd is running.
OPTION(bdev_nvme_unbind_from_kernel, OPT_BOOL)
OPTION(bdev_nvme_retry_count, OPT_INT) // -1 means by default which is 4
+OPTION(bdev_enable_discard, OPT_BOOL)
+OPTION(bdev_async_discard, OPT_BOOL)
OPTION(objectstore_blackhole, OPT_BOOL)
.set_default(-1)
.set_description(""),
+ Option("bdev_enable_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description("Enable discarding of freed extents on the underlying block device"),
+
+ Option("bdev_async_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description("Perform discards asynchronously from a background thread instead of inline"),
+
Option("bluefs_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(1_M)
.set_description(""),
}
BlockDevice *BlockDevice::create(CephContext* cct, const string& path,
- aio_callback_t cb, void *cbpriv)
+ aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv)
{
string type = "kernel";
char buf[PATH_MAX + 1];
#endif
#if defined(HAVE_LIBAIO)
if (type == "kernel") {
- return new KernelDevice(cct, cb, cbpriv);
+ return new KernelDevice(cct, cb, cbpriv, d_cb, d_cbpriv);
}
#endif
#if defined(HAVE_SPDK)
virtual ~BlockDevice() = default;
static BlockDevice *create(
- CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv);
+ CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
virtual bool supported_bdev_label() { return true; }
virtual bool is_rotational() { return rotational; }
bool buffered) = 0;
virtual int flush() = 0;
virtual int discard(uint64_t offset, uint64_t len) { return 0; }
+ // queue extents for asynchronous discard; returns -1 if unsupported
+ virtual int queue_discard(interval_set<uint64_t> &to_release) { return -1; }
+ // wait for any queued discards to complete
+ virtual void discard_drain() { return; }
void queue_reap_ioc(IOContext *ioc);
void reap_ioc();
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
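+// Per-device discard completion callbacks.  They run in the KernelDevice
+// discard thread and hand the trimmed extents back to the matching
+// allocator via BlueFS::handle_discard().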
+static void wal_discard_cb(void *priv, void* priv2) {
+ BlueFS *bluefs = static_cast<BlueFS*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
+}
+
+static void db_discard_cb(void *priv, void* priv2) {
+ BlueFS *bluefs = static_cast<BlueFS*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
+}
+
+static void slow_discard_cb(void *priv, void* priv2) {
+ BlueFS *bluefs = static_cast<BlueFS*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
+}
BlueFS::BlueFS(CephContext* cct)
: cct(cct),
ioc(MAX_BDEV),
block_all(MAX_BDEV)
{
+ discard_cb[BDEV_WAL] = wal_discard_cb;
+ discard_cb[BDEV_DB] = db_discard_cb;
+ discard_cb[BDEV_SLOW] = slow_discard_cb;
}
BlueFS::~BlueFS()
dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
assert(id < bdev.size());
assert(bdev[id] == NULL);
- BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL);
+ BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, discard_cb[id], static_cast<void*>(this));
int r = b->open(path);
if (r < 0) {
delete b;
return 0;
}
+void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
+{
+ dout(10) << __func__ << " bdev " << id << dendl;
+ assert(alloc[id]);
+ alloc[id]->release(to_release);
+}
+
uint64_t BlueFS::get_fs_usage()
{
std::lock_guard<std::mutex> l(lock);
void BlueFS::_stop_alloc()
{
dout(20) << __func__ << dendl;
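+ // drain any in-flight discards first: their callbacks release extents
+ // into alloc[], which is about to be shut down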
+ for (auto p : bdev) {
+ if (p)
+ p->discard_drain();
+ }
+
for (auto p : alloc) {
if (p != nullptr) {
p->shutdown();
for (unsigned i = 0; i < to_release.size(); ++i) {
if (!to_release[i].empty()) {
/* OK, now we have the guarantee alloc[i] won't be null. */
- for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
- bdev[i]->discard(p.get_start(), p.get_len());
+ int r = 0;
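+ // async path: queue the whole interval set on the device and let
+ // handle_discard() release the extents once they have been trimmed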
+ if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
+ r = bdev[i]->queue_discard(to_release[i]);
+ if (r == 0)
+ continue;
+ } else if (cct->_conf->bdev_enable_discard) {
+ for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
+ bdev[i]->discard(p.get_start(), p.get_len());
+ }
}
alloc[i]->release(to_release[i]);
}
vector<Allocator*> alloc; ///< allocators for bdevs
vector<interval_set<uint64_t>> pending_release; ///< extents to release
+ BlockDevice::aio_callback_t discard_cb[3]; // discard callbacks for wal, db, and slow bdevs
+
void _init_logger();
void _shutdown_logger();
void _update_logger_stats();
int reclaim_blocks(unsigned bdev, uint64_t want,
PExtentVector *extents);
+ // handler for discard event
+ void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
+
void flush(FileWriter *h) {
std::lock_guard<std::mutex> l(lock);
_flush(h, false);
c->aio_finish(store);
}
+static void discard_cb(void *priv, void *priv2)
+{
+ BlueStore *store = static_cast<BlueStore*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ store->handle_discard(*tmp);
+}
+
+void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
+{
+ dout(10) << __func__ << dendl;
+ assert(alloc);
+ alloc->release(to_release);
+}
+
BlueStore::BlueStore(CephContext *cct, const string& path)
: ObjectStore(cct, path),
throttle_bytes(cct, "bluestore_throttle_bytes",
{
assert(bdev == NULL);
string p = path + "/block";
- bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
+ bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
int r = bdev->open(p);
if (r < 0)
goto fail;
void BlueStore::_close_alloc()
{
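+ // let any queued discards finish before the allocator is destroyed,
+ // since their completion callback releases extents back into it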
+ assert(bdev);
+ bdev->discard_drain();
+
assert(alloc);
alloc->shutdown();
delete alloc;
{
// it's expected we're called with lazy_release_lock already taken!
if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
- for (interval_set<uint64_t>::iterator p = txc->released.begin();
- p != txc->released.end();
- ++p) {
- bdev->discard(p.get_start(), p.get_len());
+ int r = 0;
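+ // async path: on success the extents are released later from
+ // handle_discard(), so skip the immediate release below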
+ if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
+ r = bdev->queue_discard(txc->released);
+ if (r == 0) {
+ dout(10) << __func__ << "(queued) " << txc << " " << std::hex
+ << txc->released << std::dec << dendl;
+ goto out;
+ }
+ } else if (cct->_conf->bdev_enable_discard) {
+ for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
+ bdev->discard(p.get_start(), p.get_len());
+ }
}
- dout(10) << __func__ << " " << txc << " " << std::hex
+ dout(10) << __func__ << "(sync) " << txc << " " << std::hex
<< txc->released << std::dec << dendl;
alloc->release(txc->released);
}
+
+out:
txc->allocated.clear();
txc->released.clear();
}
void handle_conf_change(const struct md_config_t *conf,
const std::set<std::string> &changed) override;
+ // handler for discard event
+ void handle_discard(interval_set<uint64_t>& to_release);
+
void _set_csum();
void _set_compression();
void _set_throttle_params();
#undef dout_prefix
#define dout_prefix *_dout << "bdev(" << this << " " << path << ") "
-KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
+KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv)
: BlockDevice(cct, cb, cbpriv),
fd_direct(-1),
fd_buffered(-1),
fs(NULL), aio(false), dio(false),
debug_lock("KernelDevice::debug_lock"),
aio_queue(cct->_conf->bdev_aio_max_queue_depth),
+ discard_callback(d_cb),
+ discard_callback_priv(d_cbpriv),
aio_stop(false),
+ discard_started(false),
+ discard_stop(false),
aio_thread(this),
+ discard_thread(this),
injecting_crash(0)
{
}
if (r < 0) {
goto out_fail;
}
+ _discard_start();
fs = FS::create_by_fd(fd_direct);
assert(fs);
{
dout(1) << __func__ << dendl;
_aio_stop();
+ _discard_stop();
assert(fs);
delete fs;
}
}
+int KernelDevice::_discard_start()
+{
+ discard_thread.create("bstore_discard");
+ return 0;
+}
+
+void KernelDevice::_discard_stop()
+{
+ dout(10) << __func__ << dendl;
+ {
+ std::unique_lock<std::mutex> l(discard_lock);
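+ // wait until the discard thread has actually started before telling it to stop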
+ while (!discard_started) {
+ discard_cond.wait(l);
+ }
+ discard_stop = true;
+ discard_cond.notify_all();
+ }
+ discard_thread.join();
+ {
+ std::lock_guard<std::mutex> l(discard_lock);
+ discard_stop = false;
+ }
+ dout(10) << __func__ << " stopped" << dendl;
+}
+
+void KernelDevice::discard_drain()
+{
+ dout(10) << __func__ << dendl;
+ std::unique_lock<std::mutex> l(discard_lock);
+ while (!discard_queued.empty() || discard_running) {
+ discard_cond.wait(l);
+ }
+}
+
void KernelDevice::_aio_thread()
{
dout(10) << __func__ << " start" << dendl;
dout(10) << __func__ << " end" << dendl;
}
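+// Background discard thread: batches queued ranges, trims them with the
+// synchronous discard() call, then reports the finished set through
+// discard_callback so the owner can release the extents.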
+void KernelDevice::_discard_thread()
+{
+ std::unique_lock<std::mutex> l(discard_lock);
+ assert(!discard_started);
+ discard_started = true;
+ discard_cond.notify_all();
+ while (true) {
+ assert(discard_finishing.empty());
+ if (discard_queued.empty()) {
+ if (discard_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ discard_cond.notify_all(); // for the thread trying to drain...
+ discard_cond.wait(l);
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ discard_finishing.swap(discard_queued);
+ discard_running = true;
+ l.unlock();
+ dout(20) << __func__ << " finishing" << dendl;
+ for (auto p = discard_finishing.begin(); p != discard_finishing.end(); ++p) {
+ discard(p.get_start(), p.get_len());
+ }
+
+ discard_callback(discard_callback_priv, static_cast<void*>(&discard_finishing));
+ discard_finishing.clear();
+ l.lock();
+ discard_running = false;
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ discard_started = false;
+}
+
+int KernelDevice::queue_discard(interval_set<uint64_t> &to_release)
+{
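+ // async discard is only offered on non-rotational devices; -1 tells the
+ // caller to fall back to handling the extents itself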
+ if (rotational)
+ return -1;
+
+ if (to_release.empty())
+ return 0;
+
+ std::lock_guard<std::mutex> l(discard_lock);
+ discard_queued.insert(to_release);
+ discard_cond.notify_all();
+ return 0;
+}
+
void KernelDevice::_aio_log_start(
IOContext *ioc,
uint64_t offset,
std::mutex flush_mutex;
aio_queue_t aio_queue;
+ aio_callback_t discard_callback;
+ void *discard_callback_priv;
bool aio_stop;
+ bool discard_started;
+ bool discard_stop;
+
+ std::mutex discard_lock;
+ std::condition_variable discard_cond;
+ bool discard_running = false;
+ interval_set<uint64_t> discard_queued;
+ interval_set<uint64_t> discard_finishing;
struct AioCompletionThread : public Thread {
KernelDevice *bdev;
}
} aio_thread;
+ struct DiscardThread : public Thread {
+ KernelDevice *bdev;
+ explicit DiscardThread(KernelDevice *b) : bdev(b) {}
+ void *entry() override {
+ bdev->_discard_thread();
+ return NULL;
+ }
+ } discard_thread;
+
std::atomic_int injecting_crash;
void _aio_thread();
+ void _discard_thread();
+ int queue_discard(interval_set<uint64_t> &to_release) override;
+
int _aio_start();
void _aio_stop();
+ int _discard_start();
+ void _discard_stop();
+
void _aio_log_start(IOContext *ioc, uint64_t offset, uint64_t length);
void _aio_log_finish(IOContext *ioc, uint64_t offset, uint64_t length);
void debug_aio_unlink(aio_t& aio);
public:
- KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv);
+ KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
void aio_submit(IOContext *ioc) override;
+ void discard_drain() override;
int collect_metadata(const std::string& prefix, map<std::string,std::string> *pm) const override;
int get_devname(std::string *s) override {