git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
BlueStore: Add onode column family KV block_cache.
author Mark Nelson <mnelson@redhat.com>
Sat, 13 Apr 2019 00:33:59 +0000 (19:33 -0500)
committer Adam Kupczyk <akupczyk@redhat.com>
Fri, 15 Jan 2021 16:20:19 +0000 (11:20 -0500)
Signed-off-by: Mark Nelson <mnelson@redhat.com>
src/common/PriorityCache.cc
src/common/options.cc
src/kv/KeyValueDB.h
src/kv/RocksDBStore.cc
src/kv/RocksDBStore.h
src/kv/rocksdb_cache/BinnedLRUCache.cc
src/kv/rocksdb_cache/BinnedLRUCache.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index af1fc213f324d4f35728de1f6c6d571de0c6219c..40ca98faba6008e5dcfc2dcda42b05f11394cfda 100644 (file)
@@ -38,11 +38,11 @@ namespace PriorityCache
     // shrink it to 1/256 of the rounded up cache size
     chunk /= 256;
 
-    // bound the chunk size to be between 4MB and 32MB
+    // bound the chunk size to be between 4MB and 64MB
     chunk = (chunk > 4ul*1024*1024) ? chunk : 4ul*1024*1024;
-    chunk = (chunk < 16ul*1024*1024) ? chunk : 16ul*1024*1024;
+    chunk = (chunk < 64ul*1024*1024) ? chunk : 64ul*1024*1024;
 
-    /* Add 16 chunks of headroom and round up to the near chunk.  Note that
+    /* FIXME: Hardcoded to force get_chunk to never drop below 64MB. 
      * if RocksDB is used, it's a good idea to have N MB of headroom where
      * N is the target_file_size_base value.  RocksDB will read SST files
      * into the block cache during compaction which potentially can force out
@@ -51,7 +51,7 @@ namespace PriorityCache
      * compaction reads allows the kv cache grow even during extremely heavy
      * compaction workloads.
      */
-    uint64_t val = usage + (16 * chunk);
+    uint64_t val = usage + 64*1024*1024;
     uint64_t r = (val) % chunk;
     if (r > 0)
       val = val + chunk - r;
index 71133bdfb81d77f7db20d878bda98064e68e939b..2033f72da3e4537fb028ea5e5346f46657fce905 100644 (file)
@@ -4442,15 +4442,20 @@ std::vector<Option> get_global_options() {
     .add_see_also("bluestore_cache_size"),
 
     Option("bluestore_cache_meta_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
-    .set_default(.4)
+    .set_default(.45)
     .add_see_also("bluestore_cache_size")
     .set_description("Ratio of bluestore cache to devote to metadata"),
 
     Option("bluestore_cache_kv_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
-    .set_default(.4)
+    .set_default(.45)
     .add_see_also("bluestore_cache_size")
     .set_description("Ratio of bluestore cache to devote to kv database (rocksdb)"),
 
+    Option("bluestore_cache_kv_onode_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.04)
+    .add_see_also("bluestore_cache_size")
+    .set_description("Ratio of bluestore cache to devote to kv onode column family (rocksdb)"),
+
     Option("bluestore_cache_autotune", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(true)
     .add_see_also("bluestore_cache_size")
index 6e4616e3a6c46eac6b1b97613610441433940d4c..a9a7965117c0dc0dc62d5e8af55df32aa008a185 100644 (file)
@@ -338,10 +338,20 @@ public:
     return -EOPNOTSUPP;
   }
 
+  /// Cache usage in bytes for the cache bound to @p prefix's column family;
+  /// returns -EOPNOTSUPP when the backend has no per-prefix caches.
+  virtual int64_t get_cache_usage(std::string prefix) const {
+    return -EOPNOTSUPP;
+  }
+
   virtual std::shared_ptr<PriorityCache::PriCache> get_priority_cache() const {
     return nullptr;
   }
 
+  /// Priority cache bound to @p prefix's column family; nullptr if none.
+  virtual std::shared_ptr<PriorityCache::PriCache> get_priority_cache(std::string prefix) const {
+    return nullptr;
+  }
   virtual ~KeyValueDB() {}
 
   /// estimate space utilization for a prefix (in bytes)
index 90d2f4063a1c848d0399db752ad682f3ca271a13..beba8b8ed1e948b5792495422e436186356a5e5d 100644 (file)
@@ -456,6 +456,28 @@ int RocksDBStore::create_and_open(ostream &out,
   return do_open(out, true, false, cfs);
 }
 
+int RocksDBStore::init_block_cache(uint64_t size, rocksdb::BlockBasedTableOptions& bbto) {
+  // Build the block cache named by rocksdb_cache_type into bbto.block_cache.
+  auto shard_bits = cct->_conf->rocksdb_cache_shard_bits;
+  if (cct->_conf->rocksdb_cache_type == "binned_lru") {
+    bbto.block_cache = rocksdb_cache::NewBinnedLRUCache(cct, size, shard_bits);
+  } else if (cct->_conf->rocksdb_cache_type == "lru") {
+    bbto.block_cache = rocksdb::NewLRUCache(size, shard_bits);
+  } else if (cct->_conf->rocksdb_cache_type == "clock") {
+    bbto.block_cache = rocksdb::NewClockCache(size, shard_bits);
+    if (!bbto.block_cache) {
+      derr << "rocksdb_cache_type '" << cct->_conf->rocksdb_cache_type
+           << "' chosen, but RocksDB not compiled with LibTBB. "
+           << dendl;
+      return -EINVAL;
+    }
+  } else {
+    derr << "unrecognized rocksdb_cache_type '" << cct->_conf->rocksdb_cache_type
+      << "'" << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
 int RocksDBStore::load_rocksdb_options(bool create_if_missing, rocksdb::Options& opt)
 {
   rocksdb::Status status;
@@ -525,31 +547,8 @@ int RocksDBStore::load_rocksdb_options(bool create_if_missing, rocksdb::Options&
   }
   uint64_t row_cache_size = cache_size * cct->_conf->rocksdb_cache_row_ratio;
   uint64_t block_cache_size = cache_size - row_cache_size;
+  init_block_cache(block_cache_size, bbt_opts);
 
-  if (cct->_conf->rocksdb_cache_type == "binned_lru") {
-    bbt_opts.block_cache = rocksdb_cache::NewBinnedLRUCache(
-      cct,
-      block_cache_size,
-      cct->_conf->rocksdb_cache_shard_bits);
-  } else if (cct->_conf->rocksdb_cache_type == "lru") {
-    bbt_opts.block_cache = rocksdb::NewLRUCache(
-      block_cache_size,
-      cct->_conf->rocksdb_cache_shard_bits);
-  } else if (cct->_conf->rocksdb_cache_type == "clock") {
-    bbt_opts.block_cache = rocksdb::NewClockCache(
-      block_cache_size,
-      cct->_conf->rocksdb_cache_shard_bits);
-    if (!bbt_opts.block_cache) {
-      derr << "rocksdb_cache_type '" << cct->_conf->rocksdb_cache_type
-           << "' chosen, but RocksDB not compiled with LibTBB. "
-           << dendl;
-      return -EINVAL;
-    }
-  } else {
-    derr << "unrecognized rocksdb_cache_type '" << cct->_conf->rocksdb_cache_type
-      << "'" << dendl;
-    return -EINVAL;
-  }
   bbt_opts.block_size = cct->_conf->rocksdb_block_size;
 
   if (row_cache_size > 0)
@@ -836,6 +835,10 @@ int RocksDBStore::apply_sharding(const rocksdb::Options& opt,
   }
   return 0;
 }
+// Forward-declares RocksDB's StringToMap (options_helper.cc); it can parse
+// nested params like "nested_opt={opt1=1;opt2=2}". NOTE(review): declared at global scope — confirm it resolves/links against the namespace-rocksdb symbol.
+extern rocksdb::Status StringToMap(const std::string& opts_str,
+                                  std::unordered_map<std::string, std::string>* opts_map);
 
 int RocksDBStore::verify_sharding(const rocksdb::Options& opt,
                                  std::vector<rocksdb::ColumnFamilyDescriptor>& existing_cfs,
@@ -887,15 +890,75 @@ int RocksDBStore::verify_sharding(const rocksdb::Options& opt,
 
   for (auto& column : stored_sharding_def) {
     rocksdb::ColumnFamilyOptions cf_opt(opt);
-    status = rocksdb::GetColumnFamilyOptionsFromString(
-                                                      cf_opt, column.options, &cf_opt);
+    // sift column.options into 2 categories:
+    // - column family options
+    // - block cache options
+    std::unordered_map<std::string, std::string> options_map;
+    status = StringToMap(column.options, &options_map);
+    if (!status.ok()) {
+      derr << __func__ << " error '" << status.getState() << "' while parsing options '" <<
+       column.options << "'" << dendl;
+      return -EIO;
+    }
+    // extract "block_cache.*" options into their own map
+    std::unordered_map<std::string, std::string> cache_options_map;
+    for (auto it = options_map.begin(); it != options_map.end(); /*nop*/ ) {
+      if (it->first.find("block_cache.") == 0) {
+       cache_options_map.emplace(it->first.substr(strlen("block_cache.")), it->second);
+       it = options_map.erase(it);
+      } else {
+       ++it;
+      }
+    }
+    status = rocksdb::GetColumnFamilyOptionsFromMap(
+                                                   cf_opt, options_map, &cf_opt);
    if (!status.ok()) {
      derr << __func__ << " invalid db column family options for CF '"
          << column.name << "': " << column.options << dendl;
+      derr << __func__ << " error = '" << status.getState() << "'" << dendl;
      return -EINVAL;
    }
    install_cf_mergeop(column.name, &cf_opt);
-
+    if (!cache_options_map.empty()) {
+      bool require_new_block_cache = false;
+      std::string cache_type = cct->_conf->rocksdb_cache_type;
+      if (auto it = cache_options_map.find("cache_type"); it != cache_options_map.end()) {
+       cache_type = it->second;
+       cache_options_map.erase(it);
+       require_new_block_cache = true;
+      }
+      size_t cache_size = cct->_conf->rocksdb_cache_size;
+      if (auto it = cache_options_map.find("cache_size"); it != cache_options_map.end()) {
+       std::string error;
+       cache_size = strict_iecstrtoll(it->second, &error);
+       if (!error.empty()) {
+         derr << __func__ << " invalid size: '" << it->second << "'" << dendl;
+         return -EINVAL;
+       }
+       cache_options_map.erase(it);
+       require_new_block_cache = true;
+      }
+      std::shared_ptr<rocksdb::Cache> block_cache;
+      if (require_new_block_cache) {
+       block_cache = create_block_cache(cache_type, cache_size);
+       if (!block_cache) {
+         return -EINVAL;
+       }
+      } else {
+       block_cache = bbt_opts.block_cache; // share the default cache
+      }
+      rocksdb::BlockBasedTableOptions column_bbt_opts;
+      status = rocksdb::GetBlockBasedTableOptionsFromMap(bbt_opts, cache_options_map, &column_bbt_opts);
+      if (!status.ok()) {
+       derr << __func__ << " invalid cache options for CF '"
+            << column.name << "': " << cache_options_map << dendl;
+       derr << __func__ << " error = '" << status.getState() << "'" << dendl;
+       return -EINVAL;
+      }
+      column_bbt_opts.block_cache = block_cache; // attach the chosen cache
+      cf_bbt_opts[column.name] = column_bbt_opts;
+      cf_opt.table_factory.reset(NewBlockBasedTableFactory(cf_bbt_opts[column.name]));
+    }
     if (column.shard_cnt == 1) {
       emplace_cf(column, 0, column.name, cf_opt);
     } else {
index 4fb8111de054796559eef3c77828dcc797dd58fa..d4c0be4149c7bf9c128da4b3fe9f56bc68591b88 100644 (file)
@@ -115,7 +115,8 @@ private:
   };
   std::unordered_map<std::string, prefix_shards> cf_handles;
   std::unordered_map<uint32_t, std::string> cf_ids_to_prefix;
-
+  std::unordered_map<std::string, rocksdb::BlockBasedTableOptions> cf_bbt_opts;
+  
   void add_column_family(const std::string& cf_name, uint32_t hash_l, uint32_t hash_h,
                         size_t shard_idx, rocksdb::ColumnFamilyHandle *handle);
   bool is_column_family(const std::string& prefix);
@@ -149,6 +150,7 @@ private:
                      std::vector<std::pair<size_t, RocksDBStore::ColumnFamily> >& existing_cfs_shard,
                      std::vector<rocksdb::ColumnFamilyDescriptor>& missing_cfs,
                      std::vector<std::pair<size_t, RocksDBStore::ColumnFamily> >& missing_cfs_shard);
+  int init_block_cache(uint64_t size, rocksdb::BlockBasedTableOptions& bbto);
 
   // manage async compactions
   ceph::mutex compact_queue_lock =
@@ -460,6 +462,14 @@ err:
     return static_cast<int64_t>(bbt_opts.block_cache->GetUsage());
   }
 
+  virtual int64_t get_cache_usage(string prefix) const override {
+    auto it = cf_bbt_opts.find(prefix);
+    if (it != cf_bbt_opts.end()) {
+      return static_cast<int64_t>(it->second.block_cache->GetUsage());
+    }
+    return -EINVAL;
+  }
+
   int set_cache_size(uint64_t s) override {
     cache_size = s;
     set_cache_flag = true;
@@ -494,6 +504,26 @@ public:
     bool   unittest_fail_after_successful_processing = false;
   };
   int reshard(const std::string& new_sharding, const resharding_ctrl* ctrl = nullptr);
+
+  int set_cache_capacity(int64_t capacity);
+  int64_t get_cache_capacity();
+
+  virtual std::shared_ptr<PriorityCache::PriCache>
+      get_priority_cache() const override {
+    return dynamic_pointer_cast<PriorityCache::PriCache>(
+        bbt_opts.block_cache);
+  }
+
+  virtual std::shared_ptr<PriorityCache::PriCache>
+      get_priority_cache(string prefix) const override {
+    auto it = cf_bbt_opts.find(prefix);
+    if (it != cf_bbt_opts.end()) {
+      return dynamic_pointer_cast<PriorityCache::PriCache>(
+          it->second.block_cache);
+    }
+    return nullptr;
+  }
+
 };
 
 #endif
index 2756f39f9ea237bf5e8474eaaaeeb1c8eb22e5c9..0d657883e92debb85ac1d9d9b2757fbfcf6eb080 100644 (file)
@@ -101,9 +101,10 @@ void BinnedLRUHandleTable::Resize() {
   length_ = new_length;
 }
 
-BinnedLRUCacheShard::BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit,
+BinnedLRUCacheShard::BinnedLRUCacheShard(CephContext *c, size_t capacity, bool strict_capacity_limit,
                              double high_pri_pool_ratio)
-    : capacity_(0),
+    : cct(c),
+      capacity_(0),
       high_pri_pool_usage_(0),
       strict_capacity_limit_(strict_capacity_limit),
       high_pri_pool_ratio_(high_pri_pool_ratio),
@@ -480,7 +481,7 @@ BinnedLRUCache::BinnedLRUCache(CephContext *c,
   size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;
   for (int i = 0; i < num_shards_; i++) {
     new (&shards_[i])
-        BinnedLRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio);
+        BinnedLRUCacheShard(c, per_shard, strict_capacity_limit, high_pri_pool_ratio);
   }
 }
 
index 96023ce22f7edfc351ef9e4eb8d486c08b2acf9b..85608be0e5734fa6f1cf26585c42e24d9818062e 100644 (file)
@@ -171,7 +171,7 @@ class BinnedLRUHandleTable {
 // A single shard of sharded cache.
 class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
  public:
-  BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit,
+  BinnedLRUCacheShard(CephContext *c, size_t capacity, bool strict_capacity_limit,
                 double high_pri_pool_ratio);
   virtual ~BinnedLRUCacheShard();
 
@@ -225,6 +225,7 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
   size_t GetHighPriPoolUsage() const;
 
  private:
+  CephContext *cct;
   void LRU_Remove(BinnedLRUHandle* e);
   void LRU_Insert(BinnedLRUHandle* e);
 
index adc16660e386895b6f481da5124bf618589aca09..2b2c02440674e16295636240108d6ea6e3820a7c 100644 (file)
@@ -4090,12 +4090,16 @@ void *BlueStore::MempoolThread::entry()
   }
 
   binned_kv_cache = store->db->get_priority_cache();
+  binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
   if (store->cache_autotune && binned_kv_cache != nullptr) {
     pcm = std::make_shared<PriorityCache::Manager>(
         store->cct, min, max, target, true, "bluestore-pricache");
     pcm->insert("kv", binned_kv_cache, true);
     pcm->insert("meta", meta_cache, true);
     pcm->insert("data", data_cache, true);
+    if (binned_kv_onode_cache != nullptr) {
+      pcm->insert("kv_onode", binned_kv_onode_cache, true);
+    }
   }
 
   utime_t next_balance = ceph_clock_now();
@@ -4177,6 +4181,9 @@ void BlueStore::MempoolThread::_adjust_cache_settings()
   if (binned_kv_cache != nullptr) {
     binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
   }
+  if (binned_kv_onode_cache != nullptr) {
+    binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
+  }
   meta_cache->set_cache_ratio(store->cache_meta_ratio);
   data_cache->set_cache_ratio(store->cache_data_ratio);
 }
@@ -4186,12 +4193,15 @@ void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
   size_t onode_shards = store->onode_cache_shards.size();
   size_t buffer_shards = store->buffer_cache_shards.size();
   int64_t kv_used = store->db->get_cache_usage();
+  int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
   int64_t meta_used = meta_cache->_get_used_bytes();
   int64_t data_used = data_cache->_get_used_bytes();
 
   uint64_t cache_size = store->cache_size;
   int64_t kv_alloc =
      static_cast<int64_t>(store->cache_kv_ratio * cache_size); 
+  int64_t kv_onode_alloc =
+     static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
   int64_t meta_alloc =
      static_cast<int64_t>(store->cache_meta_ratio * cache_size);
   int64_t data_alloc =
@@ -4202,12 +4212,17 @@ void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
     kv_alloc = binned_kv_cache->get_committed_size();
     meta_alloc = meta_cache->get_committed_size();
     data_alloc = data_cache->get_committed_size();
+    if (binned_kv_onode_cache != nullptr) {
+      kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
+    }
   }
   
   if (interval_stats) {
     dout(5) << __func__  << " cache_size: " << cache_size
                   << " kv_alloc: " << kv_alloc
                   << " kv_used: " << kv_used
+                  << " kv_onode_alloc: " << kv_onode_alloc
+                  << " kv_onode_used: " << kv_onode_used
                   << " meta_alloc: " << meta_alloc
                   << " meta_used: " << meta_used
                   << " data_alloc: " << data_alloc
@@ -4216,6 +4231,8 @@ void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
     dout(20) << __func__  << " cache_size: " << cache_size
                    << " kv_alloc: " << kv_alloc
                    << " kv_used: " << kv_used
+                   << " kv_onode_alloc: " << kv_onode_alloc
+                   << " kv_onode_used: " << kv_onode_used
                    << " meta_alloc: " << meta_alloc
                    << " meta_used: " << meta_used
                    << " data_alloc: " << data_alloc
@@ -4733,20 +4750,27 @@ int BlueStore::_set_cache_sizes()
     }
   }
 
-  cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
+  cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
   if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
     derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
          << ") must be in range [0,1.0]" << dendl;
     return -EINVAL;
   }
 
-  cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
+  cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
   if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
     derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
          << ") must be in range [0,1.0]" << dendl;
     return -EINVAL;
   }
 
+  cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
+  if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
+    derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
+         << ") must be in range [0,1.0]" << dendl;
+    return -EINVAL;
+  }
+
   if (cache_meta_ratio + cache_kv_ratio > 1.0) {
     derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
          << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
@@ -4755,8 +4779,10 @@ int BlueStore::_set_cache_sizes()
     return -EINVAL;
   }
 
-  cache_data_ratio =
-    (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
+  cache_data_ratio = (double)1.0 - 
+                     (double)cache_meta_ratio - 
+                     (double)cache_kv_ratio - 
+                     (double)cache_kv_onode_ratio;
   if (cache_data_ratio < 0) {
     // deal with floating point imprecision
     cache_data_ratio = 0;
index df47409824d25184b242a52651c9995fc771ca4e..d045f2888806c36a865c26b49d52e62e7de7a8fe 100644 (file)
@@ -2173,6 +2173,7 @@ private:
   uint64_t cache_size = 0;       ///< total cache size
   double cache_meta_ratio = 0;   ///< cache ratio dedicated to metadata
   double cache_kv_ratio = 0;     ///< cache ratio dedicated to kv (e.g., rocksdb)
+  double cache_kv_onode_ratio = 0; ///< cache ratio dedicated to kv onodes (e.g., rocksdb onode CF)
   double cache_data_ratio = 0;   ///< cache ratio dedicated to object data
   bool cache_autotune = false;   ///< cache autotune setting
   double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
@@ -2200,6 +2201,7 @@ private:
     ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
     bool stop = false;
     std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
+    std::shared_ptr<PriorityCache::PriCache> binned_kv_onode_cache = nullptr;
     std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
 
     struct MempoolCache : public PriorityCache::PriCache {