git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: adjust allocator+freelist interfaces for smr params
author Sage Weil <sage@newdream.net>
Wed, 11 Aug 2021 16:48:45 +0000 (11:48 -0500)
committer Sage Weil <sage@newdream.net>
Fri, 29 Oct 2021 13:55:56 +0000 (09:55 -0400)
Instead of shoehorning these fields into alloc_size, adjust the interfaces
to explicitly pass in zone_size and first_sequential_zone for
Allocator and FreelistManager.

Signed-off-by: Sage Weil <sage@newdream.net>
14 files changed:
src/os/bluestore/Allocator.cc
src/os/bluestore/Allocator.h
src/os/bluestore/BitmapFreelistManager.cc
src/os/bluestore/BitmapFreelistManager.h
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
src/os/bluestore/FreelistManager.h
src/os/bluestore/ZonedAllocator.cc
src/os/bluestore/ZonedAllocator.h
src/os/bluestore/ZonedFreelistManager.cc
src/os/bluestore/ZonedFreelistManager.h
src/test/objectstore/Allocator_test.cc
src/test/objectstore/allocator_replay_test.cc

index 8e4a08b2587abeb0b148bb652e754ac186f5a882..731ae5de73c5b7768e3408fa98148aa5f6f0880e 100644 (file)
@@ -109,7 +109,8 @@ public:
 Allocator::Allocator(std::string_view name,
                      int64_t _capacity,
                      int64_t _block_size)
-  : device_size(_capacity), block_size(_block_size)
+ : device_size(_capacity),
+   block_size(_block_size)
 {
   asok_hook = new SocketHook(this, name);
 }
@@ -124,8 +125,14 @@ const string& Allocator::get_name() const {
   return asok_hook->name;
 }
 
-Allocator *Allocator::create(CephContext* cct, std::string_view type,
-                             int64_t size, int64_t block_size, std::string_view name)
+Allocator *Allocator::create(
+  CephContext* cct,
+  std::string_view type,
+  int64_t size,
+  int64_t block_size,
+  int64_t zone_size,
+  int64_t first_sequential_zone,
+  std::string_view name)
 {
   Allocator* alloc = nullptr;
   if (type == "stupid") {
@@ -142,7 +149,8 @@ Allocator *Allocator::create(CephContext* cct, std::string_view type,
       name);
 #ifdef HAVE_LIBZBD
   } else if (type == "zoned") {
-    return new ZonedAllocator(cct, size, block_size, name);
+    return new ZonedAllocator(cct, size, block_size, zone_size, first_sequential_zone,
+                             name);
 #endif
   }
   if (alloc == nullptr) {
index 6f6325d57af9f223e1a1e7cedec8fc099d887802..5503ed213fb580f903fd6d4bafff204fbbda3cac 100644 (file)
@@ -66,8 +66,15 @@ public:
   virtual double get_fragmentation_score();
   virtual void shutdown() = 0;
 
-  static Allocator *create(CephContext* cct, std::string_view type, int64_t size,
-                          int64_t block_size, const std::string_view name = "");
+  static Allocator *create(
+    CephContext* cct,
+    std::string_view type,
+    int64_t size,
+    int64_t block_size,
+    int64_t zone_size = 0,
+    int64_t firs_sequential_zone = 0,
+    const std::string_view name = ""
+    );
 
 
   const std::string& get_name() const;
index 2c8971296268473b368a789d0179dd0bd589cd1d..e03a6ecacb748dc74e1997689ce3745abe230086 100644 (file)
@@ -65,6 +65,7 @@ BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
 }
 
 int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity,
+                                 uint64_t zone_size, uint64_t first_sequential_zone,
                                  KeyValueDB::Transaction txn)
 {
   bytes_per_block = granularity;
index 5b04e8fd28cc0651c1fc3cbdb3ffb93fc38c7225..8e4ea8fd385c43d1a34722f9af4162ed4c9a16ba 100644 (file)
@@ -63,6 +63,7 @@ public:
   static void setup_merge_operator(KeyValueDB *db, std::string prefix);
 
   int create(uint64_t size, uint64_t granularity,
+            uint64_t zone_size, uint64_t first_sequential_zone,
             KeyValueDB::Transaction txn) override;
 
   int init(KeyValueDB *kvdb, bool db_in_read_only,
index f34996be3e000f9d685d05f73133f4bba3aa6018..5b00c2e9a91212a9aeb0abe17c98ece7989b1f0c 100644 (file)
@@ -590,7 +590,9 @@ void BlueFS::_init_alloc()
               << std::dec << dendl;
       alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
                                    bdev[id]->get_size(),
-                                   alloc_size[id], name);
+                                   alloc_size[id],
+                                   0, 0,
+                                   name);
       alloc[id]->init_add_free(
         block_reserved[id],
         _get_total(id));
index 030c1f59429f4cd389c919fe68024bb37115b84f..dd4d8f42d596e27162ae4144017aecd59db68db6 100644 (file)
@@ -5397,7 +5397,6 @@ int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only, bool fm_resto
             << dendl;
        return -EINVAL;
       }
-      alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
     } else
 #endif
     if (freelist_type == "zoned") {
@@ -5406,7 +5405,13 @@ int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only, bool fm_resto
       return -EINVAL;
     }
 
-    fm->create(bdev->get_size(), alloc_size, t);
+    fm->create(bdev->get_size(), alloc_size,
+              zone_size, first_sequential_zone,
+              t);
+
+    // allocate superblock reserved space.  note that we do not mark
+    // bluefs space as allocated in the freelist; we instead rely on
+    // bluefs doing that itself.
     auto reserved = _get_ondisk_reserved();
     if (fm_restore) {
       // we need to allocate the full space in restore case
@@ -5545,14 +5550,17 @@ int BlueStore::_create_alloc()
              << "Please set to 0." << dendl;
       return -EINVAL;
     }
-
-    alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
   }
 #endif
   
-  shared_alloc.set(Allocator::create(cct, allocator_type,
-    bdev->get_size(),
-    alloc_size, "block"));
+  shared_alloc.set(
+    Allocator::create(
+      cct, allocator_type,
+      bdev->get_size(),
+      alloc_size,
+      zone_size,
+      first_sequential_zone,
+      "block"));
 
   if (!shared_alloc.a) {
     lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator"
@@ -6700,6 +6708,8 @@ int BlueStore::mkfs()
 #ifdef HAVE_LIBZBD
   if (bdev->is_smr()) {
     freelist_type = "zoned";
+    zone_size = bdev->get_zone_size();
+    first_sequential_zone = bdev->get_conventional_region_size() / zone_size;
   } else
 #endif
   {
@@ -6765,6 +6775,22 @@ int BlueStore::mkfs()
       bl.append(stringify(OMAP_PER_PG));
       t->set(PREFIX_SUPER, "per_pool_omap", bl);
     }
+
+#ifdef HAVE_LIBZBD
+    if (bdev->is_smr()) {
+      {
+       bufferlist bl;
+       encode((uint64_t)zone_size, bl);
+       t->set(PREFIX_SUPER, "zone_size", bl);
+      }
+      {
+       bufferlist bl;
+       encode((uint64_t)first_sequential_zone, bl);
+       t->set(PREFIX_SUPER, "first_sequential_zone", bl);
+      }
+    }
+#endif
+    
     ondisk_format = latest_ondisk_format;
     _prepare_ondisk_format_super(t);
     db->submit_transaction_sync(t);
@@ -11467,6 +11493,27 @@ int BlueStore::_open_super_meta()
             << std::dec << dendl;
   }
 
+  // smr fields
+  {
+    bufferlist bl;
+    int r = db->get(PREFIX_SUPER, "zone_size", &bl);
+    if (r >= 0) {
+      auto p = bl.cbegin();
+      decode(zone_size, p);
+      dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl;
+    }
+  }
+  {
+    bufferlist bl;
+    int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl);
+    if (r >= 0) {
+      auto p = bl.cbegin();
+      decode(first_sequential_zone, p);
+      dout(1) << __func__ << " first_sequential_zone 0x" << std::hex
+             << first_sequential_zone << std::dec << dendl;
+    }
+  }
+
   _set_per_pool_omap();
 
   _open_statfs();
@@ -11880,18 +11927,6 @@ std::string BlueStore::_zoned_key(uint64_t offset, const ghobject_t *oid) {
   return zone_key + object_key;
 }
 
-// For now, to avoid interface changes we piggyback zone_size (in MiB) and the
-// first sequential zone number onto min_alloc_size and pass it to functions
-// Allocator::create and FreelistManager::create.
-uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size) {
-  uint64_t zone_size = bdev->get_zone_size();
-  uint64_t zone_size_mb = zone_size / (1024 * 1024);
-  uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size;
-  min_alloc_size |= (zone_size_mb << 32);
-  min_alloc_size |= (first_seq_zone << 48);
-  return min_alloc_size;
-}
-
 #endif
 
 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
@@ -17349,7 +17384,9 @@ int BlueStore::store_allocator(Allocator* src_allocator)
 Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) {
   // create allocator
   uint64_t alloc_size = min_alloc_size;
-  Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size, "recovery");
+  Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size,
+                                      zone_size, first_sequential_zone,
+                                      "recovery");
   if (alloc) {
     return alloc;
   } else {
index 4a4171f41b0819da67abd30f840b7ad0ddd66fa6..c932ca36b528d02e716d1d92496a5b19678668ca 100644 (file)
@@ -2149,6 +2149,10 @@ private:
                std::numeric_limits<decltype(min_alloc_size)>::digits,
                "not enough bits for min_alloc_size");
 
+  // smr-only
+  uint64_t zone_size = 0;              ///< size of one SMR zone, in bytes
+  uint64_t first_sequential_zone = 0;  ///< first SMR zone that is sequential-only
+
   enum {
     // Please preserve the order since it's DB persistent
     OMAP_BULK = 0,
@@ -2419,7 +2423,6 @@ private:
 
 #ifdef HAVE_LIBZBD
   // Functions related to zoned storage.
-  uint64_t _zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size);
   void _zoned_update_cleaning_metadata(TransContext *txc);
   std::string _zoned_key(uint64_t offset, const ghobject_t *oid);
 #endif
index 4d375b430945611d6ee3fc1ffc74075119420834..54d27f10804be9c9a91b17f574ae84131e5c9fc8 100644 (file)
@@ -26,6 +26,7 @@ public:
   static void setup_merge_operators(KeyValueDB *db, const std::string &type);
 
   virtual int create(uint64_t size, uint64_t granularity,
+                    uint64_t zone_size, uint64_t first_sequential_zone,
                     KeyValueDB::Transaction txn) = 0;
 
   virtual int init(KeyValueDB *kvdb, bool db_in_read_only,
index 6144779cb68e06153d001de46fb1953c3c6c6f50..4a7a1f7ef5d4d865db357ad854d55efcf4f5d134 100644 (file)
 ZonedAllocator::ZonedAllocator(CephContext* cct,
                               int64_t size,
                               int64_t blk_size,
+                              int64_t _zone_size,
+                              int64_t _first_sequential_zone,
                               std::string_view name)
-    : Allocator(name, size, blk_size & 0x00000000ffffffff),
+    : Allocator(name, size, blk_size),
       cct(cct),
       num_free(0),
       size(size),
-      // To avoid interface changes, we piggyback zone size and the first
-      // sequential zone number onto the first 32 bits of 64-bit |blk_size|.
-      // The last 32 bits of |blk_size| is holding the actual block size.
-      block_size((blk_size & 0x00000000ffffffff)),
-      zone_size(((blk_size & 0x0000ffff00000000) >> 32) * 1024 * 1024),
-      first_seq_zone_num((blk_size >> 48) & 0xffff),
+      block_size(blk_size),
+      zone_size(_zone_size),
+      first_seq_zone_num(_first_sequential_zone),
       starting_zone_num(first_seq_zone_num),
       num_zones(size / zone_size),
       num_zones_to_clean(0) {
index 5deedcae9ab02d86c955842af9e1764472906de4..585e8943cec67138ae453a3c35281d2ce5051474 100644 (file)
@@ -70,7 +70,9 @@ class ZonedAllocator : public Allocator {
 
 public:
   ZonedAllocator(CephContext* cct, int64_t size, int64_t block_size,
-                 std::string_view name);
+                int64_t _zone_size,
+                int64_t _first_sequential_zone,
+                std::string_view name);
   ~ZonedAllocator() override;
 
   const char *get_type() const override {
index 3b31e202fae689a1215d2c19a44213d87323bb7b..60899939200e74339f6818bab9b2c1d43a3c0e8d 100644 (file)
@@ -81,16 +81,14 @@ ZonedFreelistManager::ZonedFreelistManager(
 int ZonedFreelistManager::create(
     uint64_t new_size,
     uint64_t granularity,
+    uint64_t new_zone_size,
+    uint64_t first_sequential_zone,
     KeyValueDB::Transaction txn) {
-  // To avoid interface changes, we piggyback zone size and the first sequential
-  // zone number onto the first 32 bits of 64-bit |granularity|.  The last 32
-  // bits of |granularity| is holding the actual allocation granularity, which
-  // is bytes_per_block.
   size = new_size;
-  bytes_per_block = granularity & 0x00000000ffffffff;
-  zone_size = ((granularity & 0x0000ffff00000000) >> 32) * 1024 * 1024;
+  bytes_per_block = granularity;
+  zone_size = new_zone_size;
   num_zones = size / zone_size;
-  starting_zone_num = (granularity & 0xffff000000000000) >> 48;
+  starting_zone_num = first_sequential_zone;
   enumerate_zone_num = ~0UL;
 
   ceph_assert(size % zone_size == 0);
index c6f9be3151e9ed4850f84a47e2628b6d912a813f..7ad072336985799dcfcb82015fe5a61311633ef2 100644 (file)
@@ -63,6 +63,8 @@ public:
 
   int create(uint64_t size,
             uint64_t granularity,
+            uint64_t zone_size,
+            uint64_t first_sequential_zone,
             KeyValueDB::Transaction txn) override;
 
   int init(KeyValueDB *kvdb,
index 40fbed12bd3a00a9a9c43818faa693af850b9c50..210bc6d9d74573cad89282172aedb7b2961308af 100644 (file)
@@ -26,7 +26,8 @@ public:
   void init_alloc(int64_t size, uint64_t min_alloc_size) {
     std::cout << "Creating alloc type " << string(GetParam()) << " \n";
     alloc.reset(Allocator::create(g_ceph_context, GetParam(), size,
-                                 min_alloc_size));
+                                 min_alloc_size,
+                                 256*1048576, 100*256*1048576ull));
   }
 
   void init_close() {
index 7927790c485f31f175d8229c217bf25817afb1ef..811cc92cdeaf4c1db0b7060c4f528b4ed6264d38 100644 (file)
@@ -274,7 +274,7 @@ int replay_free_dump_and_apply(char* fname,
 
   unique_ptr<Allocator> alloc;
   alloc.reset(Allocator::create(g_ceph_context, alloc_type,
-    capacity, alloc_unit, alloc_name));
+                               capacity, alloc_unit, 0, 0, alloc_name));
 
   auto it = o->find_first();
   while (!it.end()) {