]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os: static assign segments to each shard
authorchunmei <chunmei.liu@intel.com>
Tue, 4 Apr 2023 07:37:16 +0000 (07:37 +0000)
committerchunmei <chunmei.liu@intel.com>
Thu, 20 Apr 2023 20:05:05 +0000 (20:05 +0000)
and make device sharded

Signed-off-by: chunmei <chunmei.liu@intel.com>
src/crimson/os/seastore/device.h
src/crimson/os/seastore/seastore.cc
src/crimson/os/seastore/seastore.h
src/crimson/os/seastore/segment_manager.cc
src/crimson/os/seastore/segment_manager.h
src/crimson/os/seastore/segment_manager/block.cc
src/crimson/os/seastore/segment_manager/block.h

index c49d1ff6aa205a82b87fb8ad43660b246ee017c9..2cdc5d02e9bf694e1e62d34b0bbd206c8fdbb155 100644 (file)
@@ -45,6 +45,34 @@ struct device_config_t {
     denc(v.secondary_devices, p);
     DENC_FINISH(p);
   }
+  static device_config_t create_primary(
+    uuid_d new_osd_fsid,
+    device_id_t id,
+    device_type_t d_type,
+    secondary_device_set_t sds) {
+    return device_config_t{
+             true,
+             device_spec_t{
+               (magic_t)std::rand(),
+               d_type,
+               id},
+             seastore_meta_t{new_osd_fsid},
+             sds};
+   }
+  static device_config_t create_secondary(
+    uuid_d new_osd_fsid,
+    device_id_t id,
+    device_type_t d_type,
+    magic_t magic) {
+    return device_config_t{
+             false,
+             device_spec_t{
+               magic,
+               d_type,
+               id},
+             seastore_meta_t{new_osd_fsid},
+             secondary_device_set_t()};
+  }
 };
 
 std::ostream& operator<<(std::ostream&, const device_config_t&);
@@ -58,9 +86,41 @@ using DeviceRef = std::unique_ptr<Device>;
  * Represents a general device regardless of the underlying medium.
  */
 class Device {
+// interfaces used by the device as a whole (cross-shard level)
 public:
   virtual ~Device() {}
 
+  virtual seastar::future<> start() {
+    return seastar::now();
+  }
+
+  virtual seastar::future<> stop() {
+    return seastar::now();
+  }
+  // Called on a shard to obtain the device instance local to that shard;
+  virtual Device& get_sharded_device() {
+    return *this;
+  }
+
+  using access_ertr = crimson::errorator<
+    crimson::ct_error::input_output_error,
+    crimson::ct_error::permission_denied,
+    crimson::ct_error::enoent>;
+
+  using mkfs_ertr = access_ertr;
+  using mkfs_ret = mkfs_ertr::future<>;
+  virtual mkfs_ret mkfs(device_config_t) = 0;
+
+  using mount_ertr = access_ertr;
+  using mount_ret = access_ertr::future<>;
+  virtual mount_ret mount() = 0;
+
+  static seastar::future<DeviceRef> make_device(
+    const std::string &device,
+    device_type_t dtype);
+
+// interfaces used by each device shard
+public:
   virtual device_id_t get_device_id() const = 0;
 
   virtual magic_t get_magic() const = 0;
@@ -77,19 +137,6 @@ public:
 
   virtual secondary_device_set_t& get_secondary_devices() = 0;
 
-  using access_ertr = crimson::errorator<
-    crimson::ct_error::input_output_error,
-    crimson::ct_error::permission_denied,
-    crimson::ct_error::enoent>;
-
-  using mkfs_ertr = access_ertr;
-  using mkfs_ret = mkfs_ertr::future<>;
-  virtual mkfs_ret mkfs(device_config_t) = 0;
-
-  using mount_ertr = access_ertr;
-  using mount_ret = access_ertr::future<>;
-  virtual mount_ret mount() = 0;
-
   using close_ertr = crimson::errorator<
     crimson::ct_error::input_output_error>;
   virtual close_ertr::future<> close() = 0;
@@ -115,10 +162,6 @@ public:
       return read_ertr::make_ready_future<bufferptr>(std::move(*ptrref));
     });
   }
-
-  static seastar::future<DeviceRef> make_device(
-    const std::string &device,
-    device_type_t dtype);
 };
 
 }
index 95cef4d1d0e2552ca0a67197c10b1f797cfeb822..b44d6696701df12950d61aeffe672a52ced2266c 100644 (file)
@@ -124,7 +124,7 @@ SeaStore::Shard::Shard(
    throttler(
       get_conf<uint64_t>("seastore_max_concurrent_transactions"))
 {
-  device.reset(dev);
+  device = &(dev->get_sharded_device());
   register_metrics();
 }
 
@@ -200,67 +200,60 @@ seastar::future<> SeaStore::start()
 #else
   bool is_test = false;
 #endif
-  return shard_stores.start(root, nullptr, is_test)
-    .then([this] {
-    return shard_stores.invoke_on_all([](auto& local_store) {
-      return local_store.make_shard_stores();
-    });
+  using crimson::common::get_conf;
+  std::string type = get_conf<std::string>("seastore_main_device_type");
+  device_type_t d_type = string_to_device_type(type);
+  assert(d_type == device_type_t::SSD ||
+         d_type == device_type_t::RANDOM_BLOCK_SSD);
+
+  ceph_assert(root != "");
+  return Device::make_device(root, d_type
+  ).then([this](DeviceRef device_obj) {
+    device = std::move(device_obj);
+    return device->start();
+  }).then([this, is_test] {
+    ceph_assert(device);
+    return shard_stores.start(root, device.get(), is_test);
   });
 }
 
-seastar::future<> SeaStore::test_start(DeviceRef device)
+seastar::future<> SeaStore::test_start(DeviceRef device_obj)
 {
-  if (device) {
-    ceph_assert(root == "");
-    return shard_stores.start_single(root, device.release(), true);
-  } else {
-    ceph_assert(0 == "impossible no device");
-  }
+  ceph_assert(device_obj);
+  ceph_assert(root == "");
+  device = std::move(device_obj);
+  return shard_stores.start_single(root, device.get(), true);
 }
 
-
 seastar::future<> SeaStore::stop()
 {
   ceph_assert(seastar::this_shard_id() == primary_core);
-  return shard_stores.stop();
-}
-
-seastar::future<> SeaStore::Shard::make_shard_stores()
-{
-  if (root != "") {
-    using crimson::common::get_conf;
-    std::string type = get_conf<std::string>("seastore_main_device_type");
-    device_type_t d_type = string_to_device_type(type);
-    assert(d_type == device_type_t::SSD ||
-         d_type == device_type_t::RANDOM_BLOCK_SSD);
-
-    return Device::make_device(
-      root, d_type
-    ).then([this](DeviceRef device_obj) {
-      device = std::move(device_obj);
-    });
-  }
-  return seastar::now();
+  return seastar::do_for_each(secondaries, [](auto& sec_dev) {
+    return sec_dev->stop();
+  }).then([this] {
+    secondaries.clear();
+    if (device) {
+      return device->stop();
+    } else {
+      return seastar::now();
+    }
+  }).then([this] {
+    return shard_stores.stop();
+  });
 }
 
 SeaStore::mount_ertr::future<> SeaStore::test_mount()
 {
-
   ceph_assert(seastar::this_shard_id() == primary_core);
-  shard_stores.local().init_managers();
-    return shard_stores.local().get_transaction_manager()->mount(
-    ).handle_error(
-      crimson::ct_error::assert_all{
-        "Invalid error in SeaStore::test_mount"
-      }
-    );
+  return shard_stores.local().mount_managers();
 }
 
-SeaStore::mount_ertr::future<> SeaStore::Shard::mount()
+SeaStore::mount_ertr::future<> SeaStore::mount()
 {
+  ceph_assert(seastar::this_shard_id() == primary_core);
   return device->mount(
   ).safe_then([this] {
-    auto sec_devices = device->get_secondary_devices();
+    auto sec_devices = device->get_sharded_device().get_secondary_devices();
     return crimson::do_for_each(sec_devices, [this](auto& device_entry) {
       device_id_t id = device_entry.first;
       magic_t magic = device_entry.second.magic;
@@ -268,25 +261,49 @@ SeaStore::mount_ertr::future<> SeaStore::Shard::mount()
       std::string path =
         fmt::format("{}/block.{}.{}", root, dtype, std::to_string(id));
       return Device::make_device(path, dtype
-      ).then([this, magic](DeviceRef sec_dev) {
-        return sec_dev->mount(
-        ).safe_then([this, sec_dev=std::move(sec_dev), magic]() mutable {
-          boost::ignore_unused(magic);  // avoid clang warning;
-          assert(sec_dev->get_magic() == magic);
-          secondaries.emplace_back(std::move(sec_dev));
+      ).then([this, path, magic](DeviceRef sec_dev) {
+        return sec_dev->start(
+        ).then([this, magic, sec_dev = std::move(sec_dev)]() mutable {
+          return sec_dev->mount(
+          ).safe_then([this, sec_dev=std::move(sec_dev), magic]() mutable {
+            boost::ignore_unused(magic);  // avoid clang warning;
+            assert(sec_dev->get_sharded_device().get_magic() == magic);
+            secondaries.emplace_back(std::move(sec_dev));
+          });
+        }).safe_then([this] {
+          return set_secondaries();
         });
       });
+    }).safe_then([this] {
+      return shard_stores.invoke_on_all([](auto &local_store) {
+        return local_store.mount_managers();
+      });
     });
-  }).safe_then([this] {
-    init_managers();
-    return transaction_manager->mount();
   }).handle_error(
     crimson::ct_error::assert_all{
-      "Invalid error in Shard::mount"
+      "Invalid error in SeaStore::mount"
     }
   );
 }
 
+seastar::future<> SeaStore::Shard::mount_managers()
+{
+  init_managers();
+  return transaction_manager->mount(
+  ).handle_error(
+    crimson::ct_error::assert_all{
+      "Invalid error in mount_managers"
+  });
+}
+
+seastar::future<> SeaStore::umount()
+{
+  ceph_assert(seastar::this_shard_id() == primary_core);
+  return shard_stores.invoke_on_all([](auto &local_store) {
+    return local_store.umount();
+  });
+}
+
 seastar::future<> SeaStore::Shard::umount()
 {
   return [this] {
@@ -367,75 +384,12 @@ SeaStore::Shard::mkfs_managers()
   );
 }
 
-seastar::future<>
-SeaStore::Shard::mkfs(
-  secondary_device_set_t &sds,
-  uuid_d new_osd_fsid)
-{
-  device_type_t d_type = device->get_device_type();
-  device_id_t id = (d_type == device_type_t::RANDOM_BLOCK_SSD) ?
-    static_cast<device_id_t>(DEVICE_ID_RANDOM_BLOCK_MIN) : 0;
-
-  return device->mkfs(
-    device_config_t{
-      true,
-      device_spec_t{
-        (magic_t)std::rand(),
-        d_type,
-        id},
-      seastore_meta_t{new_osd_fsid},
-      sds}
-  ).safe_then([this] {
-    return crimson::do_for_each(secondaries, [](auto& sec_dev) {
-      return sec_dev->mount();
-    });
-  }).safe_then([this] {
-    return device->mount();
-  }).safe_then([this] {
-    return mkfs_managers();
-  }).handle_error(
-    crimson::ct_error::assert_all{
-      "Invalid error in SeaStore::Shard::mkfs"
-    }
-  );
-}
-
-seastar::future<> SeaStore::Shard::sec_mkfs(
-  const std::string path,
-  device_type_t dtype,
-  device_id_t id,
-  secondary_device_set_t &sds,
-  uuid_d new_osd_fsid)
-{
-  return Device::make_device(path, dtype
-  ).then([this, &sds, id, dtype, new_osd_fsid](DeviceRef sec_dev) {
-    magic_t magic = (magic_t)std::rand();
-    sds.emplace(
-      (device_id_t)id,
-      device_spec_t{magic, dtype, (device_id_t)id});
-    return sec_dev->mkfs(
-      device_config_t{
-        false,
-        device_spec_t{
-          magic,
-          dtype,
-          (device_id_t)id},
-        seastore_meta_t{new_osd_fsid},
-        secondary_device_set_t()}
-    ).safe_then([this, sec_dev=std::move(sec_dev), id]() mutable {
-      LOG_PREFIX(SeaStore::Shard::sec_mkfs);
-      DEBUG("mkfs: finished for device {}", id);
-      secondaries.emplace_back(std::move(sec_dev));
-    }).handle_error(crimson::ct_error::assert_all{"not possible"});
-  });
-}
-
-seastar::future<> SeaStore::_mkfs(uuid_d new_osd_fsid)
+seastar::future<> SeaStore::set_secondaries()
 {
-  ceph_assert(seastar::this_shard_id() == primary_core);
-  return shard_stores.local().mkfs_managers(
-  ).then([this, new_osd_fsid] {
-    return prepare_meta(new_osd_fsid);
+  auto sec_dev_ite = secondaries.rbegin();
+  Device* sec_dev = sec_dev_ite->get();
+  return shard_stores.invoke_on_all([sec_dev](auto &local_store) {
+    local_store.set_secondaries(sec_dev->get_sharded_device());
   });
 }
 
@@ -447,7 +401,10 @@ SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid)
     if (done == 0) {
       return seastar::now();
     } 
-    return _mkfs(new_osd_fsid);
+    return shard_stores.local().mkfs_managers(
+    ).then([this, new_osd_fsid] {
+      return prepare_meta(new_osd_fsid);
+    });
   });
 }
 
@@ -481,9 +438,8 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
       return seastar::now();
     } else {
       return seastar::do_with(
-        std::vector<secondary_device_set_t>(),
+        secondary_device_set_t(),
         [this, new_osd_fsid](auto& sds) {
-        sds.resize(seastar::smp::count);
         auto fut = seastar::now();
         LOG_PREFIX(SeaStore::mkfs);
         DEBUG("root: {}", root);
@@ -510,15 +466,22 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
                 }
                 auto id = std::stoi(entry_name.substr(dtype_end + 1));
                 std::string path = fmt::format("{}/{}", root, entry_name);
-                return shard_stores.invoke_on_all(
-                  [&sds, id, path, dtype, new_osd_fsid]
-                  (auto &local_store) {
-                  return local_store.sec_mkfs(
-                    path,
-                    dtype,
-                    id,
-                    sds[seastar::this_shard_id()],
-                    new_osd_fsid);
+                return Device::make_device(path, dtype
+                ).then([this, &sds, id, dtype, new_osd_fsid](DeviceRef sec_dev) {
+                  auto p_sec_dev = sec_dev.get();
+                  secondaries.emplace_back(std::move(sec_dev));
+                  return p_sec_dev->start(
+                  ).then([&sds, id, dtype, new_osd_fsid, p_sec_dev]() {
+                    magic_t magic = (magic_t)std::rand();
+                    sds.emplace(
+                      (device_id_t)id,
+                      device_spec_t{magic, dtype, (device_id_t)id});
+                    return p_sec_dev->mkfs(device_config_t::create_secondary(
+                      new_osd_fsid, id, dtype, magic)
+                    ).handle_error(crimson::ct_error::assert_all{"not possible"});
+                  });
+                }).then([this] {
+                   return set_secondaries();
                 });
               }
               return seastar::now();
@@ -527,17 +490,37 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
           });
         }
         return fut.then([this, &sds, new_osd_fsid] {
-          return shard_stores.invoke_on_all(
-            [&sds, new_osd_fsid](auto &local_store) {
-            return local_store.mkfs(
-              sds[seastar::this_shard_id()], new_osd_fsid);
+          device_id_t id = 0;
+          device_type_t d_type = device->get_device_type();
+          assert(d_type == device_type_t::SSD ||
+            d_type == device_type_t::RANDOM_BLOCK_SSD);
+          if (d_type == device_type_t::RANDOM_BLOCK_SSD) {
+            id = static_cast<device_id_t>(DEVICE_ID_RANDOM_BLOCK_MIN);
+          }
+
+          return device->mkfs(
+            device_config_t::create_primary(new_osd_fsid, id, d_type, sds)
+          );
+        }).safe_then([this] {
+          return crimson::do_for_each(secondaries, [](auto& sec_dev) {
+            return sec_dev->mount();
           });
         });
-      }).then([this, new_osd_fsid] {
+      }).safe_then([this] {
+        return device->mount();
+      }).safe_then([this] {
+        return shard_stores.invoke_on_all([] (auto &local_store) {
+          return local_store.mkfs_managers();
+        });
+      }).safe_then([this, new_osd_fsid] {
         return prepare_meta(new_osd_fsid);
-      }).then([this] {
+      }).safe_then([this] {
        return umount();
-      });
+      }).handle_error(
+        crimson::ct_error::assert_all{
+          "Invalid error in SeaStore::mkfs"
+        }
+      );
     }
   });
 }
@@ -2057,12 +2040,8 @@ void SeaStore::Shard::init_managers()
   collection_manager.reset();
   onode_manager.reset();
 
-  std::vector<Device*> sec_devices;
-  for (auto &dev : secondaries) {
-    sec_devices.emplace_back(dev.get());
-  }
   transaction_manager = make_transaction_manager(
-      device.get(), sec_devices, is_test);
+      device, secondaries, is_test);
   collection_manager = std::make_unique<collection_manager::FlatCollectionManager>(
       *transaction_manager);
   onode_manager = std::make_unique<crimson::os::seastore::onode::FLTreeOnodeManager>(
index 8e43f275f18fceafcef2948fb55052853980c378..df4323df55736d95b0b8a8f99544f310055b1838 100644 (file)
@@ -173,13 +173,13 @@ public:
 
   // only exposed to SeaStore
   public:
-    mount_ertr::future<> mount();
-
     seastar::future<> umount();
+    // init managers and mount transaction_manager
+    seastar::future<> mount_managers();
 
-    seastar::future<> mkfs(
-      secondary_device_set_t &sds,
-      uuid_d new_osd_fsid);
+    void set_secondaries(Device& sec_dev) {
+      secondaries.emplace_back(&sec_dev);
+    }
 
     using coll_core_t = FuturizedStore::coll_core_t;
     seastar::future<std::vector<coll_core_t>> list_collections();
@@ -190,28 +190,11 @@ public:
     store_statfs_t stat() const;
 
     uuid_d get_fsid() const;
-    // for each shard store make device
-    seastar::future<> make_shard_stores();
 
     seastar::future<> mkfs_managers();
 
     void init_managers();
 
-    TransactionManagerRef& get_transaction_manager() {
-      return transaction_manager;
-    }
-    // for secondaries device mkfs
-    seastar::future<> sec_mkfs(
-      const std::string path,
-      device_type_t dtype,
-      device_id_t id,
-      secondary_device_set_t &sds,
-      uuid_d new_osd_fsid);
-
-    DeviceRef get_primary_device_ref() {
-      return std::move(device);
-    }
-
   private:
     struct internal_context_t {
       CollectionRef ch;
@@ -452,11 +435,11 @@ public:
 
   private:
     std::string root;
-    DeviceRef device;
+    Device* device;
     const uint32_t max_object_size;
     bool is_test;
 
-    std::vector<DeviceRef> secondaries;
+    std::vector<Device*> secondaries;
     TransactionManagerRef transaction_manager;
     CollectionManagerRef collection_manager;
     OnodeManagerRef onode_manager;
@@ -476,24 +459,8 @@ public:
   seastar::future<> start() final;
   seastar::future<> stop() final;
 
-  mount_ertr::future<> mount() final {
-    ceph_assert(seastar::this_shard_id() == primary_core);
-    return shard_stores.invoke_on_all(
-      [](auto &local_store) {
-      return local_store.mount().handle_error(
-        crimson::ct_error::assert_all{
-        "Invalid error in SeaStore::mount"
-      });
-    });
-  }
-
-  seastar::future<> umount() final {
-    ceph_assert(seastar::this_shard_id() == primary_core);
-    return shard_stores.invoke_on_all(
-      [](auto &local_store) {
-      return local_store.umount();
-    });
-  }
+  mount_ertr::future<> mount() final;
+  seastar::future<> umount() final;
 
   mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final;
   seastar::future<store_statfs_t> stat() const final;
@@ -532,8 +499,7 @@ public:
   mkfs_ertr::future<> test_mkfs(uuid_d new_osd_fsid);
 
   DeviceRef get_primary_device_ref() {
-    ceph_assert(seastar::this_shard_id() == primary_core);
-    return shard_stores.local().get_primary_device_ref();
+    return std::move(device);
   }
 
   seastar::future<> test_start(DeviceRef dev);
@@ -543,11 +509,13 @@ private:
 
   seastar::future<> prepare_meta(uuid_d new_osd_fsid);
 
-  seastar::future<> _mkfs(uuid_d new_osd_fsid);
+  seastar::future<> set_secondaries();
 
 private:
   std::string root;
   MDStoreRef mdstore;
+  DeviceRef device;
+  std::vector<DeviceRef> secondaries;
   seastar::sharded<SeaStore::Shard> shard_stores;
 };
 
index 44192965fcee24e19bf656fb7c0444c1073e65ea..098a9b068f8fe61afd4424d9c3ce4c8e88826c1b 100644 (file)
@@ -13,16 +13,29 @@ SET_SUBSYS(seastore_device);
 
 namespace crimson::os::seastore {
 
+std::ostream& operator<<(std::ostream& out, const block_shard_info_t& sf)
+{
+  out << "("
+      << "size=" << sf.size
+      << ", segments=" <<sf.segments
+      << ", tracker_offset=" <<sf.tracker_offset
+      << ", first_segment_offset=" <<sf.first_segment_offset
+      <<")";
+  return out;
+}
+
 std::ostream& operator<<(std::ostream& out, const block_sm_superblock_t& sb)
 {
   out << "superblock("
-      << "size=" << sb.size
+      << "shard_num=" << sb.shard_num
       << ", segment_size=" << sb.segment_size
       << ", block_size=" << sb.block_size
-      << ", segments=" << sb.segments
-      << ", tracker_offset=" << sb.tracker_offset
-      << ", first_segment_offset=" << sb.first_segment_offset
-      << ", config=" << sb.config
+      << ", shard_info:";
+  for (auto &sf : sb.shard_infos) {
+    out << sf
+        << ",";
+  }
+  out << "config=" << sb.config
       << ")";
   return out;
 }
@@ -52,7 +65,7 @@ LOG_PREFIX(SegmentManager::get_segment_manager);
     static_cast<size_t>(0),
     [&](auto &nr_zones) {
       return seastar::open_file_dma(
-       device + "/block" + std::to_string(seastar::this_shard_id()),
+       device + "/block",
        seastar::open_flags::rw
       ).then([&](auto file) {
        return seastar::do_with(
@@ -67,11 +80,11 @@ LOG_PREFIX(SegmentManager::get_segment_manager);
        if (nr_zones != 0) {
          return std::make_unique<
            segment_manager::zns::ZNSSegmentManager
-           >(device + "/block" + std::to_string(seastar::this_shard_id()));
+           >(device + "/block");
        } else {
          return std::make_unique<
            segment_manager::block::BlockSegmentManager
-           >(device + "/block" + std::to_string(seastar::this_shard_id()), dtype);
+           >(device + "/block", dtype);
        }
       });
     });
@@ -79,7 +92,7 @@ LOG_PREFIX(SegmentManager::get_segment_manager);
   return seastar::make_ready_future<crimson::os::seastore::SegmentManagerRef>(
     std::make_unique<
       segment_manager::block::BlockSegmentManager
-    >(device + "/block" + std::to_string(seastar::this_shard_id()), dtype));
+    >(device + "/block", dtype));
 #endif
 }
 
index b3e0d1618467261d6370861e663d5b0a6c8ced0b..1669d124a6b8ac079bb09d6d0b7f8a44707a1539 100644 (file)
 
 namespace crimson::os::seastore {
 
+using std::vector;
+struct block_shard_info_t {
+  std::size_t size;
+  std::size_t segments;
+  uint64_t tracker_offset;
+  uint64_t first_segment_offset;
+
+  DENC(block_shard_info_t, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.size, p);
+    denc(v.segments, p);
+    denc(v.tracker_offset, p);
+    denc(v.first_segment_offset, p);
+    DENC_FINISH(p);
+  }
+};
+
 struct block_sm_superblock_t {
-  size_t size = 0;
+  unsigned int shard_num = 0;
   size_t segment_size = 0;
   size_t block_size = 0;
 
-  size_t segments = 0;
-  uint64_t tracker_offset = 0;
-  uint64_t first_segment_offset = 0;
+  std::vector<block_shard_info_t> shard_infos;
 
   device_config_t config;
 
   DENC(block_sm_superblock_t, v, p) {
     DENC_START(1, 1, p);
-    denc(v.size, p);
+    denc(v.shard_num, p);
     denc(v.segment_size, p);
     denc(v.block_size, p);
-    denc(v.segments, p);
-    denc(v.tracker_offset, p);
-    denc(v.first_segment_offset, p);
+    denc(v.shard_infos, p);
     denc(v.config, p);
     DENC_FINISH(p);
   }
 
   void validate() const {
+    ceph_assert(shard_num == seastar::smp::count);
     ceph_assert(block_size > 0);
     ceph_assert(segment_size > 0 &&
                 segment_size % block_size == 0);
     ceph_assert_always(segment_size <= SEGMENT_OFF_MAX);
-    ceph_assert(size > segment_size &&
-                size % block_size == 0);
-    ceph_assert_always(size <= DEVICE_OFF_MAX);
-    ceph_assert(segments > 0);
-    ceph_assert_always(segments <= DEVICE_SEGMENT_ID_MAX);
-    ceph_assert(tracker_offset > 0 &&
-                tracker_offset % block_size == 0);
-    ceph_assert(first_segment_offset > tracker_offset &&
-                first_segment_offset % block_size == 0);
+    for (unsigned int i = 0; i < seastar::smp::count; i ++) {
+      ceph_assert(shard_infos[i].size > segment_size &&
+                  shard_infos[i].size % block_size == 0);
+      ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX);
+      ceph_assert(shard_infos[i].segments > 0);
+      ceph_assert_always(shard_infos[i].segments <= DEVICE_SEGMENT_ID_MAX);
+      ceph_assert(shard_infos[i].tracker_offset > 0 &&
+                  shard_infos[i].tracker_offset % block_size == 0);
+      ceph_assert(shard_infos[i].first_segment_offset > shard_infos[i].tracker_offset &&
+                  shard_infos[i].first_segment_offset % block_size == 0);
+    }
     ceph_assert(config.spec.magic != 0);
     ceph_assert(get_default_backend_of_device(config.spec.dtype) ==
                backend_type_t::SEGMENTED);
@@ -75,6 +91,7 @@ struct block_sm_superblock_t {
   }
 };
 
+std::ostream& operator<<(std::ostream&, const block_shard_info_t&);
 std::ostream& operator<<(std::ostream&, const block_sm_superblock_t&);
 
 class Segment : public boost::intrusive_ref_counter<
@@ -186,10 +203,14 @@ public:
 
 }
 
+WRITE_CLASS_DENC(
+  crimson::os::seastore::block_shard_info_t
+)
 WRITE_CLASS_DENC(
   crimson::os::seastore::block_sm_superblock_t
 )
 
 #if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::block_shard_info_t> : fmt::ostream_formatter {};
 template <> struct fmt::formatter<crimson::os::seastore::block_sm_superblock_t> : fmt::ostream_formatter {};
 #endif
index 8ed119cb9f7070c3a751290db4c55e62a696992e..ca1060d09816d00382b1855beba0c658f9f9f6d2 100644 (file)
@@ -195,7 +195,7 @@ SegmentStateTracker::read_in(
     bptr.length(),
     bptr);
 }
-
+using std::vector;
 static
 block_sm_superblock_t make_superblock(
   device_id_t device_id,
@@ -206,39 +206,44 @@ block_sm_superblock_t make_superblock(
   using crimson::common::get_conf;
 
   auto config_size = get_conf<Option::size_t>(
-    "seastore_device_size")/seastar::smp::count;
+    "seastore_device_size");
 
   size_t size = (data.size == 0) ? config_size : data.size;
 
   auto config_segment_size = get_conf<Option::size_t>(
     "seastore_segment_size");
   size_t raw_segments = size / config_segment_size;
-  size_t tracker_size = SegmentStateTracker::get_raw_size(
-    raw_segments,
+  size_t shard_tracker_size = SegmentStateTracker::get_raw_size(
+    raw_segments / seastar::smp::count,
     data.block_size);
-  size_t tracker_off = data.block_size;
-  size_t first_seg_off = tracker_size + tracker_off;
-  size_t segments = (size - first_seg_off) / config_segment_size;
-  size_t available_size = segments * config_segment_size;
+  size_t total_tracker_size = shard_tracker_size * seastar::smp::count;
+  size_t tracker_off = data.block_size;   // superblock occupies the first block
+  size_t segments = (size - tracker_off - total_tracker_size) / config_segment_size;
+  size_t segments_per_shard = segments / seastar::smp::count;
+
+  vector<block_shard_info_t> shard_infos(seastar::smp::count);
+  for (unsigned int i = 0; i < seastar::smp::count; i++) {
+    shard_infos[i].size = segments_per_shard * config_segment_size;
+    shard_infos[i].segments = segments_per_shard;
+    shard_infos[i].tracker_offset = tracker_off + i * shard_tracker_size;
+    shard_infos[i].first_segment_offset = tracker_off + total_tracker_size
+                             + i * segments_per_shard * config_segment_size;
+  }
 
-  INFO("{} disk_size={}, available_size={}, segment_size={}, segments={}, "
-       "block_size={}, tracker_off={}, first_seg_off={}",
+  INFO("{} disk_size={}, segment_size={}, block_size={}",
        device_id_printer_t{device_id},
        size,
-       available_size,
        config_segment_size,
-       segments,
-       data.block_size,
-       tracker_off,
-       first_seg_off);
+       data.block_size);
+  for (unsigned int i = 0; i < seastar::smp::count; i++) {
+    INFO("shard {} infos: {}", i, shard_infos[i]);
+  }
 
   return block_sm_superblock_t{
-    available_size,
+    seastar::smp::count,
     config_segment_size,
     data.block_size,
-    segments,
-    tracker_off,
-    first_seg_off,
+    shard_infos,
     std::move(sm_config)
   };
 }
@@ -449,7 +454,8 @@ Segment::close_ertr::future<> BlockSegmentManager::segment_close(
   stats.closed_segments_unused_bytes += unused_bytes;
   stats.metadata_write.increment(tracker->get_size());
   return tracker->write_out(
-      get_device_id(), device, superblock.tracker_offset);
+      get_device_id(), device,
+      shard_info.tracker_offset);
 }
 
 Segment::write_ertr::future<> BlockSegmentManager::segment_write(
@@ -474,7 +480,18 @@ BlockSegmentManager::~BlockSegmentManager()
 
 BlockSegmentManager::mount_ret BlockSegmentManager::mount()
 {
-  LOG_PREFIX(BlockSegmentManager::mount);
+  return shard_devices.invoke_on_all([](auto &local_device) {
+    return local_device.shard_mount(
+    ).handle_error(
+      crimson::ct_error::assert_all{
+        "Invalid error in BlockSegmentManager::mount"
+    });
+  });
+}
+
+BlockSegmentManager::mount_ret BlockSegmentManager::shard_mount()
+{
+  LOG_PREFIX(BlockSegmentManager::shard_mount);
   return open_device(
     device_path
   ).safe_then([=, this](auto p) {
@@ -483,19 +500,20 @@ BlockSegmentManager::mount_ret BlockSegmentManager::mount()
     return read_superblock(device, sd);
   }).safe_then([=, this](auto sb) {
     set_device_id(sb.config.spec.id);
-    INFO("{} read {}", device_id_printer_t{get_device_id()}, sb);
+    shard_info = sb.shard_infos[seastar::this_shard_id()];
+    INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info);
     sb.validate();
     superblock = sb;
     stats.data_read.increment(
         ceph::encoded_sizeof<block_sm_superblock_t>(superblock));
     tracker = std::make_unique<SegmentStateTracker>(
-      superblock.segments,
+      shard_info.segments,
       superblock.block_size);
     stats.data_read.increment(tracker->get_size());
     return tracker->read_in(
       get_device_id(),
       device,
-      superblock.tracker_offset
+      shard_info.tracker_offset
     ).safe_then([this] {
       for (device_segment_id_t i = 0; i < tracker->get_capacity(); ++i) {
        if (tracker->get(i) == segment_state_t::OPEN) {
@@ -504,7 +522,8 @@ BlockSegmentManager::mount_ret BlockSegmentManager::mount()
       }
       stats.metadata_write.increment(tracker->get_size());
       return tracker->write_out(
-          get_device_id(), device, superblock.tracker_offset);
+          get_device_id(), device,
+          shard_info.tracker_offset);
     });
   }).safe_then([this, FNAME] {
     INFO("{} complete", device_id_printer_t{get_device_id()});
@@ -515,7 +534,22 @@ BlockSegmentManager::mount_ret BlockSegmentManager::mount()
 BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(
   device_config_t sm_config)
 {
-  LOG_PREFIX(BlockSegmentManager::mkfs);
+  return shard_devices.local().primary_mkfs(sm_config
+  ).safe_then([this] {
+    return shard_devices.invoke_on_all([](auto &local_device) {
+      return local_device.shard_mkfs(
+      ).handle_error(
+        crimson::ct_error::assert_all{
+          "Invalid error in BlockSegmentManager::mkfs"
+      });
+    });
+  });
+}
+
+BlockSegmentManager::mkfs_ret BlockSegmentManager::primary_mkfs(
+  device_config_t sm_config)
+{
+  LOG_PREFIX(BlockSegmentManager::primary_mkfs);
   ceph_assert(sm_config.spec.dtype == superblock.config.spec.dtype);
   set_device_id(sm_config.spec.id);
   INFO("{} path={}, {}",
@@ -530,8 +564,7 @@ BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(
     check_create_device_ret maybe_create = check_create_device_ertr::now();
     using crimson::common::get_conf;
     if (get_conf<bool>("seastore_block_create")) {
-      auto size =
-        get_conf<Option::size_t>("seastore_device_size")/seastar::smp::count;
+      auto size = get_conf<Option::size_t>("seastore_device_size");
       maybe_create = check_create_device(device_path, size);
     }
 
@@ -544,12 +577,6 @@ BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(
       stats.metadata_write.increment(
           ceph::encoded_sizeof<block_sm_superblock_t>(sb));
       return write_superblock(get_device_id(), device, sb);
-    }).safe_then([&, FNAME, this] {
-      DEBUG("{} superblock written", device_id_printer_t{get_device_id()});
-      tracker.reset(new SegmentStateTracker(sb.segments, sb.block_size));
-      stats.metadata_write.increment(tracker->get_size());
-      return tracker->write_out(
-          get_device_id(), device, sb.tracker_offset);
     }).finally([&] {
       return device.close();
     }).safe_then([FNAME, this] {
@@ -559,6 +586,34 @@ BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(
   });
 }
 
+BlockSegmentManager::mkfs_ret BlockSegmentManager::shard_mkfs()
+{
+  LOG_PREFIX(BlockSegmentManager::shard_mkfs);
+  return open_device(
+    device_path
+  ).safe_then([this](auto p) {
+    device = std::move(p.first);
+    auto sd = p.second;
+    return read_superblock(device, sd);
+  }).safe_then([this, FNAME](auto sb) {
+    set_device_id(sb.config.spec.id);
+    shard_info = sb.shard_infos[seastar::this_shard_id()];
+    INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info);
+    sb.validate();
+    tracker.reset(new SegmentStateTracker(
+      shard_info.segments, sb.block_size));
+    stats.metadata_write.increment(tracker->get_size());
+    return tracker->write_out(
+      get_device_id(), device,
+      shard_info.tracker_offset);
+  }).finally([this] {
+    return device.close();
+  }).safe_then([FNAME, this] {
+    INFO("{} complete", device_id_printer_t{get_device_id()});
+    return mkfs_ertr::now();
+  });
+}
+
 BlockSegmentManager::close_ertr::future<> BlockSegmentManager::close()
 {
   LOG_PREFIX(BlockSegmentManager::close);
@@ -589,7 +644,8 @@ SegmentManager::open_ertr::future<SegmentRef> BlockSegmentManager::open(
   tracker->set(s_id, segment_state_t::OPEN);
   stats.metadata_write.increment(tracker->get_size());
   return tracker->write_out(
-      get_device_id(), device, superblock.tracker_offset
+      get_device_id(), device,
+      shard_info.tracker_offset
   ).safe_then([this, id, FNAME] {
     ++stats.opened_segments;
     DEBUG("{} done", id);
@@ -622,7 +678,8 @@ SegmentManager::release_ertr::future<> BlockSegmentManager::release(
   ++stats.released_segments;
   stats.metadata_write.increment(tracker->get_size());
   return tracker->write_out(
-      get_device_id(), device, superblock.tracker_offset);
+      get_device_id(), device,
+      shard_info.tracker_offset);
 }
 
 SegmentManager::read_ertr::future<> BlockSegmentManager::read(
index 4fa715ba71ff483e50fe39b6661ec2f8c21f1905..495d0d104516a62bdd461ac21fdae6c16ccb9e92 100644 (file)
@@ -110,11 +110,24 @@ public:
  * state analagous to that of the segments of a zns device.
  */
 class BlockSegmentManager final : public SegmentManager {
+// interfaces used by Device
 public:
+  seastar::future<> start() {
+    return shard_devices.start(device_path, superblock.config.spec.dtype);
+  }
+
+  seastar::future<> stop() {
+    return shard_devices.stop();
+  }
+
+  Device& get_sharded_device() final {
+    return shard_devices.local();
+  }
   mount_ret mount() final;
 
   mkfs_ret mkfs(device_config_t) final;
-
+// interfaces used by each shard device
+public:
   close_ertr::future<> close();
 
   BlockSegmentManager(
@@ -140,7 +153,7 @@ public:
     return superblock.config.spec.dtype;
   }
   size_t get_available_size() const final {
-    return superblock.size;
+    return shard_info.size;
   }
   extent_len_t get_block_size() const {
     return superblock.block_size;
@@ -205,6 +218,7 @@ private:
 
   std::string device_path;
   std::unique_ptr<SegmentStateTracker> tracker;
+  block_shard_info_t shard_info;
   block_sm_superblock_t superblock;
   seastar::file device;
 
@@ -218,7 +232,7 @@ private:
 
   size_t get_offset(paddr_t addr) {
     auto& seg_addr = addr.as_seg_paddr();
-    return superblock.first_segment_offset +
+    return shard_info.first_segment_offset +
       (seg_addr.get_segment_id().device_segment_id() * superblock.segment_size) +
       seg_addr.get_segment_off();
   }
@@ -233,6 +247,16 @@ private:
 
   Segment::close_ertr::future<> segment_close(
       segment_id_t id, segment_off_t write_pointer);
+
+private:
+  // shard 0 mkfs
+  mkfs_ret primary_mkfs(device_config_t);
+  // all shards mkfs
+  mkfs_ret shard_mkfs();
+  // all shards mount
+  mount_ret shard_mount();
+
+  seastar::sharded<BlockSegmentManager> shard_devices;
 };
 
 }