cout << " vs " << estddev << std::endl;
}
}
+
// Shape of a synthetic CRUSH cluster used by the MSR tests: how many
// hosts/osds exist and how many of them a rule is expected to map.
struct cluster_test_spec_t {
  const int num_osds_per_host;
  const int num_hosts;

  const int num_hosts_mapped;       // hosts a rule's mapping should span
  const int num_mapped_per_host;    // osds chosen from each mapped host
  const int num_mapped_size;        // total number of mapped positions

  const int num_osds;               // derived: num_osds_per_host * num_hosts

  cluster_test_spec_t(
    int num_osds_per_host, int num_hosts,
    int num_hosts_mapped, int num_mapped_per_host, int num_mapped_size)
    : num_osds_per_host(num_osds_per_host), num_hosts(num_hosts),
      num_hosts_mapped(num_hosts_mapped),
      num_mapped_per_host(num_mapped_per_host),
      num_mapped_size(num_mapped_size),
      num_osds(num_osds_per_host * num_hosts) {}

  // Record a test failure (non-fatal) if osd is not a valid osd id.
  void validate_osd(int osd) const {
    EXPECT_GE(osd, 0);
    EXPECT_LT(osd, num_osds);
  }

  // Non-reporting variant of validate_osd(); true iff osd is in range.
  bool check_osd(int osd) const {
    return osd >= 0 && osd < num_osds;
  }

  // Hard-asserts (aborts) on an invalid host id, unlike validate_osd().
  void validate_host(int host) const {
    assert(host >= 0);
    assert(host < num_hosts);
  }

  // Half-open range [first, end) of osd ids living on `host`.
  // Relies on the osd ids being assigned contiguously per host.
  std::pair<int, int> host_to_osd_range(int host) const {
    validate_host(host);
    auto first = host * num_osds_per_host;
    return std::make_pair(first, first + num_osds_per_host);
  }

  // Inverse of host_to_osd_range: which host an osd id belongs to.
  int osd_to_host(int osd) const {
    validate_osd(osd);
    return osd / num_osds_per_host;
  }
};
+
// CRUSH bucket type ids for the synthetic hierarchy (root -> host -> osd).
static constexpr int ROOT_TYPE = 2;
static constexpr int HOST_TYPE = 1;
static constexpr int OSD_TYPE = 0;
+std::pair<int, std::unique_ptr<CrushWrapper>> create_crush_heirarchy(
+ CephContext *cct,
+ const cluster_test_spec_t &spec)
+{
+ auto c = std::make_unique<CrushWrapper>();
+ c->create();
+ c->set_tunables_optimal();
+
+
+ c->set_type_name(ROOT_TYPE, "root");
+ c->set_type_name(HOST_TYPE, "host");
+ c->set_type_name(OSD_TYPE, "osd");
+
+ int rootno;
+ c->add_bucket(0, CRUSH_BUCKET_STRAW2, CRUSH_HASH_RJENKINS1,
+ ROOT_TYPE, 0, nullptr, nullptr, &rootno);
+ c->set_item_name(rootno, "default");
+
+ for (auto host_id = 0; host_id < spec.num_hosts; ++host_id) {
+ const std::string host_name = fmt::format("host{}", host_id);
+ const auto first_host_osd = host_id * spec.num_osds_per_host;
+ const auto next_first_host_osd = first_host_osd + spec.num_osds_per_host;
+ for (auto osd_id = first_host_osd; osd_id < next_first_host_osd; ++osd_id) {
+ const std::string osd_name = fmt::format("osd{}", osd_id);
+ auto ret = c->insert_item(
+ cct, osd_id, 1.0, osd_name,
+ {{ "root", "default"}, {"host", host_name}});
+ EXPECT_EQ(ret, 0);
+ }
+ }
+
+ c->finalize();
+ return std::make_pair(rootno, std::move(c));
+}
+
+std::vector<uint32_t> create_weight_vector(
+ const cluster_test_spec_t &spec)
+{
+ return std::vector<uint32_t>(spec.num_osds, CEPH_OSD_IN);
+}
+
+std::vector<uint32_t> create_weight_vector_first_osd_out(
+ const cluster_test_spec_t &spec,
+ const std::vector<int> &mapping)
+{
+ auto weights = create_weight_vector(spec);
+ spec.validate_osd(mapping[0]);
+ weights[mapping[0]] = CEPH_OSD_OUT;
+ return weights;
+}
+
+std::vector<uint32_t> create_weight_vector_first_host_out(
+ const cluster_test_spec_t &spec,
+ const std::vector<int> &mapping)
+{
+ auto weights = create_weight_vector(spec);
+ const auto [first, end] = spec.host_to_osd_range(spec.osd_to_host(mapping[0]));
+ for (auto i = first; i < end; ++i) {
+ weights[i] = CEPH_OSD_OUT;
+ }
+ return weights;
+}
+
// Expected relationship between a pg mapping before and after a weight
// change, checked positionally by compare_mappings().
enum class mapping_change_t {
  SAME,       // position maps to the same osd
  FAILURE,    // position no longer mapped (CRUSH_ITEM_NONE)
  SAME_HOST,  // different osd on the same host
  NEW_HOST    // different osd on a different host
};
+void compare_mappings(
+ const cluster_test_spec_t &spec,
+ const std::vector<int> &before,
+ const std::vector<int> &after,
+ mapping_change_t expectation,
+ const std::pair<int, int> &range)
+{
+ const auto &[begin, end] = range;
+ for (auto i = begin; i < end; ++i) {
+ switch (expectation) {
+ case mapping_change_t::SAME:
+ EXPECT_EQ(before[i], after[i]);
+ break;
+ case mapping_change_t::FAILURE:
+ EXPECT_EQ(CRUSH_ITEM_NONE, after[i]);
+ break;
+ case mapping_change_t::SAME_HOST:
+ EXPECT_NE(before[i], after[i]);
+ if (!spec.check_osd(after[i])) {
+ spec.validate_osd(after[i]);
+ } else {
+ EXPECT_EQ(spec.osd_to_host(before[i]), spec.osd_to_host(after[i]));
+ }
+ break;
+ case mapping_change_t::NEW_HOST:
+ EXPECT_NE(before[i], after[i]);
+ if (!spec.check_osd(after[i])) {
+ spec.validate_osd(after[i]);
+ } else {
+ EXPECT_NE(spec.osd_to_host(before[i]), spec.osd_to_host(after[i]));
+ }
+ break;
+ }
+ }
+}
+
+std::vector<int> get_mapping(
+ const cluster_test_spec_t &spec,
+ CrushWrapper &c,
+ const std::vector<uint32_t> &weights,
+ int ruleno)
+{
+ std::vector<int> out;
+ c.do_rule(
+ ruleno, 0 /* seed */, out, spec.num_mapped_size,
+ weights,
+ 0);
+ EXPECT_EQ(std::size(out), spec.num_mapped_size);
+ return out;
+}
+
+unsigned count_mapped(const auto &v) {
+ unsigned ret = 0;
+ for (const auto &i : v) ret += (i != CRUSH_ITEM_NONE);
+ return ret;
+}
+
+TEST_F(CRUSHTest, msr_4_host_2_choose_rule) {
+ cluster_test_spec_t spec{3, 4, 3, 1, 3};
+ auto [rootno, c] = create_crush_heirarchy(cct, spec);
+
+ auto ruleno = c->add_rule(-1, 4, CRUSH_RULE_TYPE_MSR_INDEP);
+ EXPECT_EQ(0, c->set_rule_step_take(ruleno, 0, rootno));
+ EXPECT_EQ(
+ 0, c->set_rule_step_choose_msr(ruleno, 1, spec.num_hosts_mapped, HOST_TYPE));
+ EXPECT_EQ(
+ 0,
+ c->set_rule_step_choose_msr(
+ ruleno, 2, 1, OSD_TYPE));
+ EXPECT_EQ(0, c->set_rule_step_emit(ruleno, 3));
+
+ auto weights_all_in = create_weight_vector(spec);
+ auto before = get_mapping(spec, *c, weights_all_in, ruleno);
+ for (auto i : before) { spec.validate_osd(i); }
+
+ /* MSR test case. With normal CRUSH, hitting an out osd won't cause
+ * a retry of the previous step, so marking all of the osds on a host
+ * out will not cause positions mapped to that pg to remap.
+ * However, because the above is an MSR rule type, hitting an out osd
+ * will cause a retry of the previous steps as well.
+ * See https://tracker.ceph.com/issues/62214 for the original motivation */
+ auto weights_host_out = create_weight_vector_first_host_out(spec, before);
+ auto after_host_out = get_mapping(spec, *c, weights_host_out, ruleno);
+
+ CrushCompiler cc{*c, std::cout};
+ cc.decompile(std::cout);
+
+ fmt::print("weights_all_in: {}\n", fmt::join(weights_all_in, ", "));
+ fmt::print("weights_host_out: {}\n", fmt::join(weights_host_out, ", "));
+ fmt::print("before : {}\n", fmt::join(before, ", "));
+ fmt::print("after_host_out: {}\n", fmt::join(after_host_out, ", "));
+
+ auto count_mapped = [](const auto &v) {
+ unsigned ret = 0;
+ for (const auto &i : v) ret += (i != CRUSH_ITEM_NONE);
+ return ret;
+ };
+
+ EXPECT_EQ(count_mapped(before), count_mapped(after_host_out));
+
+ auto weights_osd_out = create_weight_vector_first_osd_out(spec, before);
+ auto after_osd_out = get_mapping(spec, *c, weights_osd_out, ruleno);
+ EXPECT_EQ(count_mapped(before), count_mapped(after_osd_out));
+}
+
// 3 hosts x 2 osds; MSR rule chooses 2 hosts, then 2 osds from each,
// with a mapping of size 3.  Verifies that marking the first mapped
// host out remaps exactly that host's positions to a new host.
TEST_F(CRUSHTest, msr_2_host_2_osd) {
  cluster_test_spec_t spec{2, 3, 2, 2, 3};
  auto [rootno, c] = create_crush_heirarchy(cct, spec);

  auto ruleno = c->add_rule(-1, 4, CRUSH_RULE_TYPE_MSR_INDEP);
  EXPECT_EQ(0, c->set_rule_step_take(ruleno, 0, rootno));
  EXPECT_EQ(
    0, c->set_rule_step_choose_msr(ruleno, 1, spec.num_hosts_mapped, HOST_TYPE));
  EXPECT_EQ(
    0,
    c->set_rule_step_choose_msr(
      ruleno, 2, spec.num_mapped_per_host, OSD_TYPE));
  EXPECT_EQ(0, c->set_rule_step_emit(ruleno, 3));

  auto weights_all_in = create_weight_vector(spec);
  auto before = get_mapping(spec, *c, weights_all_in, ruleno);
  for (auto i : before) { spec.validate_osd(i); }

  fmt::print("before : {}\n", fmt::join(before, ", "));
  ASSERT_EQ(count_mapped(before), 3);

  /* MSR test case. With normal CRUSH, hitting an out osd won't cause
   * a retry of the previous step, so marking all of the osds on a host
   * out will not cause positions mapped to that pg to remap.
   * However, because the above is an MSR rule type, hitting an out osd
   * will cause a retry of the previous steps as well.
   * See https://tracker.ceph.com/issues/62214 for the original motivation */
  auto weights_host_out = create_weight_vector_first_host_out(spec, before);
  auto after_host_out = get_mapping(spec, *c, weights_host_out, ruleno);

  // dump the map for debugging on failure
  CrushCompiler cc{*c, std::cout};
  cc.decompile(std::cout);

  fmt::print("weights_all_in: {}\n", fmt::join(weights_all_in, ", "));
  fmt::print("weights_host_out: {}\n", fmt::join(weights_host_out, ", "));
  fmt::print("before : {}\n", fmt::join(before, ", "));
  fmt::print("after_host_out: {}\n", fmt::join(after_host_out, ", "));

  // positions on the out host must move to a different host ...
  compare_mappings(
    spec, before, after_host_out, mapping_change_t::NEW_HOST,
    {0, spec.num_mapped_per_host});
  // ... while the remaining positions must be untouched
  compare_mappings(
    spec, before, after_host_out, mapping_change_t::SAME,
    {spec.num_mapped_per_host, spec.num_mapped_size});
}
+
// 5 hosts x 4 osds; MSR rule chooses 4 hosts, then 4 osds from each,
// mapping 14 positions (an 8+6 EC-style layout).  Verifies that taking
// the first mapped host out remaps only that host's positions.
TEST_F(CRUSHTest, msr_5_host_8_6_ec_choose) {
  cluster_test_spec_t spec{4, 5, 4, 4, 14};
  auto [rootno, c] = create_crush_heirarchy(cct, spec);

  auto ruleno = c->add_rule(-1, 4, CRUSH_RULE_TYPE_MSR_INDEP);
  unsigned step_id = 0;
  EXPECT_EQ(0, c->set_rule_step_take(ruleno, step_id++, rootno));
  EXPECT_EQ(
    0,
    c->set_rule_step_choose_msr(
      ruleno, step_id++, spec.num_hosts_mapped, HOST_TYPE));
  EXPECT_EQ(
    0,
    c->set_rule_step_choose_msr(
      ruleno, step_id++, spec.num_mapped_per_host, OSD_TYPE));
  EXPECT_EQ(0, c->set_rule_step_emit(ruleno, step_id++));

  auto weights_all_in = create_weight_vector(spec);
  auto before = get_mapping(spec, *c, weights_all_in, ruleno);
  for (auto i : before) { spec.validate_osd(i); }

  /* MSR test case. With normal CRUSH, hitting an out osd won't cause
   * a retry of the previous step, so marking all of the osds on a host
   * out will not cause positions mapped to that pg to remap.
   * However, because the above is an MSR rule type, hitting an out osd
   * will cause a retry of the previous steps as well.
   * See https://tracker.ceph.com/issues/62214 for the original motivation */
  auto weights_host_out = create_weight_vector_first_host_out(spec, before);
  auto after_host_out = get_mapping(spec, *c, weights_host_out, ruleno);

  // dump the map for debugging on failure
  CrushCompiler cc{*c, std::cout};
  cc.decompile(std::cout);

  fmt::print("weights_all_in: {}\n", fmt::join(weights_all_in, ", "));
  fmt::print("weights_host_out: {}\n", fmt::join(weights_host_out, ", "));
  fmt::print("before : {}\n", fmt::join(before, ", "));
  fmt::print("after_host_out: {}\n", fmt::join(after_host_out, ", "));

  // positions on the out host must move to a different host ...
  compare_mappings(
    spec, before, after_host_out, mapping_change_t::NEW_HOST,
    {0, spec.num_mapped_per_host});
  // ... while the remaining positions must be untouched
  compare_mappings(
    spec, before, after_host_out, mapping_change_t::SAME,
    {spec.num_mapped_per_host, spec.num_mapped_size});
}
+
+TEST_F(CRUSHTest, msr_multi_root) {
+ constexpr unsigned NUM_HOSTS = 4;
+ constexpr unsigned NUM_OSDS_PER_HOST = 3;
+
+ auto c = CrushWrapper();
+ c.create();
+ c.set_tunables_optimal();
+
+ c.set_type_name(ROOT_TYPE, "root");
+ c.set_type_name(HOST_TYPE, "host");
+ c.set_type_name(OSD_TYPE, "osd");
+
+ std::map<int, std::pair<std::string, std::string>> osd_id_to_host_root;
+ std::map<std::string, int> root_name_to_id;
+ std::map<std::string, std::vector<int>> host_name_to_osds;
+ unsigned next_osd_id = 0;
+
+ auto populate_root = [&](const auto &root_name) {
+ int rootno;
+ c.add_bucket(0, CRUSH_BUCKET_STRAW2, CRUSH_HASH_RJENKINS1,
+ ROOT_TYPE, 0, nullptr, nullptr, &rootno);
+ c.set_item_name(rootno, root_name);
+ root_name_to_id[root_name] = rootno;
+
+ for (unsigned host_id = 0; host_id < NUM_HOSTS; ++host_id) {
+ const std::string host_name =
+ fmt::format("{}-host{}", root_name, host_id);
+ for (unsigned osd = 0; osd < NUM_OSDS_PER_HOST; ++osd) {
+ const int osd_id = next_osd_id++;
+ const std::string osd_name = fmt::format("{}-osd{}", root_name, osd_id);
+ auto ret = c.insert_item(
+ cct, osd_id, 1.0, osd_name,
+ {{ "root", root_name }, { "host", host_name }});
+ osd_id_to_host_root[osd_id] = std::make_pair(host_name, root_name);
+ host_name_to_osds[host_name].push_back(osd_id);
+ EXPECT_EQ(ret, 0);
+ }
+ }
+ };
+
+ int ruleno = 0;
+ int ret = c.add_rule(ruleno, 8, CRUSH_RULE_TYPE_MSR_INDEP);
+ ceph_assert(ret == ruleno);
+
+ unsigned step_id = 0;
+ auto populate_rule = [&](const auto &rule_name) {
+ ret = c.set_rule_step(
+ ruleno, step_id++, CRUSH_RULE_TAKE, root_name_to_id[rule_name], 0);
+ ceph_assert(ret == 0);
+ ret = c.set_rule_step(
+ ruleno, step_id++, CRUSH_RULE_CHOOSE_MSR, 2, HOST_TYPE);
+ ceph_assert(ret == 0);
+ ret = c.set_rule_step(
+ ruleno, step_id++, CRUSH_RULE_CHOOSE_MSR, 2, OSD_TYPE);
+ ceph_assert(ret == 0);
+ ret = c.set_rule_step(ruleno, step_id++, CRUSH_RULE_EMIT, 0, 0);
+ ceph_assert(ret == 0);
+ };
+
+ for (const auto &root_name : { "ssd", "hdd" }) {
+ populate_root(root_name);
+ populate_rule(root_name);
+ }
+ c.set_rule_name(ruleno, "rule_name");
+ c.finalize();
+
+ constexpr unsigned ACTING_SIZE = 8;
+ constexpr unsigned OSDS_PER_ROOT = 4;
+ constexpr unsigned OSDS_PER_HOST = 2;
+ auto validate_output = [&](const auto &out) {
+ std::set<std::string> hosts;
+ for (unsigned host = 0; host < (ACTING_SIZE / OSDS_PER_HOST); ++host) {
+ std::set<std::string> hosts_this_failure_domain;
+ unsigned start = host * OSDS_PER_HOST;
+ unsigned end = (host + 1) * OSDS_PER_HOST;
+ for (unsigned i = start; i < end; ++i) {
+ EXPECT_NE(out[i], CRUSH_ITEM_NONE);
+ EXPECT_EQ(osd_id_to_host_root.count(out[i]), 1);
+ const auto &[host_name, root_name] = osd_id_to_host_root[out[start]];
+ EXPECT_EQ(i < OSDS_PER_ROOT ? "ssd" : "hdd", root_name);
+ hosts_this_failure_domain.insert(host_name);
+ }
+ for (const auto &i: hosts_this_failure_domain) {
+ EXPECT_EQ(hosts.count(i), 0);
+ hosts.insert(i);
+ }
+ }
+ };
+
+ const std::vector<uint32_t> all_in(next_osd_id, CEPH_OSD_IN);
+ for (int x = 0; x < 1000; ++x) {
+ std::vector<int> out;
+ c.do_rule(ruleno, x, out, 8, all_in, 0);
+ EXPECT_EQ(count_mapped(out), 8);
+ validate_output(out);
+
+ {
+ std::vector<uint32_t> osds_out_weight = all_in;
+ std::set<unsigned> osd_idx_out{{1, 5}};
+ for (const auto &i: osd_idx_out) {
+ osds_out_weight[out[i]] = CEPH_OSD_OUT;
+ }
+ std::vector<int> osds_out;
+ c.do_rule(ruleno, x, osds_out, 8, osds_out_weight, 0);
+ EXPECT_EQ(count_mapped(osds_out), 8);
+ validate_output(osds_out);
+ for (unsigned i = 0; i < osds_out.size(); ++i) {
+ if (osd_idx_out.count(i)) {
+ EXPECT_NE(osds_out[i], out[i]);
+ } else {
+ EXPECT_EQ(osds_out[i], out[i]);
+ }
+ }
+ }
+
+ {
+ std::vector<uint32_t> hosts_out_weight = all_in;
+ std::set<unsigned> osd_ids_out;
+
+ for (const auto &i : {2, 6}) {
+ const auto &[host_name, _] = osd_id_to_host_root[out[i]];
+ for (const auto &osd_id: host_name_to_osds[host_name]) {
+ osd_ids_out.insert(osd_id);
+ hosts_out_weight[osd_id] = CEPH_OSD_OUT;
+ }
+ }
+
+ std::vector<int> hosts_out;
+ c.do_rule(ruleno, x, hosts_out, 8, hosts_out_weight, 0);
+ EXPECT_EQ(count_mapped(hosts_out), 8);
+ validate_output(hosts_out);
+ for (unsigned i = 0; i < hosts_out.size(); ++i) {
+ if (osd_ids_out.count(out[i])) {
+ EXPECT_NE(hosts_out[i], out[i]);
+ } else {
+ EXPECT_EQ(hosts_out[i], out[i]);
+ }
+ }
+ }
+ }
+}