]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
Allow setting peering_crush_bucket_{count|target|barrier}
authorKamoltat <ksirivad@redhat.com>
Fri, 15 Mar 2024 22:01:06 +0000 (22:01 +0000)
committerKamoltat Sirivadhna <ksirivad@redhat.com>
Wed, 28 Aug 2024 18:49:02 +0000 (18:49 +0000)
In the command `ceph osd pool stretch set`

<pool> <peering_crush_bucket_count>
<peering_crush_bucket_target> <peering_crush_bucket_barrier>
<crush_rule> <size> <min_size>

user has the option of setting the value of `peering_crush_bucket_{count|target|barrier}`.
This will then allow the utilization of `calc_replicated_acting_stretch`,
since with `peering_crush_bucket_count != 0`
the pool is now a stretch_pool and we can handle pg_temp
better by setting barriers and limits on how many OSDs
should be in a pg_temp.

This will enable the specified pool to
handle pg_temp properly during create_acting, as a stretch pool
should.

User can also use the command:
`osd pool stretch show <pool> `

to show all the stretch related information for the pool

pool: cephfs.a.data
pool_id: 3
is_stretch_pool: 1
peering_crush_bucket_count: 3
peering_crush_bucket_target: 3
peering_crush_bucket_barrier: 8
crush_rule: replicated_rule_custom
size: 3
min_size: 2

User can also unset the stretch pool with the command:
`osd pool stretch unset <pool>`
However, the pool must be a stretch pool.

Fixes: https://tracker.ceph.com/issues/64802
Signed-off-by: Kamoltat <ksirivad@redhat.com>
(cherry picked from commit 661e8e22cff8763cf8775f204dda4490e464db2c)

src/mon/MonCommands.h
src/mon/OSDMonitor.cc
src/mon/OSDMonitor.h
src/osd/osd_types.cc

index 1f40621cea9a65b84d62279bee63bf1ad46d26f0..380e3e58695de81ea7e2bc91fb82bcfe4bba90f8 100644 (file)
@@ -1194,6 +1194,25 @@ COMMAND("osd pool application get "
         "name=key,type=CephString,req=false",
         "get value of key <key> of application <app> on pool <poolname>",
         "osd", "r")
+// "osd pool stretch show": registered with "r"; answered in
+// OSDMonitor::preprocess_command, which prints the stretch-related
+// settings of <pool>.
+COMMAND("osd pool stretch show "
+        "name=pool,type=CephPoolname",
+        "show all the stretch related information for the pool",
+        "osd", "r")
+// "osd pool stretch set": registered with "rw"; handled by
+// OSDMonitor::prepare_command_pool_stretch_set.
+// peering_crush_bucket_barrier and crush_rule are passed as names (strings)
+// and resolved against the CRUSH map by the handler.
+// yes_i_really_mean_it is optional (req=false); the handler requires it when
+// the bucket count/target exceeds the number of barrier-type subtrees.
+COMMAND("osd pool stretch set "
+        "name=pool,type=CephPoolname "
+               "name=peering_crush_bucket_count,type=CephInt,range=0 "
+               "name=peering_crush_bucket_target,type=CephInt,range=0 "
+               "name=peering_crush_bucket_barrier,type=CephString "
+               "name=crush_rule,type=CephString "
+               "name=size,type=CephInt,range=0 "
+               "name=min_size,type=CephInt,range=0 "
+               "name=yes_i_really_mean_it,type=CephBool,req=false",
+        "make the pool stretched across the specified number of CRUSH buckets",
+        "osd", "rw")
+// "osd pool stretch unset": registered with "rw"; handled by
+// OSDMonitor::prepare_command_pool_stretch_unset, which zeroes the pool's
+// peering_crush_bucket_{count,target,barrier}.
+COMMAND("osd pool stretch unset "
+               "name=pool,type=CephPoolname",
+               "unset the stretch mode for the pool",
+               "osd", "rw")
 COMMAND("osd utilization",
        "get basic pg distribution stats",
        "osd", "r")
index 7bfd4e2cacc4fe0e940d5bafb6bc08fb1ed632e2..57143bb618aee79dd636dd2fb3497051a7fddabc 100644 (file)
@@ -6093,6 +6093,62 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     }
     r = 0;
 
+  } else if (prefix == "osd pool stretch show") {
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool = osdmap.lookup_pg_pool_name(poolstr);
+    if (pool < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      r = -ENOENT;
+      goto reply;
+    }
+    const pg_pool_t *p = osdmap.get_pg_pool(pool);
+
+    if (!p->is_stretch_pool()) {
+      ss << poolstr << " " << " is not a stretch pool.";
+      r = -ENOENT;
+      goto reply;
+    } else {
+      if (f) {
+        f->open_object_section("pool");
+        f->dump_string("pool", poolstr);
+        f->dump_int("pool_id", pool);
+        f->dump_bool("is_stretch_pool", p->is_stretch_pool());
+        f->dump_int("peering_crush_bucket_count", p->peering_crush_bucket_count);
+        f->dump_int("peering_crush_bucket_target", p->peering_crush_bucket_target);
+        f->dump_string("peering_crush_bucket_barrier", stringify(osdmap.crush->get_type_name(p->peering_crush_bucket_barrier)));
+        if (osdmap.crush->rule_exists(p->get_crush_rule())) {
+          f->dump_string("crush_rule", osdmap.crush->get_rule_name(p->get_crush_rule()));
+        } else {
+          f->dump_string("crush_rule", stringify(p->get_crush_rule()));
+          // warn if the rule does not exist
+          mon.clog->warn() << __func__ << " pool " << poolstr << " crush rule " << stringify(p->get_crush_rule()) << " does not exist";
+        }
+        f->dump_int("size", p->get_size());
+        f->dump_int("min_size", p->get_min_size());
+        f->close_section();
+        f->flush(rdata);
+      } else {
+        stringstream ss;
+        ss << "pool: " << poolstr << "\n";
+        ss << "pool_id: " << pool << "\n";
+        ss << "is_stretch_pool: " << p->is_stretch_pool() << "\n";
+        ss << "peering_crush_bucket_count: " << p->peering_crush_bucket_count << "\n";
+        ss << "peering_crush_bucket_target: " << p->peering_crush_bucket_target << "\n";
+        ss << "peering_crush_bucket_barrier: " << osdmap.crush->get_type_name(p->peering_crush_bucket_barrier) << "\n";
+        if (osdmap.crush->rule_exists(p->get_crush_rule())) {
+          ss << "crush_rule: " << osdmap.crush->get_rule_name(p->get_crush_rule()) << "\n";
+        } else {
+          ss << "crush_rule: " << p->get_crush_rule() << "\n";
+          // warn if the rule does not exist
+          mon.clog->warn() << __func__ << " pool " << poolstr << " crush rule " << stringify(p->get_crush_rule()) << " does not exist";
+        }
+        ss << "size: " << p->get_size() << "\n";
+        ss << "min_size: " << p->get_min_size() << "\n";
+        rdata.append(ss.str());
+      }
+    }
+    r = 0;
   } else if (prefix == "osd pool get") {
     string poolstr;
     cmd_getval(cmdmap, "pool", poolstr);
@@ -9046,6 +9102,149 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
   return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
 }
 
+/**
+ * Handle `ceph osd pool stretch set`: validate the requested stretch
+ * parameters and stage the updated pool in pending_inc.
+ *
+ * Command syntax:
+ *   ceph osd pool stretch set <pool> <peering_crush_bucket_count>
+ *     <peering_crush_bucket_target> <peering_crush_bucket_barrier>
+ *     <crush_rule> <size> <min_size> [--yes-i-really-mean-it]
+ *
+ * @param cmdmap parsed command arguments
+ * @param ss     receives the error text on failure, or the success message
+ * @return 0 on success;
+ *         -ENOENT if the pool or the crush rule does not exist;
+ *         -EINVAL if an argument is missing, out of range, or of the wrong
+ *                 kind;
+ *         -EPERM  if the bucket count/target exceeds the number of subtrees
+ *                 of the barrier type and yes_i_really_mean_it was not passed
+ */
+int OSDMonitor::prepare_command_pool_stretch_set(const cmdmap_t& cmdmap,
+                                                    stringstream& ss)
+{
+  string pool_name;
+  cmd_getval(cmdmap, "pool", pool_name);
+  int64_t pool = osdmap.lookup_pg_pool_name(pool_name);
+  if (pool < 0) {
+    ss << "unrecognized pool '" << pool_name << "'";
+    return -ENOENT;
+  }
+
+  // Start from the committed pool, but prefer a copy that is already staged
+  // in the pending incremental so earlier uncommitted changes are kept.
+  pg_pool_t p = *osdmap.get_pg_pool(pool);
+  auto staged = pending_inc.new_pools.find(pool);
+  if (staged != pending_inc.new_pools.end())
+    p = staged->second;
+
+  // A count/target of 0 would make the pool a non-stretch pool; that is what
+  // 'stretch unset' is for, so only strictly positive values are accepted.
+  int64_t bucket_count = cmd_getval_or<int64_t>(cmdmap, "peering_crush_bucket_count", 0);
+  if (bucket_count <= 0) {
+    ss << "peering_crush_bucket_count must be > 0! FYI use 'ceph osd pool stretch unset' to unset the stretch values";
+    return -EINVAL;
+  }
+
+  int64_t bucket_target = cmd_getval_or<int64_t>(cmdmap, "peering_crush_bucket_target", 0);
+  if (bucket_target <= 0) {
+    ss << "peering_crush_bucket_target must be > 0! FYI use 'ceph osd pool stretch unset' to unset the stretch values";
+    return -EINVAL;
+  }
+
+  // The barrier is given as a CRUSH type name and must resolve to a
+  // bucket type (type id > 0).
+  string bucket_barrier_str;
+  cmd_getval(cmdmap, "peering_crush_bucket_barrier", bucket_barrier_str);
+  CrushWrapper& crush = _get_stable_crush();
+  if (bucket_barrier_str.empty()) {
+    ss << "peering_crush_bucket_barrier must be provided";
+    return -EINVAL;
+  }
+  int bucket_barrier = crush.get_type_id(bucket_barrier_str);
+  if (bucket_barrier < 0) {
+    ss << "peering_crush_bucket_barrier " << bucket_barrier_str << " does not exist";
+    return -EINVAL;
+  }
+  if (bucket_barrier == 0) {
+    ss << "peering_crush_bucket_barrier " << bucket_barrier_str << " is not a bucket type";
+    return -EINVAL;
+  }
+
+  // Check if the number of peering_crush_bucket_count and peering_crush_bucket_target
+  // exceeds the number of subtrees of the specified bucket_barrier in the cluster.
+  // The user may override this check with --yes-i-really-mean-it.
+  vector<int> subtrees;
+  bool sure = false;
+  crush.get_subtree_of_type(bucket_barrier, &subtrees);
+  cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+  if (!sure) {
+    if (static_cast<uint32_t>(bucket_count) > subtrees.size()) {
+      ss << "peering_crush_bucket_count=" << bucket_count
+        << " > " << bucket_barrier_str << "=" <<  subtrees.size()
+        << " can lead to data unavailability, pass --yes-i-really-mean-it to proceed";
+      return -EPERM;
+    }
+    if (static_cast<uint32_t>(bucket_target) > subtrees.size()) {
+      ss << "peering_crush_bucket_target=" << bucket_target
+        << " > " << bucket_barrier_str << "=" <<  subtrees.size()
+        << " can lead to data unavailability, pass --yes-i-really-mean-it to proceed";
+      return -EPERM;
+    }
+  }
+
+  string crush_rule_str;
+  cmd_getval(cmdmap, "crush_rule", crush_rule_str);
+  if (crush_rule_str.empty()) {
+    ss << "crush_rule must be provided";
+    return -EINVAL;
+  }
+  int crush_rule = crush.get_rule_id(crush_rule_str);
+  if (crush_rule < 0) {
+    ss << "crush rule " << crush_rule_str << " does not exist";
+    return -ENOENT;
+  }
+  if (!crush.rule_valid_for_pool_type(crush_rule, p.get_type())) {
+    ss << "crush rule " << crush_rule << " type does not match pool";
+    return -EINVAL;
+  }
+
+  // size and min_size are narrowed to __u8 below; reject values that would
+  // not survive the narrowing instead of silently storing a truncated value.
+  int64_t pool_size = cmd_getval_or<int64_t>(cmdmap, "size", 0);
+  if (pool_size < 0) {
+    ss << "pool size must be non-negative";
+    return -EINVAL;
+  }
+  if (static_cast<int64_t>(static_cast<__u8>(pool_size)) != pool_size) {
+    ss << "pool size " << pool_size << " is too large";
+    return -EINVAL;
+  }
+
+  int64_t pool_min_size = cmd_getval_or<int64_t>(cmdmap, "min_size", 0);
+  if (pool_min_size < 0) {
+    ss << "pool min_size must be non-negative";
+    return -EINVAL;
+  }
+  if (static_cast<int64_t>(static_cast<__u8>(pool_min_size)) != pool_min_size) {
+    ss << "pool min_size " << pool_min_size << " is too large";
+    return -EINVAL;
+  }
+
+  // All inputs validated: stage the modified pool for the next osdmap epoch.
+  p.peering_crush_bucket_count = static_cast<uint32_t>(bucket_count);
+  p.peering_crush_bucket_target = static_cast<uint32_t>(bucket_target);
+  p.peering_crush_bucket_barrier = static_cast<uint32_t>(bucket_barrier);
+  p.crush_rule = static_cast<__u8>(crush_rule);
+  p.size = static_cast<__u8>(pool_size);
+  p.min_size = static_cast<__u8>(pool_min_size);
+  p.last_change = pending_inc.epoch;
+  pending_inc.new_pools[pool] = p;
+  ss << "pool " << pool_name << " stretch values are set successfully";
+  return 0;
+}
+
+/**
+ * Handle `ceph osd pool stretch unset <pool>`.
+ *
+ * Zeroes the pool's peering_crush_bucket_{count,target,barrier} and stages
+ * the modified pool in pending_inc.
+ *
+ * @param cmdmap parsed command arguments
+ * @param ss     receives the error text on failure, or the success message
+ * @return 0 on success; -ENOENT if the pool does not exist or is not a
+ *         stretch pool
+ */
+int OSDMonitor::prepare_command_pool_stretch_unset(const cmdmap_t& cmdmap,
+                                                    stringstream& ss)
+{
+  string pool_name;
+  cmd_getval(cmdmap, "pool", pool_name);
+
+  // A negative id means no pool carries this name.
+  const int64_t pool_id = osdmap.lookup_pg_pool_name(pool_name);
+  if (pool_id < 0) {
+    ss << "unrecognized pool '" << pool_name << "'";
+    return -ENOENT;
+  }
+
+  // Work on the copy staged in the pending incremental when there is one,
+  // otherwise on a copy of the committed pool.
+  const auto staged = pending_inc.new_pools.find(pool_id);
+  pg_pool_t updated = (staged != pending_inc.new_pools.end())
+    ? staged->second
+    : *osdmap.get_pg_pool(pool_id);
+
+  // Only a stretch pool can be un-stretched.
+  if (!updated.is_stretch_pool()) {
+    ss << "pool " << pool_name << " is not a stretch pool";
+    return -ENOENT;
+  }
+
+  // Clear every stretch value and stage the result.
+  updated.peering_crush_bucket_count = 0;
+  updated.peering_crush_bucket_target = 0;
+  updated.peering_crush_bucket_barrier = 0;
+  updated.last_change = pending_inc.epoch;
+  pending_inc.new_pools[pool_id] = updated;
+
+  ss << "pool " << pool_name
+    << " is no longer a stretch pool, all stretch values are unset successfully";
+  return 0;
+}
+
 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
                                                     const cmdmap_t& cmdmap,
                                                     stringstream& ss,
@@ -13902,6 +14101,28 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     } else {
       goto update;
     }
+  } else if (prefix == "osd pool stretch set") {
+    err = prepare_command_pool_stretch_set(cmdmap, ss);
+    if (err == -EAGAIN)
+      goto wait;
+    if (err < 0)
+      goto reply_no_propose;
+
+    getline(ss, rs);
+    wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                  get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd pool stretch unset") {
+    err = prepare_command_pool_stretch_unset(cmdmap, ss);
+    if (err == -EAGAIN)
+      goto wait;
+    if (err < 0)
+      goto reply_no_propose;
+
+    getline(ss, rs);
+    wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                  get_last_committed() + 1));
+    return true;
   } else if (prefix == "osd force-create-pg") {
     pg_t pgid;
     string pgidstr;
index 398daf9243e10d91df218e58445cfe01a0a0dbca..8e01a0e7e1e25b43f0c3455584350b9141f53c35 100644 (file)
@@ -735,6 +735,10 @@ public:
                                           const cmdmap_t& cmdmap,
                                           std::stringstream& ss,
                                           bool *modified);
+  int prepare_command_pool_stretch_set(const cmdmap_t& cmdmap,
+                               std::stringstream& ss);
+  int prepare_command_pool_stretch_unset(const cmdmap_t& cmdmap,
+                                std::stringstream& ss);
   int _command_pool_application(const std::string &prefix,
                                const cmdmap_t& cmdmap,
                                std::stringstream& ss,
index 2c6028f9cb416bfc51b773255bdcbec44f5d883d..0bd54acb96c7e5b06c222961ba5a40d610e4d849 100644 (file)
@@ -1568,6 +1568,7 @@ void pg_pool_t::dump(Formatter *f) const
   f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target);
   f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier);
   f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member);
+  f->dump_bool("is_stretch_pool", is_stretch_pool());
   f->dump_int("object_hash", get_object_hash());
   f->dump_string("pg_autoscale_mode",
                 get_pg_autoscale_mode_name(pg_autoscale_mode));