mon: osd pool create <pool-name> with --bulk flag
author    Kamoltat <ksirivad@redhat.com>
Tue, 7 Dec 2021 21:15:36 +0000 (21:15 +0000)
committer Kamoltat <ksirivad@redhat.com>
Mon, 20 Dec 2021 21:46:37 +0000 (21:46 +0000)
Creating a pool with `--bulk` allows the
pg_autoscaler to use the `scale-down` mode
on that pool.

Creating pool:

`ceph osd pool create <pool-name> --bulk`

Get var:

`ceph osd pool get <pool-name> bulk`

Set var:

`ceph osd pool set <pool-name> bulk=true/false/1/0`
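
New pools can also be made bulk by default via
the new `osd_pool_default_flag_bulk` option (see
`global.yaml.in` below), e.g. presumably:

`ceph config set global osd_pool_default_flag_bulk true`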

Removed `autoscale_profile` and incorporated
the bulk flag into the calculation of
`final_pg_target` for each pool.
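
For reference, a rough sketch of the bulk-aware
three-pass split of pgs (simplified, hypothetical
names and data structures; not the module's
actual code):

# Rough illustration only: structures and names here are simplified
# and hypothetical, not the pg_autoscaler module's real data model.
from typing import Any, Dict, List

PG_NUM_MIN = 32  # assumed floor, for illustration


def nearest_power_of_two(n: float) -> int:
    p = 1
    while p < n:
        p *= 2
    # round to the nearer of the two surrounding powers of two
    return p if (p - n) <= (n - p / 2) else p // 2


def split_pg_targets(pools: List[Dict[str, Any]], pg_total: int) -> Dict[str, int]:
    """pools: [{'name', 'bulk', 'capacity_ratio', 'size', 'bias'}, ...]"""
    targets: Dict[str, int] = {}
    pg_left = float(pg_total)
    used = 0

    def quantize(pg: float, size: int, bias: float) -> int:
        return max(PG_NUM_MIN, nearest_power_of_two(pg / size * bias))

    # Pass 1: non-bulk ("small") pools get pgs in proportion to the
    # capacity they actually use; bulk pools are deferred.
    bulk_pools = [p for p in pools if p['bulk']]
    for p in (q for q in pools if not q['bulk']):
        pg = p['capacity_ratio'] * pg_left
        pg_left -= int(pg)
        used += 1
        targets[p['name']] = quantize(pg, p['size'], p['bias'])

    # Pass 2: bulk pools that use more than an even share of what is
    # left keep that share; the rest are deferred again.
    even_pools = []
    for p in bulk_pools:
        even_ratio = 1 / (len(pools) - used)
        if p['capacity_ratio'] > even_ratio:
            pg = max(p['capacity_ratio'], even_ratio) * pg_left
            pg_left -= int(pg)
            used += 1
            targets[p['name']] = quantize(pg, p['size'], p['bias'])
        else:
            even_pools.append(p)

    # Pass 3: the remaining bulk pools split the leftover pgs evenly.
    for p in even_pools:
        share = pg_left / (len(pools) - used)
        targets[p['name']] = quantize(share, p['size'], p['bias'])

    return targets

The real module does this per crush root and
derives `bulk` from the pool's `flags_names`.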

`bin/ceph osd pool autoscale-status` no longer
has a `PROFILE` column; it has a `BULK` column
instead.

Signed-off-by: Kamoltat <ksirivad@redhat.com>
doc/rados/operations/pools.rst
src/common/options/global.yaml.in
src/mon/KVMonitor.cc
src/mon/MonCommands.h
src/mon/OSDMonitor.cc
src/mon/OSDMonitor.h
src/osd/OSDMap.cc
src/osd/osd_types.h
src/pybind/mgr/pg_autoscaler/module.py

diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst
index 82cfaddd82901bbfd93fddba28c1d1637bc42cfc..3b6d227aad741f4928a5bb832c009de9b589bbf4 100644
@@ -420,6 +420,15 @@ You may set values for the following keys:
    :Valid Range: 1 sets flag, 0 unsets flag
    :Version: Version ``FIXME``
 
+.. _bulk:
+
+.. describe:: bulk
+
+   Set/Unset bulk flag on a given pool.
+
+   :Type: Boolean
+   :Valid Range: true/1 sets flag, false/0 unsets flag
+
 .. _write_fadvise_dontneed:
 
 .. describe:: write_fadvise_dontneed
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
index 2a00edfe3e8efffc9f14084fa7e3ae6713b6d7e4..80e7b9dc34a5c7a4ee3a6d19bfb38b09ec9b1e2b 100644
@@ -2566,6 +2566,15 @@ options:
   services:
   - mon
   with_legacy: true
+- name: osd_pool_default_flag_bulk
+  type: bool
+  level: advanced
+  desc: set bulk flag on new pools
+  fmt_desc: Set the ``bulk`` flag on new pools, allowing the autoscaler to use the scale-down mode.
+  default: false
+  services:
+  - mon
+  with_legacy: true
 - name: osd_pool_default_hit_set_bloom_fpp
   type: float
   level: advanced
@@ -6096,4 +6105,4 @@ options:
   services:
   - rgw
   - osd
-  with_legacy: true
\ No newline at end of file
+  with_legacy: true
diff --git a/src/mon/KVMonitor.cc b/src/mon/KVMonitor.cc
index a919a29eed20a1c22315e898ce3549dd1726ee07..37a81a8048d4eb9151b53de100af5c2bd88cac00 100644
@@ -53,9 +53,6 @@ void KVMonitor::create_initial()
   dout(10) << __func__ << dendl;
   version = 0;
   pending.clear();
-  bufferlist bl;
-  bl.append("scale-up");
-  pending["config/mgr/mgr/pg_autoscaler/autoscale_profile"] = bl;
 }
 
 void KVMonitor::update_from_paxos(bool *need_bootstrap)
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index f4f4fef9804e7ca62693b5cc605e7993df103309..0a8ce0599df8cdc7fe6de7d46b06251f8bb70c29 100644
@@ -1058,6 +1058,7 @@ COMMAND("osd pool create "
         "name=size,type=CephInt,range=0,req=false "
        "name=pg_num_min,type=CephInt,range=0,req=false "
        "name=autoscale_mode,type=CephChoices,strings=on|off|warn,req=false "
+       "name=bulk,type=CephBool,req=false "
        "name=target_size_bytes,type=CephInt,range=0,req=false "
        "name=target_size_ratio,type=CephFloat,range=0|1,req=false",\
        "create pool", "osd", "rw")
@@ -1082,11 +1083,11 @@ COMMAND("osd pool rename "
        "rename <srcpool> to <destpool>", "osd", "rw")
 COMMAND("osd pool get "
        "name=pool,type=CephPoolname "
-       "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio",
+       "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk",
        "get pool parameter <var>", "osd", "r")
 COMMAND("osd pool set "
        "name=pool,type=CephPoolname "
-       "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio "
+       "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk "
        "name=val,type=CephString "
        "name=yes_i_really_mean_it,type=CephBool,req=false",
        "set pool parameter <var> to <val>", "osd", "rw")
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index b563e375092a6b41608063e1ed46605cc42c423a..a75be6d4ca25ccd5af701ef5a9302478148f0e4c 100644
@@ -5357,7 +5357,7 @@ namespace {
     CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
     PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
     PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM, 
-    DEDUP_CDC_CHUNK_SIZE, POOL_EIO };
+    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK };
 
   std::set<osd_pool_get_choices>
     subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
@@ -6092,6 +6092,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       {"dedup_tier", DEDUP_TIER},
       {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
       {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
+      {"bulk", BULK}
     };
 
     typedef std::set<osd_pool_get_choices> choices_set_t;
@@ -6209,6 +6210,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
          case HASHPSPOOL:
          case POOL_EIO:
          case NODELETE:
+         case BULK:
          case NOPGCHANGE:
          case NOSIZECHANGE:
          case WRITE_FADVISE_DONTNEED:
@@ -6437,6 +6439,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
          case HASHPSPOOL:
          case POOL_EIO:
          case NODELETE:
+         case BULK:
          case NOPGCHANGE:
          case NOSIZECHANGE:
          case WRITE_FADVISE_DONTNEED:
@@ -7259,11 +7262,12 @@ int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
   string erasure_code_profile;
   stringstream ss;
   string rule_name;
+  bool bulk = false;
   int ret = 0;
   ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
                         0, 0, 0, 0, 0, 0.0,
                         erasure_code_profile,
-                        pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
+                        pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
                         &ss);
 
   if (ret < 0) {
@@ -7885,6 +7889,7 @@ int OSDMonitor::prepare_new_pool(string& name,
                                  const uint64_t expected_num_objects,
                                  FastReadType fast_read,
                                 const string& pg_autoscale_mode,
+                                bool bulk,
                                 ostream *ss)
 {
   if (name.length() == 0)
@@ -8005,6 +8010,11 @@ int OSDMonitor::prepare_new_pool(string& name,
   pi->type = pool_type;
   pi->fast_read = fread; 
   pi->flags = g_conf()->osd_pool_default_flags;
+  if (bulk) {
+    pi->set_flag(pg_pool_t::FLAG_BULK);
+  } else if (g_conf()->osd_pool_default_flag_bulk) {
+    pi->set_flag(pg_pool_t::FLAG_BULK);
+  }
   if (g_conf()->osd_pool_default_flag_hashpspool)
     pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
   if (g_conf()->osd_pool_default_flag_nodelete)
@@ -8438,7 +8448,7 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
     p.crush_rule = id;
   } else if (var == "nodelete" || var == "nopgchange" ||
             var == "nosizechange" || var == "write_fadvise_dontneed" ||
-            var == "noscrub" || var == "nodeep-scrub") {
+            var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
     uint64_t flag = pg_pool_t::get_flag_by_name(var);
     // make sure we only compare against 'n' if we didn't receive a string
     if (val == "true" || (interr.empty() && n == 1)) {
@@ -12880,6 +12890,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     string pg_autoscale_mode;
     cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
 
+    bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", false);
     err = prepare_new_pool(poolstr,
                           -1, // default crush rule
                           rule_name,
@@ -12889,6 +12900,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
                            (uint64_t)expected_num_objects,
                            fast_read,
                           pg_autoscale_mode,
+                          bulk,
                           &ss);
     if (err < 0) {
       switch(err) {
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 7757119751638d299483904106a9fa2c1c5a73e5..aa789e2e26255885e1ec5aa5edd2783796eeb863 100644
@@ -528,6 +528,7 @@ private:
                        const uint64_t expected_num_objects,
                        FastReadType fast_read,
                       const std::string& pg_autoscale_mode,
+                      bool bulk,
                       std::ostream *ss);
   int prepare_new_pool(MonOpRequestRef op);
 
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 1168b6dc35600c208111189bad7dd10c5f2aa70c..d449543e204f536ba4bdeef1772c29a593269cf2 100644
@@ -4242,6 +4242,8 @@ int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
        pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
       if (cct->_conf->osd_pool_default_flag_nosizechange)
        pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
+      if (cct->_conf->osd_pool_default_flag_bulk)
+        pools[pool].set_flag(pg_pool_t::FLAG_BULK);
       pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
       pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
                                  pools[pool].size);
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 65c0cf1f4109d456df3213f6ac04d97aa0c6f8b9..fcc3939ccde0081593fb4080a7e6bd4eceff4379 100644
@@ -1256,6 +1256,7 @@ struct pg_pool_t {
     FLAG_POOL_SNAPS = 1<<14,        // pool has pool snaps
     FLAG_CREATING = 1<<15,          // initial pool PGs are being created
     FLAG_EIO = 1<<16,               // return EIO for all client ops
+    FLAG_BULK = 1<<17,              // pool is large
   };
 
   static const char *get_flag_name(uint64_t f) {
@@ -1277,6 +1278,7 @@ struct pg_pool_t {
     case FLAG_POOL_SNAPS: return "pool_snaps";
     case FLAG_CREATING: return "creating";
     case FLAG_EIO: return "eio";
+    case FLAG_BULK: return "bulk";
     default: return "???";
     }
   }
@@ -1329,6 +1331,8 @@ struct pg_pool_t {
       return FLAG_CREATING;
     if (name == "eio")
       return FLAG_EIO;
+    if (name == "bulk")
+      return FLAG_BULK;
     return 0;
   }
 
diff --git a/src/pybind/mgr/pg_autoscaler/module.py b/src/pybind/mgr/pg_autoscaler/module.py
index 487531bb6a9eea0dd5b5acb8db6150c892632cbe..414afa7ce49541306fc937a27e0eded84249e180 100644
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
     else:
         from typing_extensions import Literal
 
-    ScaleModeT = Literal['scale-up', 'scale-down']
+    PassT = Literal['first', 'second', 'third']
 
 
 def nearest_power_of_two(n: int) -> int:
@@ -126,17 +126,7 @@ class PgAutoscaler(MgrModule):
             name='sleep_interval',
             type='secs',
             default=60),
-        Option(
-            'autoscale_profile',
-            default='scale-up',
-            type='str',
-            desc='pg_autoscale profiler',
-            long_desc=('Determines the behavior of the autoscaler algorithm, '
-                       '`scale-up` means that it starts out with minmum pgs '
-                       'and scales up when there is pressure'
-                       '`scale-down means start out with full pgs and scales'
-                       'down when there is pressure'),
-            runtime=True),
+
         Option(
             name='threshold',
             type='float',
@@ -156,7 +146,6 @@ class PgAutoscaler(MgrModule):
         # to just keep a copy of the pythonized version.
         self._osd_map = None
         if TYPE_CHECKING:
-            self.autoscale_profile: 'ScaleModeT' = 'scale-up'
             self.sleep_interval = 60
             self.mon_target_pg_per_osd = 0
             self.threshold = 3.0
@@ -173,10 +162,6 @@ class PgAutoscaler(MgrModule):
                     self.get_module_option(opt['name']))
             self.log.debug(' mgr option %s = %s',
                            opt['name'], getattr(self, opt['name']))
-        # if the profiler option is not set, this means it is an old cluster
-        autoscale_profile = self.get_module_option("autoscale_profile")
-        if not autoscale_profile:
-            self.set_module_option("autoscale_profile", "scale-up")
 
     @CLIReadCommand('osd pool autoscale-status')
     def _command_autoscale_status(self, format: str = 'plain') -> Tuple[int, str, str]:
@@ -185,8 +170,7 @@ class PgAutoscaler(MgrModule):
         """
         osdmap = self.get_osdmap()
         pools = osdmap.get_pools_by_name()
-        profile = self.autoscale_profile
-        ps, root_map = self._get_pool_status(osdmap, pools, profile)
+        ps, root_map = self._get_pool_status(osdmap, pools)
 
         if format in ('json', 'json-pretty'):
             return 0, json.dumps(ps, indent=4, sort_keys=True), ''
@@ -199,7 +183,7 @@ class PgAutoscaler(MgrModule):
                                  'PG_NUM',
 #                                 'IDEAL',
                                  'NEW PG_NUM', 'AUTOSCALE',
-                                 'PROFILE'],
+                                 'BULK'],
                                 border=False)
             table.left_padding_width = 0
             table.right_padding_width = 2
@@ -216,7 +200,7 @@ class PgAutoscaler(MgrModule):
 #            table.align['IDEAL'] = 'r'
             table.align['NEW PG_NUM'] = 'r'
             table.align['AUTOSCALE'] = 'l'
-            table.align['PROFILE'] =  'l'
+            table.align['BULK'] = 'l'
             for p in ps:
                 if p['would_adjust']:
                     final = str(p['pg_num_final'])
@@ -248,7 +232,7 @@ class PgAutoscaler(MgrModule):
 #                    p['pg_num_ideal'],
                     final,
                     p['pg_autoscale_mode'],
-                    profile
+                    str(p['bulk'])
                 ])
             return 0, table.get_string(), ''
 
@@ -263,29 +247,6 @@ class PgAutoscaler(MgrModule):
         self.set_module_option("threshold", num)
         return 0, "threshold updated", ""
 
-    @CLIWriteCommand("osd pool set autoscale-profile scale-up")
-    def set_profile_scale_up(self) -> Tuple[int, str, str]:
-        """
-        set the autoscaler behavior to start out with minimum pgs and scales up when there is pressure
-        """
-        if self.autoscale_profile == "scale-up":
-            return 0, "", "autoscale-profile is already a scale-up!"
-        else:
-            self.set_module_option("autoscale_profile", "scale-up")
-            return 0, "", "autoscale-profile is now scale-up"
-
-    @CLIWriteCommand("osd pool set autoscale-profile scale-down")
-    def set_profile_scale_down(self) -> Tuple[int, str, str]:
-        """
-        set the autoscaler behavior to start out with full pgs and
-        scales down when there is pressure
-        """
-        if self.autoscale_profile == "scale-down":
-            return 0, "", "autoscale-profile is already a scale-down!"
-        else:
-            self.set_module_option("autoscale_profile", "scale-down")
-            return 0, "", "autoscale-profile is now scale-down"
-
     def serve(self) -> None:
         self.config_notify()
         while not self._shutdown.is_set():
@@ -393,73 +354,80 @@ class PgAutoscaler(MgrModule):
             root_map: Dict[int, CrushSubtreeResourceStatus],
             root_id: int,
             capacity_ratio: float,
-            even_pools: Dict[str, Dict[str, Any]],
             bias: float,
-            is_used: bool,
-            profile: 'ScaleModeT',
+            even_pools: Dict[str, Dict[str, Any]],
+            bulk_pools: Dict[str, Dict[str, Any]],
+            func_pass: 'PassT',
+            bulk: bool,
     ) -> Union[Tuple[float, int, int], Tuple[None, None, None]]:
         """
         `profile` determines behaviour of the autoscaler.
-        `is_used` flag used to determine if this is the first
+        `func_pass` flag used to determine if this is the first
         pass where the caller tries to calculate/adjust pools that has
         used_ratio > even_ratio else this is the second pass,
         we calculate final_ratio by giving it 1 / pool_count
         of the root we are currently looking at.
         """
-        if profile == "scale-up":
-            final_ratio = capacity_ratio
-            # So what proportion of pg allowance should we be using?
-            pg_target = root_map[root_id].pg_target
-            assert pg_target is not None
-            pool_pg_target = (final_ratio * pg_target) / p['size'] * bias
-            final_pg_target = max(p.get('options', {}).get('pg_num_min', PG_NUM_MIN),
-                                  nearest_power_of_two(pool_pg_target))
-
-        else:
-            if is_used:
-                pool_count = root_map[root_id].pool_count
-                assert pool_count is not None
-                even_ratio = 1 / pool_count
-                used_ratio = capacity_ratio
-
-                if used_ratio > even_ratio:
-                    root_map[root_id].pool_used += 1
-                else:
-                    # keep track of even_pools to be used in second pass
-                    # of the caller function
-                    even_pools[pool_name] = p
-                    return None, None, None
-
-                final_ratio = max(used_ratio, even_ratio)
-                pg_target = root_map[root_id].pg_target
-                assert pg_target is not None
-                used_pg = final_ratio * pg_target
+        if func_pass == 'first':
+            # first pass to deal with small pools (no bulk flag)
+            # calculating final_pg_target based on capacity ratio
+            # we also keep track of bulk_pools to be used in second pass
+            if not bulk:
+                final_ratio = capacity_ratio
+                pg_left = root_map[root_id].pg_left
+                assert pg_left is not None
+                used_pg = final_ratio * pg_left
                 root_map[root_id].pg_left -= int(used_pg)
+                root_map[root_id].pool_used += 1
                 pool_pg_target = used_pg / p['size'] * bias
-
             else:
-                pool_count = root_map[root_id].pool_count
-                assert pool_count is not None
-                final_ratio = 1 / (pool_count - root_map[root_id].pool_used)
-                pool_pg_target = (final_ratio * root_map[root_id].pg_left) / p['size'] * bias
-
-            final_pg_target = max(p.get('options', {}).get('pg_num_min', PG_NUM_MIN),
-                                  nearest_power_of_two(pool_pg_target))
-
-            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
-                          "pg target {4} quantized to {5} (current {6})".format(
-                              p['pool_name'],
-                              root_id,
-                              capacity_ratio,
-                              bias,
-                              pool_pg_target,
-                              final_pg_target,
-                              p['pg_num_target']
-                          ))
+                bulk_pools[pool_name] = p
+                return None, None, None
+
+        elif func_pass == 'second':
+            # second pass we calculate the final_pg_target
+            # for pools that have used_ratio > even_ratio
+            # and we keep track of even pools to be used in third pass
+            pool_count = root_map[root_id].pool_count
+            assert pool_count is not None
+            even_ratio = 1 / (pool_count - root_map[root_id].pool_used)
+            used_ratio = capacity_ratio
+
+            if used_ratio > even_ratio:
+                root_map[root_id].pool_used += 1
+            else:
+                even_pools[pool_name] = p
+                return None, None, None
+
+            final_ratio = max(used_ratio, even_ratio)
+            pg_left = root_map[root_id].pg_left
+            assert pg_left is not None
+            used_pg = final_ratio * pg_left
+            root_map[root_id].pg_left -= int(used_pg)
+            pool_pg_target = used_pg / p['size'] * bias
 
+        else:
+            # third pass we just split the pg_left to all even_pools
+            pool_count = root_map[root_id].pool_count
+            assert pool_count is not None
+            final_ratio = 1 / (pool_count - root_map[root_id].pool_used)
+            pool_pg_target = (final_ratio * root_map[root_id].pg_left) / p['size'] * bias
+
+        final_pg_target = max(p.get('options', {}).get('pg_num_min', PG_NUM_MIN),
+                              nearest_power_of_two(pool_pg_target))
+        self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
+                      "pg target {4} quantized to {5} (current {6})".format(
+                      p['pool_name'],
+                      root_id,
+                      capacity_ratio,
+                      bias,
+                      pool_pg_target,
+                      final_pg_target,
+                      p['pg_num_target']
+        ))
         return final_ratio, pool_pg_target, final_pg_target
 
-    def _calc_pool_targets(
+    def _get_pool_pg_targets(
             self,
             osdmap: OSDMap,
             pools: Dict[str, Dict[str, Any]],
@@ -468,10 +436,9 @@ class PgAutoscaler(MgrModule):
             pool_stats: Dict[int, Dict[str, int]],
             ret: List[Dict[str, Any]],
             threshold: float,
-            is_used: bool,
-            profile: 'ScaleModeT',
+            func_pass: 'PassT',
             overlapped_roots: Set[int],
-    ) -> Tuple[List[Dict[str, Any]], Dict[str, Dict[str, Any]]]:
+    ) -> Tuple[List[Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]:
         """
         Calculates final_pg_target of each pools and determine if it needs
         scaling, this depends on the profile of the autoscaler. For scale-down,
@@ -480,6 +447,7 @@ class PgAutoscaler(MgrModule):
         the minimal amount of pgs and only scale when there is increase in usage.
         """
         even_pools: Dict[str, Dict[str, Any]] = {}
+        bulk_pools: Dict[str, Dict[str, Any]] = {}
         for pool_name, p in pools.items():
             pool_id = p['pool']
             if pool_id not in pool_stats:
@@ -493,8 +461,8 @@ class PgAutoscaler(MgrModule):
             cr_name = crush_rule['rule_name']
             root_id = crush_map.get_rule_root(cr_name)
             assert root_id is not None
-            if root_id in overlapped_roots and profile == "scale-down":
-                # for scale-down profile skip pools
+            if root_id in overlapped_roots:
+                # skip pools
                 # with overlapping roots
                 self.log.warn("pool %d contains an overlapping root %d"
                               "... skipping scaling", pool_id, root_id)
@@ -532,9 +500,17 @@ class PgAutoscaler(MgrModule):
                                                   root_map[root_id].total_target_bytes,
                                                   capacity)
 
+            # determine whether the pool has the bulk flag set
+            bulk = False
+            flags = p['flags_names'].split(",")
+            if "bulk" in flags:
+                bulk = True
+
             capacity_ratio = max(capacity_ratio, target_ratio)
             final_ratio, pool_pg_target, final_pg_target = self._calc_final_pg_target(
-                p, pool_name, root_map, root_id, capacity_ratio, even_pools, bias, is_used, profile)
+                p, pool_name, root_map, root_id,
+                capacity_ratio, bias, even_pools,
+                bulk_pools, func_pass, bulk)
 
             if final_ratio is None:
                 continue
@@ -567,15 +543,15 @@ class PgAutoscaler(MgrModule):
                 'pg_num_final': final_pg_target,
                 'would_adjust': adjust,
                 'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
+                'bulk': bulk,
             })
 
-        return ret, even_pools
+        return ret, bulk_pools, even_pools
 
     def _get_pool_status(
             self,
             osdmap: OSDMap,
             pools: Dict[str, Dict[str, Any]],
-            profile: 'ScaleModeT',
     ) -> Tuple[List[Dict[str, Any]],
                Dict[int, CrushSubtreeResourceStatus]]:
         threshold = self.threshold
@@ -589,19 +565,20 @@ class PgAutoscaler(MgrModule):
         ret: List[Dict[str, Any]] = []
 
         # Iterate over all pools to determine how they should be sized.
-        # First call of _calc_pool_targets() is to find/adjust pools that uses more capacaity than
+        # First call of _get_pool_pg_targets() is to find/adjust pools that use more capacity than
         # the even_ratio of other pools and we adjust those first.
         # Second call make use of the even_pools we keep track of in the first call.
         # All we need to do is iterate over those and give them 1/pool_count of the
         # total pgs.
 
-        ret, even_pools = self._calc_pool_targets(osdmap, pools, crush_map, root_map,
-                                                  pool_stats, ret, threshold, True, profile, overlapped_roots)
+        ret, bulk_pools, _ = self._get_pool_pg_targets(osdmap, pools, crush_map, root_map,
+                                                  pool_stats, ret, threshold, 'first', overlapped_roots)
+
+        ret, _, even_pools = self._get_pool_pg_targets(osdmap, bulk_pools, crush_map, root_map,
+                                                  pool_stats, ret, threshold, 'second', overlapped_roots)
 
-        if profile == "scale-down":
-            # We only have adjust even_pools when we use scale-down profile
-            ret, _ = self._calc_pool_targets(osdmap, even_pools, crush_map, root_map,
-                                             pool_stats, ret, threshold, False, profile, overlapped_roots)
+        ret, _, _ = self._get_pool_pg_targets(osdmap, even_pools, crush_map, root_map,
+                                         pool_stats, ret, threshold, 'third', overlapped_roots)
 
         return (ret, root_map)
 
@@ -624,8 +601,7 @@ class PgAutoscaler(MgrModule):
         if osdmap.get_require_osd_release() < 'nautilus':
             return
         pools = osdmap.get_pools_by_name()
-        profile = self.autoscale_profile
-        ps, root_map = self._get_pool_status(osdmap, pools, profile)
+        ps, root_map = self._get_pool_status(osdmap, pools)
 
         # Anyone in 'warn', set the health message for them and then
         # drop them from consideration.