From 5f0f967c9337385fcf50548c36b114f809477b58 Mon Sep 17 00:00:00 2001
From: Kamoltat
Date: Tue, 7 Dec 2021 21:15:36 +0000
Subject: [PATCH] mon: osd pool create with --bulk flag

Creating a pool with `--bulk` allows the pg_autoscaler to use
`scale-down` mode on it.

Creating pool: `ceph osd pool create <pool-name> --bulk`
Get var:       `ceph osd pool get <pool-name> bulk`
Set var:       `ceph osd pool set <pool-name> bulk <true|false|1|0>`

Removed `autoscale_profile` and incorporated the bulk flag into
calculating `final_pg_target` for each pool.

`bin/ceph osd pool autoscale-status` no longer has a `PROFILE` column
but has a `BULK` column instead.

Signed-off-by: Kamoltat
---
 doc/rados/operations/pools.rst         |   9 ++
 src/common/options/global.yaml.in      |  11 +-
 src/mon/KVMonitor.cc                   |   3 -
 src/mon/MonCommands.h                  |   5 +-
 src/mon/OSDMonitor.cc                  |  18 ++-
 src/mon/OSDMonitor.h                   |   1 +
 src/osd/OSDMap.cc                      |   2 +
 src/osd/osd_types.h                    |   4 +
 src/pybind/mgr/pg_autoscaler/module.py | 204 +++++++++++--------------
 9 files changed, 134 insertions(+), 123 deletions(-)

diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst
index 82cfaddd82901..3b6d227aad741 100644
--- a/doc/rados/operations/pools.rst
+++ b/doc/rados/operations/pools.rst
@@ -420,6 +420,15 @@ You may set values for the following keys:
    :Valid Range: 1 sets flag, 0 unsets flag
    :Version: Version ``FIXME``

+.. _bulk:
+
+.. describe:: bulk
+
+   Set or unset the bulk flag on a given pool.
+
+   :Type: Boolean
+   :Valid Range: true/1 sets flag, false/0 unsets flag
+
 .. _write_fadvise_dontneed:

 .. describe:: write_fadvise_dontneed
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
index 2a00edfe3e8ef..80e7b9dc34a5c 100644
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -2566,6 +2566,15 @@ options:
   services:
   - mon
   with_legacy: true
+- name: osd_pool_default_flag_bulk
+  type: bool
+  level: advanced
+  desc: set bulk flag on new pools
+  fmt_desc: Set the ``bulk`` flag on new pools, allowing the autoscaler to use scale-down mode.
+ default: false + services: + - mon + with_legacy: true - name: osd_pool_default_hit_set_bloom_fpp type: float level: advanced @@ -6096,4 +6105,4 @@ options: services: - rgw - osd - with_legacy: true \ No newline at end of file + with_legacy: true diff --git a/src/mon/KVMonitor.cc b/src/mon/KVMonitor.cc index a919a29eed20a..37a81a8048d4e 100644 --- a/src/mon/KVMonitor.cc +++ b/src/mon/KVMonitor.cc @@ -53,9 +53,6 @@ void KVMonitor::create_initial() dout(10) << __func__ << dendl; version = 0; pending.clear(); - bufferlist bl; - bl.append("scale-up"); - pending["config/mgr/mgr/pg_autoscaler/autoscale_profile"] = bl; } void KVMonitor::update_from_paxos(bool *need_bootstrap) diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index f4f4fef9804e7..0a8ce0599df8c 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -1058,6 +1058,7 @@ COMMAND("osd pool create " "name=size,type=CephInt,range=0,req=false " "name=pg_num_min,type=CephInt,range=0,req=false " "name=autoscale_mode,type=CephChoices,strings=on|off|warn,req=false " + "name=bulk,type=CephBool,req=false " "name=target_size_bytes,type=CephInt,range=0,req=false " "name=target_size_ratio,type=CephFloat,range=0|1,req=false",\ "create pool", "osd", "rw") @@ -1082,11 +1083,11 @@ COMMAND("osd pool rename " "rename to ", "osd", "rw") COMMAND("osd pool get " "name=pool,type=CephPoolname " - "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio", + "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk", "get pool parameter ", "osd", "r") COMMAND("osd pool set " "name=pool,type=CephPoolname " - 
"name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio " + "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk " "name=val,type=CephString " "name=yes_i_really_mean_it,type=CephBool,req=false", "set pool parameter to ", "osd", "rw") diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index b563e375092a6..a75be6d4ca25c 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -5357,7 +5357,7 @@ namespace { CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM, PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO, PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM, - DEDUP_CDC_CHUNK_SIZE, POOL_EIO }; + DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK }; std::set subtract_second_from_first(const std::set& first, @@ -6092,6 +6092,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) {"dedup_tier", DEDUP_TIER}, {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM}, {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE}, + {"bulk", BULK} }; typedef std::set choices_set_t; @@ -6209,6 +6210,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) case HASHPSPOOL: case POOL_EIO: case NODELETE: + case BULK: case NOPGCHANGE: case NOSIZECHANGE: case WRITE_FADVISE_DONTNEED: @@ -6437,6 +6439,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) case HASHPSPOOL: case POOL_EIO: case NODELETE: + case BULK: case NOPGCHANGE: case NOSIZECHANGE: case WRITE_FADVISE_DONTNEED: @@ -7259,11 +7262,12 @@ int OSDMonitor::prepare_new_pool(MonOpRequestRef op) string erasure_code_profile; stringstream ss; string rule_name; + bool bulk = false; int ret = 0; ret = prepare_new_pool(m->name, m->crush_rule, rule_name, 0, 0, 0, 0, 0, 0.0, erasure_code_profile, - 
pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, + pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk, &ss); if (ret < 0) { @@ -7885,6 +7889,7 @@ int OSDMonitor::prepare_new_pool(string& name, const uint64_t expected_num_objects, FastReadType fast_read, const string& pg_autoscale_mode, + bool bulk, ostream *ss) { if (name.length() == 0) @@ -8005,6 +8010,11 @@ int OSDMonitor::prepare_new_pool(string& name, pi->type = pool_type; pi->fast_read = fread; pi->flags = g_conf()->osd_pool_default_flags; + if (bulk) { + pi->set_flag(pg_pool_t::FLAG_BULK); + } else if (g_conf()->osd_pool_default_flag_bulk) { + pi->set_flag(pg_pool_t::FLAG_BULK); + } if (g_conf()->osd_pool_default_flag_hashpspool) pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL); if (g_conf()->osd_pool_default_flag_nodelete) @@ -8438,7 +8448,7 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, p.crush_rule = id; } else if (var == "nodelete" || var == "nopgchange" || var == "nosizechange" || var == "write_fadvise_dontneed" || - var == "noscrub" || var == "nodeep-scrub") { + var == "noscrub" || var == "nodeep-scrub" || var == "bulk") { uint64_t flag = pg_pool_t::get_flag_by_name(var); // make sure we only compare against 'n' if we didn't receive a string if (val == "true" || (interr.empty() && n == 1)) { @@ -12880,6 +12890,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, string pg_autoscale_mode; cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode); + bool bulk = cmd_getval_or(cmdmap, "bulk", 0); err = prepare_new_pool(poolstr, -1, // default crush rule rule_name, @@ -12889,6 +12900,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, (uint64_t)expected_num_objects, fast_read, pg_autoscale_mode, + bulk, &ss); if (err < 0) { switch(err) { diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 7757119751638..aa789e2e26255 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -528,6 +528,7 @@ private: const uint64_t expected_num_objects, FastReadType fast_read, const std::string& pg_autoscale_mode, + bool bulk, std::ostream *ss); int prepare_new_pool(MonOpRequestRef op); diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 1168b6dc35600..d449543e204f5 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -4242,6 +4242,8 @@ int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid, pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE); if (cct->_conf->osd_pool_default_flag_nosizechange) pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE); + if (cct->_conf->osd_pool_default_flag_bulk) + pools[pool].set_flag(pg_pool_t::FLAG_BULK); pools[pool].size = cct->_conf.get_val("osd_pool_default_size"); pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size( pools[pool].size); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 65c0cf1f4109d..fcc3939ccde00 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1256,6 +1256,7 @@ struct pg_pool_t { FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps FLAG_CREATING = 1<<15, // initial pool PGs are being created FLAG_EIO = 1<<16, // return EIO for all client ops + FLAG_BULK = 1<<17, //pool is large }; static const char *get_flag_name(uint64_t f) { @@ -1277,6 +1278,7 @@ struct pg_pool_t { case FLAG_POOL_SNAPS: return "pool_snaps"; case FLAG_CREATING: return "creating"; case FLAG_EIO: return "eio"; + case FLAG_BULK: return "bulk"; default: return "???"; } } @@ -1329,6 +1331,8 @@ struct pg_pool_t { return FLAG_CREATING; if (name == "eio") return FLAG_EIO; + if (name == "bulk") + return FLAG_BULK; return 0; } 
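
Quick aside on the monitor-side hunks above (a sketch, not part of the patch, and not the actual C++): a new pool ends up with pg_pool_t::FLAG_BULK either because `--bulk` was passed to `ceph osd pool create` or because the new `osd_pool_default_flag_bulk` option is enabled, and the flag can be toggled afterwards with `ceph osd pool set <pool-name> bulk <true|false|1|0>` now that "bulk" is accepted as a boolean pool flag. The helper name below is made up for illustration.

FLAG_BULK = 1 << 17  # mirrors the new pg_pool_t::FLAG_BULK bit defined above


def new_pool_flags(cli_bulk: bool,
                   osd_pool_default_flags: int,
                   osd_pool_default_flag_bulk: bool) -> int:
    # Mirrors prepare_new_pool(): start from osd_pool_default_flags, then set
    # FLAG_BULK if --bulk was given, or fall back to the new config default.
    flags = osd_pool_default_flags
    if cli_bulk or osd_pool_default_flag_bulk:
        flags |= FLAG_BULK
    return flags


# `ceph osd pool create foo --bulk` with the config default left at false:
assert new_pool_flags(True, 0, False) & FLAG_BULK
# no --bulk on the command line, but osd_pool_default_flag_bulk is true:
assert new_pool_flags(False, 0, True) & FLAG_BULK
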
diff --git a/src/pybind/mgr/pg_autoscaler/module.py b/src/pybind/mgr/pg_autoscaler/module.py index 487531bb6a9ee..414afa7ce4954 100644 --- a/src/pybind/mgr/pg_autoscaler/module.py +++ b/src/pybind/mgr/pg_autoscaler/module.py @@ -31,7 +31,7 @@ if TYPE_CHECKING: else: from typing_extensions import Literal - ScaleModeT = Literal['scale-up', 'scale-down'] + PassT = Literal['first', 'second', 'third'] def nearest_power_of_two(n: int) -> int: @@ -126,17 +126,7 @@ class PgAutoscaler(MgrModule): name='sleep_interval', type='secs', default=60), - Option( - 'autoscale_profile', - default='scale-up', - type='str', - desc='pg_autoscale profiler', - long_desc=('Determines the behavior of the autoscaler algorithm, ' - '`scale-up` means that it starts out with minmum pgs ' - 'and scales up when there is pressure' - '`scale-down means start out with full pgs and scales' - 'down when there is pressure'), - runtime=True), + Option( name='threshold', type='float', @@ -156,7 +146,6 @@ class PgAutoscaler(MgrModule): # to just keep a copy of the pythonized version. self._osd_map = None if TYPE_CHECKING: - self.autoscale_profile: 'ScaleModeT' = 'scale-up' self.sleep_interval = 60 self.mon_target_pg_per_osd = 0 self.threshold = 3.0 @@ -173,10 +162,6 @@ class PgAutoscaler(MgrModule): self.get_module_option(opt['name'])) self.log.debug(' mgr option %s = %s', opt['name'], getattr(self, opt['name'])) - # if the profiler option is not set, this means it is an old cluster - autoscale_profile = self.get_module_option("autoscale_profile") - if not autoscale_profile: - self.set_module_option("autoscale_profile", "scale-up") @CLIReadCommand('osd pool autoscale-status') def _command_autoscale_status(self, format: str = 'plain') -> Tuple[int, str, str]: @@ -185,8 +170,7 @@ class PgAutoscaler(MgrModule): """ osdmap = self.get_osdmap() pools = osdmap.get_pools_by_name() - profile = self.autoscale_profile - ps, root_map = self._get_pool_status(osdmap, pools, profile) + ps, root_map = self._get_pool_status(osdmap, pools) if format in ('json', 'json-pretty'): return 0, json.dumps(ps, indent=4, sort_keys=True), '' @@ -199,7 +183,7 @@ class PgAutoscaler(MgrModule): 'PG_NUM', # 'IDEAL', 'NEW PG_NUM', 'AUTOSCALE', - 'PROFILE'], + 'BULK'], border=False) table.left_padding_width = 0 table.right_padding_width = 2 @@ -216,7 +200,7 @@ class PgAutoscaler(MgrModule): # table.align['IDEAL'] = 'r' table.align['NEW PG_NUM'] = 'r' table.align['AUTOSCALE'] = 'l' - table.align['PROFILE'] = 'l' + table.align['BULK'] = 'l' for p in ps: if p['would_adjust']: final = str(p['pg_num_final']) @@ -248,7 +232,7 @@ class PgAutoscaler(MgrModule): # p['pg_num_ideal'], final, p['pg_autoscale_mode'], - profile + str(p['bulk']) ]) return 0, table.get_string(), '' @@ -263,29 +247,6 @@ class PgAutoscaler(MgrModule): self.set_module_option("threshold", num) return 0, "threshold updated", "" - @CLIWriteCommand("osd pool set autoscale-profile scale-up") - def set_profile_scale_up(self) -> Tuple[int, str, str]: - """ - set the autoscaler behavior to start out with minimum pgs and scales up when there is pressure - """ - if self.autoscale_profile == "scale-up": - return 0, "", "autoscale-profile is already a scale-up!" 
- else: - self.set_module_option("autoscale_profile", "scale-up") - return 0, "", "autoscale-profile is now scale-up" - - @CLIWriteCommand("osd pool set autoscale-profile scale-down") - def set_profile_scale_down(self) -> Tuple[int, str, str]: - """ - set the autoscaler behavior to start out with full pgs and - scales down when there is pressure - """ - if self.autoscale_profile == "scale-down": - return 0, "", "autoscale-profile is already a scale-down!" - else: - self.set_module_option("autoscale_profile", "scale-down") - return 0, "", "autoscale-profile is now scale-down" - def serve(self) -> None: self.config_notify() while not self._shutdown.is_set(): @@ -393,73 +354,80 @@ class PgAutoscaler(MgrModule): root_map: Dict[int, CrushSubtreeResourceStatus], root_id: int, capacity_ratio: float, - even_pools: Dict[str, Dict[str, Any]], bias: float, - is_used: bool, - profile: 'ScaleModeT', + even_pools: Dict[str, Dict[str, Any]], + bulk_pools: Dict[str, Dict[str, Any]], + func_pass: 'PassT', + bulk: bool, ) -> Union[Tuple[float, int, int], Tuple[None, None, None]]: """ `profile` determines behaviour of the autoscaler. - `is_used` flag used to determine if this is the first + `first_pass` flag used to determine if this is the first pass where the caller tries to calculate/adjust pools that has used_ratio > even_ratio else this is the second pass, we calculate final_ratio by giving it 1 / pool_count of the root we are currently looking at. """ - if profile == "scale-up": - final_ratio = capacity_ratio - # So what proportion of pg allowance should we be using? - pg_target = root_map[root_id].pg_target - assert pg_target is not None - pool_pg_target = (final_ratio * pg_target) / p['size'] * bias - final_pg_target = max(p.get('options', {}).get('pg_num_min', PG_NUM_MIN), - nearest_power_of_two(pool_pg_target)) - - else: - if is_used: - pool_count = root_map[root_id].pool_count - assert pool_count is not None - even_ratio = 1 / pool_count - used_ratio = capacity_ratio - - if used_ratio > even_ratio: - root_map[root_id].pool_used += 1 - else: - # keep track of even_pools to be used in second pass - # of the caller function - even_pools[pool_name] = p - return None, None, None - - final_ratio = max(used_ratio, even_ratio) - pg_target = root_map[root_id].pg_target - assert pg_target is not None - used_pg = final_ratio * pg_target + if func_pass == 'first': + # first pass to deal with small pools (no bulk flag) + # calculating final_pg_target based on capacity ratio + # we also keep track of bulk_pools to be used in second pass + if not bulk: + final_ratio = capacity_ratio + pg_left = root_map[root_id].pg_left + assert pg_left is not None + used_pg = final_ratio * pg_left root_map[root_id].pg_left -= int(used_pg) + root_map[root_id].pool_used += 1 pool_pg_target = used_pg / p['size'] * bias - else: - pool_count = root_map[root_id].pool_count - assert pool_count is not None - final_ratio = 1 / (pool_count - root_map[root_id].pool_used) - pool_pg_target = (final_ratio * root_map[root_id].pg_left) / p['size'] * bias - - final_pg_target = max(p.get('options', {}).get('pg_num_min', PG_NUM_MIN), - nearest_power_of_two(pool_pg_target)) - - self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, " - "pg target {4} quantized to {5} (current {6})".format( - p['pool_name'], - root_id, - capacity_ratio, - bias, - pool_pg_target, - final_pg_target, - p['pg_num_target'] - )) + bulk_pools[pool_name] = p + return None, None, None + + elif func_pass == 'second': + # second pass we calculate the 
final_pg_target + # for pools that have used_ratio > even_ratio + # and we keep track of even pools to be used in third pass + pool_count = root_map[root_id].pool_count + assert pool_count is not None + even_ratio = 1 / (pool_count - root_map[root_id].pool_used) + used_ratio = capacity_ratio + + if used_ratio > even_ratio: + root_map[root_id].pool_used += 1 + else: + even_pools[pool_name] = p + return None, None, None + + final_ratio = max(used_ratio, even_ratio) + pg_left = root_map[root_id].pg_left + assert pg_left is not None + used_pg = final_ratio * pg_left + root_map[root_id].pg_left -= int(used_pg) + pool_pg_target = used_pg / p['size'] * bias + else: + # third pass we just split the pg_left to all even_pools + pool_count = root_map[root_id].pool_count + assert pool_count is not None + final_ratio = 1 / (pool_count - root_map[root_id].pool_used) + pool_pg_target = (final_ratio * root_map[root_id].pg_left) / p['size'] * bias + + final_pg_target = max(p.get('options', {}).get('pg_num_min', PG_NUM_MIN), + nearest_power_of_two(pool_pg_target)) + self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, " + "pg target {4} quantized to {5} (current {6})".format( + p['pool_name'], + root_id, + capacity_ratio, + bias, + pool_pg_target, + final_pg_target, + p['pg_num_target'] + )) return final_ratio, pool_pg_target, final_pg_target - def _calc_pool_targets( + def _get_pool_pg_targets( self, osdmap: OSDMap, pools: Dict[str, Dict[str, Any]], @@ -468,10 +436,9 @@ class PgAutoscaler(MgrModule): pool_stats: Dict[int, Dict[str, int]], ret: List[Dict[str, Any]], threshold: float, - is_used: bool, - profile: 'ScaleModeT', + func_pass: 'PassT', overlapped_roots: Set[int], - ) -> Tuple[List[Dict[str, Any]], Dict[str, Dict[str, Any]]]: + ) -> Tuple[List[Dict[str, Any]], Dict[str, Dict[str, Any]] , Dict[str, Dict[str, Any]]]: """ Calculates final_pg_target of each pools and determine if it needs scaling, this depends on the profile of the autoscaler. For scale-down, @@ -480,6 +447,7 @@ class PgAutoscaler(MgrModule): the minimal amount of pgs and only scale when there is increase in usage. """ even_pools: Dict[str, Dict[str, Any]] = {} + bulk_pools: Dict[str, Dict[str, Any]] = {} for pool_name, p in pools.items(): pool_id = p['pool'] if pool_id not in pool_stats: @@ -493,8 +461,8 @@ class PgAutoscaler(MgrModule): cr_name = crush_rule['rule_name'] root_id = crush_map.get_rule_root(cr_name) assert root_id is not None - if root_id in overlapped_roots and profile == "scale-down": - # for scale-down profile skip pools + if root_id in overlapped_roots: + # skip pools # with overlapping roots self.log.warn("pool %d contains an overlapping root %d" "... 
skipping scaling", pool_id, root_id) @@ -532,9 +500,17 @@ class PgAutoscaler(MgrModule): root_map[root_id].total_target_bytes, capacity) + # determine if the pool is a bulk + bulk = False + flags = p['flags_names'].split(",") + if "bulk" in flags: + bulk = True + capacity_ratio = max(capacity_ratio, target_ratio) final_ratio, pool_pg_target, final_pg_target = self._calc_final_pg_target( - p, pool_name, root_map, root_id, capacity_ratio, even_pools, bias, is_used, profile) + p, pool_name, root_map, root_id, + capacity_ratio, bias, even_pools, + bulk_pools, func_pass, bulk) if final_ratio is None: continue @@ -567,15 +543,15 @@ class PgAutoscaler(MgrModule): 'pg_num_final': final_pg_target, 'would_adjust': adjust, 'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0), + 'bulk': bulk, }) - return ret, even_pools + return ret, bulk_pools, even_pools def _get_pool_status( self, osdmap: OSDMap, pools: Dict[str, Dict[str, Any]], - profile: 'ScaleModeT', ) -> Tuple[List[Dict[str, Any]], Dict[int, CrushSubtreeResourceStatus]]: threshold = self.threshold @@ -589,19 +565,20 @@ class PgAutoscaler(MgrModule): ret: List[Dict[str, Any]] = [] # Iterate over all pools to determine how they should be sized. - # First call of _calc_pool_targets() is to find/adjust pools that uses more capacaity than + # First call of _get_pool_pg_targets() is to find/adjust pools that uses more capacaity than # the even_ratio of other pools and we adjust those first. # Second call make use of the even_pools we keep track of in the first call. # All we need to do is iterate over those and give them 1/pool_count of the # total pgs. - ret, even_pools = self._calc_pool_targets(osdmap, pools, crush_map, root_map, - pool_stats, ret, threshold, True, profile, overlapped_roots) + ret, bulk_pools, _ = self._get_pool_pg_targets(osdmap, pools, crush_map, root_map, + pool_stats, ret, threshold, 'first', overlapped_roots) + + ret, _, even_pools = self._get_pool_pg_targets(osdmap, bulk_pools, crush_map, root_map, + pool_stats, ret, threshold, 'second', overlapped_roots) - if profile == "scale-down": - # We only have adjust even_pools when we use scale-down profile - ret, _ = self._calc_pool_targets(osdmap, even_pools, crush_map, root_map, - pool_stats, ret, threshold, False, profile, overlapped_roots) + ret, _, _ = self._get_pool_pg_targets(osdmap, even_pools, crush_map, root_map, + pool_stats, ret, threshold, 'third', overlapped_roots) return (ret, root_map) @@ -624,8 +601,7 @@ class PgAutoscaler(MgrModule): if osdmap.get_require_osd_release() < 'nautilus': return pools = osdmap.get_pools_by_name() - profile = self.autoscale_profile - ps, root_map = self._get_pool_status(osdmap, pools, profile) + ps, root_map = self._get_pool_status(osdmap, pools) # Anyone in 'warn', set the health message for them and then # drop them from consideration. -- 2.39.5