osd: introduce HASHPSPOOL pool flag, feature to avoid overlapping pg placements
author Sage Weil <sage@inktank.com>
Mon, 18 Feb 2013 07:23:27 +0000 (23:23 -0800)
committer Sage Weil <sage@inktank.com>
Tue, 19 Feb 2013 23:59:00 +0000 (15:59 -0800)
The existing code will overlap the placement of PGs from different pools
because it simply adds the ps to the pool id as the CRUSH input.  That
means that the layout/placement for pg 0.10 == 1.9 == 2.8 == 3.7 ==
4.6 == ..., which is not optimal.
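
To make the aliasing concrete, here is a minimal standalone sketch of the
legacy mapping (illustration only, not the tree's actual code): a copy of
ceph_stable_mod is inlined, and pgp_num = 16 is an arbitrary example value.

#include <cstdio>

static int ceph_stable_mod(int x, int b, int bmask) {
  // collapse x into [0, b) in a way that is stable as b grows
  return ((x & bmask) < b) ? (x & bmask) : (x & (bmask >> 1));
}

int main() {
  int pgp_num = 16, pgp_num_mask = 15;
  for (int pool = 0; pool <= 4; pool++) {
    int ps = 10 - pool;  // pg 0.10, 1.9, 2.8, 3.7, 4.6
    int pps = ceph_stable_mod(ps, pgp_num, pgp_num_mask) + pool;
    printf("pg %d.%d -> pps %d\n", pool, ps, pps);  // every line prints pps 10
  }
}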

Instead, use hash(ps, poolid).  This avoids the initial problem of
the sequence being adjacent to other pools.  It also avoids the (small)
possibility that hash(poolid) would drop us somewhere in the output
number space where our sequence of outputs overlaps with some other
pool; instead, our output sequence will be fully random (for a well-
behaved hash).

Use the multi-input hash functions used by CRUSH for this.
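
A companion sketch of the hashed mapping: the commit itself calls
crush_hash32_2(CRUSH_HASH_RJENKINS1, ...) from crush/hash.h, while the
mix32() below is only a stand-in mixer so the example compiles on its own.

#include <cstdint>
#include <cstdio>

static int ceph_stable_mod(int x, int b, int bmask) {
  return ((x & bmask) < b) ? (x & bmask) : (x & (bmask >> 1));
}

// stand-in for crush_hash32_2; NOT the rjenkins1 mix CRUSH really uses
static uint32_t mix32(uint32_t a, uint32_t b) {
  uint32_t h = a * 0x9e3779b1u ^ b * 0x85ebca6bu;
  h ^= h >> 16;
  h *= 0x7feb352du;
  h ^= h >> 15;
  return h;
}

int main() {
  int pgp_num = 16, pgp_num_mask = 15;
  for (int pool = 0; pool <= 4; pool++) {
    int ps = 10 - pool;  // the pgs that collided at pps 10 above
    uint32_t pps = mix32(ceph_stable_mod(ps, pgp_num, pgp_num_mask), pool);
    printf("pg %d.%d -> pps %u\n", pool, ps, pps);  // now spread apart
  }
}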

Default to the legacy behavior for now.  We won't enable this until
deployed systems and kernel code catch up.

Fixes: #4128
Signed-off-by: Sage Weil <sage@inktank.com>
src/common/config_opts.h
src/include/ceph_features.h
src/mon/OSDMonitor.cc
src/osd/OSDMap.cc
src/osd/osd_types.cc
src/osd/osd_types.h

index 3963b31aff98a5ddc7a9c39c94ecd1889507fadd..2fb52a8762279a51e3678906eaf394cb37988148 100644 (file)
@@ -321,6 +321,7 @@ OPTION(osd_pool_default_size, OPT_INT, 2)
 OPTION(osd_pool_default_min_size, OPT_INT, 0)  // 0 means no specific default; ceph will use size-size/2
 OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf
 OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. Should be equal to pg_num
+OPTION(osd_pool_default_flags, OPT_INT, 0)   // default flags for new pools
 OPTION(osd_map_dedup, OPT_BOOL, true)
 OPTION(osd_map_cache_size, OPT_INT, 500)
 OPTION(osd_map_message_max, OPT_INT, 100)  // max maps per MOSDMap message
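
For illustration only, the new option could then be pre-set at pool-creation
time via ceph.conf; the value 1 assumes the FLAG_HASHPSPOOL bit defined in
osd_types.h later in this commit:

[global]
    # hypothetical: pre-set FLAG_HASHPSPOOL (bit value 1, defined in
    # osd_types.h below) on every newly created pool
    osd pool default flags = 1
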
index c9ff72c15f97390d9ad6fcbb2651a1ea6e9a813d..0aa8dc158a2dd9517c8d22908b4b3a1ce5cf18ac 100644 (file)
@@ -34,6 +34,7 @@
 #define CEPH_FEATURE_REPLY_CREATE_INODE   (1<<27)
 #define CEPH_FEATURE_OSD_HBMSGS     (1<<28)
 #define CEPH_FEATURE_MDSENC         (1<<29)
+#define CEPH_FEATURE_OSDHASHPSPOOL  (1<<30)
 
 /*
  * Features supported.  Should be everything above.
@@ -67,8 +68,9 @@
         CEPH_FEATURE_CRUSH_TUNABLES2 |      \
         CEPH_FEATURE_CREATEPOOLID |         \
         CEPH_FEATURE_REPLY_CREATE_INODE |   \
-        CEPH_FEATURE_OSD_HBMSGS | \
-        CEPH_FEATURE_MDSENC)
+        CEPH_FEATURE_OSD_HBMSGS |              \
+        CEPH_FEATURE_MDSENC |                  \
+        CEPH_FEATURE_OSDHASHPSPOOL)
 
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
 
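A minimal sketch of why the feature bit matters: a peer that does not
advertise bit 30 cannot compute HASHPSPOOL placements, so the server must
stick to legacy behavior for it (peer_features below is a made-up value).

#include <cstdint>
#include <cstdio>

#define CEPH_FEATURE_OSDHASHPSPOOL  (1u<<30)

int main() {
  uint64_t peer_features = (1ull << 30) - 1;  // bits 0..29 set, bit 30 clear
  if (!(peer_features & CEPH_FEATURE_OSDHASHPSPOOL))
    printf("peer cannot decode HASHPSPOOL placement; keep legacy behavior\n");
}
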
index 0198c29e45b377810c757b4f9d8a5d65878053f0..5a7dceac753eeea7aaa62220b8bff32cc9c50d62 100644 (file)
@@ -2076,6 +2076,7 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
     pending_inc.new_pool_max = osdmap.pool_max;
   int64_t pool = ++pending_inc.new_pool_max;
   pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP;
+  pending_inc.new_pools[pool].flags = g_conf->osd_pool_default_flags;
 
   pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size;
   pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size();
index 6b692d407a8ffdb7ceaa4697e531729b316effe7..a0a3d1247ba2ea79b15c7087dbb2a26a32b59177 100644 (file)
@@ -1690,6 +1690,7 @@ void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
   for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
     int64_t pool = ++pool_max;
     pools[pool].type = pg_pool_t::TYPE_REP;
+    pools[pool].flags = cct->_conf->osd_pool_default_flags;
     pools[pool].size = cct->_conf->osd_pool_default_size;
     pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
     pools[pool].crush_ruleset = p->first;
@@ -1814,6 +1815,7 @@ int OSDMap::build_simple_from_conf(CephContext *cct, epoch_t e, uuid_d &fsid,
   for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
     int64_t pool = ++pool_max;
     pools[pool].type = pg_pool_t::TYPE_REP;
+    pools[pool].flags = cct->_conf->osd_pool_default_flags;
     pools[pool].size = cct->_conf->osd_pool_default_size;
     pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
     pools[pool].crush_ruleset = p->first;
index c3827a4680b93f9acae7c9c025d5be9a2d803a5d..b1046c9aec5fc8b8f5c5a99767ca937189490cd7 100644 (file)
@@ -14,6 +14,9 @@
 
 #include "osd_types.h"
 #include "include/ceph_features.h"
+extern "C" {
+#include "crush/hash.h"
+}
 #include "PG.h"
 #include "OSDMap.h"
 
@@ -678,7 +681,20 @@ pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
  */
 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
 {
-  return ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + pg.pool();
+  if (flags & FLAG_HASHPSPOOL) {
+    // Hash the pool id so that pool PGs do not overlap.
+    return
+      crush_hash32_2(CRUSH_HASH_RJENKINS1,
+                    ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
+                    pg.pool());
+  } else {
+    // Legacy behavior; add ps and pool together.  This is not a great
+    // idea because the PGs from each pool will essentially overlap on
+    // top of each other: 0.5 == 1.4 == 2.3 == ...
+    return
+      ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
+      pg.pool();
+  }
 }
 
 void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
index 558c10ff27bee43837770069f60789c1233c8e84..ff8c2c5219e30d24bbf44aca6ef80d3424b51bd5 100644 (file)
@@ -620,6 +620,9 @@ struct pg_pool_t {
     TYPE_REP = 1,     // replication
     TYPE_RAID4 = 2,   // raid4 (never implemented)
   };
+  enum {
+    FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
+  };
 
   static const char *get_type_name(int t) {
     switch (t) {