From: Sage Weil Date: Tue, 11 Feb 2014 17:25:04 +0000 (-0800) Subject: osd/OSDMap: apply primary_affinity to mapping X-Git-Tag: v0.78~173^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8ecec02fc1a2e3112f331c5af7202ee384ecbead;p=ceph.git osd/OSDMap: apply primary_affinity to mapping The behavior is a bit different for replicated and indep/erasure mode. In the first case, we are rearranging the result. In the second case, we can just set the primary argument to the right value. Signed-off-by: Sage Weil --- diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 4f621c4b0c44..d101728bbe3b 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1398,7 +1398,60 @@ void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector& raw, } } } - + +void OSDMap::_apply_primary_affinity(ps_t seed, + const pg_pool_t& pool, + vector *osds, + int *primary) const +{ + // do we have any non-default primary_affinity values for these osds? + if (!osd_primary_affinity) + return; + + bool any = false; + for (vector::const_iterator p = osds->begin(); p != osds->end(); ++p) { + if ((*osd_primary_affinity)[*p] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + any = true; + } + } + if (!any) + return; + + // pick the primary. feed both the seed (for the pg) and the osd + // into the hash/rng so that a proportional fraction of an osd's pgs + // get rejected as primary. + int pos = -1; + for (unsigned i = 0; i < osds->size(); ++i) { + int o = (*osds)[i]; + if (o == CRUSH_ITEM_NONE) + continue; + unsigned a = (*osd_primary_affinity)[o]; + if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY && + (crush_hash32_2(CRUSH_HASH_RJENKINS1, + seed, o) >> 16) >= a) { + // we chose not to use this primary. note it anyway as a + // fallback in case we don't pick anyone else, but keep looking. + if (pos < 0) + pos = i; + } else { + pos = i; + break; + } + } + if (pos < 0) + return; + + *primary = (*osds)[pos]; + + if (pool.can_shift_osds() && pos > 0) { + // move the new primary to the front. + for (int i = pos; i > 0; --i) { + (*osds)[i] = (*osds)[i-1]; + } + (*osds)[0] = *primary; + } +} + void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg, vector *temp_pg, int *temp_primary) const { @@ -1442,8 +1495,10 @@ void OSDMap::pg_to_raw_up(pg_t pg, vector *up, int *primary) const return; } vector raw; - _pg_to_osds(*pool, pg, &raw, primary); + ps_t pps; + _pg_to_osds(*pool, pg, &raw, primary, &pps); _raw_to_up_osds(*pool, raw, up, primary); + _apply_primary_affinity(pps, *pool, up, primary); } void OSDMap::_pg_to_up_acting_osds(pg_t pg, vector *up, int *up_primary, @@ -1466,8 +1521,10 @@ void OSDMap::_pg_to_up_acting_osds(pg_t pg, vector *up, int *up_primary, vector _acting; int _up_primary; int _acting_primary; - _pg_to_osds(*pool, pg, &raw, &_up_primary); + ps_t pps; + _pg_to_osds(*pool, pg, &raw, &_up_primary, &pps); _raw_to_up_osds(*pool, raw, &_up, &_up_primary); + _apply_primary_affinity(pps, *pool, &_up, &_up_primary); _get_temp_osds(*pool, pg, &_acting, &_acting_primary); if (_acting.empty()) _acting = _up; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 4b4481c88d1e..a405526c6961 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -559,6 +559,9 @@ private: ps_t *ppps) const; void _remove_nonexistent_osds(const pg_pool_t& pool, vector& osds) const; + void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool, + vector *osds, int *primary) const; + /// pg -> (up osd list) void _raw_to_up_osds(const pg_pool_t& pool, const vector& raw, vector *up, int *primary) const; diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc index ee5e9e3eed93..0ff12c80be72 100644 --- a/src/test/osd/TestOSDMap.cc +++ b/src/test/osd/TestOSDMap.cc @@ -49,8 +49,35 @@ public: pending_inc.new_uuid[i] = sample_uuid; } osdmap.apply_incremental(pending_inc); + + // kludge to get an erasure coding rule and pool + int r = osdmap.crush->add_simple_ruleset("erasure", "default", "osd", + "indep", pg_pool_t::TYPE_ERASURE, + &cerr); + pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(2); + p->type = pg_pool_t::TYPE_ERASURE; + p->crush_ruleset = r; } unsigned int get_num_osds() { return num_osds; } + + void test_mappings(int pool, + int num, + vector *any, + vector *first, + vector *primary) { + for (int i=0; i o; + int p; + pg_t pgid(i, pool); + osdmap.pg_to_acting_osds(pgid, &o, &p); + for (unsigned j=0; j= 0) + (*primary)[p]++; + } + } }; TEST_F(OSDMapTest, Create) { @@ -237,3 +264,85 @@ TEST_F(OSDMapTest, KeepsNecessaryTemps) { EXPECT_FALSE(pending_inc.new_pg_temp.count(pgid)); EXPECT_FALSE(pending_inc.new_primary_temp.count(pgid)); } + +TEST_F(OSDMapTest, PrimaryAffinity) { + set_up_map(); + + /* + osdmap.print(cout); + Formatter *f = new_formatter("json-pretty"); + f->open_object_section("CRUSH"); + osdmap.crush->dump(f); + f->close_section(); + f->flush(cout); + delete f; + */ + + int n = get_num_osds(); + for (map::const_iterator p = osdmap.get_pools().begin(); + p != osdmap.get_pools().end(); + ++p) { + int pool = p->first; + cout << "pool " << pool << std::endl; + { + vector any(n, 0); + vector first(n, 0); + vector primary(n, 0); + test_mappings(0, 10000, &any, &first, &primary); + for (int i=0; i any(n, 0); + vector first(n, 0); + vector primary(n, 0); + test_mappings(pool, 10000, &any, &first, &primary); + for (int i=0; i= 2) { + ASSERT_LT(0, first[i]); + ASSERT_LT(0, primary[i]); + } else { + if (p->second.is_replicated()) + ASSERT_EQ(0, first[i]); + ASSERT_EQ(0, primary[i]); + } + } + } + + osdmap.set_primary_affinity(0, 0x8000); + osdmap.set_primary_affinity(1, 0); + { + vector any(n, 0); + vector first(n, 0); + vector primary(n, 0); + test_mappings(pool, 10000, &any, &first, &primary); + for (int i=0; i= 2) { + ASSERT_LT(0, first[i]); + ASSERT_LT(0, primary[i]); + } else if (i == 1) { + if (p->second.is_replicated()) + ASSERT_EQ(0, first[i]); + ASSERT_EQ(0, primary[i]); + } else { + ASSERT_LT(10000/6/4, primary[0]); + ASSERT_GT(10000/6/4*3, primary[0]); + } + } + } + + osdmap.set_primary_affinity(0, 0x10000); + osdmap.set_primary_affinity(1, 0x10000); + } +}