]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/OSDMap: apply primary_affinity to mapping
author Sage Weil <sage@inktank.com>
Tue, 11 Feb 2014 17:25:04 +0000 (09:25 -0800)
committer Sage Weil <sage@inktank.com>
Sat, 15 Feb 2014 18:50:08 +0000 (10:50 -0800)
The behavior is a bit different for replicated and indep/erasure modes.
In the first case, we rearrange the result so that the chosen primary
comes first.  In the second case, the order is left alone and we just
set the primary argument to the right value.

Signed-off-by: Sage Weil <sage@inktank.com>
src/osd/OSDMap.cc
src/osd/OSDMap.h
src/test/osd/TestOSDMap.cc

index 4f621c4b0c44dca5b0495bf5c682901670187151..d101728bbe3bd531acc5060f1359a3b6b9bcabdd 100644 (file)
@@ -1398,7 +1398,60 @@ void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
     }
   }
 }
-  
+
+void OSDMap::_apply_primary_affinity(ps_t seed,
+                                    const pg_pool_t& pool,
+                                    vector<int> *osds,
+                                    int *primary) const
+{
+  // do we have any non-default primary_affinity values for these osds?
+  if (!osd_primary_affinity)
+    return;
+
+  bool any = false;
+  for (vector<int>::const_iterator p = osds->begin(); p != osds->end(); ++p) {
+    if ((*osd_primary_affinity)[*p] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+      any = true;
+    }
+  }
+  if (!any)
+    return;
+
+  // pick the primary.  feed both the seed (for the pg) and the osd
+  // into the hash/rng so that a proportional fraction of an osd's pgs
+  // get rejected as primary.
+  int pos = -1;
+  for (unsigned i = 0; i < osds->size(); ++i) {
+    int o = (*osds)[i];
+    if (o == CRUSH_ITEM_NONE)
+      continue;
+    unsigned a = (*osd_primary_affinity)[o];
+    if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
+       (crush_hash32_2(CRUSH_HASH_RJENKINS1,
+                       seed, o) >> 16) >= a) {
+      // we chose not to use this primary.  note it anyway as a
+      // fallback in case we don't pick anyone else, but keep looking.
+      if (pos < 0)
+       pos = i;
+    } else {
+      pos = i;
+      break;
+    }
+  }
+  if (pos < 0)
+    return;
+
+  *primary = (*osds)[pos];
+
+  if (pool.can_shift_osds() && pos > 0) {
+    // move the new primary to the front.
+    for (int i = pos; i > 0; --i) {
+      (*osds)[i] = (*osds)[i-1];
+    }
+    (*osds)[0] = *primary;
+  }
+}
+
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
                             vector<int> *temp_pg, int *temp_primary) const
 {
@@ -1442,8 +1495,10 @@ void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
     return;
   }
   vector<int> raw;
-  _pg_to_osds(*pool, pg, &raw, primary);
+  ps_t pps;
+  _pg_to_osds(*pool, pg, &raw, primary, &pps);
   _raw_to_up_osds(*pool, raw, up, primary);
+  _apply_primary_affinity(pps, *pool, up, primary);
 }
   
 void OSDMap::_pg_to_up_acting_osds(pg_t pg, vector<int> *up, int *up_primary,
@@ -1466,8 +1521,10 @@ void OSDMap::_pg_to_up_acting_osds(pg_t pg, vector<int> *up, int *up_primary,
   vector<int> _acting;
   int _up_primary;
   int _acting_primary;
-  _pg_to_osds(*pool, pg, &raw, &_up_primary);
+  ps_t pps;
+  _pg_to_osds(*pool, pg, &raw, &_up_primary, &pps);
   _raw_to_up_osds(*pool, raw, &_up, &_up_primary);
+  _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
   _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
   if (_acting.empty())
     _acting = _up;
index 4b4481c88d1ec1b97c70b016ad946ccac325e097..a405526c6961ff41b454854b7e0b544e57f50e39 100644 (file)
@@ -559,6 +559,9 @@ private:
                  ps_t *ppps) const;
   void _remove_nonexistent_osds(const pg_pool_t& pool, vector<int>& osds) const;
 
+  void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool,
+                              vector<int> *osds, int *primary) const;
+
   /// pg -> (up osd list)
   void _raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
                        vector<int> *up, int *primary) const;
index ee5e9e3eed935d7000653d82ac92edddce6c08b4..0ff12c80be721d9c9415926458f0675d54b6e430 100644 (file)
@@ -49,8 +49,35 @@ public:
       pending_inc.new_uuid[i] = sample_uuid;
     }
     osdmap.apply_incremental(pending_inc);
+
+    // kludge to get an erasure coding rule and pool
+    int r = osdmap.crush->add_simple_ruleset("erasure", "default", "osd",
+                                            "indep", pg_pool_t::TYPE_ERASURE,
+                                            &cerr);
+    pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(2);
+    p->type = pg_pool_t::TYPE_ERASURE;
+    p->crush_ruleset = r;
   }
   unsigned int get_num_osds() { return num_osds; }
+
+  void test_mappings(int pool,
+                    int num,
+                    vector<int> *any,
+                    vector<int> *first,
+                    vector<int> *primary) {
+    for (int i=0; i<num; ++i) {
+      vector<int> o;
+      int p;
+      pg_t pgid(i, pool);
+      osdmap.pg_to_acting_osds(pgid, &o, &p);
+      for (unsigned j=0; j<o.size(); ++j)
+       (*any)[o[j]]++;
+      if (!o.empty())
+       (*first)[o[0]]++;
+      if (p >= 0)
+       (*primary)[p]++;
+    }
+  }
 };
 
 TEST_F(OSDMapTest, Create) {
@@ -237,3 +264,85 @@ TEST_F(OSDMapTest, KeepsNecessaryTemps) {
   EXPECT_FALSE(pending_inc.new_pg_temp.count(pgid));
   EXPECT_FALSE(pending_inc.new_primary_temp.count(pgid));
 }
+
+TEST_F(OSDMapTest, PrimaryAffinity) {
+  set_up_map();
+
+  /*
+  osdmap.print(cout);
+  Formatter *f = new_formatter("json-pretty");
+  f->open_object_section("CRUSH");
+  osdmap.crush->dump(f);
+  f->close_section();
+  f->flush(cout);
+  delete f;
+  */
+
+  int n = get_num_osds();
+  for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
+       p != osdmap.get_pools().end();
+       ++p) {
+    int pool = p->first;
+    cout << "pool " << pool << std::endl;
+    {
+      vector<int> any(n, 0);
+      vector<int> first(n, 0);
+      vector<int> primary(n, 0);
+      test_mappings(0, 10000, &any, &first, &primary);
+      for (int i=0; i<n; ++i) {
+       //cout << "osd." << i << " " << any[i] << " " << first[i] << " " << primary[i] << std::endl;
+       ASSERT_LT(0, any[i]);
+       ASSERT_LT(0, first[i]);
+       ASSERT_LT(0, primary[i]);
+      }
+    }
+
+    osdmap.set_primary_affinity(0, 0);
+    osdmap.set_primary_affinity(1, 0);
+    {
+      vector<int> any(n, 0);
+      vector<int> first(n, 0);
+      vector<int> primary(n, 0);
+      test_mappings(pool, 10000, &any, &first, &primary);
+      for (int i=0; i<n; ++i) {
+       //cout << "osd." << i << " " << any[i] << " " << first[i] << " " << primary[i] << std::endl;
+       ASSERT_LT(0, any[i]);
+       if (i >= 2) {
+         ASSERT_LT(0, first[i]);
+         ASSERT_LT(0, primary[i]);
+       } else {
+         if (p->second.is_replicated())
+           ASSERT_EQ(0, first[i]);
+         ASSERT_EQ(0, primary[i]);
+       }
+      }
+    }
+
+    osdmap.set_primary_affinity(0, 0x8000);
+    osdmap.set_primary_affinity(1, 0);
+    {
+      vector<int> any(n, 0);
+      vector<int> first(n, 0);
+      vector<int> primary(n, 0);
+      test_mappings(pool, 10000, &any, &first, &primary);
+      for (int i=0; i<n; ++i) {
+       //cout << "osd." << i << " " << any[i] << " " << first[i] << " " << primary[i] << std::endl;
+       ASSERT_LT(0, any[i]);
+       if (i >= 2) {
+         ASSERT_LT(0, first[i]);
+         ASSERT_LT(0, primary[i]);
+       } else if (i == 1) {
+         if (p->second.is_replicated())
+           ASSERT_EQ(0, first[i]);
+         ASSERT_EQ(0, primary[i]);
+       } else {
+         ASSERT_LT(10000/6/4, primary[0]);
+         ASSERT_GT(10000/6/4*3, primary[0]);
+       }
+      }
+    }
+
+    osdmap.set_primary_affinity(0, 0x10000);
+    osdmap.set_primary_affinity(1, 0x10000);
+  }
+}