From: Sage Weil Date: Mon, 13 Mar 2017 15:58:22 +0000 (-0400) Subject: osd/OSDMap: add pg_remap and pg_remap_items X-Git-Tag: v12.0.2~280^2~10 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=27d6f4373bafa24450f6dbb4e4252c2d9c2c1448;p=ceph.git osd/OSDMap: add pg_remap and pg_remap_items The first simply forces a pg to map to a particular value. The second replaces specific OSDs with specific other OSDs in the CRUSH mapping. Signed-off-by: Sage Weil --- diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index bf264b2997b9..68a55d05d3ac 100755 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -100,6 +100,7 @@ DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL) DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS) DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap +DEFINE_CEPH_FEATURE(21, 2, OSDMAP_REMAP) // overlap DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS) DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH) diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 16945a289552..1032e6d0c243 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -415,7 +415,11 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const ENCODE_START(8, 7, bl); { - ENCODE_START(3, 1, bl); // client-usable data + uint8_t v = 4; + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 3; + } + ENCODE_START(v, 1, bl); // client-usable data ::encode(fsid, bl); ::encode(epoch, bl); ::encode(modified, bl); @@ -436,6 +440,12 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const ::encode(new_primary_affinity, bl); ::encode(new_erasure_code_profiles, bl); ::encode(old_erasure_code_profiles, bl); + if (v >= 4) { + ::encode(new_pg_remap, bl); + ::encode(old_pg_remap, bl); + ::encode(new_pg_remap_items, bl); + ::encode(old_pg_remap_items, bl); + } ENCODE_FINISH(bl); // client-usable data } @@ -605,7 +615,7 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl) return; } { - DECODE_START(3, bl); // client-usable data + DECODE_START(4, bl); // client-usable data ::decode(fsid, bl); ::decode(epoch, bl); ::decode(modified, bl); @@ -634,6 +644,12 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl) new_erasure_code_profiles.clear(); old_erasure_code_profiles.clear(); } + if (struct_v >= 4) { + ::decode(new_pg_remap, bl); + ::decode(old_pg_remap, bl); + ::decode(new_pg_remap_items, bl); + ::decode(old_pg_remap_items, bl); + } DECODE_FINISH(bl); // client-usable data } @@ -806,6 +822,45 @@ void OSDMap::Incremental::dump(Formatter *f) const } f->close_section(); // primary_temp + f->open_array_section("new_pg_remap"); + for (auto& i : new_pg_remap) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << i.first; + f->open_array_section("osds"); + for (auto osd : i.second) { + f->dump_int("osd", osd); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pg_remap"); + for (auto& i : old_pg_remap) { + f->dump_stream("pgid") << i; + } + f->close_section(); + + f->open_array_section("new_pg_remap_items"); + for (auto& i : new_pg_remap_items) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << i.first; + f->open_array_section("mappings"); + for (auto& p : i.second) { + f->open_object_section("mapping"); + f->dump_int("from", p.first); + f->dump_int("to", p.second); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pg_remap_items"); + for (auto& i : old_pg_remap_items) { + f->dump_stream("pgid") << i; + } + f->close_section(); + f->open_array_section("new_up_thru"); for (map::const_iterator p = new_up_thru.begin(); p != new_up_thru.end(); ++p) { f->open_object_section("osd"); @@ -1073,6 +1128,10 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const features |= CEPH_FEATURE_CRUSH_TUNABLES5; mask |= CEPH_FEATURES_CRUSH; + if (!pg_remap.empty() || !pg_remap_items.empty()) + features |= CEPH_FEATUREMASK_OSDMAP_REMAP; + mask |= CEPH_FEATUREMASK_OSDMAP_REMAP; + for (auto p = pools.begin(); p != pools.end(); ++p) { if (p->second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) { features |= CEPH_FEATURE_OSDHASHPSPOOL; @@ -1481,6 +1540,19 @@ int OSDMap::apply_incremental(const Incremental &inc) (*primary_temp)[p->first] = p->second; } + for (auto& p : inc.new_pg_remap) { + pg_remap[p.first] = p.second; + } + for (auto& pg : inc.old_pg_remap) { + pg_remap.erase(pg); + } + for (auto& p : inc.new_pg_remap_items) { + pg_remap_items[p.first] = p.second; + } + for (auto& pg : inc.old_pg_remap_items) { + pg_remap_items.erase(pg); + } + // blacklist if (!inc.new_blacklist.empty()) { blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end()); @@ -1622,6 +1694,40 @@ int OSDMap::_pick_primary(const vector& osds) const return -1; } +void OSDMap::_apply_remap(const pg_pool_t& pi, pg_t raw_pg, vector *raw) const +{ + pg_t pg = pi.raw_pg_to_pg(raw_pg); + auto p = pg_remap.find(pg); + if (p != pg_remap.end()) { + *raw = p->second; + return; + } + + auto q = pg_remap_items.find(pg); + if (q != pg_remap_items.end()) { + // NOTE: this approach does not allow a bidirectional swap, + // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. + for (auto& r : q->second) { + // make sure the replacement value doesn't already appear + bool exists = false; + ssize_t pos = -1; + for (unsigned i = 0; i < raw->size(); ++i) { + if ((*raw)[i] == r.second) { + exists = true; + break; + } + if ((*raw)[i] == r.first && pos < 0) { + pos = i; + } + } + if (!exists && pos >= 0) { + (*raw)[pos] = r.second; + return; + } + } + } +} + // pg -> (up osd list) void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector& raw, vector *up) const @@ -1762,6 +1868,7 @@ void OSDMap::pg_to_raw_up(pg_t pg, vector *up, int *primary) const vector raw; ps_t pps; _pg_to_raw_osds(*pool, pg, &raw, &pps); + _apply_remap(*pool, pg, &raw); _raw_to_up_osds(*pool, raw, up); *primary = _pick_primary(raw); _apply_primary_affinity(pps, *pool, up, primary); @@ -1794,6 +1901,7 @@ void OSDMap::_pg_to_up_acting_osds( _get_temp_osds(*pool, pg, &_acting, &_acting_primary); if (_acting.empty() || up || up_primary) { _pg_to_raw_osds(*pool, pg, &raw, &pps); + _apply_remap(*pool, pg, &raw); _raw_to_up_osds(*pool, raw, &_up); _up_primary = _pick_primary(_up); _apply_primary_affinity(pps, *pool, &_up, &_up_primary); @@ -1980,7 +2088,11 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const ENCODE_START(8, 7, bl); { - ENCODE_START(3, 1, bl); // client-usable data + uint8_t v = 4; + if (!HAVE_FEATURE(features, OSDMAP_REMAP)) { + v = 3; + } + ENCODE_START(v, 1, bl); // client-usable data // base ::encode(fsid, bl); ::encode(epoch, bl); @@ -2012,6 +2124,14 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const crush->encode(cbl, features); ::encode(cbl, bl); ::encode(erasure_code_profiles, bl); + + if (v >= 4) { + ::encode(pg_remap, bl); + ::encode(pg_remap_items, bl); + } else { + assert(pg_remap.empty()); + assert(pg_remap_items.empty()); + } ENCODE_FINISH(bl); // client-usable data } @@ -2208,7 +2328,7 @@ void OSDMap::decode(bufferlist::iterator& bl) * Since we made it past that hurdle, we can use our normal paths. */ { - DECODE_START(3, bl); // client-usable data + DECODE_START(4, bl); // client-usable data // base ::decode(fsid, bl); ::decode(epoch, bl); @@ -2247,6 +2367,13 @@ void OSDMap::decode(bufferlist::iterator& bl) } else { erasure_code_profiles.clear(); } + if (struct_v >= 4) { + ::decode(pg_remap, bl); + ::decode(pg_remap_items, bl); + } else { + pg_remap.clear(); + pg_remap_items.clear(); + } DECODE_FINISH(bl); // client-usable data } @@ -2398,6 +2525,33 @@ void OSDMap::dump(Formatter *f) const } f->close_section(); + f->open_array_section("pg_remap"); + for (auto& p : pg_remap) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << p.first; + f->open_array_section("osds"); + for (auto q : p.second) { + f->dump_int("osd", q); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("pg_remap_items"); + for (auto& p : pg_remap_items) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << p.first; + f->open_array_section("mappings"); + for (auto& q : p.second) { + f->open_object_section("mapping"); + f->dump_int("from", q.first); + f->dump_int("to", q.second); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); f->open_array_section("pg_temp"); for (map >::const_iterator p = pg_temp->begin(); p != pg_temp->end(); @@ -2565,6 +2719,13 @@ void OSDMap::print(ostream& out) const } out << std::endl; + for (auto& p : pg_remap) { + out << "pg_remap " << p.first << " " << p.second << "\n"; + } + for (auto& p : pg_remap_items) { + out << "pg_remap_items " << p.first << " " << p.second << "\n"; + } + for (map >::const_iterator p = pg_temp->begin(); p != pg_temp->end(); ++p) diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 21bcacc0750c..7c17ecad2e44 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -148,6 +148,10 @@ public: map new_hb_back_up; map new_hb_front_up; + map> new_pg_remap; + map>> new_pg_remap_items; + set old_pg_remap, old_pg_remap_items; + string cluster_snapshot; float new_nearfull_ratio = -1; @@ -232,6 +236,10 @@ private: ceph::shared_ptr< map > primary_temp; // temp primary mapping (e.g. while we rebuild) ceph::shared_ptr< vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline + // remap (post-CRUSH, pre-up) + map> pg_remap; ///< remap pg + map>> pg_remap_items; ///< remap osds in up set + map pools; map pool_name; map > erasure_code_profiles; @@ -655,6 +663,9 @@ private: void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool, vector *osds, int *primary) const; + /// apply pg_remap[_items] mappings + void _apply_remap(const pg_pool_t& pi, pg_t pg, vector *raw) const; + /// pg -> (up osd list) void _raw_to_up_osds(const pg_pool_t& pool, const vector& raw, vector *up) const;