From 3ced5e7de243edeccfd20a90ec2034206c920795 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Fri, 12 Feb 2010 13:21:22 -0800 Subject: [PATCH] osd: Deal with pools being removed from OSDMap. This potentially has issues, since pools are not removed from the map until after all the PGs are removed (which is threaded, not inline with map delivery). But Sage thinks it's okay and the system keeps working even if you delete a pool while benchmarking on it with rados. --- src/include/rados.h | 4 ++-- src/mon/OSDMonitor.cc | 22 ++++++++-------------- src/osd/OSD.cc | 38 +++++++++++++++++++++++++++----------- src/osd/OSD.h | 1 + src/osd/OSDMap.cc | 2 ++ src/osd/OSDMap.h | 12 +++++++++++- 6 files changed, 51 insertions(+), 28 deletions(-) diff --git a/src/include/rados.h b/src/include/rados.h index 1f4c786405419..26ac8b89a6767 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -11,8 +11,8 @@ /* * osdmap encoding versions */ -#define CEPH_OSDMAP_INC_VERSION 3 -#define CEPH_OSDMAP_VERSION 3 +#define CEPH_OSDMAP_INC_VERSION 4 +#define CEPH_OSDMAP_VERSION 4 /* * fs id diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index c4c3552f18633..424cb6092749f 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -130,8 +130,9 @@ bool OSDMonitor::update_from_paxos() void OSDMonitor::create_pending() { - pending_inc = OSDMap::Incremental(osdmap.epoch+1); + pending_inc = OSDMap::Incremental(osdmap.highest_pool_num, osdmap.epoch+1); pending_inc.fsid = mon->monmap->fsid; + pending_inc.highest_pool_num_new = osdmap.highest_pool_num; dout(10) << "create_pending e " << pending_inc.epoch << dendl; } @@ -1005,18 +1006,12 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) int OSDMonitor::prepare_new_pool(string& name) { - int pool = 1; - int err = 0; - for (map::iterator i = osdmap.pool_name.begin(); - i != osdmap.pool_name.end(); - i++) { - if (i->second == name) { - err = -EEXIST; - goto out; - } - if (i->first >= pool) - pool = i->first + 1; + if (osdmap.name_pool.count(name)) { + return -EEXIST; } + if (-1 == pending_inc.highest_pool_num_new) + pending_inc.highest_pool_num_new = osdmap.highest_pool_num; + int pool = ++pending_inc.highest_pool_num_new; pending_inc.new_pools[pool].v.type = CEPH_PG_TYPE_REP; pending_inc.new_pools[pool].v.size = 2; pending_inc.new_pools[pool].v.crush_ruleset = 0; @@ -1026,8 +1021,7 @@ int OSDMonitor::prepare_new_pool(string& name) pending_inc.new_pools[pool].v.lpgp_num = 0; pending_inc.new_pools[pool].v.last_change = pending_inc.epoch; pending_inc.new_pool_names[pool] = name; -out: - return err; + return 0; } bool OSDMonitor::prepare_command(MMonCommand *m) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 2f9912ff86e59..f4938e7fca2d6 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2176,6 +2176,11 @@ void OSD::handle_osd_map(MOSDMap *m) p != pool_map.end(); p++) { const pg_pool_t* pi = osdmap->get_pg_pool(p->first); + if (NULL == pi) { + dout(10) << " pool " << p->first + << " appears to have been deleted" << dendl; + continue; + } if (pi->get_snap_epoch() == cur+1) { PGPool *pool = p->second; pi->build_removed_snaps(pool->newly_removed_snaps); @@ -2509,6 +2514,12 @@ void OSD::activate_map(ObjectStore::Transaction& t, list& tfin) it++) { PG *pg = it->second; pg->lock(); + if (!osdmap->have_pg_pool(pg->info.pgid.pool())) { + //pool is deleted! + queue_pg_for_deletion(pg); + pg->unlock(); + continue; + } if (pg->is_active()) { // update started counter if (!pg->info.snap_trimq.empty()) @@ -3543,39 +3554,44 @@ void OSD::handle_pg_remove(MOSDPGRemove *m) << m->pg_list.size() << " pgs" << dendl; if (!require_same_or_newer_map(m, m->get_epoch())) return; - + for (vector::iterator it = m->pg_list.begin(); it != m->pg_list.end(); it++) { pg_t pgid = *it; - PG *pg; - + if (pg_map.count(pgid) == 0) { dout(10) << " don't have pg " << pgid << dendl; continue; } - - pg = _lookup_lock_pg(pgid); + dout(5) << "queue_pg_for_deletion: " << pgid << dendl; + PG *pg = _lookup_lock_pg(pgid); if (pg->info.history.same_acting_since <= m->get_epoch()) { if (pg->deleting) { dout(10) << *pg << " already removing." << dendl; } else { - dout(10) << *pg << " removing." << dendl; - assert(pg->get_role() == -1); assert(pg->get_primary() == m->get_source().num()); - pg->deleting = true; - remove_wq.queue(pg); + queue_pg_for_deletion(pg); } } else { dout(10) << *pg << " ignoring remove request, pg changed in epoch " - << pg->info.history.same_acting_since << " > " << m->get_epoch() << dendl; + << pg->info.history.same_acting_since + << " > " << m->get_epoch() << dendl; } pg->unlock(); } - delete m; } + +void OSD::queue_pg_for_deletion(PG *pg) +{ + dout(10) << *pg << " removing." << dendl; + assert(pg->get_role() == -1); + pg->deleting = true; + remove_wq.queue(pg); +} + void OSD::_remove_pg(PG *pg) { pg_t pgid = pg->info.pgid; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index e378fd6e69efe..f9d632acaddb2 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -601,6 +601,7 @@ protected: void handle_pg_trim(class MOSDPGTrim *m); void handle_pg_remove(class MOSDPGRemove *m); + void queue_pg_for_deletion(PG *pg); void _remove_pg(PG *pg); // helper for handle_pg_log and handle_pg_info diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index dd9b20d97acd3..1dddc58e7168b 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -100,6 +100,8 @@ void OSDMap::build_simple(epoch_t e, ceph_fsid_t &fsid, rulesets[CEPH_DATA_RULE] = "data"; rulesets[CEPH_METADATA_RULE] = "metadata"; rulesets[CEPH_CASDATA_RULE] = "casdata"; + //If you add new rulesets, you MUST change the default "highest_pool_num" + //initialization to match or very bad things WILL happen, like losing pools! int pool = 0; for (map::iterator p = rulesets.begin(); p != rulesets.end(); p++) { diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 8fe1dd13e582f..4278737d9d78d 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -129,6 +129,7 @@ public: ceph_fsid_t fsid; epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch utime_t modified; + int highest_pool_num_new; //incremented by the OSDMonitor on each pool create int32_t new_flags; /* @@ -168,6 +169,7 @@ public: ::encode(fsid, bl); ::encode(epoch, bl); ::encode(modified, bl); + ::encode(highest_pool_num_new, bl); ::encode(new_flags, bl); ::encode(fullmap, bl); ::encode(crush, bl); @@ -196,6 +198,7 @@ public: ::decode(fsid, p); ::decode(epoch, p); ::decode(modified, p); + ::decode(highest_pool_num_new, p); ::decode(new_flags, p); ::decode(fullmap, p); ::decode(crush, p); @@ -218,7 +221,8 @@ public: ::decode(old_blacklist, p); } - Incremental(epoch_t e=0) : epoch(e), new_flags(-1), new_max_osd(-1) { + Incremental(epoch_t e=0) : + epoch(e), highest_pool_num_new(-1), new_flags(-1), new_max_osd(-1) { memset(&fsid, 0, sizeof(fsid)); } Incremental(bufferlist &bl) { @@ -234,6 +238,7 @@ private: ceph_fsid_t fsid; epoch_t epoch; // what epoch of the osd cluster descriptor is this utime_t created, modified; // epoch start time + int highest_pool_num; //the largest pool num in this epoch uint32_t flags; @@ -260,6 +265,7 @@ private: public: OSDMap() : epoch(0), + highest_pool_num(2), flags(0), max_osd(0) { memset(&fsid, 0, sizeof(fsid)); @@ -463,6 +469,8 @@ private: if (inc.new_max_osd >= 0) set_max_osd(inc.new_max_osd); + if (inc.highest_pool_num_new != -1) + highest_pool_num = inc.highest_pool_num_new; for (set::iterator p = inc.old_pools.begin(); p != inc.old_pools.end(); p++) { @@ -567,6 +575,7 @@ private: max_pools = pools.rbegin()->first + 1; ::encode(max_pools, bl); ::encode(pools, bl); + ::encode(highest_pool_num, bl); ::encode(flags, bl); @@ -603,6 +612,7 @@ private: int32_t max_pools; ::decode(max_pools, p); ::decode(pools, p); + ::decode(highest_pool_num, p); ::decode(flags, p); -- 2.39.5