From: Greg Farnum Date: Fri, 12 Feb 2010 21:21:22 +0000 (-0800) Subject: osd: Deal with pools being removed from OSDMap. X-Git-Tag: v0.19~17 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3ced5e7de243edeccfd20a90ec2034206c920795;p=ceph.git osd: Deal with pools being removed from OSDMap. This potentially has issues, since pools are not removed from the map until after all the PGs are removed (which is threaded, not inline with map delivery). But Sage thinks it's okay and the system keeps working even if you delete a pool while benchmarking on it with rados. --- diff --git a/src/include/rados.h b/src/include/rados.h index 1f4c78640541..26ac8b89a676 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -11,8 +11,8 @@ /* * osdmap encoding versions */ -#define CEPH_OSDMAP_INC_VERSION 3 -#define CEPH_OSDMAP_VERSION 3 +#define CEPH_OSDMAP_INC_VERSION 4 +#define CEPH_OSDMAP_VERSION 4 /* * fs id diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index c4c3552f1863..424cb6092749 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -130,8 +130,9 @@ bool OSDMonitor::update_from_paxos() void OSDMonitor::create_pending() { - pending_inc = OSDMap::Incremental(osdmap.epoch+1); + pending_inc = OSDMap::Incremental(osdmap.highest_pool_num, osdmap.epoch+1); pending_inc.fsid = mon->monmap->fsid; + pending_inc.highest_pool_num_new = osdmap.highest_pool_num; dout(10) << "create_pending e " << pending_inc.epoch << dendl; } @@ -1005,18 +1006,12 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) int OSDMonitor::prepare_new_pool(string& name) { - int pool = 1; - int err = 0; - for (map::iterator i = osdmap.pool_name.begin(); - i != osdmap.pool_name.end(); - i++) { - if (i->second == name) { - err = -EEXIST; - goto out; - } - if (i->first >= pool) - pool = i->first + 1; + if (osdmap.name_pool.count(name)) { + return -EEXIST; } + if (-1 == pending_inc.highest_pool_num_new) + pending_inc.highest_pool_num_new = osdmap.highest_pool_num; + int pool = ++pending_inc.highest_pool_num_new; pending_inc.new_pools[pool].v.type = CEPH_PG_TYPE_REP; pending_inc.new_pools[pool].v.size = 2; pending_inc.new_pools[pool].v.crush_ruleset = 0; @@ -1026,8 +1021,7 @@ int OSDMonitor::prepare_new_pool(string& name) pending_inc.new_pools[pool].v.lpgp_num = 0; pending_inc.new_pools[pool].v.last_change = pending_inc.epoch; pending_inc.new_pool_names[pool] = name; -out: - return err; + return 0; } bool OSDMonitor::prepare_command(MMonCommand *m) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 2f9912ff86e5..f4938e7fca2d 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2176,6 +2176,11 @@ void OSD::handle_osd_map(MOSDMap *m) p != pool_map.end(); p++) { const pg_pool_t* pi = osdmap->get_pg_pool(p->first); + if (NULL == pi) { + dout(10) << " pool " << p->first + << " appears to have been deleted" << dendl; + continue; + } if (pi->get_snap_epoch() == cur+1) { PGPool *pool = p->second; pi->build_removed_snaps(pool->newly_removed_snaps); @@ -2509,6 +2514,12 @@ void OSD::activate_map(ObjectStore::Transaction& t, list& tfin) it++) { PG *pg = it->second; pg->lock(); + if (!osdmap->have_pg_pool(pg->info.pgid.pool())) { + //pool is deleted! + queue_pg_for_deletion(pg); + pg->unlock(); + continue; + } if (pg->is_active()) { // update started counter if (!pg->info.snap_trimq.empty()) @@ -3543,39 +3554,44 @@ void OSD::handle_pg_remove(MOSDPGRemove *m) << m->pg_list.size() << " pgs" << dendl; if (!require_same_or_newer_map(m, m->get_epoch())) return; - + for (vector::iterator it = m->pg_list.begin(); it != m->pg_list.end(); it++) { pg_t pgid = *it; - PG *pg; - + if (pg_map.count(pgid) == 0) { dout(10) << " don't have pg " << pgid << dendl; continue; } - - pg = _lookup_lock_pg(pgid); + dout(5) << "queue_pg_for_deletion: " << pgid << dendl; + PG *pg = _lookup_lock_pg(pgid); if (pg->info.history.same_acting_since <= m->get_epoch()) { if (pg->deleting) { dout(10) << *pg << " already removing." << dendl; } else { - dout(10) << *pg << " removing." << dendl; - assert(pg->get_role() == -1); assert(pg->get_primary() == m->get_source().num()); - pg->deleting = true; - remove_wq.queue(pg); + queue_pg_for_deletion(pg); } } else { dout(10) << *pg << " ignoring remove request, pg changed in epoch " - << pg->info.history.same_acting_since << " > " << m->get_epoch() << dendl; + << pg->info.history.same_acting_since + << " > " << m->get_epoch() << dendl; } pg->unlock(); } - delete m; } + +void OSD::queue_pg_for_deletion(PG *pg) +{ + dout(10) << *pg << " removing." << dendl; + assert(pg->get_role() == -1); + pg->deleting = true; + remove_wq.queue(pg); +} + void OSD::_remove_pg(PG *pg) { pg_t pgid = pg->info.pgid; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index e378fd6e69ef..f9d632acaddb 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -601,6 +601,7 @@ protected: void handle_pg_trim(class MOSDPGTrim *m); void handle_pg_remove(class MOSDPGRemove *m); + void queue_pg_for_deletion(PG *pg); void _remove_pg(PG *pg); // helper for handle_pg_log and handle_pg_info diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index dd9b20d97acd..1dddc58e7168 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -100,6 +100,8 @@ void OSDMap::build_simple(epoch_t e, ceph_fsid_t &fsid, rulesets[CEPH_DATA_RULE] = "data"; rulesets[CEPH_METADATA_RULE] = "metadata"; rulesets[CEPH_CASDATA_RULE] = "casdata"; + //If you add new rulesets, you MUST change the default "highest_pool_num" + //initialization to match or very bad things WILL happen, like losing pools! int pool = 0; for (map::iterator p = rulesets.begin(); p != rulesets.end(); p++) { diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 8fe1dd13e582..4278737d9d78 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -129,6 +129,7 @@ public: ceph_fsid_t fsid; epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch utime_t modified; + int highest_pool_num_new; //incremented by the OSDMonitor on each pool create int32_t new_flags; /* @@ -168,6 +169,7 @@ public: ::encode(fsid, bl); ::encode(epoch, bl); ::encode(modified, bl); + ::encode(highest_pool_num_new, bl); ::encode(new_flags, bl); ::encode(fullmap, bl); ::encode(crush, bl); @@ -196,6 +198,7 @@ public: ::decode(fsid, p); ::decode(epoch, p); ::decode(modified, p); + ::decode(highest_pool_num_new, p); ::decode(new_flags, p); ::decode(fullmap, p); ::decode(crush, p); @@ -218,7 +221,8 @@ public: ::decode(old_blacklist, p); } - Incremental(epoch_t e=0) : epoch(e), new_flags(-1), new_max_osd(-1) { + Incremental(epoch_t e=0) : + epoch(e), highest_pool_num_new(-1), new_flags(-1), new_max_osd(-1) { memset(&fsid, 0, sizeof(fsid)); } Incremental(bufferlist &bl) { @@ -234,6 +238,7 @@ private: ceph_fsid_t fsid; epoch_t epoch; // what epoch of the osd cluster descriptor is this utime_t created, modified; // epoch start time + int highest_pool_num; //the largest pool num in this epoch uint32_t flags; @@ -260,6 +265,7 @@ private: public: OSDMap() : epoch(0), + highest_pool_num(2), flags(0), max_osd(0) { memset(&fsid, 0, sizeof(fsid)); @@ -463,6 +469,8 @@ private: if (inc.new_max_osd >= 0) set_max_osd(inc.new_max_osd); + if (inc.highest_pool_num_new != -1) + highest_pool_num = inc.highest_pool_num_new; for (set::iterator p = inc.old_pools.begin(); p != inc.old_pools.end(); p++) { @@ -567,6 +575,7 @@ private: max_pools = pools.rbegin()->first + 1; ::encode(max_pools, bl); ::encode(pools, bl); + ::encode(highest_pool_num, bl); ::encode(flags, bl); @@ -603,6 +612,7 @@ private: int32_t max_pools; ::decode(max_pools, p); ::decode(pools, p); + ::decode(highest_pool_num, p); ::decode(flags, p);