From: Sage Weil Date: Thu, 10 Dec 2009 16:28:31 +0000 (-0800) Subject: mon: stop trying to create localized pgs >= max osd|device X-Git-Tag: v0.19~277 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cfda3be56f9cd00afac9248fa8df723bf318702c;p=ceph.git mon: stop trying to create localized pgs >= max osd|device If lpg_num == lpgp_num, and the preferred osd no longer exists, there is no guarantee that the parent and child pgs will land on the same osd. This happens when we are creating pgs for a new osd, they don't finish getting created, and then we destroy that OSD again. Care should be taken to avoid expanding the OSD count _and_ lpg_num simultaneously... --- diff --git a/src/TODO b/src/TODO index ad2881d2fcf5..cf20091a1a5c 100644 --- a/src/TODO +++ b/src/TODO @@ -77,6 +77,7 @@ v0.19 pending wire, disk format changes +- add v to PGMap, PGMap::Incremental bugs - mds memory leak @@ -265,6 +266,7 @@ mds - rename: importing inode... also journal imported client map? mon +- don't allow lpg_num expansion and osd addition at the same time? - how to shrink cluster? - how to tell osd to cleanly shut down - mds injectargs N should take mds# or id. * should bcast to standy mds's. 
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 0d16c5e01b1b..2d2b3e54f7de 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -44,6 +44,7 @@ public: set osd_stat_rm; epoch_t osdmap_epoch; epoch_t pg_scan; // osdmap epoch + set pg_remove; void encode(bufferlist &bl) const { ::encode(version, bl); @@ -52,6 +53,7 @@ public: ::encode(osd_stat_rm, bl); ::encode(osdmap_epoch, bl); ::encode(pg_scan, bl); + ::encode(pg_remove, bl); } void decode(bufferlist::iterator &bl) { ::decode(version, bl); @@ -60,6 +62,8 @@ public: ::decode(osd_stat_rm, bl); ::decode(osdmap_epoch, bl); ::decode(pg_scan, bl); + if (!bl.end()) + ::decode(pg_remove, bl); } Incremental() : version(0), osdmap_epoch(0), pg_scan(0) {} @@ -101,6 +105,15 @@ public: nearfull_osds.erase(from); } } + for (set::iterator p = inc.pg_remove.begin(); + p != inc.pg_remove.end(); + p++) { + if (pg_set.count(*p)) { + pg_set.erase(*p); + stat_pg_sub(*p, pg_stat[*p]); + pg_stat.erase(*p); + } + } for (set::iterator p = inc.osd_stat_rm.begin(); p != inc.osd_stat_rm.end(); @@ -109,6 +122,7 @@ public: stat_osd_sub(osd_stat[*p]); osd_stat.erase(*p); } + if (inc.osdmap_epoch) last_osdmap_epoch = inc.osdmap_epoch; if (inc.pg_scan) diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index ec8bc700d097..ca60f0ebe4b6 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -552,8 +552,21 @@ bool PGMonitor::register_new_pgs() } } } - dout(10) << "register_new_pgs registered " << created << " new pgs" << dendl; - if (created) { + + int max = MIN(osdmap->get_max_osd(), osdmap->crush.get_max_devices()); + int removed = 0; + for (set::iterator p = pg_map.creating_pgs.begin(); + p != pg_map.creating_pgs.end(); + p++) + if (p->preferred() >= max) { + dout(20) << " removing creating_pg " << *p << " because preferred >= max osd or crush device" << dendl; + pending_inc.pg_remove.insert(*p); + removed++; + } + + dout(10) << "register_new_pgs registered " << created << " new pgs, removed " + << removed << " uncreated pgs" 
<< dendl; + if (created || removed) { pending_inc.pg_scan = epoch; return true; } @@ -566,6 +579,9 @@ void PGMonitor::send_pg_creates() map msg; utime_t now = g_clock.now(); + + OSDMap *osdmap = &mon->osdmon()->osdmap; + int max = MIN(osdmap->get_max_osd(), osdmap->crush.get_max_devices()); for (set::iterator p = pg_map.creating_pgs.begin(); p != pg_map.creating_pgs.end(); @@ -583,6 +599,10 @@ void PGMonitor::send_pg_creates() } int osd = acting[0]; + // don't send creates for nonexistent preferred osds! + if (pgid.preferred() >= max) + continue; + // throttle? if (last_sent_pg_create.count(osd) && now - g_conf.mon_pg_create_interval < last_sent_pg_create[osd])