From: Kefu Chai Date: Mon, 9 May 2016 07:01:46 +0000 (+0800) Subject: osd: remove all stale osdmaps in handle_osd_map() X-Git-Tag: v0.94.8~37^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c2ea6dbd652782bb227b7787c486660871b2d950;p=ceph.git osd: remove all stale osdmaps in handle_osd_map() in a large cluster, there is a higher chance that the OSD fails to trim the cached osdmaps in a timely manner. and sometimes it is simply unable to keep up with the incoming osdmaps if skip_maps is true, so the osdmap cache can keep building up to over 250GB in size. in this change: * publish_superblock() before trimming the osdmaps, so other osdmap consumers of OSDService.superblock won't access the osdmaps being removed. * trim all stale osdmaps in batches of conf->osd_target_transaction_size if skip_maps is true. in my test, this happens when the osd only receives osdmaps from the monitor occasionally, because the osd happens to be chosen when the monitor wants to share a new osdmap with a random osd. * always use dedicated transaction(s) for trimming osdmaps, so even in the normal case where we are able to trim all stale osdmaps in a single batch, a separate transaction is used. we could piggyback the commits for removing maps, but we keep it this way for simplicity. 
* use std::min() instead MIN() for type safety Fixes: http://tracker.ceph.com/issues/13990 Signed-off-by: Kefu Chai (cherry picked from commit 369db9930887d75b498927da9c97733bff4472b6) Conflicts: remove the C++11ism --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f3e5e8ef53dc..33c42da3de95 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6168,6 +6168,48 @@ void OSD::osdmap_subscribe(version_t epoch, bool force_request) } } +/* Remove the stale osdmap and incremental-osdmap objects from META_COLL for every epoch in [superblock.oldest_map, min(oldest, map_cache.cached_key_lower_bound())), advancing superblock.oldest_map as it goes. Each time a batch reaches both osd_target_transaction_size and nreceived removals, the superblock is published and written and the batch is queued as its own transaction; trimming stops after the first such batch unless skip_maps is set. */ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps) +{ + epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound()); + if (superblock.oldest_map >= min) + return; + + int num = 0; + ObjectStore::Transaction *t = NULL; + for (epoch_t e = superblock.oldest_map; e < min; ++e) { + dout(20) << " removing old osdmap epoch " << e << dendl; + if (!t) { + t = new ObjectStore::Transaction; + } + t->remove(META_COLL, get_osdmap_pobject_name(e)); + t->remove(META_COLL, get_inc_osdmap_pobject_name(e)); + superblock.oldest_map = e + 1; + num++; + if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) { + service.publish_superblock(superblock); + write_superblock(*t); + store->queue_transaction_and_cleanup(NULL, t); + t = NULL; + num = 0; + if (!skip_maps) { + // skip_maps leaves us with a range of old maps if we fail to remove all + // of them before moving superblock.oldest_map forward to the first map + // in the incoming MOSDMap msg. so we should continue removing them in + // this case, even though that could mean a huge series of delete + // transactions all at once. 
+ break; + } + } + } + if (num > 0) { + service.publish_superblock(superblock); + write_superblock(*t); + store->queue_transaction_and_cleanup(NULL, t); + } + // we should not remove the cached maps + assert(min <= service.map_cache.cached_key_lower_bound()); +} + void OSD::handle_osd_map(MOSDMap *m) { assert(osd_lock.is_locked()); @@ -6342,20 +6384,8 @@ void OSD::handle_osd_map(MOSDMap *m) } if (superblock.oldest_map) { - int num = 0; - epoch_t min( - MIN(m->oldest_map, - service.map_cache.cached_key_lower_bound())); - for (epoch_t e = superblock.oldest_map; e < min; ++e) { - dout(20) << " removing old osdmap epoch " << e << dendl; - t.remove(META_COLL, get_osdmap_pobject_name(e)); - t.remove(META_COLL, get_inc_osdmap_pobject_name(e)); - superblock.oldest_map = e+1; - num++; - if (num >= cct->_conf->osd_target_transaction_size && - (uint64_t)num > (last - first)) // make sure we at least keep pace with incoming maps - break; - } + // make sure we at least keep pace with incoming maps + trim_maps(m->oldest_map, last - first + 1, skip_maps); } if (!superblock.oldest_map || skip_maps) diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 517e5e6d6ac6..4e564d3ee177 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1659,6 +1659,7 @@ private: void wait_for_new_map(OpRequestRef op); void handle_osd_map(class MOSDMap *m); + void trim_maps(epoch_t oldest, int nreceived, bool skip_maps); void note_down_osd(int osd); void note_up_osd(int osd);