From: Xiaoxi Chen Date: Mon, 16 Nov 2015 08:47:13 +0000 (+0800) Subject: osd/OSD.cc: shutdown after flapping certain times X-Git-Tag: v10.0.3~193^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9a14ff7eebf9eea898f4ac48187893cae931a720;p=ceph.git osd/OSD.cc: shutdown after flapping certain times An OSD's status may flap due to hardware or network issues. Although we try our best to self-heal, in some cases the OSD keeps flapping and requires an admin to intervene. This patch tries another approach: shut down the OSD after it has been marked down a certain number of times (flapping) within a given period, thus speeding up the convergence of the cluster. Signed-off-by: Xiaoxi Chen --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a19cc5d8ff78..af1823d18350 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -610,6 +610,10 @@ OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT, 0) OPTION(osd_inject_failure_on_pg_removal, OPT_BOOL, false) +// shut down the OSD if its status flaps (it is marked down) more than osd_max_markdown_count times within the last osd_max_markdown_period seconds +OPTION(osd_max_markdown_period , OPT_INT, 600) +OPTION(osd_max_markdown_count, OPT_INT, 5) + OPTION(osd_op_threads, OPT_INT, 2) // 0 == no threading OPTION(osd_peering_wq_batch_size, OPT_U64, 20) OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index c2267efe7c21..da0e934091fb 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6531,6 +6531,18 @@ void OSD::handle_osd_map(MOSDMap *m) service.set_epochs(NULL,&up_epoch, &bind_epoch); do_restart = true; + //add markdown log + utime_t now = ceph_clock_now(g_ceph_context); + utime_t grace = utime_t(g_conf->osd_max_markdown_period, 0); + osd_markdown_log.push_back(now); + //clear all out-of-date log + while 
(!osd_markdown_log.empty() && osd_markdown_log.front() + grace < now) + osd_markdown_log.pop_front(); + if ((int)osd_markdown_log.size() > g_conf->osd_max_markdown_count) { + do_restart = false; + do_shutdown = true; + } + start_waiting_for_healthy(); set avoid_ports; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 8c0cd8e9c4cb..ccf21372023a 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1822,6 +1822,7 @@ private: utime_t had_map_since; RWLock map_lock; list waiting_for_osdmap; + deque osd_markdown_log; friend struct send_map_on_destruct;