From 1acb85ed2ac2392625938954869699fd4682e5f8 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Thu, 15 Jun 2017 17:05:25 +0800 Subject: [PATCH] ceph-rest-api: be more tolerant on network failure * set timeout for json_command() * retry on timeout Fixes: http://tracker.ceph.com/issues/20115 Signed-off-by: Kefu Chai --- src/pybind/ceph_rest_api.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/pybind/ceph_rest_api.py b/src/pybind/ceph_rest_api.py index 9bae3ef62a000..6ca21779876ad 100755 --- a/src/pybind/ceph_rest_api.py +++ b/src/pybind/ceph_rest_api.py @@ -29,6 +29,12 @@ DEFAULT_LOG_LEVEL = 'warning' DEFAULT_LOGDIR = '/var/log/ceph' # default client name will be 'client.' +# network failure could keep the underlying json_command() waiting forever, +# set a timeout, so it bails out on timeout. +DEFAULT_TIMEOUT = 20 +# and retry in that case. +DEFAULT_TRIES = 5 + # 'app' must be global for decorators, etc. APPNAME = '__main__' app = flask.Flask(APPNAME) @@ -481,9 +487,18 @@ def handler(catchall_path=None, fmt=None, target=None): cmdtarget = ('mon', '') app.logger.debug('sending command prefix %s argdict %s', prefix, argdict) - ret, outbuf, outs = json_command(app.ceph_cluster, prefix=prefix, - target=cmdtarget, - inbuf=flask.request.data, argdict=argdict) + + for _ in range(DEFAULT_TRIES): + ret, outbuf, outs = json_command(app.ceph_cluster, prefix=prefix, + target=cmdtarget, + inbuf=flask.request.data, + argdict=argdict, + timeout=DEFAULT_TIMEOUT) + if ret != -errno.EINTR: + break + else: + return make_response(fmt, '', + 'Timedout: {0} ({1})'.format(outs, ret), 504) if ret: return make_response(fmt, '', 'Error: {0} ({1})'.format(outs, ret), 400) -- 2.39.5