git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph-rest-api: be more tolerant on network failure 15706/head
author Kefu Chai <kchai@redhat.com>
Thu, 15 Jun 2017 09:05:25 +0000 (17:05 +0800)
committer Kefu Chai <kchai@redhat.com>
Thu, 15 Jun 2017 09:28:24 +0000 (17:28 +0800)
* set timeout for json_command()
* retry on timeout

Fixes: http://tracker.ceph.com/issues/20115
Signed-off-by: Kefu Chai <kchai@redhat.com>
src/pybind/ceph_rest_api.py

index 9bae3ef62a0003f24f6f07310fae2446e6ad818f..6ca21779876adc27ec576c77189f02bb992a0618 100755 (executable)
@@ -29,6 +29,12 @@ DEFAULT_LOG_LEVEL = 'warning'
 DEFAULT_LOGDIR = '/var/log/ceph'
 # default client name will be 'client.<DEFAULT_ID>'
 
+# network failure could keep the underlying json_command() waiting forever,
+# set a timeout, so it bails out on timeout.
+DEFAULT_TIMEOUT = 20
+# and retry in that case.
+DEFAULT_TRIES = 5
+
 # 'app' must be global for decorators, etc.
 APPNAME = '__main__'
 app = flask.Flask(APPNAME)
@@ -481,9 +487,18 @@ def handler(catchall_path=None, fmt=None, target=None):
         cmdtarget = ('mon', '')
 
     app.logger.debug('sending command prefix %s argdict %s', prefix, argdict)
-    ret, outbuf, outs = json_command(app.ceph_cluster, prefix=prefix,
-                                     target=cmdtarget,
-                                     inbuf=flask.request.data, argdict=argdict)
+
+    for _ in range(DEFAULT_TRIES):
+        ret, outbuf, outs = json_command(app.ceph_cluster, prefix=prefix,
+                                         target=cmdtarget,
+                                         inbuf=flask.request.data,
+                                         argdict=argdict,
+                                         timeout=DEFAULT_TIMEOUT)
+        if ret != -errno.EINTR:
+            break
+    else:
+        return make_response(fmt, '',
+                             'Timedout: {0} ({1})'.format(outs, ret), 504)
     if ret:
         return make_response(fmt, '', 'Error: {0} ({1})'.format(outs, ret), 400)