From bcf403209ab4544c6b595ee574a638774d20f4d4 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Thu, 23 Aug 2018 10:57:24 -0700 Subject: [PATCH] rgw: use chunked encoding to get partial results out faster Some operations can take a long time to have their complete result. If an RGW operation does not set a content-length header, the RGW frontends (CivetWeb, Beast) buffer the entire request so that a Content-Length header can be sent. If the RGW operation takes long enough, the buffering time may exceed keepalive values, and because no bytes have been sent in the connection, the connection will be reset. If an HTTP response header contains neither Content-Length nor chunked Transfer-Encoding, HTTP keep-alive is not possible. To fix the issue within these requirements, use chunked Transfer-Encoding for the following operations: * RGWCopyObj_ObjStore_S3 ** * RGWDeleteMultiObj_ObjStore_S3 ** * RGWGetUsage_ObjStore_S3 * RGWListBucketMultiparts_ObjStore_S3 * RGWListBucket_ObjStore_S3 * RGWListBuckets_ObjStore_S3 * RGWListMultipart_ObjStore_S3 RGWCopyObj & RGWDeleteMultiObj specifically use send_partial_response for long-running operations, and are the most impacted by this issue, esp. for large inputs. RGWCopyObj attempts to send a Progress header during the copy, but it's not actually passed on to the client until the end of the copy, because it's buffered by the RGW frontends! The HTTP/1.1 specification REQUIRES chunked encoding to be supported, and the specification does NOT require "chunked" to be included in the "TE" request header. This patch has one side-effect: this causes many more small IP packets. When combined with high-latency links this can increase the apparent deletion time due to round trips and TCP slow start. Future improvements to the RGW frontends are possible in two separate but related ways: - The FE could continue to push more chunks without waiting for the ACK on the previous chunk, esp. while under the TCP window size. 
- The FE could be patched for different buffer flushing behaviors, as that behavior is presently unclear (packets of 200-500 bytes seen). Performance results: - Bucket with 5M objects, index sharded 32 ways. - Index on SSD 3x replicas, Data on spinning disk, 5:2 - Multi-delete of 1000 keys, with a common prefix. - Cache of index primed by listing the common prefix immediately before deletion. - Timing data captured at the RGW. - Timing t0 is the TCP ACK sent by the RGW at the end of the response body. - Client is ~75ms away from RGW. BEFORE: Time to first byte of response header: 11.3 seconds. Entire operation: 11.5 seconds. Response packets: 17 AFTER: Time to first byte of response header: 3.5ms Entire operation: 16.36 seconds Response packets: 206 Backport: mimic, luminous Issue: http://tracker.ceph.com/issues/12713 Signed-off-by: Robin H. Johnson (cherry picked from commit d22c1f96707ba9ae84578932bd4d741f6c101a54) --- src/rgw/rgw_rest_s3.cc | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index 39237cac221c..bda05ba923a4 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -460,7 +460,9 @@ void RGWListBuckets_ObjStore_S3::send_response_begin(bool has_buckets) set_req_state_err(s, op_ret); dump_errno(s); dump_start(s); - end_header(s, NULL, "application/xml"); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, NULL, "application/xml", CHUNKED_TRANSFER_ENCODING); if (! op_ret) { list_all_buckets_start(s); @@ -535,7 +537,9 @@ void RGWGetUsage_ObjStore_S3::send_response() set_req_state_err(s, op_ret); dump_errno(s); - end_header(s, this, "application/xml"); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. 
+ end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); dump_start(s); if (op_ret < 0) return; @@ -769,7 +773,9 @@ void RGWListBucket_ObjStore_S3::send_response() set_req_state_err(s, op_ret); dump_errno(s); - end_header(s, this, "application/xml"); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); dump_start(s); if (op_ret < 0) return; @@ -2184,7 +2190,9 @@ void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs) set_req_state_err(s, op_ret); dump_errno(s); - end_header(s, this, "application/xml"); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); dump_start(s); if (op_ret == 0) { s->formatter->open_object_section_in_ns("CopyObjectResult", XMLNS_AWS_S3); @@ -2652,7 +2660,9 @@ void RGWListMultipart_ObjStore_S3::send_response() if (op_ret) set_req_state_err(s, op_ret); dump_errno(s); - end_header(s, this, "application/xml"); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); if (op_ret == 0) { dump_start(s); @@ -2703,7 +2713,9 @@ void RGWListBucketMultiparts_ObjStore_S3::send_response() set_req_state_err(s, op_ret); dump_errno(s); - end_header(s, this, "application/xml"); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. 
+ end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); dump_start(s); if (op_ret < 0) return; @@ -2786,7 +2798,9 @@ void RGWDeleteMultiObj_ObjStore_S3::begin_response() } dump_start(s); - end_header(s, this, "application/xml"); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); s->formatter->open_object_section_in_ns("DeleteResult", XMLNS_AWS_S3); rgw_flush_formatter(s, s->formatter); -- 2.47.3