From: Cory Snyder Date: Wed, 31 May 2023 18:01:02 +0000 (+0000) Subject: rgw: cancel olh modification by removing pending xattrs in set_olh error scenarios X-Git-Tag: v16.2.14~7^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cc95bc83e24a20f6fa981881752a1e5cb9dc8c55;p=ceph.git rgw: cancel olh modification by removing pending xattrs in set_olh error scenarios Makes request threads clean up after themselves and remove their pending xattrs from OLH objects before they return an error from set_olh. Fixes: https://tracker.ceph.com/issues/59663 Signed-off-by: Cory Snyder (cherry picked from commit 38133e59d293dc8c16c92666967783d9e08a79cd) Conflicts: qa/workunits/rgw/test_rgw_reshard.py src/common/options/rgw.yaml.in src/rgw/driver/rados/rgw_rados.cc src/rgw/driver/rados/rgw_rados.h Cherry-pick notes: - conflicts due to options definitions being different after Pacific - conflicts due to rgw_rados.h/cc moving directories --- diff --git a/qa/suites/rgw/verify/tasks/versioning.yml b/qa/suites/rgw/verify/tasks/versioning.yml new file mode 100644 index 000000000000..ab928f270e45 --- /dev/null +++ b/qa/suites/rgw/verify/tasks/versioning.yml @@ -0,0 +1,5 @@ +tasks: +- workunit: + clients: + client.0: + - rgw/run-versioning.sh diff --git a/qa/workunits/rgw/common.py b/qa/workunits/rgw/common.py new file mode 100755 index 000000000000..235c36c9521f --- /dev/null +++ b/qa/workunits/rgw/common.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +import errno +import subprocess +import logging as log +import boto3 +import botocore.exceptions + +log.basicConfig(format = '%(message)s', level=log.DEBUG) +log.getLogger('botocore').setLevel(log.CRITICAL) +log.getLogger('boto3').setLevel(log.CRITICAL) +log.getLogger('urllib3').setLevel(log.CRITICAL) + +def exec_cmd(cmd, wait = True, **kwargs): + check_retcode = kwargs.pop('check_retcode', True) + kwargs['shell'] = True + kwargs['stdout'] = subprocess.PIPE + proc = subprocess.Popen(cmd, **kwargs) + log.info(proc.args) + if wait: + out, _ = proc.communicate() + if check_retcode: + assert(proc.returncode == 0) + return out + return (out, proc.returncode) + return '' + +def create_user(uid, display_name, access_key, secret_key): + _, ret = exec_cmd(f'radosgw-admin user create --uid {uid} --display-name "{display_name}" --access-key {access_key} --secret {secret_key}', check_retcode=False) + assert(ret == 0 or errno.EEXIST) + +def boto_connect(access_key, secret_key, config=None): + def try_connect(portnum, ssl, proto): + endpoint = proto + '://localhost:' + portnum + conn = boto3.resource('s3', + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + use_ssl=ssl, + endpoint_url=endpoint, + verify=False, + config=config, + ) + try: + list(conn.buckets.limit(1)) # just verify we can list buckets + except botocore.exceptions.ConnectionError as e: + print(e) + raise + print('connected to', endpoint) + return conn + try: + return try_connect('80', False, 'http') + except botocore.exceptions.ConnectionError: + try: # retry on non-privileged http port + return try_connect('8000', False, 'http') + except botocore.exceptions.ConnectionError: + # retry with ssl + return try_connect('443', True, 'https') diff --git a/qa/workunits/rgw/run-versioning.sh b/qa/workunits/rgw/run-versioning.sh new file mode 100755 index 000000000000..df60b7b03b2d --- /dev/null +++ b/qa/workunits/rgw/run-versioning.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -ex + +# assume working ceph environment (radosgw-admin in path) and rgw on localhost:80 +# localhost::443 for ssl + +mydir=`dirname $0` + +python3 -m venv $mydir +source $mydir/bin/activate +pip install pip --upgrade +pip install boto3 + +## run test +$mydir/bin/python3 $mydir/test_rgw_versioning.py + +deactivate +echo OK. + diff --git a/qa/workunits/rgw/test_rgw_reshard.py b/qa/workunits/rgw/test_rgw_reshard.py index 400994e476fe..d15544995787 100755 --- a/qa/workunits/rgw/test_rgw_reshard.py +++ b/qa/workunits/rgw/test_rgw_reshard.py @@ -1,12 +1,11 @@ #!/usr/bin/python3 -import logging as log +import errno import time -import subprocess +import logging as log import json -import boto3 -import botocore.exceptions import os +from common import exec_cmd, boto_connect, create_user """ Rgw manual and dynamic resharding testing against a running instance @@ -18,11 +17,6 @@ Rgw manual and dynamic resharding testing against a running instance # # -log.basicConfig(level=log.DEBUG) -log.getLogger('botocore').setLevel(log.CRITICAL) -log.getLogger('boto3').setLevel(log.CRITICAL) -log.getLogger('urllib3').setLevel(log.CRITICAL) - """ Constants """ USER = 'tester' DISPLAY_NAME = 'Testing' @@ -33,23 +27,6 @@ BUCKET_NAME2 = 'mybar' VER_BUCKET_NAME = 'myver' INDEX_POOL = 'default.rgw.buckets.index' - -def exec_cmd(cmd): - try: - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out, err = proc.communicate() - if proc.returncode == 0: - log.info('command succeeded') - if out is not None: log.info(out) - return out - else: - raise Exception("error: %s \nreturncode: %s" % (err, proc.returncode)) - except Exception as e: - log.error('command failed') - log.error(e) - return False - - class BucketStats: def __init__(self, bucket_name, bucket_id, num_objs=0, size_kb=0, num_shards=0): self.bucket_name = bucket_name @@ -98,36 +75,9 @@ def main(): """ execute manual and dynamic resharding commands """ - # create user - exec_cmd('radosgw-admin user create --uid %s --display-name %s --access-key %s --secret %s' - % (USER, DISPLAY_NAME, ACCESS_KEY, SECRET_KEY)) - - def boto_connect(portnum, ssl, proto): - endpoint = proto + '://localhost:' + portnum - conn = boto3.resource('s3', - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=SECRET_KEY, - use_ssl=ssl, - endpoint_url=endpoint, - verify=False, - config=None, - ) - try: - list(conn.buckets.limit(1)) # just verify we can list buckets - except botocore.exceptions.ConnectionError as e: - print(e) - raise - print('connected to', endpoint) - return conn - - try: - connection = boto_connect('80', False, 'http') - except botocore.exceptions.ConnectionError: - try: # retry on non-privileged http port - connection = boto_connect('8000', False, 'http') - except botocore.exceptions.ConnectionError: - # retry with ssl - connection = boto_connect('443', True, 'https') + create_user(USER, DISPLAY_NAME, ACCESS_KEY, SECRET_KEY) + + connection = boto_connect(ACCESS_KEY, SECRET_KEY) # create a bucket bucket1 = connection.create_bucket(Bucket=BUCKET_NAME1) diff --git a/qa/workunits/rgw/test_rgw_versioning.py b/qa/workunits/rgw/test_rgw_versioning.py new file mode 100755 index 000000000000..d5ab2ec691f8 --- /dev/null +++ b/qa/workunits/rgw/test_rgw_versioning.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +import logging as log +import json +import uuid +import botocore +from common import exec_cmd, create_user, boto_connect +from botocore.config import Config + +""" +Tests behavior of bucket versioning. +""" +# The test cases in this file have been annotated for inventory. +# To extract the inventory (in csv format) use the command: +# +# grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //' +# +# + +""" Constants """ +USER = 'versioning-tester' +DISPLAY_NAME = 'Versioning Testing' +ACCESS_KEY = 'LTA662PVVDTDWX6M2AB0' +SECRET_KEY = 'pvtchqajgzqx5581t6qbddbkj0bgf3a69qdkjcea' +BUCKET_NAME = 'versioning-bucket' +DATA_POOL = 'default.rgw.buckets.data' + +def main(): + """ + execute versioning tests + """ + create_user(USER, DISPLAY_NAME, ACCESS_KEY, SECRET_KEY) + + connection = boto_connect(ACCESS_KEY, SECRET_KEY, Config(retries = { + 'total_max_attempts': 1, + })) + + # pre-test cleanup + try: + bucket = connection.Bucket(BUCKET_NAME) + bucket.objects.all().delete() + bucket.object_versions.all().delete() + bucket.delete() + except botocore.exceptions.ClientError as e: + if not e.response['Error']['Code'] == 'NoSuchBucket': + raise + + bucket = connection.create_bucket(Bucket=BUCKET_NAME) + connection.BucketVersioning(BUCKET_NAME).enable() + + # reproducer for bug from https://tracker.ceph.com/issues/59663 + # TESTCASE 'verify that index entries and OLH objects are cleaned up after redundant deletes' + log.debug('TEST: verify that index entries and OLH objects are cleaned up after redundant deletes\n') + key = str(uuid.uuid4()) + resp = bucket.Object(key).delete() + assert 'DeleteMarker' in resp, 'DeleteMarker key not present in response' + assert resp['DeleteMarker'], 'DeleteMarker value not True in response' + assert 'VersionId' in resp, 'VersionId key not present in response' + version_id = resp['VersionId'] + bucket.Object(key).delete() + connection.ObjectVersion(bucket.name, key, version_id).delete() + # bucket index should now be empty + out = exec_cmd(f'radosgw-admin bi list --bucket {BUCKET_NAME}') + json_out = json.loads(out) + assert len(json_out) == 0, 'bucket index was not empty after all objects were deleted' + + (_out, ret) = exec_cmd(f'rados -p {DATA_POOL} ls | grep {key}', check_retcode=False) + assert ret != 0, 'olh object was not cleaned up' + + # TESTCASE 'verify that index entries and OLH objects are cleaned up after index linking error' + log.debug('TEST: verify that index entries and OLH objects are cleaned up after index linking error\n') + key = str(uuid.uuid4()) + try: + exec_cmd('ceph config set client.rgw rgw_debug_inject_set_olh_err 2') + bucket.Object(key).delete() + finally: + exec_cmd('ceph config rm client.rgw rgw_debug_inject_set_olh_err') + out = exec_cmd(f'radosgw-admin bi list --bucket {BUCKET_NAME}') + json_out = json.loads(out) + assert len(json_out) == 0, 'bucket index was not empty after op failed' + (_out, ret) = exec_cmd(f'rados -p {DATA_POOL} ls | grep {key}', check_retcode=False) + assert ret != 0, 'olh object was not cleaned up' + + # TESTCASE 'verify that original null object version is intact after failed olh upgrade' + log.debug('TEST: verify that original null object version is intact after failed olh upgrade\n') + connection.BucketVersioning(BUCKET_NAME).suspend() + key = str(uuid.uuid4()) + put_resp = bucket.put_object(Key=key, Body=b"data") + connection.BucketVersioning(BUCKET_NAME).enable() + try: + exec_cmd('ceph config set client.rgw rgw_debug_inject_set_olh_err 2') + # expected to fail due to the above error injection + bucket.put_object(Key=key, Body=b"new data") + except Exception as e: + log.debug(e) + finally: + exec_cmd('ceph config rm client.rgw rgw_debug_inject_set_olh_err') + get_resp = bucket.Object(key).get() + assert put_resp.e_tag == get_resp['ETag'], 'get did not return null version with correct etag' + + # Clean up + log.debug("Deleting bucket {}".format(BUCKET_NAME)) + bucket.object_versions.all().delete() + bucket.delete() + +main() +log.info("Completed bucket versioning tests") diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h index ea103d7de9eb..d08468c69b22 100644 --- a/src/common/legacy_config_opts.h +++ b/src/common/legacy_config_opts.h @@ -1580,3 +1580,6 @@ OPTION(debug_allow_any_pool_priority, OPT_BOOL) OPTION(rgw_gc_max_deferred_entries_size, OPT_U64) // GC deferred entries size in queue head OPTION(rgw_gc_max_queue_size, OPT_U64) // GC max queue size OPTION(rgw_gc_max_deferred, OPT_U64) // GC max number of deferred entries + +OPTION(rgw_debug_inject_set_olh_err, OPT_U64) // dev option to inject an error in set_olh +OPTION(rgw_debug_inject_olh_cancel_modification_err, OPT_BOOL) // dev option to simulate olh cancel modification error diff --git a/src/common/options.cc b/src/common/options.cc index 4dc69cb0db3a..d0fb62313179 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -7156,6 +7156,20 @@ std::vector