From 280d73584717dbde07fa54a619d2a28ac1438cad Mon Sep 17 00:00:00 2001
From: Joseph Sawaya
Date: Wed, 8 Sep 2021 11:33:14 -0400
Subject: [PATCH] qa/tasks/rook: test reapplication of drive groups stored in
 mgr

This commit adds testing for the drive_group_loop in the Rook orchestrator
that reapplies drive groups that were applied previously.

This test removes an OSD, zaps the underlying device then waits for the
OSD to be re-created by the drive_group_loop.

This commit also updates the rook test suite to test v1.7.2 instead of
1.7.0 since `orch device zap` is only supported from v1.7.2 onwards.

Fixes: https://tracker.ceph.com/issues/53501
Signed-off-by: Joseph Sawaya
---
 qa/suites/orch/rook/smoke/3-final.yaml    | 34 +++++++++++++++++++++++
 qa/suites/orch/rook/smoke/rook/1.7.0.yaml |  4 ---
 qa/suites/orch/rook/smoke/rook/1.7.2.yaml |  4 +++
 qa/tasks/kubeadm.py                       |  2 +-
 qa/tasks/rook.py                          | 27 ++++++++++++------
 src/pybind/mgr/rook/rook_cluster.py       |  2 +-
 6 files changed, 59 insertions(+), 14 deletions(-)
 delete mode 100644 qa/suites/orch/rook/smoke/rook/1.7.0.yaml
 create mode 100644 qa/suites/orch/rook/smoke/rook/1.7.2.yaml

diff --git a/qa/suites/orch/rook/smoke/3-final.yaml b/qa/suites/orch/rook/smoke/3-final.yaml
index ece8469fab44b..27d8a04e84eb2 100644
--- a/qa/suites/orch/rook/smoke/3-final.yaml
+++ b/qa/suites/orch/rook/smoke/3-final.yaml
@@ -1,4 +1,38 @@
 tasks:
+- exec:
+    host.a:
+      - |
+        set -ex
+        toolbox() {
+          kubectl -n rook-ceph exec -it deploy/rook-ceph-tools -- "$@"
+        }
+        orig_num_osd=`toolbox ceph osd stat | cut -f3 -d " "`
+        toolbox ceph orch osd rm 0 --force
+        removed_pv=""
+        while [ "$removed_pv" = "" ]
+        do
+          removed_pv=`kubectl get pv | grep Released | cut -f1 -d " "`
+          sleep 3s
+        done
+        target_path=`kubectl get pv $removed_pv -o jsonpath='{.spec.local.path}'`
+        host=`echo $removed_pv | cut -f1 -d "-"`
+        toolbox ceph orch device zap $host $target_path --force
+        zap_completion="0"
+        while [ "$zap_completion" = "0" ]
+        do
+          zap_completion=`kubectl get job -n rook-ceph rook-ceph-device-zap -o jsonpath='{.status.succeeded.path}'`
+          sleep 3s
+        done
+        kubectl patch pv $removed_pv -p '{"spec":{"claimRef": null}}'
+        toolbox ceph orch apply osd --all-available-devices
+        kubectl delete job rook-ceph-device-zap -n rook-ceph
+        num_osd="0"
+        while [ "$num_osd" != "$orig_num_osd" ]
+        do
+          echo "waiting for osd to come back up"
+          num_osd=`toolbox ceph osd stat | cut -f3 -d " "`
+          sleep 30s
+        done
 - rook.shell:
     commands:
       - ceph orch status
diff --git a/qa/suites/orch/rook/smoke/rook/1.7.0.yaml b/qa/suites/orch/rook/smoke/rook/1.7.0.yaml
deleted file mode 100644
index 702d3bfdd808a..0000000000000
--- a/qa/suites/orch/rook/smoke/rook/1.7.0.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-overrides:
-  rook:
-    rook_image: rook/ceph:v1.7.0
-    rook_branch: v1.7.0
diff --git a/qa/suites/orch/rook/smoke/rook/1.7.2.yaml b/qa/suites/orch/rook/smoke/rook/1.7.2.yaml
new file mode 100644
index 0000000000000..de96c5815001f
--- /dev/null
+++ b/qa/suites/orch/rook/smoke/rook/1.7.2.yaml
@@ -0,0 +1,4 @@
+overrides:
+  rook:
+    rook_image: rook/ceph:v1.7.2
+    rook_branch: v1.7.2
diff --git a/qa/tasks/kubeadm.py b/qa/tasks/kubeadm.py
index b212d06d8f0b9..dae9f6b2c9525 100644
--- a/qa/tasks/kubeadm.py
+++ b/qa/tasks/kubeadm.py
@@ -468,7 +468,7 @@ def setup_pvs(ctx, config):
                     'volumeMode': 'Block',
                     'accessModes': ['ReadWriteOnce'],
                     'capacity': {'storage': '100Gi'},  # doesn't matter?
-                    'persistentVolumeReclaimPolicy': 'Recycle',
+                    'persistentVolumeReclaimPolicy': 'Retain',
                     'storageClassName': 'scratch',
                     'local': {'path': dev},
                     'nodeAffinity': {
diff --git a/qa/tasks/rook.py b/qa/tasks/rook.py
index 15e2f04518962..c4e1dffe967bc 100644
--- a/qa/tasks/rook.py
+++ b/qa/tasks/rook.py
@@ -22,6 +22,14 @@ from tasks.cephadm import update_archive_setting
 
 log = logging.getLogger(__name__)
 
+def path_to_examples(ctx, cluster_name : str) -> str:
+    for p in ['rook/deploy/examples/', 'rook/cluster/examples/kubernetes/ceph/']:
+        try:
+            ctx.rook[cluster_name].remote.get_file(p + 'operator.yaml')
+            return p
+        except:
+            pass
+    assert False, 'Path to examples not found'
 
 def _kubectl(ctx, config, args, **kwargs):
     cluster_name = config.get('cluster', 'ceph')
@@ -94,8 +102,12 @@ def rook_operator(ctx, config):
     )
 
     # operator.yaml
+    log.info(os.path.abspath(os.getcwd()))
+    object_methods = [method_name for method_name in dir(ctx.rook[cluster_name].remote)
+                      if callable(getattr(ctx.rook[cluster_name].remote, method_name))]
+    log.info(object_methods)
     operator_yaml = ctx.rook[cluster_name].remote.read_file(
-        'rook/cluster/examples/kubernetes/ceph/operator.yaml'
+        (path_to_examples(ctx, cluster_name) + 'operator.yaml')
     )
     rook_image = config.get('rook_image')
     if rook_image:
@@ -111,8 +123,8 @@ def rook_operator(ctx, config):
     log.info('Deploying operator')
     _kubectl(ctx, config, [
         'create',
-        '-f', 'rook/cluster/examples/kubernetes/ceph/crds.yaml',
-        '-f', 'rook/cluster/examples/kubernetes/ceph/common.yaml',
+        '-f', (path_to_examples(ctx, cluster_name) + 'crds.yaml'),
+        '-f', (path_to_examples(ctx, cluster_name) + 'common.yaml'),
         '-f', 'operator.yaml',
     ])
 
@@ -165,11 +177,11 @@ def rook_operator(ctx, config):
             # fails sometimes when deleting some of the CRDs... not sure why!)
             _kubectl(ctx, config, [
                 'delete',
-                '-f', 'rook/cluster/examples/kubernetes/ceph/common.yaml',
+                '-f', (path_to_examples(ctx, cluster_name) + 'common.yaml'),
             ])
             _kubectl(ctx, config, [
                 'delete',
-                '-f', 'rook/cluster/examples/kubernetes/ceph/crds.yaml',
+                '-f', (path_to_examples(ctx, cluster_name) + 'crds.yaml'),
             ])
             ctx.rook[cluster_name].remote.run(args=['rm', '-rf', 'rook', 'operator.yaml'])
             if op_job:
@@ -409,7 +421,7 @@ def rook_toolbox(ctx, config):
     try:
         _kubectl(ctx, config, [
             'create',
-            '-f', 'rook/cluster/examples/kubernetes/ceph/toolbox.yaml',
+            '-f', (path_to_examples(ctx, cluster_name) + 'toolbox.yaml'),
         ])
 
         log.info('Waiting for tools container to start')
@@ -436,7 +448,7 @@ def rook_toolbox(ctx, config):
     finally:
         _kubectl(ctx, config, [
             'delete',
-            '-f', 'rook/cluster/examples/kubernetes/ceph/toolbox.yaml',
+            '-f', (path_to_examples(ctx, cluster_name) + 'toolbox.yaml'),
         ], check_status=False)
 
 
@@ -493,7 +505,6 @@ def wait_for_osds(ctx, config):
     yield
 
 
-
 @contextlib.contextmanager
 def ceph_config_keyring(ctx, config):
     # get config and push to hosts
diff --git a/src/pybind/mgr/rook/rook_cluster.py b/src/pybind/mgr/rook/rook_cluster.py
index ab433e71ca994..4662b06143673 100644
--- a/src/pybind/mgr/rook/rook_cluster.py
+++ b/src/pybind/mgr/rook/rook_cluster.py
@@ -423,7 +423,6 @@ class DefaultCreator():
             ]
             for device in to_create:
                 new_scds = self.device_to_device_set(drive_group, device)
-                new_cluster.spec.storage.storageClassDeviceSets.append(new_scds)
                 if new_scds.name not in existing_scds:
                     new_cluster.spec.storage.storageClassDeviceSets.append(new_scds)
             return new_cluster
@@ -1187,6 +1186,7 @@ class RookCluster(object):
                     )
                 ],
                 security_context=client.V1SecurityContext(
+                    run_as_user=0,
                     privileged=True
                 ),
                 volume_mounts=[
-- 
2.39.5