+++ /dev/null
-*~
-.*.sw[nmop]
-*.pyc
-.tox
-__pycache__
+++ /dev/null
-ceph-qa-suite
--------------
-
-clusters/ - some predefined cluster layouts
-suites/ - set suite
-
-The suites directory has a hierarchical collection of tests. This can be
-freeform, but generally follows the convention of
-
- suites/<test suite name>/<test group>/...
-
-A test is described by a yaml fragment.
-
-A test can exist as a single .yaml file in the directory tree. For example:
-
- suites/foo/one.yaml
- suites/foo/two.yaml
-
-is a simple group of two tests.
-
-A directory with a magic '+' file represents a test that combines all
-other items in the directory into a single yaml fragment. For example:
-
- suites/foo/bar/+
- suites/foo/bar/a.yaml
- suites/foo/bar/b.yaml
- suites/foo/bar/c.yaml
-
-is a single test consisting of a + b + c.
-
-A directory with a magic '%' file represents a test matrix formed from
-all other items in the directory. For example,
-
- suites/baz/%
- suites/baz/a.yaml
- suites/baz/b/b1.yaml
- suites/baz/b/b2.yaml
- suites/baz/c.yaml
- suites/baz/d/d1.yaml
- suites/baz/d/d2.yaml
-
-is a 4-dimensional test matrix. Two dimensions (a, c) are trivial (1
-item), so this is really 2x2 = 4 tests, which are
-
- a + b1 + c + d1
- a + b1 + c + d2
- a + b2 + c + d1
- a + b2 + c + d2
-
-Symlinks are okay.
-
-The teuthology code can be found in https://github.com/ceph/teuthology.git
+++ /dev/null
-arch: aarch64
+++ /dev/null
-arch: armv7l
+++ /dev/null
-arch: i686
+++ /dev/null
-arch: x86_64
+++ /dev/null
-overrides:
- ceph-deploy:
- dmcrypt: yes
+++ /dev/null
-overrides:
- ceph-deploy:
- separate_journal_disk:
+++ /dev/null
-overrides:
- ceph-deploy:
- separate_journal_disk: yes
+++ /dev/null
-overrides:
- ceph-deploy:
- dmcrypt: yes
- separate_journal_disk: yes
+++ /dev/null
-tasks:
- - install:
- - ceph:
+++ /dev/null
-roles:
-- [mon.a, mon.c, mds.a, osd.0, osd.1, osd.2]
-- [mon.b, mds.b, mds.c, osd.3, osd.4, osd.5]
-- [client.0]
+++ /dev/null
-roles:
-- [mon.a, mon.c, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2]
-- [mon.b, mds.e, mds.f, mds.g, mds.h, mds.i, osd.3, osd.4, osd.5]
-- [client.0]
+++ /dev/null
-roles:
-- [mon.a, mds.a, osd.0, osd.1, client.0]
-- [mon.b, mds.a-s, mon.c, osd.2, osd.3]
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
+++ /dev/null
-tasks:
- - ceph-fuse:
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-tasks:
-- kclient:
+++ /dev/null
-overrides:
- ceph:
- conf:
- mds:
- debug ms: 1
- debug mds: 20
- client:
- debug ms: 1
- debug client: 20
+++ /dev/null
-overrides:
- ceph:
- conf:
- mds:
- mds bal frag: true
- mds bal fragment size max: 10000
- mds bal split size: 100
- mds bal merge size: 5
- mds bal split bits: 3
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse default permissions: false
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse default permissions: true
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- conf:
- mds:
- debug mds: 20
- debug ms: 1
- client:
- debug client: 10
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/blogbench.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/dbench.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- filestore flush min: 0
-tasks:
-- workunit:
- clients:
- all:
- - suites/ffsb.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all: [fs/misc/trivial_sync.sh]
+++ /dev/null
-overrides:
- ceph-fuse:
- disabled: true
- kclient:
- disabled: true
-tasks:
-- workunit:
- clients:
- client.0:
- - libcephfs/test.sh
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2]
-- [mon.b, mds.a, osd.3, osd.4, osd.5]
-- [client.0]
-- [client.1]
+++ /dev/null
-overrides:
- ceph-deploy:
- conf:
- global:
- osd pool default size: 2
- osd crush chooseleaf type: 0
- osd pool default pg num: 128
- osd pool default pgp num: 128
-roles:
-- [mon.a, osd.0, osd.1, osd.2, client.0]
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2, client.0]
-- [mon.b, osd.3, osd.4, osd.5, client.1]
+++ /dev/null
-roles:
-- [mon.a, mds.a, osd.0, osd.1]
-- [mon.b, mds.a-s, mon.c, osd.2, osd.3]
-- [client.0]
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2]
-- [mon.b, osd.3, osd.4, osd.5]
-- [client.0]
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.4, osd.8, osd.12]
-- [mon.b, osd.1, osd.5, osd.9, osd.13]
-- [mon.c, osd.2, osd.6, osd.10, osd.14]
-- [osd.3, osd.7, osd.11, osd.15, client.0]
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- osd op queue: debug_random
- osd op queue cut off: debug_random
- osd debug verify missing on start: true
+++ /dev/null
-overrides:
- ceph-deploy:
- conf:
- global:
- mon pg warn min per osd: 2
- osd pool default size: 2
+++ /dev/null
-tasks:
- - buildpackages:
- machine:
- disk: 40 # GB
- ram: 15000 # MB
- cpus: 16
+++ /dev/null
-overrides:
- ceph:
- conf:
- mds:
- debug ms: 1
- debug mds: 20
- client:
- debug ms: 1
- debug client: 20
\ No newline at end of file
+++ /dev/null
-openstack:
- - machine:
- ram: 15000 # MB
+++ /dev/null
-openstack:
- - machine:
- ram: 30000 # MB
+++ /dev/null
-all/centos_7.2.yaml
\ No newline at end of file
+++ /dev/null
-os_type: centos
-os_version: "6.3"
+++ /dev/null
-os_type: centos
-os_version: "6.4"
+++ /dev/null
-os_type: centos
-os_version: "6.5"
+++ /dev/null
-os_type: centos
-os_version: "7.0"
+++ /dev/null
-os_type: centos
-os_version: "7.1"
+++ /dev/null
-os_type: centos
-os_version: "7.2"
+++ /dev/null
-os_type: debian
-os_version: "6.0"
+++ /dev/null
-os_type: debian
-os_version: "7.0"
+++ /dev/null
-os_type: debian
-os_version: "8.0"
+++ /dev/null
-os_type: fedora
-os_version: "17"
+++ /dev/null
-os_type: fedora
-os_version: "18"
+++ /dev/null
-os_type: fedora
-os_version: "19"
+++ /dev/null
-os_type: opensuse
-os_version: "12.2"
+++ /dev/null
-os_type: opensuse
-os_version: "13.2"
+++ /dev/null
-os_type: opensuse
-os_version: "42.1"
+++ /dev/null
-os_type: rhel
-os_version: "6.3"
+++ /dev/null
-os_type: rhel
-os_version: "6.4"
+++ /dev/null
-os_type: rhel
-os_version: "6.5"
+++ /dev/null
-os_type: rhel
-os_version: "7.0"
+++ /dev/null
-os_type: sles
-os_version: "11-sp2"
+++ /dev/null
-os_type: ubuntu
-os_version: "12.04"
+++ /dev/null
-os_type: ubuntu
-os_version: "12.10"
+++ /dev/null
-os_type: ubuntu
-os_version: "14.04"
+++ /dev/null
-os_type: ubuntu
-os_version: "14.04"
-arch: aarch64
+++ /dev/null
-os_type: ubuntu
-os_version: "14.04"
-arch: i686
+++ /dev/null
-os_type: ubuntu
-os_version: "16.04"
+++ /dev/null
-../all/centos_7.2.yaml
\ No newline at end of file
+++ /dev/null
-../all/ubuntu_14.04.yaml
\ No newline at end of file
+++ /dev/null
-../all/ubuntu_16.04.yaml
\ No newline at end of file
+++ /dev/null
-#
-# Test the expected behavior of the
-#
-# CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
-#
-# feature.
-#
-roles:
-- - mon.a
- - mon.b
- - osd.0
- - osd.1
-- - osd.2
- - mon.c
-tasks:
-#
-# Install firefly
-#
-- install:
- branch: firefly
-- ceph:
- fs: xfs
-#
-# We don't need mon.c for now: it will be used later to make sure an old
-# mon cannot join the quorum once the feature has been activated
-#
-- ceph.stop:
- daemons: [mon.c]
-- exec:
- mon.a:
- - |-
- ceph osd erasure-code-profile set WRONG plugin=WRONG
- ceph osd pool create poolWRONG 12 12 erasure WRONG 2>&1 | grep "failed to load plugin using profile WRONG"
-#
-# Partial upgrade, osd.2 is not upgraded
-#
-- install.upgrade:
- osd.0:
-#
-# a is the leader
-#
-- ceph.restart:
- daemons: [mon.a]
- wait-for-healthy: false
-- exec:
- mon.a:
- - |-
- ceph osd erasure-code-profile set profile-lrc plugin=lrc 2>&1 | grep "unsupported by: the monitor cluster"
-- ceph.restart:
- daemons: [mon.b, osd.1, osd.0]
- wait-for-healthy: false
- wait-for-osds-up: true
-#
-# The lrc plugin cannot be used because osd.2 is not upgraded yet
-# and would crash.
-#
-- exec:
- mon.a:
- - |-
- ceph osd erasure-code-profile set profile-lrc plugin=lrc 2>&1 | grep "unsupported by: osd.2"
-#
-# Taking osd.2 out, the rest of the cluster is upgraded
-#
-- ceph.stop:
- daemons: [osd.2]
-- sleep:
- duration: 60
-#
-# Creating an erasure code profile using the lrc plugin now works
-#
-- exec:
- mon.a:
- - "ceph osd erasure-code-profile set profile-lrc plugin=lrc"
-#
-# osd.2 won't be able to join the because is does not support the feature
-#
-- ceph.restart:
- daemons: [osd.2]
- wait-for-healthy: false
-- sleep:
- duration: 60
-- exec:
- osd.2:
- - |-
- grep "protocol feature.*missing 100000000000" /var/log/ceph/ceph-osd.2.log
-#
-# mon.c won't be able to join the because it does not support the feature
-#
-- ceph.restart:
- daemons: [mon.c]
- wait-for-healthy: false
-- sleep:
- duration: 60
-- exec:
- mon.c:
- - |-
- grep "missing.*feature" /var/log/ceph/ceph-mon.c.log
+++ /dev/null
-#
-# Test the expected behavior of the
-#
-# CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
-#
-# feature.
-#
-roles:
-- - mon.a
- - mon.b
- - osd.0
- - osd.1
-- - osd.2
- - mon.c
-tasks:
-#
-# Install hammer
-#
-- install:
- branch: hammer
-- ceph:
- fs: xfs
-#
-# We don't need mon.c for now: it will be used later to make sure an old
-# mon cannot join the quorum once the feature has been activated
-#
-- ceph.stop:
- daemons: [mon.c]
-- exec:
- mon.a:
- - |-
- ceph osd erasure-code-profile set WRONG plugin=WRONG
- ceph osd pool create poolWRONG 12 12 erasure WRONG 2>&1 | grep "failed to load plugin using profile WRONG"
-#
-# Partial upgrade, osd.2 is not upgraded
-#
-- install.upgrade:
- osd.0:
-#
-# a is the leader
-#
-- ceph.restart:
- daemons: [mon.a]
- wait-for-healthy: false
-- exec:
- mon.a:
- - |-
- ceph osd erasure-code-profile set profile-shec k=2 m=1 c=1 plugin=shec 2>&1 | grep "unsupported by: the monitor cluster"
-- ceph.restart:
- daemons: [mon.b, osd.1, osd.0]
- wait-for-healthy: false
- wait-for-osds-up: true
-#
-# The shec plugin cannot be used because osd.2 is not upgraded yet
-# and would crash.
-#
-- exec:
- mon.a:
- - |-
- ceph osd erasure-code-profile set profile-shec k=2 m=1 c=1 plugin=shec 2>&1 | grep "unsupported by: osd.2"
-#
-# Taking osd.2 out, the rest of the cluster is upgraded
-#
-- ceph.stop:
- daemons: [osd.2]
-- sleep:
- duration: 60
-#
-# Creating an erasure code profile using the shec plugin now works
-#
-- exec:
- mon.a:
- - "ceph osd erasure-code-profile set profile-shec k=2 m=1 c=1 plugin=shec"
-#
-# osd.2 won't be able to join the because is does not support the feature
-#
-- ceph.restart:
- daemons: [osd.2]
- wait-for-healthy: false
-- sleep:
- duration: 60
-- exec:
- osd.2:
- - |-
- grep "protocol feature.*missing" /var/log/ceph/ceph-osd.2.log
-#
-# mon.c won't be able to join the because it does not support the feature
-#
-- ceph.restart:
- daemons: [mon.c]
- wait-for-healthy: false
-- sleep:
- duration: 60
-- exec:
- mon.c:
- - |-
- grep "missing.*feature" /var/log/ceph/ceph-mon.c.log
+++ /dev/null
-tasks:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec task"
+++ /dev/null
-workload:
- parallel:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec parallel"
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- erasure_code_profile:
- name: isaprofile
- plugin: isa
- k: 2
- m: 1
- technique: reed_sol_van
- ruleset-failure-domain: osd
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- erasure_code_profile:
- name: jerasure21profile
- plugin: jerasure
- k: 2
- m: 1
- technique: reed_sol_van
- ruleset-failure-domain: osd
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-#
-# k=3 implies a stripe_width of 1376*3 = 4128 which is different from
-# the default value of 4096 It is also not a multiple of 1024*1024 and
-# creates situations where rounding rules during recovery becomes
-# necessary.
-#
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- erasure_code_profile:
- name: jerasure31profile
- plugin: jerasure
- k: 3
- m: 1
- technique: reed_sol_van
- ruleset-failure-domain: osd
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 400
- objects: 50
- ec_pool: true
- write_append_excl: false
- erasure_code_profile:
- name: lrcprofile
- plugin: lrc
- k: 4
- m: 2
- l: 3
- ruleset-failure-domain: osd
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 400
- objects: 50
- ec_pool: true
- write_append_excl: false
- erasure_code_profile:
- name: shecprofile
- plugin: shec
- k: 4
- m: 3
- c: 2
- ruleset-failure-domain: osd
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-workload:
- sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec sequential"
+++ /dev/null
-overrides:
- ceph:
- fs: btrfs
- conf:
- osd:
- osd sloppy crc: true
- osd op thread timeout: 60
+++ /dev/null
-overrides:
- ceph:
- fs: ext4
- conf:
- global:
- osd max object name len: 460
- osd max object namespace len: 64
+++ /dev/null
-overrides:
- ceph:
- fs: xfs
- conf:
- osd:
- osd sloppy crc: true
+++ /dev/null
-#!/bin/bash
-
-# $1 - part
-# $2 - branch name
-# $3 - machine name
-# $4 - email address
-# $5 - filter out (this arg is to be at the end of the command line for now)
-
-## example #1
-## (date +%U) week number
-## % 2 - mod 2 (e.g. 0,1,0,1 ...)
-## * 7 - multiplied by 7 (e.g. 0,7,0,7...)
-## $1 day of the week (0-6)
-## /14 for 2 weeks
-
-## example #2
-## (date +%U) week number
-## % 4 - mod 4 (e.g. 0,1,2,3,0,1,2,3 ...)
-## * 7 - multiplied by 7 (e.g. 0,7,14,21,0,7,14,21...)
-## $1 day of the week (0-6)
-## /28 for 4 weeks
-
-echo "Scheduling " $2 " branch"
-if [ $2 = "master" ] ; then
- # run master branch with --newest option looking for good sha1 7 builds back
- teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 --newest 7 -e $4 $5
-elif [ $2 = "hammer" ] ; then
- # run hammer branch with less jobs
- teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/56 -e $4 $5
-elif [ $2 = "jewel" ] ; then
- # run jewel branch with /40 jobs
- teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $4 $5
-else
- # run NON master branches without --newest
- teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 -e $4 $5
-fi
+++ /dev/null
-#!/bin/bash
-
-# $1 - part
-# $2 - branch name
-# $3 - machine name
-# $4 - email address
-# $5 - filter out (this arg is to be at the end of the command line for now)
-
-## example #1
-## (date +%U) week number
-## % 2 - mod 2 (e.g. 0,1,0,1 ...)
-## * 7 - multiplied by 7 (e.g. 0,7,0,7...)
-## $1 day of the week (0-6)
-## /14 for 2 weeks
-
-## example #2
-## (date +%U) week number
-## % 4 - mod 4 (e.g. 0,1,2,3,0,1,2,3 ...)
-## * 7 - multiplied by 7 (e.g. 0,7,14,21,0,7,14,21...)
-## $1 day of the week (0-6)
-## /28 for 4 weeks
-
-echo "Scheduling " $2 " branch"
-if [ $2 = "master" ] ; then
- # run master branch with --newest option looking for good sha1 7 builds back
- teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 --newest 7 -e $4 ~/vps.yaml $5
-elif [ $2 = "hammer" ] ; then
- # run hammer branch with less jobs
- teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/56 -e $4 ~/vps.yaml $5
-elif [ $2 = "jewel" ] ; then
- # run jewel branch with /40 jobs
- teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $4 ~/vps.yaml $5
-else
- # run NON master branches without --newest
- teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 -e $4 ~/vps.yaml $5
-fi
-
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- osd heartbeat grace: 100
- # this line to address issue #1017
- mon lease: 15
- mon lease ack timeout: 25
- rgw:
- default_idle_timeout: 1200
- s3tests:
- idle_timeout: 1200
- ceph-fuse:
- client.0:
- mount_wait: 60
- mount_timeout: 120
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- mon keyvaluedb: leveldb
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- enable experimental unrecoverable data corrupting features: '*'
- mon:
- mon keyvaluedb: rocksdb
+++ /dev/null
-#!/bin/bash
-# /nightlies/cron_wrapper.sh
-
-# check for no argument case and stop
-if [ -z $1 ]; then
- echo "need argument"
- exit 1
-fi
-
-# set permanent $LOG file var
-LOG="/var/log/crontab-nightlies-log/crontab.log"
-# set $LOG_LOCKED_ERR in case locking failed
-LOG_LOCK_ERR="/var/log/crontab-nightlies-log/crontab_lock_problem.$$"
-
-# temp files to store stdout and stderr
-# named with the PID of this script in their name so they'll be unique
-STDERR="/var/tmp/stderr.$$"
-STDOUT="/var/tmp/stdout.$$"
-
-# $STDOUT and $STDERR are removed when the script exits for any reason
-trap "rm -f $STDOUT $STDERR" 0
-
-# run a command from this script's argument
-# redirect stdout to $STDOUT file and redirect stderr to $STDERR file
-
-DATE=$(date)
-echo -n "$DATE: " >> $STDOUT
-echo "Running command: $@" >> $STDOUT
-"$@" > $STDOUT 2> $STDERR
-
-# get return code from the command run
-code=$?
-
-if [ $code != 0 ] ; then
- # echoing to stdout/stderr makes cron send email
- echo "stdout:"
- cat $STDOUT
- echo "stderr:"
- cat $STDERR
-else
- # normal exit: just log stdout
-
- # lock $LOG with file descriptor 200
- exec 200>>$LOG
- # if $LOG is locked by other process - wait for 20 sec
- flock -w 20 200 || LOG=$LOG_LOCK_ERR
- echo "stdout:" >> $LOG
- cat $STDOUT >> $LOG
- echo "stderr:" >> $LOG
- cat $STDERR >> $LOG
- # unlock
- flock -u 200
-fi
+++ /dev/null
-overrides:
- thrashosds:
- bdev_inject_crash: 2
- bdev_inject_crash_probability: .5
- ceph:
- conf:
- osd:
- osd objectstore: bluestore
- bluestore block size: 96636764160
- debug bluestore: 30
- debug bdev: 20
- debug bluefs: 20
- debug rocksdb: 10
- enable experimental unrecoverable data corrupting features: "*"
- osd debug randomize hobject sort order: false
-# this doesn't work with failures bc the log writes are not atomic across the two backends
-# bluestore bluefs env mirror: true
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- osd objectstore: filestore
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- osd_pool_default_size: 2
- osd_pool_default_min_size: 1
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- osd_pool_default_size: 2
- osd_pool_default_min_size: 2
+++ /dev/null
-overrides:
- thrashosds:
- min_in: 4
- ceph:
- conf:
- global:
- osd_pool_default_size: 3
- osd_pool_default_min_size: 2
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- osd_min_pg_log_entries: 300
- osd_max_pg_log_entries: 600
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- conf:
- mds:
- debug mds: 20
- debug ms: 1
- client:
- debug client: 10
\ No newline at end of file
+++ /dev/null
----
-ceph:
- deb:
- - ceph
- - ceph-mds
- - ceph-mgr
- - ceph-common
- - ceph-fuse
- - ceph-test
- - radosgw
- - python-ceph
- - libcephfs2
- - libcephfs-dev
- - libcephfs-java
- - libcephfs-jni
- - librados2
- - librbd1
- - rbd-fuse
- - ceph-common-dbg
- - ceph-fuse-dbg
- - ceph-mds-dbg
- - ceph-mgr-dbg
- - ceph-mon-dbg
- - ceph-osd-dbg
- - ceph-test-dbg
- - libcephfs2-dbg
- - librados2-dbg
- - libradosstriper1-dbg
- - librbd1-dbg
- - librgw2-dbg
- - radosgw-dbg
- - rbd-fuse-dbg
- - rbd-mirror-dbg
- - rbd-nbd-dbg
- rpm:
- - ceph-radosgw
- - ceph-test
- - ceph
- - ceph-mgr
- - ceph-fuse
- - cephfs-java
- - libcephfs_jni1
- - libcephfs2
- - libcephfs-devel
- - librados2
- - librbd1
- - python-ceph
- - rbd-fuse
- - ceph-debuginfo
--- /dev/null
+*~
+.*.sw[nmop]
+*.pyc
+.tox
+__pycache__
--- /dev/null
+ceph-qa-suite
+-------------
+
+clusters/ - some predefined cluster layouts
+suites/ - the test suite definitions
+
+The suites directory has a hierarchical collection of tests. This can be
+freeform, but generally follows the convention of
+
+ suites/<test suite name>/<test group>/...
+
+A test is described by a yaml fragment.
+
+A test can exist as a single .yaml file in the directory tree. For example:
+
+ suites/foo/one.yaml
+ suites/foo/two.yaml
+
+is a simple group of two tests.
+
+A directory with a magic '+' file represents a test that combines all
+other items in the directory into a single yaml fragment. For example:
+
+ suites/foo/bar/+
+ suites/foo/bar/a.yaml
+ suites/foo/bar/b.yaml
+ suites/foo/bar/c.yaml
+
+is a single test consisting of a + b + c.
+
+A directory with a magic '%' file represents a test matrix formed from
+all other items in the directory. For example,
+
+ suites/baz/%
+ suites/baz/a.yaml
+ suites/baz/b/b1.yaml
+ suites/baz/b/b2.yaml
+ suites/baz/c.yaml
+ suites/baz/d/d1.yaml
+ suites/baz/d/d2.yaml
+
+is a 4-dimensional test matrix. Two dimensions (a, c) are trivial (1
+item), so this is really 2x2 = 4 tests, which are
+
+ a + b1 + c + d1
+ a + b1 + c + d2
+ a + b2 + c + d1
+ a + b2 + c + d2
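+
+For illustration only, a minimal sketch of how such a tree could be expanded
+(hypothetical Python, not the actual teuthology implementation; the real
+logic lives in teuthology's suite builder):
+
+    import itertools, os
+
+    def expand(path):
+        """Return a list of tests; each test is a list of yaml fragments."""
+        if not os.path.isdir(path):
+            return [[path]]                  # a single .yaml fragment
+        entries = sorted(e for e in os.listdir(path) if e not in ('%', '+'))
+        children = [expand(os.path.join(path, e)) for e in entries]
+        if os.path.exists(os.path.join(path, '%')):
+            # matrix: one dimension per entry, emit every combination
+            return [sum(combo, []) for combo in itertools.product(*children)]
+        if os.path.exists(os.path.join(path, '+')):
+            # concatenation: everything in the directory forms one test
+            # (simplified: assumes each entry expands to a single test)
+            return [sum((tests[0] for tests in children), [])]
+        # plain directory: each entry stands on its own
+        return [test for tests in children for test in tests]
+
+Applied to suites/baz, expand() yields exactly the four combinations above.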
+
+Symlinks are okay.
+
+The teuthology code can be found at https://github.com/ceph/teuthology.git
--- /dev/null
+arch: aarch64
--- /dev/null
+arch: armv7l
--- /dev/null
+arch: i686
--- /dev/null
+arch: x86_64
--- /dev/null
+overrides:
+ ceph-deploy:
+ dmcrypt: yes
--- /dev/null
+overrides:
+ ceph-deploy:
+ separate_journal_disk:
--- /dev/null
+overrides:
+ ceph-deploy:
+ separate_journal_disk: yes
--- /dev/null
+overrides:
+ ceph-deploy:
+ dmcrypt: yes
+ separate_journal_disk: yes
--- /dev/null
+tasks:
+ - install:
+ - ceph:
--- /dev/null
+roles:
+- [mon.a, mon.c, mds.a, osd.0, osd.1, osd.2]
+- [mon.b, mds.b, mds.c, osd.3, osd.4, osd.5]
+- [client.0]
--- /dev/null
+roles:
+- [mon.a, mon.c, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2]
+- [mon.b, mds.e, mds.f, mds.g, mds.h, mds.i, osd.3, osd.4, osd.5]
+- [client.0]
--- /dev/null
+roles:
+- [mon.a, mds.a, osd.0, osd.1, client.0]
+- [mon.b, mds.a-s, mon.c, osd.2, osd.3]
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+tasks:
+ - ceph-fuse:
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+tasks:
+- kclient:
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mds:
+ debug ms: 1
+ debug mds: 20
+ client:
+ debug ms: 1
+ debug client: 20
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mds:
+ mds bal frag: true
+ mds bal fragment size max: 10000
+ mds bal split size: 100
+ mds bal merge size: 5
+ mds bal split bits: 3
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse default permissions: false
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse default permissions: true
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ conf:
+ mds:
+ debug mds: 20
+ debug ms: 1
+ client:
+ debug client: 10
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/blogbench.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/dbench.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ filestore flush min: 0
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/ffsb.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all: [fs/misc/trivial_sync.sh]
--- /dev/null
+overrides:
+ ceph-fuse:
+ disabled: true
+ kclient:
+ disabled: true
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - libcephfs/test.sh
--- /dev/null
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2]
+- [mon.b, mds.a, osd.3, osd.4, osd.5]
+- [client.0]
+- [client.1]
--- /dev/null
+overrides:
+ ceph-deploy:
+ conf:
+ global:
+ osd pool default size: 2
+ osd crush chooseleaf type: 0
+ osd pool default pg num: 128
+ osd pool default pgp num: 128
+roles:
+- [mon.a, osd.0, osd.1, osd.2, client.0]
--- /dev/null
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2, client.0]
+- [mon.b, osd.3, osd.4, osd.5, client.1]
--- /dev/null
+roles:
+- [mon.a, mds.a, osd.0, osd.1]
+- [mon.b, mds.a-s, mon.c, osd.2, osd.3]
+- [client.0]
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2]
+- [mon.b, osd.3, osd.4, osd.5]
+- [client.0]
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.4, osd.8, osd.12]
+- [mon.b, osd.1, osd.5, osd.9, osd.13]
+- [mon.c, osd.2, osd.6, osd.10, osd.14]
+- [osd.3, osd.7, osd.11, osd.15, client.0]
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ osd op queue: debug_random
+ osd op queue cut off: debug_random
+ osd debug verify missing on start: true
--- /dev/null
+overrides:
+ ceph-deploy:
+ conf:
+ global:
+ mon pg warn min per osd: 2
+ osd pool default size: 2
--- /dev/null
+tasks:
+ - buildpackages:
+ machine:
+ disk: 40 # GB
+ ram: 15000 # MB
+ cpus: 16
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mds:
+ debug ms: 1
+ debug mds: 20
+ client:
+ debug ms: 1
+ debug client: 20
\ No newline at end of file
--- /dev/null
+openstack:
+ - machine:
+ ram: 15000 # MB
--- /dev/null
+openstack:
+ - machine:
+ ram: 30000 # MB
--- /dev/null
+all/centos_7.2.yaml
\ No newline at end of file
--- /dev/null
+os_type: centos
+os_version: "6.3"
--- /dev/null
+os_type: centos
+os_version: "6.4"
--- /dev/null
+os_type: centos
+os_version: "6.5"
--- /dev/null
+os_type: centos
+os_version: "7.0"
--- /dev/null
+os_type: centos
+os_version: "7.1"
--- /dev/null
+os_type: centos
+os_version: "7.2"
--- /dev/null
+os_type: debian
+os_version: "6.0"
--- /dev/null
+os_type: debian
+os_version: "7.0"
--- /dev/null
+os_type: debian
+os_version: "8.0"
--- /dev/null
+os_type: fedora
+os_version: "17"
--- /dev/null
+os_type: fedora
+os_version: "18"
--- /dev/null
+os_type: fedora
+os_version: "19"
--- /dev/null
+os_type: opensuse
+os_version: "12.2"
--- /dev/null
+os_type: opensuse
+os_version: "13.2"
--- /dev/null
+os_type: opensuse
+os_version: "42.1"
--- /dev/null
+os_type: rhel
+os_version: "6.3"
--- /dev/null
+os_type: rhel
+os_version: "6.4"
--- /dev/null
+os_type: rhel
+os_version: "6.5"
--- /dev/null
+os_type: rhel
+os_version: "7.0"
--- /dev/null
+os_type: sles
+os_version: "11-sp2"
--- /dev/null
+os_type: ubuntu
+os_version: "12.04"
--- /dev/null
+os_type: ubuntu
+os_version: "12.10"
--- /dev/null
+os_type: ubuntu
+os_version: "14.04"
--- /dev/null
+os_type: ubuntu
+os_version: "14.04"
+arch: aarch64
--- /dev/null
+os_type: ubuntu
+os_version: "14.04"
+arch: i686
--- /dev/null
+os_type: ubuntu
+os_version: "16.04"
--- /dev/null
+../all/centos_7.2.yaml
\ No newline at end of file
--- /dev/null
+../all/ubuntu_14.04.yaml
\ No newline at end of file
--- /dev/null
+../all/ubuntu_16.04.yaml
\ No newline at end of file
--- /dev/null
+#
+# Test the expected behavior of the
+#
+# CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
+#
+# feature.
+#
+roles:
+- - mon.a
+ - mon.b
+ - osd.0
+ - osd.1
+- - osd.2
+ - mon.c
+tasks:
+#
+# Install firefly
+#
+- install:
+ branch: firefly
+- ceph:
+ fs: xfs
+#
+# We don't need mon.c for now: it will be used later to make sure an old
+# mon cannot join the quorum once the feature has been activated
+#
+- ceph.stop:
+ daemons: [mon.c]
+- exec:
+ mon.a:
+ - |-
+ ceph osd erasure-code-profile set WRONG plugin=WRONG
+ ceph osd pool create poolWRONG 12 12 erasure WRONG 2>&1 | grep "failed to load plugin using profile WRONG"
+#
+# Partial upgrade, osd.2 is not upgraded
+#
+- install.upgrade:
+ osd.0:
+#
+# a is the leader
+#
+- ceph.restart:
+ daemons: [mon.a]
+ wait-for-healthy: false
+- exec:
+ mon.a:
+ - |-
+ ceph osd erasure-code-profile set profile-lrc plugin=lrc 2>&1 | grep "unsupported by: the monitor cluster"
+- ceph.restart:
+ daemons: [mon.b, osd.1, osd.0]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+#
+# The lrc plugin cannot be used because osd.2 is not upgraded yet
+# and would crash.
+#
+- exec:
+ mon.a:
+ - |-
+ ceph osd erasure-code-profile set profile-lrc plugin=lrc 2>&1 | grep "unsupported by: osd.2"
+#
+# Taking osd.2 out, the rest of the cluster is upgraded
+#
+- ceph.stop:
+ daemons: [osd.2]
+- sleep:
+ duration: 60
+#
+# Creating an erasure code profile using the lrc plugin now works
+#
+- exec:
+ mon.a:
+ - "ceph osd erasure-code-profile set profile-lrc plugin=lrc"
+#
+# osd.2 won't be able to join the cluster because it does not support the feature
+#
+- ceph.restart:
+ daemons: [osd.2]
+ wait-for-healthy: false
+- sleep:
+ duration: 60
+- exec:
+ osd.2:
+ - |-
+ grep "protocol feature.*missing 100000000000" /var/log/ceph/ceph-osd.2.log
+#
+# mon.c won't be able to join the quorum because it does not support the feature
+#
+- ceph.restart:
+ daemons: [mon.c]
+ wait-for-healthy: false
+- sleep:
+ duration: 60
+- exec:
+ mon.c:
+ - |-
+ grep "missing.*feature" /var/log/ceph/ceph-mon.c.log
--- /dev/null
+#
+# Test the expected behavior of the
+#
+# CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
+#
+# feature.
+#
+roles:
+- - mon.a
+ - mon.b
+ - osd.0
+ - osd.1
+- - osd.2
+ - mon.c
+tasks:
+#
+# Install hammer
+#
+- install:
+ branch: hammer
+- ceph:
+ fs: xfs
+#
+# We don't need mon.c for now: it will be used later to make sure an old
+# mon cannot join the quorum once the feature has been activated
+#
+- ceph.stop:
+ daemons: [mon.c]
+- exec:
+ mon.a:
+ - |-
+ ceph osd erasure-code-profile set WRONG plugin=WRONG
+ ceph osd pool create poolWRONG 12 12 erasure WRONG 2>&1 | grep "failed to load plugin using profile WRONG"
+#
+# Partial upgrade, osd.2 is not upgraded
+#
+- install.upgrade:
+ osd.0:
+#
+# a is the leader
+#
+- ceph.restart:
+ daemons: [mon.a]
+ wait-for-healthy: false
+- exec:
+ mon.a:
+ - |-
+ ceph osd erasure-code-profile set profile-shec k=2 m=1 c=1 plugin=shec 2>&1 | grep "unsupported by: the monitor cluster"
+- ceph.restart:
+ daemons: [mon.b, osd.1, osd.0]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+#
+# The shec plugin cannot be used because osd.2 is not upgraded yet
+# and would crash.
+#
+- exec:
+ mon.a:
+ - |-
+ ceph osd erasure-code-profile set profile-shec k=2 m=1 c=1 plugin=shec 2>&1 | grep "unsupported by: osd.2"
+#
+# Taking osd.2 out, the rest of the cluster is upgraded
+#
+- ceph.stop:
+ daemons: [osd.2]
+- sleep:
+ duration: 60
+#
+# Creating an erasure code profile using the shec plugin now works
+#
+- exec:
+ mon.a:
+ - "ceph osd erasure-code-profile set profile-shec k=2 m=1 c=1 plugin=shec"
+#
+# osd.2 won't be able to join the cluster because it does not support the feature
+#
+- ceph.restart:
+ daemons: [osd.2]
+ wait-for-healthy: false
+- sleep:
+ duration: 60
+- exec:
+ osd.2:
+ - |-
+ grep "protocol feature.*missing" /var/log/ceph/ceph-osd.2.log
+#
+# mon.c won't be able to join the quorum because it does not support the feature
+#
+- ceph.restart:
+ daemons: [mon.c]
+ wait-for-healthy: false
+- sleep:
+ duration: 60
+- exec:
+ mon.c:
+ - |-
+ grep "missing.*feature" /var/log/ceph/ceph-mon.c.log
--- /dev/null
+tasks:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec task"
--- /dev/null
+workload:
+ parallel:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec parallel"
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ erasure_code_profile:
+ name: isaprofile
+ plugin: isa
+ k: 2
+ m: 1
+ technique: reed_sol_van
+ ruleset-failure-domain: osd
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ erasure_code_profile:
+ name: jerasure21profile
+ plugin: jerasure
+ k: 2
+ m: 1
+ technique: reed_sol_van
+ ruleset-failure-domain: osd
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+#
+# k=3 implies a stripe_width of 1376*3 = 4128, which differs from the
+# default value of 4096. It is also not a multiple of 1024*1024 and
+# creates situations where rounding rules become necessary during
+# recovery.
+#
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ erasure_code_profile:
+ name: jerasure31profile
+ plugin: jerasure
+ k: 3
+ m: 1
+ technique: reed_sol_van
+ ruleset-failure-domain: osd
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ erasure_code_profile:
+ name: lrcprofile
+ plugin: lrc
+ k: 4
+ m: 2
+ l: 3
+ ruleset-failure-domain: osd
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ erasure_code_profile:
+ name: shecprofile
+ plugin: shec
+ k: 4
+ m: 3
+ c: 2
+ ruleset-failure-domain: osd
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+workload:
+ sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec sequential"
--- /dev/null
+overrides:
+ ceph:
+ fs: btrfs
+ conf:
+ osd:
+ osd sloppy crc: true
+ osd op thread timeout: 60
--- /dev/null
+overrides:
+ ceph:
+ fs: ext4
+ conf:
+ global:
+ osd max object name len: 460
+ osd max object namespace len: 64
--- /dev/null
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ osd sloppy crc: true
--- /dev/null
+#!/bin/bash
+
+# $1 - part
+# $2 - branch name
+# $3 - machine name
+# $4 - email address
+# $5 - filter out (this arg is to be at the end of the command line for now)
+
+## example #1
+## (date +%U) week number
+## % 2 - mod 2 (e.g. 0,1,0,1 ...)
+## * 7 - multiplied by 7 (e.g. 0,7,0,7...)
+## $1 day of the week (0-6)
+## /14 for 2 weeks
+
+## example #2
+## (date +%U) week number
+## % 4 - mod 4 (e.g. 0,1,2,3,0,1,2,3 ...)
+## * 7 - multiplied by 7 (e.g. 0,7,14,21,0,7,14,21...)
+## $1 day of the week (0-6)
+## /28 for 4 weeks
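+
+## worked example: in week 37 with part 3, ((37 % 4) * 7) + 3 = 10, so the
+## suite is scheduled with --subset 10/28; over 4 weeks the daily parts
+## cover all 28 subsets exactly once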
+
+echo "Scheduling " $2 " branch"
+if [ $2 = "master" ] ; then
+  # run the master branch with --newest, looking for a good sha1 up to 7 builds back
+ teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 --newest 7 -e $4 $5
+elif [ $2 = "hammer" ] ; then
+  # run the hammer branch with fewer jobs
+ teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/56 -e $4 $5
+elif [ $2 = "jewel" ] ; then
+  # run the jewel branch with a /40 subset
+ teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $4 $5
+else
+  # run non-master branches without --newest
+ teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 -e $4 $5
+fi
--- /dev/null
+#!/bin/bash
+
+# $1 - part
+# $2 - branch name
+# $3 - machine name
+# $4 - email address
+# $5 - filter out (this arg is to be at the end of the command line for now)
+
+## example #1
+## (date +%U) week number
+## % 2 - mod 2 (e.g. 0,1,0,1 ...)
+## * 7 - multiplied by 7 (e.g. 0,7,0,7...)
+## $1 day of the week (0-6)
+## /14 for 2 weeks
+
+## example #2
+## (date +%U) week number
+## % 4 - mod 4 (e.g. 0,1,2,3,0,1,2,3 ...)
+## * 7 - multiplied by 7 (e.g. 0,7,14,21,0,7,14,21...)
+## $1 day of the week (0-6)
+## /28 for 4 weeks
+
+echo "Scheduling " $2 " branch"
+if [ $2 = "master" ] ; then
+  # run the master branch with --newest, looking for a good sha1 up to 7 builds back
+ teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 --newest 7 -e $4 ~/vps.yaml $5
+elif [ $2 = "hammer" ] ; then
+  # run the hammer branch with fewer jobs
+ teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/56 -e $4 ~/vps.yaml $5
+elif [ $2 = "jewel" ] ; then
+  # run the jewel branch with a /40 subset
+ teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $4 ~/vps.yaml $5
+else
+  # run non-master branches without --newest
+ teuthology-suite -v -c $2 -m $3 -k distro -s rados --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 -e $4 ~/vps.yaml $5
+fi
+
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ osd heartbeat grace: 100
+ # this line to address issue #1017
+ mon lease: 15
+ mon lease ack timeout: 25
+ rgw:
+ default_idle_timeout: 1200
+ s3tests:
+ idle_timeout: 1200
+ ceph-fuse:
+ client.0:
+ mount_wait: 60
+ mount_timeout: 120
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon keyvaluedb: leveldb
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: '*'
+ mon:
+ mon keyvaluedb: rocksdb
--- /dev/null
+#!/bin/bash
+# /nightlies/cron_wrapper.sh
+
+# check for no argument case and stop
+if [ -z "$1" ]; then
+ echo "need argument"
+ exit 1
+fi
+
+# set permanent $LOG file var
+LOG="/var/log/crontab-nightlies-log/crontab.log"
+# set $LOG_LOCKED_ERR in case locking failed
+LOG_LOCK_ERR="/var/log/crontab-nightlies-log/crontab_lock_problem.$$"
+
+# temp files to store stdout and stderr
+# named with the PID of this script in their name so they'll be unique
+STDERR="/var/tmp/stderr.$$"
+STDOUT="/var/tmp/stdout.$$"
+
+# $STDOUT and $STDERR are removed when the script exits for any reason
+trap "rm -f $STDOUT $STDERR" 0
+
+# run the command given in this script's arguments
+# append stdout to the $STDOUT file and redirect stderr to the $STDERR file
+# (append, so the timestamp header is not overwritten)
+
+DATE=$(date)
+echo -n "$DATE: " >> $STDOUT
+echo "Running command: $@" >> $STDOUT
+"$@" >> $STDOUT 2> $STDERR
+
+# get return code from the command run
+code=$?
+
+if [ $code != 0 ] ; then
+ # echoing to stdout/stderr makes cron send email
+ echo "stdout:"
+ cat $STDOUT
+ echo "stderr:"
+ cat $STDERR
+else
+ # normal exit: just log stdout
+
+ # lock $LOG with file descriptor 200
+ exec 200>>$LOG
+ # if $LOG is locked by other process - wait for 20 sec
+ flock -w 20 200 || LOG=$LOG_LOCK_ERR
+ echo "stdout:" >> $LOG
+ cat $STDOUT >> $LOG
+ echo "stderr:" >> $LOG
+ cat $STDERR >> $LOG
+ # unlock
+ flock -u 200
+fi
--- /dev/null
+overrides:
+ thrashosds:
+ bdev_inject_crash: 2
+ bdev_inject_crash_probability: .5
+ ceph:
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 30
+ debug bdev: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ enable experimental unrecoverable data corrupting features: "*"
+ osd debug randomize hobject sort order: false
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ osd objectstore: filestore
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_pool_default_size: 2
+ osd_pool_default_min_size: 1
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_pool_default_size: 2
+ osd_pool_default_min_size: 2
--- /dev/null
+overrides:
+ thrashosds:
+ min_in: 4
+ ceph:
+ conf:
+ global:
+ osd_pool_default_size: 3
+ osd_pool_default_min_size: 2
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_min_pg_log_entries: 300
+ osd_max_pg_log_entries: 600
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ conf:
+ mds:
+ debug mds: 20
+ debug ms: 1
+ client:
+ debug client: 10
\ No newline at end of file
--- /dev/null
+---
+ceph:
+ deb:
+ - ceph
+ - ceph-mds
+ - ceph-mgr
+ - ceph-common
+ - ceph-fuse
+ - ceph-test
+ - radosgw
+ - python-ceph
+ - libcephfs2
+ - libcephfs-dev
+ - libcephfs-java
+ - libcephfs-jni
+ - librados2
+ - librbd1
+ - rbd-fuse
+ - ceph-common-dbg
+ - ceph-fuse-dbg
+ - ceph-mds-dbg
+ - ceph-mgr-dbg
+ - ceph-mon-dbg
+ - ceph-osd-dbg
+ - ceph-test-dbg
+ - libcephfs2-dbg
+ - librados2-dbg
+ - libradosstriper1-dbg
+ - librbd1-dbg
+ - librgw2-dbg
+ - radosgw-dbg
+ - rbd-fuse-dbg
+ - rbd-mirror-dbg
+ - rbd-nbd-dbg
+ rpm:
+ - ceph-radosgw
+ - ceph-test
+ - ceph
+ - ceph-mgr
+ - ceph-fuse
+ - cephfs-java
+ - libcephfs_jni1
+ - libcephfs2
+ - libcephfs-devel
+ - librados2
+ - librbd1
+ - python-ceph
+ - rbd-fuse
+ - ceph-debuginfo
--- /dev/null
+tasks:
+- exec:
+ osd.0:
+ - ceph osd set sortbitwise
+ - for p in `ceph osd pool ls` ; do ceph osd pool set $p use_gmt_hitset true ; done
--- /dev/null
+tasks:
+- exec:
+ osd.0:
+ - ceph osd set sortbitwise
+ - ceph osd set require_jewel_osds
+ - for p in `ceph osd pool ls` ; do ceph osd pool set $p use_gmt_hitset true ; done
--- /dev/null
+tasks:
+- exec:
+ osd.0:
+ - ceph osd set require_kraken_osds
--- /dev/null
+overrides:
+ rgw:
+ ec-data-pool: true
+ cache-pools: true
+ s3tests:
+ slow_backend: true
--- /dev/null
+overrides:
+ rgw:
+ ec-data-pool: true
+ erasure_code_profile:
+ name: testprofile
+ k: 3
+ m: 1
+ ruleset-failure-domain: osd
+ s3tests:
+ slow_backend: true
--- /dev/null
+overrides:
+ rgw:
+ ec-data-pool: true
+ s3tests:
+ slow_backend: true
--- /dev/null
+overrides:
+ rgw:
+ ec-data-pool: false
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+roles:
+- [osd.0, osd.1, osd.2, client.0, mon.a]
+- [osd.3, osd.4, osd.5, client.1, mon.b]
+- [osd.6, osd.7, osd.8, client.2, mon.c]
+- [osd.9, osd.10, osd.11, client.3, mon.d]
+- [osd.12, osd.13, osd.14, client.4, mon.e]
+- [osd.15, osd.16, osd.17, client.5]
+- [osd.18, osd.19, osd.20, client.6]
+- [osd.21, osd.22, osd.23, client.7]
+- [osd.24, osd.25, osd.26, client.8]
+- [osd.27, osd.28, osd.29, client.9]
+- [osd.30, osd.31, osd.32, client.10]
+- [osd.33, osd.34, osd.35, client.11]
+- [osd.36, osd.37, osd.38, client.12]
+- [osd.39, osd.40, osd.41, client.13]
+- [osd.42, osd.43, osd.44, client.14]
+- [osd.45, osd.46, osd.47, client.15]
+- [osd.48, osd.49, osd.50, client.16]
+- [osd.51, osd.52, osd.53, client.17]
+- [osd.54, osd.55, osd.56, client.18]
+- [osd.57, osd.58, osd.59, client.19]
+- [osd.60, osd.61, osd.62, client.20]
+- [osd.63, osd.64, osd.65, client.21]
+- [osd.66, osd.67, osd.68, client.22]
+- [osd.69, osd.70, osd.71, client.23]
+- [osd.72, osd.73, osd.74, client.24]
+- [osd.75, osd.76, osd.77, client.25]
+- [osd.78, osd.79, osd.80, client.26]
+- [osd.81, osd.82, osd.83, client.27]
+- [osd.84, osd.85, osd.86, client.28]
+- [osd.87, osd.88, osd.89, client.29]
+- [osd.90, osd.91, osd.92, client.30]
+- [osd.93, osd.94, osd.95, client.31]
+- [osd.96, osd.97, osd.98, client.32]
+- [osd.99, osd.100, osd.101, client.33]
+- [osd.102, osd.103, osd.104, client.34]
+- [osd.105, osd.106, osd.107, client.35]
+- [osd.108, osd.109, osd.110, client.36]
+- [osd.111, osd.112, osd.113, client.37]
+- [osd.114, osd.115, osd.116, client.38]
+- [osd.117, osd.118, osd.119, client.39]
+- [osd.120, osd.121, osd.122, client.40]
+- [osd.123, osd.124, osd.125, client.41]
+- [osd.126, osd.127, osd.128, client.42]
+- [osd.129, osd.130, osd.131, client.43]
+- [osd.132, osd.133, osd.134, client.44]
+- [osd.135, osd.136, osd.137, client.45]
+- [osd.138, osd.139, osd.140, client.46]
+- [osd.141, osd.142, osd.143, client.47]
+- [osd.144, osd.145, osd.146, client.48]
+- [osd.147, osd.148, osd.149, client.49]
+- [osd.150, osd.151, osd.152, client.50]
+#- [osd.153, osd.154, osd.155, client.51]
+#- [osd.156, osd.157, osd.158, client.52]
+#- [osd.159, osd.160, osd.161, client.53]
+#- [osd.162, osd.163, osd.164, client.54]
+#- [osd.165, osd.166, osd.167, client.55]
+#- [osd.168, osd.169, osd.170, client.56]
+#- [osd.171, osd.172, osd.173, client.57]
+#- [osd.174, osd.175, osd.176, client.58]
+#- [osd.177, osd.178, osd.179, client.59]
+#- [osd.180, osd.181, osd.182, client.60]
+#- [osd.183, osd.184, osd.185, client.61]
+#- [osd.186, osd.187, osd.188, client.62]
+#- [osd.189, osd.190, osd.191, client.63]
+#- [osd.192, osd.193, osd.194, client.64]
+#- [osd.195, osd.196, osd.197, client.65]
+#- [osd.198, osd.199, osd.200, client.66]
--- /dev/null
+roles:
+- [osd.0, osd.1, osd.2, client.0, mon.a]
+- [osd.3, osd.4, osd.5, client.1, mon.b]
+- [osd.6, osd.7, osd.8, client.2, mon.c]
+- [osd.9, osd.10, osd.11, client.3, mon.d]
+- [osd.12, osd.13, osd.14, client.4, mon.e]
+- [osd.15, osd.16, osd.17, client.5]
+- [osd.18, osd.19, osd.20, client.6]
+- [osd.21, osd.22, osd.23, client.7]
+- [osd.24, osd.25, osd.26, client.8]
+- [osd.27, osd.28, osd.29, client.9]
+- [osd.30, osd.31, osd.32, client.10]
+- [osd.33, osd.34, osd.35, client.11]
+- [osd.36, osd.37, osd.38, client.12]
+- [osd.39, osd.40, osd.41, client.13]
+- [osd.42, osd.43, osd.44, client.14]
+- [osd.45, osd.46, osd.47, client.15]
+- [osd.48, osd.49, osd.50, client.16]
+- [osd.51, osd.52, osd.53, client.17]
+- [osd.54, osd.55, osd.56, client.18]
+- [osd.57, osd.58, osd.59, client.19]
+- [osd.60, osd.61, osd.62, client.20]
--- /dev/null
+roles:
+- [osd.0, osd.1, osd.2, client.0, mon.a]
+- [osd.3, osd.4, osd.5, client.1, mon.b]
+- [osd.6, osd.7, osd.8, client.2, mon.c]
+- [osd.9, osd.10, osd.11, client.3, mon.d]
+- [osd.12, osd.13, osd.14, client.4, mon.e]
--- /dev/null
+overrides:
+ ceph:
+ fs: btrfs
+ conf:
+ osd:
+ osd sloppy crc: true
+ osd op thread timeout: 60
--- /dev/null
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ osd sloppy crc: true
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
--- /dev/null
+tasks:
+- rados:
+ ops: 4000
+ max_seconds: 3600
+ objects: 50
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
--- /dev/null
+../../../distros/all
\ No newline at end of file
--- /dev/null
+# --suite buildpackages/any --ceph v10.0.1 --filter centos_7,ubuntu_14.04
+roles:
+ - [client.0]
+tasks:
+ - install:
+ - exec:
+ client.0:
+ - ceph --version | grep 'version '
--- /dev/null
+../../../distros/all
\ No newline at end of file
--- /dev/null
+# --suite buildpackages/tests --ceph v10.0.1 --filter centos_7.2,ubuntu_14.04
+overrides:
+ ansible.cephlab:
+ playbook: users.yml
+ buildpackages:
+ good_machine:
+ disk: 20 # GB
+ ram: 2000 # MB
+ cpus: 2
+ min_machine:
+ disk: 10 # GB
+ ram: 1000 # MB
+ cpus: 1
+roles:
+ - [client.0]
+tasks:
+ - install:
+ - exec:
+ client.0:
+ - ceph --version | grep 'version '
--- /dev/null
+meta:
+- desc: "3-node cluster"
+roles:
+- [mon.a, mds.a, osd.0, osd.1, client.0]
+- [mon.b, mds.b, osd.2, osd.3]
+- [mon.c, mds.c, osd.4, osd.5]
--- /dev/null
+meta:
+- desc: "1-node cluster"
+roles:
+ - [mon.a, osd.0, client.0]
--- /dev/null
+../../../distros/supported
\ No newline at end of file
--- /dev/null
+meta:
+- desc: "Build the cluster using ceph-ansible; then check health and make the keyring readable"
+tasks:
+- ceph_ansible:
+- exec:
+ mon.a:
+ - "sudo ceph health"
+- exec:
+ all:
+ - "KEYRING=/etc/ceph/ceph.client.admin.keyring; test -f $KEYRING && sudo chmod o+r $KEYRING"
+- install.ship_utilities:
+overrides:
+ ceph_ansible:
+ vars:
+ ceph_test: true
+openstack:
+ - volumes:
+ count: 3
+ size: 20 # GB
--- /dev/null
+meta:
+- desc: "Set os_tuning_params to values that are safe for VMs"
+overrides:
+ ceph_ansible:
+ vars:
+ os_tuning_params: '[{"name": "kernel.pid_max", "value": 4194303},{"name": "fs.file-max", "value": 26234859}]'
--- /dev/null
+meta:
+- desc: "Use a stable upstream Ceph release"
+overrides:
+ ceph_ansible:
+ vars:
+ ceph_origin: upstream
+ ceph_stable: true
--- /dev/null
+meta:
+- desc: "Have teuthology tell ceph-ansible which OSD devices to use"
+overrides:
+ ceph_ansible:
+ vars:
+ osd_auto_discovery: false
--- /dev/null
+meta:
+- desc: "Tell ceph-ansible to discover OSD devices automatically"
+overrides:
+ ceph_ansible:
+ vars:
+ osd_auto_discovery: true
--- /dev/null
+meta:
+- desc: "Use a collocated journal"
+overrides:
+ ceph_ansible:
+ vars:
+ journal_collocation: true
+ journal_size: 1024
--- /dev/null
+meta:
+- desc: "Run ceph-admin-commands.sh"
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - ceph-tests/ceph-admin-commands.sh
--- /dev/null
+meta:
+- desc: "Run the rados cls tests"
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - cls
--- /dev/null
+meta:
+- desc: "Run the rbd cli tests"
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/run_cli_tests.sh
+
--- /dev/null
+meta:
+- desc: "Run the rbd import/export tests"
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/import_export.sh
--- /dev/null
+../../../ceph-deploy-overrides
\ No newline at end of file
--- /dev/null
+../../../config_options
\ No newline at end of file
--- /dev/null
+../../../distros/supported
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph-deploy:
+ python_version: "2"
--- /dev/null
+overrides:
+ ceph-deploy:
+ python_version: "3"
--- /dev/null
+overrides:
+ ceph-deploy:
+ conf:
+ global:
+ debug ms: 1
+ osd:
+ debug osd: 10
+ mon:
+ debug mon: 10
+roles:
+- - mon.a
+ - mds.0
+ - osd.0
+- - osd.1
+ - mon.b
+ - client.0
+openstack:
+ - machine:
+ disk: 10 # GB
+ ram: 2000 # MB
+ cpus: 1
+ volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- ssh_keys:
+- print: "**** done ssh_keys"
+- ceph-deploy:
+- print: "**** done ceph-deploy"
+- workunit:
+ clients:
+ client.0:
+ - ceph-tests/ceph-admin-commands.sh
+- print: "**** done ceph-tests/ceph-admin-commands.sh"
--- /dev/null
+../../../distros/supported
\ No newline at end of file
--- /dev/null
+roles:
+- - mon.a
+ - client.0
+- - osd.0
+ - osd.1
+openstack:
+ - machine:
+ disk: 20 # GB
+ ram: 2000 # MB
+ cpus: 1
+ volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ fs: xfs # this implicitly means /dev/vd? are used instead of directories
+ wait-for-scrub: false
+ conf:
+ global:
+ mon pg warn min per osd: 2
+ osd pool default size: 2
+ osd crush chooseleaf type: 0 # failure domain == osd
+ osd pg bits: 2
+ osd pgp bits: 2
+#
+# Keep this around for debugging purposes. If uncommented, the target
+# will pause and the workunit can be run and debugged manually.
+#
+# - exec:
+# client.0:
+# - sleep 1000000000 # forever
+#
+- workunit:
+ clients:
+ all:
+ - ceph-disk/ceph-disk.sh
--- /dev/null
+overrides:
+ ansible.cephlab:
+ playbook: users.yml
+roles:
+ - [mon.a, mds.a, osd.0, osd.1, client.0]
+
+tasks:
+ - nop:
+
--- /dev/null
+roles:
+- [mon.a, mds.a, mds.a-s]
+- [mon.b, mds.b, mds.b-s]
+- [mon.c, mds.c, mds.c-s]
+- [osd.0]
+- [osd.1]
+- [osd.2]
+- [client.0]
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ mds:
+ mds thrash exports: 1
+ mds debug subtrees: 1
+ mds debug scatterstat: 1
+ mds verify scatter: 1
+- ceph-fuse:
+- workunit:
+ clients:
+ client.0:
+ - suites/fsstress.sh
+
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ client use faked inos: true
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse set user groups: true
+ fuse default permissions: false
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph mds set inline_data true --yes-i-really-mean-it
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_dump_tree
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - Scrub error on inode
+ conf:
+ mds:
+ mds log max segments: 1
+ mds cache max size: 1000
+tasks:
+- cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_scrub_checks
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse_default_permissions: 0
+tasks:
+- workunit:
+ clients:
+ all:
+ - kernel_untar_build.sh
--- /dev/null
+tasks:
+- workunit:
+ timeout: 6h
+ clients:
+ all:
+ - fs/misc
+
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - fs/test_o_trunc.sh
--- /dev/null
+tasks:
+- workunit:
+ timeout: 6h
+ clients:
+ all:
+ - fs/norstats
+
+overrides:
+ ceph:
+ conf:
+ client:
+ client dirsize rbytes: false
--- /dev/null
+tasks:
+- workunit:
+ timeout: 6h
+ clients:
+ all:
+ - fs/quota
+
+overrides:
+ ceph:
+ conf:
+ client:
+ client quota: true
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_blogbench.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_dbench.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_ffsb.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/fsx.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/fsync-tester.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/iogen.sh
+
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/iozone.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ debug ms: 1
+ debug client: 20
+ fuse set user groups: true
+ fuse default permissions: false
+ mds:
+ debug ms: 1
+ debug mds: 20
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ ms_inject_delay_probability: 1
+ ms_inject_delay_type: osd
+ ms_inject_delay_max: 5
+ client_oc_max_dirty_age: 1
+tasks:
+- exec:
+ client.0:
+ - cd $TESTDIR/mnt.* && dd if=/dev/zero of=./foo count=100
+ - sleep 2
+ - cd $TESTDIR/mnt.* && truncate --size 0 ./foo
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_trivial_sync.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/libcephfs_interface_tests.yaml
\ No newline at end of file
--- /dev/null
+
+os_type: ubuntu
+os_version: "14.04"
+
+overrides:
+ ceph-fuse:
+ disabled: true
+ kclient:
+ disabled: true
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - libcephfs-java/test.sh
--- /dev/null
+overrides:
+ ceph-fuse:
+ disabled: true
+ kclient:
+ disabled: true
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - fs/test_python.sh
--- /dev/null
+tasks:
+- mds_creation_failure:
+- workunit:
+ clients:
+ all: [fs/misc/trivial_sync.sh]
+
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+roles:
+- [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1, osd.2]
+- [client.2]
+- [client.1]
+- [client.0]
+
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
+
--- /dev/null
+roles:
+- [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1, osd.2]
+- [client.1]
+- [client.0]
+
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
+
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+tasks:
+- kclient:
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_misc
--- /dev/null
+# make sure we get the same MPI version on all hosts
+os_type: ubuntu
+os_version: "14.04"
+
+tasks:
+- pexec:
+ clients:
+ - cd $TESTDIR
+ - wget http://ceph.com/qa/fsx-mpi.c
+ - mpicc fsx-mpi.c -o fsx-mpi
+ - rm fsx-mpi.c
+ - ln -s $TESTDIR/mnt.* $TESTDIR/gmnt
+- ssh_keys:
+- mpi:
+ exec: sudo $TESTDIR/fsx-mpi -o 1MB -N 50000 -p 10000 -l 1048576 $TESTDIR/gmnt/test
+ workdir: $TESTDIR/gmnt
+- pexec:
+ all:
+ - rm $TESTDIR/gmnt
+ - rm $TESTDIR/fsx-mpi
--- /dev/null
+# make sure we get the same MPI version on all hosts
+os_type: ubuntu
+os_version: "14.04"
+
+tasks:
+- pexec:
+ clients:
+ - cd $TESTDIR
+ - wget http://ceph.com/qa/ior.tbz2
+ - tar xvfj ior.tbz2
+ - cd ior
+ - ./configure
+ - make
+ - make install DESTDIR=$TESTDIR/binary/
+ - cd $TESTDIR/
+ - rm ior.tbz2
+ - rm -r ior
+ - ln -s $TESTDIR/mnt.* $TESTDIR/gmnt
+- ssh_keys:
+- mpi:
+ exec: $TESTDIR/binary/usr/local/bin/ior -e -w -r -W -b 10m -a POSIX -o $TESTDIR/gmnt/ior.testfile
+- pexec:
+ all:
+ - rm -f $TESTDIR/gmnt/ior.testfile
+ - rm -f $TESTDIR/gmnt
+ - rm -rf $TESTDIR/binary
--- /dev/null
+# make sure we get the same MPI version on all hosts
+os_type: ubuntu
+os_version: "14.04"
+
+tasks:
+- pexec:
+ clients:
+ - cd $TESTDIR
+ - wget http://ceph.com/qa/mdtest-1.9.3.tgz
+ - mkdir mdtest-1.9.3
+ - cd mdtest-1.9.3
+ - tar xvfz $TESTDIR/mdtest-1.9.3.tgz
+ - rm $TESTDIR/mdtest-1.9.3.tgz
+ - MPI_CC=mpicc make
+ - ln -s $TESTDIR/mnt.* $TESTDIR/gmnt
+- ssh_keys:
+- mpi:
+ exec: $TESTDIR/mdtest-1.9.3/mdtest -d $TESTDIR/gmnt -I 20 -z 5 -b 2 -R
+- pexec:
+ all:
+ - rm -f $TESTDIR/gmnt
+ - rm -rf $TESTDIR/mdtest-1.9.3
+ - rm -rf $TESTDIR/._mdtest-1.9.3
\ No newline at end of file
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+roles:
+- [mon.a, osd.0, mon.b, mds.a, mds.b, client.1]
+- [mds.c, mds.d, mon.c, client.0, osd.1, osd.2]
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ debug mon: 20
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph-fuse:
+ disabled: true
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_failover
+
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse default permissions: false
+ client acl type: posix_acl
+tasks:
+- workunit:
+ clients:
+ all:
+ - fs/misc/acl.sh
+ - fs/misc/chmod.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse set user groups: true
+ fuse default permissions: false
+ client acl type: posix_acl
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+roles:
+- [mon.a, osd.0, mds.a, mds.b, client.1, client.2, client.3]
+- [client.0, osd.1, osd.2]
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph-fuse:
+ disabled: true
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - force file system read-only
+ - bad backtrace
+
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_auto_repair
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_backtrace
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_cap_flush
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - responding to mclientcaps\(revoke\)
+ - not advance its oldest_client_tid
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_client_limits
--- /dev/null
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_readahead
--- /dev/null
+
+# The task interferes with the network, so we need
+# to permit OSDs to complain about that.
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - slow request
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_client_recovery
--- /dev/null
+
+overrides:
+ ceph:
+ conf:
+ global:
+ lockdep: true
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_config_commands
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - bad backtrace
+ - object missing on disk
+ - error reading table object
+ - error reading sessionmap
+ - Error loading MDS rank
+ - missing journal object
+ - Error recovering journal
+ - error decoding table object
+ - failed to read JournalPointer
+ - Corrupt directory entry
+ - Corrupt fnode header
+ - corrupt sessionmap header
+ - Corrupt dentry
+ - Scrub error on inode
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_damage
+
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - bad backtrace
+ - object missing on disk
+ - error reading table object
+ - error reading sessionmap
+ - unmatched fragstat
+ - was unreadable, recreating it now
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_data_scan
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - inode wrongly marked free
+ - bad backtrace on inode
+ - inode table repaired for inode
+ - Scrub error on inode
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_forward_scrub
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_fragment
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - bad backtrace on dir ino
+ - error reading table object
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_journal_repair
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_flush
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - OSD full dropping all updates
+ - OSD near full
+ - is full \(reached quota
+ conf:
+ osd:
+ osd mon report interval max: 5
+ osd objectstore: memstore
+ memstore device bytes: 100000000
+ client.0:
+ debug client: 20
+ debug objecter: 20
+ debug objectcacher: 20
+ client.1:
+ debug client: 20
+ debug objecter: 20
+ debug objectcacher: 20
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_full
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_pool_perm
--- /dev/null
+
+overrides:
+ ceph:
+ conf:
+ global:
+ ms type: simple
+ log-whitelist:
+ - client session with invalid root
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_sessionmap
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_strays
--- /dev/null
+
+overrides:
+ ceph:
+ conf:
+ global:
+ ms type: simple
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_volume_client
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - fs/snaps
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+
+overrides:
+ ceph:
+ conf:
+ mds:
+ mds standby replay: true
+
+roles:
+- [mon.a, mds.a, mds.b-s-0, osd.0, osd.1, client.0]
+- [mon.b, mds.c-s-0, mds.d-s-0, mon.c, osd.2, osd.3]
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+
+tasks:
+- cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_journal_migration
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- mds_thrash:
--- /dev/null
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2, mds.b-s-a]
+- [mon.b, mds.a, osd.3, osd.4, osd.5, client.0]
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 2500
+ ms inject delay type: osd mds
+ ms inject delay probability: .005
+ ms inject delay max: 1
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - fs/snaps
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse set user groups: true
+ fuse default permissions: false
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_trivial_sync.yaml
\ No newline at end of file
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_blogbench.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_dbench.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_ffsb.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mds:
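+ # drop the reply trace from roughly half of the MDS replies, forcing clients to fall back to lookups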
+ mds inject traceless reply probability: .5
--- /dev/null
+../../../cephfs/begin.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ debug ms: 1
+ debug mon: 20
--- /dev/null
+../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_dbench.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/tasks/libcephfs_interface_tests.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ lockdep: true
--- /dev/null
+os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
+overrides:
+ install:
+ ceph:
+ flavor: notcmalloc
+ debuginfo: true
+ ceph:
+ conf:
+ global:
+ osd heartbeat grace: 40
+ valgrind:
+ mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
+ osd: [--tool=memcheck]
+ mds: [--tool=memcheck]
+ ceph-fuse:
+ client.0:
+ valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
--- /dev/null
+
+os_type: ubuntu
+os_version: "14.04"
+
+overrides:
+ ceph:
+ conf:
+ client:
+ client permissions: false
+roles:
+- [mon.0, mds.0, osd.0, hadoop.master.0]
+- [mon.1, osd.1, hadoop.slave.0]
+- [mon.2, hadoop.slave.1, client.0]
+
--- /dev/null
+tasks:
+- ssh_keys:
+- install:
+- ceph:
+- hadoop:
+- workunit:
+ clients:
+ client.0: [hadoop/repl.sh]
--- /dev/null
+tasks:
+- ssh_keys:
+- install:
+- ceph:
+- hadoop:
+- workunit:
+ clients:
+ client.0: [hadoop/terasort.sh]
+ env:
+ NUM_RECORDS: "10000000"
--- /dev/null
+tasks:
+- ssh_keys:
+- install:
+- ceph:
+- hadoop:
+- workunit:
+ clients:
+ client.0: [hadoop/wordcount.sh]
--- /dev/null
+../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ mds:
+ debug mds: 20
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+tasks:
+- install:
+- ceph:
+- exec:
+ client.0:
+ - sudo ceph mds set inline_data true --yes-i-really-mean-it
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - direct_io
+
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - kernel_untar_build.sh
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - fs/misc
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - fs/test_o_trunc.sh
+
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - fs/snaps
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/dbench.sh
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/ffsb.sh
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/fsx.sh
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/fsync-tester.sh
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/iozone.sh
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all: [fs/misc/trivial_sync.sh]
--- /dev/null
+roles:
+- [mon.a, mds.a, osd.0, osd.1]
+- [mon.b, mon.c, osd.2, osd.3]
+- [client.0]
+- [client.1]
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ mds:
+ debug mds: 20
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+- parallel:
+ - user-workload
+ - kclient-workload
+user-workload:
+ sequential:
+ - ceph-fuse: [client.0]
+ - workunit:
+ clients:
+ client.0:
+ - suites/iozone.sh
+kclient-workload:
+ sequential:
+ - kclient: [client.1]
+ - workunit:
+ clients:
+ client.1:
+ - suites/dbench.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- parallel:
+ - user-workload
+ - kclient-workload
+user-workload:
+ sequential:
+ - ceph-fuse: [client.0]
+ - workunit:
+ clients:
+ client.0:
+ - suites/blogbench.sh
+kclient-workload:
+ sequential:
+ - kclient: [client.1]
+ - workunit:
+ clients:
+ client.1:
+ - kernel_untar_build.sh
--- /dev/null
+roles:
+- [mon.a, osd.0, mds.a, mds.c, client.2]
+- [osd.1, osd.2, mds.b, mds.d, client.3]
+- [client.0]
+- [client.1]
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mds:
+ debug ms: 1
+ debug mds: 20
+ client.0:
+ debug ms: 1
+ debug client: 20
+ client.1:
+ debug ms: 1
+ debug client: 20
--- /dev/null
+
+overrides:
+ ceph:
+ conf:
+ mds:
+ mds bal frag: true
+ mds bal fragment size max: 10000
+ mds bal split size: 100
+ mds bal merge size: 5
+ mds bal split bits: 3
+
--- /dev/null
+tasks:
+- install:
+- ceph:
+- kclient:
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - force file system read-only
+ - bad backtrace
+
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_auto_repair
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_backtrace
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - responding to mclientcaps\(revoke\)
+ - not advance its oldest_client_tid
+
+tasks:
+ - cephfs_test_runner:
+ fail_on_skip: false
+ modules:
+ - tasks.cephfs.test_client_limits
--- /dev/null
+
+# The task interferes with the network, so we need
+# to permit OSDs to complain about that.
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - slow request
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_client_recovery
--- /dev/null
+
+overrides:
+ ceph:
+ conf:
+ global:
+ lockdep: true
+
+tasks:
+ - cephfs_test_runner:
+ fail_on_skip: false
+ modules:
+ - tasks.cephfs.test_config_commands
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - bad backtrace
+ - object missing on disk
+ - error reading table object
+ - error reading sessionmap
+ - Error loading MDS rank
+ - missing journal object
+ - Error recovering journal
+ - error decoding table object
+ - failed to read JournalPointer
+ - Corrupt directory entry
+ - Corrupt fnode header
+ - corrupt sessionmap header
+ - Corrupt dentry
+ - Scrub error on inode
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_damage
+
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - bad backtrace
+ - object missing on disk
+ - error reading table object
+ - error reading sessionmap
+ - unmatched fragstat
+ - was unreadable, recreating it now
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_data_scan
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ fail_on_skip: false
+ modules:
+ - tasks.cephfs.test_failover
+
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - inode wrongly marked free
+ - bad backtrace on inode
+ - inode table repaired for inode
+ - Scrub error on inode
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_forward_scrub
--- /dev/null
+
+overrides:
+ ceph:
+ log-whitelist:
+ - bad backtrace on dir ino
+ - error reading table object
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_journal_repair
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_flush
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_pool_perm
--- /dev/null
+
+overrides:
+ ceph:
+ conf:
+ global:
+ ms type: simple
+ log-whitelist:
+ - client session with invalid root
+
+tasks:
+ - cephfs_test_runner:
+ fail_on_skip: false
+ modules:
+ - tasks.cephfs.test_sessionmap
--- /dev/null
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_strays
--- /dev/null
+
+overrides:
+ ceph:
+ conf:
+ global:
+ ms type: simple
+
+tasks:
+ - cephfs_test_runner:
+ fail_on_skip: false
+ modules:
+ - tasks.cephfs.test_volume_client
--- /dev/null
+../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ mds:
+ debug mds: 20
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
--- /dev/null
+tasks:
+- install:
+- ceph:
+- mds_thrash:
--- /dev/null
+tasks:
+- install:
+- ceph:
+- mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ filestore flush min: 0
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/ffsb.sh
--- /dev/null
+tasks:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/iozone.sh
--- /dev/null
+
+os_type: ubuntu
+
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+
+tasks:
+- install:
+- ceph:
+- kclient: [client.0]
+- knfsd:
+ client.0:
+ options: [rw,no_root_squash,async]
--- /dev/null
+../../../../clusters/extra-client.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- nfs:
+ client.1:
+ server: client.0
+ options: [rw,hard,intr,nfsvers=3]
--- /dev/null
+tasks:
+- nfs:
+ client.1:
+ server: client.0
+ options: [rw,hard,intr,nfsvers=4]
--- /dev/null
+tasks:
+- workunit:
+ timeout: 6h
+ clients:
+ client.1:
+ - kernel_untar_build.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.1:
+ - fs/misc/chmod.sh
+ - fs/misc/i_complete_vs_rename.sh
+ - fs/misc/trivial_sync.sh
+ #- fs/misc/multiple_rsync.sh
+ #- fs/misc/xattrs.sh
+# Once we can run multiple_rsync.sh and xattrs.sh we can change to this
+# - misc
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.1:
+ - suites/blogbench.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.1:
+ - suites/dbench-short.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ filestore flush min: 0
+tasks:
+- workunit:
+ clients:
+ client.1:
+ - suites/ffsb.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.1:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.1:
+ - suites/iozone.sh
--- /dev/null
+../../../../clusters/fixed-3.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ client:
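+ # 5 = layering (1) + exclusive-lock (4)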
+ rbd default features: 5
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - rbd/concurrent.sh
+# Options for rbd/concurrent.sh (default values shown)
+# env:
+# RBD_CONCURRENT_ITER: 100
+# RBD_CONCURRENT_COUNT: 5
+# RBD_CONCURRENT_DELAY: 5
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - rbd/huge-tickets.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - rbd/image_read.sh
+# Options for rbd/image_read.sh (default values shown)
+# env:
+# IMAGE_READ_LOCAL_FILES: 'false'
+# IMAGE_READ_FORMAT: '2'
+# IMAGE_READ_VERBOSE: 'true'
+# IMAGE_READ_PAGE_SIZE: '4096'
+# IMAGE_READ_OBJECT_ORDER: '22'
+# IMAGE_READ_TEST_CLONES: 'true'
+# IMAGE_READ_DOUBLE_ORDER: 'true'
+# IMAGE_READ_HALF_ORDER: 'false'
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - rbd/kernel.sh
--- /dev/null
+tasks:
+- rbd_fsx:
+ clients: [client.0]
+ ops: 10000
+ krbd: true
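+ # the 512-byte boundaries below keep every read/write/truncate/hole-punch aligned to sector size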
+ readbdy: 512
+ writebdy: 512
+ truncbdy: 512
+ holebdy: 512
+ punch_holes: true
+ randomized_striping: false
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - rbd/map-snapshot-io.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - rbd/map-unmap.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - rbd/simple_big.sh
+
--- /dev/null
+../../../../clusters/fixed-3.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ client:
+ rbd default features: 5
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
--- /dev/null
+tasks:
+- install:
+- ceph: null
+- rbd_fio:
+ client.0:
+ fio-io-size: 90%
+ formats: [2]
+ features: [[layering,exclusive-lock]]
+ io-engine: sync
+ rw: randrw
+ runtime: 900
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rbd:
+ all:
+- workunit:
+ clients:
+ all:
+ - kernel_untar_build.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rbd:
+ all:
+- workunit:
+ clients:
+ all:
+ - suites/dbench.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rbd:
+ all:
+ image_size: 20480
+- workunit:
+ clients:
+ all:
+ - suites/ffsb.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rbd:
+ all:
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rbd:
+ all:
+ fs_type: btrfs
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rbd:
+ all:
+ fs_type: ext4
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rbd:
+ all:
+- workunit:
+ clients:
+ all:
+ - suites/fsx.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rbd:
+ all:
+ image_size: 20480
+- workunit:
+ clients:
+ all:
+ - suites/iozone.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rbd:
+ all:
+- workunit:
+ clients:
+ all: [fs/misc/trivial_sync.sh]
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ client:
+ rbd default features: 5
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
--- /dev/null
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2]
+- [mon.b, mds.a, osd.3, osd.4, osd.5]
+- [client.0]
+- [client.1]
+tasks:
+- install:
+- ceph:
+- rbd.xfstests:
+ client.0:
+ test_image: 'test_image-0'
+ scratch_image: 'scratch_image-0'
+ tests: '-g auto'
+ randomize: true
+ client.1:
+ test_image: 'test_image-1'
+ scratch_image: 'scratch_image-1'
+ tests: '-g auto'
+ randomize: true
--- /dev/null
+../../../../clusters/fixed-3.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ client:
+ rbd default features: 5
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
--- /dev/null
+tasks:
+- install:
+- ceph:
+- mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
--- /dev/null
+tasks:
+- rbd_fio:
+ client.0:
+ fio-io-size: 90%
+ formats: [2]
+ features: [[layering,exclusive-lock]]
+ io-engine: sync
+ rw: randrw
+ runtime: 1200
--- /dev/null
+tasks:
+- rbd:
+ all:
+ image_size: 20480
+- workunit:
+ clients:
+ all:
+ - suites/ffsb.sh
--- /dev/null
+tasks:
+- rbd:
+ all:
+ image_size: 20480
+- workunit:
+ clients:
+ all:
+ - suites/iozone.sh
--- /dev/null
+overrides:
+ ceph:
+ crush_tunables: bobtail
+tasks:
+- install:
+- ceph:
--- /dev/null
+# fixed-1.yaml, but with client.0 on a separate target
+overrides:
+ ceph-deploy:
+ conf:
+ global:
+ osd pool default size: 2
+ osd crush chooseleaf type: 0
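+ # chooseleaf type 0 (osd) so both replicas may land on the single OSD host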
+ osd pool default pg num: 128
+ osd pool default pgp num: 128
+roles:
+- [mon.a, osd.0, osd.1, osd.2]
+- [client.0]
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default features: 1 # pre-single-major is v3.13, so layering only
--- /dev/null
+overrides:
+ kernel:
+ client.0:
+ branch: nightly_pre-single-major # v3.12.z
+tasks:
+- exec:
+ client.0:
+ - "modprobe -r rbd"
+ - "modprobe --first-time rbd"
+ - "test ! -f /sys/module/rbd/parameters/single_major"
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - "modprobe -r rbd"
+ - "modprobe --first-time rbd single_major=0"
+ - "grep -q N /sys/module/rbd/parameters/single_major"
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - "modprobe -r rbd"
+ - "modprobe --first-time rbd single_major=1"
+ - "grep -q Y /sys/module/rbd/parameters/single_major"
--- /dev/null
+tasks:
+- cram:
+ clients:
+ client.0:
+ - http://git.ceph.com/?p=ceph.git;a=blob_plain;hb={branch};f=src/test/cli-integration/rbd/unmap.t
--- /dev/null
+../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2]
+- [mon.b, mds.a, osd.3, osd.4, osd.5]
+- [client.0]
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/blogbench.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/fsx.sh
--- /dev/null
+roles:
+- [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1, osd.2]
+- [client.1]
+- [client.0]
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+- kclient:
+- locktest: [client.0, client.1]
--- /dev/null
+roles:
+- [mon.a, mon.b, mon.c, osd.0, osd.1, osd.2]
+- [mds.a]
+- [client.0]
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ mds:
+ mds log segment size: 16384
+ mds log max segments: 1
+- restart:
+ exec:
+ client.0:
+ - test-backtraces.py
--- /dev/null
+roles:
+- [mon.a, mon.c, mds.a, osd.0, osd.1, osd.2]
+- [mon.b, mds.b, mds.c, osd.3, osd.4, osd.5]
+- [client.0]
+- [client.1]
--- /dev/null
+roles:
+- [mon.a, mon.c, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2]
+- [mon.b, mds.e, mds.f, mds.g, mds.h, mds.i, osd.3, osd.4, osd.5]
+- [client.0]
+- [client.1]
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ fuse_default_permissions: 0
+- ceph-fuse:
--- /dev/null
+tasks:
+- install:
+- ceph:
+- kclient:
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - fs/misc
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/blogbench.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/dbench.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/fsync-tester.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse default permissions: false
+ fuse set user groups: true
+tasks:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ ms_inject_delay_probability: 1
+ ms_inject_delay_type: osd
+ ms_inject_delay_max: 5
+ client_oc_max_dirty_age: 1
+- ceph-fuse:
+- exec:
+ client.0:
+ - dd if=/dev/zero of=./foo count=100
+ - sleep 2
+ - truncate --size 0 ./foo
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mds:
+ mds thrash exports: 1
--- /dev/null
+roles:
+- [mgr.x, mon.a, mds.a, mds.c, osd.0, client.0]
+- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1]
+log-rotate:
+ ceph-mds: 10G
+ ceph-osd: 10G
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ debug mon: 20
+ mgr:
+ debug mgr: 20
+ debug ms: 1
+ client:
+ debug client: 20
+ debug mgrc: 20
+ debug ms: 1
+ osd:
+ debug mgrc: 20
+ mds:
+ debug mgrc: 20
--- /dev/null
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ osd sloppy crc: true
--- /dev/null
+
+tasks:
+ - install:
+ - ceph:
+ - cephfs_test_runner:
+ modules:
+ - tasks.mgr.test_failover
--- /dev/null
+roles:
+- [mon.a, mds.a, osd.0, osd.1]
+- [mon.b, mon.c, osd.2, osd.3, client.0]
+- [client.1]
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+tasks:
+- install:
+ branch: dumpling
+- ceph:
+- parallel:
+ - user-workload
+ - kclient-workload
+user-workload:
+ sequential:
+ - ceph-fuse: [client.0]
+ - workunit:
+ clients:
+ client.0:
+ - suites/iozone.sh
+kclient-workload:
+ sequential:
+ - kclient: [client.1]
+ - workunit:
+ clients:
+ client.1:
+ - suites/dbench.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+tasks:
+- install:
+ branch: dumpling
+- ceph:
+- parallel:
+ - user-workload
+ - kclient-workload
+user-workload:
+ sequential:
+ - ceph-fuse: [client.0]
+ - workunit:
+ clients:
+ client.0:
+ - suites/blogbench.sh
+kclient-workload:
+ sequential:
+ - kclient: [client.1]
+ - workunit:
+ clients:
+ client.1:
+ - kernel_untar_build.sh
--- /dev/null
+../../fs/basic/begin.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/3-mds.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/9-mds.yaml
\ No newline at end of file
--- /dev/null
+../../fs/basic/fs/
\ No newline at end of file
--- /dev/null
+../../fs/basic/inline/
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/kclient.yaml
\ No newline at end of file
--- /dev/null
+../../../fs/basic/overrides/
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/fuse/default-perm/no.yaml
\ No newline at end of file
--- /dev/null
+../../fs/basic/tasks/
\ No newline at end of file
--- /dev/null
+../../fs/verify/begin.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/3-mds.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/clusters/9-mds.yaml
\ No newline at end of file
--- /dev/null
+../../fs/verify/fs/
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/mount/kclient.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/overrides/fuse/default-perm/no.yaml
\ No newline at end of file
--- /dev/null
+../../../fs/verify/overrides/
\ No newline at end of file
--- /dev/null
+../../fs/verify/tasks/
\ No newline at end of file
--- /dev/null
+../../fs/verify/validater/
\ No newline at end of file
--- /dev/null
+roles:
+- [mon.0, mon.1, mon.2, mds.0, client.0]
+- [osd.0]
+- [osd.1]
+- [osd.2]
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+- thrashosds:
+ chance_down: 1.0
+ powercycle: true
+ timeout: 600
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client.0:
+ admin socket: /var/run/ceph/ceph-$name.asok
+tasks:
+- radosbench:
+ clients: [client.0]
+ time: 60
+- admin_socket:
+ client.0:
+ objecter_requests:
+ test: "http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}"
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse_default_permissions: 0
+tasks:
+- ceph-fuse:
+- workunit:
+ timeout: 6h
+ clients:
+ all:
+ - kernel_untar_build.sh
--- /dev/null
+tasks:
+- ceph-fuse:
+- workunit:
+ timeout: 6h
+ clients:
+ all:
+ - fs/misc
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ filestore flush min: 0
+ mds:
+ debug ms: 1
+ debug mds: 20
+tasks:
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - suites/ffsb.sh
--- /dev/null
+tasks:
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- ceph-fuse:
+- workunit:
+ timeout: 6h
+ clients:
+ all:
+ - suites/fsx.sh
--- /dev/null
+tasks:
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - suites/fsync-tester.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ fuse default permissions: false
+ fuse set user groups: true
+tasks:
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ ms_inject_delay_probability: 1
+ ms_inject_delay_type: osd
+ ms_inject_delay_max: 5
+ client_oc_max_dirty_age: 1
+tasks:
+- ceph-fuse:
+- exec:
+ client.0:
+ - dd if=/dev/zero of=./foo count=100
+ - sleep 2
+ - truncate --size 0 ./foo
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+tasks:
+- ceph-fuse:
+- workunit:
+ clients:
+ client.0:
+ - rados/test.sh
--- /dev/null
+tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ op_weights:
+ read: 45
+ write: 45
+ delete: 10
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
--- /dev/null
+../../../../clusters/fixed-2.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../mon_kv_backend
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 1500
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
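+ # the async messenger was still experimental at this point, hence the explicit opt-in flag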
+ ms type: async
+ enable experimental unrecoverable data corrupting features: '*'
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms type: random
+ enable experimental unrecoverable data corrupting features: '*'
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms type: simple
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+ - wrongly marked me down
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ client.0:
+ - rados/test.sh
+ - rados/test_pool_quota.sh
+
--- /dev/null
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ client.0:
+ - cls
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+- workunit:
+ clients:
+ client.0:
+ - rados/test_python.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ client.0:
+ - rados/stress_watch.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- exec:
+ client.0:
+ - ceph_test_rados_striper_api_io
+ - ceph_test_rados_striper_api_aio
+ - ceph_test_rados_striper_api_striping
+
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ all:
+ - rados/load-gen-big.sh
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ all:
+ - rados/load-gen-mix.sh
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ all:
+ - rados/load-gen-mostlyread.sh
--- /dev/null
+overrides:
+ ceph:
+ crush_tunables: optimal
+ conf:
+ osd:
+ osd_discard_disconnected_ops: false
+tasks:
+- install:
+- ceph:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ op_weights:
+ read: 45
+ write: 45
+ delete: 10
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - candidate had a stat error
+ - candidate had a read error
+ - deep-scrub 0 missing, 1 inconsistent objects
+ - deep-scrub 0 missing, 4 inconsistent objects
+ - deep-scrub [0-9]+ errors
+ - '!= omap_digest'
+ - '!= data_digest'
+ - repair 0 missing, 1 inconsistent objects
+ - repair 0 missing, 4 inconsistent objects
+ - repair [0-9]+ errors, [0-9]+ fixed
+ - scrub 0 missing, 1 inconsistent objects
+ - scrub [0-9]+ errors
+ - 'size 1 != size'
+ - attr name mismatch
+ conf:
+ osd:
+ filestore debug inject read err: true
+ bluestore debug inject read err: true
+tasks:
+- install:
+- ceph:
+- repair_test:
+
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ debug rgw: 20
+ debug ms: 1
+tasks:
+- install:
+- ceph:
+- rgw:
+ default_idle_timeout: 3600
+ client.0: null
+- thrash_pool_snaps:
+ pools:
+ - .rgw.buckets
+ - .rgw.root
+ - .rgw.control
+ - .rgw
+ - .users.uid
+ - .users.email
+ - .users
+- s3readwrite:
+ client.0:
+ rgw_server: client.0
+ readwrite:
+ bucket: rwtest
+ readers: 10
+ writers: 3
+ duration: 300
+ files:
+ num: 10
+ size: 2000
+ stddev: 500
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - '!= data_digest'
+ - '!= omap_digest'
+ - '!= size'
+ - 'deep-scrub 0 missing, 1 inconsistent objects'
+ - 'deep-scrub [0-9]+ errors'
+ - 'repair 0 missing, 1 inconsistent objects'
+ - 'repair [0-9]+ errors, [0-9]+ fixed'
+ - 'shard [0-9]+ missing'
+ - 'deep-scrub 1 missing, 1 inconsistent objects'
+ - 'does not match object info size'
+ - 'attr name mistmatch'
+ - 'deep-scrub 1 missing, 0 inconsistent objects'
+ - 'failed to pick suitable auth object'
+ conf:
+ osd:
+ osd deep scrub update digest min age: 0
+tasks:
+- install:
+- ceph:
+- scrub_test:
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
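+ # low osdmap and paxos trim thresholds make the mons trim state while they are being thrashed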
+ mon min osdmap epochs: 25
+ paxos service trim min: 5
+tasks:
+- install:
+- ceph:
--- /dev/null
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2]
+- [mon.b, osd.3, osd.4, osd.5, client.0]
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
--- /dev/null
+roles:
+- [mon.a, mon.b, mon.c, mon.d, mon.e, osd.0, osd.1, osd.2]
+- [mon.f, mon.g, mon.h, mon.i, osd.3, osd.4, osd.5, client.0]
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../mon_kv_backend
\ No newline at end of file
--- /dev/null
+../basic/msgr
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 2500
+ ms inject delay type: mon
+ ms inject delay probability: .005
+ ms inject delay max: 1
+ ms inject internal delays: .002
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- mon_thrash:
+ revive_delay: 90
+ thrash_delay: 1
+ thrash_store: true
+ thrash_many: true
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
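+ # frequent mon client pings so daemons notice a frozen mon and reconnect elsewhere quickly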
+ mon client ping interval: 4
+ mon client ping timeout: 12
+tasks:
+- mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
+ thrash_many: true
+ freeze_mon_duration: 20
+ freeze_mon_probability: 10
--- /dev/null
+tasks:
+- mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ paxos min: 10
+ paxos trim min: 10
+tasks:
+- mon_thrash:
+ revive_delay: 90
+ thrash_delay: 1
+ thrash_many: true
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ paxos min: 10
+ paxos trim min: 10
+tasks:
+- mon_thrash:
+ revive_delay: 90
+ thrash_delay: 1
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - slow request
+tasks:
+- exec:
+ client.0:
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
+ - ceph_test_rados_delete_pools_parallel
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - ceph_test_rados_delete_pools_parallel --debug_objecter 20 --debug_ms 1 --debug_rados 20 --debug_monc 20
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+ conf:
+ global:
+ debug objecter: 20
+ debug rados: 20
+ debug ms: 1
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rados/test.sh
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - mon/pool_ops.sh
+ - mon/crush_ops.sh
+ - mon/osd.sh
+ - mon/caps.sh
+
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
--- /dev/null
+roles:
+- [mon.a, mon.d, mon.g, mon.j, mon.m, mon.p, mon.s, osd.0]
+- [mon.b, mon.e, mon.h, mon.k, mon.n, mon.q, mon.t]
+- [mon.c, mon.f, mon.i, mon.l, mon.o, mon.r, mon.u, osd.1]
+openstack:
+- volumes: # attached to each instance
+ count: 1
+ size: 10 # GB
--- /dev/null
+roles:
+- [mon.a, mon.b, mon.c, osd.0, osd.1]
+openstack:
+- volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
--- /dev/null
+roles:
+- [mon.a, mon.c, mon.e, osd.0]
+- [mon.b, mon.d, mon.f, osd.1]
+openstack:
+- volumes: # attached to each instance
+ count: 1
+ size: 10 # GB
--- /dev/null
+roles:
+- [mon.a, mon.d, mon.g, osd.0]
+- [mon.b, mon.e, mon.h]
+- [mon.c, mon.f, mon.i, osd.1]
+openstack:
+- volumes: # attached to each instance
+ count: 1
+ size: 10 # GB
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../mon_kv_backend
\ No newline at end of file
--- /dev/null
+../basic/msgr
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - slow request
+ - .*clock.*skew.*
+ - clocks not synchronized
+- mon_clock_skew_check:
+ expect-skew: false
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon.b:
+ clock offset: 10
+tasks:
+- install:
+- ceph:
+ wait-for-healthy: false
+ log-whitelist:
+ - slow request
+ - .*clock.*skew.*
+ - clocks not synchronized
+- mon_clock_skew_check:
+ expect-skew: true
--- /dev/null
+tasks:
+- install:
+- ceph:
+- mon_recovery:
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, osd.2, client.0]
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ osd:
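+ # translate client allocation hints into XFS extent size hints, which rados/test_alloc_hint.sh exercises (assumed intent)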
+ filestore xfs extsize: true
+
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ all:
+ - rados/test_alloc_hint.sh
--- /dev/null
+roles:
+- [mon.0, osd.0, osd.1, osd.2, osd.3, osd.4, osd.5, client.0]
+openstack:
+- volumes: # attached to each instance
+ count: 6
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ global:
+ osd max object name len: 460
+ osd max object namespace len: 64
+- ceph_objectstore_tool:
+ objects: 20
--- /dev/null
+roles:
+- [mon.0, osd.0, osd.1, client.0]
+openstack:
+- volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- exec:
+ client.0:
+ - ceph_test_filejournal
--- /dev/null
+roles:
+- [mon.0, osd.0, osd.1, client.0]
+openstack:
+- volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ global:
+ journal aio: true
+- filestore_idempotent:
--- /dev/null
+roles:
+- [mon.0, osd.0, osd.1, client.0]
+openstack:
+- volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- filestore_idempotent:
--- /dev/null
+roles:
+- [mon.0, osd.0, osd.1, client.0]
+tasks:
+- install:
+- workunit:
+ clients:
+ all:
+ - objectstore/test_fuse.sh
+
--- /dev/null
+roles:
+- [mon.0, osd.0, osd.1, client.0]
+tasks:
+- install:
+- exec:
+ client.0:
+ - mkdir $TESTDIR/kvtest && cd $TESTDIR/kvtest && ceph_test_keyvaluedb
+ - rm -rf $TESTDIR/kvtest
--- /dev/null
+roles:
+- [mon.0, osd.0, osd.1, client.0]
+openstack:
+- volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- workunit:
+ clients:
+ all:
+ - osdc/stress_objectcacher.sh
--- /dev/null
+roles:
+- [mon.0, osd.0, osd.1, client.0]
+openstack:
+- volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+tasks:
+- install:
+- exec:
+ client.0:
+ - mkdir $TESTDIR/ostest && cd $TESTDIR/ostest && ulimit -c 0 && ulimit -Sn 4096 && ceph_test_objectstore --gtest_filter=-*/3
+ - rm -rf $TESTDIR/ostest
--- /dev/null
+roles:
+- [mon.0, mds.a, osd.0, osd.1, osd.2, client.0, client.1]
+tasks:
+- install:
+- ceph:
+ conf:
+ global:
+ osd max object name len: 460
+ osd max object namespace len: 64
+ debug client: 20
+ debug mds: 20
+ debug ms: 1
+- exec:
+ client.0:
+ - ceph osd pool create data_cache 4
+ - ceph osd tier add cephfs_data data_cache
+ - ceph osd tier cache-mode data_cache writeback
+ - ceph osd tier set-overlay cephfs_data data_cache
+ - ceph osd pool set data_cache hit_set_type bloom
+ - ceph osd pool set data_cache hit_set_count 8
+ - ceph osd pool set data_cache hit_set_period 3600
+ - ceph osd pool set data_cache min_read_recency_for_promote 0
+- ceph-fuse:
+- exec:
+ client.0:
+ - sudo chmod 777 $TESTDIR/mnt.0/
+ - dd if=/dev/urandom of=$TESTDIR/mnt.0/foo bs=1M count=5
+ - ls -al $TESTDIR/mnt.0/foo
+ - truncate --size 0 $TESTDIR/mnt.0/foo
+ - ls -al $TESTDIR/mnt.0/foo
+ - dd if=/dev/urandom of=$TESTDIR/mnt.0/foo bs=1M count=5
+ - ls -al $TESTDIR/mnt.0/foo
+ - cp $TESTDIR/mnt.0/foo /tmp/foo
+ - sync
+ - rados -p data_cache ls -
+ - sleep 10
+ - rados -p data_cache ls -
+ - rados -p data_cache cache-flush-evict-all
+ - rados -p data_cache ls -
+ - sleep 1
+- exec:
+ client.1:
+ - hexdump -C /tmp/foo | head
+ - hexdump -C $TESTDIR/mnt.1/foo | head
+ - cmp $TESTDIR/mnt.1/foo /tmp/foo
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, osd.2, client.0]
+tasks:
+- install:
+- workunit:
+ clients:
+ all:
+ - post-file.sh
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+tasks:
+- install:
+- ceph:
+ conf:
+ global:
+ osd max object name len: 460
+ osd max object namespace len: 64
+- exec:
+ client.0:
+ - ceph osd pool create base-pool 4
+ - ceph osd pool create cache-pool 4
+ - ceph osd tier add base-pool cache-pool
+ - ceph osd tier cache-mode cache-pool writeback
+ - ceph osd tier set-overlay base-pool cache-pool
+ - dd if=/dev/urandom of=$TESTDIR/foo bs=1M count=1
+ - rbd import --image-format 2 $TESTDIR/foo base-pool/bar
+ - rbd snap create base-pool/bar@snap
+ - rados -p base-pool cache-flush-evict-all
+ - rbd export base-pool/bar $TESTDIR/bar
+ - rbd export base-pool/bar@snap $TESTDIR/snap
+ - cmp $TESTDIR/foo $TESTDIR/bar
+ - cmp $TESTDIR/foo $TESTDIR/snap
+ - rm $TESTDIR/foo $TESTDIR/bar $TESTDIR/snap
--- /dev/null
+# verify #13098 fix
+roles:
+- [mon.a, osd.0, osd.1, osd.2, client.0]
+overrides:
+ ceph:
+ log-whitelist:
+ - is full
+tasks:
+- install:
+- ceph:
+ conf:
+ global:
+ osd max object name len: 460
+ osd max object namespace len: 64
+- exec:
+ client.0:
+ - ceph osd pool create ec-ca 1 1
+ - ceph osd pool create ec 1 1 erasure default
+ - ceph osd tier add ec ec-ca
+ - ceph osd tier cache-mode ec-ca readproxy
+ - ceph osd tier set-overlay ec ec-ca
+ - ceph osd pool set ec-ca hit_set_type bloom
+ - ceph osd pool set-quota ec-ca max_bytes 20480000
+ - ceph osd pool set-quota ec max_bytes 20480000
+ - ceph osd pool set ec-ca target_max_bytes 20480000
+ - timeout 30 rados -p ec-ca bench 30 write || true
--- /dev/null
+roles:
+- [mon.0, osd.0, osd.1, client.0]
+tasks:
+- install:
+- exec:
+ client.0:
+ - ceph_test_async_driver
+ - ceph_test_msgr
+openstack:
+ - machine:
+ disk: 40 # GB
+ ram: 15000 # MB
+ cpus: 1
+ volumes: # attached to each instance
+ count: 0
+ size: 1 # GB
+overrides:
+ ceph:
+ conf:
+ client:
+ debug ms: 20
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+- - osd.3
+ - osd.4
+ - osd.5
+tasks:
+- install:
+- ceph:
+ conf:
+ osd:
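+ # short pg logs plus random backfill-reservation rejections force real backfill during the out/in cycle below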
+ osd debug reject backfill probability: .3
+ osd min pg log entries: 25
+ osd max pg log entries: 100
+ osd max object name len: 460
+ osd max object namespace len: 64
+- exec:
+ client.0:
+ - sudo ceph osd pool create foo 64
+ - rados -p foo bench 60 write -b 1024 --no-cleanup
+ - sudo ceph osd pool set foo size 3
+ - sudo ceph osd out 0 1
+- sleep:
+ duration: 60
+- exec:
+ client.0:
+ - sudo ceph osd in 0 1
+- sleep:
+ duration: 60
--- /dev/null
+os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
+overrides:
+ install:
+ ceph:
+ flavor: notcmalloc
+ debuginfo: true
+ ceph:
+ conf:
+ global:
+ osd heartbeat grace: 40
+ debug deliberately leak memory: true
+ osd max object name len: 460
+ osd max object namespace len: 64
+ valgrind:
+ mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
+ osd: [--tool=memcheck]
+roles:
+- [mon.0, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ expect_valgrind_errors: true
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+ - client.a
+openstack:
+ - volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+- admin_socket:
+ osd.0:
+ version:
+ git_version:
+ help:
+ config show:
+ config set filestore_dump_file /tmp/foo:
+ perf dump:
+ perf schema:
+ get_heap_property tcmalloc.max_total_thread_cache_bytes:
+ set_heap_property tcmalloc.max_total_thread_cache_bytes 67108864:
+ set_heap_property tcmalloc.max_total_thread_cache_bytes 33554432:
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - had wrong client addr
+ - had wrong cluster addr
+ - must scrub before tier agent can activate
+- workunit:
+ clients:
+ all:
+ - cephtool
+ - mon/pool_ops.sh
--- /dev/null
+roles:
+- - mon.0
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+
+overrides:
+ ceph:
+ conf:
+ osd:
+ debug osd: 5
+
+tasks:
+- install:
+- ceph:
+- divergent_priors:
--- /dev/null
+roles:
+- - mon.0
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+
+overrides:
+ ceph:
+ conf:
+ osd:
+ debug osd: 5
+
+tasks:
+- install:
+- ceph:
+- divergent_priors2:
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+openstack:
+ - volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+- dump_stuck:
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+ - osd.3
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - objects unfound and apparently lost
+- ec_lost_unfound:
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - objects unfound and apparently lost
+- rep_lost_unfound_delete:
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - objects unfound and apparently lost
+- lost_unfound:
--- /dev/null
+roles:
+- - mon.0
+ - mon.1
+ - mon.2
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ all:
+ - mon/test_mon_config_key.py
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+ - osd.2
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ config:
+ global:
+ osd pool default min size : 1
+ osd:
+ debug monc: 1
+ debug ms: 1
+- mon_seesaw:
+- ceph_manager.create_pool:
+ kwargs:
+ pool_name: test
+ pg_num: 1
+- ceph_manager.wait_for_clean:
+ kwargs:
+ timeout: 60
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+- mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
+- workunit:
+ clients:
+ all:
+ - mon/workloadgen.sh
+ env:
+ LOADGEN_NUM_OSDS: "5"
+ VERBOSE: "1"
+ DURATION: "600"
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ conf:
+ osd:
+ osd min pg log entries: 5
+- osd_backfill:
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+ - osd.3
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ conf:
+ osd:
+ osd min pg log entries: 5
+ osd_fast_fail_on_connection_refused: false
+- osd_recovery.test_incomplete_pgs:
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ conf:
+ osd:
+ osd min pg log entries: 5
+ osd_fast_fail_on_connection_refused: false
+- osd_recovery:
--- /dev/null
+roles:
+- - mon.0
+ - mon.1
+ - mon.2
+ - osd.0
+ - osd.1
+ - osd.2
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ config:
+ global:
+ osd pool default min size : 1
+ log-whitelist:
+ - objects unfound and apparently lost
+- peer:
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - slow request
+- exec:
+ client.0:
+ - sudo ceph osd pool create foo 128 128
+ - sleep 5
+ - sudo ceph tell osd.0 injectargs -- --osd-inject-failure-on-pg-removal
+ - sudo ceph osd pool delete foo foo --yes-i-really-really-mean-it
+- ceph.wait_for_failure: [osd.0]
+- exec:
+ client.0:
+ - sudo ceph osd down 0
+- ceph.restart: [osd.0]
+- ceph.healthy:
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - had wrong client addr
+ - had wrong cluster addr
+ - reached quota
+- workunit:
+ clients:
+ all:
+ - rados/test_rados_tool.sh
--- /dev/null
+roles:
+- - mon.0
+ - mon.1
+ - mon.2
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - no reply from
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 30
+ - rebuild_mondb:
+ - radosbench:
+ clients: [client.0]
+ time: 30
--- /dev/null
+roles:
+- - mon.0
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+
+overrides:
+ ceph:
+ conf:
+ osd:
+ debug osd: 5
+
+tasks:
+- install:
+- ceph:
+- reg11184:
--- /dev/null
+roles:
+- [mon.0]
+- [osd.0, osd.1, osd.2, client.0]
+
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- resolve_stuck_peering:
+
--- /dev/null
+roles:
+- - mon.0
+ - mon.1
+ - mon.2
+ - osd.0
+ - osd.1
+ - osd.2
+ - mds.a
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - had wrong client addr
+ conf:
+ client.rest0:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+- rest-api: [client.0]
+- workunit:
+ clients:
+ all:
+ - rest/test.py
--- /dev/null
+overrides:
+ ceph:
+ fs: ext4
+ conf:
+ global:
+ osd max object name len: 460
+ osd max object namespace len: 64
+roles:
+- [mon.a, osd.0, osd.1, osd.2, client.0]
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ all:
+ - rados/test_envlibrados_for_rocksdb.sh
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+ - osd.2
+- - osd.3
+ - osd.4
+ - osd.5
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+- thrashosds:
+ op_delay: 30
+ clean_interval: 120
+ chance_down: .5
+- workunit:
+ clients:
+ all:
+ - rados/load-gen-mix-small.sh
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+ - osd.2
+- - osd.3
+ - osd.4
+ - osd.5
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - slow request
+- exec:
+ client.0:
+ - sudo ceph osd pool create base 4
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add base cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay base cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 60
+ - sudo ceph osd pool set cache target_max_objects 500
+- background_exec:
+ mon.a:
+ - while true
+ - do sleep 30
+ - echo proxy
+ - sudo ceph osd tier cache-mode cache proxy
+ - sleep 10
+ - sudo ceph osd pool set cache cache_target_full_ratio .001
+ - echo cache-try-flush-evict-all
+ - rados -p cache cache-try-flush-evict-all
+ - sleep 5
+ - echo cache-flush-evict-all
+ - rados -p cache cache-flush-evict-all
+ - sleep 5
+ - echo remove overlay
+ - sudo ceph osd tier remove-overlay base
+ - sleep 20
+ - echo add writeback overlay
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd pool set cache cache_target_full_ratio .8
+ - sudo ceph osd tier set-overlay base cache
+ - sleep 30
+ - sudo ceph osd tier cache-mode cache readproxy
+ - done
+- rados:
+ clients: [client.0]
+ pools: [base]
+ max_seconds: 600
+ ops: 400000
+ objects: 10000
+ size: 1024
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
--- /dev/null
+roles:
+- - mon.0
+ - mon.1
+ - mon.2
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ config:
+ global:
+ osd pool default min size : 1
+ client:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+ log-whitelist:
+ - objects unfound and apparently lost
+- watch_notify_same_primary:
+ clients: [client.0]
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../basic/msgr
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+roles:
+- [osd.0, osd.1, osd.2, client.0, mon.a]
+- [osd.3, osd.4, osd.5, mon.b]
+- [osd.6, osd.7, osd.8, mon.c]
+- [osd.9, osd.10, osd.11]
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
--- /dev/null
+../thrash/fs
\ No newline at end of file
--- /dev/null
+../../../mon_kv_backend/leveldb.yaml
\ No newline at end of file
--- /dev/null
+../thrash/msgr-failures
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - slow request
+ conf:
+ osd:
+ osd debug reject backfill probability: .3
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 6
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ min_in: 8
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ conf:
+ mon:
+ mon osd pool ec fast read: 1
+ osd:
+ osd debug reject backfill probability: .3
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 2
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ min_in: 4
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon min osdmap epochs: 2
+ osd:
+ osd map cache size: 1
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - osd_map_cache_size
+- thrashosds:
+ timeout: 1800
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ chance_test_map_discontinuity: 0.5
+ min_in: 8
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ osd:
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 9
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 3
+ chance_pgpnum_fix: 1
+ min_in: 8
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ conf:
+ osd:
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 2
+ chance_pgpnum_fix: 1
+ min_in: 8
--- /dev/null
+../../../../erasure-code/ec-rados-plugin=lrc-k=4-m=2-l=3.yaml
\ No newline at end of file
--- /dev/null
+arch: x86_64
--- /dev/null
+../thrash/clusters
\ No newline at end of file
--- /dev/null
+../thrash/fs
\ No newline at end of file
--- /dev/null
+../../../mon_kv_backend/leveldb.yaml
\ No newline at end of file
--- /dev/null
+../thrash/msgr-failures
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+../../../distros/supported
\ No newline at end of file
--- /dev/null
+../thrash/thrashers
\ No newline at end of file
--- /dev/null
+../../../../erasure-code/ec-rados-plugin=isa-k=2-m=1.yaml
\ No newline at end of file
--- /dev/null
+../../../../clusters/fixed-4.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../mon_kv_backend/leveldb.yaml
\ No newline at end of file
--- /dev/null
+../thrash/msgr-failures
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - slow request
+ conf:
+ osd:
+ osd debug reject backfill probability: .3
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 3
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ min_in: 8
--- /dev/null
+../../../../erasure-code/ec-rados-plugin=shec-k=4-m=3-c=2.yaml
\ No newline at end of file
--- /dev/null
+../thrash/clusters
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ mon osd pool ec fast read: true
--- /dev/null
+../thrash/fs
\ No newline at end of file
--- /dev/null
+../../../mon_kv_backend/leveldb.yaml
\ No newline at end of file
--- /dev/null
+../thrash/msgr-failures
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ conf:
+ osd:
+ osd debug reject backfill probability: .3
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 2
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ min_in: 4
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ conf:
+ mon:
+ mon osd pool ec fast read: 1
+ osd:
+ osd debug reject backfill probability: .3
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 3
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ min_in: 4
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon min osdmap epochs: 2
+ osd:
+ osd map cache size: 1
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 5
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - osd_map_cache_size
+- thrashosds:
+ timeout: 1800
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ chance_test_map_discontinuity: 0.5
+ min_in: 4
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ osd:
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 9
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 3
+ chance_pgpnum_fix: 1
+ min_in: 4
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ conf:
+ osd:
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 4
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 2
+ chance_pgpnum_fix: 1
+ min_in: 4
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: '*'
+ thrashosds:
+ disable_objectstore_tool_tests: true
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ pool_snaps: true
+ ec_pool: true
+ erasure_code_use_hacky_overwrites: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
--- /dev/null
+../../../../erasure-code/ec-rados-plugin=jerasure-k=2-m=1.yaml
\ No newline at end of file
--- /dev/null
+../../../../erasure-code/ec-rados-plugin=jerasure-k=3-m=1.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ unique_pool: true
+ ec_pool: true
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ unique_pool: true
+ ec_pool: true
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ unique_pool: true
+ ec_pool: true
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ unique_pool: true
+ ec_pool: true
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ unique_pool: true
+ ec_pool: true
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: '*'
+ thrashosds:
+ disable_objectstore_tool_tests: true
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ ec_pool: true
+ erasure_code_use_hacky_overwrites: true
+ fast_read: true
+ op_weights:
+ read: 100
+ write: 100
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ ec_pool: true
+ fast_read: true
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: '*'
+ thrashosds:
+ disable_objectstore_tool_tests: true
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ ec_pool: true
+ erasure_code_use_hacky_overwrites: true
+ op_weights:
+ read: 100
+ write: 100
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ ec_pool: true
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: '*'
+ thrashosds:
+ disable_objectstore_tool_tests: true
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ erasure_code_use_hacky_overwrites: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
--- /dev/null
+../../../../overrides/2-size-1-min-size.yaml
\ No newline at end of file
--- /dev/null
+../../../../overrides/2-size-2-min-size.yaml
\ No newline at end of file
--- /dev/null
+../../../../overrides/3-size-2-min-size.yaml
\ No newline at end of file
--- /dev/null
+../../../../overrides/short_pg_log.yaml
\ No newline at end of file
--- /dev/null
+../../../../clusters/fixed-2.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ osd debug randomize hobject sort order: true
--- /dev/null
+../basic/msgr
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 2500
+ ms tcp read timeout: 5
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
+ osd:
+ osd heartbeat use min delay socket: true
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 2500
+ ms inject delay type: osd
+ ms inject delay probability: .005
+ ms inject delay max: 1
+ ms inject internal delays: .002
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+../../../mon_kv_backend/rocksdb.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ conf:
+ osd:
+ osd debug reject backfill probability: .3
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 3
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon min osdmap epochs: 2
+ osd:
+ osd map cache size: 1
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd scrub during recovery: false
+ osd max backfills: 6
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - osd_map_cache_size
+- thrashosds:
+ timeout: 1800
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ chance_test_map_discontinuity: 0.5
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ osd:
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ journal throttle high multiple: 2
+ journal throttle max multiple: 10
+ filestore queue throttle high multiple: 2
+ filestore queue throttle max multiple: 10
+ osd max backfills: 9
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 3
+ chance_pgpnum_fix: 1
+openstack:
+- volumes:
+ size: 50
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ conf:
+ osd:
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ filestore odsync write: true
+ osd max backfills: 2
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 2
+ chance_pgpnum_fix: 1
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client.0:
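+        # use a predictable admin socket path so the admin_socket task below can query objecter_requests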
+ admin socket: /var/run/ceph/ceph-$name.asok
+tasks:
+- radosbench:
+ clients: [client.0]
+ time: 150
+- admin_socket:
+ client.0:
+ objecter_requests:
+ test: "http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}"
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - must scrub before tier agent can activate
+tasks:
+- exec:
+ client.0:
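+    # 2+1 erasure-code profile: k=2 data chunks, m=1 coding chunk, failure domain = osd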
+ - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
+ - sudo ceph osd pool create base 4 4 erasure teuthologyprofile
+ - sudo ceph osd pool set base min_size 2
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add base cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay base cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 60
+ - sudo ceph osd pool set cache target_max_objects 5000
+- rados:
+ clients: [client.0]
+ pools: [base]
+ ops: 10000
+ objects: 6600
+ max_seconds: 1200
+ size: 1024
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
--- /dev/null
+overrides:
+ ceph:
+ crush_tunables: firefly
+ log-whitelist:
+ - must scrub before tier agent can activate
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create base 4
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add base cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay base cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 60
+ - sudo ceph osd pool set cache target_max_objects 250
+ - sudo ceph osd pool set cache min_read_recency_for_promote 2
+ - sudo ceph osd pool set cache min_write_recency_for_promote 2
+- rados:
+ clients: [client.0]
+ pools: [base]
+ ops: 4000
+ objects: 500
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - must scrub before tier agent can activate
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create base 4
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add base cache
+ - sudo ceph osd tier cache-mode cache readproxy
+ - sudo ceph osd tier set-overlay base cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 3600
+ - sudo ceph osd pool set cache target_max_objects 250
+- rados:
+ clients: [client.0]
+ pools: [base]
+ ops: 4000
+ objects: 500
+ pool_snaps: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
+ flush: 50
+ try_flush: 50
+ evict: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - must scrub before tier agent can activate
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create base 4
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add base cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay base cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 3600
+ - sudo ceph osd pool set cache target_max_objects 250
+ - sudo ceph osd pool set cache min_read_recency_for_promote 0
+ - sudo ceph osd pool set cache min_write_recency_for_promote 0
+- rados:
+ clients: [client.0]
+ pools: [base]
+ ops: 4000
+ objects: 500
+ pool_snaps: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
+ flush: 50
+ try_flush: 50
+ evict: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+openstack:
+ - machine:
+ ram: 15000 # MB
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - must scrub before tier agent can activate
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create base 4
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add base cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay base cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 3600
+ - sudo ceph osd pool set cache target_max_objects 250
+ - sudo ceph osd pool set cache min_read_recency_for_promote 2
+- rados:
+ clients: [client.0]
+ pools: [base]
+ ops: 4000
+ objects: 500
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
+ flush: 50
+ try_flush: 50
+ evict: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - must scrub before tier agent can activate
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create base 4
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add base cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay base cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 3600
+ - sudo ceph osd pool set cache min_read_recency_for_promote 0
+ - sudo ceph osd pool set cache min_write_recency_for_promote 0
+- rados:
+ clients: [client.0]
+ pools: [base]
+ ops: 4000
+ objects: 500
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
+ flush: 50
+ try_flush: 50
+ evict: 50
--- /dev/null
+overrides:
+  ceph:
+    conf:
+      osd:
+        osd deep scrub update digest min age: 0
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ pool_snaps: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+ crush_tunables: hammer
+ conf:
+ client:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rados/test.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client.0:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 120
+ - radosbench:
+ clients: [client.0]
+ time: 120
+ - radosbench:
+ clients: [client.0]
+ time: 120
+ - radosbench:
+ clients: [client.0]
+ time: 120
+ - radosbench:
+ clients: [client.0]
+ time: 120
+ - radosbench:
+ clients: [client.0]
+ time: 120
--- /dev/null
+overrides:
+ ceph:
+ crush_tunables: jewel
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_fadvise_dontneed: true
+ op_weights:
+ write: 100
--- /dev/null
+openstack:
+ - machine:
+ disk: 100 # GB
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+meta:
+- desc: |
+ Run ceph on two nodes,
+ with a separate client-only node.
+ Use xfs beneath the osds.
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon warn on legacy crush tunables: false
+ fs: xfs
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - mds.a
+ - osd.0
+ - osd.1
+ - osd.2
+- - osd.3
+ - osd.4
+ - osd.5
+- - client.0
--- /dev/null
+meta:
+- desc: install ceph/jewel latest
+tasks:
+- install:
+ branch: jewel
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install jewel"
+- ceph:
+- print: "**** done ceph"
--- /dev/null
+meta:
+- desc: |
+   install upgrade ceph/-x on one node only (1st half),
+   then restart osd.0, osd.1, osd.2, osd.3, osd.4, osd.5
+tasks:
+- install.upgrade:
+ osd.0:
+- print: "**** done install.upgrade osd.0"
+- ceph.restart:
+ daemons: [osd.0, osd.1, osd.2, osd.3, osd.4, osd.5]
+- print: "**** done ceph.restart 1st half"
--- /dev/null
+meta:
+- desc: |
+   randomly kill and revive osds,
+   with a small chance of increasing the number of pgs
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - log bound mismatch
+tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+- print: "**** done thrashosds 3-thrash"
--- /dev/null
+meta:
+- desc: |
+ restart mon.a so it is upgraded to -x
+tasks:
+- ceph.restart:
+ daemons: [mon.a]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+- print: "**** done ceph.restart mon.a"
--- /dev/null
+meta:
+- desc: |
+ run basic cls tests for rbd
+tasks:
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - cls/test_cls_rbd.sh
+- print: "**** done cls/test_cls_rbd.sh 5-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+tasks:
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+- print: "**** done rbd/import_export.sh 5-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool,
+ using only reads, writes, and deletes
+tasks:
+- full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_append_excl: false
+ op_weights:
+ read: 45
+ write: 45
+ delete: 10
+- print: "**** done rados/readwrite 5-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshot operations
+tasks:
+- full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+- print: "**** done rados/snaps-few-objects 5-workload"
--- /dev/null
+meta:
+- desc: |
+ restart mon.b so it is upgraded to -x
+tasks:
+- ceph.restart:
+ daemons: [mon.b]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+- print: "**** done ceph.restart mon.b 6-next-mon"
--- /dev/null
+meta:
+- desc: |
+ run randomized correctness test for rados operations
+ generate write load with rados bench
+tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+- print: "**** done radosbench 7-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd C and C++ api tests
+tasks:
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+- print: "**** done rbd/test_librbd.sh 7-workload"
--- /dev/null
+meta:
+- desc: |
+ restart mon.c so it is upgraded to -x
+   as all mons are now upgraded, the ceph cluster is expected to reach quorum
+tasks:
+- ceph.restart:
+ daemons: [mon.c]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+- print: "**** done ceph.restart mon.c 8-next-mon"
+- ceph.wait_for_mon_quorum: [a, b, c]
+- print: "**** done wait_for_mon_quorum 8-next-mon"
--- /dev/null
+meta:
+- desc: |
+ librbd python api tests
+tasks:
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+- print: "**** done rbd/test_librbd_python.sh 9-workload"
--- /dev/null
+meta:
+- desc: |
+ swift api tests for rgw
+tasks:
+- rgw:
+ client.0:
+ default_idle_timeout: 300
+- print: "**** done rgw 9-workload"
+- swift:
+ client.0:
+ rgw_server: client.0
+- print: "**** done swift 9-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshot operations
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+../../../../clusters/fixed-2.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+../../../mon_kv_backend
\ No newline at end of file
--- /dev/null
+../basic/msgr
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../../../config/rados.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- mon_recovery:
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+ conf:
+ client:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+ debug monc: 20
+tasks:
+- workunit:
+ timeout: 6h
+ clients:
+ client.0:
+ - rados/test.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - cls
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ lockdep: true
--- /dev/null
+os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
+overrides:
+ install:
+ ceph:
+ flavor: notcmalloc
+ debuginfo: true
+ ceph:
+ conf:
+ global:
+ osd heartbeat grace: 40
+ valgrind:
+ mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
+ osd: [--tool=memcheck]
+ mds: [--tool=memcheck]
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add rbd cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay rbd cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 60
+ - sudo ceph osd pool set cache target_max_objects 250
--- /dev/null
+../../../../clusters/fixed-1.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/run_cli_tests.sh
+
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - cls/test_cls_rbd.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_lock_fence.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+../basic/clusters
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default format: 2
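+        # 61 = layering(1) + exclusive-lock(4) + object-map(8) + fast-diff(16) + deep-flatten(32)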
+ rbd default features: 61
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default format: 1
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default format: 2
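+        # 125 = layering(1) + exclusive-lock(4) + object-map(8) + fast-diff(16) + deep-flatten(32) + journaling(64)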
+ rbd default features: 125
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default format: 2
+ rbd default features: 1
--- /dev/null
+../basic/fs
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
+ - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile
+ - sudo ceph osd pool set datapool debug_white_box_testing_ec_overwrites true
+
+overrides:
+ ceph:
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites
+ client:
+ rbd default data pool: datapool
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create datapool 4
+
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default data pool: datapool
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add rbd cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay rbd cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 60
+ - sudo ceph osd pool set cache target_max_objects 250
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/copy.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/import_export.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ rbd cache: false
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ rbd cache: true
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ rbd cache: true
+ rbd cache max dirty: 0
--- /dev/null
+../../../../clusters/fixed-3.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd clone copy on read: true
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd skip partial discard: true
--- /dev/null
+../basic/fs
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
+ log-whitelist:
+ - wrongly marked me down
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
+ - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile
+ - sudo ceph osd pool set datapool debug_white_box_testing_ec_overwrites true
+
+overrides:
+ ceph:
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites
+ client:
+ rbd default data pool: datapool
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create datapool 4
+
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default data pool: datapool
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add rbd cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay rbd cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 60
+ - sudo ceph osd pool set cache target_max_objects 250
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "1"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "61"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "125"
--- /dev/null
+tasks:
+- rbd_fsx:
+ clients: [client.0]
+ ops: 20000
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ env:
+ RBD_FEATURES: "1"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ env:
+ RBD_FEATURES: "61"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ env:
+ RBD_FEATURES: "125"
--- /dev/null
+tasks:
+- rbd_fio:
+ client.0:
+ fio-io-size: 80%
+ formats: [2]
+ features: [[layering],[layering,exclusive-lock,object-map]]
+ io-engine: rbd
+ test-clone-io: 1
+ rw: randrw
+ runtime: 900
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+../../../../clusters/fixed-3.yaml
\ No newline at end of file
--- /dev/null
+../../qemu/clusters/openstack.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+tasks:
+- parallel:
+ - io_workload
+ - op_workload
+io_workload:
+ sequential:
+ - qemu:
+ client.0:
+ clone: true
+ type: block
+ num_rbd: 2
+ test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/run_xfstests_qemu.sh
+exclude_arch: armv7l
--- /dev/null
+op_workload:
+ sequential:
+ - workunit:
+ clients:
+ client.0:
+ - rbd/qemu_dynamic_features.sh
+ env:
+ IMAGE_NAME: client.0.1-clone
--- /dev/null
+op_workload:
+ sequential:
+ - workunit:
+ clients:
+ client.0:
+ - rbd/qemu_rebuild_object_map.sh
+ env:
+ IMAGE_NAME: client.0.1-clone
--- /dev/null
+../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+meta:
+- desc: run two ceph clusters and install rbd-mirror
+tasks:
+- install:
+ extra_packages: [rbd-mirror]
+- ceph:
+ cluster: cluster1
+- ceph:
+ cluster: cluster2
--- /dev/null
+meta:
+- desc: 2 ceph clusters with 3 mons and 3 osds each
+roles:
+- - cluster1.mon.a
+ - cluster1.mon.b
+ - cluster1.osd.0
+ - cluster1.osd.1
+ - cluster1.osd.2
+ - cluster2.mon.c
+ - cluster1.client.0
+ - cluster2.client.0
+- - cluster1.mon.c
+ - cluster2.mon.a
+ - cluster2.mon.b
+ - cluster2.osd.0
+ - cluster2.osd.1
+ - cluster2.osd.2
+ - cluster1.client.mirror
+ - cluster2.client.mirror
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+../basic/fs
\ No newline at end of file
--- /dev/null
+../basic/msgr-failures
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+meta:
+- desc: run one rbd-mirror daemon per cluster
+overrides:
+ ceph:
+ conf:
+ client.mirror:
+ # override to make these names predictable
+ admin socket: /var/run/ceph/$cluster-$name.asok
+ pid file: /var/run/ceph/$cluster-$name.pid
+tasks:
+- rbd-mirror:
+ client: cluster1.client.mirror
+- rbd-mirror:
+ client: cluster2.client.mirror
--- /dev/null
+meta:
+- desc: run the rbd_mirror_stress.sh workunit to test the rbd-mirror daemon
+tasks:
+- workunit:
+ clients:
+ cluster1.client.mirror: [rbd/rbd_mirror_stress.sh]
+ env:
+ # override workunit setting of CEPH_ARGS='--cluster'
+ CEPH_ARGS: ''
+ RBD_MIRROR_USE_EXISTING_CLUSTER: '1'
+ RBD_MIRROR_USE_RBD_MIRROR: '1'
+ timeout: 6h
--- /dev/null
+meta:
+- desc: run the rbd_mirror.sh workunit to test the rbd-mirror daemon
+tasks:
+- workunit:
+ clients:
+ cluster1.client.mirror: [rbd/rbd_mirror.sh]
+ env:
+ # override workunit setting of CEPH_ARGS='--cluster'
+ CEPH_ARGS: ''
+ RBD_MIRROR_USE_EXISTING_CLUSTER: '1'
+ RBD_MIRROR_USE_RBD_MIRROR: '1'
--- /dev/null
+../thrash/base
\ No newline at end of file
--- /dev/null
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2]
+- [mon.b, osd.3, osd.4, osd.5]
+- [client.0]
--- /dev/null
+../../thrash/clusters/openstack.yaml
\ No newline at end of file
--- /dev/null
+../thrash/fs
\ No newline at end of file
--- /dev/null
+../thrash/msgr-failures
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+../thrash/thrashers
\ No newline at end of file
--- /dev/null
+os_type: ubuntu
+overrides:
+ install:
+ ceph:
+ extra_packages: [rbd-nbd]
+tasks:
+- rbd_fsx:
+ clients: [client.0]
+ ops: 6000
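+    # exercise fsx through an rbd-nbd mapping (hence the rbd-nbd package above), using 512-byte boundaries for reads, writes, truncates and hole punching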
+ nbd: True
+ holebdy: 512
+ punch_holes: true
+ readbdy: 512
+ truncbdy: 512
+ writebdy: 512
--- /dev/null
+os_type: ubuntu
+overrides:
+ install:
+ ceph:
+ extra_packages: [rbd-nbd]
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/rbd-nbd.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ rbd cache: false
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ rbd cache: true
--- /dev/null
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ rbd cache: true
+ rbd cache max dirty: 0
--- /dev/null
+../../../../clusters/fixed-3.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - machine:
+ disk: 40 # GB
+ ram: 30000 # MB
+ cpus: 1
+ volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default format: 2
+ rbd default features: 61
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default format: 2
+ rbd default features: 125
--- /dev/null
+../basic/fs
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
+ log-whitelist:
+ - wrongly marked me down
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
+ - sudo ceph osd pool delete rbd rbd --yes-i-really-really-mean-it
+ - sudo ceph osd pool create rbd 4 4 erasure teuthologyprofile
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add rbd cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay rbd cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 60
+ - sudo ceph osd pool set cache target_max_objects 250
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
+ - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile
+ - sudo ceph osd pool set datapool debug_white_box_testing_ec_overwrites true
+
+overrides:
+ ceph:
+ conf:
+ global:
+ enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites
+ client:
+ rbd default data pool: datapool
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create datapool 4
+
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default data pool: datapool
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add rbd cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay rbd cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 60
+ - sudo ceph osd pool set cache target_max_objects 250
--- /dev/null
+tasks:
+- qemu:
+ all:
+ type: block
+ num_rbd: 2
+      test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/run_xfstests_qemu.sh
+exclude_arch: armv7l
--- /dev/null
+tasks:
+- qemu:
+ all:
+ clone: true
+ test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/workunits/suites/bonnie.sh
+exclude_arch: armv7l
--- /dev/null
+tasks:
+- qemu:
+ all:
+ clone: true
+ test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/workunits/suites/fsstress.sh
+exclude_arch: armv7l
--- /dev/null
+tasks:
+- qemu:
+ all:
+ test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/workunits/suites/iozone.sh
+ image_size: 20480
+exclude_arch: armv7l
--- /dev/null
+tasks:
+- qemu:
+ all:
+ clone: true
+ type: block
+ num_rbd: 2
+ test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/run_xfstests_qemu.sh
+exclude_arch: armv7l
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- workunit:
+ clients:
+ all: [rbd/test_admin_socket.sh]
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- cram:
+ clients:
+ client.0:
+ - http://git.ceph.com/?p=ceph.git;a=blob_plain;hb={branch};f=src/test/cli-integration/rbd/formatted-output.t
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- workunit:
+ clients:
+ all: [rbd/merge_diff.sh]
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- workunit:
+ clients:
+ all: [rbd/permissions.sh]
--- /dev/null
+exclude_arch: armv7l
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ client:
+ rbd cache: false
+- workunit:
+ clients:
+ all: [rbd/qemu-iotests.sh]
--- /dev/null
+exclude_arch: armv7l
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ client:
+ rbd cache: true
+- workunit:
+ clients:
+ all: [rbd/qemu-iotests.sh]
--- /dev/null
+exclude_arch: armv7l
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ client:
+ rbd cache: true
+ rbd cache max dirty: 0
+- workunit:
+ clients:
+ all: [rbd/qemu-iotests.sh]
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ client:
+ rbd validate pool: false
+- workunit:
+ clients:
+ all:
+ - mon/rbd_snaps_ops.sh
+
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- workunit:
+ clients:
+ all: [rbd/test_rbd_mirror.sh]
--- /dev/null
+roles:
+- [client.0]
+tasks:
+- install:
+- workunit:
+ clients:
+ all: [rbd/test_rbdmap_RBDMAPFILE.sh]
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ client:
+ rbd cache: false
+- workunit:
+ clients:
+ all: [rbd/read-flags.sh]
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ client:
+ rbd cache: true
+- workunit:
+ clients:
+ all: [rbd/read-flags.sh]
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ client:
+ rbd cache: true
+ rbd cache max dirty: 0
+- workunit:
+ clients:
+ all: [rbd/read-flags.sh]
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, client.0]
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- workunit:
+ clients:
+ all: [rbd/verify_pool.sh]
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 2
+ size: 30 # GB
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+../../../../clusters/fixed-2.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - machine:
+ disk: 40 # GB
+ ram: 8000 # MB
+ cpus: 1
+ volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+tasks:
+- exec:
+ client.0:
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add rbd cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay rbd cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 60
+ - sudo ceph osd pool set cache target_max_objects 250
+- thrashosds:
+ timeout: 1200
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+tasks:
+- thrashosds:
+ timeout: 1200
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/journal.sh
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "61"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "61"
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd clone copy on read: true
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "125"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "1"
--- /dev/null
+tasks:
+- rbd_fsx:
+ clients: [client.0]
+ ops: 6000
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd cache: true
--- /dev/null
+tasks:
+- rbd_fsx:
+ clients: [client.0]
+ ops: 6000
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd cache: true
+ rbd cache max dirty: 0
--- /dev/null
+tasks:
+- rbd_fsx:
+ clients: [client.0]
+ ops: 6000
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd cache: true
+ rbd clone copy on read: true
--- /dev/null
+tasks:
+- rbd_fsx:
+ clients: [client.0]
+ ops: 6000
+ journal_replay: True
--- /dev/null
+tasks:
+- rbd_fsx:
+ clients: [client.0]
+ ops: 6000
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd cache: false
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+../basic/clusters
\ No newline at end of file
--- /dev/null
+../basic/fs
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
+overrides:
+ install:
+ ceph:
+ flavor: notcmalloc
+ debuginfo: true
+ rbd_fsx:
+ valgrind: ["--tool=memcheck"]
+ workunit:
+ env:
+ VALGRIND: "memcheck"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "1"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "61"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "125"
--- /dev/null
+tasks:
+- rbd_fsx:
+ clients: [client.0]
+ size: 134217728
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ env:
+ RBD_FEATURES: "1"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ env:
+ RBD_FEATURES: "61"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ env:
+ RBD_FEATURES: "125"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_rbd_mirror.sh
--- /dev/null
+roles:
+- - mon.a
+ - mds.a
+ - osd.0
+ - osd.1
+- - mon.b
+ - mon.c
+ - osd.2
+ - osd.3
+ - client.0
+
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ log-whitelist:
+ - wrongly marked me down
+ conf:
+ client.rest0:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+- rest-api: [client.0]
+- workunit:
+ clients:
+ client.0:
+ - rest/test.py
--- /dev/null
+../../../../clusters/fixed-2.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ rgw:
+ frontend: apache
--- /dev/null
+overrides:
+ rgw:
+ frontend: civetweb
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ debug rgw: 20
+ rgw:
+ frontend: civetweb
--- /dev/null
+../../../rgw_pool_type
\ No newline at end of file
--- /dev/null
+# Amazon/S3.pm (cpan) not available as an rpm
+os_type: ubuntu
+tasks:
+- install:
+- ceph:
+- rgw: [client.0]
+- workunit:
+ clients:
+ client.0:
+ - rgw/s3_bucket_quota.pl
--- /dev/null
+# Amazon::S3 is not available on el7
+os_type: ubuntu
+tasks:
+- install:
+- ceph:
+- rgw: [client.0]
+- workunit:
+ clients:
+ client.0:
+ - rgw/s3_multipart_upload.pl
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rgw: [client.0]
+- s3readwrite:
+ client.0:
+ rgw_server: client.0
+ readwrite:
+ bucket: rwtest
+ readers: 10
+ writers: 3
+ duration: 300
+ files:
+ num: 10
+ size: 2000
+ stddev: 500
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rgw: [client.0]
+- s3roundtrip:
+ client.0:
+ rgw_server: client.0
+ roundtrip:
+ bucket: rttest
+ readers: 10
+ writers: 3
+ duration: 300
+ files:
+ num: 10
+ size: 2000
+ stddev: 500
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rgw: [client.0]
+- s3tests:
+ client.0:
+ force-branch: ceph-master
+ rgw_server: client.0
+overrides:
+ ceph:
+ conf:
+ client:
+ rgw lc debug interval: 10
--- /dev/null
+tasks:
+- install:
+- ceph:
+- rgw: [client.0]
+- swift:
+ client.0:
+ rgw_server: client.0
--- /dev/null
+# Amazon/S3.pm (cpan) not available as an rpm
+os_type: ubuntu
+tasks:
+- install:
+- ceph:
+- rgw: [client.0]
+- workunit:
+ clients:
+ client.0:
+ - rgw/s3_user_quota.pl
--- /dev/null
+roles:
+- [mon.a, osd.0, client.0]
+- [osd.1, osd.2, osd.3, client.1]
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ debug ms: 1
+ rgw gc obj min wait: 15
+ rgw data log window: 30
+ osd:
+ debug ms: 1
+ debug objclass: 20
+ client.0:
+ rgw region: region0
+ rgw zone: r0z0
+ rgw region root pool: .rgw.region.0
+ rgw zone root pool: .rgw.zone.0
+ rgw gc pool: .rgw.gc.0
+ rgw user uid pool: .users.uid.0
+ rgw user keys pool: .users.0
+ rgw log data: True
+ rgw log meta: True
+ client.1:
+ rgw region: region0
+ rgw zone: r0z1
+ rgw region root pool: .rgw.region.0
+ rgw zone root pool: .rgw.zone.1
+ rgw gc pool: .rgw.gc.1
+ rgw user uid pool: .users.uid.1
+ rgw user keys pool: .users.1
+ rgw log data: False
+ rgw log meta: False
+- rgw:
+ realm:
+ realm0
+ regions:
+ region0:
+ api name: api1
+ is master: True
+ master zone: r0z0
+ zones: [r0z0, r0z1]
+ client.0:
+ system user:
+ name: client0-system-user
+ access key: 0te6NH5mcdcq0Tc5i8i2
+ secret key: Oy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
+ client.1:
+ system user:
+ name: client1-system-user
+ access key: 1te6NH5mcdcq0Tc5i8i3
+ secret key: Py4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXw
+- radosgw-agent:
+ client.0:
+ max-entries: 10
+ src: client.0
+ dest: client.1
+- sleep:
+ duration: 30
+- radosgw-admin:
--- /dev/null
+roles:
+- [mon.a, osd.0, osd.1, osd.2, client.0]
+- [mon.b, mon.c, osd.3, osd.4, osd.5, client.1]
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ debug ms: 1
+ rgw gc obj min wait: 15
+ osd:
+ debug ms: 1
+ debug objclass: 20
+ client.0:
+ rgw region: region0
+ rgw zone: r0z1
+ rgw region root pool: .rgw.region.0
+ rgw zone root pool: .rgw.zone.0
+ rgw gc pool: .rgw.gc.0
+ rgw user uid pool: .users.uid.0
+ rgw user keys pool: .users.0
+ rgw log data: True
+ rgw log meta: True
+ client.1:
+ rgw region: region1
+ rgw zone: r1z1
+ rgw region root pool: .rgw.region.1
+ rgw zone root pool: .rgw.zone.1
+ rgw gc pool: .rgw.gc.1
+ rgw user uid pool: .users.uid.1
+ rgw user keys pool: .users.1
+ rgw log data: False
+ rgw log meta: False
+- rgw:
+ realm:
+ realm0
+ regions:
+ region0:
+ api name: api1
+ is master: True
+ master zone: r0z1
+ zones: [r0z1]
+ region1:
+ api name: api1
+ is master: False
+ master zone: r1z1
+ zones: [r1z1]
+ client.0:
+ system user:
+ name: client0-system-user
+ access key: 0te6NH5mcdcq0Tc5i8i2
+ secret key: Oy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
+ client.1:
+ system user:
+ name: client1-system-user
+ access key: 1te6NH5mcdcq0Tc5i8i3
+ secret key: Py4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXw
+- radosgw-agent:
+ client.0:
+ src: client.0
+ dest: client.1
+ metadata-only: true
+- radosgw-admin:
--- /dev/null
+roles:
+- [mon.a, osd.0]
+- [client.0, osd.1, osd.2, osd.3]
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ debug ms: 1
+ rgw gc obj min wait: 15
+ osd:
+ debug ms: 1
+ debug objclass: 20
+- rgw:
+ client.0:
+- radosgw-admin:
--- /dev/null
+overrides:
+ s3readwrite:
+ s3:
+ user_id: s3readwrite-test-user
+ display_name: test user for the s3readwrite tests
+ email: tester@inktank
+ access_key: 2te6NH5mcdcq0Tc5i8i4
+ secret_key: Qy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXx
+ readwrite:
+ deterministic_file_names: True
+ duration: 30
+ bucket: testbucket
+ files:
+ num: 10
+ size: 2000
+ stddev: 500
+roles:
+- [mon.a, osd.0, osd.1, osd.2, client.0]
+- [mon.b, mon.c, osd.3, osd.4, osd.5, client.1]
+
+tasks:
+- install:
+- ceph:
+ conf:
+ client:
+ rgw region: default
+ rgw zone: r1z1
+ rgw region root pool: .rgw
+ rgw zone root pool: .rgw
+ rgw domain root: .rgw
+ rgw gc pool: .rgw.gc
+ rgw user uid pool: .users.uid
+ rgw user keys pool: .users
+- rgw:
+ realm:
+ realm0
+ regions:
+ default:
+ api name: api1
+ is master: true
+ master zone: r1z1
+ zones: [r1z1]
+ client.0:
+ system user:
+ name: nr-system
+ access key: 0te6NH5mcdcq0Tc5i8i2
+ secret key: Oy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
+- s3readwrite:
+ client.0:
+ extra_args: ['--no-cleanup']
+ s3:
+ delete_user: False
+ readwrite:
+ writers: 1
+ readers: 0
+- rgw:
+ realm:
+ realm0
+ regions:
+ default:
+ api name: api1
+ is master: true
+ master zone: r1z1
+ zones: [r1z1]
+ client.1:
+ system user:
+ name: r2-system
+ access key: 1te6NH5mcdcq0Tc5i8i3
+ secret key: Py4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXw
+- s3readwrite:
+ client.1:
+ s3:
+ create_user: False
+ readwrite:
+ writers: 0
+ readers: 2
+
--- /dev/null
+overrides:
+ rgw:
+ frontend: apache
--- /dev/null
+overrides:
+ rgw:
+ frontend: civetweb
--- /dev/null
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ osd sloppy crc: true
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ debug rgw: 20
+ rgw:
+ frontend: civetweb
--- /dev/null
+../../../rgw_pool_type/
\ No newline at end of file
--- /dev/null
+../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../clusters/fixed-2.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ rgw:
+ frontend: apache
--- /dev/null
+overrides:
+ rgw:
+ frontend: civetweb
--- /dev/null
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ osd sloppy crc: true
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+../../../objectstore
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ debug rgw: 20
+ rgw compression type: random
+ rgw:
+ frontend: civetweb
--- /dev/null
+../../../rgw_pool_type/
\ No newline at end of file
--- /dev/null
+os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
+tasks:
+- install:
+ flavor: notcmalloc
+ debuginfo: true
+- ceph:
+- rgw:
+ client.0:
+ valgrind: [--tool=memcheck]
+- s3tests:
+ client.0:
+ force-branch: ceph-master
+ rgw_server: client.0
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_min_pg_log_entries: 10
+ osd_max_pg_log_entries: 10
+ client:
+ rgw lc debug interval: 10
--- /dev/null
+os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
+tasks:
+- install:
+ flavor: notcmalloc
+ debuginfo: true
+- ceph:
+ conf:
+ client.0:
+ rgw region: zero
+ rgw zone: r0z1
+ rgw region root pool: .rgw.region.0
+ rgw zone root pool: .rgw.zone.0
+ rgw gc pool: .rgw.gc.0
+ rgw user uid pool: .users.uid.0
+ rgw user keys pool: .users.0
+ rgw log data: True
+ rgw log meta: True
+ client.1:
+ rgw region: one
+ rgw zone: r1z1
+ rgw region root pool: .rgw.region.1
+ rgw zone root pool: .rgw.zone.1
+ rgw gc pool: .rgw.gc.1
+ rgw user uid pool: .users.uid.1
+ rgw user keys pool: .users.1
+ rgw log data: False
+ rgw log meta: False
+- rgw:
+ default_idle_timeout: 300
+ realm:
+ realm0
+ regions:
+ zero:
+ api name: api1
+ is master: True
+ master zone: r0z1
+ zones: [r0z1]
+ one:
+ api name: api1
+ is master: False
+ master zone: r1z1
+ zones: [r1z1]
+ client.0:
+ valgrind: [--tool=memcheck]
+ system user:
+ name: client0-system-user
+ access key: 1te6NH5mcdcq0Tc5i8i2
+ secret key: 1y4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
+ client.1:
+ valgrind: [--tool=memcheck]
+ system user:
+ name: client1-system-user
+ access key: 0te6NH5mcdcq0Tc5i8i2
+ secret key: Oy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
+- radosgw-agent:
+ client.0:
+ src: client.0
+ dest: client.1
+ metadata-only: true
+- s3tests:
+ client.0:
+ force-branch: ceph-master
+ idle_timeout: 300
+ rgw_server: client.0
+overrides:
+ ceph:
+ conf:
+ client:
+ rgw lc debug interval: 10
--- /dev/null
+os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
+tasks:
+- install:
+ flavor: notcmalloc
+ debuginfo: true
+- ceph:
+- rgw:
+ client.0:
+ valgrind: [--tool=memcheck]
+- swift:
+ client.0:
+ rgw_server: client.0
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ lockdep: true
+ mon:
+ lockdep: true
--- /dev/null
+os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
+overrides:
+ install:
+ ceph:
+ flavor: notcmalloc
+ debuginfo: true
+ ceph:
+ conf:
+ global:
+ osd heartbeat grace: 40
+ valgrind:
+ mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
+ osd: [--tool=memcheck]
+ mds: [--tool=memcheck]
--- /dev/null
+roles:
+- [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1]
+- [samba.0, client.0, client.1]
--- /dev/null
+../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+# we currently can't install Samba on RHEL; need a gitbuilder and code updates
+os_type: ubuntu
+
+tasks:
+- install:
+- install:
+ project: samba
+ extra_packages: ['samba']
+- ceph:
--- /dev/null
+tasks:
+- ceph-fuse: [client.0]
+- samba:
+ samba.0:
+ ceph: "{testdir}/mnt.0"
+
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+kernel:
+ client:
+ branch: testing
+tasks:
+- kclient: [client.0]
+- samba:
+ samba.0:
+ ceph: "{testdir}/mnt.0"
+
--- /dev/null
+tasks:
+- samba:
--- /dev/null
+tasks:
+- localdir: [client.0]
+- samba:
+ samba.0:
+ ceph: "{testdir}/mnt.0"
--- /dev/null
+tasks:
+- cifs-mount:
+ client.1:
+ share: ceph
+- workunit:
+ clients:
+ client.1:
+ - suites/dbench.sh
--- /dev/null
+tasks:
+- cifs-mount:
+ client.1:
+ share: ceph
+- workunit:
+ clients:
+ client.1:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- cifs-mount:
+ client.1:
+ share: ceph
+- workunit:
+ clients:
+ client.1:
+ - kernel_untar_build.sh
+
--- /dev/null
+tasks:
+- pexec:
+ client.1:
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.lock
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.fdpass
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.unlink
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.attr
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.trans2
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.negnowait
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.dir1
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.deny1
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.deny2
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.deny3
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.denydos
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.ntdeny1
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.ntdeny2
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.tcon
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.tcondev
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.vuid
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.rw1
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.open
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.defer_open
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.xcopy
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.rename
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.properties
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.mangle
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.openattr
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.chkpath
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.secleak
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.disconnect
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.samba3error
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.smb
+# - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.bench-holdcon
+# - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.bench-holdopen
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.bench-readwrite
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.bench-torture
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.scan-pipe_number
+ - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.scan-ioctl
+# - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.scan-maxfid
--- /dev/null
+../../../../clusters/fixed-1.yaml
\ No newline at end of file
--- /dev/null
+openstack:
+ - machine:
+ disk: 40 # GB
+ ram: 8000 # MB
+ cpus: 1
+ volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+meta:
+- desc: |
+ Run ceph-deploy cli tests on one node
+ and verify all the cli works and the cluster can reach
+ HEALTH_OK state (implicitly verifying the daemons via init).
+tasks:
+- ceph_deploy.single_node_test: null
--- /dev/null
+../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - suites/blogbench.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: btrfs
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: btrfs
+- ceph-fuse: [client.0]
+- workunit:
+ clients:
+ all:
+ - suites/iozone.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ conf:
+ mds:
+ debug mds: 20
+ debug ms: 1
+ client:
+ debug client: 20
+ debug ms: 1
+ fuse default permissions: false
+ fuse set user groups: true
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+tasks:
+- install:
+- ceph:
+ fs: btrfs
+- kclient:
+- workunit:
+ clients:
+ all:
+ - direct_io
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/dbench.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ debug ms: 1
+ debug client: 20
+ mds:
+ debug ms: 1
+ debug mds: 20
+tasks:
+- install:
+- ceph:
+ fs: btrfs
+- ceph-fuse:
+- workunit:
+ clients:
+ client.0:
+ - libcephfs/test.sh
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+ conf:
+ global:
+ ms inject delay max: 1
+ ms inject delay probability: 0.005
+ ms inject delay type: mon
+ ms inject internal delays: 0.002
+ ms inject socket failures: 2500
+tasks:
+- install: null
+- ceph:
+ fs: xfs
+- mon_thrash:
+ revive_delay: 90
+ thrash_delay: 1
+ thrash_many: true
+- workunit:
+ clients:
+ client.0:
+ - rados/test.sh
--- /dev/null
+tasks:
+- install: null
+- ceph:
+ fs: ext4
+ log-whitelist:
+ - reached quota
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ chance_pgnum_grow: 2
+ chance_pgpnum_fix: 1
+ timeout: 1200
+- workunit:
+ clients:
+ client.0:
+ - rados/test.sh
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject delay max: 1
+ ms inject delay probability: 0.005
+ ms inject delay type: osd
+ ms inject internal delays: 0.002
+ ms inject socket failures: 2500
+tasks:
+- install: null
+- ceph:
+ fs: xfs
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ chance_pgnum_grow: 2
+ chance_pgpnum_fix: 1
+ timeout: 1200
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
--- /dev/null
+tasks:
+- install: null
+- ceph:
+ fs: btrfs
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ chance_pgnum_grow: 2
+ chance_pgpnum_fix: 1
+ timeout: 1200
+- exec:
+ client.0:
+ - sudo ceph osd pool create base 4
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add base cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay base cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 3600
+ - sudo ceph osd pool set cache target_max_objects 250
+- rados:
+ clients:
+ - client.0
+ objects: 500
+ op_weights:
+ copy_from: 50
+ delete: 50
+ evict: 50
+ flush: 50
+ read: 100
+ rollback: 50
+ snap_create: 50
+ snap_remove: 50
+ try_flush: 50
+ write: 100
+ ops: 4000
+ pool_snaps: true
+ pools:
+ - base
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- workunit:
+ clients:
+ client.0:
+ - cls
--- /dev/null
+tasks:
+- install: null
+- ceph:
+ fs: xfs
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ chance_pgnum_grow: 3
+ chance_pgpnum_fix: 1
+ timeout: 1200
+- rados:
+ clients:
+ - client.0
+ ec_pool: true
+ max_in_flight: 64
+ max_seconds: 600
+ objects: 1024
+ op_weights:
+ append: 100
+ copy_from: 50
+ delete: 50
+ read: 100
+ rmattr: 25
+ rollback: 50
+ setattr: 25
+ snap_create: 50
+ snap_remove: 50
+ write: 0
+ ops: 400000
+ size: 16384
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: btrfs
+ log-whitelist:
+ - wrongly marked me down
+- ceph-fuse:
+- workunit:
+ clients:
+ client.0:
+ - rados/test_python.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: ext4
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - rados/load-gen-mix.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- ceph-fuse:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ env:
+ RBD_FEATURES: "1"
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- ceph-fuse:
+- workunit:
+ clients:
+ client.0:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
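+ # --new-format creates format 2 (v2) images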
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd cache: true
+ global:
+ ms inject socket failures: 5000
+tasks:
+- install: null
+- ceph:
+ fs: xfs
+- thrashosds:
+ timeout: 1200
+- rbd_fsx:
+ clients:
+ - client.0
+ ops: 2000
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: btrfs
+- ceph-fuse:
+- workunit:
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ env:
+ RBD_FEATURES: "1"
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ client:
+ rbd default features: 5
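+ # 5 = layering(1) + exclusive-lock(4)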
+tasks:
+- install:
+- ceph:
+ fs: btrfs
+- rbd:
+ all:
+ image_size: 20480
+- workunit:
+ clients:
+ all:
+ - suites/iozone.sh
--- /dev/null
+overrides:
+ rgw:
+ ec-data-pool: true
+ cache-pools: true
+ frontend: civetweb
+tasks:
+- install:
+- ceph:
+ fs: btrfs
+- rgw: [client.0]
+- s3tests:
+ client.0:
+ rgw_server: client.0
+overrides:
+ ceph:
+ conf:
+ client:
+ rgw lc debug interval: 10
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: xfs
+- rgw: [client.0]
+- s3tests:
+ client.0:
+ rgw_server: client.0
+overrides:
+ ceph:
+ conf:
+ client:
+ rgw lc debug interval: 10
--- /dev/null
+tasks:
+- install:
+- ceph:
+ fs: ext4
+- rgw: [client.0]
+- swift:
+ client.0:
+ rgw_server: client.0
--- /dev/null
+../../../../distros/supported/centos_7.2.yaml
\ No newline at end of file
--- /dev/null
+os_type: ubuntu
+os_version: "16.04"
--- /dev/null
+roles:
+- [mon.a, osd.0]
+- [osd.1, osd.2]
+- [mds.a, osd.3]
+- [mon.b, client.0]
+tasks:
+- ssh-keys:
+- ceph-deploy:
+- systemd:
+- workunit:
+ clients:
+ all:
+ - rados/load-gen-mix.sh
--- /dev/null
+../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - snaps
--- /dev/null
+tasks:
+- install:
+- ceph:
+- kclient:
+- workunit:
+ clients:
+ all:
+ - suites/fsx.sh
--- /dev/null
+roles:
+- [mon.0, mds.a, osd.0]
+- [mon.1, osd.1]
+- [mon.2, osd.2]
+- [osd.3]
+- [osd.4]
+- [osd.5]
+- [osd.6]
+- [osd.7]
+- [osd.8]
+- [osd.9]
+- [osd.10]
+- [osd.11]
+- [osd.12]
+- [osd.13]
+- [osd.14]
+- [osd.15]
+- [client.0]
--- /dev/null
+roles:
+- [mon.0, mds.a, osd.0, osd.1, osd.2]
+- [mon.1, mon.2, client.0]
--- /dev/null
+roles:
+- [mon.0, mds.a, osd.0]
+- [mon.1, osd.1]
+- [mon.2, osd.2]
+- [osd.3]
+- [osd.4]
+- [osd.5]
+- [osd.6]
+- [osd.7]
+- [client.0]
--- /dev/null
+../../../../fs/btrfs.yaml
\ No newline at end of file
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ op_delay: 1
+ chance_down: 10
--- /dev/null
+tasks:
+- install:
+- ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+- thrashosds:
+ chance_down: 50
--- /dev/null
+tasks:
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - suites/bonnie.sh
--- /dev/null
+tasks:
+- ceph-fuse:
+- workunit:
+ clients:
+ all:
+ - suites/iozone.sh
--- /dev/null
+tasks:
+- radosbench:
+ clients: [client.0]
+ time: 1800
--- /dev/null
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ op_weights:
+ read: 45
+ write: 45
+ delete: 10
--- /dev/null
+../../../distros/supported/
\ No newline at end of file
--- /dev/null
+roles:
+ - [mon.0, client.0]
+tasks:
+ - install:
+ # branch has precedence over sha1
+ branch: hammer
+ sha1: e5b6eea91cc37434f78a987d2dd1d3edd4a23f3f # dumpling
+ - exec:
+ client.0:
+ - ceph --version | grep 'version 0.94'
--- /dev/null
+roles:
+ - [client.0]
+tasks:
+ - install:
+ tag: v0.94.1
+ - exec:
+ client.0:
+ - ceph --version | grep 'version 0.94.1'
+ - install.upgrade:
+ client.0:
+ tag: v0.94.3
+ - exec:
+ client.0:
+ - ceph --version | grep 'version 0.94.3'
--- /dev/null
+roles:
+ - [mon.0, client.0]
+tasks:
+ - install:
+ # tag has precedence over branch and sha1
+ tag: v0.94.1
+ branch: firefly
+ sha1: e5b6eea91cc37434f78a987d2dd1d3edd4a23f3f # dumpling
+ - exec:
+ client.0:
+ - ceph --version | grep 'version 0.94.1'
--- /dev/null
+roles:
+ - [mon.0, client.0]
--- /dev/null
+../../../distros/supported/
\ No newline at end of file
--- /dev/null
+tasks:
+ - install:
+ - tests:
--- /dev/null
+tasks:
+- teuthology_integration:
--- /dev/null
+roles:
+- - ceph.mon.a
+ - ceph.mon.b
+ - backup.osd.0
+ - backup.osd.1
+ - backup.osd.2
+ - backup.client.0
+- - backup.mon.a
+ - ceph.osd.0
+ - ceph.osd.1
+ - ceph.osd.2
+ - ceph.client.0
+ - client.1
+ - osd.3
+tasks:
+- install:
+- ceph:
+ cluster: backup
+- ceph:
+- workunit:
+ clients:
+ ceph.client.0: [true.sh]
+ backup.client.0: [true.sh]
--- /dev/null
+roles:
+- - backup.mon.a
+ - backup.mon.b
+ - backup.osd.0
+ - backup.osd.1
+ - backup.osd.2
+- - backup.mon.c
+ - backup.osd.3
+ - backup.osd.4
+ - backup.osd.5
+ - backup.client.0
+tasks:
+- install:
+- ceph:
+ cluster: backup
+- thrashosds:
+ cluster: backup
+- workunit:
+ clients:
+ all: [true.sh]
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - failed to encode map
+ conf:
+ mon:
+ mon warn on legacy crush tunables: false
+roles:
+- - ceph.mon.a
+ - ceph.mon.b
+ - backup.osd.0
+ - backup.osd.1
+ - backup.osd.2
+ - backup.client.0
+- - backup.mon.a
+ - ceph.osd.0
+ - ceph.osd.1
+ - ceph.osd.2
+ - ceph.client.0
+ - client.1
+ - osd.3
+tasks:
+- install:
+ branch: infernalis
+- ceph:
+ cluster: backup
+- ceph:
+- workunit:
+ clients:
+ backup.client.0: [true.sh]
+ ceph.client.0: [true.sh]
+- install.upgrade:
+ ceph.mon.a:
+ branch: jewel
+ backup.mon.a:
+ branch: jewel
+- ceph.restart: [ceph.mon.a, ceph.mon.b, ceph.osd.0, ceph.osd.1, ceph.osd.2, osd.3]
+- exec:
+ ceph.client.0:
+ - ceph --version | grep -F 'version 10.'
+ client.1:
+ - ceph --cluster backup --version | grep -F 'version 10.'
+ backup.client.0:
+ # cli upgraded
+ - ceph --cluster backup --id 0 --version | grep -F 'version 10.'
+ - ceph --version | grep -F 'version 10.'
+ # backup cluster mon not upgraded
+ - ceph --cluster backup --id 0 tell mon.a version | grep -F 'version 9.2.'
+ - ceph tell mon.a version | grep -F 'version 10.'
--- /dev/null
+roles:
+- - backup.mon.a
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+ - backup.client.0
+- - mon.a
+ - backup.osd.0
+ - backup.osd.1
+ - backup.osd.2
+ - client.1
+ - backup.client.1
+tasks:
+- install:
+- workunit:
+ clients:
+ all: [true.sh]
+- workunit:
+ clients:
+ backup.client.1: [true.sh]
--- /dev/null
+../../../../fs/xfs.yaml
\ No newline at end of file
--- /dev/null
+roles:
+ - [mon.0, client.0]
--- /dev/null
+tasks:
+ - tests:
--- /dev/null
+tasks:
+ - nop:
+
--- /dev/null
+../../../distros/supported/
\ No newline at end of file
--- /dev/null
+# this runs s3tests against rgw, using civetweb
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2, client.0]
+- [mon.b, osd.3, osd.4, osd.5, client.1]
+
+tasks:
+- install:
+ branch: master
+- ceph:
+- rgw: [client.0]
+- s3tests:
+ client.0:
+ rgw_server: client.0
+ force-branch: master
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ client:
+ debug rgw: 20
+ rgw lc debug interval: 10
+ rgw:
+ ec-data-pool: false
+ frontend: civetweb
--- /dev/null
+# this runs s3tests against rgw, using mod_fastcgi
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2, client.0]
+- [mon.b, osd.3, osd.4, osd.5, client.1]
+
+tasks:
+- install:
+ branch: master
+- ceph:
+- rgw: [client.0]
+- s3tests:
+ client.0:
+ rgw_server: client.0
+ force-branch: master
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ client:
+ debug rgw: 20
+ rgw lc debug interval: 10
+ rgw:
+ ec-data-pool: false
+ frontend: apache
--- /dev/null
+# this runs s3tests against rgw, using mod_proxy_fcgi
+# the choice between uds or tcp with mod_proxy_fcgi depends on the distro
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2, client.0]
+- [mon.b, osd.3, osd.4, osd.5, client.1]
+
+tasks:
+- install:
+ branch: master
+- ceph:
+- rgw: [client.0]
+- s3tests:
+ client.0:
+ rgw_server: client.0
+ force-branch: master
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ client:
+ debug rgw: 20
+ rgw lc debug interval: 10
+ rgw:
+ ec-data-pool: false
+ frontend: apache
+ use_fcgi: true
--- /dev/null
+roles:
+ - [client.0]
+tasks:
+- install:
+- workunit:
+ clients:
+ all:
+ - true.sh
--- /dev/null
+roles:
+- [mon.a, mon.c, osd.0, osd.1, osd.2]
+- [mon.b, mds.a, osd.3, osd.4, osd.5]
+- [client.0]
--- /dev/null
+overrides:
+ ceph:
+ fs: btrfs
+ conf:
+ osd:
+ osd op thread timeout: 60
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/blogbench.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/bonnie.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/dbench-short.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/dbench.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/ffsb.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/fio.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/fsstress.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/fsx.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/fsync-tester.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/iogen.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/iozone-sync.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/iozone.sh
--- /dev/null
+tasks:
+- install:
+- ceph:
+- tgt:
+- iscsi:
+- workunit:
+ clients:
+ all:
+ - suites/pjd.sh
--- /dev/null
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+- - mon.b
+ - mon.c
+ - osd.2
+ - osd.3
+- - client.0
+overrides:
+ ceph:
+ log-whitelist:
+ - failed to encode map
+ fs: xfs
--- /dev/null
+tasks:
+- install:
+ branch: hammer
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install hammer"
+upgrade_workload:
+ sequential:
+ - install.upgrade:
+ exclude_packages: ['ceph-test-dbg']
+ client.0:
+ - print: "**** done install.upgrade client.0"
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default features: 13
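+ # 13 = layering(1) + exclusive-lock(4) + object-map(8)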
+tasks:
+- exec:
+ client.0:
+ - "cp $(which ceph_test_librbd_api) $TESTDIR/ceph_test_librbd_api"
+- sequential:
+ - upgrade_workload
+- ceph:
+- print: "**** done ceph"
+- exec:
+ client.0:
+ - "cp --force $TESTDIR/ceph_test_librbd_api $(which ceph_test_librbd_api)"
+ - "rm -rf $TESTDIR/ceph_test_librbd_api"
+- print: "**** done reverting to hammer ceph_test_librbd_api"
+- workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - rbd/test_librbd_api.sh
+ env:
+ RBD_FEATURES: "13"
+- print: "**** done rbd/test_librbd_api.sh"
--- /dev/null
+tasks:
+- sequential:
+ - upgrade_workload
+- ceph:
+- print: "**** done ceph"
+- workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --image-feature layering,exclusive-lock,object-map
+- print: "**** done rbd/import_export.sh"
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+- - client.1
+overrides:
+ ceph:
+ log-whitelist:
+ - failed to encode map
+ fs: xfs
+ conf:
+ client:
+ rbd default features: 1
--- /dev/null
+tasks:
+- install:
+ branch: hammer
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install hammer"
+- install.upgrade:
+ exclude_packages: ['ceph-test-dbg']
+ client.1:
+- print: "**** done install.upgrade client.1"
+- ceph:
+- print: "**** done ceph"
--- /dev/null
+tasks:
+- workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - rbd/notify_master.sh
+ client.1:
+ - rbd/notify_slave.sh
+ env:
+ RBD_FEATURES: "13"
+- print: "**** done rbd: old librbd -> new librbd"
+- workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - rbd/notify_slave.sh
+ client.1:
+ - rbd/notify_master.sh
+ env:
+ RBD_FEATURES: "13"
+- print: "**** done rbd: new librbd -> old librbd"
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+- - client.0
+overrides:
+ ceph:
+ log-whitelist:
+ - failed to encode map
+ fs: xfs
--- /dev/null
+tasks:
+- install:
+ branch: jewel
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install jewel"
+upgrade_workload:
+ sequential:
+ - install.upgrade:
+ exclude_packages: ['ceph-test', 'ceph-test-dbg']
+ client.0:
+ - print: "**** done install.upgrade to -x on client.0"
--- /dev/null
+tasks:
+- exec:
+ client.0:
+ - "cp $(which ceph_test_librbd_api) $TESTDIR/ceph_test_librbd_api"
+- sequential:
+ - upgrade_workload
+- ceph:
+- print: "**** done ceph"
+- exec:
+ client.0:
+ - "cp --force $TESTDIR/ceph_test_librbd_api $(which ceph_test_librbd_api)"
+ - "rm -rf $TESTDIR/ceph_test_librbd_api"
+- print: "**** done reverting to jewel ceph_test_librbd_api"
+- workunit:
+ branch: kraken
+ clients:
+ client.0:
+ - rbd/test_librbd_api.sh
+ env:
+ RBD_FEATURES: "13"
+- print: "**** done rbd/test_librbd_api.sh"
--- /dev/null
+tasks:
+- sequential:
+ - upgrade_workload
+- ceph:
+- print: "**** done ceph"
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --image-feature layering,exclusive-lock,object-map
+- print: "**** done rbd/import_export.sh"
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+- - client.1
+overrides:
+ ceph:
+ log-whitelist:
+ - failed to encode map
+ fs: xfs
+ conf:
+ client:
+ rbd default features: 1
+
--- /dev/null
+tasks:
+- install:
+ branch: jewel
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install jewel"
+- install.upgrade:
+ exclude_packages: ['ceph-test', 'ceph-test-dbg']
+ client.1:
+- print: "**** done install.upgrade to -x on client.1"
+- ceph:
+- print: "**** done ceph task"
--- /dev/null
+tasks:
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/notify_master.sh
+ client.1:
+ - rbd/notify_slave.sh
+ env:
+ RBD_FEATURES: "13"
+- print: "**** done rbd: old librbd -> new librbd"
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/notify_slave.sh
+ client.1:
+ - rbd/notify_master.sh
+ env:
+ RBD_FEATURES: "13"
+- print: "**** done rbd: new librbd -> old librbd"
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon warn on legacy crush tunables: false
+ mon debug unsafe allow tier with nonempty snaps: true
+ log-whitelist:
+ - wrongly marked me down
+ - reached quota
+roles:
+- - mon.a
+ - osd.0
+ - osd.1
+- - mon.b
+ - mon.c
+ - osd.2
+ - osd.3
+- - client.0
+ - client.1
--- /dev/null
+tasks:
+- install:
+ branch: hammer
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done hammer"
+- ceph:
+ fs: xfs
+- install.upgrade:
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ osd.0:
+ branch: jewel
+ osd.2:
+ branch: jewel
+- print: "**** osd.0 and osd.2 upgraded packages to jewel"
+- parallel:
+ - workload
+ - upgrade-sequence
+- print: "**** done parallel"
--- /dev/null
+workload:
+ full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec task"
--- /dev/null
+workload:
+ full_sequential:
+ - workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - cls
+ - print: "**** done cls 2-workload"
--- /dev/null
+workload:
+ full_sequential:
+ - workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - rados/load-gen-big.sh
+ - print: "**** done rados/load-gen-big.sh 2-workload"
--- /dev/null
+workload:
+ full_sequential:
+ - workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ - print: "**** done rbd/test_librbd.sh 2-workload"
--- /dev/null
+workload:
+ full_sequential:
+ - workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ - print: "**** done rbd/test_librbd_python.sh 2-workload"
--- /dev/null
+upgrade-sequence:
+ sequential:
+ - ceph.restart:
+ daemons: [osd.0, osd.1, osd.2, osd.3]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - ceph.restart:
+ daemons: [mon.a, mon.b, mon.c]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - print: "**** done ceph.restart do not wait for healthy"
+ - exec:
+ mon.a:
+ - sleep 300 # http://tracker.ceph.com/issues/17808
+ - ceph osd set require_jewel_osds
+ - ceph.healthy:
+ - print: "**** done ceph.healthy"
--- /dev/null
+upgrade-sequence:
+ sequential:
+ - ceph.restart:
+ daemons: [osd.0, osd.1]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart:
+ daemons: [osd.2, osd.3]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart:
+ daemons: [mon.a]
+ wait-for-healthy: false
+ - sleep:
+ duration: 60
+ - print: "**** running mixed versions of osds and mons"
+# do we need to use "ceph osd crush tunables hammer"?
+ - exec:
+ mon.b:
+ - sudo ceph osd crush tunables hammer
+ - print: "**** done ceph osd crush tunables hammer"
+ - ceph.restart:
+ daemons: [mon.b, mon.c]
+ wait-for-healthy: false
+ - sleep:
+ duration: 30
+ - exec:
+ osd.0:
+ - sleep 300 # http://tracker.ceph.com/issues/17808
+ - ceph osd set require_jewel_osds
+ - ceph.healthy:
+ - sleep:
+ duration: 60
--- /dev/null
+tasks:
+- install.upgrade:
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ client.0:
+ branch: jewel
--- /dev/null
+../../../../releases/jewel.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+ - install.upgrade:
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ client.0:
+ branch: jewel
+ - print: "**** done install.upgrade client.0 to jewel"
+ - install.upgrade:
+ osd.0:
+ osd.2:
+ - print: "**** done install.upgrade daemons to x"
+ - parallel:
+ - workload2
+ - upgrade-sequence2
+ - print: "**** done parallel workload2 and upgrade-sequence2"
--- /dev/null
+meta:
+- desc: |
+ run randomized correctness test for rados operations
+ on an erasure-coded pool
+workload2:
+ full_sequential:
+ - rados:
+ erasure_code_profile:
+ name: teuthologyprofile2
+ k: 2
+ m: 1
+ ruleset-failure-domain: osd
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec task"
--- /dev/null
+meta:
+- desc: |
+ object class functional tests
+workload2:
+ full_sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - cls
+ - print: "**** done cls 2-workload"
--- /dev/null
+meta:
+- desc: |
+ generate read/write load with rados objects ranging from 1MB to 25MB
+workload2:
+ full_sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rados/load-gen-big.sh
+ - print: "**** done rados/load-gen-big.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd C and C++ api tests
+workload2:
+ full_sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ - print: "**** done rbd/test_librbd.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd python api tests
+workload2:
+ full_sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ - print: "**** done rbd/test_librbd_python.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ upgrade the ceph cluster
+upgrade-sequence2:
+ sequential:
+ - ceph.restart:
+ daemons: [mon.a, mon.b, mon.c, osd.0, osd.1, osd.2, osd.3]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - exec:
+ mon.a:
+ - ceph osd set require_kraken_osds
+ - ceph.restart:
+ daemons: [osd.0]
+ - print: "**** done ceph.restart all"
--- /dev/null
+meta:
+- desc: |
+ upgrade the ceph cluster,
+ upgrade in two steps
+ step one ordering: mon.a, osd.0, osd.1
+ step two ordering: mon.b, mon.c, osd.2, osd.3
+ ceph is expected to be in a healthy state after each step
+upgrade-sequence2:
+ sequential:
+ - ceph.restart:
+ daemons: [mon.a]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart:
+ daemons: [osd.0, osd.1]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - print: "**** running mixed versions of osds and mons"
+ - exec:
+ mon.b:
+ - sudo ceph osd crush tunables jewel
+ - print: "**** done ceph osd crush tunables jewel"
+ - ceph.restart:
+ daemons: [mon.b, mon.c]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart:
+ daemons: [osd.2, osd.3]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - exec:
+ mon.a:
+ - ceph osd set require_kraken_osds
+ - ceph.restart: [osd.3]
+ - sleep:
+ duration: 60
--- /dev/null
+../../../../releases/kraken.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- rados:
+ clients: [client.1]
+ ops: 4000
+ objects: 50
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+- print: "**** done 7-final-workload/rados-snaps-few-objects.yaml"
--- /dev/null
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rados/load-gen-mix.sh
+ - print: "**** done 7-final-workload/rados_loadgenmix.yaml"
--- /dev/null
+tasks:
+ - sequential:
+ - mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
+ - workunit:
+ branch: jewel
+ clients:
+ client.1:
+ - rados/test-upgrade-v11.0.0.sh
+ - print: "**** done rados/test-upgrade-v11.0.0.sh from 7-final-workload"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.1:
+ - cls/test_cls_rbd.sh
+- print: "**** done 7-final-workload/rbd_cls.yaml"
--- /dev/null
+tasks:
+- workunit:
+ clients:
+ client.1:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+- print: "**** done rbd/import_export.sh from 7-final-workload"
--- /dev/null
+tasks:
+- rgw: [client.1]
+- s3tests:
+ client.1:
+ rgw_server: client.1
+- print: "**** done rgw_server from 7-final-workload"
+overrides:
+ ceph:
+ conf:
+ client:
+ rgw lc debug interval: 10
--- /dev/null
+os_type: centos
+os_version: "7.2"
--- /dev/null
+os_type: ubuntu
+os_version: "14.04"
--- /dev/null
+../../jewel-x/stress-split/0-cluster
\ No newline at end of file
--- /dev/null
+tasks:
+- install:
+ branch: hammer
+ exclude_packages:
+ - ceph-mgr
+ - libcephfs2
+ - libcephfs-devel
+ - libcephfs-dev
+- print: '**** done hammer'
+- ceph:
+ fs: xfs
+- install.upgrade:
+ exclude_packages:
+ - ceph-mgr
+ - libcephfs2
+ - libcephfs-devel
+ - libcephfs-dev
+ osd.0:
+ branch: jewel
+ osd.3:
+ branch: jewel
+- print: '**** osd.0 and osd.3 upgraded packages to jewel'
+- parallel:
+ - workload-h-j
+ - upgrade-sequence-h-j
+- print: '**** done parallel'
+- install.upgrade:
+ client.0:
+ branch: jewel
+ exclude_packages:
+ - ceph-mgr
+ - libcephfs2
+ - libcephfs-devel
+ - libcephfs-dev
+- exec:
+ osd.0:
+ - ceph osd set sortbitwise
+ - ceph osd set require_jewel_osds
+ - for p in `ceph osd pool ls` ; do ceph osd pool set $p use_gmt_hitset true ;
+ done
+- install.upgrade:
+ client.0:
+ branch: jewel
+ exclude_packages:
+ - ceph-mgr
+ - libcephfs2
+ - libcephfs-devel
+ - libcephfs-dev
+- print: '**** done install.upgrade client.0 to jewel'
+upgrade-sequence-h-j:
+ sequential:
+ - ceph.restart:
+ daemons:
+ - osd.0
+ - osd.1
+ - osd.2
+ - osd.3
+ - osd.4
+ - osd.5
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - ceph.restart:
+ daemons:
+ - mon.a
+ - mon.b
+ - mon.c
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - print: '**** done ceph.restart do not wait for healthy'
+ - exec:
+ mon.a:
+ - sleep 300
+ - ceph osd set require_jewel_osds
+ - ceph.healthy: null
+ - print: '**** done ceph.healthy'
+workload-h-j:
+ full_sequential:
+ - workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - cls
+ - print: "**** done cls 2-workload"
+ - workunit:
+ branch: hammer
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ - print: "**** done rbd/test_librbd.sh 2-workload"
--- /dev/null
+../../jewel-x/stress-split/2-partial-upgrade/
\ No newline at end of file
--- /dev/null
+../../jewel-x/stress-split/3-thrash/
\ No newline at end of file
--- /dev/null
+../../jewel-x/stress-split/4-mon/
\ No newline at end of file
--- /dev/null
+../../jewel-x/stress-split/5-workload/
\ No newline at end of file
--- /dev/null
+../../jewel-x/stress-split/6-next-mon/
\ No newline at end of file
--- /dev/null
+../../jewel-x/stress-split/7-workload/
\ No newline at end of file
--- /dev/null
+../../jewel-x/stress-split/8-next-mon/
\ No newline at end of file
--- /dev/null
+../../jewel-x/stress-split/9-workload/
\ No newline at end of file
--- /dev/null
+os_type: centos
+os_version: "7.2"
--- /dev/null
+os_type: ubuntu
+os_version: "14.04"
--- /dev/null
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+meta:
+- desc: |
+ Run ceph on two nodes,
+ with clients 0,1,2,3 on a separate third node.
+ Use xfs beneath the osds.
+ CephFS tests run on clients 2 and 3
+roles:
+- - mon.a
+ - mds.a
+ - osd.0
+ - osd.1
+- - mon.b
+ - mon.c
+ - osd.2
+ - osd.3
+- - client.0
+ - client.1
+ - client.2
+ - client.3
+overrides:
+ ceph:
+ log-whitelist:
+ - scrub mismatch
+ - ScrubResult
+ - wrongly marked
+ conf:
+ fs: xfs
--- /dev/null
+meta:
+- desc: |
+ install ceph/jewel latest
+ run workload and upgrade-sequence in parallel
+ upgrade the client node
+tasks:
+- install:
+ branch: jewel
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done installing jewel"
+- ceph:
+- print: "**** done ceph"
+- install.upgrade:
+ mon.a:
+ mon.b:
+- print: "**** done install.upgrade mon.a and mon.b"
+- parallel:
+ - workload
+ - upgrade-sequence
+- print: "**** done parallel"
+- install.upgrade:
+ client.0:
+- print: "**** done install.upgrade on client.0"
--- /dev/null
+meta:
+- desc: |
+ run a cephfs stress test
+ mount ceph-fuse on client.2 before running workunit
+workload:
+ full_sequential:
+ - sequential:
+ - ceph-fuse:
+ - print: "**** done ceph-fuse 2-workload"
+ - workunit:
+ clients:
+ client.2:
+ - suites/blogbench.sh
+ - print: "**** done suites/blogbench.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ run randomized correctness test for rados operations
+ on an erasure-coded pool
+workload:
+ full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec task"
--- /dev/null
+meta:
+- desc: |
+ object class functional tests
+workload:
+ full_sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - cls
+ - print: "**** done cls 2-workload"
--- /dev/null
+meta:
+- desc: |
+ generate read/write load with rados objects ranging from 1MB to 25MB
+workload:
+ full_sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rados/load-gen-big.sh
+ - print: "**** done rados/load-gen-big.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd C and C++ api tests
+workload:
+ full_sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+ - print: "**** done rbd/test_librbd.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd python api tests
+workload:
+ full_sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+ - print: "**** done rbd/test_librbd_python.sh 2-workload"
--- /dev/null
+meta:
+- desc: |
+ upgrade the ceph cluster
+upgrade-sequence:
+ sequential:
+ - ceph.restart:
+ daemons: [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1, osd.2, osd.3]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - exec:
+ mon.a:
+ - ceph osd set require_kraken_osds
+ - ceph.restart:
+ daemons: [osd.0]
+ - print: "**** done ceph.restart all"
--- /dev/null
+meta:
+- desc: |
+ upgrade the ceph cluster,
+ upgrade in two steps
+ step one ordering: mon.a, osd.0, osd.1, mds.a
+ step two ordering: mon.b, mon.c, osd.2, osd.3
+ ceph is expected to be in a healthy state after each step
+upgrade-sequence:
+ sequential:
+ - ceph.restart:
+ daemons: [mon.a]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart:
+ daemons: [osd.0, osd.1]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart: [mds.a]
+ - sleep:
+ duration: 60
+ - print: "**** running mixed versions of osds and mons"
+ - exec:
+ mon.b:
+ - sudo ceph osd crush tunables jewel
+ - print: "**** done ceph osd crush tunables jewel"
+ - ceph.restart:
+ daemons: [mon.b, mon.c]
+ wait-for-healthy: true
+ - sleep:
+ duration: 60
+ - ceph.restart:
+ daemons: [osd.2, osd.3]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - exec:
+ mon.a:
+ - ceph osd set require_kraken_osds
+ - ceph.restart: [osd.3]
+ - sleep:
+ duration: 60
--- /dev/null
+../../../../releases/kraken.yaml
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ run a cephfs stress test
+ mount ceph-fuse on client.3 before running workunit
+tasks:
+- sequential:
+ - ceph-fuse:
+ - print: "**** done ceph-fuse 5-final-workload"
+ - workunit:
+ clients:
+ client.3:
+ - suites/blogbench.sh
+ - print: "**** done suites/blogbench.sh 5-final-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshots
+tasks:
+ - rados:
+ clients: [client.1]
+ ops: 4000
+ objects: 50
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ - print: "**** done rados 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ generate read/write load with rados objects ranging from 1 byte to 1MB
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rados/load-gen-mix.sh
+ - print: "**** done rados/load-gen-mix.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ librados C and C++ api tests
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+tasks:
+ - mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
+ - print: "**** done mon_thrash 4-final-workload"
+ - workunit:
+ branch: jewel
+ clients:
+ client.1:
+ - rados/test-upgrade-v11.0.0.sh
+ - print: "**** done rados/test-upgrade-v11.0.0.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ rbd object class functional tests
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - cls/test_cls_rbd.sh
+ - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+ - print: "**** done rbd/import_export.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ swift api tests for rgw
+overrides:
+ rgw:
+ frontend: civetweb
+tasks:
+ - rgw: [client.1]
+ - print: "**** done rgw 4-final-workload"
+ - swift:
+ client.1:
+ rgw_server: client.1
+ - print: "**** done swift 4-final-workload"
--- /dev/null
+../../../../distros/supported/
\ No newline at end of file
--- /dev/null
+../../../../releases/kraken.yaml
\ No newline at end of file
--- /dev/null
+../../../../../distros/all/centos_7.2.yaml
\ No newline at end of file
--- /dev/null
+../../../../../distros/all/ubuntu_14.04.yaml
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ Run ceph on two nodes, using one of them as a client,
+ with a separate client-only node.
+ Use xfs beneath the osds.
+ install ceph/jewel v10.2.0 point version
+ run workload and upgrade-sequence in parallel
+ install ceph/jewel latest version
+ run workload and upgrade-sequence in parallel
+ install ceph/-x version (jewel or kraken)
+ run workload and upgrade-sequence in parallel
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+ - scrub
+ - osd_map_max_advance
+ - wrongly marked
+ fs: xfs
+ conf:
+ mon:
+ mon debug unsafe allow tier with nonempty snaps: true
+ osd:
+ osd map max advance: 1000
+roles:
+- - mon.a
+ - mds.a
+ - osd.0
+ - osd.1
+ - osd.2
+- - mon.b
+ - mon.c
+ - osd.3
+ - osd.4
+ - osd.5
+ - client.0
+- - client.1
+openstack:
+- volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
+tasks:
+- print: "**** v10.2.0 about to install"
+- install:
+ tag: v10.2.0
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
+- print: "**** done v10.2.0 install"
+- ceph:
+ fs: xfs
+- print: "**** done ceph xfs"
+- sequential:
+ - workload
+- print: "**** done workload v10.2.0"
+- install.upgrade:
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ branch: jewel
+ mon.b:
+ branch: jewel
+ # Note that client.1 IS NOT upgraded at this point
+ #client.1:
+ #branch: jewel
+- parallel:
+ - workload_jewel
+ - upgrade-sequence_jewel
+- print: "**** done parallel jewel branch"
+- install.upgrade:
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ client.1:
+ branch: jewel
+- print: "**** done branch: jewel install.upgrade on client.1"
+- install.upgrade:
+ mon.a:
+ mon.b:
+- print: "**** done branch: -x install.upgrade on mon.a and mon.b"
+- parallel:
+ - workload_x
+ - upgrade-sequence_x
+- print: "**** done parallel -x branch"
+# Run librados tests on the -x upgraded cluster
+- install.upgrade:
+ client.1:
+- workunit:
+ branch: jewel
+ clients:
+ client.1:
+ - rados/test-upgrade-v11.0.0.sh
+ - cls
+- print: "**** done final test on -x cluster"
+#######################
+workload:
+ sequential:
+ - workunit:
+ clients:
+ client.0:
+ - suites/blogbench.sh
+workload_jewel:
+ full_sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.1:
+ - rados/test.sh
+ - cls
+ env:
+ CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_image'
+ - print: "**** done rados/test.sh & cls workload_jewel"
+ - sequential:
+ - rgw: [client.0]
+ - print: "**** done rgw workload_jewel"
+ - s3tests:
+ client.0:
+ force-branch: ceph-jewel
+ rgw_server: client.0
+ - print: "**** done s3tests workload_jewel"
+upgrade-sequence_jewel:
+ sequential:
+ - print: "**** done branch: jewel install.upgrade"
+ - ceph.restart: [mds.a]
+ - sleep:
+ duration: 60
+ - ceph.restart: [osd.0]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.1]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.2]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.3]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.4]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.5]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.a]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.b]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.c]
+ - sleep:
+ duration: 60
+ - print: "**** done ceph.restart all jewel branch mds/osd/mon"
+workload_x:
+ sequential:
+ - workunit:
+ branch: jewel
+ clients:
+ client.1:
+ - rados/test-upgrade-v11.0.0.sh
+ - cls
+ env:
+ CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_image'
+ - print: "**** done rados/test-upgrade-v11.0.0.sh & cls workload_x NOT upgraded client"
+ - workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rados/test-upgrade-v11.0.0.sh
+ - cls
+ - print: "**** done rados/test-upgrade-v11.0.0.sh & cls workload_x upgraded client"
+ - rgw: [client.1]
+ - print: "**** done rgw workload_x"
+ - s3tests:
+ client.1:
+ force-branch: ceph-jewel
+ rgw_server: client.1
+ - print: "**** done s3tests workload_x"
+upgrade-sequence_x:
+ sequential:
+ - ceph.restart: [mds.a]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.a]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.b]
+ - sleep:
+ duration: 60
+ - ceph.restart: [mon.c]
+ - sleep:
+ duration: 60
+ - ceph.restart: [osd.0]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.1]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.2]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.3]
+ - sleep:
+ duration: 30
+ - ceph.restart: [osd.4]
+ - sleep:
+ duration: 30
+ - ceph.restart:
+ daemons: [osd.5]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+ - exec:
+ mon.a:
+ - ceph osd set require_kraken_osds
+ - sleep:
+ duration: 60
+ - print: "**** done ceph.restart all -x branch mds/osd/mon"
--- /dev/null
+../stress-split/0-cluster/
\ No newline at end of file
--- /dev/null
+arch: x86_64
--- /dev/null
+../stress-split/1-jewel-install/
\ No newline at end of file
--- /dev/null
+../stress-split/2-partial-upgrade/
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ randomly kill and revive osd
+ small chance of increasing the number of pgs
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - log bound mismatch
+tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ min_in: 4
+- print: "**** done thrashosds 3-thrash"
--- /dev/null
+../stress-split/4-mon/
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on an erasure coded pool
+tasks:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec task"
--- /dev/null
+../stress-split/6-next-mon/
\ No newline at end of file
--- /dev/null
+../stress-split/8-next-mon/
\ No newline at end of file
--- /dev/null
+#
+# k=3 implies a stripe_width of 1376*3 = 4128, which is different from
+# the default value of 4096. It is also not a multiple of 1024*1024 and
+# creates situations where rounding rules become necessary during
+# recovery.
+#
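+#
+# A rough sketch of the arithmetic behind these numbers (the chunk
+# alignment value is an assumption, not something stated by this suite):
+#   default stripe_width        = 4096
+#   raw per-chunk share for k=3 = 4096 / 3 ~= 1365.33
+#   aligned chunk size          = 1376
+#   resulting stripe_width      = 3 * 1376 = 4128
+#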
+meta:
+- desc: |
+ randomized correctness test for rados operations on an erasure coded pool
+ using the jerasure plugin with k=3 and m=1
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ erasure_code_profile:
+ name: jerasure31profile
+ plugin: jerasure
+ k: 3
+ m: 1
+ technique: reed_sol_van
+ ruleset-failure-domain: osd
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+../stress-split/0-cluster/
\ No newline at end of file
--- /dev/null
+../stress-split/1-jewel-install/
\ No newline at end of file
--- /dev/null
+../stress-split/2-partial-upgrade/
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ randomly kill and revive osd
+ small chance to increase the number of pgs
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - log bound mismatch
+tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ min_in: 4
+- print: "**** done thrashosds 3-thrash"
--- /dev/null
+../stress-split/4-mon/
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on an erasure coded pool
+tasks:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec task"
--- /dev/null
+../stress-split/6-next-mon/
\ No newline at end of file
--- /dev/null
+../stress-split/8-next-mon/
\ No newline at end of file
--- /dev/null
+#
+# k=3 implies a stripe_width of 1376*3 = 4128, which is different from
+# the default value of 4096. It is also not a multiple of 1024*1024 and
+# creates situations where rounding rules become necessary during
+# recovery.
+#
+meta:
+- desc: |
+ randomized correctness test for rados operations on an erasure coded pool
+ using the jerasure plugin with k=3 and m=1
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ erasure_code_profile:
+ name: jerasure31profile
+ plugin: jerasure
+ k: 3
+ m: 1
+ technique: reed_sol_van
+ ruleset-failure-domain: osd
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
--- /dev/null
+../../../../distros/supported/
\ No newline at end of file
--- /dev/null
+openstack:
+ - machine:
+ disk: 100 # GB
+ - volumes: # attached to each instance
+ count: 3
+ size: 30 # GB
--- /dev/null
+meta:
+- desc: |
+ Run ceph on two nodes,
+ with a separate client-only node.
+ Use xfs beneath the osds.
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon warn on legacy crush tunables: false
+ fs: xfs
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+- - osd.3
+ - osd.4
+ - osd.5
+- - client.0
--- /dev/null
+meta:
+- desc: install ceph/jewel latest
+tasks:
+- install:
+ branch: jewel
+ exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+- print: "**** done install jewel"
+- ceph:
+- print: "**** done ceph"
--- /dev/null
+meta:
+- desc: |
+ install upgrade ceph/-x on one node only
+ 1st half
+ restart: osd.0,1,2,3,4,5
+tasks:
+- install.upgrade:
+ osd.0:
+- print: "**** done install.upgrade osd.0"
+- ceph.restart:
+ daemons: [osd.0, osd.1, osd.2, osd.3, osd.4, osd.5]
+- print: "**** done ceph.restart 1st half"
--- /dev/null
+meta:
+- desc: |
+ randomly kill and revive osd
+ small chance to increase the number of pgs
+overrides:
+ ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - log bound mismatch
+tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+- print: "**** done thrashosds 3-thrash"
--- /dev/null
+meta:
+- desc: |
+ restart mon.a so it is upgraded to -x
+tasks:
+- ceph.restart:
+ daemons: [mon.a]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+- print: "**** done ceph.restart mon.a"
--- /dev/null
+meta:
+- desc: |
+ run basic cls tests for rbd
+tasks:
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - cls/test_cls_rbd.sh
+- print: "**** done cls/test_cls_rbd.sh 5-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+tasks:
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+- print: "**** done rbd/import_export.sh 5-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool,
+ using only reads, writes, and deletes
+tasks:
+- full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_append_excl: false
+ op_weights:
+ read: 45
+ write: 45
+ delete: 10
+- print: "**** done rados/readwrite 5-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshot operations
+tasks:
+- full_sequential:
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+- print: "**** done rados/snaps-few-objects 5-workload"
--- /dev/null
+meta:
+- desc: |
+ restart mon.b so it is upgraded to -x
+tasks:
+- ceph.restart:
+ daemons: [mon.b]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+- print: "**** done ceph.restart mon.b 6-next-mon"
--- /dev/null
+meta:
+- desc: |
+ run randomized correctness test for rados operations
+ generate write load with rados bench
+tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+ - radosbench:
+ clients: [client.0]
+ time: 150
+- print: "**** done radosbench 7-workload"
--- /dev/null
+meta:
+- desc: |
+ librbd C and C++ api tests
+tasks:
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/test_librbd.sh
+- print: "**** done rbd/test_librbd.sh 7-workload"
--- /dev/null
+meta:
+- desc: |
+ restart mon.c so it is upgraded to -x
+ as all mons have been upgraded, the ceph cluster is expected to reach quorum
+tasks:
+- ceph.restart:
+ daemons: [mon.c]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+- print: "**** done ceph.restart mon.c 8-next-mon"
+- ceph.wait_for_mon_quorum: [a, b, c]
+- print: "**** done wait_for_mon_quorum 8-next-mon"
--- /dev/null
+meta:
+- desc: |
+ librbd python api tests
+tasks:
+- workunit:
+ branch: jewel
+ clients:
+ client.0:
+ - rbd/test_librbd_python.sh
+- print: "**** done rbd/test_librbd_python.sh 9-workload"
--- /dev/null
+meta:
+- desc: |
+ swift api tests for rgw
+tasks:
+- rgw:
+ client.0:
+ default_idle_timeout: 300
+- print: "**** done rgw 9-workload"
+- swift:
+ client.0:
+ rgw_server: client.0
+- print: "**** done swift 9-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshot operations
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
--- /dev/null
+../../../../distros/supported/
\ No newline at end of file
--- /dev/null
+import logging
+
+# Inherit teuthology's log level
+teuthology_log = logging.getLogger('teuthology')
+log = logging.getLogger(__name__)
+log.setLevel(teuthology_log.level)
--- /dev/null
+"""
+Admin Socket task -- used in rados, powercycle, and smoke testing
+"""
+from cStringIO import StringIO
+
+import json
+import logging
+import os
+import time
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from teuthology.parallel import parallel
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+ """
+ Run an admin socket command, make sure the output is json, and run
+ a test program on it. The test program should read json from
+ stdin. This task succeeds if the test program exits with status 0.
+
+ To run the same test on all clients::
+
+ tasks:
+ - ceph:
+ - rados:
+ - admin_socket:
+ all:
+ dump_requests:
+ test: http://example.com/script
+
+ To restrict it to certain clients::
+
+ tasks:
+ - ceph:
+ - rados: [client.1]
+ - admin_socket:
+ client.1:
+ dump_requests:
+ test: http://example.com/script
+
+ If an admin socket command has arguments, they can be specified as
+ a list::
+
+ tasks:
+ - ceph:
+ - rados: [client.0]
+ - admin_socket:
+ client.0:
+ dump_requests:
+ test: http://example.com/script
+ help:
+ test: http://example.com/test_help_version
+ args: [version]
+
+ Note that there must be a ceph client with an admin socket running
+ before this task is run. The tests are parallelized at the client
+ level. Tests for a single client are run serially.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ assert isinstance(config, dict), \
+ 'admin_socket task requires a dict for configuration'
+ teuthology.replace_all_with_clients(ctx.cluster, config)
+
+ with parallel() as ptask:
+ for client, tests in config.iteritems():
+ ptask.spawn(_run_tests, ctx, client, tests)
+
+
+def _socket_command(ctx, remote, socket_path, command, args):
+ """
+ Run an admin socket command and return the result as a string.
+
+ :param ctx: Context
+ :param remote: Remote site
+ :param socket_path: path to socket
+ :param command: command to be run remotely
+ :param args: command arguments
+
+ :returns: decoded json output of the command
+ """
+ json_fp = StringIO()
+ testdir = teuthology.get_testdir(ctx)
+ max_tries = 120
+ while True:
+ proc = remote.run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'ceph',
+ '--admin-daemon', socket_path,
+ ] + command.split(' ') + args,
+ stdout=json_fp,
+ check_status=False,
+ )
+ if proc.exitstatus == 0:
+ break
+ assert max_tries > 0
+ max_tries -= 1
+ log.info('ceph cli returned an error, command not registered yet?')
+ log.info('sleeping and retrying ...')
+ time.sleep(1)
+ out = json_fp.getvalue()
+ json_fp.close()
+ log.debug('admin socket command %s returned %s', command, out)
+ return json.loads(out)
+
+def _run_tests(ctx, client, tests):
+ """
+ Create a temp directory and wait for a client socket to be created.
+ For each test, copy the executable locally and run the test.
+ Remove temp directory when finished.
+
+ :param ctx: Context
+ :param client: client machine to run the test
+ :param tests: list of tests to run
+ """
+ testdir = teuthology.get_testdir(ctx)
+ log.debug('Running admin socket tests on %s', client)
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+ socket_path = '/var/run/ceph/ceph-{name}.asok'.format(name=client)
+ overrides = ctx.config.get('overrides', {}).get('admin_socket', {})
+
+ try:
+ tmp_dir = os.path.join(
+ testdir,
+ 'admin_socket_{client}'.format(client=client),
+ )
+ remote.run(
+ args=[
+ 'mkdir',
+ '--',
+ tmp_dir,
+ run.Raw('&&'),
+ # wait for client process to create the socket
+ 'while', 'test', '!', '-e', socket_path, run.Raw(';'),
+ 'do', 'sleep', '1', run.Raw(';'), 'done',
+ ],
+ )
+
+ for command, config in tests.iteritems():
+ if config is None:
+ config = {}
+ teuthology.deep_merge(config, overrides)
+ log.debug('Testing %s with config %s', command, str(config))
+
+ test_path = None
+ if 'test' in config:
+ url = config['test'].format(
+ branch=config.get('branch', 'master')
+ )
+ test_path = os.path.join(tmp_dir, command)
+ remote.run(
+ args=[
+ 'wget',
+ '-q',
+ '-O',
+ test_path,
+ '--',
+ url,
+ run.Raw('&&'),
+ 'chmod',
+ 'u=rx',
+ '--',
+ test_path,
+ ],
+ )
+
+ args = config.get('args', [])
+ assert isinstance(args, list), \
+ 'admin socket command args must be a list'
+ sock_out = _socket_command(ctx, remote, socket_path, command, args)
+ if test_path is not None:
+ remote.run(
+ args=[
+ test_path,
+ ],
+ stdin=json.dumps(sock_out),
+ )
+
+ finally:
+ remote.run(
+ args=[
+ 'rm', '-rf', '--', tmp_dir,
+ ],
+ )
--- /dev/null
+<IfModule !version_module>
+ LoadModule version_module {mod_path}/mod_version.so
+</IfModule>
+<IfModule !env_module>
+ LoadModule env_module {mod_path}/mod_env.so
+</IfModule>
+<IfModule !rewrite_module>
+ LoadModule rewrite_module {mod_path}/mod_rewrite.so
+</IfModule>
+<IfModule !log_config_module>
+ LoadModule log_config_module {mod_path}/mod_log_config.so
+</IfModule>
+
+Listen {port}
+ServerName {host}
+
+<IfVersion >= 2.4>
+ <IfModule !unixd_module>
+ LoadModule unixd_module {mod_path}/mod_unixd.so
+ </IfModule>
+ <IfModule !authz_core_module>
+ LoadModule authz_core_module {mod_path}/mod_authz_core.so
+ </IfModule>
+ <IfModule !mpm_worker_module>
+ LoadModule mpm_worker_module {mod_path}/mod_mpm_worker.so
+ </IfModule>
+ User {user}
+ Group {group}
+</IfVersion>
+
+ServerRoot {testdir}/apache
+ErrorLog {testdir}/archive/apache.{client}/error.log
+LogFormat "%h %l %u %t \"%r\" %>s %b \"%{{Referer}}i\" \"%{{User-agent}}i\"" combined
+CustomLog {testdir}/archive/apache.{client}/access.log combined
+PidFile {testdir}/apache/tmp.{client}/apache.pid
+DocumentRoot {testdir}/apache/htdocs.{client}
+
+
+<Directory {testdir}/apache/htdocs.{client}>
+ Options +ExecCGI
+ AllowOverride All
+ SetHandler fastcgi-script
+</Directory>
+
+AllowEncodedSlashes On
+ServerSignature Off
+MaxRequestsPerChild 0
+
--- /dev/null
+"""
+Run an autotest test on the ceph cluster.
+"""
+import json
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Run an autotest test on the ceph cluster.
+
+ Only autotest client tests are supported.
+
+ The config is a mapping from role name to list of tests to run on
+ that client.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - ceph-fuse: [client.0, client.1]
+ - autotest:
+ client.0: [dbench]
+ client.1: [bonnie]
+
+ You can also specify a list of tests to run on all clients::
+
+ tasks:
+ - ceph:
+ - ceph-fuse:
+ - autotest:
+ all: [dbench]
+ """
+ assert isinstance(config, dict)
+ config = teuthology.replace_all_with_clients(ctx.cluster, config)
+ log.info('Setting up autotest...')
+ testdir = teuthology.get_testdir(ctx)
+ with parallel() as p:
+ for role in config.iterkeys():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ p.spawn(_download, testdir, remote)
+
+ log.info('Making a separate scratch dir for every client...')
+ for role in config.iterkeys():
+ assert isinstance(role, basestring)
+ PREFIX = 'client.'
+ assert role.startswith(PREFIX)
+ id_ = role[len(PREFIX):]
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+ mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
+ scratch = os.path.join(mnt, 'client.{id}'.format(id=id_))
+ remote.run(
+ args=[
+ 'sudo',
+ 'install',
+ '-d',
+ '-m', '0755',
+ '--owner={user}'.format(user='ubuntu'), #TODO
+ '--',
+ scratch,
+ ],
+ )
+
+ with parallel() as p:
+ for role, tests in config.iteritems():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ p.spawn(_run_tests, testdir, remote, role, tests)
+
+def _download(testdir, remote):
+ """
+ Download. Does not explicitly support multiple tasks in a single run.
+ """
+ remote.run(
+ args=[
+ # explicitly does not support multiple autotest tasks
+ # in a single run; the result archival would conflict
+ 'mkdir', '{tdir}/archive/autotest'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'mkdir', '{tdir}/autotest'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'wget',
+ '-nv',
+ '--no-check-certificate',
+ 'https://github.com/ceph/autotest/tarball/ceph',
+ '-O-',
+ run.Raw('|'),
+ 'tar',
+ '-C', '{tdir}/autotest'.format(tdir=testdir),
+ '-x',
+ '-z',
+ '-f-',
+ '--strip-components=1',
+ ],
+ )
+
+def _run_tests(testdir, remote, role, tests):
+ """
+ Spawned to run tests on a remote site
+ """
+ assert isinstance(role, basestring)
+ PREFIX = 'client.'
+ assert role.startswith(PREFIX)
+ id_ = role[len(PREFIX):]
+ mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
+ scratch = os.path.join(mnt, 'client.{id}'.format(id=id_))
+
+ assert isinstance(tests, list)
+ for idx, testname in enumerate(tests):
+ log.info('Running autotest client test #%d: %s...', idx, testname)
+
+ tag = 'client.{id}.num{idx}.{testname}'.format(
+ idx=idx,
+ testname=testname,
+ id=id_,
+ )
+ control = '{tdir}/control.{tag}'.format(tdir=testdir, tag=tag)
+ teuthology.write_file(
+ remote=remote,
+ path=control,
+ data='import json; data=json.loads({data!r}); job.run_test(**data)'.format(
+ data=json.dumps(dict(
+ url=testname,
+ dir=scratch,
+ # TODO perhaps tag
+ # results will be in {testdir}/autotest/client/results/dbench
+ # or {testdir}/autotest/client/results/dbench.{tag}
+ )),
+ ),
+ )
+ remote.run(
+ args=[
+ '{tdir}/autotest/client/bin/autotest'.format(tdir=testdir),
+ '--verbose',
+ '--harness=simple',
+ '--tag={tag}'.format(tag=tag),
+ control,
+ run.Raw('3>&1'),
+ ],
+ )
+
+ remote.run(
+ args=[
+ 'rm', '-rf', '--', control,
+ ],
+ )
+
+ remote.run(
+ args=[
+ 'mv',
+ '--',
+ '{tdir}/autotest/client/results/{tag}'.format(tdir=testdir, tag=tag),
+ '{tdir}/archive/autotest/{tag}'.format(tdir=testdir, tag=tag),
+ ],
+ )
+
+ remote.run(
+ args=[
+ 'rm', '-rf', '--', '{tdir}/autotest'.format(tdir=testdir),
+ ],
+ )
--- /dev/null
+"""
+Aver wrapper task
+"""
+import contextlib
+import logging
+from subprocess import check_call, Popen, PIPE
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Execute an aver assertion
+
+ Parameters:
+
+ input: file containing data referred to by the assertions. File name is
+ relative to the job's archive path
+ validations: list of validations in the Aver language
+
+ Example:
+ - aver:
+ input: bench_output.csv
+ validations:
+ - expect performance(alg='ceph') > performance(alg='raw')
+ - for size > 3 expect avg_throughput > 2000
+ """
+ log.info('Beginning aver...')
+ assert isinstance(config, dict), 'expecting dictionary for configuration'
+
+ if 'input' not in config:
+ raise Exception("Expecting 'input' option")
+ if len(config.get('validations', [])) < 1:
+ raise Exception("Expecting at least one entry in 'validations'")
+
+ url = ('https://github.com/ivotron/aver/releases/download/'
+ 'v0.3.0/aver-linux-amd64.tar.bz2')
+
+ aver_path = ctx.archive + '/aver'
+
+ # download binary
+ check_call(['wget', '-O', aver_path + '.tbz', url])
+ check_call(['tar', 'xfj', aver_path + '.tbz', '-C', ctx.archive])
+
+ # print version
+ process = Popen([aver_path, '-v'], stdout=PIPE)
+ log.info(process.communicate()[0])
+
+ # validate
+ for validation in config['validations']:
+ cmd = (aver_path + ' -s -i ' + (ctx.archive + '/' + config['input']) +
+ ' "' + validation + '"')
+ log.info("executing: " + cmd)
+ process = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
+ (stdout, stderr) = process.communicate()
+ if stderr:
+ log.info('aver stderr: ' + stderr)
+ log.info('aver result: ' + stdout)
+ if stdout.strip(' \t\n\r') != 'true':
+ raise Exception('Failed validation: ' + validation)
+
+ try:
+ yield
+ finally:
+ log.info('Removing aver binary...')
+ check_call(['rm', aver_path, aver_path + '.tbz'])
--- /dev/null
+"""
+Run blktrace program through teuthology
+"""
+import contextlib
+import logging
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+blktrace = '/usr/sbin/blktrace'
+daemon_signal = 'term'
+
+@contextlib.contextmanager
+def setup(ctx, config):
+ """
+ Setup all the remotes
+ Set up all the remotes
+ osds = ctx.cluster.only(teuthology.is_type('osd', config['cluster']))
+ log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=teuthology.get_testdir(ctx))
+
+ for remote, roles_for_host in osds.remotes.iteritems():
+ log.info('Creating %s on %s' % (log_dir, remote.name))
+ remote.run(
+ args=['mkdir', '-p', '-m0755', '--', log_dir],
+ wait=False,
+ )
+ yield
+
+@contextlib.contextmanager
+def execute(ctx, config):
+ """
+ Run the blktrace program on remote machines.
+ """
+ procs = []
+ testdir = teuthology.get_testdir(ctx)
+ log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=testdir)
+
+ osds = ctx.cluster.only(teuthology.is_type('osd'))
+ for remote, roles_for_host in osds.remotes.iteritems():
+ roles_to_devs = ctx.disk_config.remote_to_roles_to_dev[remote]
+ for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
+ config['cluster']):
+ if roles_to_devs.get(role):
+ dev = roles_to_devs[role]
+ log.info("running blktrace on %s: %s" % (remote.name, dev))
+
+ proc = remote.run(
+ args=[
+ 'cd',
+ log_dir,
+ run.Raw(';'),
+ 'daemon-helper',
+ daemon_signal,
+ 'sudo',
+ blktrace,
+ '-o',
+ dev.rsplit("/", 1)[1],
+ '-d',
+ dev,
+ ],
+ wait=False,
+ stdin=run.PIPE,
+ )
+ procs.append(proc)
+ try:
+ yield
+ finally:
+ osds = ctx.cluster.only(teuthology.is_type('osd'))
+ log.info('stopping blktrace processes')
+ for proc in procs:
+ proc.stdin.close()
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Usage:
+ blktrace:
+
+ or:
+ blktrace:
+ cluster: backup
+
+ Runs blktrace on all osds in the specified cluster (the 'ceph' cluster by
+ default).
+ """
+ if config is None:
+ config = {}
+ config['cluster'] = config.get('cluster', 'ceph')
+
+ with contextutil.nested(
+ lambda: setup(ctx=ctx, config=config),
+ lambda: execute(ctx=ctx, config=config),
+ ):
+ yield
--- /dev/null
+[Boto]
+http_socket_timeout = {idle_timeout}
--- /dev/null
+"""
+Build ceph packages
+
+Unit tests:
+
+py.test -v -s tests/test_buildpackages.py
+
+Integration tests:
+
+teuthology-openstack --verbose --key-name myself --key-filename ~/Downloads/myself --ceph infernalis --suite teuthology/buildpackages
+
+"""
+import copy
+import logging
+import os
+import types
+from teuthology import packaging
+from teuthology import misc
+from teuthology.config import config as teuth_config
+from teuthology.openstack import OpenStack
+
+log = logging.getLogger(__name__)
+
+class LocalGitbuilderProject(packaging.GitbuilderProject):
+
+ def __init__(self):
+ pass
+
+
+def get_pkg_type(os_type):
+ if os_type in ('centos', 'fedora', 'opensuse', 'rhel', 'sles'):
+ return 'rpm'
+ else:
+ return 'deb'
+
+def apply_overrides(ctx, config):
+ if config is None:
+ config = {}
+ else:
+ config = copy.deepcopy(config)
+
+ assert isinstance(config, dict), \
+ "task install only supports a dictionary for configuration"
+
+ project, = config.get('project', 'ceph'),
+ log.debug('project %s' % project)
+ overrides = ctx.config.get('overrides')
+ if overrides:
+ install_overrides = overrides.get('install', {})
+ misc.deep_merge(config, install_overrides.get(project, {}))
+ return config
+
+def get_config_install(ctx, config):
+ config = apply_overrides(ctx, config)
+ log.debug('install config %s' % config)
+ return [(config.get('flavor', 'basic'),
+ config.get('tag', ''),
+ config.get('branch', ''),
+ config.get('sha1'))]
+
+def get_config_install_upgrade(ctx, config):
+ log.debug('install.upgrade config before override %s' % config)
+ configs = []
+ for (role, role_config) in config.iteritems():
+ if role_config is None:
+ role_config = {}
+ o = apply_overrides(ctx, role_config)
+
+ log.debug('install.upgrade config ' + str(role_config) +
+ ' and with overrides ' + str(o))
+ # for install.upgrade overrides are actually defaults
+ configs.append((o.get('flavor', 'basic'),
+ role_config.get('tag', o.get('tag', '')),
+ role_config.get('branch', o.get('branch', '')),
+ role_config.get('sha1', o.get('sha1'))))
+ return configs
+
+GET_CONFIG_FUNCTIONS = {
+ 'install': get_config_install,
+ 'install.upgrade': get_config_install_upgrade,
+}
+
+def lookup_configs(ctx, node):
+ configs = []
+ if type(node) is types.ListType:
+ for leaf in node:
+ configs.extend(lookup_configs(ctx, leaf))
+ elif type(node) is types.DictType:
+ for (key, value) in node.iteritems():
+ if key in ('install', 'install.upgrade'):
+ configs.extend(GET_CONFIG_FUNCTIONS[key](ctx, value))
+ elif key in ('overrides',):
+ pass
+ else:
+ configs.extend(lookup_configs(ctx, value))
+ return configs
+
+def get_sha1(ref):
+ url = teuth_config.get_ceph_git_url()
+ ls_remote = misc.sh("git ls-remote " + url + " " + ref)
+ return ls_remote.split()[0]
+
+def task(ctx, config):
+ """
+ Build Ceph packages. This task will automagically be run
+ before the tasks that need to install packages (this is taken
+ care of by the internal teuthology task).
+
+ The config should be as follows:
+
+ buildpackages:
+ good_machine:
+ disk: 40 # GB
+ ram: 48000 # MB
+ cpus: 16
+ min_machine:
+ disk: 40 # GB
+ ram: 8000 # MB
+ cpus: 1
+
+ example:
+
+ tasks:
+ - buildpackages:
+ good_machine:
+ disk: 40 # GB
+ ram: 15000 # MB
+ cpus: 16
+ min_machine:
+ disk: 40 # GB
+ ram: 8000 # MB
+ cpus: 1
+ - install:
+
+ When a buildpackages task is already included, the values it contains can be
+ overridden with:
+
+ overrides:
+ buildpackages:
+ good_machine:
+ disk: 20 # GB
+ ram: 2000 # MB
+ cpus: 2
+ min_machine:
+ disk: 10 # GB
+ ram: 1000 # MB
+ cpus: 1
+
+ """
+ log.info('Beginning buildpackages...')
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'task only accepts a dict for config not ' + str(config)
+ overrides = ctx.config.get('overrides', {})
+ misc.deep_merge(config, overrides.get('buildpackages', {}))
+ d = os.path.join(os.path.dirname(__file__), 'buildpackages')
+ os_type = misc.get_distro(ctx)
+ os_version = misc.get_distro_version(ctx)
+ arch = ctx.config.get('arch', OpenStack().get_default_arch())
+ dist = LocalGitbuilderProject()._get_distro(distro=os_type,
+ version=os_version)
+ pkg_type = get_pkg_type(os_type)
+ misc.sh(
+ "flock --close /tmp/buildpackages " +
+ "make -C " + d + " " + os.environ['HOME'] + "/.ssh_agent")
+ for (flavor, tag, branch, sha1) in lookup_configs(ctx, ctx.config):
+ if tag:
+ sha1 = get_sha1(tag)
+ elif branch:
+ sha1 = get_sha1(branch)
+ log.info("building flavor = " + flavor + "," +
+ " tag = " + tag + "," +
+ " branch = " + branch + "," +
+ " sha1 = " + sha1)
+ target = ('ceph-' +
+ pkg_type + '-' +
+ dist + '-' +
+ arch + '-' +
+ flavor + '-' +
+ sha1)
+ openstack = OpenStack()
+ openstack.set_provider()
+ if openstack.provider == 'ovh':
+ select = '^(vps|hg)-.*ssd'
+ else:
+ select = ''
+ network = openstack.net()
+ if network != "":
+ network = " OPENSTACK_NETWORK='" + network + "' "
+ openstack.image(os_type, os_version, arch) # create if it does not exist
+ build_flavor = openstack.flavor_range(
+ config['min_machine'], config['good_machine'], arch, select)
+ default_arch = openstack.get_default_arch()
+ http_flavor = openstack.flavor({
+ 'disk': 30, # GB
+ 'ram': 1024, # MB
+ 'cpus': 1,
+ }, default_arch, select)
+ lock = "/tmp/buildpackages-" + sha1 + "-" + os_type + "-" + os_version
+ cmd = (". " + os.environ['HOME'] + "/.ssh_agent ; " +
+ " flock --close " + lock +
+ " make -C " + d +
+ network +
+ " CEPH_GIT_URL=" + teuth_config.get_ceph_git_url() +
+ " CEPH_PKG_TYPE=" + pkg_type +
+ " CEPH_OS_TYPE=" + os_type +
+ " CEPH_OS_VERSION=" + os_version +
+ " CEPH_DIST=" + dist +
+ " CEPH_ARCH=" + arch +
+ " CEPH_SHA1=" + sha1 +
+ " CEPH_TAG=" + tag +
+ " CEPH_BRANCH=" + branch +
+ " CEPH_FLAVOR=" + flavor +
+ " BUILD_FLAVOR=" + build_flavor +
+ " HTTP_FLAVOR=" + http_flavor +
+ " HTTP_ARCH=" + default_arch +
+ " " + target +
+ " ")
+ log.info("buildpackages: " + cmd)
+ misc.sh(cmd)
+ teuth_config.gitbuilder_host = openstack.get_ip('packages-repository', '')
+ log.info('Finished buildpackages')
--- /dev/null
+SHELL=/bin/bash
+D=/tmp/stampsdir
+VPATH=${D}
+TIMEOUT_SERVER_CREATE = 30m
+TIMEOUT_BUILD = 220m # 20 minutes short of 4 hours
+PKG_REPO=packages-repository
+PKG_REPO_OS_TYPE=ubuntu
+PKG_REPO_OS_VERSION=14.04
+PKG_REPO_USER_DATA=${PKG_REPO_OS_TYPE}-${PKG_REPO_OS_VERSION}-user-data.txt
+
+# We want to extract the first listed IPv4 address!
+# Openstack will provide the addresses field in this format:
+# "net1-name=ip(, ip)+(; net2-name=ip(, ip)+)+"
+# Each IP may be v4 or v6 (including shortened forms and IPv4-mapped-IPv6 forms)
+# 1.2.3.4
+# 2001:db8:6050:ed4d:f816:3eff:fe48:3b36
+# 2001:db8::fe48:3b36
+# 2001:db8::1.2.3.4
+# Example long-form input:
+# private-network=10.10.10.69, 2001:db8:6050:ed4d:f816:3eff:fed1:d9f8;net-name2=2001:db8::fe48:3b36, 2001:db8::1.2.3.4, 1.2.3.4;
+# TODO: allow selection of the network instead of taking the first network
+# TODO: Support IPv6 in future
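+# As a worked illustration only (not used by the build): for the
+# long-form example above, the pipeline in get_ip keeps everything up
+# to the first ';', splits on spaces and commas, drops IPv6 and empty
+# lines, and head -n1 then yields 10.10.10.69.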
+define get_ip
+$$(openstack server show -f value -c addresses $(1) |perl -pe 's/^[^=]+=([^;]+).*/\1/g; s/[ ,]/\n/g; ' |grep -v -e ':' -e '^$$' |head -n1)
+endef
+
+MY_IP=$(shell hostname -I | cut -f1 -d' ')
+
+${HOME}/.ssh_agent:
+ ssh-agent -s > ${HOME}/.ssh_agent
+ source ${HOME}/.ssh_agent ; ssh-add ; ssh-add -l
+ grep -q ssh_agent ~/.bashrc_teuthology || echo 'source ${HOME}/.ssh_agent' >> ~/.bashrc_teuthology
+
+flock-${PKG_REPO}:
+ timeout $(TIMEOUT_SERVER_CREATE) openstack server create --image 'teuthology-ubuntu-14.04-${HTTP_ARCH}' ${OPENSTACK_NETWORK} --flavor ${HTTP_FLAVOR} --key-name teuthology --security-group teuthology --property ownedby=${MY_IP} --user-data ${PKG_REPO_USER_DATA} --wait ${PKG_REPO}
+ sleep 30
+ set -ex ; \
+ ip=$(call get_ip,${PKG_REPO}) ; \
+ for delay in 1 2 4 8 8 8 8 8 8 8 8 8 16 16 16 16 16 32 32 32 64 128 256 512 ; do if ssh -o 'ConnectTimeout=3' $$ip bash -c '"grep -q READYTORUN /var/log/cloud-init*.log"' ; then break ; else sleep $$delay ; fi ; done ; \
+ ssh $$ip sudo apt-get update ; \
+ ssh $$ip sudo apt-get install -y nginx rsync && \
+ ssh $$ip sudo chown -R ubuntu /usr/share/nginx/html && \
+ ssh $$ip sudo rm /usr/share/nginx/html/\* && \
+ ssh $$ip sudo perl -pi -e '"s|location / {|location / { autoindex on;|"' /etc/nginx/sites-available/default && \
+ ssh $$ip sudo /etc/init.d/nginx restart && \
+ perl -pi -e "s/^gitbuilder_host:.*/gitbuilder_host: $$ip/" ~/.teuthology.yaml
+ touch ${D}/$@
+
+${PKG_REPO}:
+ mkdir -p ${D}
+ flock --close ${D}/flock-$@.lock ${MAKE} flock-$@
+ touch ${D}/$@
+
+# Just because 'server create' returns success does not mean it actually succeeded!
+# Check the server status before we proceed.
+# If it's in a weird status, bail out and let the delete fire,
+# e.g. an ERROR status can happen if there is no VM host with enough capacity for the request.
+ceph-${CEPH_PKG_TYPE}-${CEPH_DIST}-${CEPH_ARCH}-${CEPH_FLAVOR}-${CEPH_SHA1}: ${PKG_REPO}
+ timeout $(TIMEOUT_SERVER_CREATE) openstack server create --image 'teuthology-${CEPH_OS_TYPE}-${CEPH_OS_VERSION}-${CEPH_ARCH}' ${OPENSTACK_NETWORK} --flavor ${BUILD_FLAVOR} --key-name teuthology --security-group teuthology --property ownedby=${MY_IP} --user-data ${CEPH_OS_TYPE}-${CEPH_OS_VERSION}-user-data.txt --wait $@
+ set -ex ; \
+ trap "openstack server delete --wait $@" EXIT ; \
+ for delay in 30 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 ; do \
+ status=$$(openstack server show -c status -f value $@) ; \
+ case $$status in \
+ ACTIVE) break ;; \
+ NOSTATE|*BUILD|*BOOT|*RESIZE) sleep $$delay ;; \
+ *) exit 1 ;; \
+ esac ; \
+ done ; \
+ ip=$(call get_ip,$@) ; \
+ test -n "$$ip" || exit ; \
+ for delay in 1 2 4 8 8 8 8 8 8 8 8 8 16 16 16 16 16 32 32 32 64 128 256 512 ; do if ssh -o 'ConnectTimeout=3' $$ip bash -c '"grep -q READYTORUN /var/log/cloud-init*.log"' ; then break ; else sleep $$delay ; fi ; done ; \
+ scp make-${CEPH_PKG_TYPE}.sh common.sh ubuntu@$$ip: ; \
+ packages_repository=$(call get_ip,${<F}) ; \
+ timeout $(TIMEOUT_BUILD) ssh -tt -A ubuntu@$$ip bash ./make-${CEPH_PKG_TYPE}.sh $$packages_repository ${CEPH_DIST} ${CEPH_GIT_URL} ${CEPH_SHA1} ${CEPH_FLAVOR} ${CEPH_ARCH}
+ mkdir -p ${D}/${@D} ; touch ${D}/$@
+
+clobber:
+ pkill ssh-agent || true
+ rm -f ${HOME}/.ssh_agent
+ rm -fr ${D}
--- /dev/null
+#cloud-config
+bootcmd:
+ - yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/6/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 && rm /etc/yum.repos.d/dl.fedoraproject.org*
+ - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config
+ - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo
+preserve_hostname: true
+system_info:
+ default_user:
+ name: ubuntu
+packages:
+ - dracut-modules-growroot
+runcmd:
+ - mkinitrd --force /boot/initramfs-2.6.32-573.3.1.el6.x86_64.img 2.6.32-573.3.1.el6.x86_64
+ - reboot
+final_message: "READYTORUN"
--- /dev/null
+user-data.txt
\ No newline at end of file
--- /dev/null
+user-data.txt
\ No newline at end of file
--- /dev/null
+user-data.txt
\ No newline at end of file
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+function install_deps() {
+ git archive --remote=git://git.ceph.com/ceph.git master install-deps.sh | tar -xvf -
+ #
+ # drop the following hack when trusty is not supported anymore
+ # there is no other way as long as we maintain a debian directory that tries
+ # to be the same for all distributions
+ #
+ if grep --quiet 14.04 /etc/issue 2>/dev/null && sudo apt-get install --force-yes -qq -y dpkg-dev && test "$(dpkg-architecture -qDEB_BUILD_GNU_CPU 2>/dev/null)" = aarch64 ; then
+ sed -i -e '/libgoogle-perftools-dev/d' debian/control
+ fi
+ bash -x install-deps.sh
+}
+
+function git_submodules() {
+ # see http://tracker.ceph.com/issues/13426
+ perl -pi -e 's|git://ceph.com/git/ceph-object-corpus.git|https://github.com/ceph/ceph-object-corpus.git|' .gitmodules
+ local force=$(if git submodule usage 2>&1 | grep --quiet 'update.*--force'; then echo --force ; fi)
+ git submodule sync || return 1
+ git submodule update $force --init --recursive || return 1
+}
+
+function get_ceph() {
+ local git_ceph_url=$1
+ local sha1=$2
+
+ test -d ceph || git clone ${git_ceph_url} ceph
+ cd ceph
+ if test -d src ; then # so we don't try to fetch when using a fixture
+ git fetch --tags http://github.com/ceph/ceph
+ fi
+ git fetch --tags ${git_ceph_url}
+ git checkout ${sha1}
+}
+
+function init_ceph() {
+ local git_ceph_url=$1
+ local sha1=$2
+ get_ceph $git_ceph_url $sha1 || return 1
+ git_submodules || return 1
+ install_deps || return 1
+}
+
+function flavor2configure() {
+ local flavor=$1
+
+ eval $(dpkg-architecture)
+
+ if test $flavor = notcmalloc || test "$DEB_HOST_GNU_CPU" = aarch64 ; then
+ echo --without-tcmalloc --without-cryptopp
+ fi
+}
+
+#
+# for a given $sha1 in the $ceph_dir repository, look up all references
+# from the remote origin and tags matching the sha1. Add a symbolic
+# link in $ref_dir to the $sha1 for each reference found. If the
+# reference is a tag, also add a symbolic link to the commit to which
+# the tag points, if it is an annotated tag.
+#
+function link_same() {
+ local ref_dir=$1
+ local ceph_dir=$2
+ local sha1=$3
+
+ mkdir -p $ref_dir
+ (
+ cd ${ceph_dir}
+ git for-each-ref refs/tags/** refs/remotes/origin/** | grep $sha1 | \
+ while read sha1 type ref ; do
+ if test $type = 'tag' ; then
+ commit_sha1=$(git rev-parse $ref^{commit})
+ if test $commit_sha1 != $sha1 ; then
+ echo ../sha1/$sha1 ../sha1/$commit_sha1
+ fi
+ fi
+ echo ../sha1/$sha1 $(basename $ref)
+ done
+ ) | while read from to ; do
+ ( cd $ref_dir ; ln -sf $from $to )
+ done
+}
+
+function test_link_same() {
+ local d=/tmp/link_same$$
+ mkdir -p $d/primary
+ cd $d/primary
+ git init
+ touch a ; git add a ; git commit -m 'm' a
+ git tag tag1
+ tag1=$(git rev-parse HEAD)
+ git branch branch1
+ touch b ; git add b ; git commit -m 'm' b
+ git tag --annotate -m 'a' tag2
+ tag2=$(git rev-parse tag2)
+ sha1_tag2=$(git rev-parse tag2^{commit})
+ git branch branch2
+ touch c ; git add c ; git commit -m 'm' c
+ git branch branch3
+ sha1_branch3=$(git rev-parse branch3)
+
+ git clone $d/primary $d/secondary
+ cd $d/secondary
+ mkdir $d/ref $d/sha1
+
+ touch $d/sha1/$sha1_branch3
+ link_same $d/ref $d/secondary $sha1_branch3
+ test $(readlink --canonicalize $d/ref/branch3) = $d/sha1/$sha1_branch3 || return 1
+ test $(readlink --canonicalize $d/ref/master) = $d/sha1/$sha1_branch3 || return 1
+
+ touch $d/sha1/$tag2
+ link_same $d/ref $d/secondary $tag2
+ test $(readlink --canonicalize $d/ref/tag2) = $d/sha1/$tag2 || return 1
+ test $(readlink --canonicalize $d/sha1/$sha1_tag2) = $d/sha1/$tag2 || return 1
+
+ touch $d/sha1/$tag1
+ link_same $d/ref $d/secondary $tag1
+ test $(readlink --canonicalize $d/ref/tag1) = $d/sha1/$tag1 || return 1
+ test $(readlink --canonicalize $d/ref/branch1) = $d/sha1/$tag1 || return 1
+
+ rm -fr $d
+}
+
+function maybe_parallel() {
+ local nproc=$1
+ local vers=$2
+
+ if echo $vers | grep --quiet '0\.67' ; then
+ return
+ fi
+
+ if test $nproc -gt 1 ; then
+ echo -j${nproc}
+ fi
+}
+
+function test_maybe_parallel() {
+ test "$(maybe_parallel 1 0.72)" = "" || return 1
+ test "$(maybe_parallel 8 0.67)" = "" || return 1
+ test "$(maybe_parallel 8 0.72)" = "-j8" || return 1
+}
+
+if test "$1" = "TEST" ; then
+ shopt -s -o xtrace
+ PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
+ test_link_same
+ test_maybe_parallel
+fi
--- /dev/null
+#cloud-config
+bootcmd:
+ - echo 'APT::Get::AllowUnauthenticated "true";' | tee /etc/apt/apt.conf.d/99disablesigs
+ - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver
+manage_etc_hosts: true
+preserve_hostname: true
+system_info:
+ default_user:
+ name: ubuntu
+runcmd:
+ - echo 'ubuntu ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+final_message: "READYTORUN"
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+#
+# Create and upload a deb repository with the same naming conventions
+# as https://github.com/ceph/autobuild-ceph/blob/master/build-ceph-deb.sh
+#
+set -xe
+
+base=/tmp/release
+gitbuilder_host=$1
+codename=$2
+git_ceph_url=$3
+sha1=$4
+flavor=$5
+arch=$6
+
+sudo apt-get update
+sudo apt-get install -y git
+
+source $(dirname $0)/common.sh
+
+init_ceph $git_ceph_url $sha1
+
+#codename=$(lsb_release -sc)
+releasedir=$base/$(lsb_release -si)/WORKDIR
+#
+# git describe provides a version that:
+# a) is human readable
+# b) is unique for each commit
+# c) compares higher than any previous commit
+# d) contains the short hash of the commit
+#
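+# For instance (hypothetical values): a commit that is 42 commits past
+# the v10.2.3 tag is reported as something like v10.2.3-42-g1234abc, so
+# vers becomes 10.2.3-42-g1234abc once the leading v is stripped.
+#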
+vers=$(git describe --match "v*" | sed s/^v//)
+#
+# always set the debian version to 1 which is ok because the debian
+# directory is included in the sources and the upstream version will
+# change each time it is modified.
+#
+dvers="$vers-1"
+: ${NPROC:=$(nproc)}
+ceph_dir=$(pwd)
+
+function build_package() {
+
+ rm -fr $releasedir
+ mkdir -p $releasedir
+ #
+ # remove all files not under git so they are not
+ # included in the distribution.
+ #
+ git clean -qdxff
+
+ fileext="gz"
+ # autotools only works in jewel and below
+ if [[ ! -e "make-dist" ]] ; then
+ #
+ # creating the distribution tarball requires some configure
+ # options (otherwise parts of the source tree will be left out).
+ #
+ ./autogen.sh
+ # Building with LTTNG on Ubuntu Precise is not possible.
+ # It fails the LTTNG-is-sane check (it misses headers)
+ # And the Debian rules files leave it out anyway
+ case $codename in
+ precise) lttng_opt="--without-lttng" ;;
+ *) lttng_opt="--with-lttng" ;;
+ esac
+ ./configure $(flavor2configure $flavor) \
+ --with-rocksdb --with-ocf \
+ --with-nss --with-debug --enable-cephfs-java \
+ $lttng_opt --with-babeltrace
+ #
+ # use distdir= to set the name of the top level directory of the
+ # tarball to match the desired version
+ #
+ make distdir=ceph-$vers dist
+ else
+ ./make-dist
+ fileext="bz2"
+ fi
+ #
+ # rename the tarball to match debian conventions and extract it
+ #
+ mv ceph-$vers.tar.$fileext $releasedir/ceph_$vers.orig.tar.$fileext
+ tar -C $releasedir -xf $releasedir/ceph_$vers.orig.tar.$fileext
+ #
+ # copy the debian directory over
+ #
+ cp -a debian $releasedir/ceph-$vers/debian
+ cd $releasedir
+ #
+ # uncomment to remove -dbg packages
+ # because they are large and take time to build
+ #
+ #perl -ni -e 'print if(!(/^Package: .*-dbg$/../^$/))' ceph-$vers/debian/control
+ #perl -pi -e 's/--dbg-package.*//' ceph-$vers/debian/rules
+ #
+ # update the changelog to match the desired version
+ #
+ cd ceph-$vers
+ local chvers=$(head -1 debian/changelog | perl -ne 's/.*\(//; s/\).*//; print')
+ if [ "$chvers" != "$dvers" ]; then
+ DEBEMAIL="contact@ceph.com" dch -D $codename --force-distribution -b -v "$dvers" "new version"
+ fi
+ #
+ # create the packages (with ccache)
+ #
+ export CEPH_EXTRA_CONFIGURE_ARGS=$(flavor2configure $flavor)
+ j=$(maybe_parallel $NPROC $vers)
+ PATH=/usr/lib/ccache:$PATH dpkg-buildpackage $j -uc -us -sa
+}
+
+function build_repo() {
+ local gitbuilder_host=$1
+
+ sudo apt-get install -y reprepro
+ cd ${releasedir}/..
+ #
+ # Create a repository in a directory with a name structured
+ # as follows
+ #
+ base=ceph-deb-$codename-$arch-$flavor
+ sha1_dir=$codename/$base/sha1/$sha1
+ mkdir -p $sha1_dir/conf
+ cat > $sha1_dir/conf/distributions <<EOF
+Codename: $codename
+Suite: stable
+Components: main
+Architectures: i386 amd64 arm64 source
+EOF
+ reprepro --basedir $sha1_dir include $codename WORKDIR/*.changes
+ echo $dvers > $sha1_dir/version
+ echo $sha1 > $sha1_dir/sha1
+ link_same $codename/$base/ref $ceph_dir $sha1
+ if test "$gitbuilder_host" ; then
+ cd $codename
+ sudo apt-get install -y rsync
+ RSYNC_RSH='ssh -o StrictHostKeyChecking=false' rsync -av $base/ $gitbuilder_host:/usr/share/nginx/html/$base/
+ fi
+}
+
+build_package
+build_repo $gitbuilder_host
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+#
+# Create and upload a RPM repository with the same naming conventions
+# as https://github.com/ceph/autobuild-ceph/blob/master/build-ceph-rpm.sh
+#
+
+set -xe
+
+base=/tmp/release
+gitbuilder_host=$1
+codename=$2
+git_ceph_url=$3
+sha1=$4
+flavor=$5
+arch=$6
+
+suse=false
+[[ $codename =~ suse ]] && suse=true
+
+if [ "$suse" = true ] ; then
+ sudo zypper -n install git
+else
+ sudo yum install -y git
+fi
+
+source $(dirname $0)/common.sh
+
+init_ceph $git_ceph_url $sha1
+
+distro=$( source /etc/os-release ; echo $ID )
+distro_version=$( source /etc/os-release ; echo $VERSION )
+releasedir=$base/$distro/WORKDIR
+#
+# git describe provides a version that:
+# a) is human readable
+# b) is unique for each commit
+# c) compares higher than any previous commit
+# d) contains the short hash of the commit
+#
+vers=$(git describe --match "v*" | sed s/^v//)
+ceph_dir=$(pwd)
+
+#
+# Create a repository in a directory with a name structured
+# as follows
+#
+base=ceph-rpm-$codename-$arch-$flavor
+
+function setup_rpmmacros() {
+ if ! grep -q find_debuginfo_dwz_opts $HOME/.rpmmacros ; then
+ echo '%_find_debuginfo_dwz_opts %{nil}' >> $HOME/.rpmmacros
+ fi
+ if [ "x${distro}x" = "xcentosx" ] && echo $distro_version | grep -q '7' ; then
+ if ! grep -q '%dist .el7' $HOME/.rpmmacros ; then
+ echo '%dist .el7' >> $HOME/.rpmmacros
+ fi
+ fi
+}
+
+function build_package() {
+ rm -fr $releasedir
+ mkdir -p $releasedir
+ #
+ # remove all files not under git so they are not
+ # included in the distribution.
+ #
+ git clean -qdxff
+ #
+ # creating the distribution tarball requires some configure
+ # options (otherwise parts of the source tree will be left out).
+ #
+ if [ "$suse" = true ] ; then
+ sudo zypper -n install bzip2
+ else
+ sudo yum install -y bzip2
+ fi
+ # autotools only works in jewel and below
+ if [[ ! -e "make-dist" ]] ; then
+ ./autogen.sh
+ ./configure $(flavor2configure $flavor) --with-debug --with-radosgw --with-fuse --with-libatomic-ops --with-gtk2 --with-nss
+
+ #
+ # use distdir= to set the name of the top level directory of the
+ # tarball to match the desired version
+ #
+ make dist-bzip2
+ else
+ # kraken and above
+ ./make-dist
+ fi
+ # Set up build area
+ setup_rpmmacros
+ if [ "$suse" = true ] ; then
+ sudo zypper -n install rpm-build
+ else
+ sudo yum install -y rpm-build
+ fi
+ local buildarea=$releasedir
+ mkdir -p ${buildarea}/SOURCES
+ mkdir -p ${buildarea}/SRPMS
+ mkdir -p ${buildarea}/SPECS
+ cp ceph.spec ${buildarea}/SPECS
+ mkdir -p ${buildarea}/RPMS
+ mkdir -p ${buildarea}/BUILD
+ CEPH_TARBALL=( ceph-*.tar.bz2 )
+ cp -a $CEPH_TARBALL ${buildarea}/SOURCES/.
+ cp -a rpm/*.patch ${buildarea}/SOURCES || true
+ (
+ cd ${buildarea}/SPECS
+ ccache=$(echo /usr/lib*/ccache)
+ # Build RPMs
+ if [ "$suse" = true ]; then
+ sed -i -e '0,/%package/s//%debug_package\n&/' \
+ -e 's/%{epoch}://g' \
+ -e '/^Epoch:/d' \
+ -e 's/%bcond_with ceph_test_package/%bcond_without ceph_test_package/' \
+ -e "s/^Source0:.*$/Source0: $CEPH_TARBALL/" \
+ ceph.spec
+ fi
+ buildarea=`readlink -fn ${releasedir}` ### rpm wants absolute path
+ PATH=$ccache:$PATH rpmbuild -ba --define "_unpackaged_files_terminate_build 0" --define "_topdir ${buildarea}" ceph.spec
+ )
+}
+
+function build_rpm_release() {
+ local buildarea=$1
+ local sha1=$2
+ local gitbuilder_host=$3
+ local base=$4
+
+ cat <<EOF > ${buildarea}/SPECS/ceph-release.spec
+Name: ceph-release
+Version: 1
+Release: 0%{?dist}
+Summary: Ceph repository configuration
+Group: System Environment/Base
+License: GPLv2
+URL: http://gitbuilder.ceph.com/$dist
+Source0: ceph.repo
+#Source0: RPM-GPG-KEY-CEPH
+#Source1: ceph.repo
+BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
+BuildArch: noarch
+
+%description
+This package contains the Ceph repository GPG key as well as configuration
+for yum and up2date.
+
+%prep
+
+%setup -q -c -T
+install -pm 644 %{SOURCE0} .
+#install -pm 644 %{SOURCE1} .
+
+%build
+
+%install
+rm -rf %{buildroot}
+#install -Dpm 644 %{SOURCE0} \
+# %{buildroot}/%{_sysconfdir}/pki/rpm-gpg/RPM-GPG-KEY-CEPH
+%if 0%{defined suse_version}
+install -dm 755 %{buildroot}/%{_sysconfdir}/zypp
+install -dm 755 %{buildroot}/%{_sysconfdir}/zypp/repos.d
+install -pm 644 %{SOURCE0} \
+ %{buildroot}/%{_sysconfdir}/zypp/repos.d
+%else
+install -dm 755 %{buildroot}/%{_sysconfdir}/yum.repos.d
+install -pm 644 %{SOURCE0} \
+ %{buildroot}/%{_sysconfdir}/yum.repos.d
+%endif
+
+%clean
+#rm -rf %{buildroot}
+
+%post
+
+%postun
+
+%files
+%defattr(-,root,root,-)
+#%doc GPL
+%if 0%{defined suse_version}
+/etc/zypp/repos.d/*
+%else
+/etc/yum.repos.d/*
+%endif
+#/etc/pki/rpm-gpg/*
+
+%changelog
+* Tue Mar 10 2013 Gary Lowell <glowell@inktank.com> - 1-0
+- Handle both yum and zypper
+- Use URL to ceph git repo for key
+- remove config attribute from repo file
+* Tue Aug 27 2012 Gary Lowell <glowell@inktank.com> - 1-0
+- Initial Package
+EOF
+
+ cat <<EOF > $buildarea/SOURCES/ceph.repo
+[Ceph]
+name=Ceph packages for \$basearch
+baseurl=http://${gitbuilder_host}/${base}/sha1/${sha1}/\$basearch
+enabled=1
+gpgcheck=0
+type=rpm-md
+
+[Ceph-noarch]
+name=Ceph noarch packages
+baseurl=http://${gitbuilder_host}/${base}/sha1/${sha1}/noarch
+enabled=1
+gpgcheck=0
+type=rpm-md
+
+[ceph-source]
+name=Ceph source packages
+baseurl=http://${gitbuilder_host}/${base}/sha1/${sha1}/SRPMS
+enabled=1
+gpgcheck=0
+type=rpm-md
+EOF
+
+ rpmbuild -bb --define "_topdir ${buildarea}" ${buildarea}/SPECS/ceph-release.spec
+}
+
+function build_rpm_repo() {
+ local buildarea=$1
+ local gitbuilder_host=$2
+ local base=$3
+
+ if [ "$suse" = true ] ; then
+ sudo zypper -n install createrepo
+ else
+ sudo yum install -y createrepo
+ fi
+
+ for dir in ${buildarea}/SRPMS ${buildarea}/RPMS/*
+ do
+ createrepo ${dir}
+ done
+
+ local sha1_dir=${buildarea}/../$codename/$base/sha1/$sha1
+ mkdir -p $sha1_dir
+ echo $vers > $sha1_dir/version
+ echo $sha1 > $sha1_dir/sha1
+ echo ceph > $sha1_dir/name
+
+ for dir in ${buildarea}/SRPMS ${buildarea}/RPMS/*
+ do
+ cp -fla ${dir} $sha1_dir
+ done
+
+ link_same ${buildarea}/../$codename/$base/ref $ceph_dir $sha1
+ if test "$gitbuilder_host" ; then
+ (
+ cd ${buildarea}/../$codename
+ RSYNC_RSH='ssh -o StrictHostKeyChecking=false' rsync -av $base/ ubuntu@$gitbuilder_host:/usr/share/nginx/html/$base/
+ )
+ fi
+}
+
+setup_rpmmacros
+build_package
+build_rpm_release $releasedir $sha1 $gitbuilder_host $base
+build_rpm_repo $releasedir $gitbuilder_host $base
--- /dev/null
+#cloud-config
+bootcmd:
+ - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver
+manage_etc_hosts: true
+preserve_hostname: true
+users:
+ - name: ubuntu
+ gecos: User
+ sudo: ["ALL=(ALL) NOPASSWD:ALL"]
+ groups: users
+runcmd:
+ - ( MYHOME=/home/ubuntu ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R ubuntu.users $MYHOME/.ssh )
+final_message: "READYTORUN"
--- /dev/null
+user-data.txt
\ No newline at end of file
--- /dev/null
+user-data.txt
\ No newline at end of file
--- /dev/null
+user-data.txt
\ No newline at end of file
--- /dev/null
+#cloud-config
+bootcmd:
+ - echo 'APT::Get::AllowUnauthenticated "true";' | tee /etc/apt/apt.conf.d/99disablesigs
+ - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver
+manage_etc_hosts: true
+preserve_hostname: true
+system_info:
+ default_user:
+ name: ubuntu
+final_message: "READYTORUN"
--- /dev/null
+import contextlib
+import logging
+import os
+import textwrap
+import yaml
+
+from cStringIO import StringIO
+from teuthology import contextutil
+from teuthology import misc
+from teuthology import packaging
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+# extra stuff we need to do our job here
+EXTRA_PKGS = [
+ 'git',
+]
+
+# stuff that would be in a devmode install, but should be
+# installed in the system for running nosetests against
+# a production install.
+EXTRA_NOSETEST_PKGS = [
+ 'python-psutil',
+ 'python-mock',
+]
+
+
+def find_client0(cluster):
+ ''' Find remote that has client.0 role, or None '''
+ for rem, roles in cluster.remotes.iteritems():
+ if 'client.0' in roles:
+ return rem
+ return None
+
+
+def pip(remote, package, venv=None, uninstall=False, force=False):
+ ''' {un}install a package with pip, possibly in a virtualenv '''
+ if venv:
+ pip = os.path.join(venv, 'bin', 'pip')
+ args = ['sudo', pip]
+ else:
+ args = ['sudo', 'pip']
+
+ if uninstall:
+ args.extend(['uninstall', '-y'])
+ else:
+ args.append('install')
+ if force:
+ args.append('-I')
+
+ args.append(package)
+ remote.run(args=args)
+
+
+@contextlib.contextmanager
+def install_epel(remote):
+ ''' install a disabled-by-default epel repo config file '''
+ remove = False
+ try:
+ if remote.os.package_type == 'deb':
+ yield
+ else:
+ remove = True
+ distromajor = remote.os.version.split('.')[0]
+
+ repofiledata = textwrap.dedent('''
+ [epel]
+ name=epel{version}
+ metalink=http://mirrors.fedoraproject.org/metalink?repo=epel-{version}&arch=$basearch
+ enabled=0
+ gpgcheck=0
+ ''').format(version=distromajor)
+
+ misc.create_file(remote, '/etc/yum.repos.d/epel.repo',
+ data=repofiledata, sudo=True)
+ remote.run(args='sudo yum clean all')
+ yield
+
+ finally:
+ if remove:
+ misc.delete_file(remote, '/etc/yum.repos.d/epel.repo', sudo=True)
+
+
+def enable_epel(remote, enable=True):
+ ''' enable/disable the epel repo '''
+ args = 'sudo sed -i'.split()
+ if enable:
+ args.extend(['s/enabled=0/enabled=1/'])
+ else:
+ args.extend(['s/enabled=1/enabled=0/'])
+ args.extend(['/etc/yum.repos.d/epel.repo'])
+
+ remote.run(args=args)
+ remote.run(args='sudo yum clean all')
+
+
+@contextlib.contextmanager
+def install_extra_pkgs(client):
+ ''' Install EXTRA_PKGS '''
+ try:
+ for pkg in EXTRA_PKGS:
+ packaging.install_package(pkg, client)
+ yield
+
+ finally:
+ for pkg in EXTRA_PKGS:
+ packaging.remove_package(pkg, client)
+
+
+@contextlib.contextmanager
+def clone_calamari(config, client):
+ ''' clone calamari source into current directory on remote '''
+ branch = config.get('calamari_branch', 'master')
+ url = config.get('calamari_giturl', 'git://github.com/ceph/calamari')
+ try:
+ out = StringIO()
+ # ensure branch is present (clone -b will succeed even if
+ # the branch doesn't exist, falling back to master)
+ client.run(
+ args='git ls-remote %s %s' % (url, branch),
+ stdout=out,
+ label='check for calamari branch %s existence' % branch
+ )
+ if len(out.getvalue()) == 0:
+ raise RuntimeError("Calamari branch %s doesn't exist" % branch)
+ client.run(args='git clone -b %s %s' % (branch, url))
+ yield
+ finally:
+ # sudo python setup.py develop may have left some root files around
+ client.run(args='sudo rm -rf calamari')
+
+
+@contextlib.contextmanager
+def write_info_yaml(cluster, client):
+ ''' write info.yaml to client for nosetests '''
+ try:
+ info = {
+ 'cluster': {
+ rem.name: {'roles': roles}
+ for rem, roles in cluster.remotes.iteritems()
+ }
+ }
+ misc.create_file(client, 'calamari/info.yaml',
+ data=yaml.safe_dump(info, default_flow_style=False))
+ yield
+ finally:
+ misc.delete_file(client, 'calamari/info.yaml')
+
+
+@contextlib.contextmanager
+def write_test_conf(client):
+ ''' write calamari/tests/test.conf to client for nosetests '''
+ try:
+ testconf = textwrap.dedent('''
+ [testing]
+
+ calamari_control = external
+ ceph_control = external
+ bootstrap = False
+ api_username = admin
+ api_password = admin
+ embedded_timeout_factor = 1
+ external_timeout_factor = 3
+ external_cluster_path = info.yaml
+ ''')
+ misc.create_file(client, 'calamari/tests/test.conf', data=testconf)
+ yield
+
+ finally:
+ misc.delete_file(client, 'calamari/tests/test.conf')
+
+
+@contextlib.contextmanager
+def prepare_nosetest_env(client):
+ try:
+ # extra dependencies that would be in the devmode venv
+ if client.os.package_type == 'rpm':
+ enable_epel(client, enable=True)
+ for package in EXTRA_NOSETEST_PKGS:
+ packaging.install_package(package, client)
+ if client.os.package_type == 'rpm':
+ enable_epel(client, enable=False)
+
+ # install nose itself into the calamari venv, force it in case it's
+ # already installed in the system, so we can invoke it by path without
+ # fear that it's not present
+ pip(client, 'nose', venv='/opt/calamari/venv', force=True)
+
+ # install a later version of requests into the venv as well
+ # (for precise)
+ pip(client, 'requests', venv='/opt/calamari/venv', force=True)
+
+ # link (setup.py develop) calamari/rest-api into the production venv
+ # because production does not include calamari_rest.management, needed
+ # for test_rest_api.py's ApiIntrospection
+ args = 'cd calamari/rest-api'.split() + [run.Raw(';')] + \
+ 'sudo /opt/calamari/venv/bin/python setup.py develop'.split()
+ client.run(args=args)
+
+ # because, at least in Python 2.6/Centos, site.py uses
+ # 'os.path.exists()' to process .pth file entries, and exists() uses
+ # access(2) to check for existence, all the paths leading up to
+ # $HOME/calamari/rest-api need to be searchable by all users of
+ # the package, which will include the WSGI/Django app, running
+ # as the Apache user. So make them all world-read-and-execute.
+ args = 'sudo chmod a+x'.split() + \
+ ['.', './calamari', './calamari/rest-api']
+ client.run(args=args)
+
+ # make one dummy request just to get the WSGI app to do
+ # all its log creation here, before the chmod below (I'm
+ # looking at you, graphite -- /var/log/calamari/info.log and
+ # /var/log/calamari/exception.log)
+ client.run(args='wget -q -O /dev/null http://localhost')
+
+ # /var/log/calamari/* is root-or-apache write-only
+ client.run(args='sudo chmod a+w /var/log/calamari/*')
+
+ yield
+
+ finally:
+ args = 'cd calamari/rest-api'.split() + [run.Raw(';')] + \
+ 'sudo /opt/calamari/venv/bin/python setup.py develop -u'.split()
+ client.run(args=args)
+ for pkg in ('nose', 'requests'):
+ pip(client, pkg, venv='/opt/calamari/venv', uninstall=True)
+ for package in EXTRA_NOSETEST_PKGS:
+ packaging.remove_package(package, client)
+
+
+@contextlib.contextmanager
+def run_nosetests(client):
+ ''' Actually run the tests '''
+ args = [
+ 'cd',
+ 'calamari',
+ run.Raw(';'),
+ 'CALAMARI_CONFIG=/etc/calamari/calamari.conf',
+ '/opt/calamari/venv/bin/nosetests',
+ '-v',
+ 'tests/',
+ ]
+ client.run(args=args)
+ yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run Calamari tests against an instance set up by 'calamari_server'.
+
+ -- clone the Calamari source into $HOME (see options)
+ -- write calamari/info.yaml describing the cluster
+ -- write calamari/tests/test.conf containing
+ 'external' for calamari_control and ceph_control
+ 'bootstrap = False' to disable test bootstrapping (installing minions)
+ no api_url necessary (inferred from client.0)
+ 'external_cluster_path = info.yaml'
+ -- modify the production Calamari install to allow test runs:
+ install nose in the venv
+ install EXTRA_NOSETEST_PKGS
+ link in, with setup.py develop, calamari_rest (for ApiIntrospection)
+ -- set CALAMARI_CONFIG to point to /etc/calamari/calamari.conf
+ -- nosetests -v tests/
+
+ Options are:
+ calamari_giturl: url from which to git clone calamari
+ (default: git://github.com/ceph/calamari)
+ calamari_branch: git branch of calamari to check out
+ (default: master)
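+
+ A job fragment using this task might look like the following (a sketch
+ only; the task names assume these modules are registered as
+ calamari_setup and calamari_nosetests):
+
+ tasks:
+ - calamari_setup:
+     test_image: <path or URL to iceball>
+ - calamari_nosetests:
+     calamari_branch: master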
+
+ Note: the tests must find a clean cluster, so don't forget to
+ set the crush default type appropriately, or install min_size OSD hosts
+ """
+ client0 = find_client0(ctx.cluster)
+ if client0 is None:
+ raise RuntimeError("must have client.0 role")
+
+ with contextutil.nested(
+ lambda: install_epel(client0),
+ lambda: install_extra_pkgs(client0),
+ lambda: clone_calamari(config, client0),
+ lambda: write_info_yaml(ctx.cluster, client0),
+ lambda: write_test_conf(client0),
+ lambda: prepare_nosetest_env(client0),
+ lambda: run_nosetests(client0),
+ ):
+ yield
--- /dev/null
+"""
+Calamari setup task
+"""
+import contextlib
+import logging
+import os
+import requests
+import shutil
+import webbrowser
+
+from cStringIO import StringIO
+from teuthology.orchestra import run
+from teuthology import contextutil
+from teuthology import misc
+
+log = logging.getLogger(__name__)
+
+
+DEFAULTS = {
+ 'version': 'v0.80.9',
+ 'test_image': None,
+ 'start_browser': False,
+ 'email': 'x@y.com',
+ 'no_epel': True,
+ 'calamari_user': 'admin',
+ 'calamari_password': 'admin',
+}
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Do the setup of a calamari server.
+
+ - calamari_setup:
+ version: 'v80.1'
+ test_image: <path to tarball or iso>
+
+ Options are (see DEFAULTS above):
+
+ version -- ceph version we are testing against
+ test_image -- Can be an HTTP URL, in which case fetch from this
+ http path; can also be local path
+ start_browser -- If True, start a browser. To be used by runs that will
+ bring up a browser quickly for human use. Set to False
+ for overnight suites that are testing for problems in
+ the installation itself
+ email -- email address for the user
+ no_epel -- indicates if we should remove epel files prior to yum
+ installations.
+ calamari_user -- user name to log into gui
+ calamari_password -- calamari user password
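+
+ A fuller example, fetching the iceball over http (all values below are
+ illustrative only):
+
+ - calamari_setup:
+     test_image: http://example.com/ICE-1.3.0.tar.gz
+     start_browser: false
+     no_epel: true
+     email: qa@example.com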
+ """
+ local_config = dict(DEFAULTS)
+ local_config.update(config)
+ config = local_config
+ cal_svr = None
+ for remote_, roles in ctx.cluster.remotes.items():
+ if 'client.0' in roles:
+ cal_svr = remote_
+ break
+ if not cal_svr:
+ raise RuntimeError('client.0 not found in roles')
+ with contextutil.nested(
+ lambda: adjust_yum_repos(ctx, cal_svr, config['no_epel']),
+ lambda: calamari_install(config, cal_svr),
+ lambda: ceph_install(ctx, cal_svr),
+ # do it again because ceph-deploy installed epel for centos
+ lambda: remove_epel(ctx, config['no_epel']),
+ lambda: calamari_connect(ctx, cal_svr),
+ lambda: browser(config['start_browser'], cal_svr.hostname),
+ ):
+ yield
+
+
+@contextlib.contextmanager
+def adjust_yum_repos(ctx, cal_svr, no_epel):
+ """
+ For each remote machine, fix the repos if yum is used.
+ """
+ ice_distro = str(cal_svr.os)
+ if ice_distro.startswith('rhel') or ice_distro.startswith('centos'):
+ if no_epel:
+ for remote in ctx.cluster.remotes:
+ fix_yum_repos(remote, ice_distro)
+ try:
+ yield
+ finally:
+ if ice_distro.startswith('rhel') or ice_distro.startswith('centos'):
+ if no_epel:
+ for remote in ctx.cluster.remotes:
+ restore_yum_repos(remote)
+
+
+def restore_yum_repos(remote):
+ """
+ Copy the old saved repo back in.
+ """
+ if remote.run(args=['sudo', 'rm', '-rf', '/etc/yum.repos.d']).exitstatus:
+ return False
+ if remote.run(args=['sudo', 'mv', '/etc/yum.repos.d.old',
+ '/etc/yum.repos.d']).exitstatus:
+ return False
+
+
+def fix_yum_repos(remote, distro):
+ """
+ For yum calamari installations, the repos.d directory should only
+ contain a repo file named rhel<version-number>.repo
+ """
+ if distro.startswith('centos'):
+ # hack alert: detour: install lttng for ceph
+ # this works because epel is preinstalled on the vpms
+ # this is not a generic solution
+ # this is here solely to test the one-off 1.3.0 release for centos6
+ remote.run(args="sudo yum -y install lttng-tools")
+ cmds = [
+ 'sudo mkdir /etc/yum.repos.d.old'.split(),
+ ['sudo', 'cp', run.Raw('/etc/yum.repos.d/*'),
+ '/etc/yum.repos.d.old'],
+ ['sudo', 'rm', run.Raw('/etc/yum.repos.d/epel*')],
+ ]
+ for cmd in cmds:
+ if remote.run(args=cmd).exitstatus:
+ return False
+ else:
+ cmds = [
+ 'sudo mv /etc/yum.repos.d /etc/yum.repos.d.old'.split(),
+ 'sudo mkdir /etc/yum.repos.d'.split(),
+ ]
+ for cmd in cmds:
+ if remote.run(args=cmd).exitstatus:
+ return False
+
+ # map "distroversion" from Remote.os to a tuple of
+ # (repo title, repo name descriptor, apt-mirror repo path chunk)
+ yum_repo_params = {
+ 'rhel 6.4': ('rhel6-server', 'RHEL', 'rhel6repo-server'),
+ 'rhel 6.5': ('rhel6-server', 'RHEL', 'rhel6repo-server'),
+ 'rhel 7.0': ('rhel7-server', 'RHEL', 'rhel7repo/server'),
+ }
+ repotitle, reponame, path = yum_repo_params[distro]
+ repopath = '/etc/yum.repos.d/%s.repo' % repotitle
+ # TO DO: Make this data configurable too
+ repo_contents = '\n'.join(
+ ('[%s]' % repotitle,
+ 'name=%s $releasever - $basearch' % reponame,
+ 'baseurl=http://apt-mirror.front.sepia.ceph.com/' + path,
+ 'gpgcheck=0',
+ 'enabled=1')
+ )
+ misc.sudo_write_file(remote, repopath, repo_contents)
+ cmds = [
+ 'sudo yum clean all'.split(),
+ 'sudo yum makecache'.split(),
+ ]
+ for cmd in cmds:
+ if remote.run(args=cmd).exitstatus:
+ return False
+ return True
+
+
+@contextlib.contextmanager
+def remove_epel(ctx, no_epel):
+ """
+ Just remove EPEL. No undo; assumed to be used after
+ adjust_yum_repos, relying on its state-save/restore.
+ """
+ if no_epel:
+ for remote in ctx.cluster.remotes:
+ if remote.os.name.startswith('centos'):
+ remote.run(args=[
+ 'sudo', 'rm', '-f', run.Raw('/etc/yum.repos.d/epel*')
+ ])
+ try:
+ yield
+ finally:
+ pass
+
+
+def get_iceball_with_http(url, destdir):
+ '''
+ Fetch the iceball over http into destdir; the URL may point at either a .tar.gz or an .iso.
+ '''
+ # stream=True means we don't download until copyfileobj below,
+ # and don't need a temp file
+ r = requests.get(url, stream=True)
+ if not r.ok:
+ raise RuntimeError("Failed to download %s", str(url))
+ filename = os.path.join(destdir, url.split('/')[-1])
+ with open(filename, 'wb') as f:
+ shutil.copyfileobj(r.raw, f)
+ log.info('saved %s as %s' % (url, filename))
+ return filename
+
+
+@contextlib.contextmanager
+def calamari_install(config, cal_svr):
+ """
+ Install calamari
+
+ The steps here are:
+ -- Get the iceball, locally or from http
+ -- Copy the iceball to the calamari server, and untar/mount it.
+ -- Run ice-setup on the calamari server.
+ -- Run calamari-ctl initialize.
+ """
+ client_id = str(cal_svr)
+ at_loc = client_id.find('@')
+ if at_loc > 0:
+ client_id = client_id[at_loc + 1:]
+
+ test_image = config['test_image']
+
+ if not test_image:
+ raise RuntimeError('Must supply test image')
+ log.info('calamari test image: %s' % test_image)
+ delete_iceball = False
+
+ if test_image.startswith('http'):
+ iceball_file = get_iceball_with_http(test_image, '/tmp')
+ delete_iceball = True
+ else:
+ iceball_file = test_image
+
+ remote_iceball_file = os.path.join('/tmp', os.path.split(iceball_file)[1])
+ cal_svr.put_file(iceball_file, remote_iceball_file)
+ if iceball_file.endswith('.tar.gz'): # XXX specify tar/iso in config?
+ icetype = 'tarball'
+ elif iceball_file.endswith('.iso'):
+ icetype = 'iso'
+ else:
+ raise RuntimeError("Can't handle iceball {0}".format(iceball_file))
+
+ if icetype == 'tarball':
+ ret = cal_svr.run(args=['gunzip', run.Raw('<'), remote_iceball_file,
+ run.Raw('|'), 'tar', 'xvf', run.Raw('-')])
+ if ret.exitstatus:
+ raise RuntimeError('remote iceball untar failed')
+ elif icetype == 'iso':
+ mountpoint = '/mnt/' # XXX create?
+ ret = cal_svr.run(
+ args=['sudo', 'mount', '-o', 'loop', '-r',
+ remote_iceball_file, mountpoint]
+ )
+
+ # install ice_setup package
+ args = {
+ 'deb': 'sudo dpkg -i /mnt/ice-setup*deb',
+ 'rpm': 'sudo yum -y localinstall /mnt/ice_setup*rpm'
+ }.get(cal_svr.system_type, None)
+ if not args:
+ raise RuntimeError('{0}: unknown system type'.format(cal_svr))
+ ret = cal_svr.run(args=args)
+ if ret.exitstatus:
+ raise RuntimeError('ice_setup package install failed')
+
+ # Run ice_setup
+ icesetdata = 'yes\n\n%s\nhttp\n' % client_id
+ ice_in = StringIO(icesetdata)
+ ice_out = StringIO()
+ if icetype == 'tarball':
+ args = 'sudo python ice_setup.py'
+ else:
+ args = 'sudo ice_setup -d /mnt'
+ ret = cal_svr.run(args=args, stdin=ice_in, stdout=ice_out)
+ log.debug(ice_out.getvalue())
+ if ret.exitstatus:
+ raise RuntimeError('ice_setup failed')
+
+ # Run calamari-ctl initialize.
+ icesetdata = '%s\n%s\n%s\n%s\n' % (
+ config['calamari_user'],
+ config['email'],
+ config['calamari_password'],
+ config['calamari_password'],
+ )
+ ice_in = StringIO(icesetdata)
+ ret = cal_svr.run(args=['sudo', 'calamari-ctl', 'initialize'],
+ stdin=ice_in, stdout=ice_out)
+ log.debug(ice_out.getvalue())
+ if ret.exitstatus:
+ raise RuntimeError('calamari-ctl initialize failed')
+ try:
+ yield
+ finally:
+ log.info('Cleaning up after Calamari installation')
+ if icetype == 'iso':
+ cal_svr.run(args=['sudo', 'umount', mountpoint])
+ if delete_iceball:
+ os.unlink(iceball_file)
+
+
+@contextlib.contextmanager
+def ceph_install(ctx, cal_svr):
+ """
+ Install ceph if ceph was not previously installed by teuthology. This
+ code tests the case where calamari is installed on a brand new system.
+ """
+ loc_inst = False
+ if 'install' not in [x.keys()[0] for x in ctx.config['tasks']]:
+ loc_inst = True
+ ret = deploy_ceph(ctx, cal_svr)
+ if ret:
+ raise RuntimeError('ceph installs failed')
+ try:
+ yield
+ finally:
+ if loc_inst:
+ if not undeploy_ceph(ctx, cal_svr):
+ log.error('Cleanup of Ceph installed by Calamari-setup failed')
+
+
+def deploy_ceph(ctx, cal_svr):
+ """
+ Perform the ceph-deploy actions needed to bring up a Ceph cluster. This
+ test is needed to check the ceph-deploy that comes with the calamari
+ package.
+ """
+ osd_to_name = {}
+ all_machines = set()
+ all_mons = set()
+ all_osds = set()
+
+ # collect which remotes are osds and which are mons
+ for remote in ctx.cluster.remotes:
+ all_machines.add(remote.shortname)
+ roles = ctx.cluster.remotes[remote]
+ for role in roles:
+ daemon_type, number = role.split('.')
+ if daemon_type == 'osd':
+ all_osds.add(remote.shortname)
+ osd_to_name[number] = remote.shortname
+ if daemon_type == 'mon':
+ all_mons.add(remote.shortname)
+
+ # figure out whether we're in "1.3+" mode: prior to 1.3, there was
+ # only one Ceph repo, and it was all installed on every Ceph host.
+ # with 1.3, we've split that into MON and OSD repos (in order to
+ # be able to separately track subscriptions per-node). This
+ # requires new switches to ceph-deploy to select which locally-served
+ # repo is connected to which cluster host.
+ #
+ # (TODO: A further issue is that the installation/setup may not have
+ # created local repos at all, but that is the subject of a future
+ # change.)
+
+ r = cal_svr.run(args='/usr/bin/test -d /mnt/MON', check_status=False)
+ use_install_repo = (r.exitstatus == 0)
+
+ # pre-1.3:
+ # ceph-deploy new <all_mons>
+ # ceph-deploy install <all_machines>
+ # ceph-deploy mon create-initial
+ #
+ # 1.3 and later:
+ # ceph-deploy new <all_mons>
+ # ceph-deploy install --repo --release=ceph-mon <all_mons>
+ # ceph-deploy install <all_mons>
+ # ceph-deploy install --repo --release=ceph-osd <all_osds>
+ # ceph-deploy install <all_osds>
+ # ceph-deploy mon create-initial
+ #
+ # one might think the install <all_mons> and install <all_osds>
+ # commands would need --mon and --osd, but #12147 has not yet
+ # made it into RHCS 1.3.0; since the package split also hasn't
+ # landed, we can avoid using the flag and avoid the bug.
+
+ cmds = ['ceph-deploy new ' + ' '.join(all_mons)]
+
+ if use_install_repo:
+ cmds.append('ceph-deploy repo ceph-mon ' +
+ ' '.join(all_mons))
+ cmds.append('ceph-deploy install --no-adjust-repos --mon ' +
+ ' '.join(all_mons))
+ cmds.append('ceph-deploy repo ceph-osd ' +
+ ' '.join(all_osds))
+ cmds.append('ceph-deploy install --no-adjust-repos --osd ' +
+ ' '.join(all_osds))
+ # We tell users to use `hostname` in our docs. Do the same here.
+ cmds.append('ceph-deploy install --no-adjust-repos --cli `hostname`')
+ else:
+ cmds.append('ceph-deploy install ' + ' '.join(all_machines))
+
+ cmds.append('ceph-deploy mon create-initial')
+
+ for cmd in cmds:
+ cal_svr.run(args=cmd).exitstatus
+
+ disk_labels = '_dcba'
+ # NEEDS WORK assumes disks start with vd (need to check this somewhere)
+ for cmd_pts in [['disk', 'zap'], ['osd', 'prepare'], ['osd', 'activate']]:
+ mach_osd_cnt = {}
+ for osdn in osd_to_name:
+ osd_mac = osd_to_name[osdn]
+ mach_osd_cnt[osd_mac] = mach_osd_cnt.get(osd_mac, 0) + 1
+ arg_list = ['ceph-deploy']
+ arg_list.extend(cmd_pts)
+ disk_id = '%s:vd%s' % (osd_to_name[osdn],
+ disk_labels[mach_osd_cnt[osd_mac]])
+ if 'activate' in cmd_pts:
+ disk_id += '1'
+ arg_list.append(disk_id)
+ cal_svr.run(args=arg_list).exitstatus
+
+
+def undeploy_ceph(ctx, cal_svr):
+ """
+ Cleanup deployment of ceph.
+ """
+ all_machines = []
+ ret = True
+ for remote in ctx.cluster.remotes:
+ roles = ctx.cluster.remotes[remote]
+ if (
+ not any('osd' in role for role in roles) and
+ not any('mon' in role for role in roles)
+ ):
+ continue
+ ret &= remote.run(
+ args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
+ 'sudo', 'service', 'ceph', 'stop']
+ ).exitstatus
+ all_machines.append(remote.shortname)
+ all_machines = set(all_machines)
+ cmd1 = ['ceph-deploy', 'uninstall']
+ cmd1.extend(all_machines)
+ ret &= cal_svr.run(args=cmd1).exitstatus
+ cmd2 = ['ceph-deploy', 'purge']
+ cmd2.extend(all_machines)
+ ret &= cal_svr.run(args=cmd2).exitstatus
+ for remote in ctx.cluster.remotes:
+ ret &= remote.run(args=['sudo', 'rm', '-rf',
+ '.ssh/known_hosts']).exitstatus
+ return ret
+
+
+@contextlib.contextmanager
+def calamari_connect(ctx, cal_svr):
+ """
+ Connect calamari to the ceph nodes.
+ """
+ connects = ['ceph-deploy', 'calamari', 'connect']
+ for machine_info in ctx.cluster.remotes:
+ if 'client.0' not in ctx.cluster.remotes[machine_info]:
+ connects.append(machine_info.shortname)
+ ret = cal_svr.run(args=connects)
+ if ret.exitstatus:
+ raise RuntimeError('calamari connect failed')
+ try:
+ yield
+ finally:
+ log.info('Calamari test terminating')
+
+
+@contextlib.contextmanager
+def browser(start_browser, web_page):
+ """
+ Bring up a browser, if wanted.
+ """
+ if start_browser:
+ webbrowser.open('http://%s' % web_page)
+ try:
+ yield
+ finally:
+ if start_browser:
+ log.info('Web browser support terminating')
--- /dev/null
+"""
+Ceph cluster task.
+
+Handle the setup, starting, and clean-up of a Ceph cluster.
+"""
+from cStringIO import StringIO
+
+import argparse
+import contextlib
+import errno
+import logging
+import os
+import json
+import time
+import gevent
+import socket
+
+from ceph_manager import CephManager, write_conf
+from tasks.cephfs.filesystem import Filesystem
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology import exceptions
+from teuthology.orchestra import run
+import ceph_client as cclient
+from teuthology.orchestra.daemon import DaemonGroup
+
+CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
+
+log = logging.getLogger(__name__)
+
+
+def generate_caps(type_):
+ """
+ Yield the default capability arguments for the given system type
+ (essentially a subset of possible role values). Valid types are osd,
+ mgr, mds and client.
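+
+ For example, type_='mds' yields the argument sequence
+ --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'
+ (pair ordering follows the defaults dict and is not significant).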
+ """
+ defaults = dict(
+ osd=dict(
+ mon='allow *',
+ osd='allow *',
+ ),
+ mgr=dict(
+ mon='allow *',
+ ),
+ mds=dict(
+ mon='allow *',
+ osd='allow *',
+ mds='allow',
+ ),
+ client=dict(
+ mon='allow rw',
+ osd='allow rwx',
+ mds='allow',
+ ),
+ )
+ for subsystem, capability in defaults[type_].items():
+ yield '--cap'
+ yield subsystem
+ yield capability
+
+
+@contextlib.contextmanager
+def ceph_log(ctx, config):
+ """
+ Create /var/log/ceph log directory that is open to everyone.
+ Add valgrind and profiling-logger directories.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ log.info('Making ceph log dir writeable by non-root...')
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'sudo',
+ 'chmod',
+ '777',
+ '/var/log/ceph',
+ ],
+ wait=False,
+ )
+ )
+ log.info('Disabling ceph logrotate...')
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'sudo',
+ 'rm', '-f', '--',
+ '/etc/logrotate.d/ceph',
+ ],
+ wait=False,
+ )
+ )
+ log.info('Creating extra log directories...')
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'sudo',
+ 'install', '-d', '-m0777', '--',
+ '/var/log/ceph/valgrind',
+ '/var/log/ceph/profiling-logger',
+ ],
+ wait=False,
+ )
+ )
+
+ class Rotater(object):
+ stop_event = gevent.event.Event()
+
+ def invoke_logrotate(self):
+ # 1) install ceph-test.conf in /etc/logrotate.d
+ # 2) continuously loop over logrotate invocation with ceph-test.conf
+ while not self.stop_event.is_set():
+ self.stop_event.wait(timeout=30)
+ try:
+ run.wait(
+ ctx.cluster.run(
+ args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
+ ],
+ wait=False,
+ )
+ )
+ except exceptions.ConnectionLostError as e:
+ # Some tests may power off nodes during test, in which
+ # case we will see connection errors that we should ignore.
+ log.debug("Missed logrotate, node '{0}' is offline".format(
+ e.node))
+ except EOFError as e:
+ # Paramiko sometimes raises this when it fails to
+ # connect to a node during open_session. As with
+ # ConnectionLostError, we ignore this because nodes
+ # are allowed to get power cycled during tests.
+ log.debug("Missed logrotate, EOFError")
+ except socket.error as e:
+ if e.errno == errno.EHOSTUNREACH:
+ log.debug("Missed logrotate, host unreachable")
+ else:
+ raise
+
+ def begin(self):
+ self.thread = gevent.spawn(self.invoke_logrotate)
+
+ def end(self):
+ self.stop_event.set()
+ self.thread.get()
+
+ def write_rotate_conf(ctx, daemons):
+ testdir = teuthology.get_testdir(ctx)
+ rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
+ with file(rotate_conf_path, 'rb') as f:
+ conf = ""
+ for daemon, size in daemons.iteritems():
+ log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
+ conf += f.read().format(daemon_type=daemon, max_size=size)
+ f.seek(0, 0)
+
+ for remote in ctx.cluster.remotes.iterkeys():
+ teuthology.write_file(remote=remote,
+ path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
+ data=StringIO(conf)
+ )
+ remote.run(
+ args=[
+ 'sudo',
+ 'mv',
+ '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
+ '/etc/logrotate.d/ceph-test.conf',
+ run.Raw('&&'),
+ 'sudo',
+ 'chmod',
+ '0644',
+ '/etc/logrotate.d/ceph-test.conf',
+ run.Raw('&&'),
+ 'sudo',
+ 'chown',
+ 'root.root',
+ '/etc/logrotate.d/ceph-test.conf'
+ ]
+ )
+ remote.chcon('/etc/logrotate.d/ceph-test.conf',
+ 'system_u:object_r:etc_t:s0')
+
+ if ctx.config.get('log-rotate'):
+ daemons = ctx.config.get('log-rotate')
+ log.info('Setting up log rotation with ' + str(daemons))
+ write_rotate_conf(ctx, daemons)
+ logrotater = Rotater()
+ logrotater.begin()
+ try:
+ yield
+
+ finally:
+ if ctx.config.get('log-rotate'):
+ log.info('Shutting down logrotate')
+ logrotater.end()
+ ctx.cluster.run(
+ args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
+ ]
+ )
+ if ctx.archive is not None and \
+ not (ctx.config.get('archive-on-error') and ctx.summary['success']):
+ # and logs
+ log.info('Compressing logs...')
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'sudo',
+ 'find',
+ '/var/log/ceph',
+ '-name',
+ '*.log',
+ '-print0',
+ run.Raw('|'),
+ 'sudo',
+ 'xargs',
+ '-0',
+ '--no-run-if-empty',
+ '--',
+ 'gzip',
+ '--',
+ ],
+ wait=False,
+ ),
+ )
+
+ log.info('Archiving logs...')
+ path = os.path.join(ctx.archive, 'remote')
+ os.makedirs(path)
+ for remote in ctx.cluster.remotes.iterkeys():
+ sub = os.path.join(path, remote.shortname)
+ os.makedirs(sub)
+ teuthology.pull_directory(remote, '/var/log/ceph',
+ os.path.join(sub, 'log'))
+
+
+def assign_devs(roles, devs):
+ """
+ Create a dictionary of devs indexed by roles
+
+ :param roles: List of roles
+ :param devs: Corresponding list of devices.
+ :returns: Dictionary of devs indexed by roles.
+ """
+ return dict(zip(roles, devs))
+
+
+@contextlib.contextmanager
+def valgrind_post(ctx, config):
+ """
+ After the tests run, look through all the valgrind logs. Exceptions are raised
+ if textual errors occurred in the logs, or if valgrind exceptions were detected in
+ the logs.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ try:
+ yield
+ finally:
+ lookup_procs = list()
+ log.info('Checking for errors in any valgrind logs...')
+ for remote in ctx.cluster.remotes.iterkeys():
+ # look at valgrind logs for each node
+ proc = remote.run(
+ args=[
+ 'sudo',
+ 'zgrep',
+ '<kind>',
+ run.Raw('/var/log/ceph/valgrind/*'),
+ '/dev/null', # include a second file so that we always get a filename prefix on the output
+ run.Raw('|'),
+ 'sort',
+ run.Raw('|'),
+ 'uniq',
+ ],
+ wait=False,
+ check_status=False,
+ stdout=StringIO(),
+ )
+ lookup_procs.append((proc, remote))
+
+ valgrind_exception = None
+ for (proc, remote) in lookup_procs:
+ proc.wait()
+ out = proc.stdout.getvalue()
+ for line in out.split('\n'):
+ if line == '':
+ continue
+ try:
+ (file, kind) = line.split(':')
+ except Exception:
+ log.error('failed to split line %s', line)
+ raise
+ log.debug('file %s kind %s', file, kind)
+ if (file.find('mds') >= 0) and kind.find('Lost') > 0:
+ continue
+ log.error('saw valgrind issue %s in %s', kind, file)
+ valgrind_exception = Exception('saw valgrind issues')
+
+ if config.get('expect_valgrind_errors'):
+ if not valgrind_exception:
+ raise Exception('expected valgrind issues and found none')
+ else:
+ if valgrind_exception:
+ raise valgrind_exception
+
+
+@contextlib.contextmanager
+def crush_setup(ctx, config):
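+ """
+ Apply the CRUSH tunables profile named by the 'crush_tunables' config
+ option (default 'default') by running 'ceph osd crush tunables' on the
+ first monitor of the cluster.
+
+ :param ctx: Context
+ :param config: Configuration
+ """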
+ cluster_name = config['cluster']
+ first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
+ (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ profile = config.get('crush_tunables', 'default')
+ log.info('Setting crush tunables to %s', profile)
+ mon_remote.run(
+ args=['sudo', 'ceph', '--cluster', cluster_name,
+ 'osd', 'crush', 'tunables', profile])
+ yield
+
+
+@contextlib.contextmanager
+def cephfs_setup(ctx, config):
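+ """
+ If any MDS roles are present, create a CephFS filesystem, enable
+ multiple active MDSs and directory fragmentation, and set max_mds to
+ the number of active (non-standby) MDS roles.
+
+ :param ctx: Context
+ :param config: Configuration
+ """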
+ cluster_name = config['cluster']
+ testdir = teuthology.get_testdir(ctx)
+ coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+
+ first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
+ (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
+ # If there are any MDSs, then create a filesystem for them to use
+ # Do this last because requires mon cluster to be up and running
+ if mdss.remotes:
+ log.info('Setting up CephFS filesystem...')
+
+ Filesystem(ctx, create='cephfs') # TODO: make Filesystem cluster-aware
+
+ is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
+ all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
+ num_active = len([r for r in all_roles if is_active_mds(r)])
+ mon_remote.run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph', 'mds', 'set', 'allow_multimds', 'true',
+ '--yes-i-really-mean-it'],
+ check_status=False, # probably old version, upgrade test
+ )
+ mon_remote.run(args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph',
+ '--cluster', cluster_name,
+ 'mds', 'set_max_mds', str(num_active)])
+ mon_remote.run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph', 'mds', 'set', 'allow_dirfrags', 'true',
+ '--yes-i-really-mean-it'],
+ check_status=False, # probably old version, upgrade test
+ )
+
+ yield
+
+
+@contextlib.contextmanager
+def cluster(ctx, config):
+ """
+ Handle the creation and removal of a ceph cluster.
+
+ On startup:
+ Create directories needed for the cluster.
+ Create remote journals for all osds.
+ Create and set keyring.
+ Copy the monmap to the test systems.
+ Setup mon nodes.
+ Setup mds nodes.
+ Mkfs osd nodes.
+ Add keyring information to monmaps
+ Mkfs mon nodes.
+
+ On exit:
+ If errors occurred, extract a failure message and store in ctx.summary.
+ Unmount all test files and temporary journaling files.
+ Save the monitor information and archive all ceph logs.
+ Cleanup the keyring setup, and remove all monitor map and data files left over.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ if ctx.config.get('use_existing_cluster', False) is True:
+ log.info("'use_existing_cluster' is true; skipping cluster creation")
+ yield
+ return
+
+ testdir = teuthology.get_testdir(ctx)
+ cluster_name = config['cluster']
+ data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
+ log.info('Creating ceph cluster %s...', cluster_name)
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'install', '-d', '-m0755', '--',
+ data_dir,
+ ],
+ wait=False,
+ )
+ )
+
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'sudo',
+ 'install', '-d', '-m0777', '--', '/var/run/ceph',
+ ],
+ wait=False,
+ )
+ )
+
+ devs_to_clean = {}
+ remote_to_roles_to_devs = {}
+ remote_to_roles_to_journals = {}
+ osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
+ for remote, roles_for_host in osds.remotes.iteritems():
+ devs = teuthology.get_scratch_devices(remote)
+ roles_to_devs = {}
+ roles_to_journals = {}
+ if config.get('fs'):
+ log.info('fs option selected, checking for scratch devs')
+ log.info('found devs: %s' % (str(devs),))
+ devs_id_map = teuthology.get_wwn_id_map(remote, devs)
+ iddevs = devs_id_map.values()
+ roles_to_devs = assign_devs(
+ teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
+ )
+ if len(roles_to_devs) < len(iddevs):
+ iddevs = iddevs[len(roles_to_devs):]
+ devs_to_clean[remote] = []
+
+ if config.get('block_journal'):
+ log.info('block journal enabled')
+ roles_to_journals = assign_devs(
+ teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
+ )
+ log.info('journal map: %s', roles_to_journals)
+
+ if config.get('tmpfs_journal'):
+ log.info('tmpfs journal enabled')
+ roles_to_journals = {}
+ remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
+ for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
+ tmpfs = '/mnt/' + role
+ roles_to_journals[role] = tmpfs
+ remote.run(args=['truncate', '-s', '1500M', tmpfs])
+ log.info('journal map: %s', roles_to_journals)
+
+ log.info('dev map: %s' % (str(roles_to_devs),))
+ remote_to_roles_to_devs[remote] = roles_to_devs
+ remote_to_roles_to_journals[remote] = roles_to_journals
+
+ log.info('Generating config...')
+ remotes_and_roles = ctx.cluster.remotes.items()
+ roles = [role_list for (remote, role_list) in remotes_and_roles]
+ ips = [host for (host, port) in
+ (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
+ conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
+ for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
+ for role, journal in roles_to_journals.iteritems():
+ name = teuthology.ceph_role(role)
+ if name not in conf:
+ conf[name] = {}
+ conf[name]['osd journal'] = journal
+ for section, keys in config['conf'].iteritems():
+ for key, value in keys.iteritems():
+ log.info("[%s] %s = %s" % (section, key, value))
+ if section not in conf:
+ conf[section] = {}
+ conf[section][key] = value
+
+ if config.get('tmpfs_journal'):
+ conf['journal dio'] = False
+
+ if not hasattr(ctx, 'ceph'):
+ ctx.ceph = {}
+ ctx.ceph[cluster_name] = argparse.Namespace()
+ ctx.ceph[cluster_name].conf = conf
+
+ default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
+ keyring_path = config.get('keyring_path', default_keyring)
+
+ coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+
+ firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+
+ log.info('Setting up %s...' % firstmon)
+ ctx.cluster.only(firstmon).run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-authtool',
+ '--create-keyring',
+ keyring_path,
+ ],
+ )
+ ctx.cluster.only(firstmon).run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-authtool',
+ '--gen-key',
+ '--name=mon.',
+ keyring_path,
+ ],
+ )
+ ctx.cluster.only(firstmon).run(
+ args=[
+ 'sudo',
+ 'chmod',
+ '0644',
+ keyring_path,
+ ],
+ )
+ (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+ monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
+ cluster=cluster_name)
+ fsid = teuthology.create_simple_monmap(
+ ctx,
+ remote=mon0_remote,
+ conf=conf,
+ path=monmap_path,
+ )
+ if 'global' not in conf:
+ conf['global'] = {}
+ conf['global']['fsid'] = fsid
+
+ default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
+ conf_path = config.get('conf_path', default_conf_path)
+ log.info('Writing %s for FSID %s...' % (conf_path, fsid))
+ write_conf(ctx, conf_path, cluster_name)
+
+ log.info('Creating admin key on %s...' % firstmon)
+ ctx.cluster.only(firstmon).run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-authtool',
+ '--gen-key',
+ '--name=client.admin',
+ '--set-uid=0',
+ '--cap', 'mon', 'allow *',
+ '--cap', 'osd', 'allow *',
+ '--cap', 'mds', 'allow *',
+ keyring_path,
+ ],
+ )
+
+ log.info('Copying monmap to all nodes...')
+ keyring = teuthology.get_file(
+ remote=mon0_remote,
+ path=keyring_path,
+ )
+ monmap = teuthology.get_file(
+ remote=mon0_remote,
+ path=monmap_path,
+ )
+
+ for rem in ctx.cluster.remotes.iterkeys():
+ # copy mon key and initial monmap
+ log.info('Sending monmap to node {remote}'.format(remote=rem))
+ teuthology.sudo_write_file(
+ remote=rem,
+ path=keyring_path,
+ data=keyring,
+ perms='0644'
+ )
+ teuthology.write_file(
+ remote=rem,
+ path=monmap_path,
+ data=monmap,
+ )
+
+ log.info('Setting up mon nodes...')
+ mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
+ osdmap_path = '{tdir}/{cluster}.osdmap'.format(tdir=testdir,
+ cluster=cluster_name)
+ run.wait(
+ mons.run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'osdmaptool',
+ '-c', conf_path,
+ '--clobber',
+ '--createsimple', '{num:d}'.format(
+ num=teuthology.num_instances_of_type(ctx.cluster, 'osd',
+ cluster_name),
+ ),
+ osdmap_path,
+ '--pg_bits', '2',
+ '--pgp_bits', '4',
+ ],
+ wait=False,
+ ),
+ )
+
+ log.info('Setting up mgr nodes...')
+ mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
+ for remote, roles_for_host in mgrs.remotes.iteritems():
+ for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
+ cluster_name):
+ _, _, id_ = teuthology.split_role(role)
+ mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
+ cluster=cluster_name,
+ id=id_,
+ )
+ remote.run(
+ args=[
+ 'sudo',
+ 'mkdir',
+ '-p',
+ mgr_dir,
+ run.Raw('&&'),
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-authtool',
+ '--create-keyring',
+ '--gen-key',
+ '--name=mgr.{id}'.format(id=id_),
+ mgr_dir + '/keyring',
+ ],
+ )
+
+ log.info('Setting up mds nodes...')
+ mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
+ for remote, roles_for_host in mdss.remotes.iteritems():
+ for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
+ cluster_name):
+ _, _, id_ = teuthology.split_role(role)
+ mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
+ cluster=cluster_name,
+ id=id_,
+ )
+ remote.run(
+ args=[
+ 'sudo',
+ 'mkdir',
+ '-p',
+ mds_dir,
+ run.Raw('&&'),
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-authtool',
+ '--create-keyring',
+ '--gen-key',
+ '--name=mds.{id}'.format(id=id_),
+ mds_dir + '/keyring',
+ ],
+ )
+
+ cclient.create_keyring(ctx, cluster_name)
+ log.info('Running mkfs on osd nodes...')
+
+ if not hasattr(ctx, 'disk_config'):
+ ctx.disk_config = argparse.Namespace()
+ if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
+ ctx.disk_config.remote_to_roles_to_dev = {}
+ if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
+ ctx.disk_config.remote_to_roles_to_journals = {}
+ if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
+ ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
+ if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
+ ctx.disk_config.remote_to_roles_to_dev_fstype = {}
+
+ teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
+ teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
+
+ log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
+ for remote, roles_for_host in osds.remotes.iteritems():
+ roles_to_devs = remote_to_roles_to_devs[remote]
+ roles_to_journals = remote_to_roles_to_journals[remote]
+
+ for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
+ _, _, id_ = teuthology.split_role(role)
+ mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
+ remote.run(
+ args=[
+ 'sudo',
+ 'mkdir',
+ '-p',
+ mnt_point,
+ ])
+ log.info(str(roles_to_journals))
+ log.info(role)
+ if roles_to_devs.get(role):
+ dev = roles_to_devs[role]
+ fs = config.get('fs')
+ package = None
+ mkfs_options = config.get('mkfs_options')
+ mount_options = config.get('mount_options')
+ if fs == 'btrfs':
+ # package = 'btrfs-tools'
+ if mount_options is None:
+ mount_options = ['noatime', 'user_subvol_rm_allowed']
+ if mkfs_options is None:
+ mkfs_options = ['-m', 'single',
+ '-l', '32768',
+ '-n', '32768']
+ if fs == 'xfs':
+ # package = 'xfsprogs'
+ if mount_options is None:
+ mount_options = ['noatime']
+ if mkfs_options is None:
+ mkfs_options = ['-f', '-i', 'size=2048']
+ if fs == 'ext4' or fs == 'ext3':
+ if mount_options is None:
+ mount_options = ['noatime', 'user_xattr']
+
+ if mount_options is None:
+ mount_options = []
+ if mkfs_options is None:
+ mkfs_options = []
+ mkfs = ['mkfs.%s' % fs] + mkfs_options
+ log.info('%s on %s on %s' % (mkfs, dev, remote))
+ if package is not None:
+ remote.run(
+ args=[
+ 'sudo',
+ 'apt-get', 'install', '-y', package
+ ],
+ stdout=StringIO(),
+ )
+
+ try:
+ remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
+ except run.CommandFailedError:
+ # Newer btrfs-tools doesn't prompt for overwrite; retry with -f
+ if '-f' not in mkfs_options:
+ mkfs_options.append('-f')
+ mkfs = ['mkfs.%s' % fs] + mkfs_options
+ log.info('%s on %s on %s' % (mkfs, dev, remote))
+ remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
+
+ log.info('mount %s on %s -o %s' % (dev, remote,
+ ','.join(mount_options)))
+ remote.run(
+ args=[
+ 'sudo',
+ 'mount',
+ '-t', fs,
+ '-o', ','.join(mount_options),
+ dev,
+ mnt_point,
+ ]
+ )
+ remote.run(
+ args=[
+ 'sudo', '/sbin/restorecon', mnt_point,
+ ],
+ check_status=False,
+ )
+ if remote not in ctx.disk_config.remote_to_roles_to_dev_mount_options:
+ ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
+ ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
+ if remote not in ctx.disk_config.remote_to_roles_to_dev_fstype:
+ ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
+ ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
+ devs_to_clean[remote].append(mnt_point)
+
+ for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
+ _, _, id_ = teuthology.split_role(role)
+ remote.run(
+ args=[
+ 'sudo',
+ 'MALLOC_CHECK_=3',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-osd',
+ '--cluster',
+ cluster_name,
+ '--mkfs',
+ '--mkkey',
+ '-i', id_,
+ '--monmap', monmap_path,
+ ],
+ )
+
+ log.info('Reading keys from all nodes...')
+ keys_fp = StringIO()
+ keys = []
+ for remote, roles_for_host in ctx.cluster.remotes.iteritems():
+ for type_ in ['mgr', 'mds', 'osd']:
+ for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
+ _, _, id_ = teuthology.split_role(role)
+ data = teuthology.get_file(
+ remote=remote,
+ path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
+ type=type_,
+ id=id_,
+ cluster=cluster_name,
+ ),
+ sudo=True,
+ )
+ keys.append((type_, id_, data))
+ keys_fp.write(data)
+ for remote, roles_for_host in ctx.cluster.remotes.iteritems():
+ for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
+ _, _, id_ = teuthology.split_role(role)
+ data = teuthology.get_file(
+ remote=remote,
+ path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
+ )
+ keys.append(('client', id_, data))
+ keys_fp.write(data)
+
+ log.info('Adding keys to all mons...')
+ writes = mons.run(
+ args=[
+ 'sudo', 'tee', '-a',
+ keyring_path,
+ ],
+ stdin=run.PIPE,
+ wait=False,
+ stdout=StringIO(),
+ )
+ keys_fp.seek(0)
+ teuthology.feed_many_stdins_and_close(keys_fp, writes)
+ run.wait(writes)
+ for type_, id_, data in keys:
+ run.wait(
+ mons.run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-authtool',
+ keyring_path,
+ '--name={type}.{id}'.format(
+ type=type_,
+ id=id_,
+ ),
+ ] + list(generate_caps(type_)),
+ wait=False,
+ ),
+ )
+
+ log.info('Running mkfs on mon nodes...')
+ for remote, roles_for_host in mons.remotes.iteritems():
+ for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
+ _, _, id_ = teuthology.split_role(role)
+ remote.run(
+ args=[
+ 'sudo',
+ 'mkdir',
+ '-p',
+ '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
+ ],
+ )
+ remote.run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-mon',
+ '--cluster', cluster_name,
+ '--mkfs',
+ '-i', id_,
+ '--monmap', monmap_path,
+ '--osdmap', osdmap_path,
+ '--keyring', keyring_path,
+ ],
+ )
+
+ run.wait(
+ mons.run(
+ args=[
+ 'rm',
+ '--',
+ monmap_path,
+ osdmap_path,
+ ],
+ wait=False,
+ ),
+ )
+
+ try:
+ yield
+ except Exception:
+ # we need to know this below
+ ctx.summary['success'] = False
+ raise
+ finally:
+ (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+
+ log.info('Checking cluster log for badness...')
+
+ def first_in_ceph_log(pattern, excludes):
+ """
+ Find the first occurrence of the pattern specified in the Ceph log.
+ Returns None if none found.
+
+ :param pattern: Pattern scanned for.
+ :param excludes: Patterns to ignore.
+ :return: First line of text (or None if not found)
+ """
+ args = [
+ 'sudo',
+ 'egrep', pattern,
+ '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
+ ]
+ for exclude in excludes:
+ args.extend([run.Raw('|'), 'egrep', '-v', exclude])
+ args.extend([
+ run.Raw('|'), 'head', '-n', '1',
+ ])
+ r = mon0_remote.run(
+ stdout=StringIO(),
+ args=args,
+ )
+ stdout = r.stdout.getvalue()
+ if stdout != '':
+ return stdout
+ return None
+
+ if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
+ config['log_whitelist']) is not None:
+ log.warning('Found errors (ERR|WRN|SEC) in cluster log')
+ ctx.summary['success'] = False
+ # use the most severe problem as the failure reason
+ if 'failure_reason' not in ctx.summary:
+ for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
+ match = first_in_ceph_log(pattern, config['log_whitelist'])
+ if match is not None:
+ ctx.summary['failure_reason'] = \
+ '"{match}" in cluster log'.format(
+ match=match.rstrip('\n'),
+ )
+ break
+
+ for remote, dirs in devs_to_clean.iteritems():
+ for dir_ in dirs:
+ log.info('Unmounting %s on %s' % (dir_, remote))
+ try:
+ remote.run(
+ args=[
+ 'sync',
+ run.Raw('&&'),
+ 'sudo',
+ 'umount',
+ '-f',
+ dir_
+ ]
+ )
+ except Exception as e:
+ remote.run(args=[
+ 'sudo',
+ run.Raw('PATH=/usr/sbin:$PATH'),
+ 'lsof',
+ run.Raw(';'),
+ 'ps', 'auxf',
+ ])
+ raise e
+
+ if config.get('tmpfs_journal'):
+ log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
+ for remote, roles_for_host in osds.remotes.iteritems():
+ remote.run(
+ args=['sudo', 'umount', '-f', '/mnt'],
+ check_status=False,
+ )
+
+ if ctx.archive is not None and \
+ not (ctx.config.get('archive-on-error') and ctx.summary['success']):
+
+ # archive mon data, too
+ log.info('Archiving mon data...')
+ path = os.path.join(ctx.archive, 'data')
+ try:
+ os.makedirs(path)
+ except OSError as e:
+ if e.errno == errno.EEXIST:
+ pass
+ else:
+ raise
+ for remote, roles in mons.remotes.iteritems():
+ for role in roles:
+ is_mon = teuthology.is_type('mon', cluster_name)
+ if is_mon(role):
+ _, _, id_ = teuthology.split_role(role)
+ mon_dir = '/var/lib/ceph/mon/' + \
+ '{0}-{1}'.format(cluster_name, id_)
+ teuthology.pull_directory_tarball(
+ remote,
+ mon_dir,
+ path + '/' + role + '.tgz')
+
+ log.info('Cleaning ceph cluster...')
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'sudo',
+ 'rm',
+ '-rf',
+ '--',
+ conf_path,
+ keyring_path,
+ data_dir,
+ monmap_path,
+ osdmap_path,
+ run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
+ ],
+ wait=False,
+ ),
+ )
+
+
+def osd_scrub_pgs(ctx, config):
+ """
+ Scrub pgs when we exit.
+
+ First make sure all pgs are active and clean.
+ Next scrub all osds.
+ Then periodically check until all pgs have scrub time stamps that
+ indicate the last scrub completed. Time out if no progress is made
+ here after two minutes.
+ """
+ retries = 12
+ delays = 10
+ cluster_name = config['cluster']
+ manager = ctx.managers[cluster_name]
+ all_clean = False
+ for _ in range(0, retries):
+ stats = manager.get_pg_stats()
+ states = [stat['state'] for stat in stats]
+ if len(set(states)) == 1 and states[0] == 'active+clean':
+ all_clean = True
+ break
+ log.info("Waiting for all osds to be active and clean.")
+ time.sleep(delays)
+ if not all_clean:
+ log.info("Scrubbing terminated -- not all pgs were active and clean.")
+ return
+ check_time_now = time.localtime()
+ time.sleep(1)
+ all_roles = teuthology.all_roles(ctx.cluster)
+ for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
+ log.info("Scrubbing {osd}".format(osd=role))
+ _, _, id_ = teuthology.split_role(role)
+ manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
+ prev_good = 0
+ gap_cnt = 0
+ loop = True
+ while loop:
+ stats = manager.get_pg_stats()
+ timez = [stat['last_scrub_stamp'] for stat in stats]
+ loop = False
+ thiscnt = 0
+ for tmval in timez:
+ pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
+ if pgtm > check_time_now:
+ thiscnt += 1
+ else:
+ loop = True
+ if thiscnt > prev_good:
+ prev_good = thiscnt
+ gap_cnt = 0
+ else:
+ gap_cnt += 1
+ if gap_cnt > retries:
+ log.info('Exiting scrub checking -- not all pgs scrubbed.')
+ return
+ if loop:
+ log.info('Still waiting for all pgs to be scrubbed.')
+ time.sleep(delays)
+
+
+@contextlib.contextmanager
+def run_daemon(ctx, config, type_):
+ """
+ Run daemons for a role type. Handle the startup and termination of a daemon.
+ On startup -- set coverage, cpu_profile, and valgrind values for all remotes.
+ On cleanup -- Stop all existing daemons of this type.
+
+ :param ctx: Context
+ :param config: Configuration
+ :param type_: Role type
+ """
+ cluster_name = config['cluster']
+ log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
+ testdir = teuthology.get_testdir(ctx)
+ daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
+
+ # check whether any daemons of this type are configured
+ if daemons is None:
+ return
+ coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+
+ daemon_signal = 'kill'
+ if config.get('coverage') or config.get('valgrind') is not None:
+ daemon_signal = 'term'
+
+ for remote, roles_for_host in daemons.remotes.iteritems():
+ is_type_ = teuthology.is_type(type_, cluster_name)
+ for role in roles_for_host:
+ if not is_type_(role):
+ continue
+ _, _, id_ = teuthology.split_role(role)
+
+ run_cmd = [
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'daemon-helper',
+ daemon_signal,
+ ]
+ run_cmd_tail = [
+ 'ceph-%s' % (type_),
+ '-f',
+ '--cluster', cluster_name,
+ '-i', id_]
+
+ if type_ in config.get('cpu_profile', []):
+ profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
+ run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
+
+ if config.get('valgrind') is not None:
+ valgrind_args = None
+ if type_ in config['valgrind']:
+ valgrind_args = config['valgrind'][type_]
+ if role in config['valgrind']:
+ valgrind_args = config['valgrind'][role]
+ run_cmd = teuthology.get_valgrind_args(testdir, role,
+ run_cmd,
+ valgrind_args)
+
+ run_cmd.extend(run_cmd_tail)
+
+ ctx.daemons.add_daemon(remote, type_, id_,
+ cluster=cluster_name,
+ args=run_cmd,
+ logger=log.getChild(role),
+ stdin=run.PIPE,
+ wait=False,
+ )
+
+ try:
+ yield
+ finally:
+ teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
+
+
+def healthy(ctx, config):
+ """
+ Wait for all OSDs to be up, and for ceph health to report HEALTH_OK.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ config = config if isinstance(config, dict) else dict()
+ cluster_name = config.get('cluster', 'ceph')
+ log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
+ firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+ (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+ teuthology.wait_until_osds_up(
+ ctx,
+ cluster=ctx.cluster,
+ remote=mon0_remote,
+ ceph_cluster=cluster_name,
+ )
+ teuthology.wait_until_healthy(
+ ctx,
+ remote=mon0_remote,
+ ceph_cluster=cluster_name,
+ )
+
+ if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
+ # Some MDSs exist, wait for them to be healthy
+ ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
+ ceph_fs.wait_for_daemons(timeout=300)
+
+
+def wait_for_osds_up(ctx, config):
+ """
+ Wait for all OSDs to come up.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ log.info('Waiting until ceph osds are all up...')
+ cluster_name = config.get('cluster', 'ceph')
+ firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+ (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+ teuthology.wait_until_osds_up(
+ ctx,
+ cluster=ctx.cluster,
+ remote=mon0_remote
+ )
+
+
+def wait_for_mon_quorum(ctx, config):
+ """
+ Check remote ceph status until all specified monitors are in quorum.
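+
+ For example (a sketch; the mon ids must match the cluster's monitors)::
+
+ tasks:
+ - ceph.wait_for_mon_quorum: [a, b, c]
+
+ or::
+
+ tasks:
+ - ceph.wait_for_mon_quorum:
+     daemons: [a, b, c]
+     cluster: backup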
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ if isinstance(config, dict):
+ mons = config['daemons']
+ cluster_name = config.get('cluster', 'ceph')
+ else:
+ assert isinstance(config, list)
+ mons = config
+ cluster_name = 'ceph'
+ firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+ (remote,) = ctx.cluster.only(firstmon).remotes.keys()
+ with contextutil.safe_while(sleep=10, tries=60,
+ action='wait for monitor quorum') as proceed:
+ while proceed():
+ r = remote.run(
+ args=[
+ 'sudo',
+ 'ceph',
+ 'quorum_status',
+ ],
+ stdout=StringIO(),
+ logger=log.getChild('quorum_status'),
+ )
+ j = json.loads(r.stdout.getvalue())
+ q = j.get('quorum_names', [])
+ log.debug('Quorum: %s', q)
+ if sorted(q) == sorted(mons):
+ break
+
+
+def created_pool(ctx, config):
+ """
+ Add new pools to the dictionary of pools that the ceph-manager
+ knows about.
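+
+ For example (a sketch; the pool names are whatever an earlier task
+ created)::
+
+ tasks:
+ - ceph.created_pool: [mypool, myotherpool]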
+ """
+ for new_pool in config:
+ if new_pool not in ctx.managers['ceph'].pools:
+ ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
+ new_pool, 'pg_num')
+
+
+@contextlib.contextmanager
+def restart(ctx, config):
+ """
+ restart ceph daemons
+
+ For example::
+ tasks:
+ - ceph.restart: [all]
+
+ For example::
+ tasks:
+ - ceph.restart: [osd.0, mon.1, mds.*]
+
+ or::
+
+ tasks:
+ - ceph.restart:
+ daemons: [osd.0, mon.1]
+ wait-for-healthy: false
+ wait-for-osds-up: true
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ if config is None:
+ config = {}
+ elif isinstance(config, list):
+ config = {'daemons': config}
+
+ daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
+ clusters = set()
+ for role in daemons:
+ cluster, type_, id_ = teuthology.split_role(role)
+ ctx.daemons.get_daemon(type_, id_, cluster).restart()
+ clusters.add(cluster)
+
+ if config.get('wait-for-healthy', True):
+ for cluster in clusters:
+ healthy(ctx=ctx, config=dict(cluster=cluster))
+ if config.get('wait-for-osds-up', False):
+ for cluster in clusters:
+ wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
+ manager = ctx.managers['ceph']
+ for dmon in daemons:
+ if '.' in dmon:
+ dm_parts = dmon.split('.')
+ if dm_parts[1].isdigit():
+ if dm_parts[0] == 'osd':
+ manager.mark_down_osd(int(dm_parts[1]))
+ yield
+
+
+@contextlib.contextmanager
+def stop(ctx, config):
+ """
+ Stop ceph daemons
+
+ For example::
+ tasks:
+ - ceph.stop: [mds.*]
+
+ tasks:
+ - ceph.stop: [osd.0, osd.2]
+
+ tasks:
+ - ceph.stop:
+ daemons: [osd.0, osd.2]
+
+ """
+ if config is None:
+ config = {}
+ elif isinstance(config, list):
+ config = {'daemons': config}
+
+ daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
+ for role in daemons:
+ cluster, type_, id_ = teuthology.split_role(role)
+ ctx.daemons.get_daemon(type_, id_, cluster).stop()
+
+ yield
+
+
+@contextlib.contextmanager
+def wait_for_failure(ctx, config):
+ """
+ Wait for a failure of a ceph daemon
+
+ For example::
+ tasks:
+ - ceph.wait_for_failure: [mds.*]
+
+ tasks:
+ - ceph.wait_for_failure: [osd.0, osd.2]
+
+ tasks:
+ - ceph.wait_for_failure:
+ daemons: [osd.0, osd.2]
+
+ """
+ if config is None:
+ config = {}
+ elif isinstance(config, list):
+ config = {'daemons': config}
+
+ daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
+ for role in daemons:
+ cluster, type_, id_ = teuthology.split_role(role)
+ try:
+ ctx.daemons.get_daemon(type_, id_, cluster).wait()
+ except Exception:
+ log.info('Saw expected daemon failure. Continuing.')
+ else:
+ raise RuntimeError('daemon %s did not fail' % role)
+
+ yield
+
+
+def validate_config(ctx, config):
+ """
+ Perform some simple validation on task configuration.
+ Raises exceptions.ConfigError if an error is found.
+ """
+ # check for osds from multiple clusters on the same host
+ for remote, roles_for_host in ctx.cluster.remotes.items():
+ last_cluster = None
+ last_role = None
+ for role in roles_for_host:
+ role_cluster, role_type, _ = teuthology.split_role(role)
+ if role_type != 'osd':
+ continue
+ if last_cluster and last_cluster != role_cluster:
+ msg = "Host should not have osds (%s and %s) from multiple clusters" % (
+ last_role, role)
+ raise exceptions.ConfigError(msg)
+ last_cluster = role_cluster
+ last_role = role
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Set up and tear down a Ceph cluster.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - interactive:
+
+ You can also specify what branch to run::
+
+ tasks:
+ - ceph:
+ branch: foo
+
+ Or a tag::
+
+ tasks:
+ - ceph:
+ tag: v0.42.13
+
+ Or a sha1::
+
+ tasks:
+ - ceph:
+ sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
+
+ Or a local source dir::
+
+ tasks:
+ - ceph:
+ path: /home/sage/ceph
+
+ To capture code coverage data, use::
+
+ tasks:
+ - ceph:
+ coverage: true
+
+ To use btrfs, ext4, or xfs on the target's scratch disks, use::
+
+ tasks:
+ - ceph:
+ fs: xfs
+ mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
+ mount_options: [nobarrier, inode64]
+
+ Note, this will cause the task to check the /scratch_devs file on each node
+ for available devices. If no such file is found, /dev/sdb will be used.
+
+ To run some daemons under valgrind, include their names
+ and the tool/args to use in a valgrind section::
+
+ tasks:
+ - ceph:
+ valgrind:
+ mds.1: --tool=memcheck
+ osd.1: [--tool=memcheck, --leak-check=no]
+
+ Those nodes which are using memcheck or valgrind will get
+ checked for bad results.
+
+ To adjust or modify config options, use::
+
+ tasks:
+ - ceph:
+ conf:
+ section:
+ key: value
+
+ For example::
+
+ tasks:
+ - ceph:
+ conf:
+ mds.0:
+ some option: value
+ other key: other value
+ client.0:
+ debug client: 10
+ debug ms: 1
+
+ By default, the cluster log is checked for errors and warnings,
+ and the run marked failed if any appear. You can ignore log
+ entries by giving a list of egrep-compatible regexes, for example::
+
+ tasks:
+ - ceph:
+ log-whitelist: ['foo.*bar', 'bad message']
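+
+ The CRUSH tunables profile applied when the cluster comes up can also
+ be selected; the value is passed straight through to
+ 'ceph osd crush tunables', so it must be a profile that command
+ accepts (a sketch)::
+
+ tasks:
+ - ceph:
+     crush_tunables: optimal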
+
+ To run multiple ceph clusters, use multiple ceph tasks, and roles
+ with a cluster name prefix, e.g. cluster1.client.0. Roles with no
+ cluster use the default cluster name, 'ceph'. OSDs from separate
+ clusters must be on separate hosts. Clients and non-osd daemons
+ from multiple clusters may be colocated. For each cluster, add an
+ instance of the ceph task with the cluster name specified, e.g.::
+
+ roles:
+ - [mon.a, osd.0, osd.1]
+ - [backup.mon.a, backup.osd.0, backup.osd.1]
+ - [client.0, backup.client.0]
+ tasks:
+ - ceph:
+ cluster: ceph
+ - ceph:
+ cluster: backup
+
+ :param ctx: Context
+ :param config: Configuration
+
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ "task ceph only supports a dictionary for configuration"
+
+ overrides = ctx.config.get('overrides', {})
+ teuthology.deep_merge(config, overrides.get('ceph', {}))
+
+ first_ceph_cluster = False
+ if not hasattr(ctx, 'daemons'):
+ first_ceph_cluster = True
+ ctx.daemons = DaemonGroup()
+
+ testdir = teuthology.get_testdir(ctx)
+ if config.get('coverage'):
+ coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+ log.info('Creating coverage directory...')
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'install', '-d', '-m0755', '--',
+ coverage_dir,
+ ],
+ wait=False,
+ )
+ )
+
+ if 'cluster' not in config:
+ config['cluster'] = 'ceph'
+
+ validate_config(ctx, config)
+
+ subtasks = []
+ if first_ceph_cluster:
+ # these tasks handle general log setup and parsing on all hosts,
+ # so they should only be run once
+ subtasks = [
+ lambda: ceph_log(ctx=ctx, config=None),
+ lambda: valgrind_post(ctx=ctx, config=config),
+ ]
+
+ subtasks += [
+ lambda: cluster(ctx=ctx, config=dict(
+ conf=config.get('conf', {}),
+ fs=config.get('fs', 'xfs'),
+ mkfs_options=config.get('mkfs_options', None),
+ mount_options=config.get('mount_options', None),
+ block_journal=config.get('block_journal', None),
+ tmpfs_journal=config.get('tmpfs_journal', None),
+ log_whitelist=config.get('log-whitelist', []),
+ cpu_profile=set(config.get('cpu_profile', []),),
+ cluster=config['cluster'],
+ )),
+ lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
+ lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
+ lambda: crush_setup(ctx=ctx, config=config),
+ lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
+ lambda: cephfs_setup(ctx=ctx, config=config),
+ lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
+ ]
+
+ with contextutil.nested(*subtasks):
+ first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ if not hasattr(ctx, 'managers'):
+ ctx.managers = {}
+ ctx.managers[config['cluster']] = CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager.' + config['cluster']),
+ cluster=config['cluster'],
+ )
+
+ try:
+ if config.get('wait-for-healthy', True):
+ healthy(ctx=ctx, config=dict(cluster=config['cluster']))
+
+ yield
+ finally:
+ if config.get('wait-for-scrub', True):
+ osd_scrub_pgs(ctx, config)
--- /dev/null
+"""
+Set up client keyring
+"""
+import logging
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def create_keyring(ctx, cluster_name):
+ """
+ Set up client keyrings on the remote client nodes.
+ """
+ log.info('Setting up client nodes...')
+ clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
+ testdir = teuthology.get_testdir(ctx)
+ coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+ for remote, roles_for_host in clients.remotes.iteritems():
+ for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
+ cluster_name):
+ name = teuthology.ceph_role(role)
+ client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name, name)
+ remote.run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-authtool',
+ '--create-keyring',
+ '--gen-key',
+ # TODO this --name= is not really obeyed, all unknown "types" are munged to "client"
+ '--name={name}'.format(name=name),
+ client_keyring,
+ run.Raw('&&'),
+ 'sudo',
+ 'chmod',
+ '0644',
+ client_keyring,
+ ],
+ )
--- /dev/null
+"""
+Execute ceph-deploy as a task
+"""
+from cStringIO import StringIO
+
+import contextlib
+import os
+import time
+import logging
+import traceback
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.task import install as install_fn
+from teuthology.orchestra import run
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download_ceph_deploy(ctx, config):
+ """
+ Downloads ceph-deploy from the ceph.com git mirror and (by default)
+ switches to the master branch. If `ceph-deploy-branch` is specified, that
+ branch is used instead. The `bootstrap` script is then run, passing the
+ value of `python_version` as its argument if one is specified.
+ """
+ ceph_admin = ctx.cluster.only(teuthology.get_first_mon(ctx, config))
+
+ try:
+ py_ver = str(config['python_version'])
+ except KeyError:
+ pass
+ else:
+ supported_versions = ['2', '3']
+ if py_ver not in supported_versions:
+ raise ValueError("python_version must be: {}, not {}".format(
+ ' or '.join(supported_versions), py_ver
+ ))
+
+ log.info("Installing Python")
+ for admin in ceph_admin.remotes:
+ system_type = teuthology.get_system_type(admin)
+
+ if system_type == 'rpm':
+ package = 'python34' if py_ver == '3' else 'python'
+ ctx.cluster.run(args=[
+ 'sudo', 'yum', '-y', 'install',
+ package, 'python-virtualenv'
+ ])
+ else:
+ package = 'python3' if py_ver == '3' else 'python'
+ ctx.cluster.run(args=[
+ 'sudo', 'apt-get', '-y', '--force-yes', 'install',
+ package, 'python-virtualenv'
+ ])
+
+ log.info('Downloading ceph-deploy...')
+ testdir = teuthology.get_testdir(ctx)
+ ceph_deploy_branch = config.get('ceph-deploy-branch', 'master')
+
+ ceph_admin.run(
+ args=[
+ 'git', 'clone', '-b', ceph_deploy_branch,
+ teuth_config.ceph_git_base_url + 'ceph-deploy.git',
+ '{tdir}/ceph-deploy'.format(tdir=testdir),
+ ],
+ )
+ args = [
+ 'cd',
+ '{tdir}/ceph-deploy'.format(tdir=testdir),
+ run.Raw('&&'),
+ './bootstrap',
+ ]
+ try:
+ args.append(str(config['python_version']))
+ except KeyError:
+ pass
+ ceph_admin.run(args=args)
+
+ try:
+ yield
+ finally:
+ log.info('Removing ceph-deploy ...')
+ ceph_admin.run(
+ args=[
+ 'rm',
+ '-rf',
+ '{tdir}/ceph-deploy'.format(tdir=testdir),
+ ],
+ )
+
+
+def is_healthy(ctx, config):
+ """Wait until a Ceph cluster is healthy."""
+ testdir = teuthology.get_testdir(ctx)
+ ceph_admin = teuthology.get_first_mon(ctx, config)
+ (remote,) = ctx.cluster.only(ceph_admin).remotes.keys()
+ max_tries = 90 # 90 tries * 10 secs --> 15 minutes
+ tries = 0
+ while True:
+ tries += 1
+ if tries >= max_tries:
+ msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes"
+ remote.run(
+ args=[
+ 'cd',
+ '{tdir}'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'sudo', 'ceph',
+ 'report',
+ ],
+ )
+ raise RuntimeError(msg)
+
+ r = remote.run(
+ args=[
+ 'cd',
+ '{tdir}'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'sudo', 'ceph',
+ 'health',
+ ],
+ stdout=StringIO(),
+ logger=log.getChild('health'),
+ )
+ out = r.stdout.getvalue()
+ log.info('Ceph health: %s', out.rstrip('\n'))
+ if out.split(None, 1)[0] == 'HEALTH_OK':
+ break
+ time.sleep(10)
+
+
+def get_nodes_using_role(ctx, target_role):
+ """
+ Extract the names of nodes that match a given role from a cluster, and modify the
+ cluster's service IDs to match the resulting node-based naming scheme that ceph-deploy
+ uses, such that if "mon.a" is on host "foo23", it'll be renamed to "mon.foo23".
+ """
+
+ # Nodes containing a service of the specified role
+ nodes_of_interest = []
+
+ # Prepare a modified version of cluster.remotes with ceph-deploy-ized names
+ modified_remotes = {}
+
+ for _remote, roles_for_host in ctx.cluster.remotes.iteritems():
+ modified_remotes[_remote] = []
+ for svc_id in roles_for_host:
+ if svc_id.startswith("{0}.".format(target_role)):
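+ # remote names typically look like 'user@host.fqdn'; derive both the
+ # fqdn and the short node name that ceph-deploy uses for renamed roles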
+ fqdn = str(_remote).split('@')[-1]
+ nodename = str(str(_remote).split('.')[0]).split('@')[1]
+ if target_role == 'mon':
+ nodes_of_interest.append(fqdn)
+ else:
+ nodes_of_interest.append(nodename)
+
+ modified_remotes[_remote].append(
+ "{0}.{1}".format(target_role, nodename))
+ else:
+ modified_remotes[_remote].append(svc_id)
+
+ ctx.cluster.remotes = modified_remotes
+
+ return nodes_of_interest
+
+
+def get_dev_for_osd(ctx, config):
+ """Get a list of all osd device names."""
+ osd_devs = []
+ for remote, roles_for_host in ctx.cluster.remotes.iteritems():
+ host = remote.name.split('@')[-1]
+ shortname = host.split('.')[0]
+ devs = teuthology.get_scratch_devices(remote)
+ num_osd_per_host = list(
+ teuthology.roles_of_type(
+ roles_for_host, 'osd'))
+ num_osds = len(num_osd_per_host)
+ if config.get('separate_journal_disk') is not None:
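+ # a separate journal disk means each osd consumes a (data, journal)
+ # pair of scratch devices, so twice as many devices are required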
+ num_devs_reqd = 2 * num_osds
+ assert num_devs_reqd <= len(
+ devs), 'fewer data and journal disks than required ' + shortname
+ for dindex in range(0, num_devs_reqd, 2):
+ jd_index = dindex + 1
+ dev_short = devs[dindex].split('/')[-1]
+ jdev_short = devs[jd_index].split('/')[-1]
+ osd_devs.append((shortname, dev_short, jdev_short))
+ else:
+ assert num_osds <= len(devs), 'fewer disks than osds ' + shortname
+ for dev in devs[:num_osds]:
+ dev_short = dev.split('/')[-1]
+ osd_devs.append((shortname, dev_short))
+ return osd_devs
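+
+# Illustrative sketch only (host and device names are hypothetical): with two
+# osd roles on 'vpm001' and separate_journal_disk set, get_dev_for_osd returns
+# entries like [('vpm001', 'sdb', 'sdc'), ('vpm001', 'sdd', 'sde')]; without
+# it, each osd gets a single data device: [('vpm001', 'sdb'), ('vpm001', 'sdc')]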
+
+
+def get_all_nodes(ctx, config):
+ """Return a string of node names separated by blanks"""
+ nodelist = []
+ for t, k in ctx.config['targets'].iteritems():
+ host = t.split('@')[-1]
+ simple_host = host.split('.')[0]
+ nodelist.append(simple_host)
+ nodelist = " ".join(nodelist)
+ return nodelist
+
+
+@contextlib.contextmanager
+def build_ceph_cluster(ctx, config):
+ """Build a ceph cluster"""
+
+ # Expect to find ceph_admin on the first mon by ID, same place that the download task
+ # puts it. Remember this here, because subsequently IDs will change from those in
+ # the test config to those that ceph-deploy invents.
+ (ceph_admin,) = ctx.cluster.only(
+ teuthology.get_first_mon(ctx, config)).remotes.iterkeys()
+
+ def execute_ceph_deploy(cmd):
+ """Remotely execute a ceph_deploy command"""
+ return ceph_admin.run(
+ args=[
+ 'cd',
+ '{tdir}/ceph-deploy'.format(tdir=testdir),
+ run.Raw('&&'),
+ run.Raw(cmd),
+ ],
+ check_status=False,
+ ).exitstatus
+
+ try:
+ log.info('Building ceph cluster using ceph-deploy...')
+ testdir = teuthology.get_testdir(ctx)
+ ceph_branch = None
+ if config.get('branch') is not None:
+ cbranch = config.get('branch')
+ for var, val in cbranch.iteritems():
+ ceph_branch = '--{var}={val}'.format(var=var, val=val)
+ all_nodes = get_all_nodes(ctx, config)
+ mds_nodes = get_nodes_using_role(ctx, 'mds')
+ mds_nodes = " ".join(mds_nodes)
+ mon_node = get_nodes_using_role(ctx, 'mon')
+ mon_nodes = " ".join(mon_node)
+ new_mon = './ceph-deploy new' + " " + mon_nodes
+ mon_hostname = mon_nodes.split(' ')[0]
+ mon_hostname = str(mon_hostname)
+ gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
+ deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
+ no_of_osds = 0
+
+ if not mon_nodes:
+ raise RuntimeError("no monitor nodes in the config file")
+
+ estatus_new = execute_ceph_deploy(new_mon)
+ if estatus_new != 0:
+ raise RuntimeError("ceph-deploy: new command failed")
+
+ log.info('adding config inputs...')
+ testdir = teuthology.get_testdir(ctx)
+ conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)
+
+ if config.get('conf') is not None:
+ confp = config.get('conf')
+ for section, keys in confp.iteritems():
+ lines = '[{section}]\n'.format(section=section)
+ teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
+ sudo=True)
+ for key, value in keys.iteritems():
+ log.info("[%s] %s = %s" % (section, key, value))
+ lines = '{key} = {value}\n'.format(key=key, value=value)
+ teuthology.append_lines_to_file(
+ ceph_admin, conf_path, lines, sudo=True)
+
+ # install ceph
+ ceph_sha = ctx.config['sha1']
+ devcommit = '--dev-commit={sha}'.format(sha=ceph_sha)
+ if ceph_branch:
+ option = ceph_branch
+ else:
+ option = devcommit
+ install_nodes = './ceph-deploy install ' + option + " " + all_nodes
+ estatus_install = execute_ceph_deploy(install_nodes)
+ if estatus_install != 0:
+ raise RuntimeError("ceph-deploy: Failed to install ceph")
+ # install ceph-test package too
+ install_nodes2 = './ceph-deploy install --tests ' + option + \
+ " " + all_nodes
+ estatus_install = execute_ceph_deploy(install_nodes2)
+ if estatus_install != 0:
+ raise RuntimeError("ceph-deploy: Failed to install ceph-test")
+
+ mon_create_nodes = './ceph-deploy mon create-initial'
+ # If the following fails, it is OK; it might just be that the monitors
+ # are taking more than a minute per monitor to form quorum, so let's
+ # try the next block, which will wait up to 15 minutes to gatherkeys.
+ execute_ceph_deploy(mon_create_nodes)
+
+ estatus_gather = execute_ceph_deploy(gather_keys)
+ max_gather_tries = 90
+ gather_tries = 0
+ while (estatus_gather != 0):
+ gather_tries += 1
+ if gather_tries >= max_gather_tries:
+ msg = 'ceph-deploy was not able to gatherkeys after 15 minutes'
+ raise RuntimeError(msg)
+ estatus_gather = execute_ceph_deploy(gather_keys)
+ time.sleep(10)
+
+ if mds_nodes:
+ estatus_mds = execute_ceph_deploy(deploy_mds)
+ if estatus_mds != 0:
+ raise RuntimeError("ceph-deploy: Failed to deploy mds")
+
+ if config.get('test_mon_destroy') is not None:
+ for d in range(1, len(mon_node)):
+ mon_destroy_nodes = './ceph-deploy mon destroy' + \
+ " " + mon_node[d]
+ estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
+ if estatus_mon_d != 0:
+ raise RuntimeError("ceph-deploy: Failed to delete monitor")
+
+ node_dev_list = get_dev_for_osd(ctx, config)
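+ # each entry is (node, data_dev) or (node, data_dev, journal_dev); zap
+ # every listed device, then create the osd, adding --dmcrypt if requested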
+ for d in node_dev_list:
+ node = d[0]
+ for disk in d[1:]:
+ zap = './ceph-deploy disk zap ' + node + ':' + disk
+ estatus = execute_ceph_deploy(zap)
+ if estatus != 0:
+ raise RuntimeError("ceph-deploy: Failed to zap osds")
+ osd_create_cmd = './ceph-deploy osd create '
+ if config.get('dmcrypt') is not None:
+ osd_create_cmd += '--dmcrypt '
+ osd_create_cmd += ":".join(d)
+ estatus_osd = execute_ceph_deploy(osd_create_cmd)
+ if estatus_osd == 0:
+ log.info('successfully created osd')
+ no_of_osds += 1
+ else:
+ raise RuntimeError("ceph-deploy: Failed to create osds")
+
+ if config.get('wait-for-healthy', True) and no_of_osds >= 2:
+ is_healthy(ctx=ctx, config=None)
+
+ log.info('Setting up client nodes...')
+ conf_path = '/etc/ceph/ceph.conf'
+ admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
+ conf_data = teuthology.get_file(
+ remote=mon0_remote,
+ path=conf_path,
+ sudo=True,
+ )
+ admin_keyring = teuthology.get_file(
+ remote=mon0_remote,
+ path=admin_keyring_path,
+ sudo=True,
+ )
+
+ clients = ctx.cluster.only(teuthology.is_type('client'))
+ for remot, roles_for_host in clients.remotes.iteritems():
+ for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
+ client_keyring = \
+ '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
+ mon0_remote.run(
+ args=[
+ 'cd',
+ '{tdir}'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'sudo', 'bash', '-c',
+ run.Raw('"'), 'ceph',
+ 'auth',
+ 'get-or-create',
+ 'client.{id}'.format(id=id_),
+ 'mds', 'allow',
+ 'mon', 'allow *',
+ 'osd', 'allow *',
+ run.Raw('>'),
+ client_keyring,
+ run.Raw('"'),
+ ],
+ )
+ key_data = teuthology.get_file(
+ remote=mon0_remote,
+ path=client_keyring,
+ sudo=True,
+ )
+ teuthology.sudo_write_file(
+ remote=remot,
+ path=client_keyring,
+ data=key_data,
+ perms='0644'
+ )
+ teuthology.sudo_write_file(
+ remote=remot,
+ path=admin_keyring_path,
+ data=admin_keyring,
+ perms='0644'
+ )
+ teuthology.sudo_write_file(
+ remote=remot,
+ path=conf_path,
+ data=conf_data,
+ perms='0644'
+ )
+
+ if mds_nodes:
+ log.info('Configuring CephFS...')
+ ceph_fs = Filesystem(ctx)
+ if not ceph_fs.legacy_configured():
+ ceph_fs.create()
+ elif not config.get('only_mon'):
+ raise RuntimeError(
+ "The cluster is NOT operational due to insufficient OSDs")
+ yield
+
+ except Exception:
+ log.info(
+ "Error encountered, logging exception before tearing down ceph-deploy")
+ log.info(traceback.format_exc())
+ raise
+ finally:
+ if config.get('keep_running'):
+ return
+ log.info('Stopping ceph...')
+ ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
+ 'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
+ 'sudo', 'systemctl', 'stop', 'ceph.target'])
+
+ # Are you really not running anymore?
+ # try first with the init tooling
+ # ignoring the status so this becomes informational only
+ ctx.cluster.run(
+ args=[
+ 'sudo', 'status', 'ceph-all', run.Raw('||'),
+ 'sudo', 'service', 'ceph', 'status', run.Raw('||'),
+ 'sudo', 'systemctl', 'status', 'ceph.target'],
+ check_status=False)
+
+ # and now just check for the processes themselves, as if upstart/sysvinit
+ # is lying to us. Ignore errors if the grep fails
+ ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
+ 'grep', '-v', 'grep', run.Raw('|'),
+ 'grep', 'ceph'], check_status=False)
+
+ if ctx.archive is not None:
+ # archive mon data, too
+ log.info('Archiving mon data...')
+ path = os.path.join(ctx.archive, 'data')
+ os.makedirs(path)
+ mons = ctx.cluster.only(teuthology.is_type('mon'))
+ for remote, roles in mons.remotes.iteritems():
+ for role in roles:
+ if role.startswith('mon.'):
+ teuthology.pull_directory_tarball(
+ remote,
+ '/var/lib/ceph/mon',
+ path + '/' + role + '.tgz')
+
+ log.info('Compressing logs...')
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'sudo',
+ 'find',
+ '/var/log/ceph',
+ '-name',
+ '*.log',
+ '-print0',
+ run.Raw('|'),
+ 'sudo',
+ 'xargs',
+ '-0',
+ '--no-run-if-empty',
+ '--',
+ 'gzip',
+ '--',
+ ],
+ wait=False,
+ ),
+ )
+
+ log.info('Archiving logs...')
+ path = os.path.join(ctx.archive, 'remote')
+ os.makedirs(path)
+ for remote in ctx.cluster.remotes.iterkeys():
+ sub = os.path.join(path, remote.shortname)
+ os.makedirs(sub)
+ teuthology.pull_directory(remote, '/var/log/ceph',
+ os.path.join(sub, 'log'))
+
+ # Prevent these from being undefined if the try block fails
+ all_nodes = get_all_nodes(ctx, config)
+ purge_nodes = './ceph-deploy purge' + " " + all_nodes
+ purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes
+
+ log.info('Purging package...')
+ execute_ceph_deploy(purge_nodes)
+ log.info('Purging data...')
+ execute_ceph_deploy(purgedata_nodes)
+
+
+@contextlib.contextmanager
+def cli_test(ctx, config):
+ """
+ Exercise the most commonly used ceph-deploy CLI commands, make sure
+ they all work, and start the daemons via the init system.
+
+ """
+ log.info('Ceph-deploy Test')
+ if config is None:
+ config = {}
+ test_branch = ''
+ conf_dir = teuthology.get_testdir(ctx) + "/cdtest"
+
+ def execute_cdeploy(admin, cmd, path):
+ """Execute ceph-deploy commands """
+ """Either use git path or repo path """
+ args = ['cd', conf_dir, run.Raw(';')]
+ if path:
+ args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
+ else:
+ args.append('ceph-deploy')
+ args.append(run.Raw(cmd))
+ ec = admin.run(args=args, check_status=False).exitstatus
+ if ec != 0:
+ raise RuntimeError(
+ "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec))
+
+ if config.get('rhbuild'):
+ path = None
+ else:
+ path = teuthology.get_testdir(ctx)
+ # test on the branch from the config, e.g. wip-*, master or next;
+ # packages should exist for all distros for wip* branches
+ if ctx.config.get('branch'):
+ branch = ctx.config.get('branch')
+ test_branch = ' --dev={branch} '.format(branch=branch)
+ mons = ctx.cluster.only(teuthology.is_type('mon'))
+ for node, role in mons.remotes.iteritems():
+ admin = node
+ admin.run(args=['mkdir', conf_dir], check_status=False)
+ nodename = admin.shortname
+ system_type = teuthology.get_system_type(admin)
+ if config.get('rhbuild'):
+ admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y'])
+ log.info('system type is %s', system_type)
+ osds = ctx.cluster.only(teuthology.is_type('osd'))
+
+ for remote, roles in osds.remotes.iteritems():
+ devs = teuthology.get_scratch_devices(remote)
+ log.info("roles %s", roles)
+ if (len(devs) < 3):
+ log.error(
+ 'Test needs minimum of 3 devices, only found %s',
+ str(devs))
+ raise RuntimeError("Needs minimum of 3 devices ")
+
+ conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir)
+ new_cmd = 'new ' + nodename
+ execute_cdeploy(admin, new_cmd, path)
+ if config.get('conf') is not None:
+ confp = config.get('conf')
+ for section, keys in confp.iteritems():
+ lines = '[{section}]\n'.format(section=section)
+ teuthology.append_lines_to_file(admin, conf_path, lines,
+ sudo=True)
+ for key, value in keys.iteritems():
+ log.info("[%s] %s = %s" % (section, key, value))
+ lines = '{key} = {value}\n'.format(key=key, value=value)
+ teuthology.append_lines_to_file(admin, conf_path, lines,
+ sudo=True)
+ new_mon_install = 'install {branch} --mon '.format(
+ branch=test_branch) + nodename
+ new_osd_install = 'install {branch} --osd '.format(
+ branch=test_branch) + nodename
+ new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
+ create_initial = 'mon create-initial '
+ execute_cdeploy(admin, new_mon_install, path)
+ execute_cdeploy(admin, new_osd_install, path)
+ execute_cdeploy(admin, new_admin, path)
+ execute_cdeploy(admin, create_initial, path)
+
+ for i in range(3):
+ zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
+ prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i])
+ execute_cdeploy(admin, zap_disk, path)
+ execute_cdeploy(admin, prepare, path)
+
+ log.info("list files for debugging purpose to check file permissions")
+ admin.run(args=['ls', run.Raw('-lt'), conf_dir])
+ remote.run(args=['sudo', 'ceph', '-s'], check_status=False)
+ r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
+ out = r.stdout.getvalue()
+ log.info('Ceph health: %s', out.rstrip('\n'))
+ log.info("Waiting for cluster to become healthy")
+ with contextutil.safe_while(sleep=10, tries=6,
+ action='check health') as proceed:
+ while proceed():
+ r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
+ out = r.stdout.getvalue()
+ if (out.split(None,1)[0] == 'HEALTH_OK'):
+ break
+ rgw_install = 'install {branch} --rgw {node}'.format(
+ branch=test_branch,
+ node=nodename,
+ )
+ rgw_create = 'rgw create ' + nodename
+ execute_cdeploy(admin, rgw_install, path)
+ execute_cdeploy(admin, rgw_create, path)
+ log.info('All ceph-deploy cli tests passed')
+ try:
+ yield
+ finally:
+ log.info("cleaning up")
+ ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
+ 'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
+ 'sudo', 'systemctl', 'stop', 'ceph.target'],
+ check_status=False)
+ time.sleep(4)
+ for i in range(3):
+ umount_dev = "{d}1".format(d=devs[i])
+ r = remote.run(args=['sudo', 'umount', run.Raw(umount_dev)])
+ cmd = 'purge ' + nodename
+ execute_cdeploy(admin, cmd, path)
+ cmd = 'purgedata ' + nodename
+ execute_cdeploy(admin, cmd, path)
+ log.info("Removing temporary dir")
+ admin.run(
+ args=[
+ 'rm',
+ run.Raw('-rf'),
+ run.Raw(conf_dir)],
+ check_status=False)
+ if config.get('rhbuild'):
+ admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])
+
+
+@contextlib.contextmanager
+def single_node_test(ctx, config):
+ """
+ - ceph-deploy.single_node_test: null
+
+ #rhbuild testing
+ - ceph-deploy.single_node_test:
+ rhbuild: 1.2.3
+
+ """
+ log.info("Testing ceph-deploy on single node")
+ if config is None:
+ config = {}
+ overrides = ctx.config.get('overrides', {})
+ teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))
+
+ if config.get('rhbuild'):
+ log.info("RH Build, Skip Download")
+ with contextutil.nested(
+ lambda: cli_test(ctx=ctx, config=config),
+ ):
+ yield
+ else:
+ with contextutil.nested(
+ lambda: install_fn.ship_utilities(ctx=ctx, config=None),
+ lambda: download_ceph_deploy(ctx=ctx, config=config),
+ lambda: cli_test(ctx=ctx, config=config),
+ ):
+ yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Set up and tear down a Ceph cluster.
+
+ For example::
+
+ tasks:
+ - install:
+ extras: yes
+ - ssh_keys:
+ - ceph-deploy:
+ branch:
+ stable: bobtail
+ mon_initial_members: 1
+ only_mon: true
+ keep_running: true
+
+ tasks:
+ - install:
+ extras: yes
+ - ssh_keys:
+ - ceph-deploy:
+ branch:
+ dev: master
+ conf:
+ mon:
+ debug mon: 20
+
+ tasks:
+ - install:
+ extras: yes
+ - ssh_keys:
+ - ceph-deploy:
+ branch:
+ testing:
+ dmcrypt: yes
+ separate_journal_disk: yes
+
+ """
+ if config is None:
+ config = {}
+
+ assert isinstance(config, dict), \
+ "task ceph-deploy only supports a dictionary for configuration"
+
+ overrides = ctx.config.get('overrides', {})
+ teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))
+
+ if config.get('branch') is not None:
+ assert isinstance(
+ config['branch'], dict), 'branch must be a dictionary'
+
+ log.info('task ceph-deploy with config ' + str(config))
+
+ with contextutil.nested(
+ lambda: install_fn.ship_utilities(ctx=ctx, config=None),
+ lambda: download_ceph_deploy(ctx=ctx, config=config),
+ lambda: build_ceph_cluster(ctx=ctx, config=config),
+ ):
+ yield
--- /dev/null
+"""
+Ceph FUSE client task
+"""
+
+import contextlib
+import logging
+
+from teuthology import misc as teuthology
+from cephfs.fuse_mount import FuseMount
+
+log = logging.getLogger(__name__)
+
+
+def get_client_configs(ctx, config):
+ """
+ Build a map from client name to that client's configuration by combining
+ the configuration of the current task with any global overrides.
+
+ :param ctx: Context instance
+ :param config: configuration for this task
+ :return: dict of client name to config or to None
+ """
+ if config is None:
+ config = dict(('client.{id}'.format(id=id_), None)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client'))
+ elif isinstance(config, list):
+ config = dict((name, None) for name in config)
+
+ overrides = ctx.config.get('overrides', {})
+ teuthology.deep_merge(config, overrides.get('ceph-fuse', {}))
+
+ return config
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Mount/unmount a ``ceph-fuse`` client.
+
+ The config is optional and defaults to mounting on all clients. If
+ a config is given, it is expected to be a list of clients to do
+ this operation on. This lets you e.g. set up one client with
+ ``ceph-fuse`` and another with ``kclient``.
+
+ Example that mounts all clients::
+
+ tasks:
+ - ceph:
+ - ceph-fuse:
+ - interactive:
+
+ Example that uses both ``kclient`` and ``ceph-fuse``::
+
+ tasks:
+ - ceph:
+ - ceph-fuse: [client.0]
+ - kclient: [client.1]
+ - interactive:
+
+ Example that enables valgrind::
+
+ tasks:
+ - ceph:
+ - ceph-fuse:
+ client.0:
+ valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
+ - interactive:
+
+ Example that stops an already-mounted client:
+
+ ::
+
+ tasks:
+ - ceph:
+ - ceph-fuse: [client.0]
+ - ... do something that requires the FS mounted ...
+ - ceph-fuse:
+ client.0:
+ mounted: false
+ - ... do something that requires the FS unmounted ...
+
+ Example that adds more generous wait time for mount (for virtual machines)::
+
+ tasks:
+ - ceph:
+ - ceph-fuse:
+ client.0:
+ mount_wait: 60 # default is 0, do not wait before checking /sys/
+ mount_timeout: 120 # default is 30, give up if /sys/ is not populated
+ - interactive:
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ log.info('Mounting ceph-fuse clients...')
+
+ testdir = teuthology.get_testdir(ctx)
+ config = get_client_configs(ctx, config)
+
+ # List clients we will configure mounts for, default is all clients
+ clients = list(teuthology.get_clients(ctx=ctx, roles=filter(lambda x: 'client.' in x, config.keys())))
+
+ all_mounts = getattr(ctx, 'mounts', {})
+ mounted_by_me = {}
+
+ # Construct any new FuseMount instances
+ for id_, remote in clients:
+ client_config = config.get("client.%s" % id_)
+ if client_config is None:
+ client_config = {}
+
+ if id_ not in all_mounts:
+ fuse_mount = FuseMount(client_config, testdir, id_, remote)
+ all_mounts[id_] = fuse_mount
+ else:
+ # Catch bad configs where someone has e.g. tried to use ceph-fuse and kcephfs for the same client
+ assert isinstance(all_mounts[id_], FuseMount)
+
+ if not config.get("disabled", False) and client_config.get('mounted', True):
+ mounted_by_me[id_] = all_mounts[id_]
+
+ ctx.mounts = all_mounts
+
+ # Mount any clients we have been asked to (default to mount all)
+ for mount in mounted_by_me.values():
+ mount.mount()
+
+ for mount in mounted_by_me.values():
+ mount.wait_until_mounted()
+
+ # Umount any pre-existing clients that we have not been asked to mount
+ for client_id in set(all_mounts.keys()) - set(mounted_by_me.keys()):
+ mount = all_mounts[client_id]
+ if mount.is_mounted():
+ mount.umount_wait()
+
+ try:
+ yield all_mounts
+ finally:
+ log.info('Unmounting ceph-fuse clients...')
+
+ for mount in mounted_by_me.values():
+ # Conditional because an inner context might have umounted it
+ if mount.is_mounted():
+ mount.umount_wait()
--- /dev/null
+"""
+ceph manager -- Thrasher and CephManager objects
+"""
+from cStringIO import StringIO
+from functools import wraps
+import contextlib
+import random
+import signal
+import time
+import gevent
+import base64
+import json
+import logging
+import threading
+import traceback
+import os
+from teuthology import misc as teuthology
+from tasks.scrub import Scrubber
+from util.rados import cmd_erasure_code_profile
+from util import get_remote
+from teuthology.contextutil import safe_while
+from teuthology.orchestra.remote import Remote
+from teuthology.orchestra import run
+from teuthology.exceptions import CommandFailedError
+
+try:
+ from subprocess import DEVNULL # py3k
+except ImportError:
+ DEVNULL = open(os.devnull, 'r+')
+
+DEFAULT_CONF_PATH = '/etc/ceph/ceph.conf'
+
+log = logging.getLogger(__name__)
+
+
+def write_conf(ctx, conf_path=DEFAULT_CONF_PATH, cluster='ceph'):
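+ """
+ Write the conf file assembled for this cluster out to conf_path on
+ every remote in the run.
+ """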
+ conf_fp = StringIO()
+ ctx.ceph[cluster].conf.write(conf_fp)
+ conf_fp.seek(0)
+ writes = ctx.cluster.run(
+ args=[
+ 'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'),
+ 'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'),
+ 'sudo', 'python',
+ '-c',
+ ('import shutil, sys; '
+ 'shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))'),
+ conf_path,
+ run.Raw('&&'),
+ 'sudo', 'chmod', '0644', conf_path,
+ ],
+ stdin=run.PIPE,
+ wait=False)
+ teuthology.feed_many_stdins_and_close(conf_fp, writes)
+ run.wait(writes)
+
+
+def mount_osd_data(ctx, remote, cluster, osd):
+ """
+ Mount a remote OSD
+
+ :param ctx: Context
+ :param remote: Remote site
+ :param cluster: name of ceph cluster
+ :param osd: Osd name
+ """
+ log.debug('Mounting data for osd.{o} on {r}'.format(o=osd, r=remote))
+ role = "{0}.osd.{1}".format(cluster, osd)
+ alt_role = role if cluster != 'ceph' else "osd.{0}".format(osd)
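+ # disk_config may record the role with or without the default cluster
+ # prefix; prefer whichever form it actually contains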
+ if remote in ctx.disk_config.remote_to_roles_to_dev:
+ if alt_role in ctx.disk_config.remote_to_roles_to_dev[remote]:
+ role = alt_role
+ if role not in ctx.disk_config.remote_to_roles_to_dev[remote]:
+ return
+ dev = ctx.disk_config.remote_to_roles_to_dev[remote][role]
+ mount_options = ctx.disk_config.\
+ remote_to_roles_to_dev_mount_options[remote][role]
+ fstype = ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role]
+ mnt = os.path.join('/var/lib/ceph/osd', '{0}-{1}'.format(cluster, osd))
+
+ log.info('Mounting osd.{o} on {n}: cluster: {c}, '
+ 'mountpoint: {p}, type: {t}, options: {v}'.format(
+ o=osd, n=remote.name, p=mnt, t=fstype, v=mount_options,
+ c=cluster))
+
+ remote.run(
+ args=[
+ 'sudo',
+ 'mount',
+ '-t', fstype,
+ '-o', ','.join(mount_options),
+ dev,
+ mnt,
+ ]
+ )
+
+
+class Thrasher:
+ """
+ Object used to thrash Ceph
+ """
+ def __init__(self, manager, config, logger=None):
+ self.ceph_manager = manager
+ self.cluster = manager.cluster
+ self.ceph_manager.wait_for_clean()
+ osd_status = self.ceph_manager.get_osd_status()
+ self.in_osds = osd_status['in']
+ self.live_osds = osd_status['live']
+ self.out_osds = osd_status['out']
+ self.dead_osds = osd_status['dead']
+ self.stopping = False
+ self.logger = logger
+ self.config = config
+ self.revive_timeout = self.config.get("revive_timeout", 150)
+ if self.config.get('powercycle'):
+ self.revive_timeout += 120
+ self.clean_wait = self.config.get('clean_wait', 0)
+ self.minin = self.config.get("min_in", 3)
+ self.chance_move_pg = self.config.get('chance_move_pg', 1.0)
+ self.sighup_delay = self.config.get('sighup_delay')
+ self.optrack_toggle_delay = self.config.get('optrack_toggle_delay')
+ self.dump_ops_enable = self.config.get('dump_ops_enable')
+ self.noscrub_toggle_delay = self.config.get('noscrub_toggle_delay')
+
+ num_osds = len(self.in_osds + self.out_osds)
+ self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
+ if self.logger is not None:
+ self.log = lambda x: self.logger.info(x)
+ else:
+ def tmp(x):
+ """
+ Implement log behavior
+ """
+ print x
+ self.log = tmp
+ if self.config is None:
+ self.config = dict()
+ # prevent monitor from auto-marking things out while thrasher runs
+ # try both old and new tell syntax, in case we are testing old code
+ try:
+ manager.raw_cluster_cmd('--', 'tell', 'mon.*', 'injectargs',
+ '--mon-osd-down-out-interval 0')
+ except Exception:
+ manager.raw_cluster_cmd('--', 'mon', 'tell', '*', 'injectargs',
+ '--mon-osd-down-out-interval 0')
+ self.thread = gevent.spawn(self.do_thrash)
+ if self.sighup_delay:
+ self.sighup_thread = gevent.spawn(self.do_sighup)
+ if self.optrack_toggle_delay:
+ self.optrack_toggle_thread = gevent.spawn(self.do_optrack_toggle)
+ if self.dump_ops_enable == "true":
+ self.dump_ops_thread = gevent.spawn(self.do_dump_ops)
+ if self.noscrub_toggle_delay:
+ self.noscrub_toggle_thread = gevent.spawn(self.do_noscrub_toggle)
+ if (self.config.get('powercycle') or
+ not self.cmd_exists_on_osds("ceph-objectstore-tool") or
+ self.config.get('disable_objectstore_tool_tests', False)):
+ self.ceph_objectstore_tool = False
+ self.test_rm_past_intervals = False
+ if self.config.get('powercycle'):
+ self.log("Unable to test ceph-objectstore-tool, "
+ "powercycle testing")
+ else:
+ self.log("Unable to test ceph-objectstore-tool, "
+ "not available on all OSD nodes")
+ else:
+ self.ceph_objectstore_tool = \
+ self.config.get('ceph_objectstore_tool', True)
+ self.test_rm_past_intervals = \
+ self.config.get('test_rm_past_intervals', True)
+
+ def cmd_exists_on_osds(self, cmd):
+ allremotes = self.ceph_manager.ctx.cluster.only(\
+ teuthology.is_type('osd', self.cluster)).remotes.keys()
+ allremotes = list(set(allremotes))
+ for remote in allremotes:
+ proc = remote.run(args=['type', cmd], wait=True,
+ check_status=False, stdout=StringIO(),
+ stderr=StringIO())
+ if proc.exitstatus != 0:
+ return False
+ return True
+
+ def kill_osd(self, osd=None, mark_down=False, mark_out=False):
+ """
+ :param osd: Osd to be killed.
+ :mark_down: Mark down if true.
+ :mark_out: Mark out if true.
+ """
+ if osd is None:
+ osd = random.choice(self.live_osds)
+ self.log("Killing osd %s, live_osds are %s" % (str(osd),
+ str(self.live_osds)))
+ self.live_osds.remove(osd)
+ self.dead_osds.append(osd)
+ self.ceph_manager.kill_osd(osd)
+ if mark_down:
+ self.ceph_manager.mark_down_osd(osd)
+ if mark_out and osd in self.in_osds:
+ self.out_osd(osd)
+ if self.ceph_objectstore_tool:
+ self.log("Testing ceph-objectstore-tool on down osd")
+ remote = self.ceph_manager.find_remote('osd', osd)
+ FSPATH = self.ceph_manager.get_filepath()
+ JPATH = os.path.join(FSPATH, "journal")
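+ # the path returned by get_filepath() keeps a literal '{id}' placeholder;
+ # it is filled in below when each command string is formatted per osd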
+ exp_osd = imp_osd = osd
+ exp_remote = imp_remote = remote
+ # If an older osd is available we'll move a pg from there
+ if (len(self.dead_osds) > 1 and
+ random.random() < self.chance_move_pg):
+ exp_osd = random.choice(self.dead_osds[:-1])
+ exp_remote = self.ceph_manager.find_remote('osd', exp_osd)
+ if ('keyvaluestore_backend' in
+ self.ceph_manager.ctx.ceph[self.cluster].conf['osd']):
+ prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+ "--data-path {fpath} --journal-path {jpath} "
+ "--type keyvaluestore "
+ "--log-file="
+ "/var/log/ceph/objectstore_tool.\\$pid.log ".
+ format(fpath=FSPATH, jpath=JPATH))
+ else:
+ prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+ "--data-path {fpath} --journal-path {jpath} "
+ "--log-file="
+ "/var/log/ceph/objectstore_tool.\\$pid.log ".
+ format(fpath=FSPATH, jpath=JPATH))
+ cmd = (prefix + "--op list-pgs").format(id=exp_osd)
+
+ # ceph-objectstore-tool might be temporarily absent during an
+ # upgrade - see http://tracker.ceph.com/issues/18014
+ with safe_while(sleep=15, tries=40, action="type ceph-objectstore-tool") as proceed:
+ while proceed():
+ proc = exp_remote.run(args=['type', 'ceph-objectstore-tool'],
+ wait=True, check_status=False, stdout=StringIO(),
+ stderr=StringIO())
+ if proc.exitstatus == 0:
+ break
+ log.debug("ceph-objectstore-tool binary not present, trying again")
+
+ proc = exp_remote.run(args=cmd, wait=True,
+ check_status=False, stdout=StringIO())
+ if proc.exitstatus:
+ raise Exception("ceph-objectstore-tool: "
+ "exp list-pgs failure with status {ret}".
+ format(ret=proc.exitstatus))
+ pgs = proc.stdout.getvalue().split('\n')[:-1]
+ if len(pgs) == 0:
+ self.log("No PGs found for osd.{osd}".format(osd=exp_osd))
+ return
+ pg = random.choice(pgs)
+ exp_path = teuthology.get_testdir(self.ceph_manager.ctx)
+ exp_path = os.path.join(exp_path, '{0}.data'.format(self.cluster))
+ exp_path = os.path.join(exp_path,
+ "exp.{pg}.{id}".format(
+ pg=pg,
+ id=exp_osd))
+ # export
+ cmd = prefix + "--op export --pgid {pg} --file {file}"
+ cmd = cmd.format(id=exp_osd, pg=pg, file=exp_path)
+ proc = exp_remote.run(args=cmd)
+ if proc.exitstatus:
+ raise Exception("ceph-objectstore-tool: "
+ "export failure with status {ret}".
+ format(ret=proc.exitstatus))
+ # remove
+ cmd = prefix + "--op remove --pgid {pg}"
+ cmd = cmd.format(id=exp_osd, pg=pg)
+ proc = exp_remote.run(args=cmd)
+ if proc.exitstatus:
+ raise Exception("ceph-objectstore-tool: "
+ "remove failure with status {ret}".
+ format(ret=proc.exitstatus))
+ # If there are at least 2 dead osds we might move the pg
+ if exp_osd != imp_osd:
+ # If pg isn't already on this osd, then we will move it there
+ cmd = (prefix + "--op list-pgs").format(id=imp_osd)
+ proc = imp_remote.run(args=cmd, wait=True,
+ check_status=False, stdout=StringIO())
+ if proc.exitstatus:
+ raise Exception("ceph-objectstore-tool: "
+ "imp list-pgs failure with status {ret}".
+ format(ret=proc.exitstatus))
+ pgs = proc.stdout.getvalue().split('\n')[:-1]
+ if pg not in pgs:
+ self.log("Moving pg {pg} from osd.{fosd} to osd.{tosd}".
+ format(pg=pg, fosd=exp_osd, tosd=imp_osd))
+ if imp_remote != exp_remote:
+ # Copy export file to the other machine
+ self.log("Transfer export file from {srem} to {trem}".
+ format(srem=exp_remote, trem=imp_remote))
+ tmpexport = Remote.get_file(exp_remote, exp_path)
+ Remote.put_file(imp_remote, tmpexport, exp_path)
+ os.remove(tmpexport)
+ else:
+ # Can't move the pg after all
+ imp_osd = exp_osd
+ imp_remote = exp_remote
+ # import
+ cmd = (prefix + "--op import --file {file}")
+ cmd = cmd.format(id=imp_osd, file=exp_path)
+ proc = imp_remote.run(args=cmd, wait=True, check_status=False)
+ if proc.exitstatus == 10:
+ self.log("Pool went away before processing an import"
+ "...ignored")
+ elif proc.exitstatus == 11:
+ self.log("Attempt to import an incompatible export"
+ "...ignored")
+ elif proc.exitstatus:
+ raise Exception("ceph-objectstore-tool: "
+ "import failure with status {ret}".
+ format(ret=proc.exitstatus))
+ cmd = "rm -f {file}".format(file=exp_path)
+ exp_remote.run(args=cmd)
+ if imp_remote != exp_remote:
+ imp_remote.run(args=cmd)
+
+ # apply low split settings to each pool
+ for pool in self.ceph_manager.list_pools():
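+ # strip the leading "sudo " (5 chars) so CEPH_ARGS can be set in the
+ # environment before re-running the tool under 'sudo -E'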
+ no_sudo_prefix = prefix[5:]
+ cmd = ("CEPH_ARGS='--filestore-merge-threshold 1 "
+ "--filestore-split-multiple 1' sudo -E "
+ + no_sudo_prefix + "--op apply-layout-settings --pool " + pool).format(id=osd)
+ proc = remote.run(args=cmd, wait=True, check_status=False, stderr=StringIO())
+ output = proc.stderr.getvalue()
+ if 'Couldn\'t find pool' in output:
+ continue
+ if proc.exitstatus:
+ raise Exception("ceph-objectstore-tool apply-layout-settings"
+ " failed with {status}".format(status=proc.exitstatus))
+
+ def rm_past_intervals(self, osd=None):
+ """
+ :param osd: osd from which a PG is chosen to have its past intervals removed
+ """
+ if self.test_rm_past_intervals:
+ if osd is None:
+ osd = random.choice(self.dead_osds)
+ self.log("Use ceph_objectstore_tool to remove past intervals")
+ remote = self.ceph_manager.find_remote('osd', osd)
+ FSPATH = self.ceph_manager.get_filepath()
+ JPATH = os.path.join(FSPATH, "journal")
+ if ('keyvaluestore_backend' in
+ self.ceph_manager.ctx.ceph[self.cluster].conf['osd']):
+ prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+ "--data-path {fpath} --journal-path {jpath} "
+ "--type keyvaluestore "
+ "--log-file="
+ "/var/log/ceph/objectstore_tool.\\$pid.log ".
+ format(fpath=FSPATH, jpath=JPATH))
+ else:
+ prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+ "--data-path {fpath} --journal-path {jpath} "
+ "--log-file="
+ "/var/log/ceph/objectstore_tool.\\$pid.log ".
+ format(fpath=FSPATH, jpath=JPATH))
+ cmd = (prefix + "--op list-pgs").format(id=osd)
+ proc = remote.run(args=cmd, wait=True,
+ check_status=False, stdout=StringIO())
+ if proc.exitstatus:
+ raise Exception("ceph_objectstore_tool: "
+ "exp list-pgs failure with status {ret}".
+ format(ret=proc.exitstatus))
+ pgs = proc.stdout.getvalue().split('\n')[:-1]
+ if len(pgs) == 0:
+ self.log("No PGs found for osd.{osd}".format(osd=osd))
+ return
+ pg = random.choice(pgs)
+ cmd = (prefix + "--op rm-past-intervals --pgid {pg}").\
+ format(id=osd, pg=pg)
+ proc = remote.run(args=cmd)
+ if proc.exitstatus:
+ raise Exception("ceph_objectstore_tool: "
+ "rm-past-intervals failure with status {ret}".
+ format(ret=proc.exitstatus))
+
+ def blackhole_kill_osd(self, osd=None):
+ """
+ If all else fails, kill the osd.
+ :param osd: Osd to be killed.
+ """
+ if osd is None:
+ osd = random.choice(self.live_osds)
+ self.log("Blackholing and then killing osd %s, live_osds are %s" %
+ (str(osd), str(self.live_osds)))
+ self.live_osds.remove(osd)
+ self.dead_osds.append(osd)
+ self.ceph_manager.blackhole_kill_osd(osd)
+
+ def revive_osd(self, osd=None, skip_admin_check=False):
+ """
+ Revive the osd.
+ :param osd: Osd to be revived.
+ """
+ if osd is None:
+ osd = random.choice(self.dead_osds)
+ self.log("Reviving osd %s" % (str(osd),))
+ self.ceph_manager.revive_osd(
+ osd,
+ self.revive_timeout,
+ skip_admin_check=skip_admin_check)
+ self.dead_osds.remove(osd)
+ self.live_osds.append(osd)
+
+ def out_osd(self, osd=None):
+ """
+ Mark the osd out
+ :param osd: Osd to be marked.
+ """
+ if osd is None:
+ osd = random.choice(self.in_osds)
+ self.log("Removing osd %s, in_osds are: %s" %
+ (str(osd), str(self.in_osds)))
+ self.ceph_manager.mark_out_osd(osd)
+ self.in_osds.remove(osd)
+ self.out_osds.append(osd)
+
+ def in_osd(self, osd=None):
+ """
+ Mark the osd in (reviving it first if it is currently dead).
+ :param osd: Osd to be marked.
+ """
+ if osd is None:
+ osd = random.choice(self.out_osds)
+ if osd in self.dead_osds:
+ return self.revive_osd(osd)
+ self.log("Adding osd %s" % (str(osd),))
+ self.out_osds.remove(osd)
+ self.in_osds.append(osd)
+ self.ceph_manager.mark_in_osd(osd)
+ self.log("Added osd %s" % (str(osd),))
+
+ def reweight_osd_or_by_util(self, osd=None):
+ """
+ Reweight an osd that is in
+ :param osd: Osd to be marked.
+ """
+ if osd is not None or random.choice([True, False]):
+ if osd is None:
+ osd = random.choice(self.in_osds)
+ val = random.uniform(.1, 1.0)
+ self.log("Reweighting osd %s to %s" % (str(osd), str(val)))
+ self.ceph_manager.raw_cluster_cmd('osd', 'reweight',
+ str(osd), str(val))
+ else:
+ # do it several times, the option space is large
+ for i in range(5):
+ options = {
+ 'max_change': random.choice(['0.05', '1.0', '3.0']),
+ 'overage': random.choice(['110', '1000']),
+ 'type': random.choice([
+ 'reweight-by-utilization',
+ 'test-reweight-by-utilization']),
+ }
+ self.log("Reweighting by: %s"%(str(options),))
+ self.ceph_manager.raw_cluster_cmd(
+ 'osd',
+ options['type'],
+ options['overage'],
+ options['max_change'])
+
+ def primary_affinity(self, osd=None):
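+ # give the osd a random primary affinity: roughly half the time a
+ # uniform value in [0, 1), otherwise pin it to exactly 1 or 0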
+ if osd is None:
+ osd = random.choice(self.in_osds)
+ if random.random() >= .5:
+ pa = random.random()
+ elif random.random() >= .5:
+ pa = 1
+ else:
+ pa = 0
+ self.log('Setting osd %s primary_affinity to %f' % (str(osd), pa))
+ self.ceph_manager.raw_cluster_cmd('osd', 'primary-affinity',
+ str(osd), str(pa))
+
+ def all_up(self):
+ """
+ Make sure all osds are up and not out.
+ """
+ while len(self.dead_osds) > 0:
+ self.log("reviving osd")
+ self.revive_osd()
+ while len(self.out_osds) > 0:
+ self.log("inning osd")
+ self.in_osd()
+
+ def do_join(self):
+ """
+ Stop the thrasher and wait for all of its greenlets to finish.
+ """
+ self.stopping = True
+ self.thread.get()
+ if self.sighup_delay:
+ self.log("joining the do_sighup greenlet")
+ self.sighup_thread.get()
+ if self.optrack_toggle_delay:
+ self.log("joining the do_optrack_toggle greenlet")
+ self.optrack_toggle_thread.join()
+ if self.dump_ops_enable == "true":
+ self.log("joining the do_dump_ops greenlet")
+ self.dump_ops_thread.join()
+ if self.noscrub_toggle_delay:
+ self.log("joining the do_noscrub_toggle greenlet")
+ self.noscrub_toggle_thread.join()
+
+ def grow_pool(self):
+ """
+ Increase the number of PGs in a pool (up to max_pgs).
+ """
+ pool = self.ceph_manager.get_pool()
+ self.log("Growing pool %s" % (pool,))
+ self.ceph_manager.expand_pool(pool,
+ self.config.get('pool_grow_by', 10),
+ self.max_pgs)
+
+ def fix_pgp_num(self):
+ """
+ Bring a pool's pgp_num back in line with its pg_num.
+ """
+ pool = self.ceph_manager.get_pool()
+ self.log("fixing pg num pool %s" % (pool,))
+ self.ceph_manager.set_pool_pgpnum(pool)
+
+ def test_pool_min_size(self):
+ """
+ Kill and revive all osds except one.
+ """
+ self.log("test_pool_min_size")
+ self.all_up()
+ self.ceph_manager.wait_for_recovery(
+ timeout=self.config.get('timeout')
+ )
+ the_one = random.choice(self.in_osds)
+ self.log("Killing everyone but %s", the_one)
+ to_kill = filter(lambda x: x != the_one, self.in_osds)
+ [self.kill_osd(i) for i in to_kill]
+ [self.out_osd(i) for i in to_kill]
+ time.sleep(self.config.get("test_pool_min_size_time", 10))
+ self.log("Killing %s" % (the_one,))
+ self.kill_osd(the_one)
+ self.out_osd(the_one)
+ self.log("Reviving everyone but %s" % (the_one,))
+ [self.revive_osd(i) for i in to_kill]
+ [self.in_osd(i) for i in to_kill]
+ self.log("Revived everyone but %s" % (the_one,))
+ self.log("Waiting for clean")
+ self.ceph_manager.wait_for_recovery(
+ timeout=self.config.get('timeout')
+ )
+
+ def inject_pause(self, conf_key, duration, check_after, should_be_down):
+ """
+ Pause injection testing. Check for osd being down when finished.
+ """
+ the_one = random.choice(self.live_osds)
+ self.log("inject_pause on {osd}".format(osd=the_one))
+ self.log(
+ "Testing {key} pause injection for duration {duration}".format(
+ key=conf_key,
+ duration=duration
+ ))
+ self.log(
+ "Checking after {after}, should_be_down={shouldbedown}".format(
+ after=check_after,
+ shouldbedown=should_be_down
+ ))
+ self.ceph_manager.set_config(the_one, **{conf_key: duration})
+ if not should_be_down:
+ return
+ time.sleep(check_after)
+ status = self.ceph_manager.get_osd_status()
+ assert the_one in status['down']
+ time.sleep(duration - check_after + 20)
+ status = self.ceph_manager.get_osd_status()
+ assert the_one not in status['down']
+
+ def test_backfill_full(self):
+ """
+ Test backfills stopping when the replica fills up.
+
+ First, use osd_backfill_full_ratio to simulate a now full
+ osd by setting it to 0 on all of the OSDs.
+
+ Second, on a random subset, set
+ osd_debug_skip_full_check_in_backfill_reservation to force
+ the more complicated check in do_scan to be exercised.
+
+ Then, verify that all backfills stop.
+ """
+ self.log("injecting osd_backfill_full_ratio = 0")
+ for i in self.live_osds:
+ self.ceph_manager.set_config(
+ i,
+ osd_debug_skip_full_check_in_backfill_reservation=
+ random.choice(['false', 'true']),
+ osd_backfill_full_ratio=0)
+ for i in range(30):
+ status = self.ceph_manager.compile_pg_status()
+ if 'backfill' not in status.keys():
+ break
+ self.log(
+ "waiting for {still_going} backfills".format(
+ still_going=status.get('backfill')))
+ time.sleep(1)
+ assert('backfill' not in self.ceph_manager.compile_pg_status().keys())
+ for i in self.live_osds:
+ self.ceph_manager.set_config(
+ i,
+ osd_debug_skip_full_check_in_backfill_reservation='false',
+ osd_backfill_full_ratio=0.85)
+
+ def test_map_discontinuity(self):
+ """
+ 1) Allows the osds to recover
+ 2) kills an osd
+ 3) allows the remaining osds to recover
+ 4) waits for some time
+ 5) revives the osd
+ This sequence should cause the revived osd to have to handle
+ a map gap, since the mons will have trimmed old osdmaps in the meantime.
+ """
+ while len(self.in_osds) < (self.minin + 1):
+ self.in_osd()
+ self.log("Waiting for recovery")
+ self.ceph_manager.wait_for_all_up(
+ timeout=self.config.get('timeout')
+ )
+ # now we wait 20s for the pg status to change, if it takes longer,
+ # the test *should* fail!
+ time.sleep(20)
+ self.ceph_manager.wait_for_clean(
+ timeout=self.config.get('timeout')
+ )
+
+ # now we wait 20s for the backfill replicas to hear about the clean
+ time.sleep(20)
+ self.log("Recovered, killing an osd")
+ self.kill_osd(mark_down=True, mark_out=True)
+ self.log("Waiting for clean again")
+ self.ceph_manager.wait_for_clean(
+ timeout=self.config.get('timeout')
+ )
+ self.log("Waiting for trim")
+ time.sleep(int(self.config.get("map_discontinuity_sleep_time", 40)))
+ self.revive_osd()
+
+ def choose_action(self):
+ """
+ Random action selector.
+ """
+ chance_down = self.config.get('chance_down', 0.4)
+ chance_test_min_size = self.config.get('chance_test_min_size', 0)
+ chance_test_backfill_full = \
+ self.config.get('chance_test_backfill_full', 0)
+ if isinstance(chance_down, int):
+ chance_down = float(chance_down) / 100
+ minin = self.minin
+ minout = self.config.get("min_out", 0)
+ minlive = self.config.get("min_live", 2)
+ mindead = self.config.get("min_dead", 0)
+
+ self.log('choose_action: min_in %d min_out '
+ '%d min_live %d min_dead %d' %
+ (minin, minout, minlive, mindead))
+ actions = []
+ if len(self.in_osds) > minin:
+ actions.append((self.out_osd, 1.0,))
+ if len(self.live_osds) > minlive and chance_down > 0:
+ actions.append((self.kill_osd, chance_down,))
+ if len(self.dead_osds) > 1:
+ actions.append((self.rm_past_intervals, 1.0,))
+ if len(self.out_osds) > minout:
+ actions.append((self.in_osd, 1.7,))
+ if len(self.dead_osds) > mindead:
+ actions.append((self.revive_osd, 1.0,))
+ if self.config.get('thrash_primary_affinity', True):
+ actions.append((self.primary_affinity, 1.0,))
+ actions.append((self.reweight_osd_or_by_util,
+ self.config.get('reweight_osd', .5),))
+ actions.append((self.grow_pool,
+ self.config.get('chance_pgnum_grow', 0),))
+ actions.append((self.fix_pgp_num,
+ self.config.get('chance_pgpnum_fix', 0),))
+ actions.append((self.test_pool_min_size,
+ chance_test_min_size,))
+ actions.append((self.test_backfill_full,
+ chance_test_backfill_full,))
+ for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
+ for scenario in [
+ (lambda:
+ self.inject_pause(key,
+ self.config.get('pause_short', 3),
+ 0,
+ False),
+ self.config.get('chance_inject_pause_short', 1),),
+ (lambda:
+ self.inject_pause(key,
+ self.config.get('pause_long', 80),
+ self.config.get('pause_check_after', 70),
+ True),
+ self.config.get('chance_inject_pause_long', 0),)]:
+ actions.append(scenario)
+
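+ # weighted random selection: draw a value in [0, total) and walk the
+ # list, subtracting weights until one action's weight covers the remainder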
+ total = sum([y for (x, y) in actions])
+ val = random.uniform(0, total)
+ for (action, prob) in actions:
+ if val < prob:
+ return action
+ val -= prob
+ return None
+
+ def log_exc(func):
+ @wraps(func)
+ def wrapper(self):
+ try:
+ return func(self)
+ except:
+ self.log(traceback.format_exc())
+ raise
+ return wrapper
+
+ @log_exc
+ def do_sighup(self):
+ """
+ Loops and sends signal.SIGHUP to a random live osd.
+
+ Loop delay is controlled by the config value sighup_delay.
+ """
+ delay = float(self.sighup_delay)
+ self.log("starting do_sighup with a delay of {0}".format(delay))
+ while not self.stopping:
+ osd = random.choice(self.live_osds)
+ self.ceph_manager.signal_osd(osd, signal.SIGHUP, silent=True)
+ time.sleep(delay)
+
+ @log_exc
+ def do_optrack_toggle(self):
+ """
+ Loop, toggling op tracking on all osds.
+
+ Loop delay is controlled by the config value optrack_toggle_delay.
+ """
+ delay = float(self.optrack_toggle_delay)
+ osd_state = "true"
+ self.log("starting do_optrack_toggle with a delay of {0}".format(delay))
+ while not self.stopping:
+ if osd_state == "true":
+ osd_state = "false"
+ else:
+ osd_state = "true"
+ self.ceph_manager.raw_cluster_cmd_result('tell', 'osd.*',
+ 'injectargs', '--osd_enable_op_tracker=%s' % osd_state)
+ gevent.sleep(delay)
+
+ @log_exc
+ def do_dump_ops(self):
+ """
+ Loops and does op dumps on all osds
+ """
+ self.log("starting do_dump_ops")
+ while not self.stopping:
+ for osd in self.live_osds:
+ # Ignore errors because live_osds is in flux
+ self.ceph_manager.osd_admin_socket(osd, command=['dump_ops_in_flight'],
+ check_status=False, timeout=30, stdout=DEVNULL)
+ self.ceph_manager.osd_admin_socket(osd, command=['dump_blocked_ops'],
+ check_status=False, timeout=30, stdout=DEVNULL)
+ self.ceph_manager.osd_admin_socket(osd, command=['dump_historic_ops'],
+ check_status=False, timeout=30, stdout=DEVNULL)
+ gevent.sleep(0)
+
+ @log_exc
+ def do_noscrub_toggle(self):
+ """
+ Loop, toggling the noscrub and nodeep-scrub flags.
+
+ Loop delay is controlled by the config value noscrub_toggle_delay.
+ """
+ delay = float(self.noscrub_toggle_delay)
+ scrub_state = "none"
+ self.log("starting do_noscrub_toggle with a delay of {0}".format(delay))
+ while not self.stopping:
+ if scrub_state == "none":
+ self.ceph_manager.raw_cluster_cmd('osd', 'set', 'noscrub')
+ scrub_state = "noscrub"
+ elif scrub_state == "noscrub":
+ self.ceph_manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')
+ scrub_state = "both"
+ elif scrub_state == "both":
+ self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
+ scrub_state = "nodeep-scrub"
+ else:
+ self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
+ scrub_state = "none"
+ gevent.sleep(delay)
+ self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
+ self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
+
+ @log_exc
+ def do_thrash(self):
+ """
+ Loop to select random actions to thrash ceph manager with.
+ """
+ cleanint = self.config.get("clean_interval", 60)
+ scrubint = self.config.get("scrub_interval", -1)
+ maxdead = self.config.get("max_dead", 0)
+ delay = self.config.get("op_delay", 5)
+ self.log("starting do_thrash")
+ while not self.stopping:
+ to_log = [str(x) for x in ["in_osds: ", self.in_osds,
+ "out_osds: ", self.out_osds,
+ "dead_osds: ", self.dead_osds,
+ "live_osds: ", self.live_osds]]
+ self.log(" ".join(to_log))
+ if random.uniform(0, 1) < (float(delay) / cleanint):
+ while len(self.dead_osds) > maxdead:
+ self.revive_osd()
+ for osd in self.in_osds:
+ self.ceph_manager.raw_cluster_cmd('osd', 'reweight',
+ str(osd), str(1))
+ if random.uniform(0, 1) < float(
+ self.config.get('chance_test_map_discontinuity', 0)):
+ self.test_map_discontinuity()
+ else:
+ self.ceph_manager.wait_for_recovery(
+ timeout=self.config.get('timeout')
+ )
+ time.sleep(self.clean_wait)
+ if scrubint > 0:
+ if random.uniform(0, 1) < (float(delay) / scrubint):
+ self.log('Scrubbing while thrashing being performed')
+ Scrubber(self.ceph_manager, self.config)
+ self.choose_action()()
+ time.sleep(delay)
+ self.all_up()
+
+
+class ObjectStoreTool:
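+ """
+ Helper for running ceph-objectstore-tool against an osd: the osd is
+ stopped before the command runs and, by default, revived afterwards.
+ """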
+
+ def __init__(self, manager, pool, **kwargs):
+ self.manager = manager
+ self.pool = pool
+ self.osd = kwargs.get('osd', None)
+ self.object_name = kwargs.get('object_name', None)
+ self.do_revive = kwargs.get('do_revive', True)
+ if self.osd and self.pool and self.object_name:
+ if self.osd == "primary":
+ self.osd = self.manager.get_object_primary(self.pool,
+ self.object_name)
+ assert self.osd
+ if self.object_name:
+ self.pgid = self.manager.get_object_pg_with_shard(self.pool,
+ self.object_name,
+ self.osd)
+ self.remote = self.manager.ctx.\
+ cluster.only('osd.{o}'.format(o=self.osd)).remotes.keys()[0]
+ path = self.manager.get_filepath().format(id=self.osd)
+ self.paths = ("--data-path {path} --journal-path {path}/journal".
+ format(path=path))
+
+ def build_cmd(self, options, args, stdin):
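+ # Assemble a small shell script: when an object name was given, first
+ # resolve its listing entry via '--op list' piped through grep into
+ # $object, then run the requested op against it; stdin, if provided,
+ # is shipped base64-encoded and decoded on the remote side.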
+ lines = []
+ if self.object_name:
+ lines.append("object=$(sudo adjust-ulimits ceph-objectstore-tool "
+ "{paths} --pgid {pgid} --op list |"
+ "grep '\"oid\":\"{name}\"')".
+ format(paths=self.paths,
+ pgid=self.pgid,
+ name=self.object_name))
+ args = '"$object" ' + args
+ options += " --pgid {pgid}".format(pgid=self.pgid)
+ cmd = ("sudo adjust-ulimits ceph-objectstore-tool {paths} {options} {args}".
+ format(paths=self.paths,
+ args=args,
+ options=options))
+ if stdin:
+ cmd = ("echo {payload} | base64 --decode | {cmd}".
+ format(payload=base64.b64encode(stdin),
+ cmd=cmd))
+ lines.append(cmd)
+ return "\n".join(lines)
+
+ def run(self, options, args, stdin=None, stdout=None):
+ if stdout is None:
+ stdout = StringIO()
+ self.manager.kill_osd(self.osd)
+ cmd = self.build_cmd(options, args, stdin)
+ self.manager.log(cmd)
+ try:
+ proc = self.remote.run(args=['bash', '-e', '-x', '-c', cmd],
+ check_status=False,
+ stdout=stdout,
+ stderr=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ self.manager.log("failed with " + str(proc.exitstatus))
+ error = proc.stdout.getvalue() + " " + proc.stderr.getvalue()
+ raise Exception(error)
+ finally:
+ if self.do_revive:
+ self.manager.revive_osd(self.osd)
+
+
+class CephManager:
+ """
+ Ceph manager object.
+ Contains several local functions that form the bulk of this module.
+
+ Note: this class has nothing to do with the Ceph daemon (ceph-mgr) of
+ the same name.
+ """
+
+ REPLICATED_POOL = 1
+ ERASURE_CODED_POOL = 3
+
+ def __init__(self, controller, ctx=None, config=None, logger=None,
+ cluster='ceph'):
+ self.lock = threading.RLock()
+ self.ctx = ctx
+ self.config = config
+ self.controller = controller
+ self.next_pool_id = 0
+ self.cluster = cluster
+ if (logger):
+ self.log = lambda x: logger.info(x)
+ else:
+ def tmp(x):
+ """
+ implement log behavior.
+ """
+ print x
+ self.log = tmp
+ if self.config is None:
+ self.config = dict()
+ pools = self.list_pools()
+ self.pools = {}
+ for pool in pools:
+ # we may race with a pool deletion; ignore failures here
+ try:
+ self.pools[pool] = self.get_pool_property(pool, 'pg_num')
+ except CommandFailedError:
+ self.log('Failed to get pg_num from pool %s, ignoring' % pool)
+
+ def raw_cluster_cmd(self, *args):
+ """
+ Run a 'ceph' command against the cluster and return its stdout.
+ """
+ testdir = teuthology.get_testdir(self.ctx)
+ ceph_args = [
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'timeout',
+ '120',
+ 'ceph',
+ '--cluster',
+ self.cluster,
+ ]
+ ceph_args.extend(args)
+ proc = self.controller.run(
+ args=ceph_args,
+ stdout=StringIO(),
+ )
+ return proc.stdout.getvalue()
+
+ def raw_cluster_cmd_result(self, *args):
+ """
+ Run a 'ceph' command against the cluster and return its exit status.
+ """
+ testdir = teuthology.get_testdir(self.ctx)
+ ceph_args = [
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'timeout',
+ '120',
+ 'ceph',
+ '--cluster',
+ self.cluster,
+ ]
+ ceph_args.extend(args)
+ proc = self.controller.run(
+ args=ceph_args,
+ check_status=False,
+ )
+ return proc.exitstatus
+
+ def run_ceph_w(self):
+ """
+ Execute "ceph -w" in the background with stdout connected to a StringIO,
+ and return the RemoteProcess.
+ """
+ return self.controller.run(
+ args=["sudo",
+ "daemon-helper",
+ "kill",
+ "ceph",
+ '--cluster',
+ self.cluster,
+ "-w"],
+ wait=False, stdout=StringIO(), stdin=run.PIPE)
+
+ def do_rados(self, remote, cmd, check_status=True):
+ """
+ Execute a remote rados command.
+ """
+ testdir = teuthology.get_testdir(self.ctx)
+ pre = [
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rados',
+ '--cluster',
+ self.cluster,
+ ]
+ pre.extend(cmd)
+ proc = remote.run(
+ args=pre,
+ wait=True,
+ check_status=check_status
+ )
+ return proc
+
+ def rados_write_objects(self, pool, num_objects, size,
+ timelimit, threads, cleanup=False):
+ """
+ Write objects using 'rados bench'.
+ The threads parameter is not used yet.
+ """
+ args = [
+ '-p', pool,
+ '--num-objects', num_objects,
+ '-b', size,
+ 'bench', timelimit,
+ 'write'
+ ]
+ if not cleanup:
+ args.append('--no-cleanup')
+ return self.do_rados(self.controller, map(str, args))
+
+ def do_put(self, pool, obj, fname, namespace=None):
+ """
+ Implement rados put operation
+ """
+ args = ['-p', pool]
+ if namespace is not None:
+ args += ['-N', namespace]
+ args += [
+ 'put',
+ obj,
+ fname
+ ]
+ return self.do_rados(
+ self.controller,
+ args,
+ check_status=False
+ ).exitstatus
+
+ def do_get(self, pool, obj, fname='/dev/null', namespace=None):
+ """
+ Implement rados get operation
+ """
+ args = ['-p', pool]
+ if namespace is not None:
+ args += ['-N', namespace]
+ args += [
+ 'get',
+ obj,
+ fname
+ ]
+ return self.do_rados(
+ self.controller,
+ args,
+ check_status=False
+ ).exitstatus
+
+ def do_rm(self, pool, obj, namespace=None):
+ """
+ Implement rados rm operation
+ """
+ args = ['-p', pool]
+ if namespace is not None:
+ args += ['-N', namespace]
+ args += [
+ 'rm',
+ obj
+ ]
+ return self.do_rados(
+ self.controller,
+ args,
+ check_status=False
+ ).exitstatus
+
+ def osd_admin_socket(self, osd_id, command, check_status=True, timeout=0, stdout=None):
+ if stdout is None:
+ stdout = StringIO()
+ return self.admin_socket('osd', osd_id, command, check_status, timeout, stdout)
+
+ def find_remote(self, service_type, service_id):
+ """
+ Get the Remote for the host where a particular service runs.
+
+ :param service_type: 'mds', 'osd', 'client'
+ :param service_id: The second part of a role, e.g. '0' for
+ the role 'client.0'
+ :return: a Remote instance for the host where the
+ requested role is placed
+ """
+ return get_remote(self.ctx, self.cluster,
+ service_type, service_id)
+
+ def admin_socket(self, service_type, service_id,
+ command, check_status=True, timeout=0, stdout=None):
+ """
+ Run a 'ceph --admin-daemon' command against the given daemon's
+ admin socket on its remote host.
+ :param command: a list of words to use as the command
+ to the admin socket
+ """
+ if stdout is None:
+ stdout = StringIO()
+ testdir = teuthology.get_testdir(self.ctx)
+ remote = self.find_remote(service_type, service_id)
+ args = [
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'timeout',
+ str(timeout),
+ 'ceph',
+ '--cluster',
+ self.cluster,
+ '--admin-daemon',
+ '/var/run/ceph/{cluster}-{type}.{id}.asok'.format(
+ cluster=self.cluster,
+ type=service_type,
+ id=service_id),
+ ]
+ args.extend(command)
+ return remote.run(
+ args=args,
+ stdout=stdout,
+ wait=True,
+ check_status=check_status
+ )
+
+ def objectstore_tool(self, pool, options, args, **kwargs):
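+ # Stop the owning osd, run ceph-objectstore-tool against it (optionally
+ # targeting one object), then revive the osd. Illustrative call
+ # (the pool and object names are hypothetical):
+ #   manager.objectstore_tool('rbd', options='', args='remove',
+ #                            object_name='obj1', osd='primary')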
+ return ObjectStoreTool(self, pool, **kwargs).run(options, args)
+
+ def get_pgid(self, pool, pgnum):
+ """
+ :param pool: pool name
+ :param pgnum: pg number
+ :returns: a string representing this pg.
+ """
+ poolnum = self.get_pool_num(pool)
+ pg_str = "{poolnum}.{pgnum}".format(
+ poolnum=poolnum,
+ pgnum=pgnum)
+ return pg_str
+
+ def get_pg_replica(self, pool, pgnum):
+ """
+ get replica for pool, pgnum (e.g. (data, 0) -> 0)
+ """
+ output = self.raw_cluster_cmd("pg", "dump", '--format=json')
+ j = json.loads('\n'.join(output.split('\n')[1:]))
+ pg_str = self.get_pgid(pool, pgnum)
+ for pg in j['pg_stats']:
+ if pg['pgid'] == pg_str:
+ return int(pg['acting'][-1])
+ assert False
+
+ def get_pg_primary(self, pool, pgnum):
+ """
+ get primary for pool, pgnum (e.g. (data, 0) -> 0)
+ """
+ output = self.raw_cluster_cmd("pg", "dump", '--format=json')
+ j = json.loads('\n'.join(output.split('\n')[1:]))
+ pg_str = self.get_pgid(pool, pgnum)
+ for pg in j['pg_stats']:
+ if pg['pgid'] == pg_str:
+ return int(pg['acting'][0])
+ assert False
+
+ def get_pool_num(self, pool):
+ """
+ get number for pool (e.g., data -> 2)
+ """
+ return int(self.get_pool_dump(pool)['pool'])
+
+ def list_pools(self):
+ """
+ list all pool names
+ """
+ osd_dump = self.get_osd_dump_json()
+ self.log(osd_dump['pools'])
+ return [str(i['pool_name']) for i in osd_dump['pools']]
+
+ def clear_pools(self):
+ """
+ remove all pools
+ """
+ [self.remove_pool(i) for i in self.list_pools()]
+
+ def kick_recovery_wq(self, osdnum):
+ """
+ Run kick_recovery_wq on cluster.
+ """
+ return self.raw_cluster_cmd(
+ 'tell', "osd.%d" % (int(osdnum),),
+ 'debug',
+ 'kick_recovery_wq',
+ '0')
+
+ def wait_run_admin_socket(self, service_type,
+ service_id, args=['version'], timeout=75, stdout=None):
+ """
+ If the admin_socket call succeeds, return. Otherwise wait
+ five seconds and try again.
+ """
+ if stdout is None:
+ stdout = StringIO()
+ tries = 0
+ while True:
+ proc = self.admin_socket(service_type, service_id,
+ args, check_status=False, stdout=stdout)
+ if proc.exitstatus == 0:
+ break
+ else:
+ tries += 1
+ if (tries * 5) > timeout:
+ raise Exception('timed out waiting for admin_socket '
+ 'to appear after {type}.{id} restart'.
+ format(type=service_type,
+ id=service_id))
+ self.log("waiting on admin_socket for {type}-{id}, "
+ "{command}".format(type=service_type,
+ id=service_id,
+ command=args))
+ time.sleep(5)
+
+ def get_pool_dump(self, pool):
+ """
+ Return the given pool's entry from the osd dump.
+ """
+ osd_dump = self.get_osd_dump_json()
+ for i in osd_dump['pools']:
+ if i['pool_name'] == pool:
+ return i
+ assert False
+
+ def set_config(self, osdnum, **argdict):
+ """
+ :param osdnum: osd number
+ :param argdict: dictionary containing values to set.
+ """
+ for k, v in argdict.iteritems():
+ self.wait_run_admin_socket(
+ 'osd', osdnum,
+ ['config', 'set', str(k), str(v)])
+
+ def raw_cluster_status(self):
+ """
+ Get status from cluster
+ """
+ status = self.raw_cluster_cmd('status', '--format=json-pretty')
+ return json.loads(status)
+
+ def raw_osd_status(self):
+ """
+ Get osd status from cluster
+ """
+ return self.raw_cluster_cmd('osd', 'dump')
+
+ def get_osd_status(self):
+ """
+ Get osd statuses sorted by states that the osds are in.
+ """
+ osd_lines = filter(
+ lambda x: x.startswith('osd.') and (("up" in x) or ("down" in x)),
+ self.raw_osd_status().split('\n'))
+ self.log(osd_lines)
+ in_osds = [int(i[4:].split()[0])
+ for i in filter(lambda x: " in " in x, osd_lines)]
+ out_osds = [int(i[4:].split()[0])
+ for i in filter(lambda x: " out " in x, osd_lines)]
+ up_osds = [int(i[4:].split()[0])
+ for i in filter(lambda x: " up " in x, osd_lines)]
+ down_osds = [int(i[4:].split()[0])
+ for i in filter(lambda x: " down " in x, osd_lines)]
+ dead_osds = [int(x.id_)
+ for x in filter(lambda x:
+ not x.running(),
+ self.ctx.daemons.
+ iter_daemons_of_role('osd', self.cluster))]
+ live_osds = [int(x.id_) for x in
+ filter(lambda x:
+ x.running(),
+ self.ctx.daemons.iter_daemons_of_role('osd',
+ self.cluster))]
+ return {'in': in_osds, 'out': out_osds, 'up': up_osds,
+ 'down': down_osds, 'dead': dead_osds, 'live': live_osds,
+ 'raw': osd_lines}
+
+ def get_num_pgs(self):
+ """
+ Check cluster status for the number of pgs
+ """
+ status = self.raw_cluster_status()
+ self.log(status)
+ return status['pgmap']['num_pgs']
+
+ def create_erasure_code_profile(self, profile_name, profile):
+ """
+ Create an erasure code profile name that can be used as a parameter
+ when creating an erasure coded pool.
+ """
+ with self.lock:
+ args = cmd_erasure_code_profile(profile_name, profile)
+ self.raw_cluster_cmd(*args)
+
+ def create_pool_with_unique_name(self, pg_num=16,
+ erasure_code_profile_name=None,
+ min_size=None,
+ erasure_code_use_hacky_overwrites=False):
+ """
+ Create a pool named unique_pool_X where X is unique.
+ """
+ name = ""
+ with self.lock:
+ name = "unique_pool_%s" % (str(self.next_pool_id),)
+ self.next_pool_id += 1
+ self.create_pool(
+ name,
+ pg_num,
+ erasure_code_profile_name=erasure_code_profile_name,
+ min_size=min_size,
+ erasure_code_use_hacky_overwrites=erasure_code_use_hacky_overwrites)
+ return name
+
+ @contextlib.contextmanager
+ def pool(self, pool_name, pg_num=16, erasure_code_profile_name=None):
+ self.create_pool(pool_name, pg_num, erasure_code_profile_name)
+ yield
+ self.remove_pool(pool_name)
+
+ def create_pool(self, pool_name, pg_num=16,
+ erasure_code_profile_name=None,
+ min_size=None,
+ erasure_code_use_hacky_overwrites=False):
+ """
+ Create a pool named from the pool_name parameter.
+ :param pool_name: name of the pool being created.
+ :param pg_num: initial number of pgs.
+ :param erasure_code_profile_name: if set (not None), create an
+ erasure coded pool using this profile
+ :param erasure_code_use_hacky_overwrites: if true, use the hacky
+ overwrites mode
+ """
+ with self.lock:
+ assert isinstance(pool_name, basestring)
+ assert isinstance(pg_num, int)
+ assert pool_name not in self.pools
+ self.log("creating pool_name %s" % (pool_name,))
+ if erasure_code_profile_name:
+ self.raw_cluster_cmd('osd', 'pool', 'create',
+ pool_name, str(pg_num), str(pg_num),
+ 'erasure', erasure_code_profile_name)
+ else:
+ self.raw_cluster_cmd('osd', 'pool', 'create',
+ pool_name, str(pg_num))
+ if min_size is not None:
+ self.raw_cluster_cmd(
+ 'osd', 'pool', 'set', pool_name,
+ 'min_size',
+ str(min_size))
+ if erasure_code_use_hacky_overwrites:
+ self.raw_cluster_cmd(
+ 'osd', 'pool', 'set', pool_name,
+ 'debug_white_box_testing_ec_overwrites',
+ 'true')
+ self.pools[pool_name] = pg_num
+ time.sleep(1)
+
+ def add_pool_snap(self, pool_name, snap_name):
+ """
+ Add pool snapshot
+ :param pool_name: name of pool to snapshot
+ :param snap_name: name of snapshot to take
+ """
+ self.raw_cluster_cmd('osd', 'pool', 'mksnap',
+ str(pool_name), str(snap_name))
+
+ def remove_pool_snap(self, pool_name, snap_name):
+ """
+ Remove pool snapshot
+ :param pool_name: name of pool to snapshot
+ :param snap_name: name of snapshot to remove
+ """
+ self.raw_cluster_cmd('osd', 'pool', 'rmsnap',
+ str(pool_name), str(snap_name))
+
+ def remove_pool(self, pool_name):
+ """
+ Remove the indicated pool
+ :param pool_name: Pool to be removed
+ """
+ with self.lock:
+ assert isinstance(pool_name, basestring)
+ assert pool_name in self.pools
+ self.log("removing pool_name %s" % (pool_name,))
+ del self.pools[pool_name]
+ self.do_rados(self.controller,
+ ['rmpool', pool_name, pool_name,
+ "--yes-i-really-really-mean-it"])
+
+ def get_pool(self):
+ """
+ Pick a random pool
+ """
+ with self.lock:
+ return random.choice(self.pools.keys())
+
+ def get_pool_pg_num(self, pool_name):
+ """
+ Return the number of pgs in the pool specified.
+ """
+ with self.lock:
+ assert isinstance(pool_name, basestring)
+ if pool_name in self.pools:
+ return self.pools[pool_name]
+ return 0
+
+ def get_pool_property(self, pool_name, prop):
+ """
+ :param pool_name: pool
+ :param prop: property to be checked.
+ :returns: property as an int value.
+ """
+ with self.lock:
+ assert isinstance(pool_name, basestring)
+ assert isinstance(prop, basestring)
+ output = self.raw_cluster_cmd(
+ 'osd',
+ 'pool',
+ 'get',
+ pool_name,
+ prop)
+ return int(output.split()[1])
+
+ def set_pool_property(self, pool_name, prop, val):
+ """
+ :param pool_name: pool
+ :param prop: property to be set.
+ :param val: value to set.
+
+ This routine retries if set operation fails.
+ """
+ with self.lock:
+ assert isinstance(pool_name, basestring)
+ assert isinstance(prop, basestring)
+ assert isinstance(val, int)
+ tries = 0
+ while True:
+ r = self.raw_cluster_cmd_result(
+ 'osd',
+ 'pool',
+ 'set',
+ pool_name,
+ prop,
+ str(val))
+ if r != 11: # EAGAIN
+ break
+ tries += 1
+ if tries > 50:
+ raise Exception('timed out getting EAGAIN '
+ 'when setting pool property %s %s = %s' %
+ (pool_name, prop, val))
+ self.log('got EAGAIN setting pool property, '
+ 'waiting a few seconds...')
+ time.sleep(2)
+
+ def expand_pool(self, pool_name, by, max_pgs):
+ """
+ Increase the number of pgs in a pool
+ """
+ with self.lock:
+ assert isinstance(pool_name, basestring)
+ assert isinstance(by, int)
+ assert pool_name in self.pools
+ if self.get_num_creating() > 0:
+ return
+ if (self.pools[pool_name] + by) > max_pgs:
+ return
+ self.log("increase pool size by %d" % (by,))
+ new_pg_num = self.pools[pool_name] + by
+ self.set_pool_property(pool_name, "pg_num", new_pg_num)
+ self.pools[pool_name] = new_pg_num
+
+ def set_pool_pgpnum(self, pool_name):
+ """
+ Set pgpnum property of pool_name pool.
+ """
+ with self.lock:
+ assert isinstance(pool_name, basestring)
+ assert pool_name in self.pools
+ if self.get_num_creating() > 0:
+ return
+ self.set_pool_property(pool_name, 'pgp_num', self.pools[pool_name])
+
+ def list_pg_missing(self, pgid):
+ """
+ return list of missing pgs with the id specified
+ """
+ r = None
+ offset = {}
+ while True:
+ out = self.raw_cluster_cmd('--', 'pg', pgid, 'list_missing',
+ json.dumps(offset))
+ j = json.loads(out)
+ if r is None:
+ r = j
+ else:
+ r['objects'].extend(j['objects'])
+ if 'more' not in j:
+ break
+ if j['more'] == 0:
+ break
+ offset = j['objects'][-1]['oid']
+ if 'more' in r:
+ del r['more']
+ return r
+
+ def get_pg_stats(self):
+ """
+ Dump the cluster and get pg stats
+ """
+ out = self.raw_cluster_cmd('pg', 'dump', '--format=json')
+ j = json.loads('\n'.join(out.split('\n')[1:]))
+ return j['pg_stats']
+
+ def compile_pg_status(self):
+ """
+ Return a histogram of pg state values
+ """
+ ret = {}
+ j = self.get_pg_stats()
+ for pg in j:
+ for status in pg['state'].split('+'):
+ if status not in ret:
+ ret[status] = 0
+ ret[status] += 1
+ return ret
+
+ def pg_scrubbing(self, pool, pgnum):
+ """
+ pg scrubbing wrapper
+ """
+ pgstr = self.get_pgid(pool, pgnum)
+ stats = self.get_single_pg_stats(pgstr)
+ return 'scrub' in stats['state']
+
+ def pg_repairing(self, pool, pgnum):
+ """
+ pg repairing wrapper
+ """
+ pgstr = self.get_pgid(pool, pgnum)
+ stats = self.get_single_pg_stats(pgstr)
+ return 'repair' in stats['state']
+
+ def pg_inconsistent(self, pool, pgnum):
+ """
+ pg inconsistent wrapper
+ """
+ pgstr = self.get_pgid(pool, pgnum)
+ stats = self.get_single_pg_stats(pgstr)
+ return 'inconsistent' in stats['state']
+
+ def get_last_scrub_stamp(self, pool, pgnum):
+ """
+ Get the timestamp of the last scrub.
+ """
+ stats = self.get_single_pg_stats(self.get_pgid(pool, pgnum))
+ return stats["last_scrub_stamp"]
+
+ def do_pg_scrub(self, pool, pgnum, stype):
+ """
+ Scrub pg and wait for scrubbing to finish
+ """
+ init = self.get_last_scrub_stamp(pool, pgnum)
+ RESEND_TIMEOUT = 120 # Must be a multiple of SLEEP_TIME
+ FATAL_TIMEOUT = RESEND_TIMEOUT * 3
+ SLEEP_TIME = 10
+ timer = 0
+ while init == self.get_last_scrub_stamp(pool, pgnum):
+ assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype
+ self.log("waiting for scrub type %s" % (stype,))
+ if (timer % RESEND_TIMEOUT) == 0:
+ self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
+ # The first time in this loop is the actual request
+ if timer != 0 and stype == "repair":
+ self.log("WARNING: Resubmitted a non-idempotent repair")
+ time.sleep(SLEEP_TIME)
+ timer += SLEEP_TIME
+
+ def get_single_pg_stats(self, pgid):
+ """
+ Return pg for the pgid specified.
+ """
+ all_stats = self.get_pg_stats()
+
+ for pg in all_stats:
+ if pg['pgid'] == pgid:
+ return pg
+
+ return None
+
+ def get_object_pg_with_shard(self, pool, name, osdid):
+ """
+ """
+ pool_dump = self.get_pool_dump(pool)
+ object_map = self.get_object_map(pool, name)
+ if pool_dump["type"] == CephManager.ERASURE_CODED_POOL:
+ shard = object_map['acting'].index(osdid)
+ return "{pgid}s{shard}".format(pgid=object_map['pgid'],
+ shard=shard)
+ else:
+ return object_map['pgid']
+
+ def get_object_primary(self, pool, name):
+ """
+ """
+ object_map = self.get_object_map(pool, name)
+ return object_map['acting_primary']
+
+ def get_object_map(self, pool, name):
+ """
+ osd map --format=json converted to a python object
+ :returns: the python object
+ """
+ out = self.raw_cluster_cmd('--format=json', 'osd', 'map', pool, name)
+ return json.loads('\n'.join(out.split('\n')[1:]))
+
+ def get_osd_dump_json(self):
+ """
+ osd dump --format=json converted to a python object
+ :returns: the python object
+ """
+ out = self.raw_cluster_cmd('osd', 'dump', '--format=json')
+ return json.loads('\n'.join(out.split('\n')[1:]))
+
+ def get_osd_dump(self):
+ """
+ Dump osds
+ :returns: all osds
+ """
+ return self.get_osd_dump_json()['osds']
+
+ def get_stuck_pgs(self, type_, threshold):
+ """
+ :returns: stuck pg information from the cluster
+ """
+ out = self.raw_cluster_cmd('pg', 'dump_stuck', type_, str(threshold),
+ '--format=json')
+ return json.loads(out)
+
+ def get_num_unfound_objects(self):
+ """
+ Check cluster status to get the number of unfound objects
+ """
+ status = self.raw_cluster_status()
+ self.log(status)
+ return status['pgmap'].get('unfound_objects', 0)
+
+ def get_num_creating(self):
+ """
+ Find the number of pgs in creating mode.
+ """
+ pgs = self.get_pg_stats()
+ num = 0
+ for pg in pgs:
+ if 'creating' in pg['state']:
+ num += 1
+ return num
+
+ def get_num_active_clean(self):
+ """
+ Find the number of active and clean pgs.
+ """
+ pgs = self.get_pg_stats()
+ num = 0
+ for pg in pgs:
+ if (pg['state'].count('active') and
+ pg['state'].count('clean') and
+ not pg['state'].count('stale')):
+ num += 1
+ return num
+
+ def get_num_active_recovered(self):
+ """
+ Find the number of active and recovered pgs.
+ """
+ pgs = self.get_pg_stats()
+ num = 0
+ for pg in pgs:
+ if (pg['state'].count('active') and
+ not pg['state'].count('recover') and
+ not pg['state'].count('backfill') and
+ not pg['state'].count('stale')):
+ num += 1
+ return num
+
+ def get_is_making_recovery_progress(self):
+ """
+ Return whether there is recovery progress discernable in the
+ raw cluster status
+ """
+ status = self.raw_cluster_status()
+ kps = status['pgmap'].get('recovering_keys_per_sec', 0)
+ bps = status['pgmap'].get('recovering_bytes_per_sec', 0)
+ ops = status['pgmap'].get('recovering_objects_per_sec', 0)
+ return kps > 0 or bps > 0 or ops > 0
+
+ def get_num_active(self):
+ """
+ Find the number of active pgs.
+ """
+ pgs = self.get_pg_stats()
+ num = 0
+ for pg in pgs:
+ if pg['state'].count('active') and not pg['state'].count('stale'):
+ num += 1
+ return num
+
+ def get_num_down(self):
+ """
+ Find the number of pgs that are down.
+ """
+ pgs = self.get_pg_stats()
+ num = 0
+ for pg in pgs:
+ if ((pg['state'].count('down') and not
+ pg['state'].count('stale')) or
+ (pg['state'].count('incomplete') and not
+ pg['state'].count('stale'))):
+ num += 1
+ return num
+
+ def get_num_active_down(self):
+ """
+ Find the number of pgs that are either active or down.
+ """
+ pgs = self.get_pg_stats()
+ num = 0
+ for pg in pgs:
+ if ((pg['state'].count('active') and not
+ pg['state'].count('stale')) or
+ (pg['state'].count('down') and not
+ pg['state'].count('stale')) or
+ (pg['state'].count('incomplete') and not
+ pg['state'].count('stale'))):
+ num += 1
+ return num
+
+ def is_clean(self):
+ """
+ True if all pgs are clean
+ """
+ return self.get_num_active_clean() == self.get_num_pgs()
+
+ def is_recovered(self):
+ """
+ True if all pgs have recovered
+ """
+ return self.get_num_active_recovered() == self.get_num_pgs()
+
+ def is_active_or_down(self):
+ """
+ True if all pgs are active or down
+ """
+ return self.get_num_active_down() == self.get_num_pgs()
+
+ def wait_for_clean(self, timeout=None):
+ """
+ Wait until all pgs are clean; assert if the timeout expires first.
+ """
+ self.log("waiting for clean")
+ start = time.time()
+ num_active_clean = self.get_num_active_clean()
+ while not self.is_clean():
+ if timeout is not None:
+ if self.get_is_making_recovery_progress():
+ self.log("making progress, resetting timeout")
+ start = time.time()
+ else:
+ self.log("no progress seen, keeping timeout for now")
+ if time.time() - start >= timeout:
+ self.log('dumping pgs')
+ out = self.raw_cluster_cmd('pg', 'dump')
+ self.log(out)
+ assert time.time() - start < timeout, \
+ 'failed to become clean before timeout expired'
+ cur_active_clean = self.get_num_active_clean()
+ if cur_active_clean != num_active_clean:
+ start = time.time()
+ num_active_clean = cur_active_clean
+ time.sleep(3)
+ self.log("clean!")
+
+ def are_all_osds_up(self):
+ """
+ Returns true if all osds are up.
+ """
+ x = self.get_osd_dump()
+ return (len(x) == sum([(y['up'] > 0) for y in x]))
+
+ def wait_for_all_up(self, timeout=None):
+ """
+ When this exits, either the timeout has expired, or all
+ osds are up.
+ """
+ self.log("waiting for all up")
+ start = time.time()
+ while not self.are_all_osds_up():
+ if timeout is not None:
+ assert time.time() - start < timeout, \
+ 'timeout expired in wait_for_all_up'
+ time.sleep(3)
+ self.log("all up!")
+
+ def wait_for_recovery(self, timeout=None):
+ """
+ Check peering. When this exits, we have recovered.
+ """
+ self.log("waiting for recovery to complete")
+ start = time.time()
+ num_active_recovered = self.get_num_active_recovered()
+ while not self.is_recovered():
+ now = time.time()
+ if timeout is not None:
+ if self.get_is_making_recovery_progress():
+ self.log("making progress, resetting timeout")
+ start = time.time()
+ else:
+ self.log("no progress seen, keeping timeout for now")
+ if now - start >= timeout:
+ self.log('dumping pgs')
+ out = self.raw_cluster_cmd('pg', 'dump')
+ self.log(out)
+ assert now - start < timeout, \
+ 'failed to recover before timeout expired'
+ cur_active_recovered = self.get_num_active_recovered()
+ if cur_active_recovered != num_active_recovered:
+ start = time.time()
+ num_active_recovered = cur_active_recovered
+ time.sleep(3)
+ self.log("recovered!")
+
+ def wait_for_active(self, timeout=None):
+ """
+ Check peering. When this exits, we are definitely active
+ """
+ self.log("waiting for peering to complete")
+ start = time.time()
+ num_active = self.get_num_active()
+ while not self.is_active():
+ if timeout is not None:
+ if time.time() - start >= timeout:
+ self.log('dumping pgs')
+ out = self.raw_cluster_cmd('pg', 'dump')
+ self.log(out)
+ assert time.time() - start < timeout, \
+ 'failed to become active before timeout expired'
+ cur_active = self.get_num_active()
+ if cur_active != num_active:
+ start = time.time()
+ num_active = cur_active
+ time.sleep(3)
+ self.log("active!")
+
+ def wait_for_active_or_down(self, timeout=None):
+ """
+ Check peering. When this exits, we are definitely either
+ active or down
+ """
+ self.log("waiting for peering to complete or become blocked")
+ start = time.time()
+ num_active_down = self.get_num_active_down()
+ while not self.is_active_or_down():
+ if timeout is not None:
+ if time.time() - start >= timeout:
+ self.log('dumping pgs')
+ out = self.raw_cluster_cmd('pg', 'dump')
+ self.log(out)
+ assert time.time() - start < timeout, \
+ 'failed to become active or down before timeout expired'
+ cur_active_down = self.get_num_active_down()
+ if cur_active_down != num_active_down:
+ start = time.time()
+ num_active_down = cur_active_down
+ time.sleep(3)
+ self.log("active or down!")
+
+ def osd_is_up(self, osd):
+ """
+ Return True if the given osd is up according to the osd dump.
+ """
+ osds = self.get_osd_dump()
+ return osds[osd]['up'] > 0
+
+ def wait_till_osd_is_up(self, osd, timeout=None):
+ """
+ Loop waiting for osd.
+ """
+ self.log('waiting for osd.%d to be up' % osd)
+ start = time.time()
+ while not self.osd_is_up(osd):
+ if timeout is not None:
+ assert time.time() - start < timeout, \
+ 'osd.%d failed to come up before timeout expired' % osd
+ time.sleep(3)
+ self.log('osd.%d is up' % osd)
+
+ def is_active(self):
+ """
+ Wrapper to check if all pgs are active
+ """
+ return self.get_num_active() == self.get_num_pgs()
+
+ def wait_till_active(self, timeout=None):
+ """
+ Wait until all pgs are active.
+ """
+ self.log("waiting till active")
+ start = time.time()
+ while not self.is_active():
+ if timeout is not None:
+ if time.time() - start >= timeout:
+ self.log('dumping pgs')
+ out = self.raw_cluster_cmd('pg', 'dump')
+ self.log(out)
+ assert time.time() - start < timeout, \
+ 'failed to become active before timeout expired'
+ time.sleep(3)
+ self.log("active!")
+
+ def mark_out_osd(self, osd):
+ """
+ Wrapper to mark osd out.
+ """
+ self.raw_cluster_cmd('osd', 'out', str(osd))
+
+ def kill_osd(self, osd):
+ """
+ Kill osds by either power cycling (if indicated by the config)
+ or by stopping.
+ """
+ if self.config.get('powercycle'):
+ remote = self.find_remote('osd', osd)
+ self.log('kill_osd on osd.{o} '
+ 'doing powercycle of {s}'.format(o=osd, s=remote.name))
+ self._assert_ipmi(remote)
+ remote.console.power_off()
+ elif self.config.get('bdev_inject_crash') and self.config.get('bdev_inject_crash_probability'):
+ if random.uniform(0, 1) < self.config.get('bdev_inject_crash_probability', .5):
+ self.raw_cluster_cmd(
+ '--', 'tell', 'osd.%d' % osd,
+ 'injectargs',
+ '--bdev-inject-crash %d' % self.config.get('bdev_inject_crash'),
+ )
+ try:
+ self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait()
+ except:
+ pass
+ else:
+ raise RuntimeError('osd.%s did not fail' % osd)
+ else:
+ self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
+ else:
+ self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
+
+ @staticmethod
+ def _assert_ipmi(remote):
+ assert remote.console.has_ipmi_credentials, (
+ "powercycling requested but RemoteConsole is not "
+ "initialized. Check ipmi config.")
+
+ def blackhole_kill_osd(self, osd):
+ """
+ Stop osd if nothing else works.
+ """
+ self.raw_cluster_cmd('--', 'tell', 'osd.%d' % osd,
+ 'injectargs',
+ '--objectstore-blackhole')
+ time.sleep(2)
+ self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
+
+ def revive_osd(self, osd, timeout=150, skip_admin_check=False):
+ """
+ Revive osds by either power cycling (if indicated by the config)
+ or by restarting.
+ """
+ if self.config.get('powercycle'):
+ remote = self.find_remote('osd', osd)
+ self.log('revive_osd on osd.{o} doing powercycle of {s}'.
+ format(o=osd, s=remote.name))
+ self._assert_ipmi(remote)
+ remote.console.power_on()
+ if not remote.console.check_status(300):
+ raise Exception('Failed to revive osd.{o} via ipmi'.
+ format(o=osd))
+ teuthology.reconnect(self.ctx, 60, [remote])
+ mount_osd_data(self.ctx, remote, self.cluster, str(osd))
+ self.make_admin_daemon_dir(remote)
+ self.ctx.daemons.get_daemon('osd', osd, self.cluster).reset()
+ self.ctx.daemons.get_daemon('osd', osd, self.cluster).restart()
+
+ if not skip_admin_check:
+ # wait for dump_ops_in_flight; this command doesn't appear
+ # until after the signal handler is installed and it is safe
+ # to stop the osd again without making valgrind leak checks
+ # unhappy. see #5924.
+ self.wait_run_admin_socket('osd', osd,
+ args=['dump_ops_in_flight'],
+ timeout=timeout, stdout=DEVNULL)
+
+ def mark_down_osd(self, osd):
+ """
+ Cluster command wrapper
+ """
+ self.raw_cluster_cmd('osd', 'down', str(osd))
+
+ def mark_in_osd(self, osd):
+ """
+ Cluster command wrapper
+ """
+ self.raw_cluster_cmd('osd', 'in', str(osd))
+
+ def signal_osd(self, osd, sig, silent=False):
+ """
+ Wrapper to local get_daemon call which sends the given
+ signal to the given osd.
+ """
+ self.ctx.daemons.get_daemon('osd', osd,
+ self.cluster).signal(sig, silent=silent)
+
+ ## monitors
+ def signal_mon(self, mon, sig, silent=False):
+ """
+ Wrapper to local get_daemon call which sends the given
+ signal to the given mon.
+ """
+ self.ctx.daemons.get_daemon('mon', mon,
+ self.cluster).signal(sig, silent=silent)
+
+ def kill_mon(self, mon):
+ """
+ Kill the monitor by either power cycling (if the config says so),
+ or by doing a stop.
+ """
+ if self.config.get('powercycle'):
+ remote = self.find_remote('mon', mon)
+ self.log('kill_mon on mon.{m} doing powercycle of {s}'.
+ format(m=mon, s=remote.name))
+ self._assert_ipmi(remote)
+ remote.console.power_off()
+ else:
+ self.ctx.daemons.get_daemon('mon', mon, self.cluster).stop()
+
+ def revive_mon(self, mon):
+ """
+ Restart by either power cycling (if the config says so),
+ or by doing a normal restart.
+ """
+ if self.config.get('powercycle'):
+ remote = self.find_remote('mon', mon)
+ self.log('revive_mon on mon.{m} doing powercycle of {s}'.
+ format(m=mon, s=remote.name))
+ self._assert_ipmi(remote)
+ remote.console.power_on()
+ self.make_admin_daemon_dir(remote)
+ self.ctx.daemons.get_daemon('mon', mon, self.cluster).restart()
+
+ def get_mon_status(self, mon):
+ """
+ Extract all the monitor status information from the cluster
+ """
+ addr = self.ctx.ceph[self.cluster].conf['mon.%s' % mon]['mon addr']
+ out = self.raw_cluster_cmd('-m', addr, 'mon_status')
+ return json.loads(out)
+
+ def get_mon_quorum(self):
+ """
+ Extract monitor quorum information from the cluster
+ """
+ out = self.raw_cluster_cmd('quorum_status')
+ j = json.loads(out)
+ self.log('quorum_status is %s' % out)
+ return j['quorum']
+
+ def wait_for_mon_quorum_size(self, size, timeout=300):
+ """
+ Loop until quorum size is reached.
+ """
+ self.log('waiting for quorum size %d' % size)
+ start = time.time()
+ while not len(self.get_mon_quorum()) == size:
+ if timeout is not None:
+ assert time.time() - start < timeout, \
+ ('failed to reach quorum size %d '
+ 'before timeout expired' % size)
+ time.sleep(3)
+ self.log("quorum is size %d" % size)
+
+ def get_mon_health(self, debug=False):
+ """
+ Extract all the monitor health information.
+ """
+ out = self.raw_cluster_cmd('health', '--format=json')
+ if debug:
+ self.log('health:\n{h}'.format(h=out))
+ return json.loads(out)
+
+ def get_mds_status(self, mds):
+ """
+ Run cluster commands for the mds in order to get mds information
+ """
+ out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
+ j = json.loads(' '.join(out.splitlines()[1:]))
+ # collate; for dup ids, larger gid wins.
+ for info in j['info'].itervalues():
+ if info['name'] == mds:
+ return info
+ return None
+
+ def get_filepath(self):
+ """
+ Return path to osd data with {id} needing to be replaced
+ """
+ return '/var/lib/ceph/osd/' + self.cluster + '-{id}'
+
+ def make_admin_daemon_dir(self, remote):
+ """
+ Create /var/run/ceph directory on remote site.
+
+ :param ctx: Context
+ :param remote: Remote site
+ """
+ remote.run(args=['sudo',
+ 'install', '-d', '-m0777', '--', '/var/run/ceph', ], )
+
+
+def utility_task(name):
+ """
+ Generate ceph_manager subtask corresponding to ceph_manager
+ method name
+ """
+ def task(ctx, config):
+ if config is None:
+ config = {}
+ args = config.get('args', [])
+ kwargs = config.get('kwargs', {})
+ cluster = config.get('cluster', 'ceph')
+ fn = getattr(ctx.managers[cluster], name)
+ fn(*args, **kwargs)
+ return task
+
+revive_osd = utility_task("revive_osd")
+revive_mon = utility_task("revive_mon")
+kill_osd = utility_task("kill_osd")
+kill_mon = utility_task("kill_mon")
+create_pool = utility_task("create_pool")
+remove_pool = utility_task("remove_pool")
+wait_for_clean = utility_task("wait_for_clean")
+set_pool_property = utility_task("set_pool_property")
+do_pg_scrub = utility_task("do_pg_scrub")
--- /dev/null
+"""
+ceph_objectstore_tool - Simple test of ceph-objectstore-tool utility
+"""
+from cStringIO import StringIO
+import contextlib
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+import time
+import os
+import string
+from teuthology.orchestra import run
+import sys
+import tempfile
+import json
+from util.rados import (rados, create_replicated_pool, create_ec_pool)
+# from util.rados import (rados, create_ec_pool,
+# create_replicated_pool,
+# create_cache_pool)
+
+log = logging.getLogger(__name__)
+
+# Should get cluster name "ceph" from somewhere
+# and normal path from osd_data and osd_journal in conf
+FSPATH = "/var/lib/ceph/osd/ceph-{id}"
+JPATH = "/var/lib/ceph/osd/ceph-{id}/journal"
+
+
+def cod_setup_local_data(log, ctx, NUM_OBJECTS, DATADIR,
+ BASE_NAME, DATALINECOUNT):
+ objects = range(1, NUM_OBJECTS + 1)
+ for i in objects:
+ NAME = BASE_NAME + "{num}".format(num=i)
+ LOCALNAME = os.path.join(DATADIR, NAME)
+
+ dataline = range(DATALINECOUNT)
+ fd = open(LOCALNAME, "w")
+ data = "This is the data for " + NAME + "\n"
+ for _ in dataline:
+ fd.write(data)
+ fd.close()
+
+
+def cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR,
+ BASE_NAME, DATALINECOUNT):
+
+ objects = range(1, NUM_OBJECTS + 1)
+ for i in objects:
+ NAME = BASE_NAME + "{num}".format(num=i)
+ DDNAME = os.path.join(DATADIR, NAME)
+
+ remote.run(args=['rm', '-f', DDNAME])
+
+ dataline = range(DATALINECOUNT)
+ data = "This is the data for " + NAME + "\n"
+ DATA = ""
+ for _ in dataline:
+ DATA += data
+ teuthology.write_file(remote, DDNAME, DATA)
+
+
+def cod_setup(log, ctx, remote, NUM_OBJECTS, DATADIR,
+ BASE_NAME, DATALINECOUNT, POOL, db, ec):
+ ERRORS = 0
+ log.info("Creating {objs} objects in pool".format(objs=NUM_OBJECTS))
+
+ objects = range(1, NUM_OBJECTS + 1)
+ for i in objects:
+ NAME = BASE_NAME + "{num}".format(num=i)
+ DDNAME = os.path.join(DATADIR, NAME)
+
+ proc = rados(ctx, remote, ['-p', POOL, 'put', NAME, DDNAME],
+ wait=False)
+ # proc = remote.run(args=['rados', '-p', POOL, 'put', NAME, DDNAME])
+ ret = proc.wait()
+ if ret != 0:
+ log.critical("Rados put failed with status {ret}".
+ format(ret=proc.exitstatus))
+ sys.exit(1)
+
+ db[NAME] = {}
+
+ keys = range(i)
+ db[NAME]["xattr"] = {}
+ for k in keys:
+ if k == 0:
+ continue
+ mykey = "key{i}-{k}".format(i=i, k=k)
+ myval = "val{i}-{k}".format(i=i, k=k)
+ proc = remote.run(args=['rados', '-p', POOL, 'setxattr',
+ NAME, mykey, myval])
+ ret = proc.wait()
+ if ret != 0:
+ log.error("setxattr failed with {ret}".format(ret=ret))
+ ERRORS += 1
+ db[NAME]["xattr"][mykey] = myval
+
+ # Erasure coded pools don't support omap
+ if ec:
+ continue
+
+ # Create omap header in all objects but REPobject1
+ if i != 1:
+ myhdr = "hdr{i}".format(i=i)
+ proc = remote.run(args=['rados', '-p', POOL, 'setomapheader',
+ NAME, myhdr])
+ ret = proc.wait()
+ if ret != 0:
+ log.critical("setomapheader failed with {ret}".format(ret=ret))
+ ERRORS += 1
+ db[NAME]["omapheader"] = myhdr
+
+ db[NAME]["omap"] = {}
+ for k in keys:
+ if k == 0:
+ continue
+ mykey = "okey{i}-{k}".format(i=i, k=k)
+ myval = "oval{i}-{k}".format(i=i, k=k)
+ proc = remote.run(args=['rados', '-p', POOL, 'setomapval',
+ NAME, mykey, myval])
+ ret = proc.wait()
+ if ret != 0:
+ log.critical("setomapval failed with {ret}".format(ret=ret))
+ db[NAME]["omap"][mykey] = myval
+
+ return ERRORS
+
+
+def get_lines(filename):
+ tmpfd = open(filename, "r")
+ line = True
+ lines = []
+ while line:
+ line = tmpfd.readline().rstrip('\n')
+ if line:
+ lines += [line]
+ tmpfd.close()
+ os.unlink(filename)
+ return lines
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run ceph_objectstore_tool test
+
+ The config should be as follows::
+
+ ceph_objectstore_tool:
+ objects: 20 # <number of objects>
+ pgnum: 12
+ """
+
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'ceph_objectstore_tool task only accepts a dict for configuration'
+
+ log.info('Beginning ceph_objectstore_tool...')
+
+ log.debug(config)
+ log.debug(ctx)
+ clients = ctx.cluster.only(teuthology.is_type('client'))
+ assert len(clients.remotes) > 0, 'Must specify at least 1 client'
+ (cli_remote, _) = clients.remotes.popitem()
+ log.debug(cli_remote)
+
+ # clients = dict(teuthology.get_clients(ctx=ctx, roles=config.keys()))
+ # client = clients.popitem()
+ # log.info(client)
+ osds = ctx.cluster.only(teuthology.is_type('osd'))
+ log.info("OSDS")
+ log.info(osds)
+ log.info(osds.remotes)
+
+ manager = ctx.managers['ceph']
+ while (len(manager.get_osd_status()['up']) !=
+ len(manager.get_osd_status()['raw'])):
+ time.sleep(10)
+ while (len(manager.get_osd_status()['in']) !=
+ len(manager.get_osd_status()['up'])):
+ time.sleep(10)
+ manager.raw_cluster_cmd('osd', 'set', 'noout')
+ manager.raw_cluster_cmd('osd', 'set', 'nodown')
+
+ PGNUM = config.get('pgnum', 12)
+ log.info("pgnum: {num}".format(num=PGNUM))
+
+ ERRORS = 0
+
+ REP_POOL = "rep_pool"
+ REP_NAME = "REPobject"
+ create_replicated_pool(cli_remote, REP_POOL, PGNUM)
+ ERRORS += test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME)
+
+ EC_POOL = "ec_pool"
+ EC_NAME = "ECobject"
+ create_ec_pool(cli_remote, EC_POOL, 'default', PGNUM)
+ ERRORS += test_objectstore(ctx, config, cli_remote,
+ EC_POOL, EC_NAME, ec=True)
+
+ if ERRORS == 0:
+ log.info("TEST PASSED")
+ else:
+ log.error("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS))
+
+ assert ERRORS == 0
+
+ try:
+ yield
+ finally:
+ log.info('Ending ceph_objectstore_tool')
+
+
+def test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME, ec=False):
+ manager = ctx.managers['ceph']
+
+ osds = ctx.cluster.only(teuthology.is_type('osd'))
+
+ TEUTHDIR = teuthology.get_testdir(ctx)
+ DATADIR = os.path.join(TEUTHDIR, "ceph.data")
+ DATALINECOUNT = 10000
+ ERRORS = 0
+ NUM_OBJECTS = config.get('objects', 10)
+ log.info("objects: {num}".format(num=NUM_OBJECTS))
+
+ pool_dump = manager.get_pool_dump(REP_POOL)
+ REPID = pool_dump['pool']
+
+ log.debug("repid={num}".format(num=REPID))
+
+ db = {}
+
+ LOCALDIR = tempfile.mkdtemp("cod")
+
+ cod_setup_local_data(log, ctx, NUM_OBJECTS, LOCALDIR,
+ REP_NAME, DATALINECOUNT)
+ allremote = []
+ allremote.append(cli_remote)
+ allremote += osds.remotes.keys()
+ allremote = list(set(allremote))
+ for remote in allremote:
+ cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR,
+ REP_NAME, DATALINECOUNT)
+
+ ERRORS += cod_setup(log, ctx, cli_remote, NUM_OBJECTS, DATADIR,
+ REP_NAME, DATALINECOUNT, REP_POOL, db, ec)
+
+ pgs = {}
+ for stats in manager.get_pg_stats():
+ if stats["pgid"].find(str(REPID) + ".") != 0:
+ continue
+ if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL:
+ for osd in stats["acting"]:
+ pgs.setdefault(osd, []).append(stats["pgid"])
+ elif pool_dump["type"] == ceph_manager.CephManager.ERASURE_CODED_POOL:
+ shard = 0
+ for osd in stats["acting"]:
+ pgs.setdefault(osd, []).append("{pgid}s{shard}".
+ format(pgid=stats["pgid"],
+ shard=shard))
+ shard += 1
+ else:
+ raise Exception("{pool} has an unexpected type {type}".
+ format(pool=REP_POOL, type=pool_dump["type"]))
+
+ log.info(pgs)
+ log.info(db)
+
+ for osd in manager.get_osd_status()['up']:
+ manager.kill_osd(osd)
+ time.sleep(5)
+
+ pgswithobjects = set()
+ objsinpg = {}
+
+ # Test --op list and generate json for all objects
+ log.info("Test --op list by generating json for all objects")
+ prefix = ("sudo ceph-objectstore-tool "
+ "--data-path {fpath} "
+ "--journal-path {jpath} ").format(fpath=FSPATH, jpath=JPATH)
+ for remote in osds.remotes.iterkeys():
+ log.debug(remote)
+ log.debug(osds.remotes[remote])
+ for role in osds.remotes[remote]:
+ if string.find(role, "osd.") != 0:
+ continue
+ osdid = int(role.split('.')[1])
+ log.info("process osd.{id} on {remote}".
+ format(id=osdid, remote=remote))
+ cmd = (prefix + "--op list").format(id=osdid)
+ proc = remote.run(args=cmd.split(), check_status=False,
+ stdout=StringIO())
+ if proc.exitstatus != 0:
+ log.error("Bad exit status {ret} from --op list request".
+ format(ret=proc.exitstatus))
+ ERRORS += 1
+ else:
+ for pgline in proc.stdout.getvalue().splitlines():
+ if not pgline:
+ continue
+ (pg, obj) = json.loads(pgline)
+ name = obj['oid']
+ if name in db:
+ pgswithobjects.add(pg)
+ objsinpg.setdefault(pg, []).append(name)
+ db[name].setdefault("pg2json",
+ {})[pg] = json.dumps(obj)
+
+ log.info(db)
+ log.info(pgswithobjects)
+ log.info(objsinpg)
+
+ if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL:
+ # Test get-bytes
+ log.info("Test get-bytes and set-bytes")
+ for basename in db.keys():
+ file = os.path.join(DATADIR, basename)
+ GETNAME = os.path.join(DATADIR, "get")
+ SETNAME = os.path.join(DATADIR, "set")
+
+ for remote in osds.remotes.iterkeys():
+ for role in osds.remotes[remote]:
+ if string.find(role, "osd.") != 0:
+ continue
+ osdid = int(role.split('.')[1])
+ if osdid not in pgs:
+ continue
+
+ for pg, JSON in db[basename]["pg2json"].iteritems():
+ if pg in pgs[osdid]:
+ cmd = ((prefix + "--pgid {pg}").
+ format(id=osdid, pg=pg).split())
+ cmd.append(run.Raw("'{json}'".format(json=JSON)))
+ cmd += ("get-bytes {fname}".
+ format(fname=GETNAME).split())
+ proc = remote.run(args=cmd, check_status=False)
+ if proc.exitstatus != 0:
+ remote.run(args="rm -f {getfile}".
+ format(getfile=GETNAME).split())
+ log.error("Bad exit status {ret}".
+ format(ret=proc.exitstatus))
+ ERRORS += 1
+ continue
+ cmd = ("diff -q {file} {getfile}".
+ format(file=file, getfile=GETNAME))
+ proc = remote.run(args=cmd.split())
+ if proc.exitstatus != 0:
+ log.error("Data from get-bytes differ")
+ # log.debug("Got:")
+ # cat_file(logging.DEBUG, GETNAME)
+ # log.debug("Expected:")
+ # cat_file(logging.DEBUG, file)
+ ERRORS += 1
+ remote.run(args="rm -f {getfile}".
+ format(getfile=GETNAME).split())
+
+ data = ("put-bytes going into {file}\n".
+ format(file=file))
+ teuthology.write_file(remote, SETNAME, data)
+ cmd = ((prefix + "--pgid {pg}").
+ format(id=osdid, pg=pg).split())
+ cmd.append(run.Raw("'{json}'".format(json=JSON)))
+ cmd += ("set-bytes {fname}".
+ format(fname=SETNAME).split())
+ proc = remote.run(args=cmd, check_status=False)
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.info("set-bytes failed for object {obj} "
+ "in pg {pg} osd.{id} ret={ret}".
+ format(obj=basename, pg=pg,
+ id=osdid, ret=proc.exitstatus))
+ ERRORS += 1
+
+ cmd = ((prefix + "--pgid {pg}").
+ format(id=osdid, pg=pg).split())
+ cmd.append(run.Raw("'{json}'".format(json=JSON)))
+ cmd += "get-bytes -".split()
+ proc = remote.run(args=cmd, check_status=False,
+ stdout=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("get-bytes after "
+ "set-bytes ret={ret}".
+ format(ret=proc.exitstatus))
+ ERRORS += 1
+ else:
+ if data != proc.stdout.getvalue():
+ log.error("Data inconsistent after "
+ "set-bytes, got:")
+ log.error(proc.stdout.getvalue())
+ ERRORS += 1
+
+ cmd = ((prefix + "--pgid {pg}").
+ format(id=osdid, pg=pg).split())
+ cmd.append(run.Raw("'{json}'".format(json=JSON)))
+ cmd += ("set-bytes {fname}".
+ format(fname=file).split())
+ proc = remote.run(args=cmd, check_status=False)
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.info("set-bytes failed for object {obj} "
+ "in pg {pg} osd.{id} ret={ret}".
+ format(obj=basename, pg=pg,
+ id=osdid, ret=proc.exitstatus))
+ ERRORS += 1
+
+ log.info("Test list-attrs get-attr")
+ for basename in db.keys():
+ file = os.path.join(DATADIR, basename)
+ GETNAME = os.path.join(DATADIR, "get")
+ SETNAME = os.path.join(DATADIR, "set")
+
+ for remote in osds.remotes.iterkeys():
+ for role in osds.remotes[remote]:
+ if string.find(role, "osd.") != 0:
+ continue
+ osdid = int(role.split('.')[1])
+ if osdid not in pgs:
+ continue
+
+ for pg, JSON in db[basename]["pg2json"].iteritems():
+ if pg in pgs[osdid]:
+ cmd = ((prefix + "--pgid {pg}").
+ format(id=osdid, pg=pg).split())
+ cmd.append(run.Raw("'{json}'".format(json=JSON)))
+ cmd += ["list-attrs"]
+ proc = remote.run(args=cmd, check_status=False,
+ stdout=StringIO(), stderr=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("Bad exit status {ret}".
+ format(ret=proc.exitstatus))
+ ERRORS += 1
+ continue
+ keys = proc.stdout.getvalue().split()
+ values = dict(db[basename]["xattr"])
+
+ for key in keys:
+ if (key == "_" or
+ key == "snapset" or
+ key == "hinfo_key"):
+ continue
+ key = key.strip("_")
+ if key not in values:
+ log.error("The key {key} should be present".
+ format(key=key))
+ ERRORS += 1
+ continue
+ exp = values.pop(key)
+ cmd = ((prefix + "--pgid {pg}").
+ format(id=osdid, pg=pg).split())
+ cmd.append(run.Raw("'{json}'".format(json=JSON)))
+ cmd += ("get-attr {key}".
+ format(key="_" + key).split())
+ proc = remote.run(args=cmd, check_status=False,
+ stdout=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("get-attr failed with {ret}".
+ format(ret=proc.exitstatus))
+ ERRORS += 1
+ continue
+ val = proc.stdout.getvalue()
+ if exp != val:
+ log.error("For key {key} got value {got} "
+ "instead of {expected}".
+ format(key=key, got=val,
+ expected=exp))
+ ERRORS += 1
+ if "hinfo_key" in keys:
+ cmd_prefix = prefix.format(id=osdid)
+ cmd = """
+ expected=$({prefix} --pgid {pg} '{json}' get-attr {key} | base64)
+ echo placeholder | {prefix} --pgid {pg} '{json}' set-attr {key} -
+ test $({prefix} --pgid {pg} '{json}' get-attr {key}) = placeholder
+ echo $expected | base64 --decode | \
+ {prefix} --pgid {pg} '{json}' set-attr {key} -
+ test $({prefix} --pgid {pg} '{json}' get-attr {key} | base64) = $expected
+ """.format(prefix=cmd_prefix, pg=pg, json=JSON,
+ key="hinfo_key")
+ log.debug(cmd)
+ proc = remote.run(args=['bash', '-e', '-x',
+ '-c', cmd],
+ check_status=False,
+ stdout=StringIO(),
+ stderr=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("failed with " +
+ str(proc.exitstatus))
+ log.error(proc.stdout.getvalue() + " " +
+ proc.stderr.getvalue())
+ ERRORS += 1
+
+ if len(values) != 0:
+ log.error("Not all keys found, remaining keys:")
+ log.error(values)
+
+ log.info("Test pg info")
+ for remote in osds.remotes.iterkeys():
+ for role in osds.remotes[remote]:
+ if string.find(role, "osd.") != 0:
+ continue
+ osdid = int(role.split('.')[1])
+ if osdid not in pgs:
+ continue
+
+ for pg in pgs[osdid]:
+ cmd = ((prefix + "--op info --pgid {pg}").
+ format(id=osdid, pg=pg).split())
+ proc = remote.run(args=cmd, check_status=False,
+ stdout=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("Failure of --op info command with {ret}".
+ format(ret=proc.exitstatus))
+ ERRORS += 1
+ continue
+ info = proc.stdout.getvalue()
+ if str(pg) not in info:
+ log.error("Bad data from info: {info}".format(info=info))
+ ERRORS += 1
+
+ log.info("Test pg logging")
+ for remote in osds.remotes.iterkeys():
+ for role in osds.remotes[remote]:
+ if string.find(role, "osd.") != 0:
+ continue
+ osdid = int(role.split('.')[1])
+ if osdid not in pgs:
+ continue
+
+ for pg in pgs[osdid]:
+ cmd = ((prefix + "--op log --pgid {pg}").
+ format(id=osdid, pg=pg).split())
+ proc = remote.run(args=cmd, check_status=False,
+ stdout=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("Getting log failed for pg {pg} "
+ "from osd.{id} with {ret}".
+ format(pg=pg, id=osdid, ret=proc.exitstatus))
+ ERRORS += 1
+ continue
+ HASOBJ = pg in pgswithobjects
+ MODOBJ = "modify" in proc.stdout.getvalue()
+ if HASOBJ != MODOBJ:
+ log.error("Bad log for pg {pg} from osd.{id}".
+ format(pg=pg, id=osdid))
+ MSG = (HASOBJ and [""] or ["NOT "])[0]
+ log.error("Log should {msg}have a modify entry".
+ format(msg=MSG))
+ ERRORS += 1
+
+ log.info("Test pg export")
+ EXP_ERRORS = 0
+ for remote in osds.remotes.iterkeys():
+ for role in osds.remotes[remote]:
+ if string.find(role, "osd.") != 0:
+ continue
+ osdid = int(role.split('.')[1])
+ if osdid not in pgs:
+ continue
+
+ for pg in pgs[osdid]:
+ fpath = os.path.join(DATADIR, "osd{id}.{pg}".
+ format(id=osdid, pg=pg))
+
+ cmd = ((prefix + "--op export --pgid {pg} --file {file}").
+ format(id=osdid, pg=pg, file=fpath))
+ proc = remote.run(args=cmd, check_status=False,
+ stdout=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("Exporting failed for pg {pg} "
+ "on osd.{id} with {ret}".
+ format(pg=pg, id=osdid, ret=proc.exitstatus))
+ EXP_ERRORS += 1
+
+ ERRORS += EXP_ERRORS
+
+ log.info("Test pg removal")
+ RM_ERRORS = 0
+ for remote in osds.remotes.iterkeys():
+ for role in osds.remotes[remote]:
+ if string.find(role, "osd.") != 0:
+ continue
+ osdid = int(role.split('.')[1])
+ if osdid not in pgs:
+ continue
+
+ for pg in pgs[osdid]:
+ cmd = ((prefix + "--op remove --pgid {pg}").
+ format(pg=pg, id=osdid))
+ proc = remote.run(args=cmd, check_status=False,
+ stdout=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("Removing failed for pg {pg} "
+ "on osd.{id} with {ret}".
+ format(pg=pg, id=osdid, ret=proc.exitstatus))
+ RM_ERRORS += 1
+
+ ERRORS += RM_ERRORS
+
+ IMP_ERRORS = 0
+ if EXP_ERRORS == 0 and RM_ERRORS == 0:
+ log.info("Test pg import")
+
+ for remote in osds.remotes.iterkeys():
+ for role in osds.remotes[remote]:
+ if string.find(role, "osd.") != 0:
+ continue
+ osdid = int(role.split('.')[1])
+ if osdid not in pgs:
+ continue
+
+ for pg in pgs[osdid]:
+ fpath = os.path.join(DATADIR, "osd{id}.{pg}".
+ format(id=osdid, pg=pg))
+
+ cmd = ((prefix + "--op import --file {file}").
+ format(id=osdid, file=fpath))
+ proc = remote.run(args=cmd, check_status=False,
+ stdout=StringIO())
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("Import failed from {file} with {ret}".
+ format(file=fpath, ret=proc.exitstatus))
+ IMP_ERRORS += 1
+ else:
+ log.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES")
+
+ ERRORS += IMP_ERRORS
+
+ if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
+ log.info("Restarting OSDs....")
+ # They still appear to be up because the nodown flag is set
+ for osd in manager.get_osd_status()['up']:
+ manager.revive_osd(osd)
+ # Wait for health?
+ time.sleep(5)
+ # Let the scrub that runs after the test verify consistency of all copies
+ log.info("Verify replicated import data")
+ objects = range(1, NUM_OBJECTS + 1)
+ for i in objects:
+ NAME = REP_NAME + "{num}".format(num=i)
+ TESTNAME = os.path.join(DATADIR, "gettest")
+ REFNAME = os.path.join(DATADIR, NAME)
+
+ proc = rados(ctx, cli_remote,
+ ['-p', REP_POOL, 'get', NAME, TESTNAME], wait=False)
+
+ ret = proc.wait()
+ if ret != 0:
+ log.error("After import, rados get failed with {ret}".
+ format(ret=proc.exitstatus))
+ ERRORS += 1
+ continue
+
+ cmd = "diff -q {gettest} {ref}".format(gettest=TESTNAME,
+ ref=REFNAME)
+ proc = cli_remote.run(args=cmd, check_status=False)
+ proc.wait()
+ if proc.exitstatus != 0:
+ log.error("Data comparison failed for {obj}".format(obj=NAME))
+ ERRORS += 1
+
+ return ERRORS
--- /dev/null
+
+import unittest
+import time
+import logging
+
+from teuthology.orchestra.run import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+
+class CephTestCase(unittest.TestCase):
+ """
+ For test tasks that want to define a structured set of
+ tests implemented in Python. Subclass this with appropriate
+ helpers for the subsystem you're testing.
+ """
+
+ # Environment references
+ mounts = None
+ fs = None
+ ceph_cluster = None
+ mds_cluster = None
+ mgr_cluster = None
+ ctx = None
+
+ mon_manager = None
+
+ def setUp(self):
+ self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+ "Starting test {0}".format(self.id()))
+
+ def tearDown(self):
+ self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+ "Ended test {0}".format(self.id()))
+
+ def assert_cluster_log(self, expected_pattern, invert_match=False, timeout=10):
+ """
+ Context manager. Assert that during execution, or within a grace period
+ afterwards (5 seconds plus the timeout), the Ceph cluster log emits a
+ message matching the expected pattern.
+
+ :param expected_pattern: a string that you expect to see in the log output
+ """
+
+ ceph_manager = self.ceph_cluster.mon_manager
+
+ class ContextManager(object):
+ def match(self):
+ found = expected_pattern in self.watcher_process.stdout.getvalue()
+ if invert_match:
+ return not found
+
+ return found
+
+ def __enter__(self):
+ self.watcher_process = ceph_manager.run_ceph_w()
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ if not self.watcher_process.finished:
+ # Check if we got an early match, wait a bit if we didn't
+ if self.match():
+ return
+ else:
+ log.debug("No log hits yet, waiting...")
+ # Default monc tick interval is 10s, so wait that long and
+ # then some grace
+ time.sleep(5 + timeout)
+
+ self.watcher_process.stdin.close()
+ try:
+ self.watcher_process.wait()
+ except CommandFailedError:
+ pass
+
+ if not self.match():
+ log.error("Log output: \n{0}\n".format(self.watcher_process.stdout.getvalue()))
+ raise AssertionError("Expected log message not found: '{0}'".format(expected_pattern))
+
+ return ContextManager()
+
+ def wait_for_health(self, pattern, timeout):
+ """
+ Wait until 'ceph health' contains messages matching the pattern
+ """
+ def seen_health_warning():
+ health = self.ceph_cluster.mon_manager.get_mon_health()
+ summary_strings = [s['summary'] for s in health['summary']]
+ if len(summary_strings) == 0:
+ log.debug("Not expected number of summary strings ({0})".format(summary_strings))
+ return False
+ else:
+ for ss in summary_strings:
+ if pattern in ss:
+ return True
+
+ log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
+ return False
+
+ self.wait_until_true(seen_health_warning, timeout)
+
+ def wait_for_health_clear(self, timeout):
+ """
+ Wait until `ceph health` returns no messages
+ """
+ def is_clear():
+ health = self.ceph_cluster.mon_manager.get_mon_health()
+ return len(health['summary']) == 0
+
+ self.wait_until_true(is_clear, timeout)
+
+ def wait_until_equal(self, get_fn, expect_val, timeout, reject_fn=None):
+ period = 5
+ elapsed = 0
+ while True:
+ val = get_fn()
+ if val == expect_val:
+ return
+ elif reject_fn and reject_fn(val):
+ raise RuntimeError("wait_until_equal: forbidden value {0} seen".format(val))
+ else:
+ if elapsed >= timeout:
+ raise RuntimeError("Timed out after {0} seconds waiting for {1} (currently {2})".format(
+ elapsed, expect_val, val
+ ))
+ else:
+ log.debug("wait_until_equal: {0} != {1}, waiting...".format(val, expect_val))
+ time.sleep(period)
+ elapsed += period
+
+ log.debug("wait_until_equal: success")
+
+ def wait_until_true(self, condition, timeout):
+ period = 5
+ elapsed = 0
+ while True:
+ if condition():
+ log.debug("wait_until_true: success in {0}s".format(elapsed))
+ return
+ else:
+ if elapsed >= timeout:
+ raise RuntimeError("Timed out after {0}s".format(elapsed))
+ else:
+ log.debug("wait_until_true: waiting...")
+ time.sleep(period)
+ elapsed += period
+
+
--- /dev/null
+import json
+import logging
+from unittest import case
+from tasks.ceph_test_case import CephTestCase
+import os
+import re
+from StringIO import StringIO
+
+from tasks.cephfs.fuse_mount import FuseMount
+
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
+
+
+log = logging.getLogger(__name__)
+
+
+def for_teuthology(f):
+ """
+ Decorator that adds an "is_for_teuthology" attribute to the wrapped function
+ """
+ f.is_for_teuthology = True
+ return f
+
+
+def needs_trimming(f):
+ """
+ Mark fn as requiring a client capable of trimming its cache (i.e. for ceph-fuse
+ this means it needs to be able to run as root, currently)
+ """
+ f.needs_trimming = True
+ return f
+
+
+class CephFSTestCase(CephTestCase):
+ """
+ Test case for Ceph FS, requires caller to populate Filesystem and Mounts,
+ into the fs, mount_a, mount_b class attributes (setting mount_b is optional)
+
+ Handles resetting the cluster under test between tests.
+ """
+
+ # FIXME weird explicit naming
+ mount_a = None
+ mount_b = None
+
+ # Declarative test requirements: subclasses should override these to indicate
+ # their special needs. If not met, tests will be skipped.
+ CLIENTS_REQUIRED = 1
+ MDSS_REQUIRED = 1
+ REQUIRE_KCLIENT_REMOTE = False
+ REQUIRE_ONE_CLIENT_REMOTE = False
+ REQUIRE_MEMSTORE = False
+
+ # Whether to create the default filesystem during setUp
+ REQUIRE_FILESYSTEM = True
+
+ LOAD_SETTINGS = []
+
+ def setUp(self):
+ super(CephFSTestCase, self).setUp()
+
+ if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
+ raise case.SkipTest("Only have {0} MDSs, require {1}".format(
+ len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
+ ))
+
+ if len(self.mounts) < self.CLIENTS_REQUIRED:
+ raise case.SkipTest("Only have {0} clients, require {1}".format(
+ len(self.mounts), self.CLIENTS_REQUIRED
+ ))
+
+ if self.REQUIRE_KCLIENT_REMOTE:
+ if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount):
+ # kclient kill() power cycles nodes, so requires clients to each be on
+ # their own node
+ if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname:
+ raise case.SkipTest("kclient clients must be on separate nodes")
+
+ if self.REQUIRE_ONE_CLIENT_REMOTE:
+ if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
+ raise case.SkipTest("Require first client to be on separate server from MDSs")
+
+ if self.REQUIRE_MEMSTORE:
+ objectstore = self.mds_cluster.get_config("osd_objectstore", "osd")
+ if objectstore != "memstore":
+ # You certainly *could* run this on a real OSD, but you don't want to sit
+ # here for hours waiting for the test to fill up a 1TB drive!
+ raise case.SkipTest("Require `memstore` OSD backend to simulate full drives")
+
+ # Unmount all surplus clients
+ for i in range(self.CLIENTS_REQUIRED, len(self.mounts)):
+ mount = self.mounts[i]
+ log.info("Unmounting unneeded client {0}".format(mount.client_id))
+ mount.umount_wait()
+
+ # Create friendly mount_a, mount_b attrs
+ for i in range(0, self.CLIENTS_REQUIRED):
+ setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])
+
+ self.mds_cluster.clear_firewall()
+
+ # Unmount in order to start each test on a fresh mount, such
+ # that test_barrier can have a firm expectation of what OSD
+ # epoch the clients start with.
+ if self.mount_a.is_mounted():
+ self.mount_a.umount_wait()
+
+ if self.mount_b:
+ if self.mount_b.is_mounted():
+ self.mount_b.umount_wait()
+
+ # To avoid any issues with e.g. unlink bugs, we destroy and recreate
+ # the filesystem rather than just doing a rm -rf of files
+ self.mds_cluster.mds_stop()
+ self.mds_cluster.delete_all_filesystems()
+ self.fs = None # is now invalid!
+
+ # In case the previous filesystem had filled up the RADOS cluster, wait for that
+ # flag to pass.
+ osd_mon_report_interval_max = int(self.mds_cluster.get_config("osd_mon_report_interval_max", service_type='osd'))
+ self.wait_until_true(lambda: not self.mds_cluster.is_full(),
+ timeout=osd_mon_report_interval_max * 5)
+
+ # In case anything is in the OSD blacklist, clear it out. This is to avoid
+ # the OSD map changing in the background (due to blacklist expiry) while tests run.
+ try:
+ self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear")
+ except CommandFailedError:
+ # Fallback for older Ceph cluster
+ blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
+ "dump", "--format=json-pretty"))['blacklist']
+ log.info("Removing {0} blacklist entries".format(len(blacklist)))
+ for addr, blacklisted_at in blacklist.items():
+ self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)
+
+ client_mount_ids = [m.client_id for m in self.mounts]
+ # In case the test changes the IDs of clients, stash them so that we can
+ # reset in tearDown
+ self._original_client_ids = client_mount_ids
+ log.info(client_mount_ids)
+
+ # In case there were any extra auth identities around from a previous
+ # test, delete them
+ for entry in self.auth_list():
+ ent_type, ent_id = entry['entity'].split(".")
+ if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin":
+ self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])
+
+ if self.REQUIRE_FILESYSTEM:
+ self.fs = self.mds_cluster.newfs(True)
+ self.fs.mds_restart()
+
+ # In case some test messed with auth caps, reset them
+ for client_id in client_mount_ids:
+ self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+ 'auth', 'caps', "client.{0}".format(client_id),
+ 'mds', 'allow',
+ 'mon', 'allow r',
+ 'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name()))
+
+ # wait for mds restart to complete...
+ self.fs.wait_for_daemons()
+ if not self.mount_a.is_mounted():
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ if self.mount_b:
+ if not self.mount_b.is_mounted():
+ self.mount_b.mount()
+ self.mount_b.wait_until_mounted()
+
+ # Load any config settings of interest
+ for setting in self.LOAD_SETTINGS:
+ setattr(self, setting, int(self.fs.mds_asok(
+ ['config', 'get', setting], self.mds_cluster.mds_ids[0]
+ )[setting]))
+
+ self.configs_set = set()
+
+ def tearDown(self):
+ super(CephFSTestCase, self).tearDown()
+
+ self.mds_cluster.clear_firewall()
+ for m in self.mounts:
+ m.teardown()
+
+ for i, m in enumerate(self.mounts):
+ m.client_id = self._original_client_ids[i]
+
+ for subsys, key in self.configs_set:
+ self.mds_cluster.clear_ceph_conf(subsys, key)
+
+ def set_conf(self, subsys, key, value):
+ self.configs_set.add((subsys, key))
+ self.mds_cluster.set_ceph_conf(subsys, key, value)
+
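+ # Illustrative usage sketch (an assumption, not part of the original change): a test
+ # can override a ceph.conf setting and rely on tearDown to revert it, e.g.:
+ #
+ #   self.set_conf('mds', 'mds_cache_size', '1000')
+ #   self.fs.mds_fail_restart()
+ #   self.fs.wait_for_daemons()
+ #
+ # The setting and value shown are hypothetical; tearDown() calls clear_ceph_conf()
+ # for every (subsys, key) recorded in configs_set.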
+ def auth_list(self):
+ """
+ Convenience wrapper on "ceph auth list"
+ """
+ return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
+ "auth", "list", "--format=json-pretty"
+ ))['auth_dump']
+
+ def assert_session_count(self, expected, ls_data=None, mds_id=None):
+ if ls_data is None:
+ ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)
+
+ self.assertEqual(expected, len(ls_data), "Expected {0} sessions, found {1}".format(
+ expected, len(ls_data)
+ ))
+
+ def assert_session_state(self, client_id, expected_state):
+ self.assertEqual(
+ self._session_by_id(
+ self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
+ expected_state)
+
+ def get_session_data(self, client_id):
+ return self.get_session(client_id)
+
+ def _session_list(self):
+ ls_data = self.fs.mds_asok(['session', 'ls'])
+ ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
+ return ls_data
+
+ def get_session(self, client_id, session_ls=None):
+ if session_ls is None:
+ session_ls = self.fs.mds_asok(['session', 'ls'])
+
+ return self._session_by_id(session_ls)[client_id]
+
+ def _session_by_id(self, session_ls):
+ return dict([(s['id'], s) for s in session_ls])
+
+ def wait_for_daemon_start(self, daemon_ids=None):
+ """
+ Wait until all the daemons appear in the FSMap, either assigned
+ MDS ranks or in the list of standbys
+ """
+ def get_daemon_names():
+ return [info['name'] for info in self.mds_cluster.status().get_all()]
+
+ if daemon_ids is None:
+ daemon_ids = self.mds_cluster.mds_ids
+
+ try:
+ self.wait_until_true(
+ lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
+ timeout=30
+ )
+ except RuntimeError:
+ log.warn("Timeout waiting for daemons {0}, while we have {1}".format(
+ daemon_ids, get_daemon_names()
+ ))
+ raise
+
+ def assert_mds_crash(self, daemon_id):
+ """
+ Assert that a particular MDS daemon crashes (block until
+ it does)
+ """
+ try:
+ self.mds_cluster.mds_daemons[daemon_id].proc.wait()
+ except CommandFailedError as e:
+ log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
+ self.mds_cluster.mds_daemons[daemon_id].proc = None
+
+ # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
+ # catch it later and treat it as a failure.
+ p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+ "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
+ core_pattern = p.stdout.getvalue().strip()
+ if os.path.dirname(core_pattern): # Non-default core_pattern with a directory in it
+ # We have seen a core_pattern that looks like it's from teuthology's coredump
+ # task, so proceed to clear out the core file
+ log.info("Clearing core from pattern: {0}".format(core_pattern))
+
+ # Determine the PID of the crashed MDS by inspecting the MDSMap, it had
+ # to talk to the mons to get assigned a rank to reach the point of crashing
+ addr = self.mds_cluster.mon_manager.get_mds_status(daemon_id)['addr']
+ pid_str = addr.split("/")[1]
+ log.info("Determined crasher PID was {0}".format(pid_str))
+
+ # Substitute PID into core_pattern to get a glob
+ core_glob = core_pattern.replace("%p", pid_str)
+ core_glob = re.sub("%[a-z]", "*", core_glob) # Match all for all other % tokens
+
+ # Verify that we see the expected single coredump matching the expected pattern
+ ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+ "sudo", "ls", run.Raw(core_glob)
+ ], stdout=StringIO())
+ cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
+ log.info("Enumerated cores: {0}".format(cores))
+ self.assertEqual(len(cores), 1)
+
+ log.info("Found core file {0}, deleting it".format(cores[0]))
+
+ self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+ "sudo", "rm", "-f", cores[0]
+ ])
+ else:
+ log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
+
+ else:
+ raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))
--- /dev/null
+
+from StringIO import StringIO
+import json
+import logging
+from gevent import Greenlet
+import os
+import time
+import datetime
+import re
+import errno
+
+from teuthology.exceptions import CommandFailedError
+from teuthology import misc
+from teuthology.nuke import clear_firewall
+from teuthology.parallel import parallel
+from tasks.ceph_manager import write_conf
+from tasks import ceph_manager
+
+
+log = logging.getLogger(__name__)
+
+
+DAEMON_WAIT_TIMEOUT = 120
+ROOT_INO = 1
+
+
+class ObjectNotFound(Exception):
+ def __init__(self, object_name):
+ self._object_name = object_name
+
+ def __str__(self):
+ return "Object not found: '{0}'".format(self._object_name)
+
+class FSStatus(object):
+ """
+ Operations on a snapshot of the FSMap.
+ """
+ def __init__(self, mon_manager):
+ self.mon = mon_manager
+ self.map = json.loads(self.mon.raw_cluster_cmd("fs", "dump", "--format=json-pretty"))
+
+ def __str__(self):
+ return json.dumps(self.map, indent = 2, sort_keys = True)
+
+ # Expose the fsmap for manual inspection.
+ def __getitem__(self, key):
+ """
+ Get a field from the fsmap.
+ """
+ return self.map[key]
+
+ def get_filesystems(self):
+ """
+ Iterator for all filesystems.
+ """
+ for fs in self.map['filesystems']:
+ yield fs
+
+ def get_all(self):
+ """
+ Iterator for all the mds_info components in the FSMap.
+ """
+ for info in self.get_standbys():
+ yield info
+ for fs in self.map['filesystems']:
+ for info in fs['mdsmap']['info'].values():
+ yield info
+
+ def get_standbys(self):
+ """
+ Iterator for all standbys.
+ """
+ for info in self.map['standbys']:
+ yield info
+
+ def get_fsmap(self, fscid):
+ """
+ Get the fsmap for the given FSCID.
+ """
+ for fs in self.map['filesystems']:
+ if fscid is None or fs['id'] == fscid:
+ return fs
+ raise RuntimeError("FSCID {0} not in map".format(fscid))
+
+ def get_fsmap_byname(self, name):
+ """
+ Get the fsmap for the given file system name.
+ """
+ for fs in self.map['filesystems']:
+ if name is None or fs['mdsmap']['fs_name'] == name:
+ return fs
+ raise RuntimeError("FS {0} not in map".format(name))
+
+ def get_replays(self, fscid):
+ """
+ Get the standby-replay MDS daemons for the given FSCID.
+ """
+ fs = self.get_fsmap(fscid)
+ for info in fs['mdsmap']['info'].values():
+ if info['state'] == 'up:standby-replay':
+ yield info
+
+ def get_ranks(self, fscid):
+ """
+ Get the ranks for the given FSCID.
+ """
+ fs = self.get_fsmap(fscid)
+ for info in fs['mdsmap']['info'].values():
+ if info['rank'] >= 0:
+ yield info
+
+ def get_rank(self, fscid, rank):
+ """
+ Get the info for the given rank of the given FSCID.
+ """
+ for info in self.get_ranks(fscid):
+ if info['rank'] == rank:
+ return info
+ raise RuntimeError("FSCID {0} has no rank {1}".format(fscid, rank))
+
+ def get_mds(self, name):
+ """
+ Get the info for the given MDS name.
+ """
+ for info in self.get_all():
+ if info['name'] == name:
+ return info
+ return None
+
+ def get_mds_addr(self, name):
+ """
+ Return the instance addr as a string, like "10.214.133.138:6807\/10825"
+ """
+ info = self.get_mds(name)
+ if info:
+ return info['addr']
+ else:
+ log.warn(json.dumps(list(self.get_all()), indent=2)) # dump for debugging
+ raise RuntimeError("MDS id '{0}' not found in map".format(name))
+
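+ # Illustrative usage sketch (not part of the original change): FSStatus wraps a single
+ # "fs dump" snapshot, so callers typically take a fresh instance per query, e.g.:
+ #
+ #   status = mds_cluster.status()
+ #   for info in status.get_ranks(fscid):
+ #       log.info("rank {0} held by {1}".format(info['rank'], info['name']))
+ #
+ # Here `mds_cluster` and `fscid` stand for an MDSCluster instance and a filesystem
+ # cluster ID obtained elsewhere.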
+class CephCluster(object):
+ @property
+ def admin_remote(self):
+ first_mon = misc.get_first_mon(self._ctx, None)
+ (result,) = self._ctx.cluster.only(first_mon).remotes.iterkeys()
+ return result
+
+ def __init__(self, ctx):
+ self._ctx = ctx
+ self.mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=ctx, logger=log.getChild('ceph_manager'))
+
+ def get_config(self, key, service_type=None):
+ """
+ Get config from mon by default, or a specific service if caller asks for it
+ """
+ if service_type is None:
+ service_type = 'mon'
+
+ service_id = sorted(misc.all_roles_of_type(self._ctx.cluster, service_type))[0]
+ return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
+ def set_ceph_conf(self, subsys, key, value):
+ if subsys not in self._ctx.ceph['ceph'].conf:
+ self._ctx.ceph['ceph'].conf[subsys] = {}
+ self._ctx.ceph['ceph'].conf[subsys][key] = value
+ write_conf(self._ctx) # XXX because we don't have the ceph task's config object, if they
+ # used a different config path this won't work.
+
+ def clear_ceph_conf(self, subsys, key):
+ del self._ctx.ceph['ceph'].conf[subsys][key]
+ write_conf(self._ctx)
+
+ def json_asok(self, command, service_type, service_id):
+ proc = self.mon_manager.admin_socket(service_type, service_id, command)
+ response_data = proc.stdout.getvalue()
+ log.info("_json_asok output: {0}".format(response_data))
+ if response_data.strip():
+ return json.loads(response_data)
+ else:
+ return None
+
+
+class MDSCluster(CephCluster):
+ """
+ Collective operations on all the MDS daemons in the Ceph cluster. These
+ daemons may be in use by various Filesystems.
+
+ For the benefit of pre-multi-filesystem tests, this class is also
+ a parent of Filesystem. The correct way to use MDSCluster going forward is
+ as a separate instance outside of your (multiple) Filesystem instances.
+ """
+ def __init__(self, ctx):
+ super(MDSCluster, self).__init__(ctx)
+
+ self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+
+ if len(self.mds_ids) == 0:
+ raise RuntimeError("This task requires at least one MDS")
+
+ if hasattr(self._ctx, "daemons"):
+ # Presence of 'daemons' attribute implies ceph task rather than ceph_deploy task
+ self.mds_daemons = dict([(mds_id, self._ctx.daemons.get_daemon('mds', mds_id)) for mds_id in self.mds_ids])
+
+ def _one_or_all(self, mds_id, cb, in_parallel=True):
+ """
+ Call a callback for a single named MDS, or for all.
+
+ Note that the parallelism here isn't for performance: it's to avoid being overly kind
+ to the cluster by inserting a graceful ssh-latency delay between operations, and to
+ avoid being overly kind by always executing them in a predictable order. However, some
+ actions don't cope with being done in parallel, so it's optional (`in_parallel`).
+
+ :param mds_id: MDS daemon name, or None
+ :param cb: Callback taking single argument of MDS daemon name
+ :param in_parallel: whether to invoke callbacks concurrently (else one after the other)
+ """
+ if mds_id is None:
+ if in_parallel:
+ with parallel() as p:
+ for mds_id in self.mds_ids:
+ p.spawn(cb, mds_id)
+ else:
+ for mds_id in self.mds_ids:
+ cb(mds_id)
+ else:
+ cb(mds_id)
+
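+ # Illustrative sketch (not part of the original change): the helpers below all funnel
+ # through _one_or_all, so calling them with no argument acts on every MDS in parallel,
+ # while passing a daemon name acts on just that one, e.g.:
+ #
+ #   mds_cluster.mds_fail()      # all daemons, in parallel
+ #   mds_cluster.mds_fail('a')   # just mds.a ('a' is a hypothetical daemon name)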
+ def mds_stop(self, mds_id=None):
+ """
+ Stop the MDS daemon process(es). If it held a rank, that rank
+ will eventually go laggy.
+ """
+ self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop())
+
+ def mds_fail(self, mds_id=None):
+ """
+ Inform MDSMonitor of the death of the daemon process(es). If it held
+ a rank, that rank will be relinquished.
+ """
+ self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_))
+
+ def mds_restart(self, mds_id=None):
+ self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart())
+
+ def mds_fail_restart(self, mds_id=None):
+ """
+ Variation on restart that includes marking MDSs as failed, so that doing this
+ operation followed by waiting for healthy daemon states guarantees that they
+ have gone down and come up, rather than potentially seeing the healthy states
+ that existed before the restart.
+ """
+ def _fail_restart(id_):
+ self.mds_daemons[id_].stop()
+ self.mon_manager.raw_cluster_cmd("mds", "fail", id_)
+ self.mds_daemons[id_].restart()
+
+ self._one_or_all(mds_id, _fail_restart)
+
+ def newfs(self, name):
+ return Filesystem(self._ctx, create=name)
+
+ def status(self):
+ return FSStatus(self.mon_manager)
+
+ def delete_all_filesystems(self):
+ """
+ Remove all filesystems that exist, and any pools in use by them.
+ """
+ pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+ pool_id_name = {}
+ for pool in pools:
+ pool_id_name[pool['pool']] = pool['pool_name']
+
+ # mark cluster down for each fs to prevent churn during deletion
+ status = self.status()
+ for fs in status.get_filesystems():
+ self.mon_manager.raw_cluster_cmd("fs", "set", fs['mdsmap']['fs_name'], "cluster_down", "true")
+
+ # get a new copy as actives may have since changed
+ status = self.status()
+ for fs in status.get_filesystems():
+ mdsmap = fs['mdsmap']
+ metadata_pool = pool_id_name[mdsmap['metadata_pool']]
+
+ for gid in mdsmap['up'].values():
+ self.mon_manager.raw_cluster_cmd('mds', 'fail', gid.__str__())
+
+ self.mon_manager.raw_cluster_cmd('fs', 'rm', mdsmap['fs_name'], '--yes-i-really-mean-it')
+ self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+ metadata_pool, metadata_pool,
+ '--yes-i-really-really-mean-it')
+ for data_pool in mdsmap['data_pools']:
+ data_pool = pool_id_name[data_pool]
+ self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+ data_pool, data_pool,
+ '--yes-i-really-really-mean-it')
+
+ def get_standby_daemons(self):
+ return set([s['name'] for s in self.status().get_standbys()])
+
+ def get_mds_hostnames(self):
+ result = set()
+ for mds_id in self.mds_ids:
+ mds_remote = self.mon_manager.find_remote('mds', mds_id)
+ result.add(mds_remote.hostname)
+
+ return list(result)
+
+ def set_clients_block(self, blocked, mds_id=None):
+ """
+ Block (using iptables) client communications to this MDS. Be careful: if
+ other services are running on this MDS, or other MDSs try to talk to this
+ MDS, their communications may also be blocked as collateral damage.
+
+ :param mds_id: Optional ID of MDS to block, default to all
+ :return:
+ """
+ da_flag = "-A" if blocked else "-D"
+
+ def set_block(_mds_id):
+ remote = self.mon_manager.find_remote('mds', _mds_id)
+ status = self.status()
+
+ addr = status.get_mds_addr(_mds_id)
+ ip_str, port_str, inst_str = re.match("(.+):(.+)/(.+)", addr).groups()
+
+ remote.run(
+ args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m",
+ "comment", "--comment", "teuthology"])
+ remote.run(
+ args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m",
+ "comment", "--comment", "teuthology"])
+
+ self._one_or_all(mds_id, set_block, in_parallel=False)
+
+ def clear_firewall(self):
+ clear_firewall(self._ctx)
+
+ def get_mds_info(self, mds_id):
+ return FSStatus(self.mon_manager).get_mds(mds_id)
+
+ def is_full(self):
+ flags = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['flags']
+ return 'full' in flags
+
+ def is_pool_full(self, pool_name):
+ pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+ for pool in pools:
+ if pool['pool_name'] == pool_name:
+ return 'full' in pool['flags_names'].split(",")
+
+ raise RuntimeError("Pool not found '{0}'".format(pool_name))
+
+class Filesystem(MDSCluster):
+ """
+ This object is for driving a CephFS filesystem. The MDS daemons driven by
+ MDSCluster may be shared with other Filesystems.
+ """
+ def __init__(self, ctx, fscid=None, create=None):
+ super(Filesystem, self).__init__(ctx)
+
+ self.id = None
+ self.name = None
+ self.metadata_pool_name = None
+ self.data_pools = None
+
+ client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
+ self.client_id = client_list[0]
+ self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1]
+
+ if create is not None:
+ if fscid is not None:
+ raise RuntimeError("cannot specify fscid when creating fs")
+ if create is True:
+ self.name = 'cephfs'
+ else:
+ self.name = create
+ if not self.legacy_configured():
+ self.create()
+ elif fscid is not None:
+ self.id = fscid
+ self.getinfo(refresh = True)
+
+ # Stash a reference to the first created filesystem on ctx, so
+ # that if someone drops to the interactive shell they can easily
+ # poke our methods.
+ if not hasattr(self._ctx, "filesystem"):
+ self._ctx.filesystem = self
+
+ def getinfo(self, refresh = False):
+ status = self.status()
+ if self.id is not None:
+ fsmap = status.get_fsmap(self.id)
+ elif self.name is not None:
+ fsmap = status.get_fsmap_byname(self.name)
+ else:
+ fss = [fs for fs in status.get_filesystems()]
+ if len(fss) == 1:
+ fsmap = fss[0]
+ elif len(fss) == 0:
+ raise RuntimeError("no file system available")
+ else:
+ raise RuntimeError("more than one file system available")
+ self.id = fsmap['id']
+ self.name = fsmap['mdsmap']['fs_name']
+ self.get_pool_names(status = status, refresh = refresh)
+ return status
+
+ def deactivate(self, rank):
+ if rank < 0:
+ raise RuntimeError("invalid rank")
+ elif rank == 0:
+ raise RuntimeError("cannot deactivate rank 0")
+ self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))
+
+ def set_max_mds(self, max_mds):
+ self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", "%d" % max_mds)
+
+ def get_pgs_per_fs_pool(self):
+ """
+ Calculate how many PGs to use when creating a pool, in order to avoid raising any
+ health warnings about mon_pg_warn_min_per_osd
+
+ :return: an integer number of PGs
+ """
+ pg_warn_min_per_osd = int(self.get_config('mon_pg_warn_min_per_osd'))
+ osd_count = len(list(misc.all_roles_of_type(self._ctx.cluster, 'osd')))
+ return pg_warn_min_per_osd * osd_count
+
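+ # Worked example (illustrative, not part of the original change): with a hypothetical
+ # mon_pg_warn_min_per_osd of 30 and 3 OSDs in the cluster, get_pgs_per_fs_pool()
+ # returns 30 * 3 = 90, so each FS pool is created with 90 PGs and stays above the
+ # per-OSD warning threshold.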
+ def create(self):
+ if self.name is None:
+ self.name = "cephfs"
+ if self.metadata_pool_name is None:
+ self.metadata_pool_name = "{0}_metadata".format(self.name)
+ data_pool_name = "{0}_data".format(self.name)
+
+ log.info("Creating filesystem '{0}'".format(self.name))
+
+ pgs_per_fs_pool = self.get_pgs_per_fs_pool()
+
+ self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+ self.metadata_pool_name, pgs_per_fs_pool.__str__())
+ self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+ data_pool_name, pgs_per_fs_pool.__str__())
+ self.mon_manager.raw_cluster_cmd('fs', 'new',
+ self.name, self.metadata_pool_name, data_pool_name)
+
+ self.getinfo(refresh = True)
+
+ def __del__(self):
+ if getattr(self._ctx, "filesystem", None) == self:
+ delattr(self._ctx, "filesystem")
+
+ def exists(self):
+ """
+ Whether a filesystem exists in the mon's filesystem list
+ """
+ fs_list = json.loads(self.mon_manager.raw_cluster_cmd('fs', 'ls', '--format=json-pretty'))
+ return self.name in [fs['name'] for fs in fs_list]
+
+ def legacy_configured(self):
+ """
+ Check if a legacy (i.e. pre "fs new") filesystem configuration is present. If this is
+ the case, the caller should avoid using Filesystem.create
+ """
+ try:
+ out_text = self.mon_manager.raw_cluster_cmd('--format=json-pretty', 'osd', 'lspools')
+ pools = json.loads(out_text)
+ metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools]
+ if metadata_pool_exists:
+ self.metadata_pool_name = 'metadata'
+ except CommandFailedError as e:
+ # For use in upgrade tests, Ceph cuttlefish and earlier don't support
+ # structured output (--format) from the CLI.
+ if e.exitstatus == 22:
+ metadata_pool_exists = True
+ else:
+ raise
+
+ return metadata_pool_exists
+
+ def _df(self):
+ return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty"))
+
+ def get_mds_map(self):
+ return self.status().get_fsmap(self.id)['mdsmap']
+
+ def add_data_pool(self, name):
+ self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, self.get_pgs_per_fs_pool().__str__())
+ self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name)
+ self.get_pool_names(refresh = True)
+ for poolid, fs_name in self.data_pools.items():
+ if name == fs_name:
+ return poolid
+ raise RuntimeError("could not get just created pool '{0}'".format(name))
+
+ def get_pool_names(self, refresh = False, status = None):
+ if refresh or self.metadata_pool_name is None or self.data_pools is None:
+ if status is None:
+ status = self.status()
+ fsmap = status.get_fsmap(self.id)
+
+ osd_map = self.mon_manager.get_osd_dump_json()
+ id_to_name = {}
+ for p in osd_map['pools']:
+ id_to_name[p['pool']] = p['pool_name']
+
+ self.metadata_pool_name = id_to_name[fsmap['mdsmap']['metadata_pool']]
+ self.data_pools = {}
+ for data_pool in fsmap['mdsmap']['data_pools']:
+ self.data_pools[data_pool] = id_to_name[data_pool]
+
+ def get_data_pool_name(self, refresh = False):
+ if refresh or self.data_pools is None:
+ self.get_pool_names(refresh = True)
+ assert(len(self.data_pools) == 1)
+ return self.data_pools.values()[0]
+
+ def get_data_pool_id(self, refresh = False):
+ """
+ Don't call this if you have multiple data pools
+ :return: integer
+ """
+ if refresh or self.data_pools is None:
+ self.get_pool_names(refresh = True)
+ assert(len(self.data_pools) == 1)
+ return self.data_pools.keys()[0]
+
+ def get_data_pool_names(self, refresh = False):
+ if refresh or self.data_pools is None:
+ self.get_pool_names(refresh = True)
+ return self.data_pools.values()
+
+ def get_metadata_pool_name(self):
+ return self.metadata_pool_name
+
+ def get_namespace_id(self):
+ return self.id
+
+ def get_pool_df(self, pool_name):
+ """
+ Return a dict like:
+ {u'bytes_used': 0, u'max_avail': 83848701, u'objects': 0, u'kb_used': 0}
+ """
+ for pool_df in self._df()['pools']:
+ if pool_df['name'] == pool_name:
+ return pool_df['stats']
+
+ raise RuntimeError("Pool name '{0}' not found".format(pool_name))
+
+ def get_usage(self):
+ return self._df()['stats']['total_used_bytes']
+
+ def are_daemons_healthy(self):
+ """
+ Return true if all daemons are in one of active, standby, standby-replay, and
+ at least max_mds daemons are in 'active'.
+
+ Unlike most of Filesystem, this function is tolerant of new-style `fs`
+ commands being missing, because we are part of the ceph installation
+ process during upgrade suites, so must fall back to old style commands
+ when we get an EINVAL on a new style command.
+
+ :return:
+ """
+
+ active_count = 0
+ try:
+ mds_map = self.get_mds_map()
+ except CommandFailedError as cfe:
+ # Old version, fall back to non-multi-fs commands
+ if cfe.exitstatus == errno.EINVAL:
+ mds_map = json.loads(
+ self.mon_manager.raw_cluster_cmd('mds', 'dump', '--format=json'))
+ else:
+ raise
+
+ log.info("are_daemons_healthy: mds map: {0}".format(mds_map))
+
+ for mds_id, mds_status in mds_map['info'].items():
+ if mds_status['state'] not in ["up:active", "up:standby", "up:standby-replay"]:
+ log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state']))
+ return False
+ elif mds_status['state'] == 'up:active':
+ active_count += 1
+
+ log.info("are_daemons_healthy: {0}/{1}".format(
+ active_count, mds_map['max_mds']
+ ))
+
+ if active_count >= mds_map['max_mds']:
+ # The MDSMap says these guys are active, but let's check they really are
+ for mds_id, mds_status in mds_map['info'].items():
+ if mds_status['state'] == 'up:active':
+ try:
+ daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
+ except CommandFailedError as cfe:
+ if cfe.exitstatus == errno.EINVAL:
+ # Old version, can't do this check
+ continue
+ else:
+ # MDS not even running
+ return False
+
+ if daemon_status['state'] != 'up:active':
+ # MDS hasn't taken the latest map yet
+ return False
+
+ return True
+ else:
+ return False
+
+ def get_daemon_names(self, state=None):
+ """
+ Return MDS daemon names of those daemons in the given state
+ :param state:
+ :return:
+ """
+ status = self.get_mds_map()
+ result = []
+ for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
+ if mds_status['state'] == state or state is None:
+ result.append(mds_status['name'])
+
+ return result
+
+ def get_active_names(self):
+ """
+ Return MDS daemon names of those daemons holding ranks
+ in state up:active
+
+ :return: list of strings like ['a', 'b'], sorted by rank
+ """
+ return self.get_daemon_names("up:active")
+
+ def get_all_mds_rank(self):
+ status = self.get_mds_map()
+ result = []
+ for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
+ if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
+ result.append(mds_status['rank'])
+
+ return result
+
+ def get_rank_names(self):
+ """
+ Return MDS daemon names of those daemons holding a rank,
+ sorted by rank. This includes e.g. up:replay/reconnect
+ as well as active, but does not include standby or
+ standby-replay.
+ """
+ status = self.get_mds_map()
+ result = []
+ for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
+ if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
+ result.append(mds_status['name'])
+
+ return result
+
+ def wait_for_daemons(self, timeout=None):
+ """
+ Wait until all daemons are healthy
+ :return:
+ """
+
+ if timeout is None:
+ timeout = DAEMON_WAIT_TIMEOUT
+
+ elapsed = 0
+ while True:
+ if self.are_daemons_healthy():
+ return
+ else:
+ time.sleep(1)
+ elapsed += 1
+
+ if elapsed > timeout:
+ raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
+
+ def get_lone_mds_id(self):
+ """
+ Get a single MDS ID: the only one if there is only one
+ configured, else the only one currently holding a rank,
+ else raise an error.
+ """
+ if len(self.mds_ids) != 1:
+ alive = self.get_rank_names()
+ if len(alive) == 1:
+ return alive[0]
+ else:
+ raise ValueError("Explicit MDS argument required when multiple MDSs in use")
+ else:
+ return self.mds_ids[0]
+
+ def recreate(self):
+ log.info("Creating new filesystem")
+ self.delete_all_filesystems()
+ self.id = None
+ self.create()
+
+ def put_metadata_object_raw(self, object_id, infile):
+ """
+ Save an object to the metadata pool
+ """
+ temp_bin_path = infile
+ self.client_remote.run(args=[
+ 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'put', object_id, temp_bin_path
+ ])
+
+ def get_metadata_object_raw(self, object_id):
+ """
+ Retrieve an object from the metadata pool and store it in a file.
+ """
+ temp_bin_path = '/tmp/' + object_id + '.bin'
+
+ self.client_remote.run(args=[
+ 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path
+ ])
+
+ return temp_bin_path
+
+ def get_metadata_object(self, object_type, object_id):
+ """
+ Retrieve an object from the metadata pool, pass it through
+ ceph-dencoder to dump it to JSON, and return the decoded object.
+ """
+ temp_bin_path = '/tmp/out.bin'
+
+ self.client_remote.run(args=[
+ 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path
+ ])
+
+ stdout = StringIO()
+ self.client_remote.run(args=[
+ 'sudo', os.path.join(self._prefix, 'ceph-dencoder'), 'type', object_type, 'import', temp_bin_path, 'decode', 'dump_json'
+ ], stdout=stdout)
+ dump_json = stdout.getvalue().strip()
+ try:
+ dump = json.loads(dump_json)
+ except (TypeError, ValueError):
+ log.error("Failed to decode JSON: '{0}'".format(dump_json))
+ raise
+
+ return dump
+
+ def get_journal_version(self):
+ """
+ Read the JournalPointer and Journal::Header objects to learn the version of
+ encoding in use.
+ """
+ journal_pointer_object = '400.00000000'
+ journal_pointer_dump = self.get_metadata_object("JournalPointer", journal_pointer_object)
+ journal_ino = journal_pointer_dump['journal_pointer']['front']
+
+ journal_header_object = "{0:x}.00000000".format(journal_ino)
+ journal_header_dump = self.get_metadata_object('Journaler::Header', journal_header_object)
+
+ version = journal_header_dump['journal_header']['stream_format']
+ log.info("Read journal version {0}".format(version))
+
+ return version
+
+ def mds_asok(self, command, mds_id=None):
+ if mds_id is None:
+ mds_id = self.get_lone_mds_id()
+
+ return self.json_asok(command, 'mds', mds_id)
+
+ def read_cache(self, path, depth=None):
+ cmd = ["dump", "tree", path]
+ if depth is not None:
+ cmd.append(depth.__str__())
+ result = self.mds_asok(cmd)
+ if len(result) == 0:
+ raise RuntimeError("Path not found in cache: {0}".format(path))
+
+ return result
+
+ def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None):
+ """
+ Block until the MDS reaches a particular state, or a failure condition
+ is met.
+
+ When there are multiple MDSs, succeed when exactly one MDS is in the
+ goal state, or fail when any MDS is in the reject state.
+
+ :param goal_state: Return once the MDS is in this state
+ :param reject: Fail if the MDS enters this state before the goal state
+ :param timeout: Fail if this many seconds pass before reaching goal
+ :return: number of seconds waited, rounded down to integer
+ """
+
+ started_at = time.time()
+ while True:
+ status = self.status()
+ if mds_id is not None:
+ # mds_info is None if no daemon with this ID exists in the map
+ mds_info = status.get_mds(mds_id)
+ current_state = mds_info['state'] if mds_info else None
+ log.info("Looked up MDS state for {0}: {1}".format(mds_id, current_state))
+ else:
+ # In general, look for a single MDS
+ states = [m['state'] for m in status.get_ranks(self.id)]
+ if [s for s in states if s == goal_state] == [goal_state]:
+ current_state = goal_state
+ elif reject in states:
+ current_state = reject
+ else:
+ current_state = None
+ log.info("mapped states {0} to {1}".format(states, current_state))
+
+ elapsed = time.time() - started_at
+ if current_state == goal_state:
+ log.info("reached state '{0}' in {1}s".format(current_state, elapsed))
+ return elapsed
+ elif reject is not None and current_state == reject:
+ raise RuntimeError("MDS in reject state {0}".format(current_state))
+ elif timeout is not None and elapsed > timeout:
+ log.error("MDS status at timeout: {0}".format(status.get_fsmap(self.id)))
+ raise RuntimeError(
+ "Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format(
+ elapsed, goal_state, current_state
+ ))
+ else:
+ time.sleep(1)
+
+ def _read_data_xattr(self, ino_no, xattr_name, type, pool):
+ mds_id = self.mds_ids[0]
+ remote = self.mds_daemons[mds_id].remote
+ if pool is None:
+ pool = self.get_data_pool_name()
+
+ obj_name = "{0:x}.00000000".format(ino_no)
+
+ args = [
+ os.path.join(self._prefix, "rados"), "-p", pool, "getxattr", obj_name, xattr_name
+ ]
+ try:
+ proc = remote.run(
+ args=args,
+ stdout=StringIO())
+ except CommandFailedError as e:
+ log.error(e.__str__())
+ raise ObjectNotFound(obj_name)
+
+ data = proc.stdout.getvalue()
+
+ p = remote.run(
+ args=[os.path.join(self._prefix, "ceph-dencoder"), "type", type, "import", "-", "decode", "dump_json"],
+ stdout=StringIO(),
+ stdin=data
+ )
+
+ return json.loads(p.stdout.getvalue().strip())
+
+ def _write_data_xattr(self, ino_no, xattr_name, data, pool=None):
+ """
+ Write to an xattr of the 0th data object of an inode. Will
+ succeed whether the object and/or xattr already exist or not.
+
+ :param ino_no: integer inode number
+ :param xattr_name: string name of the xattr
+ :param data: byte array data to write to the xattr
+ :param pool: name of data pool or None to use primary data pool
+ :return: None
+ """
+ remote = self.mds_daemons[self.mds_ids[0]].remote
+ if pool is None:
+ pool = self.get_data_pool_name()
+
+ obj_name = "{0:x}.00000000".format(ino_no)
+ args = [
+ os.path.join(self._prefix, "rados"), "-p", pool, "setxattr",
+ obj_name, xattr_name, data
+ ]
+ remote.run(
+ args=args,
+ stdout=StringIO())
+
+ def read_backtrace(self, ino_no, pool=None):
+ """
+ Read the backtrace from the data pool, return a dict in the format
+ given by inode_backtrace_t::dump, which is something like:
+
+ ::
+
+ rados -p cephfs_data getxattr 10000000002.00000000 parent > out.bin
+ ceph-dencoder type inode_backtrace_t import out.bin decode dump_json
+
+ { "ino": 1099511627778,
+ "ancestors": [
+ { "dirino": 1,
+ "dname": "blah",
+ "version": 11}],
+ "pool": 1,
+ "old_pools": []}
+
+ :param pool: name of pool to read backtrace from. If omitted, FS must have only
+ one data pool and that will be used.
+ """
+ return self._read_data_xattr(ino_no, "parent", "inode_backtrace_t", pool)
+
+ def read_layout(self, ino_no, pool=None):
+ """
+ Read 'layout' xattr of an inode and parse the result, returning a dict like:
+ ::
+ {
+ "stripe_unit": 4194304,
+ "stripe_count": 1,
+ "object_size": 4194304,
+ "pool_id": 1,
+ "pool_ns": "",
+ }
+
+ :param pool: name of pool to read backtrace from. If omitted, FS must have only
+ one data pool and that will be used.
+ """
+ return self._read_data_xattr(ino_no, "layout", "file_layout_t", pool)
+
+ def _enumerate_data_objects(self, ino, size):
+ """
+ Get the list of expected data objects for a range, and the list of objects
+ that really exist.
+
+ :return a tuple of two lists of strings (expected, actual)
+ """
+ stripe_size = 1024 * 1024 * 4
+
+ size = max(stripe_size, size)
+
+ want_objects = [
+ "{0:x}.{1:08x}".format(ino, n)
+ for n in range(0, ((size - 1) / stripe_size) + 1)
+ ]
+
+ exist_objects = self.rados(["ls"], pool=self.get_data_pool_name()).split("\n")
+
+ return want_objects, exist_objects
+
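+ # Worked example (illustrative, not part of the original change): with the 4MB
+ # stripe_size above, a hypothetical inode 0x10000000001 of size 10MB is expected to
+ # map to the objects
+ #
+ #   10000000001.00000000, 10000000001.00000001, 10000000001.00000002
+ #
+ # which is what data_objects_present()/data_objects_absent() compare against the
+ # output of "rados ls".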
+ def data_objects_present(self, ino, size):
+ """
+ Check that *all* the expected data objects for an inode are present in the data pool
+ """
+
+ want_objects, exist_objects = self._enumerate_data_objects(ino, size)
+ missing = set(want_objects) - set(exist_objects)
+
+ if missing:
+ log.info("Objects missing (ino {0}, size {1}): {2}".format(
+ ino, size, missing
+ ))
+ return False
+ else:
+ log.info("All objects for ino {0} size {1} found".format(ino, size))
+ return True
+
+ def data_objects_absent(self, ino, size):
+ want_objects, exist_objects = self._enumerate_data_objects(ino, size)
+ present = set(want_objects) & set(exist_objects)
+
+ if present:
+ log.info("Objects not absent (ino {0}, size {1}): {2}".format(
+ ino, size, present
+ ))
+ return False
+ else:
+ log.info("All objects for ino {0} size {1} are absent".format(ino, size))
+ return True
+
+ def rados(self, args, pool=None, namespace=None, stdin_data=None):
+ """
+ Call into the `rados` CLI from an MDS
+ """
+
+ if pool is None:
+ pool = self.get_metadata_pool_name()
+
+ # Doesn't matter which MDS we use to run rados commands, they all
+ # have access to the pools
+ mds_id = self.mds_ids[0]
+ remote = self.mds_daemons[mds_id].remote
+
+ # NB we could alternatively use librados pybindings for this, but it's a one-liner
+ # using the `rados` CLI
+ args = ([os.path.join(self._prefix, "rados"), "-p", pool] +
+ (["--namespace", namespace] if namespace else []) +
+ args)
+ p = remote.run(
+ args=args,
+ stdin=stdin_data,
+ stdout=StringIO())
+ return p.stdout.getvalue().strip()
+
+ def list_dirfrag(self, dir_ino):
+ """
+ Read the named object and return the list of omap keys
+
+ :return a list of 0 or more strings
+ """
+
+ dirfrag_obj_name = "{0:x}.00000000".format(dir_ino)
+
+ try:
+ key_list_str = self.rados(["listomapkeys", dirfrag_obj_name])
+ except CommandFailedError as e:
+ log.error(e.__str__())
+ raise ObjectNotFound(dirfrag_obj_name)
+
+ return key_list_str.split("\n") if key_list_str else []
+
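+ # Illustrative sketch (an assumption, not part of the original change): the omap keys
+ # returned by list_dirfrag() are the dentry names stored in that directory fragment,
+ # so a test can assert on directory contents at the RADOS level, e.g.:
+ #
+ #   keys = fs.list_dirfrag(ROOT_INO)
+ #   # keys might look like ['mydir_head', 'myfile_head'] for hypothetical
+ #   # entries "mydir" and "myfile"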
+ def erase_metadata_objects(self, prefix):
+ """
+ For all objects in the metadata pool matching the prefix,
+ erase them.
+
+ This is O(N) in the number of objects in the pool, so it is only suitable
+ for use on toy test filesystems.
+ """
+ all_objects = self.rados(["ls"]).split("\n")
+ matching_objects = [o for o in all_objects if o.startswith(prefix)]
+ for o in matching_objects:
+ self.rados(["rm", o])
+
+ def erase_mds_objects(self, rank):
+ """
+ Erase all the per-MDS objects for a particular rank. This includes
+ inotable, sessiontable, journal
+ """
+
+ def obj_prefix(multiplier):
+ """
+ MDS object naming conventions like rank 1's
+ journal is at 201.***
+ """
+ return "%x." % (multiplier * 0x100 + rank)
+
+ # MDS_INO_LOG_OFFSET
+ self.erase_metadata_objects(obj_prefix(2))
+ # MDS_INO_LOG_BACKUP_OFFSET
+ self.erase_metadata_objects(obj_prefix(3))
+ # MDS_INO_LOG_POINTER_OFFSET
+ self.erase_metadata_objects(obj_prefix(4))
+ # MDSTables & SessionMap
+ self.erase_metadata_objects("mds{rank:d}_".format(rank=rank))
+
+ @property
+ def _prefix(self):
+ """
+ Override this to set a different path prefix for the Ceph binaries.
+ """
+ return ""
+
+ def _run_tool(self, tool, args, rank=None, quiet=False):
+ # Tests frequently have [client] configuration that jacks up
+ # the objecter log level (unlikely to be interesting here)
+ # and does not set the mds log level (very interesting here)
+ if quiet:
+ base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1']
+ else:
+ base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1']
+
+ if rank is not None:
+ base_args.extend(["--rank", "%d" % rank])
+
+ t1 = datetime.datetime.now()
+ r = self.tool_remote.run(
+ args=base_args + args,
+ stdout=StringIO()).stdout.getvalue().strip()
+ duration = datetime.datetime.now() - t1
+ log.info("Ran {0} in time {1}, result:\n{2}".format(
+ base_args + args, duration, r
+ ))
+ return r
+
+ @property
+ def tool_remote(self):
+ """
+ An arbitrary remote to use when invoking recovery tools. Use an MDS host because
+ it'll definitely have keys with perms to access cephfs metadata pool. This is public
+ so that tests can use this remote to go get locally written output files from the tools.
+ """
+ mds_id = self.mds_ids[0]
+ return self.mds_daemons[mds_id].remote
+
+ def journal_tool(self, args, rank=None, quiet=False):
+ """
+ Invoke cephfs-journal-tool with the passed arguments, and return its stdout
+ """
+ return self._run_tool("cephfs-journal-tool", args, rank, quiet)
+
+ def table_tool(self, args, quiet=False):
+ """
+ Invoke cephfs-table-tool with the passed arguments, and return its stdout
+ """
+ return self._run_tool("cephfs-table-tool", args, None, quiet)
+
+ def data_scan(self, args, quiet=False, worker_count=1):
+ """
+ Invoke cephfs-data-scan with the passed arguments, and return its stdout
+
+ :param worker_count: if greater than 1, multiple workers will be run
+ in parallel and the return value will be None
+ """
+
+ workers = []
+
+ for n in range(0, worker_count):
+ if worker_count > 1:
+ # data-scan args first token is a command, followed by args to it.
+ # insert worker arguments after the command.
+ cmd = args[0]
+ worker_args = [cmd] + ["--worker_n", n.__str__(), "--worker_m", worker_count.__str__()] + args[1:]
+ else:
+ worker_args = args
+
+ workers.append(Greenlet.spawn(lambda wargs=worker_args:
+ self._run_tool("cephfs-data-scan", wargs, None, quiet)))
+
+ for w in workers:
+ w.get()
+
+ if worker_count == 1:
+ return workers[0].value
+ else:
+ return None
--- /dev/null
+
+from StringIO import StringIO
+import json
+import time
+import logging
+from textwrap import dedent
+
+from teuthology import misc
+from teuthology.contextutil import MaxWhileTries
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
+from .mount import CephFSMount
+
+log = logging.getLogger(__name__)
+
+
+class FuseMount(CephFSMount):
+ def __init__(self, client_config, test_dir, client_id, client_remote):
+ super(FuseMount, self).__init__(test_dir, client_id, client_remote)
+
+ self.client_config = client_config if client_config else {}
+ self.fuse_daemon = None
+ self._fuse_conn = None
+
+ def mount(self, mount_path=None, mount_fs_name=None):
+ log.info("Client client.%s config is %s" % (self.client_id, self.client_config))
+
+ daemon_signal = 'kill'
+ if self.client_config.get('coverage') or self.client_config.get('valgrind') is not None:
+ daemon_signal = 'term'
+
+ log.info('Mounting ceph-fuse client.{id} at {remote} {mnt}...'.format(
+ id=self.client_id, remote=self.client_remote, mnt=self.mountpoint))
+
+ self.client_remote.run(
+ args=[
+ 'mkdir',
+ '--',
+ self.mountpoint,
+ ],
+ )
+
+ run_cmd = [
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+ 'daemon-helper',
+ daemon_signal,
+ ]
+
+ fuse_cmd = ['ceph-fuse', "-f"]
+
+ if mount_path is not None:
+ fuse_cmd += ["--client_mountpoint={0}".format(mount_path)]
+
+ if mount_fs_name is not None:
+ fuse_cmd += ["--client_mds_namespace={0}".format(mount_fs_name)]
+
+ fuse_cmd += [
+ '--name', 'client.{id}'.format(id=self.client_id),
+ # TODO ceph-fuse doesn't understand dash dash '--',
+ self.mountpoint,
+ ]
+
+ if self.client_config.get('valgrind') is not None:
+ run_cmd = misc.get_valgrind_args(
+ self.test_dir,
+ 'client.{id}'.format(id=self.client_id),
+ run_cmd,
+ self.client_config.get('valgrind'),
+ )
+
+ run_cmd.extend(fuse_cmd)
+
+ def list_connections():
+ self.client_remote.run(
+ args=["sudo", "mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
+ check_status=False
+ )
+ p = self.client_remote.run(
+ args=["ls", "/sys/fs/fuse/connections"],
+ stdout=StringIO(),
+ check_status=False
+ )
+ if p.exitstatus != 0:
+ return []
+
+ ls_str = p.stdout.getvalue().strip()
+ if ls_str:
+ return [int(n) for n in ls_str.split("\n")]
+ else:
+ return []
+
+ # Before starting ceph-fuse process, note the contents of
+ # /sys/fs/fuse/connections
+ pre_mount_conns = list_connections()
+ log.info("Pre-mount connections: {0}".format(pre_mount_conns))
+
+ proc = self.client_remote.run(
+ args=run_cmd,
+ logger=log.getChild('ceph-fuse.{id}'.format(id=self.client_id)),
+ stdin=run.PIPE,
+ wait=False,
+ )
+ self.fuse_daemon = proc
+
+ # Wait for the connection reference to appear in /sys
+ mount_wait = self.client_config.get('mount_wait', 0)
+ if mount_wait > 0:
+ log.info("Fuse mount waits {0} seconds before checking /sys/".format(mount_wait))
+ time.sleep(mount_wait)
+ timeout = int(self.client_config.get('mount_timeout', 30))
+ waited = 0
+
+ post_mount_conns = list_connections()
+ while len(post_mount_conns) <= len(pre_mount_conns):
+ if self.fuse_daemon.finished:
+ # Did mount fail? Raise the CommandFailedError instead of
+ # hitting the "failed to populate /sys/" timeout
+ self.fuse_daemon.wait()
+ time.sleep(1)
+ waited += 1
+ if waited > timeout:
+ raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format(
+ waited
+ ))
+ else:
+ post_mount_conns = list_connections()
+
+ log.info("Post-mount connections: {0}".format(post_mount_conns))
+
+ # Record our fuse connection number so that we can use it when
+ # forcing an unmount
+ new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
+ if len(new_conns) == 0:
+ raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
+ elif len(new_conns) > 1:
+ raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
+ else:
+ self._fuse_conn = new_conns[0]
+
+ def is_mounted(self):
+ proc = self.client_remote.run(
+ args=[
+ 'stat',
+ '--file-system',
+ '--printf=%T\n',
+ '--',
+ self.mountpoint,
+ ],
+ stdout=StringIO(),
+ stderr=StringIO(),
+ wait=False
+ )
+ try:
+ proc.wait()
+ except CommandFailedError:
+ if ("endpoint is not connected" in proc.stderr.getvalue()
+ or "Software caused connection abort" in proc.stderr.getvalue()):
+ # This happens if fuse is killed without unmounting
+ log.warn("Found stale mount point at {0}".format(self.mountpoint))
+ return True
+ else:
+ # This happens if the mount directory doesn't exist
+ log.info('mount point does not exist: %s', self.mountpoint)
+ return False
+
+ fstype = proc.stdout.getvalue().rstrip('\n')
+ if fstype == 'fuseblk':
+ log.info('ceph-fuse is mounted on %s', self.mountpoint)
+ return True
+ else:
+ log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format(
+ fstype=fstype))
+ return False
+
+ def wait_until_mounted(self):
+ """
+ Check to make sure that fuse is mounted on mountpoint. If not,
+ sleep for 5 seconds and check again.
+ """
+
+ while not self.is_mounted():
+ # Even if it's not mounted, it should at least
+ # be running: catch simple failures where it has terminated.
+ assert not self.fuse_daemon.poll()
+
+ time.sleep(5)
+
+ # Now that we're mounted, set permissions so that the rest of the test will have
+ # unrestricted access to the filesystem mount.
+ self.client_remote.run(
+ args=['sudo', 'chmod', '1777', self.mountpoint])
+
+ def _mountpoint_exists(self):
+ return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False).exitstatus == 0
+
+ def umount(self):
+ try:
+ log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name))
+ self.client_remote.run(
+ args=[
+ 'sudo',
+ 'fusermount',
+ '-u',
+ self.mountpoint,
+ ],
+ )
+ except run.CommandFailedError:
+ log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name))
+
+ # abort the fuse mount, killing all hung processes
+ if self._fuse_conn:
+ self.run_python(dedent("""
+ import os
+ path = "/sys/fs/fuse/connections/{0}/abort"
+ if os.path.exists(path):
+ open(path, "w").write("1")
+ """).format(self._fuse_conn))
+ self._fuse_conn = None
+
+ stderr = StringIO()
+ try:
+ # make sure its unmounted
+ self.client_remote.run(
+ args=[
+ 'sudo',
+ 'umount',
+ '-l',
+ '-f',
+ self.mountpoint,
+ ],
+ stderr=stderr
+ )
+ except CommandFailedError:
+ if self.is_mounted():
+ raise
+
+ assert not self.is_mounted()
+ self._fuse_conn = None
+
+ def umount_wait(self, force=False, require_clean=False):
+ """
+ :param force: Complete cleanly even if the MDS is offline
+ """
+ if force:
+ assert not require_clean # mutually exclusive
+
+ # When we expect to be forcing, kill the ceph-fuse process directly.
+ # This should avoid hitting the more aggressive fallback killing
+ # in umount() which can affect other mounts too.
+ self.fuse_daemon.stdin.close()
+
+ # However, we will still hit the aggressive wait if there is an ongoing
+ # mount -o remount (especially if the remount is stuck because MDSs
+ # are unavailable)
+
+ self.umount()
+
+ try:
+ if self.fuse_daemon:
+ # Permit a timeout, so that we do not block forever
+ run.wait([self.fuse_daemon], 900)
+ except MaxWhileTries:
+ log.error("process failed to terminate after unmount. This probably"
+ "indicates a bug within ceph-fuse.")
+ raise
+ except CommandFailedError:
+ if require_clean:
+ raise
+
+ self.cleanup()
+
+ def cleanup(self):
+ """
+ Remove the mount point.
+
+ Prerequisite: the client is not mounted.
+ """
+ stderr = StringIO()
+ try:
+ self.client_remote.run(
+ args=[
+ 'rmdir',
+ '--',
+ self.mountpoint,
+ ],
+ stderr=stderr
+ )
+ except CommandFailedError:
+ if "No such file or directory" in stderr.getvalue():
+ pass
+ else:
+ raise
+
+ def kill(self):
+ """
+ Terminate the client without removing the mount point.
+ """
+ self.fuse_daemon.stdin.close()
+ try:
+ self.fuse_daemon.wait()
+ except CommandFailedError:
+ pass
+
+ def kill_cleanup(self):
+ """
+ Follow up ``kill`` to get to a clean unmounted state.
+ """
+ self.umount()
+ self.cleanup()
+
+ def teardown(self):
+ """
+ Whatever the state of the mount, get it gone.
+ """
+ super(FuseMount, self).teardown()
+
+ self.umount()
+
+ if self.fuse_daemon and not self.fuse_daemon.finished:
+ self.fuse_daemon.stdin.close()
+ try:
+ self.fuse_daemon.wait()
+ except CommandFailedError:
+ pass
+
+ # Indiscriminate, unlike the touchier cleanup()
+ self.client_remote.run(
+ args=[
+ 'rm',
+ '-rf',
+ self.mountpoint,
+ ],
+ )
+
+ def _asok_path(self):
+ return "/var/run/ceph/ceph-client.{0}.*.asok".format(self.client_id)
+
+ @property
+ def _prefix(self):
+ return ""
+
+ def admin_socket(self, args):
+ pyscript = """
+import glob
+import re
+import os
+import subprocess
+
+def find_socket(client_name):
+ asok_path = "{asok_path}"
+ files = glob.glob(asok_path)
+
+ # Given a non-glob path, it better be there
+ if "*" not in asok_path:
+ assert(len(files) == 1)
+ return files[0]
+
+ for f in files:
+ pid = re.match(".*\.(\d+)\.asok$", f).group(1)
+ if os.path.exists("/proc/{{0}}".format(pid)):
+ return f
+ raise RuntimeError("Client socket {{0}} not found".format(client_name))
+
+print find_socket("{client_name}")
+""".format(
+ asok_path=self._asok_path(),
+ client_name="client.{0}".format(self.client_id))
+
+ # Find the admin socket
+ p = self.client_remote.run(args=[
+ 'python', '-c', pyscript
+ ], stdout=StringIO())
+ asok_path = p.stdout.getvalue().strip()
+ log.info("Found client admin socket at {0}".format(asok_path))
+
+ # Query client ID from admin socket
+ p = self.client_remote.run(
+ args=['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args,
+ stdout=StringIO())
+ return json.loads(p.stdout.getvalue())
+
+ def get_global_id(self):
+ """
+ Look up the CephFS client ID for this mount
+ """
+
+ return self.admin_socket(['mds_sessions'])['id']
+
+ def get_osd_epoch(self):
+ """
+ Return 2-tuple of osd_epoch, osd_epoch_barrier
+ """
+ status = self.admin_socket(['status'])
+ return status['osd_epoch'], status['osd_epoch_barrier']
+
+ def get_dentry_count(self):
+ """
+ Return 2-tuple of dentry_count, dentry_pinned_count
+ """
+ status = self.admin_socket(['status'])
+ return status['dentry_count'], status['dentry_pinned_count']
+
+ def set_cache_size(self, size):
+ return self.admin_socket(['config', 'set', 'client_cache_size', str(size)])
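+ # Illustrative sketch (not part of the original change): admin_socket() is the basis
+ # of the convenience accessors above, so tests can also issue arbitrary client
+ # admin-socket commands, e.g. with a FuseMount instance `mount`:
+ #
+ #   sessions = mount.admin_socket(['mds_sessions'])
+ #   log.info("client global id: {0}".format(sessions['id']))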
--- /dev/null
+from StringIO import StringIO
+import json
+import logging
+from textwrap import dedent
+from teuthology.orchestra.run import CommandFailedError
+from teuthology import misc
+
+from teuthology.orchestra import remote as orchestra_remote
+from teuthology.orchestra import run
+from .mount import CephFSMount
+
+log = logging.getLogger(__name__)
+
+
+class KernelMount(CephFSMount):
+ def __init__(self, mons, test_dir, client_id, client_remote,
+ ipmi_user, ipmi_password, ipmi_domain):
+ super(KernelMount, self).__init__(test_dir, client_id, client_remote)
+ self.mons = mons
+
+ self.mounted = False
+ self.ipmi_user = ipmi_user
+ self.ipmi_password = ipmi_password
+ self.ipmi_domain = ipmi_domain
+
+ def write_secret_file(self, remote, role, keyring, filename):
+ """
+ Extract the key for `role` from the keyring and stash it in the specified filename.
+ """
+ remote.run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+ 'ceph-authtool',
+ '--name={role}'.format(role=role),
+ '--print-key',
+ keyring,
+ run.Raw('>'),
+ filename,
+ ],
+ )
+
+ def mount(self, mount_path=None, mount_fs_name=None):
+ log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format(
+ id=self.client_id, remote=self.client_remote, mnt=self.mountpoint))
+
+ keyring = self.get_keyring_path()
+ secret = '{tdir}/ceph.data/client.{id}.secret'.format(tdir=self.test_dir, id=self.client_id)
+ self.write_secret_file(self.client_remote, 'client.{id}'.format(id=self.client_id),
+ keyring, secret)
+
+ self.client_remote.run(
+ args=[
+ 'mkdir',
+ '--',
+ self.mountpoint,
+ ],
+ )
+
+ if mount_path is None:
+ mount_path = "/"
+
+ opts = 'name={id},secretfile={secret},norequire_active_mds'.format(id=self.client_id,
+ secret=secret)
+
+ if mount_fs_name is not None:
+ opts += ",mds_namespace={0}".format(mount_fs_name)
+
+ self.client_remote.run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+ '/sbin/mount.ceph',
+ '{mons}:{mount_path}'.format(mons=','.join(self.mons), mount_path=mount_path),
+ self.mountpoint,
+ '-v',
+ '-o',
+ opts
+ ],
+ )
+
+ self.client_remote.run(
+ args=['sudo', 'chmod', '1777', self.mountpoint])
+
+ self.mounted = True
+
+ def umount(self):
+ log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
+ self.client_remote.run(
+ args=[
+ 'sudo',
+ 'umount',
+ self.mountpoint,
+ ],
+ )
+ self.client_remote.run(
+ args=[
+ 'rmdir',
+ '--',
+ self.mountpoint,
+ ],
+ )
+ self.mounted = False
+
+ def cleanup(self):
+ pass
+
+ def umount_wait(self, force=False, require_clean=False):
+ """
+ Unlike the fuse client, the kernel client's umount is immediate
+ """
+ if not self.is_mounted():
+ return
+
+ try:
+ self.umount()
+ except CommandFailedError:
+ if not force:
+ raise
+
+ self.kill()
+ self.kill_cleanup()
+
+ self.mounted = False
+
+ def is_mounted(self):
+ return self.mounted
+
+ def wait_until_mounted(self):
+ """
+ Unlike the fuse client, the kernel client is up and running as soon
+ as the initial mount() function returns.
+ """
+ assert self.mounted
+
+ def teardown(self):
+ super(KernelMount, self).teardown()
+ if self.mounted:
+ self.umount()
+
+ def kill(self):
+ """
+ The Ceph kernel client doesn't have a mechanism to kill itself (doing
+ that inside the kernel would be weird anyway), so we reboot the whole node
+ to get the same effect.
+
+ We use IPMI to reboot, because we don't want the client to send any
+ releases of capabilities.
+ """
+
+ con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
+ self.ipmi_user,
+ self.ipmi_password,
+ self.ipmi_domain)
+ con.power_off()
+
+ self.mounted = False
+
+ def kill_cleanup(self):
+ assert not self.mounted
+
+ con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
+ self.ipmi_user,
+ self.ipmi_password,
+ self.ipmi_domain)
+ con.power_on()
+
+ # Wait for node to come back up after reboot
+ misc.reconnect(None, 300, [self.client_remote])
+
+ # Remove mount directory
+ self.client_remote.run(
+ args=[
+ 'rmdir',
+ '--',
+ self.mountpoint,
+ ],
+ )
+
+ def _find_debug_dir(self):
+ """
+ Find the debugfs folder for this mount
+ """
+ pyscript = dedent("""
+ import glob
+ import os
+ import json
+
+ def get_id_to_dir():
+ result = {}
+ for dir in glob.glob("/sys/kernel/debug/ceph/*"):
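+ # Each directory under /sys/kernel/debug/ceph corresponds to one mount;
+ # its mds_sessions file carries the quoted client id on its second line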
+ mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines()
+ client_id = mds_sessions_lines[1].split()[1].strip('"')
+
+ result[client_id] = dir
+ return result
+
+ print json.dumps(get_id_to_dir())
+ """)
+
+ p = self.client_remote.run(args=[
+ 'sudo', 'python', '-c', pyscript
+ ], stdout=StringIO())
+ client_id_to_dir = json.loads(p.stdout.getvalue())
+
+ try:
+ return client_id_to_dir[self.client_id]
+ except KeyError:
+ log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format(
+ self.client_id, ",".join(client_id_to_dir.keys())
+ ))
+ raise
+
+ def _read_debug_file(self, filename):
+ debug_dir = self._find_debug_dir()
+
+ pyscript = dedent("""
+ import os
+
+ print open(os.path.join("{debug_dir}", "{filename}")).read()
+ """).format(debug_dir=debug_dir, filename=filename)
+
+ p = self.client_remote.run(args=[
+ 'sudo', 'python', '-c', pyscript
+ ], stdout=StringIO())
+ return p.stdout.getvalue()
+
+ def get_global_id(self):
+ """
+ Look up the CephFS client ID for this mount, using debugfs.
+ """
+
+ assert self.mounted
+
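+ # The client's global id is the second field on the first line of mds_sessions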
+ mds_sessions = self._read_debug_file("mds_sessions")
+ lines = mds_sessions.split("\n")
+ return int(lines[0].split()[1])
+
+ def get_osd_epoch(self):
+ """
+ Return 2-tuple of osd_epoch, osd_epoch_barrier
+ """
+ osd_map = self._read_debug_file("osdmap")
+ lines = osd_map.split("\n")
+ epoch = int(lines[0].split()[1])
+
+ mds_sessions = self._read_debug_file("mds_sessions")
+ lines = mds_sessions.split("\n")
+ epoch_barrier = int(lines[2].split()[1].strip('"'))
+
+ return epoch, epoch_barrier
--- /dev/null
+from contextlib import contextmanager
+import json
+import logging
+import datetime
+import time
+from textwrap import dedent
+import os
+from StringIO import StringIO
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
+
+log = logging.getLogger(__name__)
+
+
+class CephFSMount(object):
+ def __init__(self, test_dir, client_id, client_remote):
+ """
+ :param test_dir: Global teuthology test dir
+ :param client_id: Client ID, the 'foo' in client.foo
+ :param client_remote: Remote instance for the host where client will run
+ """
+
+ self.test_dir = test_dir
+ self.client_id = client_id
+ self.client_remote = client_remote
+ self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id)
+
+ self.test_files = ['a', 'b', 'c']
+
+ self.background_procs = []
+
+ @property
+ def mountpoint(self):
+ return os.path.join(
+ self.test_dir, '{dir_name}'.format(dir_name=self.mountpoint_dir_name))
+
+ def is_mounted(self):
+ raise NotImplementedError()
+
+ def mount(self, mount_path=None, mount_fs_name=None):
+ raise NotImplementedError()
+
+ def umount(self):
+ raise NotImplementedError()
+
+ def umount_wait(self, force=False, require_clean=False):
+ """
+
+ :param force: Expect that the mount will not shutdown cleanly: kill
+ it hard.
+ :param require_clean: Wait for the Ceph client associated with the
+ mount (e.g. ceph-fuse) to terminate, and
+ raise if it doesn't do so cleanly.
+ :return:
+ """
+ raise NotImplementedError()
+
+ def kill_cleanup(self):
+ raise NotImplementedError()
+
+ def kill(self):
+ raise NotImplementedError()
+
+ def cleanup(self):
+ raise NotImplementedError()
+
+ def wait_until_mounted(self):
+ raise NotImplementedError()
+
+ def get_keyring_path(self):
+ return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)
+
+ @property
+ def config_path(self):
+ """
+ Path to ceph.conf: override this if you're not a normal systemwide ceph install
+ :return: string
+ """
+ return "/etc/ceph/ceph.conf"
+
+ @contextmanager
+ def mounted(self):
+ """
+ A context manager: starting from an unmounted state, mount the
+ filesystem, yield to the caller, then unmount and clean up.
+ """
+ self.mount()
+ self.wait_until_mounted()
+ try:
+ yield
+ finally:
+ self.umount_wait()
+
+ def create_files(self):
+ assert(self.is_mounted())
+
+ for suffix in self.test_files:
+ log.info("Creating file {0}".format(suffix))
+ self.client_remote.run(args=[
+ 'sudo', 'touch', os.path.join(self.mountpoint, suffix)
+ ])
+
+ def check_files(self):
+ assert(self.is_mounted())
+
+ for suffix in self.test_files:
+ log.info("Checking file {0}".format(suffix))
+ r = self.client_remote.run(args=[
+ 'sudo', 'ls', os.path.join(self.mountpoint, suffix)
+ ], check_status=False)
+ if r.exitstatus != 0:
+ raise RuntimeError("Expected file {0} not found".format(suffix))
+
+ def create_destroy(self):
+ assert(self.is_mounted())
+
+ filename = "{0} {1}".format(datetime.datetime.now(), self.client_id)
+ log.debug("Creating test file {0}".format(filename))
+ self.client_remote.run(args=[
+ 'sudo', 'touch', os.path.join(self.mountpoint, filename)
+ ])
+ log.debug("Deleting test file {0}".format(filename))
+ self.client_remote.run(args=[
+ 'sudo', 'rm', '-f', os.path.join(self.mountpoint, filename)
+ ])
+
+ def _run_python(self, pyscript):
+ return self.client_remote.run(args=[
+ 'sudo', 'adjust-ulimits', 'daemon-helper', 'kill', 'python', '-c', pyscript
+ ], wait=False, stdin=run.PIPE, stdout=StringIO())
+
+ def run_python(self, pyscript):
+ p = self._run_python(pyscript)
+ p.wait()
+ return p.stdout.getvalue().strip()
+
+ def run_shell(self, args, wait=True):
+ args = ["cd", self.mountpoint, run.Raw('&&'), "sudo"] + args
+ return self.client_remote.run(args=args, stdout=StringIO(), wait=wait)
+
+ def open_no_data(self, basename):
+ """
+ A pure metadata operation
+ """
+ assert(self.is_mounted())
+
+ path = os.path.join(self.mountpoint, basename)
+
+ p = self._run_python(dedent(
+ """
+ f = open("{path}", 'w')
+ """.format(path=path)
+ ))
+ p.wait()
+
+ def open_background(self, basename="background_file"):
+ """
+ Open a file for writing, then block such that the client
+ will hold a capability
+ """
+ assert(self.is_mounted())
+
+ path = os.path.join(self.mountpoint, basename)
+
+ pyscript = dedent("""
+ import time
+
+ f = open("{path}", 'w')
+ f.write('content')
+ f.flush()
+ f.write('content2')
+ while True:
+ time.sleep(1)
+ """).format(path=path)
+
+ rproc = self._run_python(pyscript)
+ self.background_procs.append(rproc)
+ return rproc
+
+ def wait_for_visible(self, basename="background_file", timeout=30):
+ i = 0
+ while i < timeout:
+ r = self.client_remote.run(args=[
+ 'sudo', 'ls', os.path.join(self.mountpoint, basename)
+ ], check_status=False)
+ if r.exitstatus == 0:
+ log.debug("File {0} became visible from {1} after {2}s".format(
+ basename, self.client_id, i))
+ return
+ else:
+ time.sleep(1)
+ i += 1
+
+ raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
+ i, basename, self.client_id))
+
+ def lock_background(self, basename="background_file", do_flock=True):
+ """
+ Open and lock files for writing, holding the locks in a background process
+ """
+ assert(self.is_mounted())
+
+ path = os.path.join(self.mountpoint, basename)
+
+ script_builder = """
+ import time
+ import fcntl
+ import struct"""
+ if do_flock:
+ script_builder += """
+ f1 = open("{path}-1", 'w')
+ fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
+ script_builder += """
+ f2 = open("{path}-2", 'w')
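+ # Pack a struct flock (l_type=F_WRLCK, zeroed offsets) to take a whole-file POSIX write lock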
+ lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+ fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+ while True:
+ time.sleep(1)
+ """
+
+ pyscript = dedent(script_builder).format(path=path)
+
+ log.info("lock file {0}".format(basename))
+ rproc = self._run_python(pyscript)
+ self.background_procs.append(rproc)
+ return rproc
+
+ def check_filelock(self, basename="background_file", do_flock=True):
+ assert(self.is_mounted())
+
+ path = os.path.join(self.mountpoint, basename)
+
+ script_builder = """
+ import fcntl
+ import errno
+ import struct"""
+ if do_flock:
+ script_builder += """
+ f1 = open("{path}-1", 'r')
+ try:
+ fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
+ except IOError, e:
+ if e.errno == errno.EAGAIN:
+ pass
+ else:
+ raise RuntimeError("flock on file {path}-1 not found")"""
+ script_builder += """
+ f2 = open("{path}-2", 'r')
+ try:
+ lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+ fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+ except IOError, e:
+ if e.errno == errno.EAGAIN:
+ pass
+ else:
+ raise RuntimeError("posix lock on file {path}-2 not found")
+ """
+ pyscript = dedent(script_builder).format(path=path)
+
+ log.info("check lock on file {0}".format(basename))
+ self.client_remote.run(args=[
+ 'sudo', 'python', '-c', pyscript
+ ])
+
+ def write_background(self, basename="background_file", loop=False):
+ """
+ Open a file and write to it in the background; if loop is True, keep
+ writing until the process is killed.
+ :param basename: name of the file to create under the mountpoint
+ :return: the background RemoteProcess
+ """
+ assert(self.is_mounted())
+
+ path = os.path.join(self.mountpoint, basename)
+
+ pyscript = dedent("""
+ import os
+ import time
+
+ fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0644)
+ try:
+ while True:
+ os.write(fd, 'content')
+ time.sleep(1)
+ if not {loop}:
+ break
+ except IOError, e:
+ pass
+ os.close(fd)
+ """).format(path=path, loop=str(loop))
+
+ rproc = self._run_python(pyscript)
+ self.background_procs.append(rproc)
+ return rproc
+
+ def write_n_mb(self, filename, n_mb, seek=0, wait=True):
+ """
+ Write the requested number of megabytes to a file
+ """
+ assert(self.is_mounted())
+
+ return self.run_shell(["dd", "if=/dev/urandom", "of={0}".format(filename),
+ "bs=1M", "conv=fdatasync",
+ "count={0}".format(n_mb),
+ "seek={0}".format(seek)
+ ], wait=wait)
+
+ def write_test_pattern(self, filename, size):
+ log.info("Writing {0} bytes to {1}".format(size, filename))
+ return self.run_python(dedent("""
+ import zlib
+ path = "{path}"
+ f = open(path, 'w')
+ for i in range(0, {size}):
+ val = zlib.crc32("%s" % i) & 7
+ f.write(chr(val))
+ f.close()
+ """.format(
+ path=os.path.join(self.mountpoint, filename),
+ size=size
+ )))
+
+ def validate_test_pattern(self, filename, size):
+ log.info("Validating {0} bytes from {1}".format(size, filename))
+ return self.run_python(dedent("""
+ import zlib
+ path = "{path}"
+ f = open(path, 'r')
+ bytes = f.read()
+ f.close()
+ if len(bytes) != {size}:
+ raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format(
+ len(bytes), {size}
+ ))
+ for i, b in enumerate(bytes):
+ val = zlib.crc32("%s" % i) & 7
+ if b != chr(val):
+ raise RuntimeError("Bad data at offset {{0}}".format(i))
+ """.format(
+ path=os.path.join(self.mountpoint, filename),
+ size=size
+ )))
+
+ def open_n_background(self, fs_path, count):
+ """
+ Open N files for writing, hold them open in a background process
+
+ :param fs_path: Path relative to CephFS root, e.g. "foo/bar"
+ :return: a RemoteProcess
+ """
+ assert(self.is_mounted())
+
+ abs_path = os.path.join(self.mountpoint, fs_path)
+
+ pyscript = dedent("""
+ import sys
+ import time
+ import os
+
+ n = {count}
+ abs_path = "{abs_path}"
+
+ if not os.path.exists(os.path.dirname(abs_path)):
+ os.makedirs(os.path.dirname(abs_path))
+
+ handles = []
+ for i in range(0, n):
+ fname = "{{0}}_{{1}}".format(abs_path, i)
+ handles.append(open(fname, 'w'))
+
+ while True:
+ time.sleep(1)
+ """).format(abs_path=abs_path, count=count)
+
+ rproc = self._run_python(pyscript)
+ self.background_procs.append(rproc)
+ return rproc
+
+ def create_n_files(self, fs_path, count, sync=False):
+ assert(self.is_mounted())
+
+ abs_path = os.path.join(self.mountpoint, fs_path)
+
+ pyscript = dedent("""
+ import sys
+ import time
+ import os
+
+ n = {count}
+ abs_path = "{abs_path}"
+
+ if not os.path.exists(os.path.dirname(abs_path)):
+ os.makedirs(os.path.dirname(abs_path))
+
+ for i in range(0, n):
+ fname = "{{0}}_{{1}}".format(abs_path, i)
+ h = open(fname, 'w')
+ h.write('content')
+ if {sync}:
+ h.flush()
+ os.fsync(h.fileno())
+ h.close()
+ """).format(abs_path=abs_path, count=count, sync=str(sync))
+
+ self.run_python(pyscript)
+
+ def teardown(self):
+ for p in self.background_procs:
+ log.info("Terminating background process")
+ self._kill_background(p)
+
+ self.background_procs = []
+
+ def _kill_background(self, p):
+ if p.stdin:
+ p.stdin.close()
+ try:
+ p.wait()
+ except (CommandFailedError, ConnectionLostError):
+ pass
+
+ def kill_background(self, p):
+ """
+ For a process that was returned by one of the *_background helpers,
+ kill it hard.
+ """
+ self._kill_background(p)
+ self.background_procs.remove(p)
+
+ def spam_dir_background(self, path):
+ """
+ Create directory `path` and do lots of metadata operations
+ in it until further notice.
+ """
+ assert(self.is_mounted())
+ abs_path = os.path.join(self.mountpoint, path)
+
+ pyscript = dedent("""
+ import sys
+ import time
+ import os
+
+ abs_path = "{abs_path}"
+
+ if not os.path.exists(abs_path):
+ os.makedirs(abs_path)
+
+ n = 0
+ while True:
+ file_path = os.path.join(abs_path, "tmp%d" % n)
+ f = open(file_path, 'w')
+ f.close()
+ n = n + 1
+ """).format(abs_path=abs_path)
+
+ rproc = self._run_python(pyscript)
+ self.background_procs.append(rproc)
+ return rproc
+
+ def get_global_id(self):
+ raise NotImplementedError()
+
+ def get_osd_epoch(self):
+ raise NotImplementedError()
+
+ def stat(self, fs_path, wait=True):
+ """
+ stat a file, and return the result as a dictionary like this:
+ {
+ "st_ctime": 1414161137.0,
+ "st_mtime": 1414161137.0,
+ "st_nlink": 33,
+ "st_gid": 0,
+ "st_dev": 16777218,
+ "st_size": 1190,
+ "st_ino": 2,
+ "st_uid": 0,
+ "st_mode": 16877,
+ "st_atime": 1431520593.0
+ }
+
+ Raises exception on absent file.
+ """
+ abs_path = os.path.join(self.mountpoint, fs_path)
+
+ pyscript = dedent("""
+ import os
+ import stat
+ import json
+ import sys
+
+ try:
+ s = os.stat("{path}")
+ except OSError as e:
+ sys.exit(e.errno)
+
+ attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"]
+ print json.dumps(
+ dict([(a, getattr(s, a)) for a in attrs]),
+ indent=2)
+ """).format(path=abs_path)
+ proc = self._run_python(pyscript)
+ if wait:
+ proc.wait()
+ return json.loads(proc.stdout.getvalue().strip())
+ else:
+ return proc
+
+ def touch(self, fs_path):
+ """
+ Create a dentry if it doesn't already exist. This python
+ implementation exists because the usual command line tool doesn't
+ pass through error codes like EIO.
+
+ :param fs_path:
+ :return:
+ """
+ abs_path = os.path.join(self.mountpoint, fs_path)
+ pyscript = dedent("""
+ import sys
+ import errno
+
+ try:
+ f = open("{path}", "w")
+ f.close()
+ except IOError as e:
+ sys.exit(errno.EIO)
+ """).format(path=abs_path)
+ proc = self._run_python(pyscript)
+ proc.wait()
+
+ def path_to_ino(self, fs_path, follow_symlinks=True):
+ abs_path = os.path.join(self.mountpoint, fs_path)
+
+ if follow_symlinks:
+ pyscript = dedent("""
+ import os
+ import stat
+
+ print os.stat("{path}").st_ino
+ """).format(path=abs_path)
+ else:
+ pyscript = dedent("""
+ import os
+ import stat
+
+ print os.lstat("{path}").st_ino
+ """).format(path=abs_path)
+
+ proc = self._run_python(pyscript)
+ proc.wait()
+ return int(proc.stdout.getvalue().strip())
+
+ def path_to_nlink(self, fs_path):
+ abs_path = os.path.join(self.mountpoint, fs_path)
+
+ pyscript = dedent("""
+ import os
+ import stat
+
+ print os.stat("{path}").st_nlink
+ """).format(path=abs_path)
+
+ proc = self._run_python(pyscript)
+ proc.wait()
+ return int(proc.stdout.getvalue().strip())
+
+ def ls(self, path=None):
+ """
+ Wrap ls: return a list of strings
+ """
+ cmd = ["ls"]
+ if path:
+ cmd.append(path)
+
+ ls_text = self.run_shell(cmd).stdout.getvalue().strip()
+
+ if ls_text:
+ return ls_text.split("\n")
+ else:
+ # Special case because otherwise split on empty string
+ # gives you [''] instead of []
+ return []
+
+ def getfattr(self, path, attr):
+ """
+ Wrap getfattr: return the value of a named xattr on one file.
+
+ :return: a string
+ """
+ p = self.run_shell(["getfattr", "--only-values", "-n", attr, path])
+ return p.stdout.getvalue()
+
+ def df(self):
+ """
+ Wrap df: return a dict of usage fields in bytes
+ """
+
+ p = self.run_shell(["df", "-B1", "."])
+ lines = p.stdout.getvalue().strip().split("\n")
+ fs, total, used, avail = lines[1].split()[:4]
+ log.warn(lines)
+
+ return {
+ "total": int(total),
+ "used": int(used),
+ "available": int(avail)
+ }
--- /dev/null
+
+"""
+Exercise the MDS's auto repair functions
+"""
+
+import logging
+import time
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+
+class TestMDSAutoRepair(CephFSTestCase):
+ def test_backtrace_repair(self):
+ """
+ MDS should verify/fix backtrace on fetch dirfrag
+ """
+
+ self.mount_a.run_shell(["mkdir", "testdir1"])
+ self.mount_a.run_shell(["touch", "testdir1/testfile"])
+ dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino("testdir1"))
+
+ # drop inodes caps
+ self.mount_a.umount_wait()
+
+ # flush journal entries to dirfrag objects, and expire journal
+ self.fs.mds_asok(['flush', 'journal'])
+
+ # Restart the MDS to drop the metadata cache (because we expired the journal,
+ # nothing gets replayed into cache on restart)
+ self.fs.mds_stop()
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ # remove testdir1's backtrace
+ self.fs.rados(["rmxattr", dir_objname, "parent"])
+
+ # readdir (fetch dirfrag) should fix testdir1's backtrace
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ self.mount_a.run_shell(["ls", "testdir1"])
+
+ # flush journal entries to dirfrag objects
+ self.fs.mds_asok(['flush', 'journal'])
+
+ # check if backtrace exists
+ self.fs.rados(["getxattr", dir_objname, "parent"])
+
+ def test_mds_readonly(self):
+ """
+ Test that the MDS behaves correctly when it is read-only
+ """
+ # operations should succeed while the MDS is not read-only
+ self.mount_a.run_shell(["touch", "test_file1"])
+ writer = self.mount_a.write_background(loop=True)
+
+ time.sleep(10)
+ self.assertFalse(writer.finished)
+
+ # force MDS to read-only mode
+ self.fs.mds_asok(['force_readonly'])
+ time.sleep(10)
+
+ # touching test file should fail
+ try:
+ self.mount_a.run_shell(["touch", "test_file1"])
+ except CommandFailedError:
+ pass
+ else:
+ self.fail("expected touch to fail while the MDS is read-only")
+
+ # background writer also should fail
+ self.assertTrue(writer.finished)
+
+ # The MDS should report its readonly health state to the mon
+ self.wait_for_health("MDS in read-only mode", timeout=30)
+
+ # restart mds to make it writable
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ self.wait_for_health_clear(timeout=30)
--- /dev/null
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+class TestBacktrace(CephFSTestCase):
+ def test_backtrace(self):
+ """
+ That the 'parent' and 'layout' xattrs on the head objects of files
+ are updated correctly.
+ """
+
+ old_data_pool_name = self.fs.get_data_pool_name()
+ old_pool_id = self.fs.get_data_pool_id()
+
+ # Create a file for subsequent checks
+ self.mount_a.run_shell(["mkdir", "parent_a"])
+ self.mount_a.run_shell(["touch", "parent_a/alpha"])
+ file_ino = self.mount_a.path_to_ino("parent_a/alpha")
+
+ # That backtrace and layout are written after initial flush
+ self.fs.mds_asok(["flush", "journal"])
+ backtrace = self.fs.read_backtrace(file_ino)
+ self.assertEqual(['alpha', 'parent_a'], [a['dname'] for a in backtrace['ancestors']])
+ layout = self.fs.read_layout(file_ino)
+ self.assertDictEqual(layout, {
+ "stripe_unit": 4194304,
+ "stripe_count": 1,
+ "object_size": 4194304,
+ "pool_id": old_pool_id,
+ "pool_ns": "",
+ })
+ self.assertEqual(backtrace['pool'], old_pool_id)
+
+ # That backtrace is written after parentage changes
+ self.mount_a.run_shell(["mkdir", "parent_b"])
+ self.mount_a.run_shell(["mv", "parent_a/alpha", "parent_b/alpha"])
+
+ self.fs.mds_asok(["flush", "journal"])
+ backtrace = self.fs.read_backtrace(file_ino)
+ self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace['ancestors']])
+
+ # Create a new data pool
+ new_pool_name = "data_new"
+ new_pool_id = self.fs.add_data_pool(new_pool_name)
+
+ # That an object which has switched pools gets its backtrace updated
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.file.layout.pool", "-v", new_pool_name, "./parent_b/alpha"])
+ self.fs.mds_asok(["flush", "journal"])
+ backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
+ self.assertEqual(backtrace_old_pool['pool'], new_pool_id)
+ backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
+ self.assertEqual(backtrace_new_pool['pool'], new_pool_id)
+ new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name)
+ self.assertEqual(new_pool_layout['pool_id'], new_pool_id)
+ self.assertEqual(new_pool_layout['pool_ns'], '')
+
+ # That subsequent linkage changes are only written to new pool backtrace
+ self.mount_a.run_shell(["mkdir", "parent_c"])
+ self.mount_a.run_shell(["mv", "parent_b/alpha", "parent_c/alpha"])
+ self.fs.mds_asok(["flush", "journal"])
+ backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
+ self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace_old_pool['ancestors']])
+ backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
+ self.assertEqual(['alpha', 'parent_c'], [a['dname'] for a in backtrace_new_pool['ancestors']])
+
+ # That layout is written to new pool after change to other field in layout
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.file.layout.object_size", "-v", "8388608", "./parent_c/alpha"])
+
+ self.fs.mds_asok(["flush", "journal"])
+ new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name)
+ self.assertEqual(new_pool_layout['object_size'], 8388608)
+
+ # ...but not to the old pool: the old pool's backtrace points to the new pool, and that's enough,
+ # we don't update the layout in all the old pools whenever it changes
+ old_pool_layout = self.fs.read_layout(file_ino, pool=old_data_pool_name)
+ self.assertEqual(old_pool_layout['object_size'], 4194304)
--- /dev/null
+
+import os
+import time
+from textwrap import dedent
+from unittest import SkipTest
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+class TestCapFlush(CephFSTestCase):
+ @for_teuthology
+ def test_replay_create(self):
+ """
+ MDS starts to handle client caps when it enters clientreplay stage.
+ When handling a client cap in clientreplay stage, it's possible that
+ the corresponding inode does not exist because the client request which
+ creates the inode hasn't been replayed yet.
+ """
+
+ if not isinstance(self.mount_a, FuseMount):
+ raise SkipTest("Require FUSE client to inject client release failure")
+
+ dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
+ py_script = dedent("""
+ import os
+ os.mkdir("{0}")
+ fd = os.open("{0}", os.O_RDONLY)
+ os.fchmod(fd, 0777)
+ os.fsync(fd)
+ """).format(dir_path)
+ self.mount_a.run_python(py_script)
+
+ self.fs.mds_asok(["flush", "journal"])
+
+ # with the MDS log paused, the client will only get unsafe replies
+ self.fs.mds_asok(["config", "set", "mds_log_pause", "1"])
+
+ file_name = "testfile"
+ file_path = dir_path + "/" + file_name
+
+ # Create a file and modify its mode. ceph-fuse will mark Ax cap dirty
+ py_script = dedent("""
+ import os
+ os.chdir("{0}")
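+ # Drop to uid/gid 65534 (nobody) so the file is created by a non-root user;
+ # the later chmod via sudo then comes from a different user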
+ os.setgid(65534)
+ os.setuid(65534)
+ fd = os.open("{1}", os.O_CREAT | os.O_RDWR, 0644)
+ os.fchmod(fd, 0640)
+ """).format(dir_path, file_name)
+ self.mount_a.run_python(py_script)
+
+ # Modify file mode by different user. ceph-fuse will send a setattr request
+ self.mount_a.run_shell(["chmod", "600", file_path], wait=False)
+
+ time.sleep(10)
+
+ # Restart mds. Client will re-send the unsafe request and cap flush
+ self.fs.mds_stop()
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip()
+ # If the cap flush got dropped, the mode would still be 0644.
+ # (Ax cap stays in dirty state, which prevents setattr reply from updating file mode)
+ self.assertEqual(mode, "600")
--- /dev/null
+
+"""
+Exercise the MDS's behaviour when clients and the MDCache reach or
+exceed the limits of how many caps/inodes they should hold.
+"""
+
+import logging
+from textwrap import dedent
+from unittest import SkipTest
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
+from tasks.cephfs.fuse_mount import FuseMount
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+# Hardcoded values from Server::recall_client_state
+CAP_RECALL_RATIO = 0.8
+CAP_RECALL_MIN = 100
+
+
+class TestClientLimits(CephFSTestCase):
+ REQUIRE_KCLIENT_REMOTE = True
+ CLIENTS_REQUIRED = 2
+
+ def _test_client_pin(self, use_subdir):
+ """
+ When a client pins an inode in its cache, for example because the file is held open,
+ it should reject requests from the MDS to trim these caps. The MDS should complain
+ to the user that it is unable to enforce its cache size limits because of this
+ objectionable client.
+
+ :param use_subdir: whether to put test files in a subdir or use root
+ """
+
+ cache_size = 100
+ open_files = 200
+
+ self.set_conf('mds', 'mds cache size', cache_size)
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ mount_a_client_id = self.mount_a.get_global_id()
+ path = "subdir/mount_a" if use_subdir else "mount_a"
+ open_proc = self.mount_a.open_n_background(path, open_files)
+
+ # Client should now hold:
+ # `open_files` caps for the open files
+ # 1 cap for root
+ # 1 cap for subdir
+ self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'],
+ open_files + (2 if use_subdir else 1),
+ timeout=600,
+ reject_fn=lambda x: x > open_files + 2)
+
+ # MDS should not be happy about that, as the client is failing to comply
+ # with the SESSION_RECALL messages it is being sent
+ mds_recall_state_timeout = int(self.fs.get_config("mds_recall_state_timeout"))
+ self.wait_for_health("failing to respond to cache pressure",
+ mds_recall_state_timeout + 10)
+
+ # We can also test that the MDS health warning for oversized
+ # cache is functioning as intended.
+ self.wait_for_health("Too many inodes in cache",
+ mds_recall_state_timeout + 10)
+
+ # When the client closes the files, it should retain only as many caps as allowed
+ # under the SESSION_RECALL policy
+ log.info("Terminating process holding files open")
+ open_proc.stdin.close()
+ try:
+ open_proc.wait()
+ except CommandFailedError:
+ # We killed it, so it raises an error
+ pass
+
+ # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
+ # which depend on the cache size and overall ratio
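+ # (the 0.8 factor below corresponds to CAP_RECALL_RATIO defined at the top of this file)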
+ self.wait_until_equal(
+ lambda: self.get_session(mount_a_client_id)['num_caps'],
+ int(cache_size * 0.8),
+ timeout=600,
+ reject_fn=lambda x: x < int(cache_size*.8))
+
+ @needs_trimming
+ def test_client_pin_root(self):
+ self._test_client_pin(False)
+
+ @needs_trimming
+ def test_client_pin(self):
+ self._test_client_pin(True)
+
+ def test_client_release_bug(self):
+ """
+ When a client has a bug (which we will simulate) preventing it from releasing caps,
+ the MDS should notice that releases are not being sent promptly, and generate a health
+ metric to that effect.
+ """
+
+ # The debug hook to inject the failure only exists in the fuse client
+ if not isinstance(self.mount_a, FuseMount):
+ raise SkipTest("Require FUSE client to inject client release failure")
+
+ self.set_conf('client.{0}'.format(self.mount_a.client_id), 'client inject release failure', 'true')
+ self.mount_a.teardown()
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ mount_a_client_id = self.mount_a.get_global_id()
+
+ # Client A creates a file. It will hold the write caps on the file, and later (via the simulated
+ # bug) fail to comply with the MDS's request to release that cap
+ self.mount_a.run_shell(["touch", "file1"])
+
+ # Client B tries to write to the file that client A created
+ rproc = self.mount_b.write_background("file1")
+
+ # After mds_revoke_cap_timeout, we should see a health warning (extra lag from
+ # MDS beacon period)
+ mds_revoke_cap_timeout = int(self.fs.get_config("mds_revoke_cap_timeout"))
+ self.wait_for_health("failing to respond to capability release", mds_revoke_cap_timeout + 10)
+
+ # Client B should still be stuck
+ self.assertFalse(rproc.finished)
+
+ # Kill client A
+ self.mount_a.kill()
+ self.mount_a.kill_cleanup()
+
+ # Client B should complete
+ self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+ rproc.wait()
+
+ def test_client_oldest_tid(self):
+ """
+ When a client does not advance its oldest tid, the MDS should notice that
+ and generate health warnings.
+ """
+
+ # num of requests client issues
+ max_requests = 1000
+
+ # The debug hook to inject the failure only exists in the fuse client
+ if not isinstance(self.mount_a, FuseMount):
+ raise SkipTest("Require FUSE client to inject client release failure")
+
+ self.set_conf('client', 'client inject fixed oldest tid', 'true')
+ self.mount_a.teardown()
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ self.fs.mds_asok(['config', 'set', 'mds_max_completed_requests', '{0}'.format(max_requests)])
+
+ # Create lots of files
+ self.mount_a.create_n_files("testdir/file1", max_requests + 100)
+
+ # Create a few files synchronously. This makes sure previous requests are completed
+ self.mount_a.create_n_files("testdir/file2", 5, True)
+
+ # Wait for the health warnings. Assume the MDS can handle at least 10 requests per second
+ self.wait_for_health("failing to advance its oldest client/flush tid", max_requests / 10)
+
+ def _test_client_cache_size(self, mount_subdir):
+ """
+ Check that the client invalidates its kernel dcache according to its cache size config
+ """
+
+ # The debug hook to inject the failure only exists in the fuse client
+ if not isinstance(self.mount_a, FuseMount):
+ raise SkipTest("Require FUSE client to inject client release failure")
+
+ if mount_subdir:
+ # fuse assigns a fixed inode number (1) to the root inode. But when mounting into a
+ # subdir, the actual inode number of the root is not 1. This mismatch
+ # confuses fuse_lowlevel_notify_inval_entry() when invalidating dentries
+ # in the root directory.
+ self.mount_a.run_shell(["mkdir", "subdir"])
+ self.mount_a.umount_wait()
+ self.set_conf('client', 'client mountpoint', '/subdir')
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ root_ino = self.mount_a.path_to_ino(".")
+ self.assertEqual(root_ino, 1)
+
+ dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
+
+ mkdir_script = dedent("""
+ import os
+ os.mkdir("{path}")
+ for n in range(0, {num_dirs}):
+ os.mkdir("{path}/dir{{0}}".format(n))
+ """)
+
+ num_dirs = 1000
+ self.mount_a.run_python(mkdir_script.format(path=dir_path, num_dirs=num_dirs))
+ self.mount_a.run_shell(["sync"])
+
+ dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
+ self.assertGreaterEqual(dentry_count, num_dirs)
+ self.assertGreaterEqual(dentry_pinned_count, num_dirs)
+
+ cache_size = num_dirs / 10
+ self.mount_a.set_cache_size(cache_size)
+
+ def trimmed():
+ dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
+ log.info("waiting, dentry_count, dentry_pinned_count: {0}, {1}".format(
+ dentry_count, dentry_pinned_count
+ ))
+ if dentry_count > cache_size or dentry_pinned_count > cache_size:
+ return False
+
+ return True
+
+ self.wait_until_true(trimmed, 30)
+
+ @needs_trimming
+ def test_client_cache_size(self):
+ self._test_client_cache_size(False)
+ self._test_client_cache_size(True)
--- /dev/null
+
+"""
+Teuthology task for exercising CephFS client recovery
+"""
+
+import logging
+from textwrap import dedent
+import time
+import distutils.version as version
+import re
+import os
+
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.packaging import get_package_version
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+
+class TestClientNetworkRecovery(CephFSTestCase):
+ REQUIRE_KCLIENT_REMOTE = True
+ REQUIRE_ONE_CLIENT_REMOTE = True
+ CLIENTS_REQUIRED = 2
+
+ LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
+
+ # Environment references
+ mds_session_timeout = None
+ mds_reconnect_timeout = None
+ ms_max_backoff = None
+
+ def test_network_death(self):
+ """
+ Simulate software freeze or temporary network failure.
+
+ Check that the client blocks I/O during failure, and completes
+ I/O after failure.
+ """
+
+ # We only need one client
+ self.mount_b.umount_wait()
+
+ # Initially our one client session should be visible
+ client_id = self.mount_a.get_global_id()
+ ls_data = self._session_list()
+ self.assert_session_count(1, ls_data)
+ self.assertEqual(ls_data[0]['id'], client_id)
+ self.assert_session_state(client_id, "open")
+
+ # ...and capable of doing I/O without blocking
+ self.mount_a.create_files()
+
+ # ...but if we turn off the network
+ self.fs.set_clients_block(True)
+
+ # ...and try and start an I/O
+ write_blocked = self.mount_a.write_background()
+
+ # ...then it should block
+ self.assertFalse(write_blocked.finished)
+ self.assert_session_state(client_id, "open")
+ time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale
+ self.assertFalse(write_blocked.finished)
+ self.assert_session_state(client_id, "stale")
+
+ # ...until we re-enable I/O
+ self.fs.set_clients_block(False)
+
+ # ...when it should complete promptly
+ a = time.time()
+ self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2)
+ write_blocked.wait() # Already know we're finished, wait() to raise exception on errors
+ recovery_time = time.time() - a
+ log.info("recovery time: {0}".format(recovery_time))
+ self.assert_session_state(client_id, "open")
+
+
+class TestClientRecovery(CephFSTestCase):
+ REQUIRE_KCLIENT_REMOTE = True
+ CLIENTS_REQUIRED = 2
+
+ LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
+
+ # Environment references
+ mds_session_timeout = None
+ mds_reconnect_timeout = None
+ ms_max_backoff = None
+
+ def test_basic(self):
+ # Check that two clients come up healthy and see each other's files
+ # =====================================================
+ self.mount_a.create_files()
+ self.mount_a.check_files()
+ self.mount_a.umount_wait()
+
+ self.mount_b.check_files()
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # Check that the admin socket interface is correctly reporting
+ # two sessions
+ # =====================================================
+ ls_data = self._session_list()
+ self.assert_session_count(2, ls_data)
+
+ self.assertSetEqual(
+ set([l['id'] for l in ls_data]),
+ {self.mount_a.get_global_id(), self.mount_b.get_global_id()}
+ )
+
+ def test_restart(self):
+ # Check that after an MDS restart both clients reconnect and continue
+ # to handle I/O
+ # =====================================================
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+ self.mount_a.create_destroy()
+ self.mount_b.create_destroy()
+
+ def _session_num_caps(self, client_id):
+ ls_data = self.fs.mds_asok(['session', 'ls'])
+ return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps'])
+
+ def test_reconnect_timeout(self):
+ # Reconnect timeout
+ # =================
+ # Check that if I stop an MDS and a client goes away, the MDS waits
+ # for the reconnect period
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ mount_a_client_id = self.mount_a.get_global_id()
+ self.mount_a.umount_wait(force=True)
+
+ self.fs.mds_restart()
+
+ self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
+ # Check that the MDS locally reports its state correctly
+ status = self.fs.mds_asok(['status'])
+ self.assertIn("reconnect_status", status)
+
+ ls_data = self._session_list()
+ self.assert_session_count(2, ls_data)
+
+ # The session for the dead client should have the 'reconnect' flag set
+ self.assertTrue(self.get_session(mount_a_client_id)['reconnecting'])
+
+ # Wait for the reconnect state to clear, this should take the
+ # reconnect timeout period.
+ in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2)
+ # Check that the period we waited to enter active is within a factor
+ # of two of the reconnect timeout.
+ self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout / 2,
+ "Should have been in reconnect phase for {0} but only took {1}".format(
+ self.mds_reconnect_timeout, in_reconnect_for
+ ))
+
+ self.assert_session_count(1)
+
+ # Check that the client that timed out during reconnect can
+ # mount again and do I/O
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ self.mount_a.create_destroy()
+
+ self.assert_session_count(2)
+
+ def test_reconnect_eviction(self):
+ # Eviction during reconnect
+ # =========================
+ mount_a_client_id = self.mount_a.get_global_id()
+
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # The mount goes away while the MDS is offline
+ self.mount_a.kill()
+
+ self.fs.mds_restart()
+
+ # Enter reconnect phase
+ self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
+ self.assert_session_count(2)
+
+ # Evict the stuck client
+ self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+ self.assert_session_count(1)
+
+ # Observe that we proceed to the active phase without waiting for the full reconnect timeout
+ evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+ # Once we evict the troublemaker, the reconnect phase should complete
+ # in well under the reconnect timeout.
+ self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5,
+ "reconnect did not complete soon enough after eviction, took {0}".format(
+ evict_til_active
+ ))
+
+ # We killed earlier so must clean up before trying to use again
+ self.mount_a.kill_cleanup()
+
+ # Bring the client back
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ self.mount_a.create_destroy()
+
+ def test_stale_caps(self):
+ # Capability release from stale session
+ # =====================================
+ cap_holder = self.mount_a.open_background()
+
+ # Wait for the file to be visible from another client, indicating
+ # that mount_a has completed its network ops
+ self.mount_b.wait_for_visible()
+
+ # Simulate client death
+ self.mount_a.kill()
+
+ try:
+ # Now, after mds_session_timeout seconds, the waiter should
+ # complete their operation when the MDS marks the holder's
+ # session stale.
+ cap_waiter = self.mount_b.write_background()
+ a = time.time()
+ cap_waiter.wait()
+ b = time.time()
+
+ # Should have succeeded
+ self.assertEqual(cap_waiter.exitstatus, 0)
+
+ cap_waited = b - a
+ log.info("cap_waiter waited {0}s".format(cap_waited))
+ self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
+ "Capability handover took {0}, expected approx {1}".format(
+ cap_waited, self.mds_session_timeout
+ ))
+
+ cap_holder.stdin.close()
+ try:
+ cap_holder.wait()
+ except (CommandFailedError, ConnectionLostError):
+ # We killed it (and possibly its node), so it raises an error
+ pass
+ finally:
+ # teardown() doesn't quite handle this case cleanly, so help it out
+ self.mount_a.kill_cleanup()
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ def test_evicted_caps(self):
+ # Eviction while holding a capability
+ # ===================================
+
+ # Take out a write capability on a file on client A,
+ # and then immediately kill it.
+ cap_holder = self.mount_a.open_background()
+ mount_a_client_id = self.mount_a.get_global_id()
+
+ # Wait for the file to be visible from another client, indicating
+ # that mount_a has completed its network ops
+ self.mount_b.wait_for_visible()
+
+ # Simulate client death
+ self.mount_a.kill()
+
+ try:
+ # The waiter should get stuck waiting for the capability
+ # held on the MDS by the now-dead client A
+ cap_waiter = self.mount_b.write_background()
+ time.sleep(5)
+ self.assertFalse(cap_waiter.finished)
+
+ self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+ # Now, because I evicted the old holder of the capability, it should
+ # immediately get handed over to the waiter
+ a = time.time()
+ cap_waiter.wait()
+ b = time.time()
+ cap_waited = b - a
+ log.info("cap_waiter waited {0}s".format(cap_waited))
+ # This is the check that it happened 'now' rather than waiting
+ # for the session timeout
+ self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
+ "Capability handover took {0}, expected less than {1}".format(
+ cap_waited, self.mds_session_timeout / 2.0
+ ))
+
+ cap_holder.stdin.close()
+ try:
+ cap_holder.wait()
+ except (CommandFailedError, ConnectionLostError):
+ # We killed it (and possibly its node), so it raises an error
+ pass
+ finally:
+ self.mount_a.kill_cleanup()
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ def test_trim_caps(self):
+ # Trim capabilities when reconnecting to the MDS
+ # ===================================
+
+ count = 500
+ # Create lots of files
+ for i in range(count):
+ self.mount_a.run_shell(["touch", "f{0}".format(i)])
+
+ # Populate mount_b's cache
+ self.mount_b.run_shell(["ls"])
+
+ client_id = self.mount_b.get_global_id()
+ num_caps = self._session_num_caps(client_id)
+ self.assertGreaterEqual(num_caps, count)
+
+ # Restart MDS. client should trim its cache when reconnecting to the MDS
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+ num_caps = self._session_num_caps(client_id)
+ self.assertLess(num_caps, count,
+ "should have less than {0} capabilities, have {1}".format(
+ count, num_caps
+ ))
+
+ def test_filelock(self):
+ """
+ Check that file lock doesn't get lost after an MDS restart
+ """
+ a_version_str = get_package_version(self.mount_a.client_remote, "fuse")
+ b_version_str = get_package_version(self.mount_b.client_remote, "fuse")
+ flock_version_str = "2.9"
+
+ version_regex = re.compile(r"[0-9\.]+")
+ a_result = version_regex.match(a_version_str)
+ self.assertTrue(a_result)
+ b_result = version_regex.match(b_version_str)
+ self.assertTrue(b_result)
+ a_version = version.StrictVersion(a_result.group())
+ b_version = version.StrictVersion(b_result.group())
+ flock_version=version.StrictVersion(flock_version_str)
+
+ flockable = False
+ if (a_version >= flock_version and b_version >= flock_version):
+ log.info("testing flock locks")
+ flockable = True
+ else:
+ log.info("not testing flock locks, machines have versions {av} and {bv}".format(
+ av=a_version_str,bv=b_version_str))
+
+ lock_holder = self.mount_a.lock_background(do_flock=flockable)
+
+ self.mount_b.wait_for_visible("background_file-2")
+ self.mount_b.check_filelock(do_flock=flockable)
+
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+ self.mount_b.check_filelock(do_flock=flockable)
+
+ # Tear down the background process
+ lock_holder.stdin.close()
+ try:
+ lock_holder.wait()
+ except (CommandFailedError, ConnectionLostError):
+ # We killed it, so it raises an error
+ pass
+
+ def test_dir_fsync(self):
+ self._test_fsync(True)
+
+ def test_create_fsync(self):
+ self._test_fsync(False)
+
+ def _test_fsync(self, dirfsync):
+ """
+ That calls to fsync guarantee visibility of metadata to another
+ client immediately after the fsyncing client dies.
+ """
+
+ # Leave this guy out until he's needed
+ self.mount_b.umount_wait()
+
+ # Create dir + child dentry on client A, and fsync the dir
+ path = os.path.join(self.mount_a.mountpoint, "subdir")
+ self.mount_a.run_python(
+ dedent("""
+ import os
+ import time
+
+ path = "{path}"
+
+ print "Starting creation..."
+ start = time.time()
+
+ os.mkdir(path)
+ dfd = os.open(path, os.O_DIRECTORY)
+
+ fd = open(os.path.join(path, "childfile"), "w")
+ print "Finished creation in {{0}}s".format(time.time() - start)
+
+ print "Starting fsync..."
+ start = time.time()
+ if {dirfsync}:
+ os.fsync(dfd)
+ else:
+ os.fsync(fd)
+ print "Finished fsync in {{0}}s".format(time.time() - start)
+ """.format(path=path,dirfsync=str(dirfsync)))
+ )
+
+ # Immediately kill the MDS and then client A
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+ self.mount_a.kill()
+ self.mount_a.kill_cleanup()
+
+ # Restart the MDS. Wait for it to come up, it'll have to time out in clientreplay
+ self.fs.mds_restart()
+ log.info("Waiting for reconnect...")
+ self.fs.wait_for_state("up:reconnect")
+ log.info("Waiting for active...")
+ self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout)
+ log.info("Reached active...")
+
+ # Is the child dentry visible from mount B?
+ self.mount_b.mount()
+ self.mount_b.wait_until_mounted()
+ self.mount_b.run_shell(["ls", "subdir/childfile"])
--- /dev/null
+
+from unittest import case
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.fuse_mount import FuseMount
+
+
+class TestConfigCommands(CephFSTestCase):
+ """
+ Test that daemons and clients respond to the otherwise rarely-used
+ runtime config modification operations.
+ """
+
+ CLIENTS_REQUIRED = 1
+ MDSS_REQUIRED = 1
+
+ def test_client_config(self):
+ """
+ That I can successfully issue asok "config set" commands
+
+ :return:
+ """
+
+ if not isinstance(self.mount_a, FuseMount):
+ raise case.SkipTest("Test only applies to FUSE clients")
+
+ test_key = "client_cache_size"
+ test_val = "123"
+ self.mount_a.admin_socket(['config', 'set', test_key, test_val])
+ out = self.mount_a.admin_socket(['config', 'get', test_key])
+ self.assertEqual(out[test_key], test_val)
+
+ self.mount_a.write_n_mb("file.bin", 1)
+
+ # Implicitly asserting that things don't have lockdep error in shutdown
+ self.mount_a.umount_wait(require_clean=True)
+ self.fs.mds_stop()
+
+ def test_mds_config_asok(self):
+ test_key = "mds_max_purge_ops"
+ test_val = "123"
+ self.fs.mds_asok(['config', 'set', test_key, test_val])
+ out = self.fs.mds_asok(['config', 'get', test_key])
+ self.assertEqual(out[test_key], test_val)
+
+ # Implicitly asserting that things don't have lockdep error in shutdown
+ self.mount_a.umount_wait(require_clean=True)
+ self.fs.mds_stop()
+
+ def test_mds_config_tell(self):
+ test_key = "mds_max_purge_ops"
+ test_val = "123"
+
+ mds_id = self.fs.get_lone_mds_id()
+ self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "injectargs",
+ "--{0}={1}".format(test_key, test_val))
+
+ # Read it back with asok because there is no `tell` equivalent
+ out = self.fs.mds_asok(['config', 'get', test_key])
+ self.assertEqual(out[test_key], test_val)
+
+ # Implicitly asserting that things don't have lockdep error in shutdown
+ self.mount_a.umount_wait(require_clean=True)
+ self.fs.mds_stop()
--- /dev/null
+import json
+import logging
+import errno
+import re
+from teuthology.contextutil import MaxWhileTries
+from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra.run import wait
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+DAMAGED_ON_START = "damaged_on_start"
+DAMAGED_ON_LS = "damaged_on_ls"
+CRASHED = "server crashed"
+NO_DAMAGE = "no damage"
+FAILED_CLIENT = "client failed"
+FAILED_SERVER = "server failed"
+
+# An EIO in response to a stat from the client
+EIO_ON_LS = "eio"
+
+ # An EIO, but nothing in the damage table (never what we expect)
+EIO_NO_DAMAGE = "eio without damage entry"
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDamage(CephFSTestCase):
+ def _simple_workload_write(self):
+ self.mount_a.run_shell(["mkdir", "subdir"])
+ self.mount_a.write_n_mb("subdir/sixmegs", 6)
+ return self.mount_a.stat("subdir/sixmegs")
+
+ def is_marked_damaged(self, rank):
+ mds_map = self.fs.get_mds_map()
+ return rank in mds_map['damaged']
+
+ @for_teuthology #459s
+ def test_object_deletion(self):
+ """
+ That the MDS has a clean 'damaged' response to loss of any single metadata object
+ """
+
+ self._simple_workload_write()
+
+ # Hmm, actually it would be nice to permute whether the metadata pool
+ # state contains sessions or not, but for the moment close this session
+ # to avoid waiting through reconnect on every MDS start.
+ self.mount_a.umount_wait()
+ for mds_name in self.fs.get_active_names():
+ self.fs.mds_asok(["flush", "journal"], mds_name)
+
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ self.fs.rados(['export', '/tmp/metadata.bin'])
+
+ def is_ignored(obj_id, dentry=None):
+ """
+ A filter to avoid redundantly mutating many similar objects (e.g.
+ stray dirfrags) or similar dentries (e.g. stray dir dentries)
+ """
+ if re.match("60.\.00000000", obj_id) and obj_id != "600.00000000":
+ return True
+
+ if dentry and obj_id == "100.00000000":
+ if re.match("stray.+_head", dentry) and dentry != "stray0_head":
+ return True
+
+ return False
+
+ def get_path(obj_id, dentry=None):
+ """
+ What filesystem path does this object or dentry correspond to? i.e.
+ what should I poke to see EIO after damaging it?
+ """
+
+ if obj_id == "1.00000000" and dentry == "subdir_head":
+ return "./subdir"
+ elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head":
+ return "./subdir/sixmegs"
+
+ # None means ls will do an "ls -R" in hope of seeing some errors
+ return None
+
+ objects = self.fs.rados(["ls"]).split("\n")
+ objects = [o for o in objects if not is_ignored(o)]
+
+ # Find all objects with an OMAP header
+ omap_header_objs = []
+ for o in objects:
+ header = self.fs.rados(["getomapheader", o])
+ # The rados CLI wraps the header output in a hex-printed style
+ header_bytes = int(re.match("header \((.+) bytes\)", header).group(1))
+ if header_bytes > 0:
+ omap_header_objs.append(o)
+
+ # Find all OMAP key/vals
+ omap_keys = []
+ for o in objects:
+ keys_str = self.fs.rados(["listomapkeys", o])
+ if keys_str:
+ for key in keys_str.split("\n"):
+ if not is_ignored(o, key):
+ omap_keys.append((o, key))
+
+ # Find objects that have data in their bodies
+ data_objects = []
+ for obj_id in objects:
+ stat_out = self.fs.rados(["stat", obj_id])
+ size = int(re.match(".+, size (.+)$", stat_out).group(1))
+ if size > 0:
+ data_objects.append(obj_id)
+
+ # Define the various forms of damage we will inflict
+ class MetadataMutation(object):
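+ """One injected corruption: the object to damage, how to damage it, the expected outcome, and which path to examine afterwards."""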
+ def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None):
+ self.obj_id = obj_id_
+ self.desc = desc_
+ self.mutate_fn = mutate_fn_
+ self.expectation = expectation_
+ if ls_path is None:
+ self.ls_path = "."
+ else:
+ self.ls_path = ls_path
+
+ def __eq__(self, other):
+ return self.desc == other.desc
+
+ def __hash__(self):
+ return hash(self.desc)
+
+ junk = "deadbeef" * 10
+ mutations = []
+
+ # Removals
+ for obj_id in objects:
+ if obj_id in [
+ # JournalPointers are auto-replaced if missing (same path as upgrade)
+ "400.00000000",
+ # Missing dirfrags for non-system dirs result in empty directory
+ "10000000000.00000000",
+ ]:
+ expectation = NO_DAMAGE
+ else:
+ expectation = DAMAGED_ON_START
+
+ log.info("Expectation on rm '{0}' will be '{1}'".format(
+ obj_id, expectation
+ ))
+
+ mutations.append(MetadataMutation(
+ obj_id,
+ "Delete {0}".format(obj_id),
+ lambda o=obj_id: self.fs.rados(["rm", o]),
+ expectation
+ ))
+
+ # Blatant corruptions
+ mutations.extend([
+ MetadataMutation(
+ o,
+ "Corrupt {0}".format(o),
+ lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk),
+ DAMAGED_ON_START
+ ) for o in data_objects
+ ])
+
+ # Truncations
+ mutations.extend([
+ MetadataMutation(
+ o,
+ "Truncate {0}".format(o),
+ lambda o=o: self.fs.rados(["truncate", o, "0"]),
+ DAMAGED_ON_START
+ ) for o in data_objects
+ ])
+
+ # OMAP value corruptions
+ for o, k in omap_keys:
+ if o.startswith("100."):
+ # Anything in rank 0's 'mydir'
+ expectation = DAMAGED_ON_START
+ else:
+ expectation = EIO_ON_LS
+
+ mutations.append(
+ MetadataMutation(
+ o,
+ "Corrupt omap key {0}:{1}".format(o, k),
+ lambda o=o,k=k: self.fs.rados(["setomapval", o, k, junk]),
+ expectation,
+ get_path(o, k)
+ )
+ )
+
+ # OMAP header corruptions
+ for obj_id in omap_header_objs:
+ if re.match("60.\.00000000", obj_id) \
+ or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
+ expectation = DAMAGED_ON_START
+ else:
+ expectation = NO_DAMAGE
+
+ log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
+ obj_id, expectation
+ ))
+
+ mutations.append(
+ MetadataMutation(
+ obj_id,
+ "Corrupt omap header on {0}".format(obj_id),
+ lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]),
+ expectation
+ )
+ )
+
+ results = {}
+
+ for mutation in mutations:
+ log.info("Applying mutation '{0}'".format(mutation.desc))
+
+ # Reset MDS state
+ self.mount_a.umount_wait(force=True)
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+ self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+ # Reset RADOS pool state
+ self.fs.rados(['import', '/tmp/metadata.bin'])
+
+ # Inject the mutation
+ mutation.mutate_fn()
+
+ # Try starting the MDS
+ self.fs.mds_restart()
+
+ # How long we'll wait between starting a daemon and expecting
+ # it to make it through startup, and potentially declare itself
+ # damaged to the mon cluster.
+ startup_timeout = 60
+
+ if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE):
+ if mutation.expectation == DAMAGED_ON_START:
+ # The MDS may pass through active before making it to damaged
+ try:
+ self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout)
+ except RuntimeError:
+ pass
+
+ # Wait for MDS to either come up or go into damaged state
+ try:
+ self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout)
+ except RuntimeError:
+ crashed = False
+ # Didn't make it to healthy or damaged, did it crash?
+ for daemon_id, daemon in self.fs.mds_daemons.items():
+ if daemon.proc and daemon.proc.finished:
+ crashed = True
+ log.error("Daemon {0} crashed!".format(daemon_id))
+ daemon.proc = None # So that subsequent stop() doesn't raise error
+ if not crashed:
+ # Didn't go healthy, didn't go damaged, didn't crash, so what?
+ raise
+ else:
+ log.info("Result: Mutation '{0}' led to crash".format(mutation.desc))
+ results[mutation] = CRASHED
+ continue
+ if self.is_marked_damaged(0):
+ log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc))
+ results[mutation] = DAMAGED_ON_START
+ continue
+ else:
+ log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc))
+ else:
+ try:
+ self.wait_until_true(self.fs.are_daemons_healthy, 60)
+ except RuntimeError:
+ log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc))
+ if self.is_marked_damaged(0):
+ results[mutation] = DAMAGED_ON_START
+ else:
+ results[mutation] = FAILED_SERVER
+ continue
+ log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc))
+
+ # MDS is up, should go damaged on ls or client mount
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ if mutation.ls_path == ".":
+ proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False)
+ else:
+ proc = self.mount_a.stat(mutation.ls_path, wait=False)
+
+ if mutation.expectation == DAMAGED_ON_LS:
+ try:
+ self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+ log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc))
+ results[mutation] = DAMAGED_ON_LS
+ except RuntimeError:
+ if self.fs.are_daemons_healthy():
+ log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format(
+ mutation.desc))
+ results[mutation] = NO_DAMAGE
+ else:
+ log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
+ results[mutation] = FAILED_SERVER
+
+ else:
+ try:
+ wait([proc], 20)
+ log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc))
+ results[mutation] = NO_DAMAGE
+ except MaxWhileTries:
+ log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc))
+ results[mutation] = FAILED_CLIENT
+ except CommandFailedError as e:
+ if e.exitstatus == errno.EIO:
+ log.info("Result: EIO on client")
+ results[mutation] = EIO_ON_LS
+ else:
+ log.info("Result: unexpected error {0} on client".format(e))
+ results[mutation] = FAILED_CLIENT
+
+ if mutation.expectation == EIO_ON_LS:
+ # EIOs mean something handled by DamageTable: assert that it has
+ # been populated
+ damage = json.loads(
+ self.fs.mon_manager.raw_cluster_cmd(
+ 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty'))
+ if len(damage) == 0:
+ results[mutation] = EIO_NO_DAMAGE
+
+ failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result]
+ if failures:
+ log.error("{0} mutations had unexpected outcomes:".format(len(failures)))
+ for mutation, result in failures:
+ log.error(" Expected '{0}' actually '{1}' from '{2}'".format(
+ mutation.expectation, result, mutation.desc
+ ))
+ raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures)))
+ else:
+ log.info("All {0} mutations had expected outcomes".format(len(mutations)))
+
+ def test_damaged_dentry(self):
+ # Damage to dentries is interesting because it leaves the
+ # directory's `complete` flag in a subtle state where
+ # we have marked the dir complete in order that folks
+ # can access it, but in actual fact there is a dentry
+ # missing
+ self.mount_a.run_shell(["mkdir", "subdir/"])
+
+ self.mount_a.run_shell(["touch", "subdir/file_undamaged"])
+ self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"])
+
+ subdir_ino = self.mount_a.path_to_ino("subdir")
+
+ self.mount_a.umount_wait()
+ for mds_name in self.fs.get_active_names():
+ self.fs.mds_asok(["flush", "journal"], mds_name)
+
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # Corrupt a dentry
+ junk = "deadbeef" * 10
+ dirfrag_obj = "{0:x}.00000000".format(subdir_ino)
+ self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
+
+ # Start up and try to list it
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ dentries = self.mount_a.ls("subdir/")
+
+ # The damaged guy should have disappeared
+ self.assertEqual(dentries, ["file_undamaged"])
+
+ # I should get ENOENT if I try and read it normally, because
+ # the dir is considered complete
+ try:
+ self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
+ except CommandFailedError as e:
+ self.assertEqual(e.exitstatus, errno.ENOENT)
+ else:
+ raise AssertionError("Expected ENOENT")
+
+ # The fact that there is damage should have been recorded
+ damage = json.loads(
+ self.fs.mon_manager.raw_cluster_cmd(
+ 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+ "damage", "ls", '--format=json-pretty'))
+ self.assertEqual(len(damage), 1)
+ damage_id = damage[0]['id']
+
+ # If I try to create a dentry with the same name as the damaged guy
+ # then that should be forbidden
+ try:
+ self.mount_a.touch("subdir/file_to_be_damaged")
+ except CommandFailedError as e:
+ self.assertEqual(e.exitstatus, errno.EIO)
+ else:
+ raise AssertionError("Expected EIO")
+
+ # Attempting that touch will clear the client's complete flag, now
+ # when I stat it I'll get EIO instead of ENOENT
+ try:
+ self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
+ except CommandFailedError as e:
+ if isinstance(self.mount_a, FuseMount):
+ self.assertEqual(e.exitstatus, errno.EIO)
+ else:
+ # Kernel client handles this case differently
+ self.assertEqual(e.exitstatus, errno.ENOENT)
+ else:
+ raise AssertionError("Expected EIO")
+
+ nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
+ self.assertEqual(nfiles, "2")
+
+ self.mount_a.umount_wait()
+
+ # Now repair the stats
+ scrub_json = self.fs.mds_asok(["scrub_path", "/subdir", "repair"])
+ log.info(json.dumps(scrub_json, indent=2))
+
+ self.assertEqual(scrub_json["passed_validation"], False)
+ self.assertEqual(scrub_json["raw_stats"]["checked"], True)
+ self.assertEqual(scrub_json["raw_stats"]["passed"], False)
+
+ # Check that the file count is now correct
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
+ self.assertEqual(nfiles, "1")
+
+ # Clean up the omap object
+ self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
+
+ # Clean up the damagetable entry
+ self.fs.mon_manager.raw_cluster_cmd(
+ 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+ "damage", "rm", "{did}".format(did=damage_id))
+
+ # Now I should be able to create a file with the same name as the
+ # damaged guy if I want.
+ self.mount_a.touch("subdir/file_to_be_damaged")
--- /dev/null
+
+"""
+Test our tools for recovering metadata from the data pool
+"""
+import json
+
+import logging
+import os
+from textwrap import dedent
+import traceback
+from collections import namedtuple, defaultdict
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class Workload(object):
+ def __init__(self, filesystem, mount):
+ self._mount = mount
+ self._filesystem = filesystem
+ self._initial_state = None
+
+ # Accumulate backtraces for every failed validation, and return them. Backtraces
+ # are rather verbose, but we only see them when something breaks, and they
+ # let us see which check failed without having to decorate each check with
+ # a string
+ self._errors = []
+
+ def assert_equal(self, a, b):
+ try:
+ if a != b:
+ raise AssertionError("{0} != {1}".format(a, b))
+ except AssertionError as e:
+ self._errors.append(
+ ValidationError(e, traceback.format_exc(3))
+ )
+
+ def write(self):
+ """
+ Write the workload files to the mount
+ """
+ raise NotImplementedError()
+
+ def validate(self):
+ """
+ Read from the mount and validate that the workload files are present (i.e. have
+ survived or been reconstructed from the test scenario)
+ """
+ raise NotImplementedError()
+
+ def damage(self):
+ """
+ Damage the filesystem pools in ways that will be interesting to recover from. By
+ default just wipe everything in the metadata pool
+ """
+ # Delete every object in the metadata pool
+ objects = self._filesystem.rados(["ls"]).split("\n")
+ for o in objects:
+ self._filesystem.rados(["rm", o])
+
+ def flush(self):
+ """
+ Called after client unmount, after write: flush whatever you want
+ """
+ self._filesystem.mds_asok(["flush", "journal"])
+
+
+class SimpleWorkload(Workload):
+ """
+ Single file, single directory, check that it gets recovered and so does its size
+ """
+ def write(self):
+ self._mount.run_shell(["mkdir", "subdir"])
+ self._mount.write_n_mb("subdir/sixmegs", 6)
+ self._initial_state = self._mount.stat("subdir/sixmegs")
+
+ def validate(self):
+ self._mount.run_shell(["ls", "subdir"])
+ st = self._mount.stat("subdir/sixmegs")
+ self.assert_equal(st['st_size'], self._initial_state['st_size'])
+ return self._errors
+
+
+class MovedFile(Workload):
+ def write(self):
+ # Create a file whose backtrace disagrees with its eventual position
+ # in the metadata. We will see that it gets reconstructed in its
+ # original position according to its backtrace.
+ self._mount.run_shell(["mkdir", "subdir_alpha"])
+ self._mount.run_shell(["mkdir", "subdir_bravo"])
+ self._mount.write_n_mb("subdir_alpha/sixmegs", 6)
+ self._filesystem.mds_asok(["flush", "journal"])
+ self._mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"])
+ self._initial_state = self._mount.stat("subdir_bravo/sixmegs")
+
+ def flush(self):
+ pass
+
+ def validate(self):
+ self.assert_equal(self._mount.ls(), ["subdir_alpha"])
+ st = self._mount.stat("subdir_alpha/sixmegs")
+ self.assert_equal(st['st_size'], self._initial_state['st_size'])
+ return self._errors
+
+
+class BacktracelessFile(Workload):
+ def write(self):
+ self._mount.run_shell(["mkdir", "subdir"])
+ self._mount.write_n_mb("subdir/sixmegs", 6)
+ self._initial_state = self._mount.stat("subdir/sixmegs")
+
+ def flush(self):
+ # Never flush metadata, so backtrace won't be written
+ pass
+
+ def validate(self):
+ ino_name = "%x" % self._initial_state["st_ino"]
+
+ # The inode should be linked into lost+found because we had no path for it
+ self.assert_equal(self._mount.ls(), ["lost+found"])
+ self.assert_equal(self._mount.ls("lost+found"), [ino_name])
+ st = self._mount.stat("lost+found/{ino_name}".format(ino_name=ino_name))
+
+ # We might not have got the name or path, but we should still get the size
+ self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+ return self._errors
+
+
+class StripedStashedLayout(Workload):
+ def __init__(self, fs, m):
+ super(StripedStashedLayout, self).__init__(fs, m)
+
+ # Nice small stripes so we can quickly do our writes+validates
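+ # (sc = stripe_count, ss = stripe_unit, os = object_size, matching the layout vxattr set in write())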
+ self.sc = 4
+ self.ss = 65536
+ self.os = 262144
+
+ self.interesting_sizes = [
+ # Exactly stripe_count objects will exist
+ self.os * self.sc,
+ # Fewer than stripe_count objects will exist
+ self.os * self.sc / 2,
+ self.os * (self.sc - 1) + self.os / 2,
+ self.os * (self.sc - 1) + self.os / 2 - 1,
+ self.os * (self.sc + 1) + self.os / 2,
+ self.os * (self.sc + 1) + self.os / 2 + 1,
+ # More than stripe_count objects will exist
+ self.os * self.sc + self.os * self.sc / 2
+ ]
+
+ def write(self):
+ # Create a dir with a striped layout set on it
+ self._mount.run_shell(["mkdir", "stripey"])
+
+ self._mount.run_shell([
+ "setfattr", "-n", "ceph.dir.layout", "-v",
+ "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format(
+ ss=self.ss, os=self.os, sc=self.sc,
+ pool=self._filesystem.get_data_pool_name()
+ ),
+ "./stripey"])
+
+ # Write files, then flush metadata so that its layout gets written into an xattr
+ for i, n_bytes in enumerate(self.interesting_sizes):
+ self._mount.write_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+ # This is really just validating the validator
+ self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+ self._filesystem.mds_asok(["flush", "journal"])
+
+ # Write another file in the same way, but this time don't flush the metadata,
+ # so that it won't have the layout xattr
+ self._mount.write_test_pattern("stripey/unflushed_file", 1024 * 512)
+ self._mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512)
+
+ self._initial_state = {
+ "unflushed_ino": self._mount.path_to_ino("stripey/unflushed_file")
+ }
+
+ def flush(self):
+ # Pass because we already selectively flushed during write
+ pass
+
+ def validate(self):
+ # The flushed files should have been recovered into their original locations
+ # with the correct layout: read back correct data
+ for i, n_bytes in enumerate(self.interesting_sizes):
+ try:
+ self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+ except CommandFailedError as e:
+ self._errors.append(
+ ValidationError("File {0} (size {1}): {2}".format(i, n_bytes, e), traceback.format_exc(3))
+ )
+
+ # The unflushed file should have been recovered into lost+found without
+ # the correct layout: read back junk
+ ino_name = "%x" % self._initial_state["unflushed_ino"]
+ self.assert_equal(self._mount.ls("lost+found"), [ino_name])
+ try:
+ self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512)
+ except CommandFailedError:
+ pass
+ else:
+ self._errors.append(
+ ValidationError("Unexpectedly valid data in unflushed striped file", "")
+ )
+
+ return self._errors
+
+
+class ManyFilesWorkload(Workload):
+ def __init__(self, filesystem, mount, file_count):
+ super(ManyFilesWorkload, self).__init__(filesystem, mount)
+ self.file_count = file_count
+
+ def write(self):
+ self._mount.run_shell(["mkdir", "subdir"])
+ for n in range(0, self.file_count):
+ self._mount.write_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
+
+ def validate(self):
+ for n in range(0, self.file_count):
+ try:
+ self._mount.validate_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
+ except CommandFailedError as e:
+ self._errors.append(
+ ValidationError("File {0}: {1}".format(n, e), traceback.format_exc(3))
+ )
+
+ return self._errors
+
+
+class MovedDir(Workload):
+ def write(self):
+ # Create a nested dir that we will then move. Two files with two different
+ # backtraces referring to the moved dir, claiming two different locations for
+ # it. We will see that only one backtrace wins and the dir ends up with
+ # single linkage.
+ self._mount.run_shell(["mkdir", "-p", "grandmother/parent"])
+ self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1)
+ self._filesystem.mds_asok(["flush", "journal"])
+ self._mount.run_shell(["mkdir", "grandfather"])
+ self._mount.run_shell(["mv", "grandmother/parent", "grandfather"])
+ self._mount.write_n_mb("grandfather/parent/new_pos_file", 2)
+ self._filesystem.mds_asok(["flush", "journal"])
+
+ self._initial_state = (
+ self._mount.stat("grandfather/parent/orig_pos_file"),
+ self._mount.stat("grandfather/parent/new_pos_file")
+ )
+
+ def validate(self):
+ root_files = self._mount.ls()
+ self.assert_equal(len(root_files), 1)
+ self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True)
+ winner = root_files[0]
+ st_opf = self._mount.stat("{0}/parent/orig_pos_file".format(winner))
+ st_npf = self._mount.stat("{0}/parent/new_pos_file".format(winner))
+
+ self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size'])
+ self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size'])
+
+
+class MissingZerothObject(Workload):
+ def write(self):
+ self._mount.run_shell(["mkdir", "subdir"])
+ self._mount.write_n_mb("subdir/sixmegs", 6)
+ self._initial_state = self._mount.stat("subdir/sixmegs")
+
+ def damage(self):
+ super(MissingZerothObject, self).damage()
+ zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino'])
+ self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name())
+
+ def validate(self):
+ st = self._mount.stat("lost+found/{0:x}".format(self._initial_state['st_ino']))
+ self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+
+class NonDefaultLayout(Workload):
+ """
+ Check that the reconstruction copes with files that have a different
+ object size in their layout
+ """
+ def write(self):
+ self._mount.run_shell(["touch", "datafile"])
+ self._mount.run_shell(["setfattr", "-n", "ceph.file.layout.object_size", "-v", "8388608", "./datafile"])
+ self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"])
+ self._initial_state = self._mount.stat("datafile")
+
+ def validate(self):
+ p = self._mount.run_shell(["getfattr", "--only-values", "-n", "ceph.file.layout.object_size", "./datafile"])
+
+ # Check we got the layout reconstructed properly
+ object_size = int(p.stdout.getvalue().strip())
+ self.assert_equal(object_size, 8388608)
+
+ # Check we got the file size reconstructed properly
+ st = self._mount.stat("datafile")
+ self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+
+class TestDataScan(CephFSTestCase):
+ MDSS_REQUIRED = 2
+
+ def is_marked_damaged(self, rank):
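+ """
+ Whether the given MDS rank is currently marked damaged in the MDS map
+ """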
+ mds_map = self.fs.get_mds_map()
+ return rank in mds_map['damaged']
+
+ def _rebuild_metadata(self, workload, workers=1):
+ """
+ That when all objects in metadata pool are removed, we can rebuild a metadata pool
+ based on the contents of a data pool, and a client can see and read our files.
+ """
+
+ # First, inject some files
+ workload.write()
+
+ # Unmount the client and flush the journal: the tool should also cope with
+ # situations where there is dirty metadata, but we'll test that separately
+ self.mount_a.umount_wait()
+ workload.flush()
+
+ # Stop the MDS
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # After recovery, we need the MDS to not be strict about stats (these options
+ # are off by default in production, but QA enables them, so we explicitly disable them here)
+ self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+ self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+ # Apply any data damage the workload wants
+ workload.damage()
+
+ # Reset the MDS map in case multiple ranks were in play: recovery procedure
+ # only understands how to rebuild metadata under rank 0
+ self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+ '--yes-i-really-mean-it')
+
+ # Attempt to start an MDS, see that it goes into damaged state
+ self.fs.mds_restart()
+
+ def get_state(mds_id):
+ info = self.mds_cluster.get_mds_info(mds_id)
+ return info['state'] if info is not None else None
+
+ self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+ for mds_id in self.fs.mds_ids:
+ self.wait_until_equal(
+ lambda: get_state(mds_id),
+ "up:standby",
+ timeout=60)
+
+ # Run the recovery procedure
+ self.fs.table_tool(["0", "reset", "session"])
+ self.fs.table_tool(["0", "reset", "snap"])
+ self.fs.table_tool(["0", "reset", "inode"])
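+ # The check below is intentionally disabled ('if False'): we skip the plain
+ # journal reset and always use --force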
+ if False:
+ with self.assertRaises(CommandFailedError):
+ # Normal reset should fail when no objects are present, we'll use --force instead
+ self.fs.journal_tool(["journal", "reset"])
+ self.fs.journal_tool(["journal", "reset", "--force"])
+ self.fs.data_scan(["init"])
+ self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
+ self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
+
+ # Mark the MDS repaired
+ self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+ # Start the MDS
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+ log.info(str(self.mds_cluster.status()))
+
+ # Mount a client
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # See that the files are present and correct
+ errors = workload.validate()
+ if errors:
+ log.error("Validation errors found: {0}".format(len(errors)))
+ for e in errors:
+ log.error(e.exception)
+ log.error(e.backtrace)
+ raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+ errors[0].exception, errors[0].backtrace
+ ))
+
+ def test_rebuild_simple(self):
+ self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a))
+
+ def test_rebuild_moved_file(self):
+ self._rebuild_metadata(MovedFile(self.fs, self.mount_a))
+
+ def test_rebuild_backtraceless(self):
+ self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a))
+
+ def test_rebuild_moved_dir(self):
+ self._rebuild_metadata(MovedDir(self.fs, self.mount_a))
+
+ def test_rebuild_missing_zeroth(self):
+ self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a))
+
+ def test_rebuild_nondefault_layout(self):
+ self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a))
+
+ def test_stashed_layout(self):
+ self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))
+
+ def _dirfrag_keys(self, object_id):
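+ """
+ Return the list of omap keys (dentries) in the given dirfrag object, or an empty list if there are none
+ """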
+ keys_str = self.fs.rados(["listomapkeys", object_id])
+ if keys_str:
+ return keys_str.split("\n")
+ else:
+ return []
+
+ def test_fragmented_injection(self):
+ """
+ That when injecting a dentry into a fragmented directory, we put it in the right fragment.
+ """
+
+ self.fs.mon_manager.raw_cluster_cmd("mds", "set", "allow_dirfrags", "true",
+ "--yes-i-really-mean-it")
+
+ file_count = 100
+ file_names = ["%s" % n for n in range(0, file_count)]
+
+ # Create a directory of `file_count` files, each named after its
+ # decimal number and containing the string of its decimal number
+ self.mount_a.run_python(dedent("""
+ import os
+ path = os.path.join("{path}", "subdir")
+ os.mkdir(path)
+ for n in range(0, {file_count}):
+ open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+ """.format(
+ path=self.mount_a.mountpoint,
+ file_count=file_count
+ )))
+
+ dir_ino = self.mount_a.path_to_ino("subdir")
+
+ # Only one MDS should be active!
+ self.assertEqual(len(self.fs.get_active_names()), 1)
+
+ # Ensure that one directory is fragmented
+ mds_id = self.fs.get_active_names()[0]
+ self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id)
+
+ # Flush journal and stop MDS
+ self.mount_a.umount_wait()
+ self.fs.mds_asok(["flush", "journal"], mds_id)
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # Pick a dentry and wipe out its key
+ # Because I did a 1 bit split, I know one frag will be named <inode>.01000000
+ frag_obj_id = "{0:x}.01000000".format(dir_ino)
+ keys = self._dirfrag_keys(frag_obj_id)
+ victim_key = keys[7] # arbitrary choice
+ log.info("victim_key={0}".format(victim_key))
+ victim_dentry = victim_key.split("_head")[0]
+ self.fs.rados(["rmomapkey", frag_obj_id, victim_key])
+
+ # Start filesystem back up, observe that the file appears to be gone in an `ls`
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
+ self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry]))))
+
+ # Stop the filesystem
+ self.mount_a.umount_wait()
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # Run data-scan, observe that it inserts our dentry back into the correct fragment
+ # by checking the omap now has the dentry's key again
+ self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
+ self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()])
+ self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id))
+
+ # Start the filesystem and check that the dentry we deleted is now once again visible
+ # and points to the correct file data.
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
+ self.assertEqual(out, victim_dentry)
+
+ # Finally, close the loop by checking our injected dentry survives a merge
+ mds_id = self.fs.get_active_names()[0]
+ self.mount_a.ls("subdir") # Do an ls to ensure both frags are in cache so the merge will work
+ self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id)
+ self.fs.mds_asok(["flush", "journal"], mds_id)
+ frag_obj_id = "{0:x}.00000000".format(dir_ino)
+ keys = self._dirfrag_keys(frag_obj_id)
+ self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names]))
+
+ @for_teuthology
+ def test_parallel_execution(self):
+ self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7)
+
+ def test_pg_files(self):
+ """
+ That the pg files command tells us which files are associated with
+ a particular PG
+ """
+ file_count = 20
+ self.mount_a.run_shell(["mkdir", "mydir"])
+ self.mount_a.create_n_files("mydir/myfile", file_count)
+
+ # Some files elsewhere in the system that we will ignore
+ # to check that the tool is filtering properly
+ self.mount_a.run_shell(["mkdir", "otherdir"])
+ self.mount_a.create_n_files("otherdir/otherfile", file_count)
+
+ pgs_to_files = defaultdict(list)
+ # Rough (slow) reimplementation of the logic
+ for i in range(0, file_count):
+ file_path = "mydir/myfile_{0}".format(i)
+ ino = self.mount_a.path_to_ino(file_path)
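+ # A file's first data object is named <inode number in hex>.00000000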
+ obj = "{0:x}.{1:08x}".format(ino, 0)
+ pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd(
+ "osd", "map", self.fs.get_data_pool_name(), obj,
+ "--format=json-pretty"
+ ))['pgid']
+ pgs_to_files[pgid].append(file_path)
+ log.info("{0}: {1}".format(file_path, pgid))
+
+ pg_count = self.fs.get_pgs_per_fs_pool()
+ for pg_n in range(0, pg_count):
+ pg_str = "{0}.{1}".format(self.fs.get_data_pool_id(), pg_n)
+ out = self.fs.data_scan(["pg_files", "mydir", pg_str])
+ lines = [l for l in out.split("\n") if l]
+ log.info("{0}: {1}".format(pg_str, lines))
+ self.assertSetEqual(set(lines), set(pgs_to_files[pg_str]))
+
+ def test_scan_links(self):
+ """
+ The scan_links command fixes linkage errors
+ """
+ self.mount_a.run_shell(["mkdir", "testdir1"])
+ self.mount_a.run_shell(["mkdir", "testdir2"])
+ dir1_ino = self.mount_a.path_to_ino("testdir1")
+ dir2_ino = self.mount_a.path_to_ino("testdir2")
+ dirfrag1_oid = "{0:x}.00000000".format(dir1_ino)
+ dirfrag2_oid = "{0:x}.00000000".format(dir2_ino)
+
+ self.mount_a.run_shell(["touch", "testdir1/file1"])
+ self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"])
+ self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"])
+
+ mds_id = self.fs.get_active_names()[0]
+ self.fs.mds_asok(["flush", "journal"], mds_id)
+
+ dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid)
+
+ # introduce duplicated primary link
+ file1_key = "file1_head"
+ self.assertIn(file1_key, dirfrag1_keys)
+ file1_omap_data = self.fs.rados(["getomapval", dirfrag1_oid, file1_key, '-'])
+ self.fs.rados(["setomapval", dirfrag2_oid, file1_key], stdin_data=file1_omap_data)
+ self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
+
+ # remove a remote link, make inode link count incorrect
+ link1_key = 'link1_head'
+ self.assertIn(link1_key, dirfrag1_keys)
+ self.fs.rados(["rmomapkey", dirfrag1_oid, link1_key])
+
+ # increase good primary link's version
+ self.mount_a.run_shell(["touch", "testdir1/file1"])
+ self.mount_a.umount_wait()
+
+ self.fs.mds_asok(["flush", "journal"], mds_id)
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # repair linkage errors
+ self.fs.data_scan(["scan_links"])
+
+ # primary link in testdir2 was deleted?
+ self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
+
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # link count was adjusted?
+ file1_nlink = self.mount_a.path_to_nlink("testdir1/file1")
+ self.assertEqual(file1_nlink, 2)
--- /dev/null
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import random
+import os
+
+class TestDumpTree(CephFSTestCase):
+ def get_paths_to_ino(self):
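+ """
+ Walk the mount with `find` and return a mapping of path to inode number
+ """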
+ inos = {}
+ p = self.mount_a.run_shell(["find", "./"])
+ paths = p.stdout.getvalue().strip().split()
+ for path in paths:
+ inos[path] = self.mount_a.path_to_ino(path, False)
+
+ return inos
+
+ def populate(self):
+ self.mount_a.run_shell(["git", "clone",
+ "https://github.com/ceph/ceph-qa-suite"])
+
+ def test_basic(self):
+ self.mount_a.run_shell(["mkdir", "parent"])
+ self.mount_a.run_shell(["mkdir", "parent/child"])
+ self.mount_a.run_shell(["touch", "parent/child/file"])
+ self.mount_a.run_shell(["mkdir", "parent/child/grandchild"])
+ self.mount_a.run_shell(["touch", "parent/child/grandchild/file"])
+
+ inos = self.get_paths_to_ino()
+ tree = self.fs.mds_asok(["dump", "tree", "/parent/child", "1"])
+
+ target_inos = [inos["./parent/child"], inos["./parent/child/file"],
+ inos["./parent/child/grandchild"]]
+
+ for ino in tree:
+ del target_inos[target_inos.index(ino['ino'])] # don't catch!
+
+ assert(len(target_inos) == 0)
+
+ def test_random(self):
+ random.seed(0)
+
+ self.populate()
+ inos = self.get_paths_to_ino()
+ target = random.choice(inos.keys())
+
+ if target != "./":
+ target = os.path.dirname(target)
+
+ subtree = [path for path in inos.keys() if path.startswith(target)]
+ target_inos = [inos[path] for path in subtree]
+ tree = self.fs.mds_asok(["dump", "tree", target[1:]])
+
+ for ino in tree:
+ del target_inos[target_inos.index(ino['ino'])] # don't catch!
+
+ assert(len(target_inos) == 0)
+
+ target_depth = target.count('/')
+ maxdepth = max([path.count('/') for path in subtree]) - target_depth
+ depth = random.randint(0, maxdepth)
+ target_inos = [inos[path] for path in subtree \
+ if path.count('/') <= depth + target_depth]
+ tree = self.fs.mds_asok(["dump", "tree", target[1:], str(depth)])
+
+ for ino in tree:
+ del target_inos[target_inos.index(ino['ino'])] # don't catch!
+
+ assert(len(target_inos) == 0)
--- /dev/null
+import json
+import logging
+from unittest import case, SkipTest
+
+from cephfs_test_case import CephFSTestCase
+from teuthology.exceptions import CommandFailedError
+from tasks.ceph_manager import CephManager
+from teuthology import misc as teuthology
+from tasks.cephfs.fuse_mount import FuseMount
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(CephFSTestCase):
+ CLIENTS_REQUIRED = 1
+ MDSS_REQUIRED = 2
+
+ def test_simple(self):
+ """
+ That when the active MDS is killed, a standby MDS is promoted into
+ its rank after the grace period.
+
+ This is just a simple unit test; the harder cases are covered
+ in thrashing tests.
+ """
+
+ # Need all my standbys up as well as the active daemons
+ self.wait_for_daemon_start()
+
+ (original_active, ) = self.fs.get_active_names()
+ original_standbys = self.mds_cluster.get_standby_daemons()
+
+ # Kill the rank 0 daemon's physical process
+ self.fs.mds_stop(original_active)
+
+ grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+ # Wait until the monitor promotes his replacement
+ def promoted():
+ active = self.fs.get_active_names()
+ return active and active[0] in original_standbys
+
+ log.info("Waiting for promotion of one of the original standbys {0}".format(
+ original_standbys))
+ self.wait_until_true(
+ promoted,
+ timeout=grace*2)
+
+ # Start the original rank 0 daemon up again, see that he becomes a standby
+ self.fs.mds_restart(original_active)
+ self.wait_until_true(
+ lambda: original_active in self.mds_cluster.get_standby_daemons(),
+ timeout=60 # Approximately long enough for MDS to start and mon to notice
+ )
+
+ def test_client_abort(self):
+ """
+ That a client will respect fuse_require_active_mds and error out
+ when the cluster appears to be unavailable.
+ """
+
+ if not isinstance(self.mount_a, FuseMount):
+ raise SkipTest("Requires FUSE client to inject client metadata")
+
+ require_active = self.fs.get_config("fuse_require_active_mds", service_type="mon").lower() == "true"
+ if not require_active:
+ raise case.SkipTest("fuse_require_active_mds is not set")
+
+ grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+ # Check it's not laggy to begin with
+ (original_active, ) = self.fs.get_active_names()
+ self.assertNotIn("laggy_since", self.fs.mon_manager.get_mds_status(original_active))
+
+ self.mounts[0].umount_wait()
+
+ # Control: check that we can mount and unmount normally while the cluster is healthy
+ self.mounts[0].mount()
+ self.mounts[0].wait_until_mounted()
+ self.mounts[0].umount_wait()
+
+ # Stop the daemon processes
+ self.fs.mds_stop()
+
+ # Wait for everyone to go laggy
+ def laggy():
+ mdsmap = self.fs.get_mds_map()
+ for info in mdsmap['info'].values():
+ if "laggy_since" not in info:
+ return False
+
+ return True
+
+ self.wait_until_true(laggy, grace * 2)
+ with self.assertRaises(CommandFailedError):
+ self.mounts[0].mount()
+
+
+class TestStandbyReplay(CephFSTestCase):
+ MDSS_REQUIRED = 4
+ REQUIRE_FILESYSTEM = False
+
+ def set_standby_for(self, leader, follower, replay):
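+ """
+ Configure `follower` to be a standby for `leader`, optionally in standby-replay mode
+ """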
+ self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
+ if replay:
+ self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")
+
+ def get_info_by_name(self, mds_name):
+ status = self.mds_cluster.status()
+ info = status.get_mds(mds_name)
+ if info is None:
+ log.warn(str(status))
+ raise RuntimeError("MDS '{0}' not found".format(mds_name))
+ else:
+ return info
+
+ def test_standby_replay_unused(self):
+ # Pick out exactly 3 daemons to be run during test
+ use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
+ mds_a, mds_b, mds_c = use_daemons
+ log.info("Using MDS daemons: {0}".format(use_daemons))
+
+ # B and C should both follow A, but only one will
+ # really get into standby replay state.
+ self.set_standby_for(mds_a, mds_b, True)
+ self.set_standby_for(mds_a, mds_c, True)
+
+ # Create FS and start A
+ fs_a = self.mds_cluster.newfs("alpha")
+ self.mds_cluster.mds_restart(mds_a)
+ fs_a.wait_for_daemons()
+ self.assertEqual(fs_a.get_active_names(), [mds_a])
+
+ # Start B, he should go into standby replay
+ self.mds_cluster.mds_restart(mds_b)
+ self.wait_for_daemon_start([mds_b])
+ info_b = self.get_info_by_name(mds_b)
+ self.assertEqual(info_b['state'], "up:standby-replay")
+ self.assertEqual(info_b['standby_for_name'], mds_a)
+ self.assertEqual(info_b['rank'], 0)
+
+ # Start C, he should go into standby (*not* replay)
+ self.mds_cluster.mds_restart(mds_c)
+ self.wait_for_daemon_start([mds_c])
+ info_c = self.get_info_by_name(mds_c)
+ self.assertEqual(info_c['state'], "up:standby")
+ self.assertEqual(info_c['standby_for_name'], mds_a)
+ self.assertEqual(info_c['rank'], -1)
+
+ # Kill B, C should go into standby replay
+ self.mds_cluster.mds_stop(mds_b)
+ self.mds_cluster.mds_fail(mds_b)
+ self.wait_until_equal(
+ lambda: self.get_info_by_name(mds_c)['state'],
+ "up:standby-replay",
+ 60)
+ info_c = self.get_info_by_name(mds_c)
+ self.assertEqual(info_c['state'], "up:standby-replay")
+ self.assertEqual(info_c['standby_for_name'], mds_a)
+ self.assertEqual(info_c['rank'], 0)
+
+ def test_standby_failure(self):
+ """
+ That the failure of a standby-replay daemon happens cleanly
+ and doesn't interrupt anything else.
+ """
+ # Pick out exactly 2 daemons to be run during test
+ use_daemons = sorted(self.mds_cluster.mds_ids[0:2])
+ mds_a, mds_b = use_daemons
+ log.info("Using MDS daemons: {0}".format(use_daemons))
+
+ # Configure a pair of MDSs that are standby for each other
+ self.set_standby_for(mds_a, mds_b, True)
+ self.set_standby_for(mds_b, mds_a, False)
+
+ # Create FS alpha and get mds_a to come up as active
+ fs_a = self.mds_cluster.newfs("alpha")
+ self.mds_cluster.mds_restart(mds_a)
+ fs_a.wait_for_daemons()
+ self.assertEqual(fs_a.get_active_names(), [mds_a])
+
+ # Start the standbys
+ self.mds_cluster.mds_restart(mds_b)
+ self.wait_for_daemon_start([mds_b])
+
+ # See the standby come up as the correct rank
+ info_b = self.get_info_by_name(mds_b)
+ self.assertEqual(info_b['state'], "up:standby-replay")
+ self.assertEqual(info_b['standby_for_name'], mds_a)
+ self.assertEqual(info_b['rank'], 0)
+
+ # Kill the standby
+ self.mds_cluster.mds_stop(mds_b)
+ self.mds_cluster.mds_fail(mds_b)
+
+ # See that the standby is gone and the active remains
+ self.assertEqual(fs_a.get_active_names(), [mds_a])
+ mds_map = fs_a.get_mds_map()
+ self.assertEqual(len(mds_map['info']), 1)
+ self.assertEqual(mds_map['failed'], [])
+ self.assertEqual(mds_map['damaged'], [])
+ self.assertEqual(mds_map['stopped'], [])
+
+ def test_rank_stopped(self):
+ """
+ That when a rank is STOPPED, standby replays for
+ that rank get torn down
+ """
+ # Pick out exactly 4 daemons to be run during test
+ use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+ mds_a, mds_b, mds_a_s, mds_b_s = use_daemons
+ log.info("Using MDS daemons: {0}".format(use_daemons))
+
+ # a and b both get a standby
+ self.set_standby_for(mds_a, mds_a_s, True)
+ self.set_standby_for(mds_b, mds_b_s, True)
+
+ # Create FS alpha and get mds_a to come up as active
+ fs_a = self.mds_cluster.newfs("alpha")
+ fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name,
+ 'allow_multimds', "true",
+ "--yes-i-really-mean-it")
+ fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name, 'max_mds', "2")
+
+ self.mds_cluster.mds_restart(mds_a)
+ self.wait_until_equal(lambda: fs_a.get_active_names(), [mds_a], 30)
+ self.mds_cluster.mds_restart(mds_b)
+ fs_a.wait_for_daemons()
+ self.assertEqual(sorted(fs_a.get_active_names()), [mds_a, mds_b])
+
+ # Start the standbys
+ self.mds_cluster.mds_restart(mds_b_s)
+ self.wait_for_daemon_start([mds_b_s])
+ self.mds_cluster.mds_restart(mds_a_s)
+ self.wait_for_daemon_start([mds_a_s])
+ info_b_s = self.get_info_by_name(mds_b_s)
+ self.assertEqual(info_b_s['state'], "up:standby-replay")
+ info_a_s = self.get_info_by_name(mds_a_s)
+ self.assertEqual(info_a_s['state'], "up:standby-replay")
+
+ # Shrink the cluster
+ fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name, 'max_mds', "1")
+ fs_a.mon_manager.raw_cluster_cmd("mds", "stop", "{0}:1".format(fs_a.name))
+ self.wait_until_equal(
+ lambda: fs_a.get_active_names(), [mds_a],
+ 60
+ )
+
+ # Both 'b' and 'b_s' should go back to being standbys
+ self.wait_until_equal(
+ lambda: self.mds_cluster.get_standby_daemons(), {mds_b, mds_b_s},
+ 60
+ )
+
+
+class TestMultiFilesystems(CephFSTestCase):
+ CLIENTS_REQUIRED = 2
+ MDSS_REQUIRED = 4
+
+ # We'll create our own filesystems and start our own daemons
+ REQUIRE_FILESYSTEM = False
+
+ def setUp(self):
+ super(TestMultiFilesystems, self).setUp()
+ self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
+ "enable_multiple", "true",
+ "--yes-i-really-mean-it")
+
+ def _setup_two(self):
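+ """
+ Create two filesystems (alpha and bravo), wait for their daemons, and grant
+ the test clients access to both data pools
+ """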
+ fs_a = self.mds_cluster.newfs("alpha")
+ fs_b = self.mds_cluster.newfs("bravo")
+
+ self.mds_cluster.mds_restart()
+
+ # Wait for both filesystems to go healthy
+ fs_a.wait_for_daemons()
+ fs_b.wait_for_daemons()
+
+ # Reconfigure client auth caps
+ for mount in self.mounts:
+ self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+ 'auth', 'caps', "client.{0}".format(mount.client_id),
+ 'mds', 'allow',
+ 'mon', 'allow r',
+ 'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+ fs_a.get_data_pool_name(), fs_b.get_data_pool_name()))
+
+ return fs_a, fs_b
+
+ def test_clients(self):
+ fs_a, fs_b = self._setup_two()
+
+ # Mount a client on fs_a
+ self.mount_a.mount(mount_fs_name=fs_a.name)
+ self.mount_a.write_n_mb("pad.bin", 1)
+ self.mount_a.write_n_mb("test.bin", 2)
+ a_created_ino = self.mount_a.path_to_ino("test.bin")
+ self.mount_a.create_files()
+
+ # Mount a client on fs_b
+ self.mount_b.mount(mount_fs_name=fs_b.name)
+ self.mount_b.write_n_mb("test.bin", 1)
+ b_created_ino = self.mount_b.path_to_ino("test.bin")
+ self.mount_b.create_files()
+
+ # Check that a non-default filesystem mount survives an MDS
+ # failover (i.e. that map subscription is continuous, not
+ # just the first time), reproduces #16022
+ old_fs_b_mds = fs_b.get_active_names()[0]
+ self.mds_cluster.mds_stop(old_fs_b_mds)
+ self.mds_cluster.mds_fail(old_fs_b_mds)
+ fs_b.wait_for_daemons()
+ background = self.mount_b.write_background()
+ # Raise exception if the write doesn't finish (i.e. if client
+ # has not kept up with MDS failure)
+ try:
+ self.wait_until_true(lambda: background.finished, timeout=30)
+ except RuntimeError:
+ # The mount is stuck, we'll have to force it to fail cleanly
+ background.stdin.close()
+ self.mount_b.umount_wait(force=True)
+ raise
+
+ self.mount_a.umount_wait()
+ self.mount_b.umount_wait()
+
+ # See that the client's files went into the correct pool
+ self.assertTrue(fs_a.data_objects_present(a_created_ino, 1024 * 1024))
+ self.assertTrue(fs_b.data_objects_present(b_created_ino, 1024 * 1024))
+
+ def test_standby(self):
+ fs_a, fs_b = self._setup_two()
+
+ # Assert that the remaining two MDS daemons are now standbys
+ a_daemons = fs_a.get_active_names()
+ b_daemons = fs_b.get_active_names()
+ self.assertEqual(len(a_daemons), 1)
+ self.assertEqual(len(b_daemons), 1)
+ original_a = a_daemons[0]
+ original_b = b_daemons[0]
+ expect_standby_daemons = set(self.mds_cluster.mds_ids) - (set(a_daemons) | set(b_daemons))
+
+ # Need all my standbys up as well as the active daemons
+ self.wait_for_daemon_start()
+ self.assertEqual(expect_standby_daemons, self.mds_cluster.get_standby_daemons())
+
+ # Kill fs_a's active MDS, see a standby take over
+ self.mds_cluster.mds_stop(original_a)
+ self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_a)
+ self.wait_until_equal(lambda: len(fs_a.get_active_names()), 1, 30,
+ reject_fn=lambda v: v > 1)
+ # Assert that it's a *different* daemon that has now appeared in the map for fs_a
+ self.assertNotEqual(fs_a.get_active_names()[0], original_a)
+
+ # Kill fs_b's active MDS, see a standby take over
+ self.mds_cluster.mds_stop(original_b)
+ self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_b)
+ self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
+ reject_fn=lambda v: v > 1)
+ # Assert that it's a *different* daemon that has now appeared in the map for fs_b
+ self.assertNotEqual(fs_b.get_active_names()[0], original_b)
+
+ # Both of the original active daemons should be gone, and all standbys used up
+ self.assertEqual(self.mds_cluster.get_standby_daemons(), set())
+
+ # Restart the ones I killed, see them reappear as standbys
+ self.mds_cluster.mds_restart(original_a)
+ self.mds_cluster.mds_restart(original_b)
+ self.wait_until_true(
+ lambda: {original_a, original_b} == self.mds_cluster.get_standby_daemons(),
+ timeout=30
+ )
+
+ def test_grow_shrink(self):
+ # Usual setup...
+ fs_a, fs_b = self._setup_two()
+ fs_a.mon_manager.raw_cluster_cmd("fs", "set", fs_a.name,
+ "allow_multimds", "true",
+ "--yes-i-really-mean-it")
+
+ fs_b.mon_manager.raw_cluster_cmd("fs", "set", fs_b.name,
+ "allow_multimds", "true",
+ "--yes-i-really-mean-it")
+
+ # Increase max_mds on fs_b, see a standby take up the role
+ fs_b.mon_manager.raw_cluster_cmd('fs', 'set', fs_b.name, 'max_mds', "2")
+ self.wait_until_equal(lambda: len(fs_b.get_active_names()), 2, 30,
+ reject_fn=lambda v: v > 2 or v < 1)
+
+ # Increase max_mds on fs_a, see a standby take up the role
+ fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name, 'max_mds', "2")
+ self.wait_until_equal(lambda: len(fs_a.get_active_names()), 2, 30,
+ reject_fn=lambda v: v > 2 or v < 1)
+
+ # Shrink fs_b back to 1, see a daemon go back to standby
+ fs_b.mon_manager.raw_cluster_cmd('fs', 'set', fs_b.name, 'max_mds', "1")
+ fs_b.mon_manager.raw_cluster_cmd('mds', 'deactivate', "{0}:1".format(fs_b.name))
+ self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
+ reject_fn=lambda v: v > 2 or v < 1)
+
+ # Grow fs_a up to 3, see the former fs_b daemon join it.
+ fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name, 'max_mds', "3")
+ self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60,
+ reject_fn=lambda v: v > 3 or v < 2)
+
+ def test_standby_for_name(self):
+ # Pick out exactly 4 daemons to be run during test
+ use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+ mds_a, mds_b, mds_c, mds_d = use_daemons
+ log.info("Using MDS daemons: {0}".format(use_daemons))
+
+ def set_standby_for(leader, follower, replay):
+ self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
+ if replay:
+ self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")
+
+ # Configure two pairs of MDSs that are standby for each other
+ set_standby_for(mds_a, mds_b, True)
+ set_standby_for(mds_b, mds_a, False)
+ set_standby_for(mds_c, mds_d, True)
+ set_standby_for(mds_d, mds_c, False)
+
+ # Create FS alpha and get mds_a to come up as active
+ fs_a = self.mds_cluster.newfs("alpha")
+ self.mds_cluster.mds_restart(mds_a)
+ fs_a.wait_for_daemons()
+ self.assertEqual(fs_a.get_active_names(), [mds_a])
+
+ # Create FS bravo and get mds_c to come up as active
+ fs_b = self.mds_cluster.newfs("bravo")
+ self.mds_cluster.mds_restart(mds_c)
+ fs_b.wait_for_daemons()
+ self.assertEqual(fs_b.get_active_names(), [mds_c])
+
+ # Start the standbys
+ self.mds_cluster.mds_restart(mds_b)
+ self.mds_cluster.mds_restart(mds_d)
+ self.wait_for_daemon_start([mds_b, mds_d])
+
+ def get_info_by_name(fs, mds_name):
+ mds_map = fs.get_mds_map()
+ for gid_str, info in mds_map['info'].items():
+ if info['name'] == mds_name:
+ return info
+
+ log.warn(json.dumps(mds_map, indent=2))
+ raise RuntimeError("MDS '{0}' not found in filesystem MDSMap".format(mds_name))
+
+ # See both standbys come up as standby replay for the correct ranks
+ # mds_b should be in filesystem alpha following mds_a
+ info_b = get_info_by_name(fs_a, mds_b)
+ self.assertEqual(info_b['state'], "up:standby-replay")
+ self.assertEqual(info_b['standby_for_name'], mds_a)
+ self.assertEqual(info_b['rank'], 0)
+ # mds_d should be in filesystem bravo following mds_c
+ info_d = get_info_by_name(fs_b, mds_d)
+ self.assertEqual(info_d['state'], "up:standby-replay")
+ self.assertEqual(info_d['standby_for_name'], mds_c)
+ self.assertEqual(info_d['rank'], 0)
+
+ # Kill both active daemons
+ self.mds_cluster.mds_stop(mds_a)
+ self.mds_cluster.mds_fail(mds_a)
+ self.mds_cluster.mds_stop(mds_c)
+ self.mds_cluster.mds_fail(mds_c)
+
+ # Wait for standbys to take over
+ fs_a.wait_for_daemons()
+ self.assertEqual(fs_a.get_active_names(), [mds_b])
+ fs_b.wait_for_daemons()
+ self.assertEqual(fs_b.get_active_names(), [mds_d])
+
+ # Start the original active daemons up again
+ self.mds_cluster.mds_restart(mds_a)
+ self.mds_cluster.mds_restart(mds_c)
+ self.wait_for_daemon_start([mds_a, mds_c])
+
+ self.assertEqual(set(self.mds_cluster.get_standby_daemons()),
+ {mds_a, mds_c})
+
+ def test_standby_for_rank(self):
+ use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+ mds_a, mds_b, mds_c, mds_d = use_daemons
+ log.info("Using MDS daemons: {0}".format(use_daemons))
+
+ def set_standby_for(leader_rank, leader_fs, follower_id):
+ self.set_conf("mds.{0}".format(follower_id),
+ "mds_standby_for_rank", leader_rank)
+
+ fscid = leader_fs.get_namespace_id()
+ self.set_conf("mds.{0}".format(follower_id),
+ "mds_standby_for_fscid", fscid)
+
+ fs_a = self.mds_cluster.newfs("alpha")
+ fs_b = self.mds_cluster.newfs("bravo")
+ set_standby_for(0, fs_a, mds_a)
+ set_standby_for(0, fs_a, mds_b)
+ set_standby_for(0, fs_b, mds_c)
+ set_standby_for(0, fs_b, mds_d)
+
+ self.mds_cluster.mds_restart(mds_a)
+ fs_a.wait_for_daemons()
+ self.assertEqual(fs_a.get_active_names(), [mds_a])
+
+ self.mds_cluster.mds_restart(mds_c)
+ fs_b.wait_for_daemons()
+ self.assertEqual(fs_b.get_active_names(), [mds_c])
+
+ self.mds_cluster.mds_restart(mds_b)
+ self.mds_cluster.mds_restart(mds_d)
+ self.wait_for_daemon_start([mds_b, mds_d])
+
+ self.mds_cluster.mds_stop(mds_a)
+ self.mds_cluster.mds_fail(mds_a)
+ self.mds_cluster.mds_stop(mds_c)
+ self.mds_cluster.mds_fail(mds_c)
+
+ fs_a.wait_for_daemons()
+ self.assertEqual(fs_a.get_active_names(), [mds_b])
+ fs_b.wait_for_daemons()
+ self.assertEqual(fs_b.get_active_names(), [mds_d])
+
+ def test_standby_for_fscid(self):
+ """
+ That I can set a standby FSCID with no rank, and the result is
+ that daemons join any rank for that filesystem.
+ """
+ use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+ mds_a, mds_b, mds_c, mds_d = use_daemons
+
+ log.info("Using MDS daemons: {0}".format(use_daemons))
+
+ def set_standby_for(leader_fs, follower_id):
+ fscid = leader_fs.get_namespace_id()
+ self.set_conf("mds.{0}".format(follower_id),
+ "mds_standby_for_fscid", fscid)
+
+ # Create two filesystems which should have two ranks each
+ fs_a = self.mds_cluster.newfs("alpha")
+ fs_a.mon_manager.raw_cluster_cmd("fs", "set", fs_a.name,
+ "allow_multimds", "true",
+ "--yes-i-really-mean-it")
+
+ fs_b = self.mds_cluster.newfs("bravo")
+ fs_b.mon_manager.raw_cluster_cmd("fs", "set", fs_b.name,
+ "allow_multimds", "true",
+ "--yes-i-really-mean-it")
+
+ fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name,
+ 'max_mds', "2")
+ fs_b.mon_manager.raw_cluster_cmd('fs', 'set', fs_b.name,
+ 'max_mds', "2")
+
+ # Set all the daemons to have a FSCID assignment but no other
+ # standby preferences.
+ set_standby_for(fs_a, mds_a)
+ set_standby_for(fs_a, mds_b)
+ set_standby_for(fs_b, mds_c)
+ set_standby_for(fs_b, mds_d)
+
+ # Now when we start all daemons at once, they should fall into
+ # ranks in the right filesystem
+ self.mds_cluster.mds_restart(mds_a)
+ self.mds_cluster.mds_restart(mds_b)
+ self.mds_cluster.mds_restart(mds_c)
+ self.mds_cluster.mds_restart(mds_d)
+ self.wait_for_daemon_start([mds_a, mds_b, mds_c, mds_d])
+ fs_a.wait_for_daemons()
+ fs_b.wait_for_daemons()
+ self.assertEqual(set(fs_a.get_active_names()), {mds_a, mds_b})
+ self.assertEqual(set(fs_b.get_active_names()), {mds_c, mds_d})
+
+ def test_standby_for_invalid_fscid(self):
+ # Set an invalid standby_for_fscid on one MDS while the others use standby_for_rank;
+ # stopping the active MDS service should not end up crashing the mons
+
+ # Get configured mons in the cluster
+ first_mon = teuthology.get_first_mon(self.ctx, self.configs_set)
+ (mon,) = self.ctx.cluster.only(first_mon).remotes.iterkeys()
+ manager = CephManager(
+ mon,
+ ctx=self.ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+ configured_mons = manager.get_mon_quorum()
+
+ use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
+ mds_a, mds_b, mds_c = use_daemons
+ log.info("Using MDS daemons: {0}".format(use_daemons))
+
+ def set_standby_for_rank(leader_rank, follower_id):
+ self.set_conf("mds.{0}".format(follower_id),
+ "mds_standby_for_rank", leader_rank)
+
+ # Create one fs
+ fs_a = self.mds_cluster.newfs("cephfs")
+
+ # Set all the daemons to have a rank assignment but no other
+ # standby preferences.
+ set_standby_for_rank(0, mds_a)
+ set_standby_for_rank(0, mds_b)
+
+ # Set third daemon to have invalid fscid assignment and no other
+ # standby preferences
+ invalid_fscid = 123
+ self.set_conf("mds.{0}".format(mds_c), "mds_standby_for_fscid", invalid_fscid)
+
+ # Restart all the daemons so that the standby preferences are applied
+ self.mds_cluster.mds_restart(mds_a)
+ self.mds_cluster.mds_restart(mds_b)
+ self.mds_cluster.mds_restart(mds_c)
+ self.wait_for_daemon_start([mds_a, mds_b, mds_c])
+
+ # Stop the active MDS daemon service of the fs
+ if fs_a.get_active_names() == [mds_a]:
+ self.mds_cluster.mds_stop(mds_a)
+ self.mds_cluster.mds_fail(mds_a)
+ fs_a.wait_for_daemons()
+ else:
+ self.mds_cluster.mds_stop(mds_b)
+ self.mds_cluster.mds_fail(mds_b)
+ fs_a.wait_for_daemons()
+
+ # Get active mons from cluster
+ active_mons = manager.get_mon_quorum()
+
+ # Check that the active quorum mons match the configured mons
+ self.assertEqual(active_mons, configured_mons, "Not all mons are in quorum; invalid standby fscid test failed!")
--- /dev/null
+
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+
+
+class TestFlush(CephFSTestCase):
+ def test_flush(self):
+ self.mount_a.run_shell(["mkdir", "mydir"])
+ self.mount_a.run_shell(["touch", "mydir/alpha"])
+ dir_ino = self.mount_a.path_to_ino("mydir")
+ file_ino = self.mount_a.path_to_ino("mydir/alpha")
+
+ # Unmount the client so that it isn't still holding caps
+ self.mount_a.umount_wait()
+
+ # Before flush, the dirfrag object does not exist
+ with self.assertRaises(ObjectNotFound):
+ self.fs.list_dirfrag(dir_ino)
+
+ # Before flush, the file's backtrace has not been written
+ with self.assertRaises(ObjectNotFound):
+ self.fs.read_backtrace(file_ino)
+
+ # Before flush, there are no dentries in the root
+ self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+ # Execute flush
+ flush_data = self.fs.mds_asok(["flush", "journal"])
+ self.assertEqual(flush_data['return_code'], 0)
+
+ # After flush, the dirfrag object has been created
+ dir_list = self.fs.list_dirfrag(dir_ino)
+ self.assertEqual(dir_list, ["alpha_head"])
+
+ # And the 'mydir' dentry is in the root
+ self.assertEqual(self.fs.list_dirfrag(ROOT_INO), ['mydir_head'])
+
+ # ...and the data object has its backtrace
+ backtrace = self.fs.read_backtrace(file_ino)
+ self.assertEqual(['alpha', 'mydir'], [a['dname'] for a in backtrace['ancestors']])
+ self.assertEqual([dir_ino, 1], [a['dirino'] for a in backtrace['ancestors']])
+ self.assertEqual(file_ino, backtrace['ino'])
+
+ # ...and the journal is truncated to just a single subtreemap from the
+ # newly created segment
+ summary_output = self.fs.journal_tool(["event", "get", "summary"])
+ try:
+ self.assertEqual(summary_output,
+ dedent(
+ """
+ Events by type:
+ SUBTREEMAP: 1
+ Errors: 0
+ """
+ ).strip())
+ except AssertionError:
+ # In some states, flushing the journal will leave you
+ # an extra event from locks a client held. This is
+ # correct behaviour: the MDS is flushing the journal,
+ # it's just that new events are getting added too.
+ # In this case, we should nevertheless see a fully
+ # empty journal after a second flush.
+ self.assertEqual(summary_output,
+ dedent(
+ """
+ Events by type:
+ SUBTREEMAP: 1
+ UPDATE: 1
+ Errors: 0
+ """
+ ).strip())
+ flush_data = self.fs.mds_asok(["flush", "journal"])
+ self.assertEqual(flush_data['return_code'], 0)
+ self.assertEqual(self.fs.journal_tool(["event", "get", "summary"]),
+ dedent(
+ """
+ Events by type:
+ SUBTREEMAP: 1
+ Errors: 0
+ """
+ ).strip())
+
+ # Now for deletion!
+ # We will count the RADOS deletions and MDS file purges, to verify that
+ # the expected behaviour is happening as a result of the purge
+ initial_dels = self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete']
+ initial_purges = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_purged']
+
+ # Use a client to delete a file
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ self.mount_a.run_shell(["rm", "-rf", "mydir"])
+
+ # Flush the journal so that the directory inode can be purged
+ flush_data = self.fs.mds_asok(["flush", "journal"])
+ self.assertEqual(flush_data['return_code'], 0)
+
+ # We expect to see the file and the directory purged (two strays)
+ self.wait_until_true(
+ lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_purged'] - initial_purges >= 2,
+ 60)
+
+ # We expect two deletions, one of the dirfrag and one of the backtrace
+ self.wait_until_true(
+ lambda: self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete'] - initial_dels >= 2,
+ 60) # timeout is fairly long to allow for tick+rados latencies
+
+ with self.assertRaises(ObjectNotFound):
+ self.fs.list_dirfrag(dir_ino)
+ with self.assertRaises(ObjectNotFound):
+ self.fs.read_backtrace(file_ino)
+ self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
--- /dev/null
+
+"""
+Test that the forward scrub functionality can traverse metadata and apply
+requested tags, on well formed metadata.
+
+This is *not* the real testing for forward scrub, which will need to test
+how the functionality responds to damaged metadata.
+
+"""
+import json
+
+import logging
+from collections import namedtuple
+from textwrap import dedent
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+import struct
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class TestForwardScrub(CephFSTestCase):
+ MDSS_REQUIRED = 1
+
+ def _read_str_xattr(self, pool, obj, attr):
+ """
+ Read a ceph-encoded string from a rados xattr
+ """
+ output = self.fs.rados(["getxattr", obj, attr], pool=pool)
+ strlen = struct.unpack('i', output[0:4])[0]
+ return output[4:(4 + strlen)]
+
+ def _get_paths_to_ino(self):
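+ """
+ Walk the mount with `find` and return a mapping of path to inode number
+ """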
+ inos = {}
+ p = self.mount_a.run_shell(["find", "./"])
+ paths = p.stdout.getvalue().strip().split()
+ for path in paths:
+ inos[path] = self.mount_a.path_to_ino(path)
+
+ return inos
+
+ def test_apply_tag(self):
+ self.mount_a.run_shell(["mkdir", "parentdir"])
+ self.mount_a.run_shell(["mkdir", "parentdir/childdir"])
+ self.mount_a.run_shell(["touch", "rfile"])
+ self.mount_a.run_shell(["touch", "parentdir/pfile"])
+ self.mount_a.run_shell(["touch", "parentdir/childdir/cfile"])
+
+ # Build a structure mapping path to inode, as we will later want
+ # to check object by object and objects are named after ino number
+ inos = self._get_paths_to_ino()
+
+ # Flush metadata: this is a friendly test of forward scrub so we're skipping
+ # the part where it's meant to cope with dirty metadata
+ self.mount_a.umount_wait()
+ self.fs.mds_asok(["flush", "journal"])
+
+ tag = "mytag"
+
+ # Execute tagging forward scrub
+ self.fs.mds_asok(["tag", "path", "/parentdir", tag])
+ # Wait for completion
+ import time
+ time.sleep(10)
+ # FIXME watching clog isn't a nice mechanism for this, once we have a ScrubMap we'll
+ # watch that instead
+
+ # Check that dirs were tagged
+ for dirpath in ["./parentdir", "./parentdir/childdir"]:
+ self.assertTagged(inos[dirpath], tag, self.fs.get_metadata_pool_name())
+
+ # Check that files were tagged
+ for filepath in ["./parentdir/pfile", "./parentdir/childdir/cfile"]:
+ self.assertTagged(inos[filepath], tag, self.fs.get_data_pool_name())
+
+ # This guy wasn't in the tag path, shouldn't have been tagged
+ self.assertUntagged(inos["./rfile"])
+
+ def assertUntagged(self, ino):
+ file_obj_name = "{0:x}.00000000".format(ino)
+ with self.assertRaises(CommandFailedError):
+ self._read_str_xattr(
+ self.fs.get_data_pool_name(),
+ file_obj_name,
+ "scrub_tag"
+ )
+
+ def assertTagged(self, ino, tag, pool):
+ file_obj_name = "{0:x}.00000000".format(ino)
+ wrote = self._read_str_xattr(
+ pool,
+ file_obj_name,
+ "scrub_tag"
+ )
+ self.assertEqual(wrote, tag)
+
+ def _validate_linkage(self, expected):
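+ """
+ Assert that the mount's current path-to-inode mapping matches `expected`,
+ logging both mappings on mismatch
+ """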
+ inos = self._get_paths_to_ino()
+ try:
+ self.assertDictEqual(inos, expected)
+ except AssertionError:
+ log.error("Expected: {0}".format(json.dumps(expected, indent=2)))
+ log.error("Actual: {0}".format(json.dumps(inos, indent=2)))
+ raise
+
+ def test_orphan_scan(self):
+ # Create some files whose metadata we will flush
+ self.mount_a.run_python(dedent("""
+ import os
+ mount_point = "{mount_point}"
+ parent = os.path.join(mount_point, "parent")
+ os.mkdir(parent)
+ flushed = os.path.join(parent, "flushed")
+ os.mkdir(flushed)
+ for f in ["alpha", "bravo", "charlie"]:
+ open(os.path.join(flushed, f), 'w').write(f)
+ """.format(mount_point=self.mount_a.mountpoint)))
+
+ inos = self._get_paths_to_ino()
+
+ # Flush journal
+ # Umount before flush to avoid cap releases putting
+ # things we don't want in the journal later.
+ self.mount_a.umount_wait()
+ self.fs.mds_asok(["flush", "journal"])
+
+ # Create a new inode that's just in the log, i.e. would
+        # look orphaned to a backward scan if the backward scan were not
+        # respecting the scrub_tag xattr.
+ self.mount_a.mount()
+ self.mount_a.run_shell(["mkdir", "parent/unflushed"])
+ self.mount_a.run_shell(["dd", "if=/dev/urandom",
+ "of=./parent/unflushed/jfile",
+ "bs=1M", "count=8"])
+ inos["./parent/unflushed"] = self.mount_a.path_to_ino("./parent/unflushed")
+ inos["./parent/unflushed/jfile"] = self.mount_a.path_to_ino("./parent/unflushed/jfile")
+ self.mount_a.umount_wait()
+
+ # Orphan an inode by deleting its dentry
+ # Our victim will be.... bravo.
+ self.mount_a.umount_wait()
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+ self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+ self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+ frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
+ self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"])
+
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+
+ # See that the orphaned file is indeed missing from a client's POV
+ self.mount_a.mount()
+ damaged_state = self._get_paths_to_ino()
+ self.assertNotIn("./parent/flushed/bravo", damaged_state)
+ self.mount_a.umount_wait()
+
+ # Run a tagging forward scrub
+ tag = "mytag123"
+ self.fs.mds_asok(["tag", "path", "/parent", tag])
+
+        # See that the orphan was not tagged
+ self.assertUntagged(inos['./parent/flushed/bravo'])
+
+ # See that the flushed-metadata-and-still-present files are tagged
+ self.assertTagged(inos['./parent/flushed/alpha'], tag, self.fs.get_data_pool_name())
+ self.assertTagged(inos['./parent/flushed/charlie'], tag, self.fs.get_data_pool_name())
+
+ # See that journalled-but-not-flushed file *was* tagged
+ self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
+
+ # Run cephfs-data-scan targeting only orphans
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+ self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
+ self.fs.data_scan([
+ "scan_inodes",
+ "--filter-tag", tag,
+ self.fs.get_data_pool_name()
+ ])
+
+ # After in-place injection stats should be kosher again
+ self.fs.set_ceph_conf('mds', 'mds verify scatter', True)
+ self.fs.set_ceph_conf('mds', 'mds debug scatterstat', True)
+
+ # And we should have all the same linkage we started with,
+ # and no lost+found, and no extra inodes!
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+ self.mount_a.mount()
+ self._validate_linkage(inos)
+
+ def _stash_inotable(self):
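+        # Take a raw copy of each active rank's inotable object so that it
+        # can be restored later with put_metadata_object_raw().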
+ # Get all active ranks
+ ranks = self.fs.get_all_mds_rank()
+
+ inotable_dict = {}
+ for rank in ranks:
+ inotable_oid = "mds{rank:d}_".format(rank=rank) + "inotable"
+ print "Trying to fetch inotable object: " + inotable_oid
+
+ #self.fs.get_metadata_object("InoTable", "mds0_inotable")
+ inotable_raw = self.fs.get_metadata_object_raw(inotable_oid)
+ inotable_dict[inotable_oid] = inotable_raw
+ return inotable_dict
+
+ def test_inotable_sync(self):
+ self.mount_a.write_n_mb("file1_sixmegs", 6)
+
+ # Flush journal
+ self.mount_a.umount_wait()
+ self.fs.mds_asok(["flush", "journal"])
+
+ inotable_copy = self._stash_inotable()
+
+ self.mount_a.mount()
+
+ self.mount_a.write_n_mb("file2_sixmegs", 6)
+ self.mount_a.write_n_mb("file3_sixmegs", 6)
+
+ inos = self._get_paths_to_ino()
+
+ # Flush journal
+ self.mount_a.umount_wait()
+ self.fs.mds_asok(["flush", "journal"])
+
+ self.mount_a.umount_wait()
+
+ with self.assert_cluster_log("inode table repaired", invert_match=True):
+ self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+
+ self.mds_cluster.mds_stop()
+ self.mds_cluster.mds_fail()
+
+ # Truncate the journal (to ensure the inotable on disk
+ # is all that will be in the InoTable in memory)
+
+ self.fs.journal_tool(["event", "splice",
+ "--inode={0}".format(inos["./file2_sixmegs"]), "summary"])
+
+ self.fs.journal_tool(["event", "splice",
+ "--inode={0}".format(inos["./file3_sixmegs"]), "summary"])
+
+ # Revert to old inotable.
+ for key, value in inotable_copy.iteritems():
+ self.fs.put_metadata_object_raw(key, value)
+
+ self.mds_cluster.mds_restart()
+ self.fs.wait_for_daemons()
+
+ with self.assert_cluster_log("inode table repaired"):
+ self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+
+ self.mds_cluster.mds_stop()
+ table_text = self.fs.table_tool(["0", "show", "inode"])
+ table = json.loads(table_text)
+ self.assertGreater(
+ table['0']['data']['inotable']['free'][0]['start'],
+ inos['./file3_sixmegs'])
+
+ def test_backtrace_repair(self):
+ """
+        That the MDS can repair an inode's backtrace in the data pool
+ if it is found to be damaged.
+ """
+ # Create a file for subsequent checks
+ self.mount_a.run_shell(["mkdir", "parent_a"])
+ self.mount_a.run_shell(["touch", "parent_a/alpha"])
+ file_ino = self.mount_a.path_to_ino("parent_a/alpha")
+
+ # That backtrace and layout are written after initial flush
+ self.fs.mds_asok(["flush", "journal"])
+ backtrace = self.fs.read_backtrace(file_ino)
+ self.assertEqual(['alpha', 'parent_a'],
+ [a['dname'] for a in backtrace['ancestors']])
+
+ # Go corrupt the backtrace
+ self.fs._write_data_xattr(file_ino, "parent",
+ "oh i'm sorry did i overwrite your xattr?")
+
+ with self.assert_cluster_log("bad backtrace on inode"):
+ self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+ self.fs.mds_asok(["flush", "journal"])
+ backtrace = self.fs.read_backtrace(file_ino)
+ self.assertEqual(['alpha', 'parent_a'],
+ [a['dname'] for a in backtrace['ancestors']])
--- /dev/null
+
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra import run
+
+import logging
+log = logging.getLogger(__name__)
+
+
+class TestFragmentation(CephFSTestCase):
+ CLIENTS_REQUIRED = 1
+ MDSS_REQUIRED = 1
+
+ def get_splits(self):
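+        # Read the MDS perf counter recording how many dirfrag splits have occurred.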
+ return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_split']
+
+ def get_merges(self):
+ return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_merge']
+
+ def get_dir_ino(self, path):
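+        # Find the cached inode entry for `path` in the MDS cache dump by
+        # matching the inode number seen by the client.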
+ dir_cache = self.fs.read_cache(path, 0)
+ dir_ino = None
+ dir_inono = self.mount_a.path_to_ino(path.strip("/"))
+ for ino in dir_cache:
+ if ino['ino'] == dir_inono:
+ dir_ino = ino
+ break
+ self.assertIsNotNone(dir_ino)
+ return dir_ino
+
+ def _configure(self, **kwargs):
+ """
+ Apply kwargs as MDS configuration settings, enable dirfrags
+ and restart the MDSs.
+ """
+ kwargs['mds_bal_frag'] = "true"
+
+ for k, v in kwargs.items():
+            self.ceph_cluster.set_ceph_conf("mds", k, str(v))
+
+ self.fs.mon_manager.raw_cluster_cmd("fs", "set", self.fs.name,
+ "allow_dirfrags", "true",
+ "--yes-i-really-mean-it")
+
+ self.mds_cluster.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ def test_oversize(self):
+ """
+ That a directory is split when it becomes too large.
+ """
+
+ split_size = 20
+ merge_size = 5
+
+ self._configure(
+ mds_bal_split_size=split_size,
+ mds_bal_merge_size=merge_size,
+ mds_bal_split_bits=1
+ )
+
+ self.assertEqual(self.get_splits(), 0)
+
+ self.mount_a.create_n_files("splitdir/file", split_size + 1)
+
+ self.wait_until_true(
+ lambda: self.get_splits() == 1,
+ timeout=30
+ )
+
+ frags = self.get_dir_ino("/splitdir")['dirfrags']
+ self.assertEqual(len(frags), 2)
+ self.assertEqual(frags[0]['dirfrag'], "10000000000.0*")
+ self.assertEqual(frags[1]['dirfrag'], "10000000000.1*")
+ self.assertEqual(
+ sum([len(f['dentries']) for f in frags]),
+ split_size + 1
+ )
+
+ self.assertEqual(self.get_merges(), 0)
+
+ self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")])
+
+ self.wait_until_true(
+ lambda: self.get_merges() == 1,
+ timeout=30
+ )
+
+ self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 1)
+
+ def test_rapid_creation(self):
+ """
+        That the fast-splitting limit of 1.5x the normal split size is
+        applied when creating dentries quickly.
+ """
+
+ split_size = 100
+ merge_size = 1
+
+ self._configure(
+ mds_bal_split_size=split_size,
+ mds_bal_merge_size=merge_size,
+ mds_bal_split_bits=3,
+ mds_bal_fragment_size_max=(split_size * 1.5 + 2)
+ )
+
+ # We test this only at a single split level. If a client was sending
+ # IO so fast that it hit a second split before the first split
+ # was complete, it could violate mds_bal_fragment_size_max -- there
+ # is a window where the child dirfrags of a split are unfrozen
+ # (so they can grow), but still have STATE_FRAGMENTING (so they
+ # can't be split).
+
+        # By writing 4x the split size when the split bits are set
+        # to 3 (i.e. an 8-way split), I am reasonably sure to see precisely
+ # one split. The test is to check whether that split
+ # happens soon enough that the client doesn't exceed
+ # 2x the split_size (the "immediate" split mode should
+ # kick in at 1.5x the split size).
+
+ self.assertEqual(self.get_splits(), 0)
+ self.mount_a.create_n_files("splitdir/file", split_size * 4)
+ self.wait_until_equal(
+ self.get_splits,
+ 1,
+ reject_fn=lambda s: s > 1,
+ timeout=30
+ )
+
+ def test_deep_split(self):
+ """
+ That when the directory grows many times larger than split size,
+ the fragments get split again.
+ """
+
+ split_size = 100
+        merge_size = 1 # i.e. don't merge a frag unless it's empty
+ split_bits = 1
+
+ branch_factor = 2**split_bits
+
+ # Arbitrary: how many levels shall we try fragmenting before
+ # ending the test?
+ max_depth = 5
+
+ self._configure(
+ mds_bal_split_size=split_size,
+ mds_bal_merge_size=merge_size,
+ mds_bal_split_bits=split_bits
+ )
+
+ # Each iteration we will create another level of fragments. The
+ # placement of dentries into fragments is by hashes (i.e. pseudo
+ # random), so we rely on statistics to get the behaviour that
+ # by writing about 1.5x as many dentries as the split_size times
+ # the number of frags, we will get them all to exceed their
+ # split size and trigger a split.
+ depth = 0
+ files_written = 0
+ splits_expected = 0
+ while depth < max_depth:
+ log.info("Writing files for depth {0}".format(depth))
+ target_files = branch_factor**depth * int(split_size * 1.5)
+ create_files = target_files - files_written
+
+ self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+ "{0} Writing {1} files (depth={2})".format(
+ self.__class__.__name__, create_files, depth
+ ))
+ self.mount_a.create_n_files("splitdir/file_{0}".format(depth),
+ create_files)
+ self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+ "{0} Done".format(self.__class__.__name__))
+
+ files_written += create_files
+ log.info("Now have {0} files".format(files_written))
+
+ splits_expected += branch_factor**depth
+ log.info("Waiting to see {0} splits".format(splits_expected))
+ try:
+ self.wait_until_equal(
+ self.get_splits,
+ splits_expected,
+ timeout=30,
+ reject_fn=lambda x: x > splits_expected
+ )
+
+ frags = self.get_dir_ino("/splitdir")['dirfrags']
+ self.assertEqual(len(frags), branch_factor**(depth+1))
+ self.assertEqual(
+ sum([len(f['dentries']) for f in frags]),
+ target_files
+ )
+ except:
+ # On failures, log what fragmentation we actually ended
+ # up with. This block is just for logging, at the end
+ # we raise the exception again.
+ frags = self.get_dir_ino("/splitdir")['dirfrags']
+ log.info("depth={0} splits_expected={1} files_written={2}".format(
+ depth, splits_expected, files_written
+ ))
+ log.info("Dirfrags:")
+ for f in frags:
+ log.info("{0}: {1}".format(
+ f['dirfrag'], len(f['dentries'])
+ ))
+ raise
+
+ depth += 1
+
+ # Remember the inode number because we will be checking for
+ # objects later.
+ dir_inode_no = self.mount_a.path_to_ino("splitdir")
+
+ self.mount_a.run_shell(["rm", "-rf", "splitdir/"])
+ self.mount_a.umount_wait()
+
+ self.fs.mds_asok(['flush', 'journal'])
+
+ # Wait for all strays to purge
+ self.wait_until_equal(
+ lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache']
+ )['mds_cache']['num_strays'],
+ 0,
+ timeout=1200
+ )
+ # Check that the metadata pool objects for all the myriad
+ # child fragments are gone
+ metadata_objs = self.fs.rados(["ls"])
+ frag_objs = []
+ for o in metadata_objs:
+ if o.startswith("{0:x}.".format(dir_inode_no)):
+ frag_objs.append(o)
+ self.assertListEqual(frag_objs, [])
--- /dev/null
+
+
+import json
+import logging
+import os
+from textwrap import dedent
+import time
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class FullnessTestCase(CephFSTestCase):
+ CLIENTS_REQUIRED = 2
+
+    # Subclasses define whether they're filling the whole cluster or just the data pool
+ data_only = False
+
+ # Subclasses define how many bytes should be written to achieve fullness
+ pool_capacity = None
+ fill_mb = None
+
+ # Subclasses define what fullness means to them
+ def is_full(self):
+ raise NotImplementedError()
+
+ def setUp(self):
+ CephFSTestCase.setUp(self)
+
+ if not isinstance(self.mount_a, FuseMount):
+ self.skipTest("FUSE needed: ENOSPC handling in kclient is tracker #17204")
+
+ # These tests just use a single active MDS throughout, so remember its ID
+ # for use in mds_asok calls
+ self.active_mds_id = self.fs.get_active_names()[0]
+
+ # Capture the initial OSD map epoch for later use
+ self.initial_osd_epoch = json.loads(
+ self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
+ )['epoch']
+
+ # Check the initial barrier epoch on the MDS: this should be
+ # set to the latest map at MDS startup. We do this check in
+ # setUp to get in there before subclasses might touch things
+ # in their own setUp functions.
+ self.assertGreaterEqual(self.fs.mds_asok(["status"], mds_id=self.active_mds_id)['osdmap_epoch_barrier'],
+ self.initial_osd_epoch)
+
+ def test_barrier(self):
+ """
+ That when an OSD epoch barrier is set on an MDS, subsequently
+ issued capabilities cause clients to update their OSD map to that
+ epoch.
+ """
+
+ # Sync up clients with initial MDS OSD map barrier
+ self.mount_a.open_no_data("foo")
+ self.mount_b.open_no_data("bar")
+
+ # Grab mounts' initial OSD epochs: later we will check that
+ # it hasn't advanced beyond this point.
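+        # get_osd_epoch() returns an (epoch, barrier) tuple; only the epoch matters here.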
+ mount_a_initial_epoch = self.mount_a.get_osd_epoch()[0]
+ mount_b_initial_epoch = self.mount_b.get_osd_epoch()[0]
+
+ # Freshly mounted at start of test, should be up to date with OSD map
+ self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)
+ self.assertGreaterEqual(mount_b_initial_epoch, self.initial_osd_epoch)
+
+ # Set and unset a flag to cause OSD epoch to increment
+ self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
+ self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")
+
+ out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
+ new_epoch = json.loads(out)['epoch']
+ self.assertNotEqual(self.initial_osd_epoch, new_epoch)
+
+ # Do a metadata operation on clients, witness that they end up with
+ # the old OSD map from startup time (nothing has prompted client
+ # to update its map)
+ self.mount_a.open_no_data("alpha")
+ self.mount_b.open_no_data("bravo1")
+
+ # Sleep long enough that if the OSD map was propagating it would
+ # have done so (this is arbitrary because we are 'waiting' for something
+ # to *not* happen).
+ time.sleep(30)
+
+ mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
+ self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
+ mount_b_epoch, mount_b_barrier = self.mount_b.get_osd_epoch()
+ self.assertEqual(mount_b_epoch, mount_b_initial_epoch)
+
+ # Set a barrier on the MDS
+ self.fs.mds_asok(["osdmap", "barrier", new_epoch.__str__()], mds_id=self.active_mds_id)
+
+ # Do an operation on client B, witness that it ends up with
+ # the latest OSD map from the barrier. This shouldn't generate any
+ # cap revokes to A because B was already the last one to touch
+ # a file in root.
+ self.mount_b.run_shell(["touch", "bravo2"])
+ self.mount_b.open_no_data("bravo2")
+
+ # Some time passes here because the metadata part of the operation
+ # completes immediately, while the resulting OSD map update happens
+ # asynchronously (it's an Objecter::_maybe_request_map) as a result
+ # of seeing the new epoch barrier.
+ self.wait_until_equal(
+ lambda: self.mount_b.get_osd_epoch(),
+ (new_epoch, new_epoch),
+ 30,
+ lambda x: x[0] > new_epoch or x[1] > new_epoch)
+
+ # ...and none of this should have affected the oblivious mount a,
+ # because it wasn't doing any data or metadata IO
+ mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
+ self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
+
+ def _data_pool_name(self):
+ data_pool_names = self.fs.get_data_pool_names()
+ if len(data_pool_names) > 1:
+ raise RuntimeError("This test can't handle multiple data pools")
+ else:
+ return data_pool_names[0]
+
+ def _test_full(self, easy_case):
+ """
+ - That a client trying to write data to a file is prevented
+          from doing so with an ENOSPC result
+ - That they are also prevented from creating new files by the MDS.
+ - That they may delete another file to get the system healthy again
+
+ :param easy_case: if true, delete a successfully written file to
+ free up space. else, delete the file that experienced
+ the failed write.
+ """
+
+ osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd'))
+
+ log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))
+
+ # Fill up the cluster. This dd may or may not fail, as it depends on
+ # how soon the cluster recognises its own fullness
+ self.mount_a.write_n_mb("large_file_a", self.fill_mb / 2)
+ try:
+ self.mount_a.write_n_mb("large_file_b", self.fill_mb / 2)
+ except CommandFailedError:
+ log.info("Writing file B failed (full status happened already)")
+ assert self.is_full()
+ else:
+ log.info("Writing file B succeeded (full status will happen soon)")
+ self.wait_until_true(lambda: self.is_full(),
+ timeout=osd_mon_report_interval_max * 5)
+
+ # Attempting to write more data should give me ENOSPC
+ with self.assertRaises(CommandFailedError) as ar:
+ self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb / 2)
+ self.assertEqual(ar.exception.exitstatus, 1) # dd returns 1 on "No space"
+
+ # Wait for the MDS to see the latest OSD map so that it will reliably
+ # be applying the policy of rejecting non-deletion metadata operations
+ # while in the full state.
+ osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
+ self.wait_until_true(
+ lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch,
+ timeout=10)
+
+ if not self.data_only:
+ with self.assertRaises(CommandFailedError):
+ self.mount_a.write_n_mb("small_file_1", 0)
+
+ # Clear out some space
+ if easy_case:
+ self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
+ self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
+ else:
+ # In the hard case it is the file that filled the system.
+ # Before the new #7317 (ENOSPC, epoch barrier) changes, this
+ # would fail because the last objects written would be
+ # stuck in the client cache as objecter operations.
+ self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
+ self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
+
+ # Here we are waiting for two things to happen:
+ # * The MDS to purge the stray folder and execute object deletions
+ # * The OSDs to inform the mon that they are no longer full
+ self.wait_until_true(lambda: not self.is_full(),
+ timeout=osd_mon_report_interval_max * 5)
+
+ # Wait for the MDS to see the latest OSD map so that it will reliably
+ # be applying the free space policy
+ osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
+ self.wait_until_true(
+ lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch,
+ timeout=10)
+
+ # Now I should be able to write again
+ self.mount_a.write_n_mb("large_file", 50, seek=0)
+
+ # Ensure that the MDS keeps its OSD epoch barrier across a restart
+
+ def test_full_different_file(self):
+ self._test_full(True)
+
+ def test_full_same_file(self):
+ self._test_full(False)
+
+ def _remote_write_test(self, template):
+ """
+ Run some remote python in a way that's useful for
+ testing free space behaviour (see test_* methods using this)
+ """
+ file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")
+
+ # Enough to trip the full flag
+ osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd'))
+ mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))
+
+ # Sufficient data to cause RADOS cluster to go 'full'
+ log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))
+
+ # Long enough for RADOS cluster to notice it is full and set flag on mons
+ # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
+ # factor of 1.5 for I/O + network latency in committing OSD map and distributing it
+ # to the OSDs)
+ full_wait = (osd_mon_report_interval_max + mon_tick_interval) * 1.5
+
+ # Configs for this test should bring this setting down in order to
+ # run reasonably quickly
+ if osd_mon_report_interval_max > 10:
+ log.warn("This test may run rather slowly unless you decrease"
+ "osd_mon_report_interval_max (5 is a good setting)!")
+
+ self.mount_a.run_python(template.format(
+ fill_mb=self.fill_mb,
+ file_path=file_path,
+ full_wait=full_wait
+ ))
+
+ def test_full_fclose(self):
+ # A remote script which opens a file handle, fills up the filesystem, and then
+ # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
+ remote_script = dedent("""
+ import time
+ import datetime
+ import subprocess
+ import os
+
+ # Write some buffered data through before going full, all should be well
+ print "writing some data through which we expect to succeed"
+ bytes = 0
+ f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+ bytes += os.write(f, 'a' * 4096)
+ os.fsync(f)
+ print "fsync'ed data successfully, will now attempt to fill fs"
+
+ # Okay, now we're going to fill up the filesystem, and then keep
+ # writing until we see an error from fsync. As long as we're doing
+ # buffered IO, the error should always only appear from fsync and not
+ # from write
+ full = False
+
+ for n in range(0, {fill_mb}):
+ bytes += os.write(f, 'x' * 1024 * 1024)
+ print "wrote bytes via buffered write, may repeat"
+ print "done writing bytes"
+
+ # OK, now we should sneak in under the full condition
+ # due to the time it takes the OSDs to report to the
+ # mons, and get a successful fsync on our full-making data
+ os.fsync(f)
+ print "successfully fsync'ed prior to getting full state reported"
+
+ # Now wait for the full flag to get set so that our
+ # next flush IO will fail
+ time.sleep(30)
+
+ # A buffered IO, should succeed
+ print "starting buffered write we expect to succeed"
+ os.write(f, 'x' * 4096)
+ print "wrote, now waiting 30s and then doing a close we expect to fail"
+
+ # Wait long enough for a background flush that should fail
+ time.sleep(30)
+
+ # ...and check that the failed background flush is reflected in fclose
+ try:
+ os.close(f)
+ except OSError:
+ print "close() returned an error as expected"
+ else:
+ raise RuntimeError("close() failed to raise error")
+
+ os.unlink("{file_path}")
+ """)
+ self._remote_write_test(remote_script)
+
+ def test_full_fsync(self):
+ """
+        That when the full flag is encountered during asynchronous
+        flushes, an fwrite() succeeds but a subsequent fsync()/fclose()
+        returns the ENOSPC error.
+ """
+
+ # A remote script which opens a file handle, fills up the filesystem, and then
+ # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
+ remote_script = dedent("""
+ import time
+ import datetime
+ import subprocess
+ import os
+
+ # Write some buffered data through before going full, all should be well
+ print "writing some data through which we expect to succeed"
+ bytes = 0
+ f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+ bytes += os.write(f, 'a' * 4096)
+ os.fsync(f)
+ print "fsync'ed data successfully, will now attempt to fill fs"
+
+ # Okay, now we're going to fill up the filesystem, and then keep
+ # writing until we see an error from fsync. As long as we're doing
+ # buffered IO, the error should always only appear from fsync and not
+ # from write
+ full = False
+
+ for n in range(0, {fill_mb} + 1):
+ try:
+ bytes += os.write(f, 'x' * 1024 * 1024)
+ print "wrote bytes via buffered write, moving on to fsync"
+ except OSError as e:
+ print "Unexpected error %s from write() instead of fsync()" % e
+ raise
+
+ try:
+ os.fsync(f)
+ print "fsync'ed successfully"
+ except OSError as e:
+ print "Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0))
+ full = True
+ break
+ else:
+ print "Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0))
+
+ if n > {fill_mb} * 0.8:
+ # Be cautious in the last region where we expect to hit
+ # the full condition, so that we don't overshoot too dramatically
+ print "sleeping a bit as we've exceeded 80% of our expected full ratio"
+ time.sleep({full_wait})
+
+ if not full:
+ raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)
+
+ # The error sticks to the inode until we dispose of it
+ try:
+ os.close(f)
+ except OSError:
+ print "Saw error from close() as expected"
+ else:
+ raise RuntimeError("Did not see expected error from close()")
+
+ os.unlink("{file_path}")
+ """)
+
+ self._remote_write_test(remote_script)
+
+
+class TestQuotaFull(FullnessTestCase):
+ """
+ Test per-pool fullness, which indicates quota limits exceeded
+ """
+ pool_capacity = 1024 * 1024 * 32 # arbitrary low-ish limit
+ fill_mb = pool_capacity / (1024 * 1024)
+
+ # We are only testing quota handling on the data pool, not the metadata
+ # pool.
+ data_only = True
+
+ def setUp(self):
+ super(TestQuotaFull, self).setUp()
+
+ pool_name = self.fs.get_data_pool_name()
+ self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
+ "max_bytes", "{0}".format(self.pool_capacity))
+
+ def is_full(self):
+ return self.fs.is_pool_full(self.fs.get_data_pool_name())
+
+
+class TestClusterFull(FullnessTestCase):
+ """
+ Test cluster-wide fullness, which indicates that an OSD has become too full
+ """
+ pool_capacity = None
+ REQUIRE_MEMSTORE = True
+
+ def setUp(self):
+ super(TestClusterFull, self).setUp()
+
+ if self.pool_capacity is None:
+ # This is a hack to overcome weird fluctuations in the reported
+ # `max_avail` attribute of pools that sometimes occurs in between
+ # tests (reason as yet unclear, but this dodges the issue)
+ TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
+ mon_osd_full_ratio = float(self.fs.get_config("mon_osd_full_ratio"))
+ TestClusterFull.fill_mb = int(1.05 * mon_osd_full_ratio * (self.pool_capacity / (1024.0 * 1024.0)))
+
+ def is_full(self):
+ return self.fs.is_full()
+
+# Hide the parent class so that unittest.loader doesn't try to run it.
+del globals()['FullnessTestCase']
--- /dev/null
+
+from StringIO import StringIO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.workunit import task as workunit
+
+JOURNAL_FORMAT_LEGACY = 0
+JOURNAL_FORMAT_RESILIENT = 1
+
+
+class TestJournalMigration(CephFSTestCase):
+ CLIENTS_REQUIRED = 1
+
+ def test_journal_migration(self):
+ old_journal_version = JOURNAL_FORMAT_LEGACY
+ new_journal_version = JOURNAL_FORMAT_RESILIENT
+
+ self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
+
+ # Create a filesystem using the older journal format.
+ self.mount_a.umount_wait()
+ self.fs.mds_stop()
+ self.fs.recreate()
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+
+ # Do some client work so that the log is populated with something.
+ with self.mount_a.mounted():
+ self.mount_a.create_files()
+ self.mount_a.check_files() # sanity, this should always pass
+
+        # Run a more substantial workunit so that the log to be converted
+        # spans at least a few segments
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
+ },
+ "timeout": "3h"
+ })
+
+ # Modify the ceph.conf to ask the MDS to use the new journal format.
+ self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)
+
+ # Restart the MDS.
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ # This ensures that all daemons come up into a valid state
+ self.fs.wait_for_daemons()
+
+ # Check that files created in the initial client workload are still visible
+ # in a client mount.
+ with self.mount_a.mounted():
+ self.mount_a.check_files()
+
+ # Verify that the journal really has been rewritten.
+ journal_version = self.fs.get_journal_version()
+ if journal_version != new_journal_version:
+ raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
+                new_journal_version, journal_version
+ ))
+
+ # Verify that cephfs-journal-tool can now read the rewritten journal
+ inspect_out = self.fs.journal_tool(["journal", "inspect"])
+ if not inspect_out.endswith(": OK"):
+ raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
+ inspect_out
+ ))
+
+ self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"])
+ p = self.fs.tool_remote.run(
+ args=[
+ "python",
+ "-c",
+ "import json; print len(json.load(open('/tmp/journal.json')))"
+ ],
+ stdout=StringIO())
+ event_count = int(p.stdout.getvalue().strip())
+ if event_count < 1000:
+ # Approximate value of "lots", expected from having run fsstress
+ raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))
+
+ # Do some client work so that the log is populated with something.
+ with self.mount_a.mounted():
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
+ },
+ "timeout": "3h"
+ })
--- /dev/null
+
+"""
+Test our tools for recovering the content of damaged journals
+"""
+
+import json
+import logging
+from textwrap import dedent
+import time
+
+from teuthology.exceptions import CommandFailedError, ConnectionLostError
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+from tasks.workunit import task as workunit
+
+log = logging.getLogger(__name__)
+
+
+class TestJournalRepair(CephFSTestCase):
+ MDSS_REQUIRED = 2
+
+ def test_inject_to_empty(self):
+ """
+        That when some dentries are in the journal but nothing is in
+ the backing store, we correctly populate the backing store
+ from the journalled dentries.
+ """
+
+ # Inject metadata operations
+ self.mount_a.run_shell(["touch", "rootfile"])
+ self.mount_a.run_shell(["mkdir", "subdir"])
+ self.mount_a.run_shell(["touch", "subdir/subdirfile"])
+ # There are several different paths for handling hardlinks, depending
+ # on whether an existing dentry (being overwritten) is also a hardlink
+ self.mount_a.run_shell(["mkdir", "linkdir"])
+
+ # Test inode -> remote transition for a dentry
+ self.mount_a.run_shell(["touch", "linkdir/link0"])
+ self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
+ self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])
+
+ # Test nothing -> remote transition
+ self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])
+
+ # Test remote -> inode transition
+ self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
+ self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
+ self.mount_a.run_shell(["touch", "linkdir/link2"])
+
+ # Test remote -> diff remote transition
+ self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
+ self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
+ self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])
+
+ # Test an empty directory
+ self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
+ self.mount_a.run_shell(["sync"])
+
+ # Before we unmount, make a note of the inode numbers, later we will
+ # check that they match what we recover from the journal
+ rootfile_ino = self.mount_a.path_to_ino("rootfile")
+ subdir_ino = self.mount_a.path_to_ino("subdir")
+ linkdir_ino = self.mount_a.path_to_ino("linkdir")
+ subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
+ subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")
+
+ self.mount_a.umount_wait()
+
+ # Stop the MDS
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # Now, the journal should contain the operations, but the backing
+ # store shouldn't
+ with self.assertRaises(ObjectNotFound):
+ self.fs.list_dirfrag(subdir_ino)
+ self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+ # Execute the dentry recovery, this should populate the backing store
+ self.fs.journal_tool(['event', 'recover_dentries', 'list'])
+
+ # Dentries in ROOT_INO are present
+ self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
+ self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
+ self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
+ sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))
+
+ # Now check the MDS can read what we wrote: truncate the journal
+ # and start the mds.
+ self.fs.journal_tool(['journal', 'reset'])
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ # List files
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # First ls -R to populate MDCache, such that hardlinks will
+ # resolve properly (recover_dentries does not create backtraces,
+ # so ordinarily hardlinks to inodes that happen not to have backtraces
+ # will be invisible in readdir).
+ # FIXME: hook in forward scrub here to regenerate backtraces
+ proc = self.mount_a.run_shell(['ls', '-R'])
+ self.mount_a.umount_wait() # remount to clear client cache before our second ls
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ proc = self.mount_a.run_shell(['ls', '-R'])
+ self.assertEqual(proc.stdout.getvalue().strip(),
+ dedent("""
+ .:
+ linkdir
+ rootfile
+ subdir
+
+ ./linkdir:
+ link0
+ link1
+ link2
+ link3
+
+ ./subdir:
+ subdirfile
+ subsubdir
+
+ ./subdir/subsubdir:
+ """).strip())
+
+ # Check the correct inos were preserved by path
+ self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
+ self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
+ self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
+ self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))
+
+ # Check that the hard link handling came out correctly
+ self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
+ self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
+ self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
+ self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)
+
+ # Create a new file, ensure it is not issued the same ino as one of the
+ # recovered ones
+ self.mount_a.run_shell(["touch", "afterwards"])
+ new_ino = self.mount_a.path_to_ino("afterwards")
+ self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])
+
+ # Check that we can do metadata ops in the recovered directory
+ self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])
+
+ @for_teuthology # 308s
+ def test_reset(self):
+ """
+ That after forcibly modifying the backing store, we can get back into
+ a good state by resetting the MDSMap.
+
+ The scenario is that we have two active MDSs, and we lose the journals. Once
+ we have completely lost confidence in the integrity of the metadata, we want to
+ return the system to a single-MDS state to go into a scrub to recover what we
+ can.
+ """
+
+ # Set max_mds to 2
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "allow_multimds",
+ "true", "--yes-i-really-mean-it")
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
+
+ # See that we have two active MDSs
+ self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+ reject_fn=lambda v: v > 2 or v < 1)
+ active_mds_names = self.fs.get_active_names()
+
+ # Switch off any unneeded MDS daemons
+ for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
+ self.mds_cluster.mds_stop(unneeded_mds)
+ self.mds_cluster.mds_fail(unneeded_mds)
+
+ # Do a bunch of I/O such that at least some will hit the second MDS: create
+ # lots of directories so that the balancer should find it easy to make a decision
+ # to allocate some of them to the second mds.
+ spammers = []
+ for n in range(0, 16):
+ dir_name = "spam_{0}".format(n)
+ spammers.append(self.mount_a.spam_dir_background(dir_name))
+
+ def subtrees_assigned():
+ got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])
+ rank_1_count = len([s for s in got_subtrees if s['auth_first'] == 1])
+
+ # Greater than 1, because there is typically 1 for ~mds1, and once it
+ # has been assigned something in addition to that it means it has been
+ # assigned a "real" subtree.
+ return rank_1_count > 1
+
+ # We are waiting for the MDS to respond to hot directories, which
+ # is not guaranteed to happen at a particular time, so a lengthy timeout here.
+ self.wait_until_true(subtrees_assigned, 600)
+
+ # Flush the journals so that we have some backing store data
+ # belonging to one MDS, and some to the other MDS.
+ for mds_name in active_mds_names:
+ self.fs.mds_asok(["flush", "journal"], mds_name)
+
+ # Stop (hard) the second MDS daemon
+ self.fs.mds_stop(active_mds_names[1])
+
+ # Wipe out the tables for MDS rank 1 so that it is broken and can't start
+ # (this is the simulated failure that we will demonstrate that the disaster
+ # recovery tools can get us back from)
+ self.fs.erase_metadata_objects(prefix="mds1_")
+
+ # Try to access files from the client
+ blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)
+
+ # Check that this "ls -R" blocked rather than completing: indicates
+ # it got stuck trying to access subtrees which were on the now-dead MDS.
+ log.info("Sleeping to check ls is blocked...")
+ time.sleep(60)
+ self.assertFalse(blocked_ls.finished)
+
+ # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
+ # is not coming back. Kill it.
+ log.info("Killing mount, it's blocked on the MDS we killed")
+ self.mount_a.kill()
+ self.mount_a.kill_cleanup()
+ try:
+ # Now that the mount is dead, the ls -R should error out.
+ blocked_ls.wait()
+ except (CommandFailedError, ConnectionLostError):
+ # The ConnectionLostError case is for kernel client, where
+ # killing the mount also means killing the node.
+ pass
+
+ log.info("Terminating spammer processes...")
+ for spammer_proc in spammers:
+ spammer_proc.stdin.close()
+ try:
+ spammer_proc.wait()
+ except (CommandFailedError, ConnectionLostError):
+ # The ConnectionLostError case is for kernel client, where
+ # killing the mount also means killing the node.
+ pass
+
+ # See that the second MDS will crash when it starts and tries to
+ # acquire rank 1
+ damaged_id = active_mds_names[1]
+ self.fs.mds_restart(damaged_id)
+
+        # The daemon taking the damaged rank should begin starting, then
+        # respawn back into standby after asking the mon to mark the rank
+        # damaged.
+ def is_marked_damaged():
+ mds_map = self.fs.get_mds_map()
+ return 1 in mds_map['damaged']
+
+ self.wait_until_true(is_marked_damaged, 60)
+
+ def get_state():
+ info = self.mds_cluster.get_mds_info(damaged_id)
+ return info['state'] if info is not None else None
+
+ self.wait_until_equal(
+ get_state,
+ "up:standby",
+ timeout=60)
+
+ self.fs.mds_stop(damaged_id)
+ self.fs.mds_fail(damaged_id)
+
+ # Now give up and go through a disaster recovery procedure
+ self.fs.mds_stop(active_mds_names[0])
+ self.fs.mds_fail(active_mds_names[0])
+ # Invoke recover_dentries quietly, because otherwise log spews millions of lines
+ self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
+ self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
+ self.fs.table_tool(["0", "reset", "session"])
+ self.fs.journal_tool(["journal", "reset"], rank=0)
+ self.fs.erase_mds_objects(1)
+ self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+ '--yes-i-really-mean-it')
+
+ # Bring an MDS back online, mount a client, and see that we can walk the full
+ # filesystem tree again
+ self.fs.mds_fail_restart(active_mds_names[0])
+ self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
+ reject_fn=lambda v: len(v) > 1)
+ self.mount_a.mount()
+ self.mount_a.run_shell(["ls", "-R"], wait=True)
+
+ def test_table_tool(self):
+ active_mdss = self.fs.get_active_names()
+ self.assertEqual(len(active_mdss), 1)
+ mds_name = active_mdss[0]
+
+ self.mount_a.run_shell(["touch", "foo"])
+ self.fs.mds_asok(["flush", "journal"], mds_name)
+
+ log.info(self.fs.table_tool(["all", "show", "inode"]))
+ log.info(self.fs.table_tool(["all", "show", "snap"]))
+ log.info(self.fs.table_tool(["all", "show", "session"]))
+
+ # Inode table should always be the same because initial state
+ # and choice of inode are deterministic.
+ # Should see one inode consumed
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "inode"])),
+ {"0": {
+ "data": {
+ "version": 2,
+ "inotable": {
+ "projected_free": [
+ {"start": 1099511628777,
+ "len": 1099511626775}],
+ "free": [
+ {"start": 1099511628777,
+ "len": 1099511626775}]}},
+ "result": 0}}
+
+ )
+
+ # Should see one session
+ session_data = json.loads(self.fs.table_tool(
+ ["all", "show", "session"]))
+ self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
+ self.assertEqual(session_data["0"]["result"], 0)
+
+ # Should see no snaps
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "snap"])),
+ {"version": 0,
+ "snapserver": {"last_snap": 1,
+ "pending_noop": [],
+ "snaps": [],
+ "need_to_purge": {},
+ "pending_update": [],
+ "pending_destroy": []},
+ "result": 0}
+ )
+
+ # Reset everything
+ for table in ["session", "inode", "snap"]:
+ self.fs.table_tool(["all", "reset", table])
+
+ log.info(self.fs.table_tool(["all", "show", "inode"]))
+ log.info(self.fs.table_tool(["all", "show", "snap"]))
+ log.info(self.fs.table_tool(["all", "show", "session"]))
+
+ # Should see 0 sessions
+ session_data = json.loads(self.fs.table_tool(
+ ["all", "show", "session"]))
+ self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
+ self.assertEqual(session_data["0"]["result"], 0)
+
+ # Should see entire inode range now marked free
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "inode"])),
+ {"0": {"data": {"version": 1,
+ "inotable": {"projected_free": [
+ {"start": 1099511627776,
+ "len": 1099511627776}],
+ "free": [
+ {"start": 1099511627776,
+ "len": 1099511627776}]}},
+ "result": 0}}
+ )
+
+ # Should see no snaps
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "snap"])),
+ {"version": 1,
+ "snapserver": {"last_snap": 1,
+ "pending_noop": [],
+ "snaps": [],
+ "need_to_purge": {},
+ "pending_update": [],
+ "pending_destroy": []},
+ "result": 0}
+ )
+
+ def test_table_tool_take_inos(self):
+ initial_range_start = 1099511627776
+ initial_range_len = 1099511627776
+ # Initially a completely clear range
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "inode"])),
+ {"0": {"data": {"version": 0,
+ "inotable": {"projected_free": [
+ {"start": initial_range_start,
+ "len": initial_range_len}],
+ "free": [
+ {"start": initial_range_start,
+ "len": initial_range_len}]}},
+ "result": 0}}
+ )
+
+ # Remove some
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
+ {"0": {"data": {"version": 1,
+ "inotable": {"projected_free": [
+ {"start": initial_range_start + 101,
+ "len": initial_range_len - 101}],
+ "free": [
+ {"start": initial_range_start + 101,
+ "len": initial_range_len - 101}]}},
+ "result": 0}}
+ )
+
+ @for_teuthology # Hack: "for_teuthology" because .sh doesn't work outside teuth
+ def test_journal_smoke(self):
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): [
+ "fs/misc/trivial_sync.sh"],
+ },
+ "timeout": "1h"
+ })
+
+ for mount in self.mounts:
+ mount.umount_wait()
+
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # journal tool smoke
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): [
+ "suites/cephfs_journal_tool_smoke.sh"],
+ },
+ "timeout": "1h"
+ })
+
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+
+ self.mount_a.mount()
+
+        # trivial sync workunit on mount a
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): [
+ "fs/misc/trivial_sync.sh"],
+ },
+ "timeout": "1h"
+ })
+
--- /dev/null
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import json
+import logging
+
+log = logging.getLogger(__name__)
+failure = "using old balancer; mantle failed for balancer="
+success = "mantle balancer version changed: "
+
+class TestMantle(CephFSTestCase):
+ def start_mantle(self):
+ self.wait_for_health_clear(timeout=30)
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "allow_multimds",
+ "true", "--yes-i-really-mean-it")
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
+ self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+ reject_fn=lambda v: v > 2 or v < 1)
+
+ for m in self.fs.get_active_names():
+ self.fs.mds_asok(['config', 'set', 'debug_objecter', '20'], mds_id=m)
+ self.fs.mds_asok(['config', 'set', 'debug_ms', '0'], mds_id=m)
+ self.fs.mds_asok(['config', 'set', 'debug_mds', '0'], mds_id=m)
+ self.fs.mds_asok(['config', 'set', 'debug_mds_balancer', '5'], mds_id=m)
+
+ def push_balancer(self, obj, lua_code, expect):
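+        # Point the filesystem at the named balancer object, upload the Lua
+        # code via rados, then check that the expected failure message shows
+        # up in the cluster log.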
+ self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', obj)
+ self.fs.rados(["put", obj, "-"], stdin_data=lua_code)
+ with self.assert_cluster_log(failure + obj + " " + expect):
+ log.info("run a " + obj + " balancer that expects=" + expect)
+
+ def test_version_empty(self):
+ self.start_mantle()
+ expect = " : (2) No such file or directory"
+
+ ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer')
+ assert(ret == 22) # EINVAL
+
+ self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ")
+ with self.assert_cluster_log(failure + " " + expect): pass
+
+ def test_version_not_in_rados(self):
+ self.start_mantle()
+ expect = failure + "ghost.lua : (2) No such file or directory"
+ self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua")
+ with self.assert_cluster_log(expect): pass
+
+ def test_balancer_invalid(self):
+ self.start_mantle()
+ expect = ": (22) Invalid argument"
+
+ lua_code = "this is invalid lua code!"
+ self.push_balancer("invalid.lua", lua_code, expect)
+
+ lua_code = "BAL_LOG()"
+ self.push_balancer("invalid_log.lua", lua_code, expect)
+
+ lua_code = "BAL_LOG(0)"
+ self.push_balancer("invalid_log_again.lua", lua_code, expect)
+
+ def test_balancer_valid(self):
+ self.start_mantle()
+ lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}"
+ self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
+ self.fs.rados(["put", "valid.lua", "-"], stdin_data=lua_code)
+ with self.assert_cluster_log(success + "valid.lua"):
+ log.info("run a valid.lua balancer")
+
+ def test_return_invalid(self):
+ self.start_mantle()
+ expect = ": (22) Invalid argument"
+
+ lua_code = "return \"hello\""
+ self.push_balancer("string.lua", lua_code, expect)
+
+ lua_code = "return 3"
+ self.push_balancer("number.lua", lua_code, expect)
+
+ lua_code = "return {}"
+ self.push_balancer("dict_empty.lua", lua_code, expect)
+
+ lua_code = "return {\"this\", \"is\", \"a\", \"test\"}"
+ self.push_balancer("dict_of_strings.lua", lua_code, expect)
+
+ lua_code = "return {3, \"test\"}"
+ self.push_balancer("dict_of_mixed.lua", lua_code, expect)
+
+ lua_code = "return {3}"
+ self.push_balancer("not_enough_numbers.lua", lua_code, expect)
+
+ lua_code = "return {3, 4, 5, 6, 7, 8, 9}"
+ self.push_balancer("too_many_numbers.lua", lua_code, expect)
+
+ def test_dead_osd(self):
+ self.start_mantle()
+ expect = " : (110) Connection timed out"
+
+ # kill the OSDs so that the balancer pull from RADOS times out
+ osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
+ for i in range(0, len(osd_map['osds'])):
+ self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', str(i))
+ self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', str(i))
+
+ # trigger a pull from RADOS
+ self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
+
+ # make the timeout a little longer since dead OSDs spam ceph -w
+ with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30):
+ log.info("run a balancer that should timeout")
+
+ # cleanup
+ for i in range(0, len(osd_map['osds'])):
+ self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', str(i))
--- /dev/null
+
+from unittest import SkipTest
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+class TestMisc(CephFSTestCase):
+ CLIENTS_REQUIRED = 2
+ def test_getattr_caps(self):
+ """
+        Check that the MDS recognizes the 'mask' parameter of the open request.
+        The parameter allows a client to request caps when opening a file.
+ """
+
+ if not isinstance(self.mount_a, FuseMount):
+ raise SkipTest("Require FUSE client")
+
+        # Enable debug. The client will request CEPH_CAP_XATTR_SHARED
+ # on lookup/open
+ self.mount_b.umount_wait()
+ self.set_conf('client', 'client debug getattr caps', 'true')
+ self.mount_b.mount()
+ self.mount_b.wait_until_mounted()
+
+ # create a file and hold it open. MDS will issue CEPH_CAP_EXCL_*
+ # to mount_a
+ p = self.mount_a.open_background("testfile")
+ self.mount_b.wait_for_visible("testfile")
+
+        # this triggers a lookup request and an open request. The debug
+ # code will check if lookup/open reply contains xattrs
+ self.mount_b.run_shell(["cat", "testfile"])
+
+ self.mount_a.kill_background(p)
--- /dev/null
+from textwrap import dedent
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import os
+
+
+class TestPoolPerm(CephFSTestCase):
+ def test_pool_perm(self):
+ self.mount_a.run_shell(["touch", "test_file"])
+
+ file_path = os.path.join(self.mount_a.mountpoint, "test_file")
+
+ remote_script = dedent("""
+ import os
+ import errno
+
+ fd = os.open("{path}", os.O_RDWR)
+ try:
+ if {check_read}:
+ ret = os.read(fd, 1024)
+ else:
+ os.write(fd, 'content')
+ except OSError, e:
+ if e.errno != errno.EPERM:
+ raise
+ else:
+ raise RuntimeError("client does not check permission of data pool")
+ """)
+
+ client_name = "client.{0}".format(self.mount_a.client_id)
+
+ # set data pool read only
+ self.fs.mon_manager.raw_cluster_cmd_result(
+ 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
+ 'allow r pool={0}'.format(self.fs.get_data_pool_name()))
+
+ self.mount_a.umount_wait()
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # write should fail
+ self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(False)))
+
+ # set data pool write only
+ self.fs.mon_manager.raw_cluster_cmd_result(
+ 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
+ 'allow w pool={0}'.format(self.fs.get_data_pool_name()))
+
+ self.mount_a.umount_wait()
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # read should fail
+ self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(True)))
+
+ def test_forbidden_modification(self):
+ """
+ That a client who does not have the capability for setting
+ layout pools is prevented from doing so.
+ """
+
+ # Set up
+ client_name = "client.{0}".format(self.mount_a.client_id)
+ new_pool_name = "data_new"
+ self.fs.add_data_pool(new_pool_name)
+
+ self.mount_a.run_shell(["touch", "layoutfile"])
+ self.mount_a.run_shell(["mkdir", "layoutdir"])
+
+ # Set MDS 'rw' perms: missing 'p' means no setting pool layouts
+ self.fs.mon_manager.raw_cluster_cmd_result(
+ 'auth', 'caps', client_name, 'mds', 'allow rw', 'mon', 'allow r',
+ 'osd',
+ 'allow rw pool={0},allow rw pool={1}'.format(
+ self.fs.get_data_pool_names()[0],
+ self.fs.get_data_pool_names()[1],
+ ))
+
+ self.mount_a.umount_wait()
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ with self.assertRaises(CommandFailedError):
+ self.mount_a.run_shell(["setfattr",
+ "-n", "ceph.file.layout.pool",
+ "-v", new_pool_name, "layoutfile"])
+ with self.assertRaises(CommandFailedError):
+ self.mount_a.run_shell(["setfattr",
+ "-n", "ceph.dir.layout.pool",
+ "-v", new_pool_name, "layoutdir"])
+ self.mount_a.umount_wait()
+
+ # Set MDS 'rwp' perms: should now be able to set layouts
+ self.fs.mon_manager.raw_cluster_cmd_result(
+ 'auth', 'caps', client_name, 'mds', 'allow rwp', 'mon', 'allow r',
+ 'osd',
+ 'allow rw pool={0},allow rw pool={1}'.format(
+ self.fs.get_data_pool_names()[0],
+ self.fs.get_data_pool_names()[1],
+ ))
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ self.mount_a.run_shell(["setfattr",
+ "-n", "ceph.file.layout.pool",
+ "-v", new_pool_name, "layoutfile"])
+ self.mount_a.run_shell(["setfattr",
+ "-n", "ceph.dir.layout.pool",
+ "-v", new_pool_name, "layoutdir"])
+ self.mount_a.umount_wait()
+
+ def tearDown(self):
+ self.fs.mon_manager.raw_cluster_cmd_result(
+ 'auth', 'caps', "client.{0}".format(self.mount_a.client_id),
+ 'mds', 'allow', 'mon', 'allow r', 'osd',
+ 'allow rw pool={0}'.format(self.fs.get_data_pool_names()[0]))
+ super(TestPoolPerm, self).tearDown()
+
--- /dev/null
+import logging
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestReadahead(CephFSTestCase):
+ def test_flush(self):
+ if not isinstance(self.mount_a, FuseMount):
+ self.skipTest("FUSE needed for measuring op counts")
+
+ # Create 32MB file
+ self.mount_a.run_shell(["dd", "if=/dev/urandom", "of=foo", "bs=1M", "count=32"])
+
+ # Unmount and remount the client to flush cache
+ self.mount_a.umount_wait()
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ initial_op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r']
+ self.mount_a.run_shell(["dd", "if=foo", "of=/dev/null", "bs=128k", "count=32"])
+ op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r']
+ assert op_r >= initial_op_r
+ op_r -= initial_op_r
+ log.info("read operations: {0}".format(op_r))
+
+ # with exponentially increasing readahead, we should see fewer than 10 operations
+ # but this test simply checks if the client is doing a remote read for each local read
+ if op_r >= 32:
+ raise RuntimeError("readahead not working")
--- /dev/null
+"""
+MDS admin socket scrubbing-related tests.
+"""
+import json
+import logging
+import errno
+import time
+from teuthology.exceptions import CommandFailedError
+import os
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestScrubChecks(CephFSTestCase):
+ """
+ Run flush and scrub commands on the specified files in the filesystem. This
+ task will run through a sequence of operations, but it is not comprehensive
+ on its own -- it doesn't manipulate the mds cache state to test on both
+ in- and out-of-memory parts of the hierarchy. So it's designed to be run
+ multiple times within a single test run, so that the test can manipulate
+ memory state.
+
+ Usage:
+ mds_scrub_checks:
+ mds_rank: 0
+ path: path/to/test/dir
+ client: 0
+ run_seq: [0-9]+
+
+ Increment the run_seq on subsequent invocations within a single test run;
+ it uses that value to generate unique folder and file names.
+ """
+
+ MDSS_REQUIRED = 1
+ CLIENTS_REQUIRED = 1
+
+ def test_scrub_checks(self):
+ self._checks(0)
+ self._checks(1)
+
+ def _checks(self, run_seq):
+ mds_rank = 0
+ test_dir = "scrub_test_path"
+
+ abs_test_path = "/{0}".format(test_dir)
+
+ log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
+ client_path = os.path.join(self.mount_a.mountpoint, test_dir)
+ log.info("client_path: {0}".format(client_path))
+
+ log.info("Cloning repo into place")
+ repo_path = self.clone_repo(self.mount_a, client_path)
+
+ log.info("Initiating mds_scrub_checks on mds.{id_}, " +
+ "test_path {path}, run_seq {seq}".format(
+ id_=mds_rank, path=abs_test_path, seq=run_seq)
+ )
+
+ success_validator = lambda j, r: self.json_validator(j, r, "return_code", 0)
+
+ nep = "{test_path}/i/dont/exist".format(test_path=abs_test_path)
+ self.asok_command(mds_rank, "flush_path {nep}".format(nep=nep),
+ lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+ self.asok_command(mds_rank, "scrub_path {nep}".format(nep=nep),
+ lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+
+ test_repo_path = "{test_path}/ceph-qa-suite".format(test_path=abs_test_path)
+ dirpath = "{repo_path}/suites".format(repo_path=test_repo_path)
+
+ if run_seq == 0:
+ log.info("First run: flushing {dirpath}".format(dirpath=dirpath))
+ command = "flush_path {dirpath}".format(dirpath=dirpath)
+ self.asok_command(mds_rank, command, success_validator)
+ command = "scrub_path {dirpath}".format(dirpath=dirpath)
+ self.asok_command(mds_rank, command, success_validator)
+
+ filepath = "{repo_path}/suites/fs/verify/validater/valgrind.yaml".format(
+ repo_path=test_repo_path)
+ if run_seq == 0:
+ log.info("First run: flushing {filepath}".format(filepath=filepath))
+ command = "flush_path {filepath}".format(filepath=filepath)
+ self.asok_command(mds_rank, command, success_validator)
+ command = "scrub_path {filepath}".format(filepath=filepath)
+ self.asok_command(mds_rank, command, success_validator)
+
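+ # this yaml is presumably a symlink in the cloned ceph-qa-suite repo, so the
+ # scrub is expected to report performed_validation: false for it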
+ filepath = "{repo_path}/suites/fs/basic/clusters/fixed-3-cephfs.yaml". \
+ format(repo_path=test_repo_path)
+ command = "scrub_path {filepath}".format(filepath=filepath)
+ self.asok_command(mds_rank, command,
+ lambda j, r: self.json_validator(j, r, "performed_validation",
+ False))
+
+ if run_seq == 0:
+ log.info("First run: flushing base dir /")
+ command = "flush_path /"
+ self.asok_command(mds_rank, command, success_validator)
+ command = "scrub_path /"
+ self.asok_command(mds_rank, command, success_validator)
+
+ new_dir = "{repo_path}/new_dir_{i}".format(repo_path=repo_path, i=run_seq)
+ test_new_dir = "{repo_path}/new_dir_{i}".format(repo_path=test_repo_path,
+ i=run_seq)
+ self.mount_a.run_shell(["mkdir", new_dir])
+ command = "flush_path {dir}".format(dir=test_new_dir)
+ self.asok_command(mds_rank, command, success_validator)
+
+ new_file = "{repo_path}/new_file_{i}".format(repo_path=repo_path,
+ i=run_seq)
+ test_new_file = "{repo_path}/new_file_{i}".format(repo_path=test_repo_path,
+ i=run_seq)
+ self.mount_a.write_n_mb(new_file, 1)
+
+ command = "flush_path {file}".format(file=test_new_file)
+ self.asok_command(mds_rank, command, success_validator)
+
+ # check that scrub fails on errors
+ ino = self.mount_a.path_to_ino(new_file)
+ rados_obj_name = "{ino:x}.00000000".format(ino=ino)
+ command = "scrub_path {file}".format(file=test_new_file)
+
+ # Missing parent xattr -> ENODATA
+ self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name())
+ self.asok_command(mds_rank, command,
+ lambda j, r: self.json_validator(j, r, "return_code", -errno.ENODATA))
+
+ # Missing object -> ENOENT
+ self.fs.rados(["rm", rados_obj_name], pool=self.fs.get_data_pool_name())
+ self.asok_command(mds_rank, command,
+ lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+
+ command = "flush_path /"
+ self.asok_command(mds_rank, command, success_validator)
+
+ def test_scrub_repair(self):
+ mds_rank = 0
+ test_dir = "scrub_repair_path"
+
+ self.mount_a.run_shell(["sudo", "mkdir", test_dir])
+ self.mount_a.run_shell(["sudo", "touch", "{0}/file".format(test_dir)])
+ dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino(test_dir))
+
+ self.mount_a.umount_wait()
+
+ # flush journal entries to dirfrag objects, and expire journal
+ self.fs.mds_asok(['flush', 'journal'])
+ self.fs.mds_stop()
+
+ # remove the dentry from dirfrag, cause incorrect fragstat/rstat
+ self.fs.rados(["rmomapkey", dir_objname, "file_head"],
+ pool=self.fs.get_metadata_pool_name())
+
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # fragstat indicates the directory is not empty, rmdir should fail
+ with self.assertRaises(CommandFailedError) as ar:
+ self.mount_a.run_shell(["sudo", "rmdir", test_dir])
+ self.assertEqual(ar.exception.exitstatus, 1)
+
+ self.asok_command(mds_rank, "scrub_path /{0} repair".format(test_dir),
+ lambda j, r: self.json_validator(j, r, "return_code", 0))
+
+ # wait a few seconds for the background repair to complete
+ time.sleep(10)
+
+ # fragstat should be fixed
+ self.mount_a.run_shell(["sudo", "rmdir", test_dir])
+
+ @staticmethod
+ def json_validator(json_out, rc, element, expected_value):
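+ """
+ Check that an asok command exited successfully (rc == 0) and that the
+ given element of its JSON output has the expected value.
+ """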
+ if rc != 0:
+ return False, "asok command returned error {rc}".format(rc=rc)
+ element_value = json_out.get(element)
+ if element_value != expected_value:
+ return False, "unexpectedly got {jv} instead of {ev}!".format(
+ jv=element_value, ev=expected_value)
+ return True, "Succeeded"
+
+ def asok_command(self, mds_rank, command, validator):
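+ """
+ Run an admin socket command on the active MDS holding the given rank,
+ parse its JSON output (if any) and check it with the validator callable.
+ """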
+ log.info("Running command '{command}'".format(command=command))
+
+ command_list = command.split()
+
+ # we just assume there's an active mds for every rank
+ mds_id = self.fs.get_active_names()[mds_rank]
+ proc = self.fs.mon_manager.admin_socket('mds', mds_id,
+ command_list, check_status=False)
+ rout = proc.exitstatus
+ sout = proc.stdout.getvalue()
+
+ if sout.strip():
+ jout = json.loads(sout)
+ else:
+ jout = None
+
+ log.info("command '{command}' got response code " +
+ "'{rout}' and stdout '{sout}'".format(
+ command=command, rout=rout, sout=sout))
+
+ success, errstring = validator(jout, rout)
+
+ if not success:
+ raise AsokCommandFailedError(command, rout, jout, errstring)
+
+ return jout
+
+ def clone_repo(self, client_mount, path):
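+ """
+ Clone the ceph-qa-suite repository under the given path on the client
+ mount, unless a checkout is already present there.
+ """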
+ repo = "ceph-qa-suite"
+ repo_path = os.path.join(path, repo)
+ client_mount.run_shell(["mkdir", "-p", path])
+
+ try:
+ client_mount.stat(repo_path)
+ except CommandFailedError:
+ client_mount.run_shell([
+ "git", "clone", '--branch', 'giant',
+ "http://github.com/ceph/{repo}".format(repo=repo),
+ "{path}/{repo}".format(path=path, repo=repo)
+ ])
+
+ return repo_path
+
+
+class AsokCommandFailedError(Exception):
+ """
+ Exception thrown when we get an unexpected response
+ on an admin socket command
+ """
+
+ def __init__(self, command, rc, json_out, errstring):
+ self.command = command
+ self.rc = rc
+ self.json = json_out
+ self.errstring = errstring
+
+ def __str__(self):
+ return "Admin socket: {command} failed with rc={rc}," + \
+ "json output={json}, because '{es}'".format(
+ command=self.command, rc=self.rc,
+ json=self.json, es=self.errstring)
--- /dev/null
+from StringIO import StringIO
+import json
+import logging
+from unittest import SkipTest
+
+from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestSessionMap(CephFSTestCase):
+ CLIENTS_REQUIRED = 2
+ MDSS_REQUIRED = 2
+
+ def test_tell_session_drop(self):
+ """
+ That when a `tell` command is sent using the python CLI,
+ its MDS session is gone after it terminates
+ """
+ self.mount_a.umount_wait()
+ self.mount_b.umount_wait()
+
+ mds_id = self.fs.get_lone_mds_id()
+ self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "session", "ls")
+
+ ls_data = self.fs.mds_asok(['session', 'ls'])
+ self.assertEqual(len(ls_data), 0)
+
+ def _get_thread_count(self, mds_id):
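+ """
+ Return the thread count (nlwp) of the ceph-mds process for the given
+ MDS id, as reported by ps on its remote host.
+ """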
+ remote = self.fs.mds_daemons[mds_id].remote
+
+ ps_txt = remote.run(
+ args=["ps", "-ww", "axo", "nlwp,cmd"],
+ stdout=StringIO()
+ ).stdout.getvalue().strip()
+ lines = ps_txt.split("\n")[1:]
+
+ for line in lines:
+ if "ceph-mds" in line and not "daemon-helper" in line:
+ if line.find("-i {0}".format(mds_id)) != -1:
+ log.info("Found ps line for daemon: {0}".format(line))
+ return int(line.split()[0])
+
+ raise RuntimeError("No process found in ps output for MDS {0}: {1}".format(
+ mds_id, ps_txt
+ ))
+
+ def test_tell_conn_close(self):
+ """
+ That when a `tell` command is sent using the python CLI,
+ the thread count goes back to where it started (i.e. we aren't
+ leaving connections open)
+ """
+ self.mount_a.umount_wait()
+ self.mount_b.umount_wait()
+
+ mds_id = self.fs.get_lone_mds_id()
+
+ initial_thread_count = self._get_thread_count(mds_id)
+ self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "session", "ls")
+ final_thread_count = self._get_thread_count(mds_id)
+
+ self.assertEqual(initial_thread_count, final_thread_count)
+
+ def test_mount_conn_close(self):
+ """
+ That when a client unmounts, the thread count on the MDS goes back
+ to what it was before the client mounted
+ """
+ self.mount_a.umount_wait()
+ self.mount_b.umount_wait()
+
+ mds_id = self.fs.get_lone_mds_id()
+
+ initial_thread_count = self._get_thread_count(mds_id)
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ self.assertGreater(self._get_thread_count(mds_id), initial_thread_count)
+ self.mount_a.umount_wait()
+ final_thread_count = self._get_thread_count(mds_id)
+
+ self.assertEqual(initial_thread_count, final_thread_count)
+
+ def test_version_splitting(self):
+ """
+ That when many sessions are updated, they are correctly
+ split into multiple versions to obey mds_sessionmap_keys_per_op
+ """
+
+ # Start umounted
+ self.mount_a.umount_wait()
+ self.mount_b.umount_wait()
+
+ # Configure MDS to write one OMAP key at once
+ self.set_conf('mds', 'mds_sessionmap_keys_per_op', 1)
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ # I would like two MDSs, so that I can do an export dir later
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "allow_multimds",
+ "true", "--yes-i-really-mean-it")
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
+ self.fs.wait_for_daemons()
+
+ active_mds_names = self.fs.get_active_names()
+ rank_0_id = active_mds_names[0]
+ rank_1_id = active_mds_names[1]
+ log.info("Ranks 0 and 1 are {0} and {1}".format(
+ rank_0_id, rank_1_id))
+
+ # Bring the clients back
+ self.mount_a.mount()
+ self.mount_b.mount()
+ self.mount_a.create_files() # Kick the client into opening sessions
+ self.mount_b.create_files()
+
+ # See that they've got sessions
+ self.assert_session_count(2, mds_id=rank_0_id)
+
+ # See that we persist their sessions
+ self.fs.mds_asok(["flush", "journal"], rank_0_id)
+ table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
+ log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
+ self.assertEqual(table_json['0']['result'], 0)
+ self.assertEqual(len(table_json['0']['data']['Sessions']), 2)
+
+ # Now, induce a "force_open_sessions" event by exporting a dir
+ self.mount_a.run_shell(["mkdir", "bravo"])
+ self.mount_a.run_shell(["touch", "bravo/file"])
+ self.mount_b.run_shell(["ls", "-l", "bravo/file"])
+
+ def get_omap_wrs():
+ return self.fs.mds_asok(['perf', 'dump', 'objecter'], rank_1_id)['objecter']['omap_wr']
+
+ # Flush so that there are no dirty sessions on rank 1
+ self.fs.mds_asok(["flush", "journal"], rank_1_id)
+
+ # Export so that we get a force_open to rank 1 for the two sessions from rank 0
+ initial_omap_wrs = get_omap_wrs()
+ self.fs.mds_asok(['export', 'dir', '/bravo', '1'], rank_0_id)
+
+ # This is the critical (if rather subtle) check: that in the process of doing an export dir,
+ # we hit force_open_sessions, and as a result we end up writing out the sessionmap. There
+ # will be two sessions dirtied here, and because we have set keys_per_op to 1, we should see
+ # a single session get written out (the first of the two, triggered by the second getting marked
+ # dirty)
+ # The number of writes is two per session, because the header (sessionmap version) update and
+ # KV write both count.
+ self.wait_until_true(
+ lambda: get_omap_wrs() - initial_omap_wrs == 2,
+ timeout=10 # Long enough for an export to get acked
+ )
+
+ # Now end our sessions and check the backing sessionmap is updated correctly
+ self.mount_a.umount_wait()
+ self.mount_b.umount_wait()
+
+ # In-memory sessionmap check
+ self.assert_session_count(0, mds_id=rank_0_id)
+
+ # On-disk sessionmap check
+ self.fs.mds_asok(["flush", "journal"], rank_0_id)
+ table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
+ log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
+ self.assertEqual(table_json['0']['result'], 0)
+ self.assertEqual(len(table_json['0']['data']['Sessions']), 0)
+
+ def _sudo_write_file(self, remote, path, data):
+ """
+ Write data to a remote file as super user
+
+ :param remote: Remote site.
+ :param path: Path on the remote being written to.
+ :param data: Data to be written.
+ """
+ remote.run(
+ args=[
+ 'sudo',
+ 'python',
+ '-c',
+ 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
+ path,
+ ],
+ stdin=data,
+ )
+
+ def _configure_auth(self, mount, id_name, mds_caps, osd_caps=None, mon_caps=None):
+ """
+ Set up auth credentials for a client mount, and write out the keyring
+ for the client to use.
+ """
+
+ if osd_caps is None:
+ osd_caps = "allow rw"
+
+ if mon_caps is None:
+ mon_caps = "allow r"
+
+ out = self.fs.mon_manager.raw_cluster_cmd(
+ "auth", "get-or-create", "client.{name}".format(name=id_name),
+ "mds", mds_caps,
+ "osd", osd_caps,
+ "mon", mon_caps
+ )
+ mount.client_id = id_name
+ self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), out)
+ self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
+
+ def test_session_reject(self):
+ if not isinstance(self.mount_a, FuseMount):
+ raise SkipTest("Requires FUSE client to inject client metadata")
+
+ self.mount_a.run_shell(["mkdir", "foo"])
+ self.mount_a.run_shell(["mkdir", "foo/bar"])
+ self.mount_a.umount_wait()
+
+ # Mount B will be my rejected client
+ self.mount_b.umount_wait()
+
+ # Configure a client that is limited to /foo/bar
+ self._configure_auth(self.mount_b, "badguy", "allow rw path=/foo/bar")
+ # Check it can mount that dir and do IO
+ self.mount_b.mount(mount_path="/foo/bar")
+ self.mount_b.wait_until_mounted()
+ self.mount_b.create_destroy()
+ self.mount_b.umount_wait()
+
+ # Configure the client to claim that its mount point metadata is /baz
+ self.set_conf("client.badguy", "client_metadata", "root=/baz")
+ # Try to mount the client, see that it fails
+ with self.assert_cluster_log("client session with invalid root '/baz' denied"):
+ with self.assertRaises(CommandFailedError):
+ self.mount_b.mount(mount_path="/foo/bar")
--- /dev/null
+import json
+import time
+import logging
+from textwrap import dedent
+import gevent
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+class TestStrays(CephFSTestCase):
+ MDSS_REQUIRED = 2
+
+ OPS_THROTTLE = 1
+ FILES_THROTTLE = 2
+
+ # Range of different file sizes used in throttle test's workload
+ throttle_workload_size_range = 16
+
+ @for_teuthology
+ def test_ops_throttle(self):
+ self._test_throttling(self.OPS_THROTTLE)
+
+ @for_teuthology
+ def test_files_throttle(self):
+ self._test_throttling(self.FILES_THROTTLE)
+
+ def test_dir_deletion(self):
+ """
+ That when deleting a bunch of dentries and the containing
+ directory, everything gets purged.
+ Catches cases where the client might e.g. fail to trim
+ the unlinked dir from its cache.
+ """
+ file_count = 1000
+ create_script = dedent("""
+ import os
+
+ mount_path = "{mount_path}"
+ subdir = "delete_me"
+ size = {size}
+ file_count = {file_count}
+ os.mkdir(os.path.join(mount_path, subdir))
+ for i in xrange(0, file_count):
+ filename = "{{0}}_{{1}}.bin".format(i, size)
+ f = open(os.path.join(mount_path, subdir, filename), 'w')
+ f.write(size * 'x')
+ f.close()
+ """.format(
+ mount_path=self.mount_a.mountpoint,
+ size=1024,
+ file_count=file_count
+ ))
+
+ self.mount_a.run_python(create_script)
+ self.mount_a.run_shell(["rm", "-rf", "delete_me"])
+ self.fs.mds_asok(["flush", "journal"])
+ strays = self.get_mdc_stat("strays_created")
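+ # expect one stray per unlinked file plus one for the deleted directory itself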
+ self.assertEqual(strays, file_count + 1)
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("strays_purged"),
+ strays,
+ timeout=600
+ )
+
+ def _test_throttling(self, throttle_type):
+ """
+ That the mds_max_purge_ops setting is respected
+ """
+
+ def set_throttles(files, ops):
+ """
+ Helper for updating ops/files limits, and calculating effective
+ ops_per_pg setting to give the same ops limit.
+ """
+ self.set_conf('mds', 'mds_max_purge_files', "%d" % files)
+ self.set_conf('mds', 'mds_max_purge_ops', "%d" % ops)
+
+ pgs = self.fs.mon_manager.get_pool_property(
+ self.fs.get_data_pool_name(),
+ "pg_num"
+ )
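+ # divide by the data pool's pg_num so that the per-PG setting implies the
+ # same effective ops limit as mds_max_purge_ops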
+ ops_per_pg = float(ops) / pgs
+ self.set_conf('mds', 'mds_max_purge_ops_per_pg', "%s" % ops_per_pg)
+
+ # Test conditions depend on what we're going to be exercising.
+ # * Lift the threshold on whatever throttle we are *not* testing, so
+ # that the throttle of interest is the one that will be the bottleneck
+ # * Create either many small files (test file count throttling) or fewer
+ # large files (test op throttling)
+ if throttle_type == self.OPS_THROTTLE:
+ set_throttles(files=100000000, ops=16)
+ size_unit = 1024 * 1024 # big files, generate lots of ops
+ file_multiplier = 100
+ elif throttle_type == self.FILES_THROTTLE:
+ # The default value of file limit is pretty permissive, so to avoid
+ # the test running too fast, create lots of files and set the limit
+ # pretty low.
+ set_throttles(ops=100000000, files=6)
+ size_unit = 1024 # small, numerous files
+ file_multiplier = 200
+ else:
+ raise NotImplementedError(throttle_type)
+
+ # Pick up config changes
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ create_script = dedent("""
+ import os
+
+ mount_path = "{mount_path}"
+ subdir = "delete_me"
+ size_unit = {size_unit}
+ file_multiplier = {file_multiplier}
+ os.mkdir(os.path.join(mount_path, subdir))
+ for i in xrange(0, file_multiplier):
+ for size in xrange(0, {size_range}*size_unit, size_unit):
+ filename = "{{0}}_{{1}}.bin".format(i, size / size_unit)
+ f = open(os.path.join(mount_path, subdir, filename), 'w')
+ f.write(size * 'x')
+ f.close()
+ """.format(
+ mount_path=self.mount_a.mountpoint,
+ size_unit=size_unit,
+ file_multiplier=file_multiplier,
+ size_range=self.throttle_workload_size_range
+ ))
+
+ self.mount_a.run_python(create_script)
+
+ # We will run the deletion in the background, to reduce the risk of it completing before
+ # we have started monitoring the stray statistics.
+ def background():
+ self.mount_a.run_shell(["rm", "-rf", "delete_me"])
+ self.fs.mds_asok(["flush", "journal"])
+
+ background_thread = gevent.spawn(background)
+
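+ # every file under delete_me plus the directory itself becomes a stray,
+ # hence the +1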
+ total_inodes = file_multiplier * self.throttle_workload_size_range + 1
+ mds_max_purge_ops = int(self.fs.get_config("mds_max_purge_ops", 'mds'))
+ mds_max_purge_files = int(self.fs.get_config("mds_max_purge_files", 'mds'))
+
+ # During this phase we look for the concurrent ops to exceed half
+ # the limit (a heuristic) and not exceed the limit (a correctness
+ # condition).
+ purge_timeout = 600
+ elapsed = 0
+ files_high_water = 0
+ ops_high_water = 0
+ while True:
+ mdc_stats = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']
+ if elapsed >= purge_timeout:
+ raise RuntimeError("Timeout waiting for {0} inodes to purge, stats:{1}".format(total_inodes, mdc_stats))
+
+ num_strays = mdc_stats['num_strays']
+ num_strays_purging = mdc_stats['num_strays_purging']
+ num_purge_ops = mdc_stats['num_purge_ops']
+
+ files_high_water = max(files_high_water, num_strays_purging)
+ ops_high_water = max(ops_high_water, num_purge_ops)
+
+ total_strays_created = mdc_stats['strays_created']
+ total_strays_purged = mdc_stats['strays_purged']
+
+ if total_strays_purged == total_inodes:
+ log.info("Complete purge in {0} seconds".format(elapsed))
+ break
+ elif total_strays_purged > total_inodes:
+ raise RuntimeError("Saw more strays than expected, mdc stats: {0}".format(mdc_stats))
+ else:
+ if throttle_type == self.OPS_THROTTLE:
+ if num_purge_ops > mds_max_purge_ops:
+ raise RuntimeError("num_purge_ops violates threshold {0}/{1}".format(
+ num_purge_ops, mds_max_purge_ops
+ ))
+ elif throttle_type == self.FILES_THROTTLE:
+ if num_strays_purging > mds_max_purge_files:
+ raise RuntimeError("num_strays_purging violates threshold {0}/{1}".format(
+ num_strays_purging, mds_max_purge_files
+ ))
+ else:
+ raise NotImplementedError(throttle_type)
+
+ log.info("Waiting for purge to complete {0}/{1}, {2}/{3}".format(
+ num_strays_purging, num_strays,
+ total_strays_purged, total_strays_created
+ ))
+ time.sleep(1)
+ elapsed += 1
+
+ background_thread.join()
+
+ # Check that we got up to a respectable rate during the purge. This is totally
+ # racy, but should be safeish unless the cluster is pathologically slow, or
+ # insanely fast such that the deletions all pass before we have polled the
+ # statistics.
+ if throttle_type == self.OPS_THROTTLE:
+ if ops_high_water < mds_max_purge_ops / 2:
+ raise RuntimeError("Ops in flight high water is unexpectedly low ({0} / {1})".format(
+ ops_high_water, mds_max_purge_ops
+ ))
+ elif throttle_type == self.FILES_THROTTLE:
+ if files_high_water < mds_max_purge_files / 2:
+ raise RuntimeError("Files in flight high water is unexpectedly low ({0} / {1})".format(
+ files_high_water, mds_max_purge_files
+ ))
+
+ # Sanity check all MDC stray stats
+ mdc_stats = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']
+ self.assertEqual(mdc_stats['num_strays'], 0)
+ self.assertEqual(mdc_stats['num_strays_purging'], 0)
+ self.assertEqual(mdc_stats['num_strays_delayed'], 0)
+ self.assertEqual(mdc_stats['num_purge_ops'], 0)
+ self.assertEqual(mdc_stats['strays_created'], total_inodes)
+ self.assertEqual(mdc_stats['strays_purged'], total_inodes)
+
+ def get_mdc_stat(self, name, mds_id=None):
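+ """
+ Fetch a single mds_cache perf counter from the MDS admin socket.
+ """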
+ return self.fs.mds_asok(['perf', 'dump', "mds_cache", name],
+ mds_id=mds_id)['mds_cache'][name]
+
+ def test_open_inode(self):
+ """
+ That the case of a dentry unlinked while a client holds an
+ inode open is handled correctly.
+
+ The inode should be moved into a stray dentry, while the original
+ dentry and directory should be purged.
+
+ The inode's data should be purged when the client eventually closes
+ it.
+ """
+ mount_a_client_id = self.mount_a.get_global_id()
+
+ # Write some bytes to a file
+ size_mb = 8
+ self.mount_a.write_n_mb("open_file", size_mb)
+ open_file_ino = self.mount_a.path_to_ino("open_file")
+
+ # Hold the file open
+ p = self.mount_a.open_background("open_file")
+
+ self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
+
+ # Unlink the dentry
+ self.mount_a.run_shell(["rm", "-f", "open_file"])
+
+ # Wait to see the stray count increment
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("num_strays"),
+ expect_val=1, timeout=60, reject_fn=lambda x: x > 1)
+
+ # See that while the stray count has incremented, the purge count
+ # has not
+ self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+ self.assertEqual(self.get_mdc_stat("strays_purged"), 0)
+
+ # See that the client still holds 2 caps
+ self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
+
+ # See that the data objects remain in the data pool
+ self.assertTrue(self.fs.data_objects_present(open_file_ino, size_mb * 1024 * 1024))
+
+ # Now close the file
+ self.mount_a.kill_background(p)
+
+ # Wait to see the client cap count decrement
+ self.wait_until_equal(
+ lambda: self.get_session(mount_a_client_id)['num_caps'],
+ expect_val=1, timeout=60, reject_fn=lambda x: x > 2 or x < 1
+ )
+ # Wait to see the purge counter increment, stray count go to zero
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("strays_purged"),
+ expect_val=1, timeout=60, reject_fn=lambda x: x > 1
+ )
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("num_strays"),
+ expect_val=0, timeout=6, reject_fn=lambda x: x > 1
+ )
+
+ # See that the data objects no longer exist
+ self.assertTrue(self.fs.data_objects_absent(open_file_ino, size_mb * 1024 * 1024))
+
+ self.await_data_pool_empty()
+
+ def test_hardlink_reintegration(self):
+ """
+ That removal of primary dentry of hardlinked inode results
+ in reintegration of inode into the previously-remote dentry,
+ rather than lingering as a stray indefinitely.
+ """
+ # Write some bytes to file_a
+ size_mb = 8
+ self.mount_a.write_n_mb("file_a", size_mb)
+ ino = self.mount_a.path_to_ino("file_a")
+
+ # Create a hardlink named file_b
+ self.mount_a.run_shell(["ln", "file_a", "file_b"])
+ self.assertEqual(self.mount_a.path_to_ino("file_b"), ino)
+
+ # Flush journal
+ self.fs.mds_asok(['flush', 'journal'])
+
+ # See that backtrace for the file points to the file_a path
+ pre_unlink_bt = self.fs.read_backtrace(ino)
+ self.assertEqual(pre_unlink_bt['ancestors'][0]['dname'], "file_a")
+
+ # Unlink file_a
+ self.mount_a.run_shell(["rm", "-f", "file_a"])
+
+ # See that a stray was created
+ self.assertEqual(self.get_mdc_stat("num_strays"), 1)
+ self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+
+ # Wait, see that data objects are still present (i.e. that the
+ # stray did not advance to purging given time)
+ time.sleep(30)
+ self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024))
+ self.assertEqual(self.get_mdc_stat("strays_purged"), 0)
+
+ # See that before reintegration, the inode's backtrace points to a stray dir
+ self.fs.mds_asok(['flush', 'journal'])
+ self.assertTrue(self.get_backtrace_path(ino).startswith("stray"))
+
+ # Do a metadata operation on the remaining link (mv is heavy handed, but
+ # others like touch may be satisfied from caps without poking MDS)
+ self.mount_a.run_shell(["mv", "file_b", "file_c"])
+
+ # See the reintegration counter increment
+ # This should happen as a result of the eval_remote call on
+ # responding to a client request.
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("strays_reintegrated"),
+ expect_val=1, timeout=60, reject_fn=lambda x: x > 1
+ )
+
+ # Flush the journal
+ self.fs.mds_asok(['flush', 'journal'])
+
+ # See that the backtrace for the file points to the remaining link's path
+ post_reint_bt = self.fs.read_backtrace(ino)
+ self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c")
+
+ # See that the number of strays in existence is zero
+ self.assertEqual(self.get_mdc_stat("num_strays"), 0)
+
+ # Now really delete it
+ self.mount_a.run_shell(["rm", "-f", "file_c"])
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("strays_purged"),
+ expect_val=1, timeout=60, reject_fn=lambda x: x > 1
+ )
+ self.assert_purge_idle()
+ self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024))
+
+ # We caused the inode to go stray twice
+ self.assertEqual(self.get_mdc_stat("strays_created"), 2)
+ # One time we reintegrated it
+ self.assertEqual(self.get_mdc_stat("strays_reintegrated"), 1)
+ # Then the second time we purged it
+ self.assertEqual(self.get_mdc_stat("strays_purged"), 1)
+
+ def test_mv_hardlink_cleanup(self):
+ """
+ That when doing a rename from A to B, and B has hardlinks,
+ then we make a stray for B which is then reintegrated
+ into one of its hardlinks.
+ """
+ # Create file_a, file_b, and a hardlink to file_b
+ size_mb = 8
+ self.mount_a.write_n_mb("file_a", size_mb)
+ file_a_ino = self.mount_a.path_to_ino("file_a")
+
+ self.mount_a.write_n_mb("file_b", size_mb)
+ file_b_ino = self.mount_a.path_to_ino("file_b")
+
+ self.mount_a.run_shell(["ln", "file_b", "linkto_b"])
+ self.assertEqual(self.mount_a.path_to_ino("linkto_b"), file_b_ino)
+
+ # mv file_a file_b
+ self.mount_a.run_shell(["mv", "file_a", "file_b"])
+
+ self.fs.mds_asok(['flush', 'journal'])
+
+ # Initially, linkto_b will still be a remote inode pointing to a newly created
+ # stray from when file_b was unlinked due to the 'mv'. No data objects should
+ # have been deleted, as both files still have linkage.
+ self.assertEqual(self.get_mdc_stat("num_strays"), 1)
+ self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+ self.assertTrue(self.get_backtrace_path(file_b_ino).startswith("stray"))
+ self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
+ self.assertTrue(self.fs.data_objects_present(file_b_ino, size_mb * 1024 * 1024))
+
+ # Trigger reintegration and wait for it to happen
+ self.assertEqual(self.get_mdc_stat("strays_reintegrated"), 0)
+ self.mount_a.run_shell(["mv", "linkto_b", "file_c"])
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("strays_reintegrated"),
+ expect_val=1, timeout=60, reject_fn=lambda x: x > 1
+ )
+
+ self.fs.mds_asok(['flush', 'journal'])
+
+ post_reint_bt = self.fs.read_backtrace(file_b_ino)
+ self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c")
+ self.assertEqual(self.get_mdc_stat("num_strays"), 0)
+
+ def test_migration_on_shutdown(self):
+ """
+ That when an MDS rank is shut down, any not-yet-purging strays
+ are migrated to another MDS's stray dir.
+ """
+
+ # Set up two MDSs
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "allow_multimds",
+ "true", "--yes-i-really-mean-it")
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
+
+ # See that we have two active MDSs
+ self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+ reject_fn=lambda v: v > 2 or v < 1)
+
+ active_mds_names = self.fs.get_active_names()
+ rank_0_id = active_mds_names[0]
+ rank_1_id = active_mds_names[1]
+ log.info("Ranks 0 and 1 are {0} and {1}".format(
+ rank_0_id, rank_1_id))
+
+ # Get rid of other MDS daemons so that it's easier to know which
+ # daemons to expect in which ranks after restarts
+ for unneeded_mds in set(self.mds_cluster.mds_ids) - {rank_0_id, rank_1_id}:
+ self.mds_cluster.mds_stop(unneeded_mds)
+ self.mds_cluster.mds_fail(unneeded_mds)
+
+ # Set the purge file throttle to 0 on MDS rank 1
+ self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
+ self.fs.mds_fail_restart(rank_1_id)
+ self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+ reject_fn=lambda v: v > 2 or v < 1)
+
+ # Create a file
+ # Exporting an empty dir doesn't work, so we create the file before
+ # calling export dir in order to kick a dirfrag into existence
+ size_mb = 8
+ self.mount_a.run_shell(["mkdir", "ALPHA"])
+ self.mount_a.write_n_mb("ALPHA/alpha_file", size_mb)
+ ino = self.mount_a.path_to_ino("ALPHA/alpha_file")
+
+ result = self.fs.mds_asok(["export", "dir", "/ALPHA", "1"], rank_0_id)
+ self.assertEqual(result["return_code"], 0)
+
+ # Poll the MDS cache dump to watch for the export completing
+ migrated = False
+ migrate_timeout = 60
+ migrate_elapsed = 0
+ while not migrated:
+ data = self.fs.mds_asok(["dump", "cache"], rank_1_id)
+ for inode_data in data:
+ if inode_data['ino'] == ino:
+ log.debug("Found ino in cache: {0}".format(json.dumps(inode_data, indent=2)))
+ if inode_data['is_auth'] is True:
+ migrated = True
+ break
+
+ if not migrated:
+ if migrate_elapsed > migrate_timeout:
+ raise RuntimeError("Migration hasn't happened after {0}s!".format(migrate_elapsed))
+ else:
+ migrate_elapsed += 1
+ time.sleep(1)
+
+ # Delete the file on rank 1
+ self.mount_a.run_shell(["rm", "-f", "ALPHA/alpha_file"])
+
+ # See the stray counter increment, but the purge counter doesn't
+ # See that the file objects are still on disk
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("num_strays", rank_1_id),
+ expect_val=1, timeout=60, reject_fn=lambda x: x > 1)
+ self.assertEqual(self.get_mdc_stat("strays_created", rank_1_id), 1)
+ time.sleep(60) # period that we want to see if it gets purged
+ self.assertEqual(self.get_mdc_stat("strays_purged", rank_1_id), 0)
+ self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024))
+
+ # Shut down rank 1
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "1")
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'deactivate', "1")
+
+ # Wait until we get to a single active MDS mdsmap state
+ def is_stopped():
+ mds_map = self.fs.get_mds_map()
+ return 1 not in [i['rank'] for i in mds_map['info'].values()]
+
+ self.wait_until_true(is_stopped, timeout=120)
+
+ # See that the stray counter on rank 0 has incremented
+ self.assertEqual(self.get_mdc_stat("strays_created", rank_0_id), 1)
+
+ # Wait until the purge counter on rank 0 increments
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("strays_purged", rank_0_id),
+ 1, timeout=60, reject_fn=lambda x: x > 1)
+
+ # See that the file objects no longer exist
+ self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024))
+
+ self.await_data_pool_empty()
+
+ def assert_backtrace(self, ino, expected_path):
+ """
+ Assert that the backtrace in the data pool for an inode matches
+ an expected /foo/bar path.
+ """
+ expected_elements = expected_path.strip("/").split("/")
+ bt = self.fs.read_backtrace(ino)
+ actual_elements = list(reversed([dn['dname'] for dn in bt['ancestors']]))
+ self.assertListEqual(expected_elements, actual_elements)
+
+ def get_backtrace_path(self, ino):
+ bt = self.fs.read_backtrace(ino)
+ elements = reversed([dn['dname'] for dn in bt['ancestors']])
+ return "/".join(elements)
+
+ def assert_purge_idle(self):
+ """
+ Assert that the MDS perf counters indicate no strays exist and
+ no ongoing purge activity. Sanity check for when PurgeQueue should
+ be idle.
+ """
+ stats = self.fs.mds_asok(['perf', 'dump', "mds_cache"])['mds_cache']
+ self.assertEqual(stats["num_strays"], 0)
+ self.assertEqual(stats["num_strays_purging"], 0)
+ self.assertEqual(stats["num_strays_delayed"], 0)
+ self.assertEqual(stats["num_purge_ops"], 0)
+
+ def test_mv_cleanup(self):
+ """
+ That when doing a rename from A to B, and B has no hardlinks,
+ then we make a stray for B and purge it.
+ """
+ # Create file_a and file_b, write some to both
+ size_mb = 8
+ self.mount_a.write_n_mb("file_a", size_mb)
+ file_a_ino = self.mount_a.path_to_ino("file_a")
+ self.mount_a.write_n_mb("file_b", size_mb)
+ file_b_ino = self.mount_a.path_to_ino("file_b")
+
+ self.fs.mds_asok(['flush', 'journal'])
+ self.assert_backtrace(file_a_ino, "file_a")
+ self.assert_backtrace(file_b_ino, "file_b")
+
+ # mv file_a file_b
+ self.mount_a.run_shell(['mv', 'file_a', 'file_b'])
+
+ # See that stray counter increments
+ self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+ # Wait for purge counter to increment
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("strays_purged"),
+ expect_val=1, timeout=60, reject_fn=lambda x: x > 1
+ )
+ self.assert_purge_idle()
+
+ # file_b should have been purged
+ self.assertTrue(self.fs.data_objects_absent(file_b_ino, size_mb * 1024 * 1024))
+
+ # Backtrace should have updated from file_a to file_b
+ self.fs.mds_asok(['flush', 'journal'])
+ self.assert_backtrace(file_a_ino, "file_b")
+
+ # file_a's data should still exist
+ self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
+
+ def _pool_df(self, pool_name):
+ """
+ Return a dict like
+ {
+ "kb_used": 0,
+ "bytes_used": 0,
+ "max_avail": 19630292406,
+ "objects": 0
+ }
+
+ :param pool_name: Which pool (must exist)
+ """
+ out = self.fs.mon_manager.raw_cluster_cmd("df", "--format=json-pretty")
+ for p in json.loads(out)['pools']:
+ if p['name'] == pool_name:
+ return p['stats']
+
+ raise RuntimeError("Pool '{0}' not found".format(pool_name))
+
+ def await_data_pool_empty(self):
+ self.wait_until_true(
+ lambda: self._pool_df(
+ self.fs.get_data_pool_name()
+ )['objects'] == 0,
+ timeout=60)
+
+ def test_snapshot_remove(self):
+ """
+ That removal of a snapshot that references a now-unlinked file results
+ in purging on the stray for the file.
+ """
+ # Enable snapshots
+ self.fs.mon_manager.raw_cluster_cmd("mds", "set", "allow_new_snaps", "true",
+ "--yes-i-really-mean-it")
+
+ # Create a dir with a file in it
+ size_mb = 8
+ self.mount_a.run_shell(["mkdir", "snapdir"])
+ self.mount_a.run_shell(["mkdir", "snapdir/subdir"])
+ self.mount_a.write_test_pattern("snapdir/subdir/file_a", size_mb * 1024 * 1024)
+ file_a_ino = self.mount_a.path_to_ino("snapdir/subdir/file_a")
+
+ # Snapshot the dir
+ self.mount_a.run_shell(["mkdir", "snapdir/.snap/snap1"])
+
+ # Cause the head revision to deviate from the snapshot
+ self.mount_a.write_n_mb("snapdir/subdir/file_a", size_mb)
+
+ # Flush the journal so that backtraces and dirfrag objects will actually be written
+ self.fs.mds_asok(["flush", "journal"])
+
+ # Unlink the file
+ self.mount_a.run_shell(["rm", "-f", "snapdir/subdir/file_a"])
+ self.mount_a.run_shell(["rmdir", "snapdir/subdir"])
+
+ # Unmount the client because when I come back to check the data is still
+ # in the file I don't want to just see what's in the page cache.
+ self.mount_a.umount_wait()
+
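+ # two strays so far: the unlinked file_a and the removed subdir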
+ self.assertEqual(self.get_mdc_stat("strays_created"), 2)
+
+ # FIXME: at this stage we see a purge and the stray count drops to
+ # zero, but there's actually still a stray, so at the very
+ # least the StrayManager stats code is slightly off
+
+ self.mount_a.mount()
+
+ # See that the data from the snapshotted revision of the file is still present
+ # and correct
+ self.mount_a.validate_test_pattern("snapdir/.snap/snap1/subdir/file_a", size_mb * 1024 * 1024)
+
+ # Remove the snapshot
+ self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"])
+ self.mount_a.umount_wait()
+
+ # Purging file_a doesn't happen until after we've flushed the journal, because
+ # it is referenced by the snapshotted subdir, and the snapshot isn't really
+ # gone until the journal references to it are gone
+ self.fs.mds_asok(["flush", "journal"])
+
+ # See that a purge happens now
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("strays_purged"),
+ expect_val=2, timeout=60, reject_fn=lambda x: x > 2
+ )
+
+ self.assertTrue(self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024))
+ self.await_data_pool_empty()
+
+ def test_fancy_layout(self):
+ """
+ purge stray file with fancy layout
+ """
+
+ file_name = "fancy_layout_file"
+ self.mount_a.run_shell(["touch", file_name])
+
+ file_layout = "stripe_unit=1048576 stripe_count=4 object_size=8388608"
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.file.layout", "-v", file_layout, file_name])
+
+ # 35MB requires 7 objects
+ size_mb = 35
+ self.mount_a.write_n_mb(file_name, size_mb)
+
+ self.mount_a.run_shell(["rm", "-f", file_name])
+ self.fs.mds_asok(["flush", "journal"])
+
+ # can't use self.fs.data_objects_absent here, it does not support fancy layout
+ self.await_data_pool_empty()
+
+ def test_dirfrag_limit(self):
+ """
+ That the directory fragment size cannot exceed mds_bal_fragment_size_max (using a limit of 50 in all configurations).
+
+ That fragmentation (forced) will allow more entries to be created.
+
+ That unlinking fails when the stray directory fragment becomes too large and that unlinking may continue once those strays are purged.
+ """
+
+ self.fs.mon_manager.raw_cluster_cmd("mds", "set", "allow_dirfrags", "true", "--yes-i-really-mean-it")
+
+ LOW_LIMIT = 50
+ for mds in self.fs.get_daemon_names():
+ self.fs.mds_asok(["config", "set", "mds_bal_fragment_size_max", str(LOW_LIMIT)], mds)
+
+ try:
+ self.mount_a.run_python(dedent("""
+ import os
+ path = os.path.join("{path}", "subdir")
+ os.mkdir(path)
+ for n in range(0, {file_count}):
+ open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+ """.format(
+ path=self.mount_a.mountpoint,
+ file_count=LOW_LIMIT+1
+ )))
+ except CommandFailedError:
+ pass # ENOSPC
+ else:
+ raise RuntimeError("fragment size exceeded")
+
+ # Now test that we can go beyond the limit if we fragment the directory
+
+ self.mount_a.run_python(dedent("""
+ import os
+ path = os.path.join("{path}", "subdir2")
+ os.mkdir(path)
+ for n in range(0, {file_count}):
+ open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+ dfd = os.open(path, os.O_DIRECTORY)
+ os.fsync(dfd)
+ """.format(
+ path=self.mount_a.mountpoint,
+ file_count=LOW_LIMIT
+ )))
+
+ # Ensure that subdir2 is fragmented
+ mds_id = self.fs.get_active_names()[0]
+ self.fs.mds_asok(["dirfrag", "split", "/subdir2", "0/0", "1"], mds_id)
+
+ # remount+flush (release client caps)
+ self.mount_a.umount_wait()
+ self.fs.mds_asok(["flush", "journal"], mds_id)
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # Create 50% more files than the current fragment limit
+ self.mount_a.run_python(dedent("""
+ import os
+ path = os.path.join("{path}", "subdir2")
+ for n in range({file_count}, ({file_count}*3)//2):
+ open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+ """.format(
+ path=self.mount_a.mountpoint,
+ file_count=LOW_LIMIT
+ )))
+
+ # Now test the stray directory size is limited and recovers
+ strays_before = self.get_mdc_stat("strays_created")
+ try:
+ self.mount_a.run_python(dedent("""
+ import os
+ path = os.path.join("{path}", "subdir3")
+ os.mkdir(path)
+ for n in range({file_count}):
+ fpath = os.path.join(path, "%s" % n)
+ f = open(fpath, 'w')
+ f.write("%s" % n)
+ f.close()
+ os.unlink(fpath)
+ """.format(
+ path=self.mount_a.mountpoint,
+ file_count=LOW_LIMIT*10 # 10 stray directories, should collide before this count
+ )))
+ except CommandFailedError:
+ pass # ENOSPC
+ else:
+ raise RuntimeError("fragment size exceeded")
+
+ strays_after = self.get_mdc_stat("strays_created")
+ self.assertGreaterEqual(strays_after-strays_before, LOW_LIMIT)
+
+ self.wait_until_equal(
+ lambda: self.get_mdc_stat("strays_purged"),
+ strays_after,
+ timeout=600
+ )
+
+ self.mount_a.run_python(dedent("""
+ import os
+ path = os.path.join("{path}", "subdir4")
+ os.mkdir(path)
+ for n in range({file_count}):
+ fpath = os.path.join(path, "%s" % n)
+ f = open(fpath, 'w')
+ f.write("%s" % n)
+ f.close()
+ os.unlink(fpath)
+ """.format(
+ path=self.mount_a.mountpoint,
+ file_count=LOW_LIMIT
+ )))
--- /dev/null
+import json
+import logging
+import time
+import os
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+
+class TestVolumeClient(CephFSTestCase):
+ #
+ # TODO: Test that VolumeClient can recover from partial auth updates.
+ #
+
+ # One for looking at the global filesystem, one for being
+ # the VolumeClient, two for mounting the created shares
+ CLIENTS_REQUIRED = 4
+
+ def _volume_client_python(self, client, script, vol_prefix=None, ns_prefix=None):
+ # Can't dedent this *and* the script we pass in, because they might have different
+ # levels of indentation to begin with, so leave this string zero-indented
+ if vol_prefix:
+ vol_prefix = "\"" + vol_prefix + "\""
+ if ns_prefix:
+ ns_prefix = "\"" + ns_prefix + "\""
+ return client.run_python("""
+from ceph_volume_client import CephFSVolumeClient, VolumePath
+import logging
+log = logging.getLogger("ceph_volume_client")
+log.addHandler(logging.StreamHandler())
+log.setLevel(logging.DEBUG)
+vc = CephFSVolumeClient("manila", "{conf_path}", "ceph", {vol_prefix}, {ns_prefix})
+vc.connect()
+{payload}
+vc.disconnect()
+ """.format(payload=script, conf_path=client.config_path, vol_prefix=vol_prefix, ns_prefix=ns_prefix))
+
+ def _sudo_write_file(self, remote, path, data):
+ """
+ Write data to a remote file as super user
+
+ :param remote: Remote site.
+ :param path: Path on the remote being written to.
+ :param data: Data to be written.
+ """
+ remote.run(
+ args=[
+ 'sudo',
+ 'python',
+ '-c',
+ 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
+ path,
+ ],
+ stdin=data,
+ )
+
+ def _configure_vc_auth(self, mount, id_name):
+ """
+ Set up auth credentials for the VolumeClient user
+ """
+ out = self.fs.mon_manager.raw_cluster_cmd(
+ "auth", "get-or-create", "client.{name}".format(name=id_name),
+ "mds", "allow *",
+ "osd", "allow rw",
+ "mon", "allow *"
+ )
+ mount.client_id = id_name
+ self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), out)
+ self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
+
+ def _configure_guest_auth(self, volumeclient_mount, guest_mount,
+ guest_entity, mount_path,
+ namespace_prefix=None, readonly=False,
+ tenant_id=None):
+ """
+ Set up auth credentials for the guest client to mount a volume.
+
+ :param volumeclient_mount: mount used as the handle for driving
+ volumeclient.
+ :param guest_mount: mount used by the guest client.
+ :param guest_entity: auth ID used by the guest client.
+ :param mount_path: path of the volume.
+ :param namespace_prefix: name prefix of the RADOS namespace, which
+ is used for the volume's layout.
+ :param readonly: defaults to False. If set to 'True' only read-only
+ mount access is granted to the guest.
+ :param tenant_id: (OpenStack) tenant ID of the guest client.
+ """
+
+ head, volume_id = os.path.split(mount_path)
+ head, group_id = os.path.split(head)
+ head, volume_prefix = os.path.split(head)
+ volume_prefix = "/" + volume_prefix
+
+ # Authorize the guest client's auth ID to mount the volume.
+ key = self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ auth_result = vc.authorize(vp, "{guest_entity}", readonly={readonly},
+ tenant_id="{tenant_id}")
+ print auth_result['auth_key']
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ guest_entity=guest_entity,
+ readonly=readonly,
+ tenant_id=tenant_id)), volume_prefix, namespace_prefix
+ )
+
+ # CephFSVolumeClient's authorize() does not return the secret
+ # key to a caller who isn't multi-tenant aware. Explicitly
+ # query the key for such a client.
+ if not tenant_id:
+ key = self.fs.mon_manager.raw_cluster_cmd(
+ "auth", "get-key", "client.{name}".format(name=guest_entity),
+ )
+
+ # The guest auth ID should exist.
+ existing_ids = [a['entity'] for a in self.auth_list()]
+ self.assertIn("client.{0}".format(guest_entity), existing_ids)
+
+ # Create keyring file for the guest client.
+ keyring_txt = dedent("""
+ [client.{guest_entity}]
+ key = {key}
+
+ """.format(
+ guest_entity=guest_entity,
+ key=key
+ ))
+ guest_mount.client_id = guest_entity
+ self._sudo_write_file(guest_mount.client_remote,
+ guest_mount.get_keyring_path(),
+ keyring_txt)
+
+ # Add a guest client section to the ceph config file.
+ self.set_conf("client.{0}".format(guest_entity), "client quota", "True")
+ self.set_conf("client.{0}".format(guest_entity), "debug client", "20")
+ self.set_conf("client.{0}".format(guest_entity), "debug objecter", "20")
+ self.set_conf("client.{0}".format(guest_entity),
+ "keyring", guest_mount.get_keyring_path())
+
+ def test_default_prefix(self):
+ group_id = "grpid"
+ volume_id = "volid"
+ DEFAULT_VOL_PREFIX = "volumes"
+ DEFAULT_NS_PREFIX = "fsvolumens_"
+
+ self.mount_b.umount_wait()
+ self._configure_vc_auth(self.mount_b, "manila")
+
+ # create a volume with the default prefix
+ self._volume_client_python(self.mount_b, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.create_volume(vp, 10, data_isolated=True)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )))
+
+ # The dir should be created
+ self.mount_a.stat(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id))
+
+ # namespace should be set
+ ns_in_attr = self.mount_a.getfattr(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id), "ceph.dir.layout.pool_namespace")
+ namespace = "{0}{1}".format(DEFAULT_NS_PREFIX, volume_id)
+ self.assertEqual(namespace, ns_in_attr)
+
+
+ def test_lifecycle(self):
+ """
+ General smoke test for create, extend, destroy
+ """
+
+ # I'm going to use mount_c later as a guest for mounting the created
+ # shares
+ self.mounts[2].umount_wait()
+
+ # I'm going to leave mount_b unmounted and just use it as a handle for
+ # driving volumeclient. It's a little hacky but we don't have a more
+ # general concept for librados/libcephfs clients as opposed to full
+ # blown mounting clients.
+ self.mount_b.umount_wait()
+ self._configure_vc_auth(self.mount_b, "manila")
+
+ guest_entity = "guest"
+ group_id = "grpid"
+ volume_id = "volid"
+
+ volume_prefix = "/myprefix"
+ namespace_prefix = "mynsprefix_"
+
+ # Create a 100MB volume
+ volume_size = 100
+ mount_path = self._volume_client_python(self.mount_b, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ create_result = vc.create_volume(vp, 1024*1024*{volume_size})
+ print create_result['mount_path']
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ volume_size=volume_size
+ )), volume_prefix, namespace_prefix)
+
+ # The dir should be created
+ self.mount_a.stat(os.path.join("myprefix", group_id, volume_id))
+
+ # Authorize and configure credentials for the guest to mount the volume.
+ self._configure_guest_auth(self.mount_b, self.mounts[2], guest_entity,
+ mount_path, namespace_prefix)
+ self.mounts[2].mount(mount_path=mount_path)
+
+ # The kernel client doesn't have the quota-based df behaviour,
+ # or quotas at all, so only exercise the client behaviour when
+ # running fuse.
+ if isinstance(self.mounts[2], FuseMount):
+ # df should see volume size, same as the quota set on volume's dir
+ self.assertEqual(self.mounts[2].df()['total'],
+ volume_size * 1024 * 1024)
+ self.assertEqual(
+ self.mount_a.getfattr(
+ os.path.join(volume_prefix.strip("/"), group_id, volume_id),
+ "ceph.quota.max_bytes"),
+ "%s" % (volume_size * 1024 * 1024))
+
+ # df granularity is 4MB block so have to write at least that much
+ data_bin_mb = 4
+ self.mounts[2].write_n_mb("data.bin", data_bin_mb)
+
+ # Write something outside volume to check this space usage is
+ # not reported in the volume's DF.
+ other_bin_mb = 6
+ self.mount_a.write_n_mb("other.bin", other_bin_mb)
+
+ # global: df should see all the writes (data + other). This is a >
+ # rather than a == because the global space used includes all pools
+ self.assertGreater(self.mount_a.df()['used'],
+ (data_bin_mb + other_bin_mb) * 1024 * 1024)
+
+ # Hack: do a metadata IO to kick rstats
+ self.mounts[2].run_shell(["touch", "foo"])
+
+ # volume: df should see the data_bin_mb consumed from quota, same
+ # as the rbytes for the volume's dir
+ self.wait_until_equal(
+ lambda: self.mounts[2].df()['used'],
+ data_bin_mb * 1024 * 1024, timeout=60)
+ self.wait_until_equal(
+ lambda: self.mount_a.getfattr(
+ os.path.join(volume_prefix.strip("/"), group_id, volume_id),
+ "ceph.dir.rbytes"),
+ "%s" % (data_bin_mb * 1024 * 1024), timeout=60)
+
+ # sync so that file data is persisted to RADOS
+ self.mounts[2].run_shell(["sync"])
+
+ # Our data should stay in particular rados namespace
+ pool_name = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool")
+ namespace = "{0}{1}".format(namespace_prefix, volume_id)
+ ns_in_attr = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool_namespace")
+ self.assertEqual(namespace, ns_in_attr)
+
+ objects_in_ns = set(self.fs.rados(["ls"], pool=pool_name, namespace=namespace).split("\n"))
+ self.assertNotEqual(objects_in_ns, set())
+
+ # De-authorize the guest
+ self._volume_client_python(self.mount_b, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.deauthorize(vp, "{guest_entity}")
+ vc.evict("{guest_entity}")
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ guest_entity=guest_entity
+ )), volume_prefix, namespace_prefix)
+
+ # Once deauthorized, the client should be unable to do any more metadata ops.
+ # The way the client currently behaves here is to block (it acts as if it has
+ # lost its network connection, because there is nothing to tell it that its
+ # messages are being dropped now that its identity is gone)
+ background = self.mounts[2].write_n_mb("rogue.bin", 1, wait=False)
+ time.sleep(10) # Approximate check for 'stuck' as 'still running after 10s'
+ self.assertFalse(background.finished)
+
+ # After deauthorisation, the client ID should be gone (this was the only
+ # volume it was authorised for)
+ self.assertNotIn("client.{0}".format(guest_entity), [e['entity'] for e in self.auth_list()])
+
+ # Clean up the dead mount (ceph-fuse's behaviour here is a bit undefined)
+ self.mounts[2].kill()
+ self.mounts[2].kill_cleanup()
+ try:
+ background.wait()
+ except CommandFailedError:
+ # We killed the mount out from under it
+ pass
+
+ self._volume_client_python(self.mount_b, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.delete_volume(vp)
+ vc.purge_volume(vp)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )), volume_prefix, namespace_prefix)
+
+ def test_idempotency(self):
+ """
+ That the volumeclient interface works when calling everything twice
+ """
+ self.mount_b.umount_wait()
+ self._configure_vc_auth(self.mount_b, "manila")
+
+ guest_entity = "guest"
+ group_id = "grpid"
+ volume_id = "volid"
+ self._volume_client_python(self.mount_b, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.create_volume(vp, 10)
+ vc.create_volume(vp, 10)
+ vc.authorize(vp, "{guest_entity}")
+ vc.authorize(vp, "{guest_entity}")
+ vc.deauthorize(vp, "{guest_entity}")
+ vc.deauthorize(vp, "{guest_entity}")
+ vc.delete_volume(vp)
+ vc.delete_volume(vp)
+ vc.purge_volume(vp)
+ vc.purge_volume(vp)
+
+ vc.create_volume(vp, 10, data_isolated=True)
+ vc.create_volume(vp, 10, data_isolated=True)
+ vc.authorize(vp, "{guest_entity}")
+ vc.authorize(vp, "{guest_entity}")
+ vc.deauthorize(vp, "{guest_entity}")
+ vc.deauthorize(vp, "{guest_entity}")
+ vc.evict("{guest_entity}")
+ vc.evict("{guest_entity}")
+ vc.delete_volume(vp, data_isolated=True)
+ vc.delete_volume(vp, data_isolated=True)
+ vc.purge_volume(vp, data_isolated=True)
+ vc.purge_volume(vp, data_isolated=True)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ guest_entity=guest_entity
+ )))
+
+ def test_data_isolated(self):
+ """
+ That data isolated shares get their own pool
+ :return:
+ """
+
+ # Because the teuthology config template sets mon_pg_warn_max_per_osd to
+ # 10000 (i.e. it just tries to ignore health warnings), reset it to something
+ # sane before using volume_client, to avoid creating pools with absurdly large
+ # numbers of PGs.
+ self.set_conf("global", "mon pg warn max per osd", "300")
+ for mon_daemon_state in self.ctx.daemons.iter_daemons_of_role('mon'):
+ mon_daemon_state.restart()
+
+ self.mount_b.umount_wait()
+ self._configure_vc_auth(self.mount_b, "manila")
+
+ # Calculate how many PGs we'll expect the new volume pool to have
+ osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
+ max_per_osd = int(self.fs.get_config('mon_pg_warn_max_per_osd'))
+ osd_count = len(osd_map['osds'])
+ max_overall = osd_count * max_per_osd
+
+ existing_pg_count = 0
+ for p in osd_map['pools']:
+ existing_pg_count += p['pg_num']
+
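+ # the test expects volume_client to give the new pool a tenth of the remaining
+ # PG headroom; e.g. (hypothetical numbers) 3 OSDs x 300 max PGs per OSD = 900,
+ # minus 200 existing PGs, gives (900 - 200) / 10 = 70 PGs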
+ expected_pg_num = (max_overall - existing_pg_count) / 10
+ log.info("max_per_osd {0}".format(max_per_osd))
+ log.info("osd_count {0}".format(osd_count))
+ log.info("max_overall {0}".format(max_overall))
+ log.info("existing_pg_count {0}".format(existing_pg_count))
+ log.info("expected_pg_num {0}".format(expected_pg_num))
+
+ pools_a = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+
+ group_id = "grpid"
+ volume_id = "volid"
+ self._volume_client_python(self.mount_b, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.create_volume(vp, 10, data_isolated=True)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )))
+
+ pools_b = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+
+ # Should have created one new pool
+ new_pools = set(p['pool_name'] for p in pools_b) - set([p['pool_name'] for p in pools_a])
+ self.assertEqual(len(new_pools), 1)
+
+ # It should have followed the heuristic for PG count
+ # (this is an overly strict test condition, so we may want to remove
+ # it at some point as/when the logic gets fancier)
+ created_pg_num = self.fs.mon_manager.get_pool_property(list(new_pools)[0], "pg_num")
+ self.assertEqual(expected_pg_num, created_pg_num)
+
+ def test_15303(self):
+ """
+ Reproducer for #15303 "Client holds incorrect complete flag on dir
+ after losing caps" (http://tracker.ceph.com/issues/15303)
+ """
+ for m in self.mounts:
+ m.umount_wait()
+
+ # Create a dir on mount A
+ self.mount_a.mount()
+ self.mount_a.run_shell(["mkdir", "parent1"])
+ self.mount_a.run_shell(["mkdir", "parent2"])
+ self.mount_a.run_shell(["mkdir", "parent1/mydir"])
+
+ # Put some files in it from mount B
+ self.mount_b.mount()
+ self.mount_b.run_shell(["touch", "parent1/mydir/afile"])
+ self.mount_b.umount_wait()
+
+ # List the dir's contents on mount A
+ self.assertListEqual(self.mount_a.ls("parent1/mydir"),
+ ["afile"])
+
+ def test_evict_client(self):
+ """
+ That a volume client can be evicted based on its auth ID and the volume
+ path it has mounted.
+ """
+
+ if not isinstance(self.mount_a, FuseMount):
+ self.skipTest("Requires FUSE client to inject client metadata")
+
+ # mounts[1] would be used as a handle for driving VolumeClient. mounts[2]
+ # and mounts[3] would be used as guests to mount the volumes/shares.
+
+ for i in range(1, 4):
+ self.mounts[i].umount_wait()
+
+ volumeclient_mount = self.mounts[1]
+ self._configure_vc_auth(volumeclient_mount, "manila")
+ guest_mounts = (self.mounts[2], self.mounts[3])
+
+ guest_entity = "guest"
+ group_id = "grpid"
+ mount_paths = []
+ volume_ids = []
+
+ # Create two volumes. Authorize 'guest' auth ID to mount the two
+ # volumes. Mount the two volumes. Write data to the volumes.
+ for i in range(2):
+ # Create volume.
+ volume_ids.append("volid_{0}".format(str(i)))
+ mount_paths.append(
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ create_result = vc.create_volume(vp, 10 * 1024 * 1024)
+ print create_result['mount_path']
+ """.format(
+ group_id=group_id,
+ volume_id=volume_ids[i]
+ ))))
+
+ # Authorize 'guest' auth ID to mount the volume.
+ self._configure_guest_auth(volumeclient_mount, guest_mounts[i],
+ guest_entity, mount_paths[i])
+
+ # Mount the volume.
+ guest_mounts[i].mountpoint_dir_name = 'mnt.{id}.{suffix}'.format(
+ id=guest_entity, suffix=str(i))
+ guest_mounts[i].mount(mount_path=mount_paths[i])
+ guest_mounts[i].write_n_mb("data.bin", 1)
+
+ # Evict guest_mounts[0], which is using auth ID 'guest' and has mounted
+ # one volume.
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.deauthorize(vp, "{guest_entity}")
+ vc.evict("{guest_entity}", volume_path=vp)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_ids[0],
+ guest_entity=guest_entity
+ )))
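+ # Passing volume_path to evict() above restricts the eviction to sessions
+ # of this auth ID that have that particular volume mounted, so
+ # guest_mounts[1] (same auth ID, different volume) is expected to stay
+ # connected.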
+
+ # Evicted guest client, guest_mounts[0], should not be able to do
+ # any more metadata ops. It behaves as if it has lost its network
+ # connection.
+ background = guest_mounts[0].write_n_mb("rogue.bin", 1, wait=False)
+ # Approximate check for 'stuck' as 'still running after 10s'.
+ time.sleep(10)
+ self.assertFalse(background.finished)
+
+ # Guest client, guest_mounts[1], using the same auth ID 'guest', but
+ # has mounted the other volume, should be able to use its volume
+ # unaffected.
+ guest_mounts[1].write_n_mb("data.bin.1", 1)
+
+ # Cleanup.
+ for i in range(2):
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.deauthorize(vp, "{guest_entity}")
+ vc.delete_volume(vp)
+ vc.purge_volume(vp)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_ids[i],
+ guest_entity=guest_entity
+ )))
+
+ # We must hard-umount the one that we evicted
+ guest_mounts[0].umount_wait(force=True)
+
+ def test_purge(self):
+ """
+ Reproducer for #15266, exception trying to purge volumes that
+ contain non-ascii filenames.
+
+ Additionally test any other purge corner cases here.
+ """
+ # I'm going to leave mount_b unmounted and just use it as a handle for
+ # driving volumeclient. It's a little hacky but we don't have a more
+ # general concept for librados/libcephfs clients as opposed to full
+ # blown mounting clients.
+ self.mount_b.umount_wait()
+ self._configure_vc_auth(self.mount_b, "manila")
+
+ group_id = "grpid"
+ # Use a unicode volume ID (like Manila), to reproduce #15266
+ volume_id = u"volid"
+
+ # Create
+ mount_path = self._volume_client_python(self.mount_b, dedent("""
+ vp = VolumePath("{group_id}", u"{volume_id}")
+ create_result = vc.create_volume(vp, 10)
+ print create_result['mount_path']
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id
+ )))
+
+ # Strip leading "/"
+ mount_path = mount_path[1:]
+
+ # A file with non-ascii characters
+ self.mount_a.run_shell(["touch", os.path.join(mount_path, u"b\u00F6b")])
+
+ # A file with no permissions to do anything
+ self.mount_a.run_shell(["touch", os.path.join(mount_path, "noperms")])
+ self.mount_a.run_shell(["chmod", "0000", os.path.join(mount_path, "noperms")])
+
+ self._volume_client_python(self.mount_b, dedent("""
+ vp = VolumePath("{group_id}", u"{volume_id}")
+ vc.delete_volume(vp)
+ vc.purge_volume(vp)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id
+ )))
+
+ # Check it's really gone
+ self.assertEqual(self.mount_a.ls("volumes/_deleting"), [])
+ self.assertEqual(self.mount_a.ls("volumes/"), ["_deleting", group_id])
+
+ def test_readonly_authorization(self):
+ """
+ That guest clients can be restricted to read-only mounts of volumes.
+ """
+
+ volumeclient_mount = self.mounts[1]
+ guest_mount = self.mounts[2]
+ volumeclient_mount.umount_wait()
+ guest_mount.umount_wait()
+
+ # Configure volumeclient_mount as the handle for driving volumeclient.
+ self._configure_vc_auth(volumeclient_mount, "manila")
+
+ guest_entity = "guest"
+ group_id = "grpid"
+ volume_id = "volid"
+
+ # Create a volume.
+ mount_path = self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ create_result = vc.create_volume(vp, 1024*1024*10)
+ print create_result['mount_path']
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )))
+
+ # Authorize and configure credentials for the guest to mount the
+ # volume with read-write access.
+ self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity,
+ mount_path, readonly=False)
+
+ # Mount the volume, and write to it.
+ guest_mount.mount(mount_path=mount_path)
+ guest_mount.write_n_mb("data.bin", 1)
+
+ # Change the guest auth ID's authorization to read-only mount access.
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.deauthorize(vp, "{guest_entity}")
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ guest_entity=guest_entity
+ )))
+ self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity,
+ mount_path, readonly=True)
+
+ # The effect of the change in access level to read-only is not
+ # immediate. The guest sees the change only after a remount of
+ # the volume.
+ guest_mount.umount_wait()
+ guest_mount.mount(mount_path=mount_path)
+
+ # Read existing content of the volume.
+ self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"])
+ # Cannot write into read-only volume.
+ with self.assertRaises(CommandFailedError):
+ guest_mount.write_n_mb("rogue.bin", 1)
+
+ def test_get_authorized_ids(self):
+ """
+ That for a volume, the authorized IDs and their access levels
+ can be obtained using CephFSVolumeClient's get_authorized_ids().
+ """
+ volumeclient_mount = self.mounts[1]
+ volumeclient_mount.umount_wait()
+
+ # Configure volumeclient_mount as the handle for driving volumeclient.
+ self._configure_vc_auth(volumeclient_mount, "manila")
+
+ group_id = "grpid"
+ volume_id = "volid"
+ guest_entity_1 = "guest1"
+ guest_entity_2 = "guest2"
+
+ log.info("print group ID: {0}".format(group_id))
+
+ # Create a volume.
+ auths = self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.create_volume(vp, 1024*1024*10)
+ auths = vc.get_authorized_ids(vp)
+ print auths
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )))
+ # Check the list of authorized IDs for the volume.
+ expected_result = None
+ self.assertEqual(str(expected_result), auths)
+
+ # Allow two auth IDs access to the volume.
+ auths = self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.authorize(vp, "{guest_entity_1}", readonly=False)
+ vc.authorize(vp, "{guest_entity_2}", readonly=True)
+ auths = vc.get_authorized_ids(vp)
+ print auths
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ guest_entity_1=guest_entity_1,
+ guest_entity_2=guest_entity_2,
+ )))
+ # Check the list of authorized IDs and their access levels.
+ expected_result = [(u'guest1', u'rw'), (u'guest2', u'r')]
+ self.assertItemsEqual(str(expected_result), auths)
+
+ # Disallow both the auth IDs' access to the volume.
+ auths = self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.deauthorize(vp, "{guest_entity_1}")
+ vc.deauthorize(vp, "{guest_entity_2}")
+ auths = vc.get_authorized_ids(vp)
+ print auths
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ guest_entity_1=guest_entity_1,
+ guest_entity_2=guest_entity_2,
+ )))
+ # Check the list of authorized IDs for the volume.
+ expected_result = None
+ self.assertItemsEqual(str(expected_result), auths)
+
+ def test_multitenant_volumes(self):
+ """
+ That volume access can be restricted to a tenant.
+
+ That metadata used to enforce tenant isolation of
+ volumes is stored as a two-way mapping between auth
+ IDs and volumes that they're authorized to access.
+ """
+ volumeclient_mount = self.mounts[1]
+ volumeclient_mount.umount_wait()
+
+ # Configure volumeclient_mount as the handle for driving volumeclient.
+ self._configure_vc_auth(volumeclient_mount, "manila")
+
+ group_id = "groupid"
+ volume_id = "volumeid"
+
+ # Guest clients belonging to different tenants, but using the same
+ # auth ID.
+ auth_id = "guest"
+ guestclient_1 = {
+ "auth_id": auth_id,
+ "tenant_id": "tenant1",
+ }
+ guestclient_2 = {
+ "auth_id": auth_id,
+ "tenant_id": "tenant2",
+ }
+
+ # Create a volume.
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.create_volume(vp, 1024*1024*10)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )))
+
+ # Check that volume metadata file is created on volume creation.
+ vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id)
+ self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+ # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+ # 'tenant1', with 'rw' access to the volume.
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ auth_id=guestclient_1["auth_id"],
+ tenant_id=guestclient_1["tenant_id"]
+ )))
+
+ # Check that the auth metadata file for auth ID 'guest' is
+ # created on authorizing 'guest' access to the volume.
+ auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+ self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+ # Verify that the auth metadata file stores the tenant ID that the
+ # auth ID belongs to, the auth ID's authorized access levels
+ # for different volumes, versioning details, etc.
+ expected_auth_metadata = {
+ u"version": 1,
+ u"compat_version": 1,
+ u"dirty": False,
+ u"tenant_id": u"tenant1",
+ u"volumes": {
+ u"groupid/volumeid": {
+ u"dirty": False,
+ u"access_level": u"rw",
+ }
+ }
+ }
+
+ auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ auth_metadata = vc._auth_metadata_get("{auth_id}")
+ print auth_metadata
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ auth_id=guestclient_1["auth_id"],
+ )))
+
+ self.assertItemsEqual(str(expected_auth_metadata), auth_metadata)
+
+ # Verify that the volume metadata file stores info about auth IDs
+ # and their access levels to the volume, versioning details, etc.
+ expected_vol_metadata = {
+ u"version": 1,
+ u"compat_version": 1,
+ u"auths": {
+ u"guest": {
+ u"dirty": False,
+ u"access_level": u"rw"
+ }
+ }
+ }
+
+ vol_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ volume_metadata = vc._volume_metadata_get(vp)
+ print volume_metadata
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )))
+ self.assertItemsEqual(str(expected_vol_metadata), vol_metadata)
+
+ # Cannot authorize 'guestclient_2' to access the volume.
+ # It uses auth ID 'guest', which has already been used by
+ # 'guestclient_1', belonging to another tenant, to access
+ # the volume.
+ with self.assertRaises(CommandFailedError):
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ auth_id=guestclient_2["auth_id"],
+ tenant_id=guestclient_2["tenant_id"]
+ )))
+
+ # Check that the auth metadata file is cleaned up on removing
+ # the auth ID's only access to a volume.
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.deauthorize(vp, "{guest_entity}")
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ guest_entity=guestclient_1["auth_id"]
+ )))
+
+ self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+ # Check that volume metadata file is cleaned up on volume deletion.
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.delete_volume(vp)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )))
+ self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+ def test_recover_metadata(self):
+ """
+ That volume client can recover from partial auth updates using
+ metadata files, which store auth info and its update status info.
+ """
+ volumeclient_mount = self.mounts[1]
+ volumeclient_mount.umount_wait()
+
+ # Configure volumeclient_mount as the handle for driving volumeclient.
+ self._configure_vc_auth(volumeclient_mount, "manila")
+
+ group_id = "groupid"
+ volume_id = "volumeid"
+
+ guestclient = {
+ "auth_id": "guest",
+ "tenant_id": "tenant",
+ }
+
+ # Create a volume.
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.create_volume(vp, 1024*1024*10)
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ )))
+
+ # Authorize 'guestclient' access to the volume.
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ auth_id=guestclient["auth_id"],
+ tenant_id=guestclient["tenant_id"]
+ )))
+
+ # Check that auth metadata file for auth ID 'guest' is created.
+ auth_metadata_filename = "${0}.meta".format(guestclient["auth_id"])
+ self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+ # Induce partial auth update state by modifying the auth metadata file,
+ # and then run recovery procedure.
+ self._volume_client_python(volumeclient_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ auth_metadata = vc._auth_metadata_get("{auth_id}")
+ auth_metadata['dirty'] = True
+ vc._auth_metadata_set("{auth_id}", auth_metadata)
+ vc.recover()
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id,
+ auth_id=guestclient["auth_id"],
+ )))
--- /dev/null
+import contextlib
+import logging
+import os
+import unittest
+from unittest import suite, loader, case
+from teuthology.task import interactive
+from teuthology import misc
+from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster
+from tasks.mgr.mgr_test_case import MgrCluster
+
+log = logging.getLogger(__name__)
+
+
+class DecoratingLoader(loader.TestLoader):
+ """
+ A specialization of TestLoader that tags some extra attributes
+ onto test classes as they are loaded.
+ """
+ def __init__(self, params):
+ self._params = params
+ super(DecoratingLoader, self).__init__()
+
+ def _apply_params(self, obj):
+ for k, v in self._params.items():
+ setattr(obj, k, v)
+
+ def loadTestsFromTestCase(self, testCaseClass):
+ self._apply_params(testCaseClass)
+ return super(DecoratingLoader, self).loadTestsFromTestCase(testCaseClass)
+
+ def loadTestsFromName(self, name, module=None):
+ result = super(DecoratingLoader, self).loadTestsFromName(name, module)
+
+ # Special case: when we were called with the name of a method, we get
+ # a suite with one TestCase
+ tests_in_result = list(result)
+ if len(tests_in_result) == 1 and isinstance(tests_in_result[0], case.TestCase):
+ self._apply_params(tests_in_result[0])
+
+ return result
+
+
+class LogStream(object):
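+ """
+ File-like stream that forwards complete lines to the module logger,
+ buffering any trailing partial line until more data arrives.
+ """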
+ def __init__(self):
+ self.buffer = ""
+
+ def write(self, data):
+ self.buffer += data
+ if "\n" in self.buffer:
+ lines = self.buffer.split("\n")
+ for line in lines[:-1]:
+ log.info(line)
+ self.buffer = lines[-1]
+
+ def flush(self):
+ pass
+
+
+class InteractiveFailureResult(unittest.TextTestResult):
+ """
+ Specialization that implements interactive-on-error style
+ behavior.
+ """
+ ctx = None
+
+ def addFailure(self, test, err):
+ log.error(self._exc_info_to_string(err, test))
+ log.error("Failure in test '{0}', going interactive".format(
+ self.getDescription(test)
+ ))
+ interactive.task(ctx=self.ctx, config=None)
+
+ def addError(self, test, err):
+ log.error(self._exc_info_to_string(err, test))
+ log.error("Error in test '{0}', going interactive".format(
+ self.getDescription(test)
+ ))
+ interactive.task(ctx=self.ctx, config=None)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run the CephFS test cases.
+
+ Run everything in tasks/cephfs/test_*.py:
+
+ ::
+
+ tasks:
+ - install:
+ - ceph:
+ - ceph-fuse:
+ - cephfs_test_runner:
+
+ `modules` argument allows running only some specific modules:
+
+ ::
+
+ tasks:
+ ...
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_sessionmap
+ - tasks.cephfs.test_auto_repair
+
+ By default, any cases that can't be run on the current cluster configuration
+ will generate a failure. When the optional `fail_on_skip` argument is set
+ to false, any tests that can't be run on the current configuration will
+ simply be skipped:
+
+ ::
+
+ tasks:
+ ...
+ - cephfs_test_runner:
+ fail_on_skip: false
+
+ """
+
+ ceph_cluster = CephCluster(ctx)
+
+ if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))):
+ mds_cluster = MDSCluster(ctx)
+ fs = Filesystem(ctx)
+ else:
+ mds_cluster = None
+ fs = None
+
+ if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))):
+ mgr_cluster = MgrCluster(ctx)
+ else:
+ mgr_cluster = None
+
+ # Mount objects, sorted by ID
+ if hasattr(ctx, 'mounts'):
+ mounts = [v for k, v in sorted(ctx.mounts.items(), lambda a, b: cmp(a[0], b[0]))]
+ else:
+ # The test configuration has a filesystem but no fuse/kclient mounts
+ mounts = []
+
+ decorating_loader = DecoratingLoader({
+ "ctx": ctx,
+ "mounts": mounts,
+ "fs": fs,
+ "ceph_cluster": ceph_cluster,
+ "mds_cluster": mds_cluster,
+ "mgr_cluster": mgr_cluster,
+ })
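+ # These parameters are set as attributes on every loaded test class, so
+ # individual test cases can refer to self.ctx, self.mounts, self.fs, etc.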
+
+ fail_on_skip = config.get('fail_on_skip', True)
+
+ # Put useful things onto ctx for interactive debugging
+ ctx.fs = fs
+ ctx.mds_cluster = mds_cluster
+ ctx.mgr_cluster = mgr_cluster
+
+ # Depending on config, either load specific modules, or scan for modules
+ if config and 'modules' in config and config['modules']:
+ module_suites = []
+ for mod_name in config['modules']:
+ # Test names like cephfs.test_auto_repair
+ module_suites.append(decorating_loader.loadTestsFromName(mod_name))
+ overall_suite = suite.TestSuite(module_suites)
+ else:
+ # Default, run all tests
+ overall_suite = decorating_loader.discover(
+ os.path.join(
+ os.path.dirname(os.path.abspath(__file__)),
+ "cephfs/"
+ )
+ )
+
+ if ctx.config.get("interactive-on-error", False):
+ InteractiveFailureResult.ctx = ctx
+ result_class = InteractiveFailureResult
+ else:
+ result_class = unittest.TextTestResult
+
+ class LoggingResult(result_class):
+ def startTest(self, test):
+ log.info("Starting test: {0}".format(self.getDescription(test)))
+ return super(LoggingResult, self).startTest(test)
+
+ def addSkip(self, test, reason):
+ if fail_on_skip:
+ # Don't just call addFailure because that requires a traceback
+ self.failures.append((test, reason))
+ else:
+ super(LoggingResult, self).addSkip(test, reason)
+
+ # Execute!
+ result = unittest.TextTestRunner(
+ stream=LogStream(),
+ resultclass=LoggingResult,
+ verbosity=2,
+ failfast=True).run(overall_suite)
+
+ if not result.wasSuccessful():
+ result.printErrors() # duplicate output at end for convenience
+
+ bad_tests = []
+ for test, error in result.errors:
+ bad_tests.append(str(test))
+ for test, failure in result.failures:
+ bad_tests.append(str(test))
+
+ raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests)))
+
+ yield
--- /dev/null
+"""
+Mount cifs clients. Unmount when finished.
+"""
+import contextlib
+import logging
+import os
+import time
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Mount/unmount a cifs client.
+
+ The config is optional and defaults to mounting on all clients. If
+ a config is given, it is expected to be a list of clients to do
+ this operation on.
+
+ Example that starts smbd and mounts cifs on all nodes::
+
+ tasks:
+ - ceph:
+ - samba:
+ - cifs-mount:
+ - interactive:
+
+ Example that splits smbd and cifs::
+
+ tasks:
+ - ceph:
+ - samba: [samba.0]
+ - cifs-mount: [client.0]
+ - ceph-fuse: [client.1]
+ - interactive:
+
+ Example that specifies the share name::
+
+ tasks:
+ - ceph:
+ - ceph-fuse:
+ - samba:
+ samba.0:
+ cephfuse: "{testdir}/mnt.0"
+ - cifs-mount:
+ client.0:
+ share: cephfuse
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ log.info('Mounting cifs clients...')
+
+ if config is None:
+ config = dict(('client.{id}'.format(id=id_), None)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client'))
+ elif isinstance(config, list):
+ config = dict((name, None) for name in config)
+
+ clients = list(teuthology.get_clients(ctx=ctx, roles=config.keys()))
+
+ from .samba import get_sambas
+ samba_roles = ['samba.{id_}'.format(id_=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba')]
+ sambas = list(get_sambas(ctx=ctx, roles=samba_roles))
+ (ip, _) = sambas[0][1].ssh.get_transport().getpeername()
+ log.info('samba ip: {ip}'.format(ip=ip))
+
+ for id_, remote in clients:
+ mnt = os.path.join(teuthology.get_testdir(ctx), 'mnt.{id}'.format(id=id_))
+ log.info('Mounting cifs client.{id} at {remote} {mnt}...'.format(
+ id=id_, remote=remote, mnt=mnt))
+
+ remote.run(
+ args=[
+ 'mkdir',
+ '--',
+ mnt,
+ ],
+ )
+
+ rolestr = 'client.{id_}'.format(id_=id_)
+ unc = "ceph"
+ log.info("config: {c}".format(c=config))
+ if config[rolestr] is not None and 'share' in config[rolestr]:
+ unc = config[rolestr]['share']
+
+ remote.run(
+ args=[
+ 'sudo',
+ 'mount',
+ '-t',
+ 'cifs',
+ '//{sambaip}/{unc}'.format(sambaip=ip, unc=unc),
+ '-o',
+ 'username=ubuntu,password=ubuntu',
+ mnt,
+ ],
+ )
+
+ remote.run(
+ args=[
+ 'sudo',
+ 'chown',
+ 'ubuntu:ubuntu',
+ '{m}/'.format(m=mnt),
+ ],
+ )
+
+ try:
+ yield
+ finally:
+ log.info('Unmounting cifs clients...')
+ for id_, remote in clients:
+ remote.run(
+ args=[
+ 'sudo',
+ 'umount',
+ mnt,
+ ],
+ )
+ for id_, remote in clients:
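+ # Retry rmdir until the mountpoint is free: the piped grep succeeds only
+ # while 'Device or resource busy' is reported, so a failing grep (i.e. a
+ # clean rmdir) raises and breaks out of the loop.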
+ while True:
+ try:
+ remote.run(
+ args=[
+ 'rmdir', '--', mnt,
+ run.Raw('2>&1'),
+ run.Raw('|'),
+ 'grep', 'Device or resource busy',
+ ],
+ )
+ time.sleep(1)
+ except Exception:
+ break
--- /dev/null
+"""
+Cram tests
+"""
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Run all cram tests from the specified urls on the specified
+ clients. Each client runs tests in parallel.
+
+ Limitations:
+ Tests must have a .t suffix. Tests with duplicate names will
+ overwrite each other, so only the last one will run.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - cram:
+ clients:
+ client.0:
+ - http://ceph.com/qa/test.t
+ - http://ceph.com/qa/test2.t
+ client.1: [http://ceph.com/qa/test.t]
+ branch: foo
+
+ You can also run a list of cram tests on all clients::
+
+ tasks:
+ - ceph:
+ - cram:
+ clients:
+ all: [http://ceph.com/qa/test.t]
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ assert isinstance(config, dict)
+ assert 'clients' in config and isinstance(config['clients'], dict), \
+ 'configuration must contain a dictionary of clients'
+
+ clients = teuthology.replace_all_with_clients(ctx.cluster,
+ config['clients'])
+ testdir = teuthology.get_testdir(ctx)
+
+ overrides = ctx.config.get('overrides', {})
+ teuthology.deep_merge(config, overrides.get('workunit', {}))
+
+ refspec = config.get('branch')
+ if refspec is None:
+ refspec = config.get('tag')
+ if refspec is None:
+ refspec = config.get('sha1')
+ if refspec is None:
+ refspec = 'HEAD'
+
+ try:
+ for client, tests in clients.iteritems():
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+ client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client)
+ remote.run(
+ args=[
+ 'mkdir', '--', client_dir,
+ run.Raw('&&'),
+ 'virtualenv', '{tdir}/virtualenv'.format(tdir=testdir),
+ run.Raw('&&'),
+ '{tdir}/virtualenv/bin/pip'.format(tdir=testdir),
+ 'install', 'cram==0.6',
+ ],
+ )
+ for test in tests:
+ log.info('fetching test %s for %s', test, client)
+ assert test.endswith('.t'), 'tests must end in .t'
+ remote.run(
+ args=[
+ 'wget', '-nc', '-nv', '-P', client_dir, '--', test.format(branch=refspec),
+ ],
+ )
+
+ with parallel() as p:
+ for role in clients.iterkeys():
+ p.spawn(_run_tests, ctx, role)
+ finally:
+ for client, tests in clients.iteritems():
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+ client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client)
+ test_files = set([test.rsplit('/', 1)[1] for test in tests])
+
+ # remove test files unless they failed
+ for test_file in test_files:
+ abs_file = os.path.join(client_dir, test_file)
+ remote.run(
+ args=[
+ 'test', '-f', abs_file + '.err',
+ run.Raw('||'),
+ 'rm', '-f', '--', abs_file,
+ ],
+ )
+
+ # ignore failure since more than one client may
+ # be run on a host, and the client dir should be
+ # non-empty if the test failed
+ remote.run(
+ args=[
+ 'rm', '-rf', '--',
+ '{tdir}/virtualenv'.format(tdir=testdir),
+ run.Raw(';'),
+ 'rmdir', '--ignore-fail-on-non-empty', client_dir,
+ ],
+ )
+
+def _run_tests(ctx, role):
+ """
+ For each role, check to make sure it's a client, then run the cram tests on that client
+
+ :param ctx: Context
+ :param role: Roles
+ """
+ assert isinstance(role, basestring)
+ PREFIX = 'client.'
+ assert role.startswith(PREFIX)
+ id_ = role[len(PREFIX):]
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+ ceph_ref = ctx.summary.get('ceph-sha1', 'master')
+
+ testdir = teuthology.get_testdir(ctx)
+ log.info('Running tests for %s...', role)
+ remote.run(
+ args=[
+ run.Raw('CEPH_REF={ref}'.format(ref=ceph_ref)),
+ run.Raw('CEPH_ID="{id}"'.format(id=id_)),
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ '{tdir}/virtualenv/bin/cram'.format(tdir=testdir),
+ '-v', '--',
+ run.Raw('{tdir}/archive/cram.{role}/*.t'.format(tdir=testdir, role=role)),
+ ],
+ logger=log.getChild(role),
+ )
--- /dev/null
+"""
+Rados model-based integration tests
+"""
+import contextlib
+import logging
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ For each combination of namespace and name_length, create
+ <num_objects> objects with name length <name_length>
+ on entry. On exit, verify that the objects still exist, can
+ be deleted, and then don't exist.
+
+ Usage::
+
+ create_verify_lfn_objects.py:
+ pool: <pool_name> default: 'data'
+ prefix: <prefix> default: ''
+ namespace: [<namespace>] default: ['']
+ num_objects: [<num_objects>] default: 10
+ name_length: [<name_length>] default: [400]
+ """
+ pool = config.get('pool', 'data')
+ num_objects = config.get('num_objects', 10)
+ name_length = config.get('name_length', [400])
+ namespace = config.get('namespace', [None])
+ prefix = config.get('prefix', '')
+ manager = ctx.managers['ceph']
+
+ objects = []
+ for l in name_length:
+ for ns in namespace:
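+ # Pad with 'a' so the full name (prefix + filler + index) reaches the
+ # requested length l; e.g. with an empty prefix and namespace and l=400,
+ # object 7 is named as 399 'a' characters followed by '7'.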
+ def object_name(i):
+ nslength = 0
+ if ns:
+ nslength = len(ns)
+ numstr = str(i)
+ fillerlen = l - nslength - len(prefix) - len(numstr)
+ assert fillerlen >= 0
+ return prefix + ('a'*fillerlen) + numstr
+ objects += [(ns, object_name(i)) for i in range(num_objects)]
+
+ for ns, name in objects:
+ err = manager.do_put(
+ pool,
+ name,
+ '/etc/resolv.conf',
+ namespace=ns)
+ log.info("err is " + str(err))
+ assert err == 0
+
+ try:
+ yield
+ finally:
+ log.info('ceph_verify_lfn_objects verifying...')
+ for ns, name in objects:
+ err = manager.do_get(
+ pool,
+ name,
+ namespace=ns)
+ log.info("err is " + str(err))
+ assert err == 0
+
+ log.info('ceph_verify_lfn_objects deleting...')
+ for ns, name in objects:
+ err = manager.do_rm(
+ pool,
+ name,
+ namespace=ns)
+ log.info("err is " + str(err))
+ assert err == 0
+
+ log.info('ceph_verify_lfn_objects verifying absent...')
+ for ns, name in objects:
+ err = manager.do_get(
+ pool,
+ name,
+ namespace=ns)
+ log.info("err is " + str(err))
+ assert err != 0
--- /dev/null
+#!/usr/bin/env python
+import contextlib
+import logging
+from cStringIO import StringIO
+import textwrap
+from ConfigParser import ConfigParser
+import time
+
+from teuthology.orchestra import run
+from teuthology import misc
+from teuthology.contextutil import nested
+
+log = logging.getLogger(__name__)
+
+DEVSTACK_GIT_REPO = 'https://github.com/openstack-dev/devstack.git'
+DS_STABLE_BRANCHES = ("havana", "grizzly")
+
+is_devstack_node = lambda role: role.startswith('devstack')
+is_osd_node = lambda role: role.startswith('osd')
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ if config is None:
+ config = {}
+ if not isinstance(config, dict):
+ raise TypeError("config must be a dict")
+ with nested(lambda: install(ctx=ctx, config=config),
+ lambda: smoke(ctx=ctx, config=config),
+ ):
+ yield
+
+
+@contextlib.contextmanager
+def install(ctx, config):
+ """
+ Install OpenStack DevStack and configure it to use a Ceph cluster for
+ Glance and Cinder.
+
+ Requires one node with a role 'devstack'
+
+ Since devstack runs rampant on the system it's used on, typically you will
+ want to reprovision that machine after using devstack on it.
+
+ Also, the default 2GB of RAM that is given to vps nodes is insufficient. I
+ recommend 4GB. Downburst can be instructed to give 4GB to a vps node by
+ adding this to the yaml:
+
+ downburst:
+ ram: 4G
+
+ This was created using documentation found here:
+ https://github.com/openstack-dev/devstack/blob/master/README.md
+ http://ceph.com/docs/master/rbd/rbd-openstack/
+ """
+ if config is None:
+ config = {}
+ if not isinstance(config, dict):
+ raise TypeError("config must be a dict")
+
+ devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0]
+ an_osd_node = ctx.cluster.only(is_osd_node).remotes.keys()[0]
+
+ devstack_branch = config.get("branch", "master")
+ install_devstack(devstack_node, devstack_branch)
+ try:
+ configure_devstack_and_ceph(ctx, config, devstack_node, an_osd_node)
+ yield
+ finally:
+ pass
+
+
+def install_devstack(devstack_node, branch="master"):
+ log.info("Cloning DevStack repo...")
+
+ args = ['git', 'clone', DEVSTACK_GIT_REPO]
+ devstack_node.run(args=args)
+
+ if branch != "master":
+ if branch in DS_STABLE_BRANCHES and not branch.startswith("stable"):
+ branch = "stable/" + branch
+ log.info("Checking out {branch} branch...".format(branch=branch))
+ cmd = "cd devstack && git checkout " + branch
+ devstack_node.run(args=cmd)
+
+ log.info("Installing DevStack...")
+ args = ['cd', 'devstack', run.Raw('&&'), './stack.sh']
+ devstack_node.run(args=args)
+
+
+def configure_devstack_and_ceph(ctx, config, devstack_node, ceph_node):
+ pool_size = config.get('pool_size', '128')
+ create_pools(ceph_node, pool_size)
+ distribute_ceph_conf(devstack_node, ceph_node)
+ # This is where we would install python-ceph and ceph-common but it appears
+ # the ceph task does that for us.
+ generate_ceph_keys(ceph_node)
+ distribute_ceph_keys(devstack_node, ceph_node)
+ secret_uuid = set_libvirt_secret(devstack_node, ceph_node)
+ update_devstack_config_files(devstack_node, secret_uuid)
+ set_apache_servername(devstack_node)
+ # Rebooting is the most-often-used method of restarting devstack services
+ misc.reboot(devstack_node)
+ start_devstack(devstack_node)
+ restart_apache(devstack_node)
+
+
+def create_pools(ceph_node, pool_size):
+ log.info("Creating pools on Ceph cluster...")
+
+ for pool_name in ['volumes', 'images', 'backups']:
+ args = ['sudo', 'ceph', 'osd', 'pool', 'create', pool_name, pool_size]
+ ceph_node.run(args=args)
+
+
+def distribute_ceph_conf(devstack_node, ceph_node):
+ log.info("Copying ceph.conf to DevStack node...")
+
+ ceph_conf_path = '/etc/ceph/ceph.conf'
+ ceph_conf = misc.get_file(ceph_node, ceph_conf_path, sudo=True)
+ misc.sudo_write_file(devstack_node, ceph_conf_path, ceph_conf)
+
+
+def generate_ceph_keys(ceph_node):
+ log.info("Generating Ceph keys...")
+
+ ceph_auth_cmds = [
+ ['sudo', 'ceph', 'auth', 'get-or-create', 'client.cinder', 'mon',
+ 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rx pool=images'], # noqa
+ ['sudo', 'ceph', 'auth', 'get-or-create', 'client.glance', 'mon',
+ 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=images'], # noqa
+ ['sudo', 'ceph', 'auth', 'get-or-create', 'client.cinder-backup', 'mon',
+ 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=backups'], # noqa
+ ]
+ for cmd in ceph_auth_cmds:
+ ceph_node.run(args=cmd)
+
+
+def distribute_ceph_keys(devstack_node, ceph_node):
+ log.info("Copying Ceph keys to DevStack node...")
+
+ def copy_key(from_remote, key_name, to_remote, dest_path, owner):
+ key_stringio = StringIO()
+ from_remote.run(
+ args=['sudo', 'ceph', 'auth', 'get-or-create', key_name],
+ stdout=key_stringio)
+ key_stringio.seek(0)
+ misc.sudo_write_file(to_remote, dest_path,
+ key_stringio, owner=owner)
+ keys = [
+ dict(name='client.glance',
+ path='/etc/ceph/ceph.client.glance.keyring',
+ # devstack appears to just want root:root
+ #owner='glance:glance',
+ ),
+ dict(name='client.cinder',
+ path='/etc/ceph/ceph.client.cinder.keyring',
+ # devstack appears to just want root:root
+ #owner='cinder:cinder',
+ ),
+ dict(name='client.cinder-backup',
+ path='/etc/ceph/ceph.client.cinder-backup.keyring',
+ # devstack appears to just want root:root
+ #owner='cinder:cinder',
+ ),
+ ]
+ for key_dict in keys:
+ copy_key(ceph_node, key_dict['name'], devstack_node,
+ key_dict['path'], key_dict.get('owner'))
+
+
+def set_libvirt_secret(devstack_node, ceph_node):
+ log.info("Setting libvirt secret...")
+
+ cinder_key_stringio = StringIO()
+ ceph_node.run(args=['sudo', 'ceph', 'auth', 'get-key', 'client.cinder'],
+ stdout=cinder_key_stringio)
+ cinder_key = cinder_key_stringio.getvalue().strip()
+
+ uuid_stringio = StringIO()
+ devstack_node.run(args=['uuidgen'], stdout=uuid_stringio)
+ uuid = uuid_stringio.getvalue().strip()
+
+ secret_path = '/tmp/secret.xml'
+ secret_template = textwrap.dedent("""
+ <secret ephemeral='no' private='no'>
+ <uuid>{uuid}</uuid>
+ <usage type='ceph'>
+ <name>client.cinder secret</name>
+ </usage>
+ </secret>""")
+ misc.sudo_write_file(devstack_node, secret_path,
+ secret_template.format(uuid=uuid))
+ devstack_node.run(args=['sudo', 'virsh', 'secret-define', '--file',
+ secret_path])
+ devstack_node.run(args=['sudo', 'virsh', 'secret-set-value', '--secret',
+ uuid, '--base64', cinder_key])
+ return uuid
+
+
+def update_devstack_config_files(devstack_node, secret_uuid):
+ log.info("Updating DevStack config files to use Ceph...")
+
+ def backup_config(node, file_name, backup_ext='.orig.teuth'):
+ node.run(args=['cp', '-f', file_name, file_name + backup_ext])
+
+ def update_config(config_name, config_stream, update_dict,
+ section='DEFAULT'):
+ parser = ConfigParser()
+ parser.readfp(config_stream)
+ for (key, value) in update_dict.items():
+ parser.set(section, key, value)
+ out_stream = StringIO()
+ parser.write(out_stream)
+ out_stream.seek(0)
+ return out_stream
+
+ updates = [
+ dict(name='/etc/glance/glance-api.conf', options=dict(
+ default_store='rbd',
+ rbd_store_user='glance',
+ rbd_store_pool='images',
+ show_image_direct_url='True',)),
+ dict(name='/etc/cinder/cinder.conf', options=dict(
+ volume_driver='cinder.volume.drivers.rbd.RBDDriver',
+ rbd_pool='volumes',
+ rbd_ceph_conf='/etc/ceph/ceph.conf',
+ rbd_flatten_volume_from_snapshot='false',
+ rbd_max_clone_depth='5',
+ glance_api_version='2',
+ rbd_user='cinder',
+ rbd_secret_uuid=secret_uuid,
+ backup_driver='cinder.backup.drivers.ceph',
+ backup_ceph_conf='/etc/ceph/ceph.conf',
+ backup_ceph_user='cinder-backup',
+ backup_ceph_chunk_size='134217728',
+ backup_ceph_pool='backups',
+ backup_ceph_stripe_unit='0',
+ backup_ceph_stripe_count='0',
+ restore_discard_excess_bytes='true',
+ )),
+ dict(name='/etc/nova/nova.conf', options=dict(
+ libvirt_images_type='rbd',
+ libvirt_images_rbd_pool='volumes',
+ libvirt_images_rbd_ceph_conf='/etc/ceph/ceph.conf',
+ rbd_user='cinder',
+ rbd_secret_uuid=secret_uuid,
+ libvirt_inject_password='false',
+ libvirt_inject_key='false',
+ libvirt_inject_partition='-2',
+ )),
+ ]
+
+ for update in updates:
+ file_name = update['name']
+ options = update['options']
+ config_str = misc.get_file(devstack_node, file_name, sudo=True)
+ config_stream = StringIO(config_str)
+ backup_config(devstack_node, file_name)
+ new_config_stream = update_config(file_name, config_stream, options)
+ misc.sudo_write_file(devstack_node, file_name, new_config_stream)
+
+
+def set_apache_servername(node):
+ # Apache complains: "Could not reliably determine the server's fully
+ # qualified domain name, using 127.0.0.1 for ServerName"
+ # So, let's make sure it knows its name.
+ log.info("Setting Apache ServerName...")
+
+ hostname = node.hostname
+ config_file = '/etc/apache2/conf.d/servername'
+ misc.sudo_write_file(node, config_file,
+ "ServerName {name}".format(name=hostname))
+
+
+def start_devstack(devstack_node):
+ log.info("Patching devstack start script...")
+ # This causes screen to start headless - otherwise rejoin-stack.sh fails
+ # because there is no terminal attached.
+ cmd = "cd devstack && sed -ie 's/screen -c/screen -dm -c/' rejoin-stack.sh"
+ devstack_node.run(args=cmd)
+
+ log.info("Starting devstack...")
+ cmd = "cd devstack && ./rejoin-stack.sh"
+ devstack_node.run(args=cmd)
+
+ # This was added because I was getting timeouts on Cinder requests - which
+ # were trying to access Keystone on port 5000. A more robust way to handle
+ # this would be to introduce a wait-loop on devstack_node that checks to
+ # see if a service is listening on port 5000.
+ log.info("Waiting 30s for devstack to start...")
+ time.sleep(30)
+
+
+def restart_apache(node):
+ node.run(args=['sudo', '/etc/init.d/apache2', 'restart'], wait=True)
+
+
+@contextlib.contextmanager
+def exercise(ctx, config):
+ log.info("Running devstack exercises...")
+
+ if config is None:
+ config = {}
+ if not isinstance(config, dict):
+ raise TypeError("config must be a dict")
+
+ devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0]
+
+ # TODO: save the log *and* preserve failures
+ #devstack_archive_dir = create_devstack_archive(ctx, devstack_node)
+
+ try:
+ #cmd = "cd devstack && ./exercise.sh 2>&1 | tee {dir}/exercise.log".format( # noqa
+ # dir=devstack_archive_dir)
+ cmd = "cd devstack && ./exercise.sh"
+ devstack_node.run(args=cmd, wait=True)
+ yield
+ finally:
+ pass
+
+
+def create_devstack_archive(ctx, devstack_node):
+ test_dir = misc.get_testdir(ctx)
+ devstack_archive_dir = "{test_dir}/archive/devstack".format(
+ test_dir=test_dir)
+ devstack_node.run(args="mkdir -p " + devstack_archive_dir)
+ return devstack_archive_dir
+
+
+@contextlib.contextmanager
+def smoke(ctx, config):
+ log.info("Running a basic smoketest...")
+
+ devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0]
+ an_osd_node = ctx.cluster.only(is_osd_node).remotes.keys()[0]
+
+ try:
+ create_volume(devstack_node, an_osd_node, 'smoke0', 1)
+ yield
+ finally:
+ pass
+
+
+def create_volume(devstack_node, ceph_node, vol_name, size):
+ """
+ :param size: The size of the volume, in GB
+ """
+ size = str(size)
+ log.info("Creating a {size}GB volume named {name}...".format(
+ name=vol_name,
+ size=size))
+ args = ['source', 'devstack/openrc', run.Raw('&&'), 'cinder', 'create',
+ '--display-name', vol_name, size]
+ out_stream = StringIO()
+ devstack_node.run(args=args, stdout=out_stream, wait=True)
+ vol_info = parse_os_table(out_stream.getvalue())
+ log.debug("Volume info: %s", str(vol_info))
+
+ out_stream = StringIO()
+ try:
+ ceph_node.run(args="rbd --id cinder ls -l volumes", stdout=out_stream,
+ wait=True)
+ except run.CommandFailedError:
+ log.debug("Original rbd call failed; retrying without '--id cinder'")
+ ceph_node.run(args="rbd ls -l volumes", stdout=out_stream,
+ wait=True)
+
+ assert vol_info['id'] in out_stream.getvalue(), \
+ "Volume not found on Ceph cluster"
+ assert vol_info['size'] == size, \
+ "Volume size on Ceph cluster is different than specified"
+ return vol_info['id']
+
+
+def parse_os_table(table_str):
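+ # Parses the ASCII table printed by OpenStack CLIs: a row such as
+ # "| id | 3f2a |" splits into ['|', 'id', '|', '3f2a', '|'], so
+ # out_dict['id'] = '3f2a'. Only lines starting with '|' are considered.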
+ out_dict = dict()
+ for line in table_str.split('\n'):
+ if line.startswith('|'):
+ items = line.split()
+ out_dict[items[1]] = items[3]
+ return out_dict
--- /dev/null
+"""
+Raise exceptions on osd coredumps or test err directories
+"""
+import contextlib
+import logging
+import time
+from teuthology.orchestra import run
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Die if {testdir}/err exists or if an OSD dumps core
+ """
+ if config is None:
+ config = {}
+
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+ log.info('num_osds is %s' % num_osds)
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ while len(manager.get_osd_status()['up']) < num_osds:
+ time.sleep(10)
+
+ testdir = teuthology.get_testdir(ctx)
+
+ while True:
+ for i in range(num_osds):
+ (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.iterkeys()
+ p = osd_remote.run(
+ args = [ 'test', '-e', '{tdir}/err'.format(tdir=testdir) ],
+ wait=True,
+ check_status=False,
+ )
+ exit_status = p.exitstatus
+
+ if exit_status == 0:
+ log.info("osd %d has an error" % i)
+ raise Exception("osd %d error" % i)
+
+ log_path = '/var/log/ceph/osd.%d.log' % (i)
+
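+ # When a daemon aborts it dumps its recent log events before exiting; the
+ # final line of that dump contains 'end dump', so seeing it as the last
+ # line of the OSD log is treated as evidence of a core dump.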
+ p = osd_remote.run(
+ args = [
+ 'tail', '-1', log_path,
+ run.Raw('|'),
+ 'grep', '-q', 'end dump'
+ ],
+ wait=True,
+ check_status=False,
+ )
+ exit_status = p.exitstatus
+
+ if exit_status == 0:
+ log.info("osd %d dumped core" % i)
+ raise Exception("osd %d dumped core" % i)
+
+ time.sleep(5)
--- /dev/null
+"""
+Special case divergence test
+"""
+import logging
+import time
+
+from teuthology import misc as teuthology
+from util.rados import rados
+
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+ """
+ Test handling of divergent entries with prior_version
+ prior to log_tail
+
+ overrides:
+ ceph:
+ conf:
+ osd:
+ debug osd: 5
+
+ Requires 3 osds on a single test node.
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'divergent_priors task only accepts a dict for configuration'
+
+ manager = ctx.managers['ceph']
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('osd', 'set', 'noout')
+ manager.raw_cluster_cmd('osd', 'set', 'noin')
+ manager.raw_cluster_cmd('osd', 'set', 'nodown')
+ manager.wait_for_clean()
+
+ # something that is always there
+ dummyfile = '/etc/fstab'
+ dummyfile2 = '/etc/resolv.conf'
+
+ # create 1 pg pool
+ log.info('creating foo')
+ manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+
+ osds = [0, 1, 2]
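+ # Keep the pg log very short (10 entries, trim at 5) so that, after the
+ # divergent writes below, the divergent entries' prior_version falls behind
+ # the log tail, as described in the task docstring.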
+ for i in osds:
+ manager.set_config(i, osd_min_pg_log_entries=10)
+ manager.set_config(i, osd_max_pg_log_entries=10)
+ manager.set_config(i, osd_pg_log_trim_min=5)
+
+ # determine primary
+ divergent = manager.get_pg_primary('foo', 0)
+ log.info("primary and soon to be divergent is %d", divergent)
+ non_divergent = list(osds)
+ non_divergent.remove(divergent)
+
+ log.info('writing initial objects')
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ # write 100 objects
+ for i in range(100):
+ rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+ manager.wait_for_clean()
+
+ # blackhole non_divergent
+ log.info("blackholing osds %s", str(non_divergent))
+ for i in non_divergent:
+ manager.set_config(i, objectstore_blackhole=1)
+
+ DIVERGENT_WRITE = 5
+ DIVERGENT_REMOVE = 5
+ # Write some soon to be divergent
+ log.info('writing divergent objects')
+ for i in range(DIVERGENT_WRITE):
+ rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
+ dummyfile2], wait=False)
+ # Remove some soon to be divergent
+ log.info('remove divergent objects')
+ for i in range(DIVERGENT_REMOVE):
+ rados(ctx, mon, ['-p', 'foo', 'rm',
+ 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
+ time.sleep(10)
+ mon.run(
+ args=['killall', '-9', 'rados'],
+ wait=True,
+ check_status=False)
+
+ # kill all the osds but leave divergent in
+ log.info('killing all the osds')
+ for i in osds:
+ manager.kill_osd(i)
+ for i in osds:
+ manager.mark_down_osd(i)
+ for i in non_divergent:
+ manager.mark_out_osd(i)
+
+ # bring up non-divergent
+ log.info("bringing up non_divergent %s", str(non_divergent))
+ for i in non_divergent:
+ manager.revive_osd(i)
+ for i in non_divergent:
+ manager.mark_in_osd(i)
+
+ # write 1 non-divergent object (ensure that old divergent one is divergent)
+ objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
+ log.info('writing non-divergent object ' + objname)
+ rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
+
+ manager.wait_for_recovery()
+
+ # ensure no recovery of up osds first
+ log.info('delay recovery')
+ for i in non_divergent:
+ manager.wait_run_admin_socket(
+ 'osd', i, ['set_recovery_delay', '100000'])
+
+ # bring in our divergent friend
+ log.info("revive divergent %d", divergent)
+ manager.raw_cluster_cmd('osd', 'set', 'noup')
+ manager.revive_osd(divergent)
+
+ log.info('delay recovery divergent')
+ manager.wait_run_admin_socket(
+ 'osd', divergent, ['set_recovery_delay', '100000'])
+
+ manager.raw_cluster_cmd('osd', 'unset', 'noup')
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+
+ log.info('wait for peering')
+ rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+ # At this point the divergent_priors should have been detected
+
+ log.info("killing divergent %d", divergent)
+ manager.kill_osd(divergent)
+ log.info("reviving divergent %d", divergent)
+ manager.revive_osd(divergent)
+
+ time.sleep(20)
+
+ log.info('allowing recovery')
+ # Set osd_recovery_delay_start back to 0 and kick the queue
+ for i in osds:
+ manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
+ 'kick_recovery_wq', ' 0')
+
+ log.info('reading divergent objects')
+ for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
+ exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
+ '/tmp/existing'])
+ assert exit_status == 0
+
+ log.info("success")
--- /dev/null
+"""
+Special case divergence test with ceph-objectstore-tool export/remove/import
+"""
+import logging
+import time
+from cStringIO import StringIO
+
+from teuthology import misc as teuthology
+from util.rados import rados
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+ """
+ Test handling of divergent entries with prior_version
+ prior to log_tail and a ceph-objectstore-tool export/import
+
+ overrides:
+ ceph:
+ conf:
+ osd:
+ debug osd: 5
+
+ Requires 3 osds on a single test node.
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'divergent_priors task only accepts a dict for configuration'
+
+ manager = ctx.managers['ceph']
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('osd', 'set', 'noout')
+ manager.raw_cluster_cmd('osd', 'set', 'noin')
+ manager.raw_cluster_cmd('osd', 'set', 'nodown')
+ manager.wait_for_clean()
+
+ # something that is always there
+ dummyfile = '/etc/fstab'
+ dummyfile2 = '/etc/resolv.conf'
+ testdir = teuthology.get_testdir(ctx)
+
+ # create 1 pg pool
+ log.info('creating foo')
+ manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+
+ osds = [0, 1, 2]
+ for i in osds:
+ manager.set_config(i, osd_min_pg_log_entries=10)
+ manager.set_config(i, osd_max_pg_log_entries=10)
+ manager.set_config(i, osd_pg_log_trim_min=5)
+
+ # determine primary
+ divergent = manager.get_pg_primary('foo', 0)
+ log.info("primary and soon to be divergent is %d", divergent)
+ non_divergent = list(osds)
+ non_divergent.remove(divergent)
+
+ log.info('writing initial objects')
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ # write 100 objects
+ for i in range(100):
+ rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+ manager.wait_for_clean()
+
+ # blackhole non_divergent
+ log.info("blackholing osds %s", str(non_divergent))
+ for i in non_divergent:
+ manager.set_config(i, objectstore_blackhole=1)
+
+ DIVERGENT_WRITE = 5
+ DIVERGENT_REMOVE = 5
+ # Write some soon to be divergent
+ log.info('writing divergent objects')
+ for i in range(DIVERGENT_WRITE):
+ rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
+ dummyfile2], wait=False)
+ # Remove some soon to be divergent
+ log.info('remove divergent objects')
+ for i in range(DIVERGENT_REMOVE):
+ rados(ctx, mon, ['-p', 'foo', 'rm',
+ 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
+ time.sleep(10)
+ mon.run(
+ args=['killall', '-9', 'rados'],
+ wait=True,
+ check_status=False)
+
+ # kill all the osds but leave divergent in
+ log.info('killing all the osds')
+ for i in osds:
+ manager.kill_osd(i)
+ for i in osds:
+ manager.mark_down_osd(i)
+ for i in non_divergent:
+ manager.mark_out_osd(i)
+
+ # bring up non-divergent
+ log.info("bringing up non_divergent %s", str(non_divergent))
+ for i in non_divergent:
+ manager.revive_osd(i)
+ for i in non_divergent:
+ manager.mark_in_osd(i)
+
+ # write 1 non-divergent object (ensure that old divergent one is divergent)
+ objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
+ log.info('writing non-divergent object ' + objname)
+ rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
+
+ manager.wait_for_recovery()
+
+ # ensure no recovery of up osds first
+ log.info('delay recovery')
+ for i in non_divergent:
+ manager.wait_run_admin_socket(
+ 'osd', i, ['set_recovery_delay', '100000'])
+
+ # bring in our divergent friend
+ log.info("revive divergent %d", divergent)
+ manager.raw_cluster_cmd('osd', 'set', 'noup')
+ manager.revive_osd(divergent)
+
+ log.info('delay recovery divergent')
+ manager.wait_run_admin_socket(
+ 'osd', divergent, ['set_recovery_delay', '100000'])
+
+ manager.raw_cluster_cmd('osd', 'unset', 'noup')
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+
+ log.info('wait for peering')
+ rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+ # At this point the divergent_priors should have been detected
+
+ log.info("killing divergent %d", divergent)
+ manager.kill_osd(divergent)
+
+ # Export a pg
+ (exp_remote,) = ctx.\
+ cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
+ FSPATH = manager.get_filepath()
+ JPATH = os.path.join(FSPATH, "journal")
+ prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+ "--data-path {fpath} --journal-path {jpath} "
+ "--log-file="
+ "/var/log/ceph/objectstore_tool.$$.log ".
+ format(fpath=FSPATH, jpath=JPATH))
+ pid = os.getpid()
+ expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
+ cmd = ((prefix + "--op export --pgid 1.0 --file {file}").
+ format(id=divergent, file=expfile))
+ proc = exp_remote.run(args=cmd, wait=True,
+ check_status=False, stdout=StringIO())
+ assert proc.exitstatus == 0
+
+ cmd = ((prefix + "--op remove --pgid 1.0").
+ format(id=divergent, file=expfile))
+ proc = exp_remote.run(args=cmd, wait=True,
+ check_status=False, stdout=StringIO())
+ assert proc.exitstatus == 0
+
+ cmd = ((prefix + "--op import --file {file}").
+ format(id=divergent, file=expfile))
+ proc = exp_remote.run(args=cmd, wait=True,
+ check_status=False, stdout=StringIO())
+ assert proc.exitstatus == 0
+
+ log.info("reviving divergent %d", divergent)
+ manager.revive_osd(divergent)
+ manager.wait_run_admin_socket('osd', divergent, ['dump_ops_in_flight'])
+ time.sleep(20)
+
+ log.info('allowing recovery')
+ # Set osd_recovery_delay_start back to 0 and kick the queue
+ for i in osds:
+ manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
+ 'kick_recovery_wq', ' 0')
+
+ log.info('reading divergent objects')
+ for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
+ exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
+ '/tmp/existing'])
+ assert exit_status == 0
+
+ cmd = 'rm {file}'.format(file=expfile)
+ exp_remote.run(args=cmd, wait=True)
+ log.info("success")
--- /dev/null
+"""
+Dump_stuck command
+"""
+import logging
+import re
+import time
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
+ """
+ Do checks. Make sure get_stuck_pgs returns the right amount of information, then
+ extract health information from the raw_cluster_cmd and compare the results with
+ the values passed in. This passes if all asserts pass.
+
+ :param manager: Ceph manager
+ :param num_inactive: number of inactive pgs that are stuck
+ :param num_unclean: number of unclean pgs that are stuck
+ :param num_stale: number of stale pgs that are stuck
+ :param timeout: timeout value for get_stuck_pgs calls
+ """
+ inactive = manager.get_stuck_pgs('inactive', timeout)
+ unclean = manager.get_stuck_pgs('unclean', timeout)
+ stale = manager.get_stuck_pgs('stale', timeout)
+ log.info('checking stuck pg counts')
+ log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
+ len(inactive), num_inactive,
+ len(unclean), num_unclean,
+ len(stale), num_stale)
+ assert len(inactive) == num_inactive
+ assert len(unclean) == num_unclean
+ assert len(stale) == num_stale
+
+ # check health output as well
+ health = manager.raw_cluster_cmd('health')
+ log.debug('ceph health is: %s', health)
+ if num_inactive > 0:
+ m = re.search('(\d+) pgs stuck inactive', health)
+ assert int(m.group(1)) == num_inactive
+ if num_unclean > 0:
+ m = re.search('(\d+) pgs stuck unclean', health)
+ assert int(m.group(1)) == num_unclean
+ if num_stale > 0:
+ m = re.search('(\d+) pgs stuck stale', health)
+ assert int(m.group(1)) == num_stale
+
+def task(ctx, config):
+ """
+ Test the dump_stuck command.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ assert config is None, \
+ 'dump_stuck requires no configuration'
+ assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
+ 'dump_stuck requires exactly 2 osds'
+
+ timeout = 60
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.wait_for_clean(timeout)
+
+ manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
+# '--mon-osd-report-timeout 90',
+ '--mon-pg-stuck-threshold 10')
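+ # Lower the stuck threshold so PGs are reported as stuck after only 10
+ # seconds, keeping the test's wait times short.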
+
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=0,
+ num_stale=0,
+ )
+ num_pgs = manager.get_num_pgs()
+
+ manager.mark_out_osd(0)
+ time.sleep(timeout)
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.wait_for_recovery(timeout)
+
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=num_pgs,
+ num_stale=0,
+ )
+
+ manager.mark_in_osd(0)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.wait_for_clean(timeout)
+
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=0,
+ num_stale=0,
+ )
+
+ log.info('stopping first osd')
+ manager.kill_osd(0)
+ manager.mark_down_osd(0)
+
+ log.info('waiting for all to be unclean')
+ starttime = time.time()
+ done = False
+ while not done:
+ try:
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=num_pgs,
+ num_stale=0,
+ )
+ done = True
+ except AssertionError:
+            # wait up to 15 minutes to become unclean
+ if time.time() - starttime > 900:
+ raise
+
+
+ log.info('stopping second osd')
+ manager.kill_osd(1)
+ manager.mark_down_osd(1)
+
+ log.info('waiting for all to be stale')
+ starttime = time.time()
+ done = False
+ while not done:
+ try:
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=num_pgs,
+ num_stale=num_pgs,
+ )
+ done = True
+ except AssertionError:
+ # wait up to 15 minutes to become stale
+ if time.time() - starttime > 900:
+ raise
+
+ log.info('reviving')
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
+ manager.revive_osd(id_)
+ manager.mark_in_osd(id_)
+ while True:
+ try:
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ break
+ except Exception:
+ log.exception('osds must not be started yet, waiting...')
+ time.sleep(1)
+ manager.wait_for_clean(timeout)
+
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=0,
+ num_stale=0,
+ )
--- /dev/null
+"""
+Lost_unfound
+"""
+from teuthology.orchestra import run
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+from util.rados import rados
+import time
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Test handling of lost objects on an ec pool.
+
+    A pretty rigid cluster is brought up and tested by this task.
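+
+    An illustrative suite fragment (assuming this module is wired up as an
+    ``ec_lost_unfound`` task) might be::
+
+        tasks:
+        - ceph:
+        - ec_lost_unfound:
+            erasure_code_profile:
+              name: lost_unfound
+              k: '2'
+              m: '2'
+              ruleset-failure-domain: osd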
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'lost_unfound task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ manager.wait_for_clean()
+
+ profile = config.get('erasure_code_profile', {
+ 'k': '2',
+ 'm': '2',
+ 'ruleset-failure-domain': 'osd'
+ })
+ profile_name = profile.get('name', 'lost_unfound')
+ manager.create_erasure_code_profile(profile_name, profile)
+ pool = manager.create_pool_with_unique_name(
+ erasure_code_profile_name=profile_name,
+ min_size=2)
+
+ # something that is always there, readable and never empty
+ dummyfile = '/etc/group'
+
+ # kludge to make sure they get a map
+ rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.wait_for_recovery()
+
+ # create old objects
+ for f in range(1, 10):
+ rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f])
+
+ # delay recovery, and make the pg log very long (to prevent backfill)
+ manager.raw_cluster_cmd(
+ 'tell', 'osd.1',
+ 'injectargs',
+ '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+ )
+
+ manager.kill_osd(0)
+ manager.mark_down_osd(0)
+ manager.kill_osd(3)
+ manager.mark_down_osd(3)
+
+ for f in range(1, 10):
+ rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
+
+ # take out osd.1 and a necessary shard of those objects.
+ manager.kill_osd(1)
+ manager.mark_down_osd(1)
+ manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+ manager.revive_osd(0)
+ manager.wait_till_osd_is_up(0)
+ manager.revive_osd(3)
+ manager.wait_till_osd_is_up(3)
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.wait_till_active()
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+
+ # verify that there are unfound objects
+ unfound = manager.get_num_unfound_objects()
+ log.info("there are %d unfound objects" % unfound)
+ assert unfound
+
+ testdir = teuthology.get_testdir(ctx)
+ procs = []
+ if config.get('parallel_bench', True):
+ procs.append(mon.run(
+ args=[
+ "/bin/sh", "-c",
+ " ".join(['adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage',
+ 'rados',
+ '--no-log-to-stderr',
+ '--name', 'client.admin',
+ '-b', str(4<<10),
+ '-p' , pool,
+ '-t', '20',
+ 'bench', '240', 'write',
+ ]).format(tdir=testdir),
+ ],
+ logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
+ stdin=run.PIPE,
+ wait=False
+ ))
+ time.sleep(10)
+
+ # mark stuff lost
+ pgs = manager.get_pg_stats()
+ for pg in pgs:
+ if pg['stat_sum']['num_objects_unfound'] > 0:
+ # verify that i can list them direct from the osd
+ log.info('listing missing/lost in %s state %s', pg['pgid'],
+ pg['state']);
+ m = manager.list_pg_missing(pg['pgid'])
+ log.info('%s' % m)
+ assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+
+            log.info("deleting unfound in %s", pg['pgid'])
+ manager.raw_cluster_cmd('pg', pg['pgid'],
+ 'mark_unfound_lost', 'delete')
+ else:
+ log.info("no unfound in %s", pg['pgid'])
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+ manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.wait_for_recovery()
+
+ if not config.get('parallel_bench', True):
+ time.sleep(20)
+
+ # verify result
+ for f in range(1, 10):
+ err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-'])
+ assert err
+ err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-'])
+ assert err
+ err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-'])
+ assert err
+
+ # see if osd.1 can cope
+ manager.revive_osd(1)
+ manager.wait_till_osd_is_up(1)
+ manager.wait_for_clean()
+ run.wait(procs)
--- /dev/null
+"""
+Filestore/filejournal handler
+"""
+import logging
+from teuthology.orchestra import run
+import random
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Test filestore/filejournal handling of non-idempotent events.
+
+    Currently this is a kludge; we require that the ceph task precedes us just
+ so that we get the tarball installed to run the test binary.
+
+ :param ctx: Context
+ :param config: Configuration
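+
+    An illustrative suite fragment (assuming this module is wired up as the
+    ``filestore_idempotent`` task) might be::
+
+        tasks:
+        - install:
+        - ceph:
+        - filestore_idempotent: [client.0]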
+ """
+ assert config is None or isinstance(config, list) \
+ or isinstance(config, dict), \
+ "task only supports a list or dictionary for configuration"
+ all_clients = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ if config is None:
+ config = all_clients
+ if isinstance(config, list):
+ config = dict.fromkeys(config)
+ clients = config.keys()
+
+ # just use the first client...
+    client = clients[0]
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+
+ testdir = teuthology.get_testdir(ctx)
+
+ dir = '%s/ceph.data/test.%s' % (testdir, client)
+
+ seed = str(int(random.uniform(1,100)))
+
+ try:
+ log.info('creating a working dir')
+ remote.run(args=['mkdir', dir])
+ remote.run(
+ args=[
+ 'cd', dir,
+ run.Raw('&&'),
+ 'wget','-q', '-Orun_seed_to.sh',
+ 'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to.sh;hb=HEAD',
+ run.Raw('&&'),
+ 'wget','-q', '-Orun_seed_to_range.sh',
+ 'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to_range.sh;hb=HEAD',
+ run.Raw('&&'),
+ 'chmod', '+x', 'run_seed_to.sh', 'run_seed_to_range.sh',
+ ]);
+
+ log.info('running a series of tests')
+ proc = remote.run(
+ args=[
+ 'cd', dir,
+ run.Raw('&&'),
+ './run_seed_to_range.sh', seed, '50', '300',
+ ],
+ wait=False,
+ check_status=False)
+ result = proc.wait()
+
+ if result != 0:
+ remote.run(
+ args=[
+ 'cp', '-a', dir, '{tdir}/archive/idempotent_failure'.format(tdir=testdir),
+ ])
+ raise Exception("./run_seed_to_range.sh errored out")
+
+ finally:
+ remote.run(args=[
+ 'rm', '-rf', '--', dir
+ ])
+
--- /dev/null
+"""
+Mount/unmount a ``kernel`` client.
+"""
+import contextlib
+import logging
+
+from teuthology.misc import deep_merge
+from teuthology import misc
+from cephfs.kernel_mount import KernelMount
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Mount/unmount a ``kernel`` client.
+
+ The config is optional and defaults to mounting on all clients. If
+ a config is given, it is expected to be a list of clients to do
+ this operation on. This lets you e.g. set up one client with
+ ``ceph-fuse`` and another with ``kclient``.
+
+ Example that mounts all clients::
+
+ tasks:
+ - ceph:
+ - kclient:
+ - interactive:
+
+    Example that uses both ``kclient`` and ``ceph-fuse``::
+
+ tasks:
+ - ceph:
+ - ceph-fuse: [client.0]
+ - kclient: [client.1]
+ - interactive:
+
+
+    Pass a dictionary instead of a list to specify per-client config::
+
+        tasks:
+        - kclient:
+            client.0:
+              debug: true
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ log.info('Mounting kernel clients...')
+ assert config is None or isinstance(config, list) or isinstance(config, dict), \
+ "task kclient got invalid config"
+
+ if config is None:
+ config = ['client.{id}'.format(id=id_)
+ for id_ in misc.all_roles_of_type(ctx.cluster, 'client')]
+
+ if isinstance(config, list):
+ client_roles = config
+ config = dict([r, dict()] for r in client_roles)
+ elif isinstance(config, dict):
+ client_roles = filter(lambda x: 'client.' in x, config.keys())
+ else:
+ raise ValueError("Invalid config object: {0} ({1})".format(config, config.__class__))
+
+ # config has been converted to a dict by this point
+ overrides = ctx.config.get('overrides', {})
+ deep_merge(config, overrides.get('kclient', {}))
+
+ clients = list(misc.get_clients(ctx=ctx, roles=client_roles))
+
+ test_dir = misc.get_testdir(ctx)
+
+ # Assemble mon addresses
+ remotes_and_roles = ctx.cluster.remotes.items()
+ roles = [roles for (remote_, roles) in remotes_and_roles]
+ ips = [remote_.ssh.get_transport().getpeername()[0]
+ for (remote_, _) in remotes_and_roles]
+ mons = misc.get_mons(roles, ips).values()
+
+ mounts = {}
+ for id_, remote in clients:
+ client_config = config.get("client.%s" % id_)
+ if client_config is None:
+ client_config = {}
+
+ if config.get("disabled", False) or not client_config.get('mounted', True):
+ continue
+
+ kernel_mount = KernelMount(
+ mons,
+ test_dir,
+ id_,
+ remote,
+ ctx.teuthology_config.get('ipmi_user', None),
+ ctx.teuthology_config.get('ipmi_password', None),
+ ctx.teuthology_config.get('ipmi_domain', None)
+ )
+
+ mounts[id_] = kernel_mount
+
+ if client_config.get('debug', False):
+ remote.run(args=["sudo", "bash", "-c", "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control"])
+ remote.run(args=["sudo", "bash", "-c", "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control"])
+
+ kernel_mount.mount()
+
+ ctx.mounts = mounts
+ try:
+ yield mounts
+ finally:
+ log.info('Unmounting kernel clients...')
+ for mount in mounts.values():
+ if mount.is_mounted():
+ mount.umount()
--- /dev/null
+"""
+locktests
+"""
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Run locktests, from the xfstests suite, on the given
+ clients. Whether the clients are ceph-fuse or kernel does not
+ matter, and the two clients can refer to the same mount.
+
+ The config is a list of two clients to run the locktest on. The
+ first client will be the host.
+
+ For example:
+ tasks:
+ - ceph:
+ - ceph-fuse: [client.0, client.1]
+ - locktest:
+ [client.0, client.1]
+
+ This task does not yield; there would be little point.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+
+ assert isinstance(config, list)
+ log.info('fetching and building locktests...')
+ (host,) = ctx.cluster.only(config[0]).remotes
+ (client,) = ctx.cluster.only(config[1]).remotes
+ ( _, _, host_id) = config[0].partition('.')
+ ( _, _, client_id) = config[1].partition('.')
+ testdir = teuthology.get_testdir(ctx)
+ hostmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=host_id)
+ clientmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=client_id)
+
+ try:
+ for client_name in config:
+ log.info('building on {client_}'.format(client_=client_name))
+ ctx.cluster.only(client_name).run(
+ args=[
+                    # explicitly does not support multiple locktest tasks
+                    # in a single run; the result archival would conflict
+ 'mkdir', '{tdir}/archive/locktest'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'mkdir', '{tdir}/locktest'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'wget',
+ '-nv',
+ 'https://raw.github.com/gregsfortytwo/xfstests-ceph/master/src/locktest.c',
+ '-O', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'g++', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+ '-o', '{tdir}/locktest/locktest'.format(tdir=testdir)
+ ],
+ logger=log.getChild('locktest_client.{id}'.format(id=client_name)),
+ )
+
+ log.info('built locktest on each client')
+
+ host.run(args=['sudo', 'touch',
+ '{mnt}/locktestfile'.format(mnt=hostmnt),
+ run.Raw('&&'),
+ 'sudo', 'chown', 'ubuntu.ubuntu',
+ '{mnt}/locktestfile'.format(mnt=hostmnt)
+ ]
+ )
+
+ log.info('starting on host')
+ hostproc = host.run(
+ args=[
+ '{tdir}/locktest/locktest'.format(tdir=testdir),
+ '-p', '6788',
+ '-d',
+ '{mnt}/locktestfile'.format(mnt=hostmnt),
+ ],
+ wait=False,
+ logger=log.getChild('locktest.host'),
+ )
+ log.info('starting on client')
+ (_,_,hostaddr) = host.name.partition('@')
+ clientproc = client.run(
+ args=[
+ '{tdir}/locktest/locktest'.format(tdir=testdir),
+ '-p', '6788',
+ '-d',
+ '-h', hostaddr,
+ '{mnt}/locktestfile'.format(mnt=clientmnt),
+ ],
+ logger=log.getChild('locktest.client'),
+ wait=False
+ )
+
+ hostresult = hostproc.wait()
+ clientresult = clientproc.wait()
+ if (hostresult != 0) or (clientresult != 0):
+ raise Exception("Did not pass locking test!")
+ log.info('finished locktest executable with results {r} and {s}'. \
+ format(r=hostresult, s=clientresult))
+
+ finally:
+ log.info('cleaning up host dir')
+ host.run(
+ args=[
+ 'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir),
+ run.Raw('&&'),
+                'rmdir', '{tdir}/locktest'.format(tdir=testdir)
+ ],
+ logger=log.getChild('.{id}'.format(id=config[0])),
+ )
+ log.info('cleaning up client dir')
+ client.run(
+ args=[
+ 'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'rmdir', '{tdir}/locktest'.format(tdir=testdir)
+ ],
+ logger=log.getChild('.{id}'.format(\
+ id=config[1])),
+ )
--- /dev/null
+/var/log/ceph/*{daemon_type}*.log {{
+ rotate 100
+ size {max_size}
+ compress
+ sharedscripts
+ postrotate
+ killall {daemon_type} -1 || true
+ endscript
+ missingok
+ notifempty
+ su root root
+}}
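+
+# Illustrative rendering: with daemon_type=ceph-osd and max_size=10G, the
+# stanza above would match /var/log/ceph/*ceph-osd*.log and rotate at 10G.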
+
--- /dev/null
+"""
+Lost_unfound
+"""
+import logging
+import time
+import ceph_manager
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Test handling of lost objects.
+
+    A pretty rigid cluster is brought up and tested by this task.
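+
+    An illustrative suite fragment (assuming this module is wired up as the
+    ``lost_unfound`` task) might be::
+
+        tasks:
+        - ceph:
+        - lost_unfound:
+            parallel_bench: true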
+ """
+ POOL = 'unfound_pool'
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'lost_unfound task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+
+ manager.wait_for_clean()
+
+ manager.create_pool(POOL)
+
+ # something that is always there
+ dummyfile = '/etc/fstab'
+
+ # take an osd out until the very end
+ manager.kill_osd(2)
+ manager.mark_down_osd(2)
+ manager.mark_out_osd(2)
+
+ # kludge to make sure they get a map
+ rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.wait_for_recovery()
+
+ # create old objects
+ for f in range(1, 10):
+ rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])
+
+ # delay recovery, and make the pg log very long (to prevent backfill)
+ manager.raw_cluster_cmd(
+ 'tell', 'osd.1',
+ 'injectargs',
+ '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+ )
+
+ manager.kill_osd(0)
+ manager.mark_down_osd(0)
+
+ for f in range(1, 10):
+ rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+
+ # bring osd.0 back up, let it peer, but don't replicate the new
+ # objects...
+    log.info('osd.0 command_args: %s',
+             ctx.daemons.get_daemon('osd', 0).command_args)
+ ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
+ '--osd-recovery-delay-start', '1000'
+ ])
+ manager.revive_osd(0)
+ manager.mark_in_osd(0)
+ manager.wait_till_osd_is_up(0)
+
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.wait_till_active()
+
+ # take out osd.1 and the only copy of those objects.
+ manager.kill_osd(1)
+ manager.mark_down_osd(1)
+ manager.mark_out_osd(1)
+ manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+
+    # bring up osd.2 so that things would otherwise, in theory, recover fully
+ manager.revive_osd(2)
+ manager.mark_in_osd(2)
+ manager.wait_till_osd_is_up(2)
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_till_active()
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+
+ # verify that there are unfound objects
+ unfound = manager.get_num_unfound_objects()
+ log.info("there are %d unfound objects" % unfound)
+ assert unfound
+
+ testdir = teuthology.get_testdir(ctx)
+ procs = []
+ if config.get('parallel_bench', True):
+ procs.append(mon.run(
+ args=[
+ "/bin/sh", "-c",
+ " ".join(['adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage',
+ 'rados',
+ '--no-log-to-stderr',
+ '--name', 'client.admin',
+ '-b', str(4<<10),
+ '-p' , POOL,
+ '-t', '20',
+ 'bench', '240', 'write',
+ ]).format(tdir=testdir),
+ ],
+ logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
+ stdin=run.PIPE,
+ wait=False
+ ))
+ time.sleep(10)
+
+ # mark stuff lost
+ pgs = manager.get_pg_stats()
+ for pg in pgs:
+ if pg['stat_sum']['num_objects_unfound'] > 0:
+ primary = 'osd.%d' % pg['acting'][0]
+
+ # verify that i can list them direct from the osd
+ log.info('listing missing/lost in %s state %s', pg['pgid'],
+ pg['state']);
+ m = manager.list_pg_missing(pg['pgid'])
+ #log.info('%s' % m)
+ assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+            num_unfound = 0
+ for o in m['objects']:
+ if len(o['locations']) == 0:
+ num_unfound += 1
+ assert m['num_unfound'] == num_unfound
+
+ log.info("reverting unfound in %s on %s", pg['pgid'], primary)
+ manager.raw_cluster_cmd('pg', pg['pgid'],
+ 'mark_unfound_lost', 'revert')
+ else:
+ log.info("no unfound in %s", pg['pgid'])
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_recovery()
+
+ # verify result
+ for f in range(1, 10):
+ err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
+ assert err
+ err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
+ assert err
+ err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
+ assert not err
+
+ # see if osd.1 can cope
+ manager.revive_osd(1)
+ manager.mark_in_osd(1)
+ manager.wait_till_osd_is_up(1)
+ manager.wait_for_clean()
+ run.wait(procs)
--- /dev/null
+"""
+Force pg creation on all osds
+"""
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+import logging
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Create the specified number of pools and write 16 objects to them (thereby forcing
+ the PG creation on each OSD). This task creates pools from all the clients,
+ in parallel. It is easy to add other daemon types which have the appropriate
+ permissions, but I don't think anything else does.
+ The config is just the number of pools to create. I recommend setting
+    "mon pg create interval" to a very low value in your ceph config to speed
+ this up.
+
+ You probably want to do this to look at memory consumption, and
+ maybe to test how performance changes with the number of PGs. For example:
+
+ tasks:
+ - ceph:
+ config:
+ mon:
+            mon pg create interval: 1
+ - manypools: 3000
+ - radosbench:
+ clients: [client.0]
+ time: 360
+ """
+
+ log.info('creating {n} pools'.format(n=config))
+
+ poolnum = int(config)
+ creator_remotes = []
+ client_roles = teuthology.all_roles_of_type(ctx.cluster, 'client')
+ log.info('got client_roles={client_roles_}'.format(client_roles_=client_roles))
+ for role in client_roles:
+ log.info('role={role_}'.format(role_=role))
+ (creator_remote, ) = ctx.cluster.only('client.{id}'.format(id=role)).remotes.iterkeys()
+ creator_remotes.append((creator_remote, 'client.{id}'.format(id=role)))
+
+ remaining_pools = poolnum
+ poolprocs=dict()
+ while (remaining_pools > 0):
+ log.info('{n} pools remaining to create'.format(n=remaining_pools))
+ for remote, role_ in creator_remotes:
+ poolnum = remaining_pools
+ remaining_pools -= 1
+ if remaining_pools < 0:
+ continue
+ log.info('creating pool{num} on {role}'.format(num=poolnum, role=role_))
+ proc = remote.run(
+ args=[
+ 'rados',
+ '--name', role_,
+ 'mkpool', 'pool{num}'.format(num=poolnum), '-1',
+ run.Raw('&&'),
+ 'rados',
+ '--name', role_,
+ '--pool', 'pool{num}'.format(num=poolnum),
+ 'bench', '0', 'write', '-t', '16', '--block-size', '1'
+ ],
+ wait = False
+ )
+ log.info('waiting for pool and object creates')
+ poolprocs[remote] = proc
+
+ run.wait(poolprocs.itervalues())
+
+    log.info('created all {n} pools and wrote 16 objects to each'.format(n=int(config)))
--- /dev/null
+
+import logging
+import contextlib
+import time
+import ceph_manager
+from teuthology import misc
+from teuthology.orchestra.run import CommandFailedError, Raw
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Go through filesystem creation with a synthetic failure in an MDS
+ in its 'up:creating' state, to exercise the retry behaviour.
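+
+    An illustrative suite fragment (assuming this module is wired up as the
+    ``mds_creation_failure`` task and the role layout provides exactly one
+    MDS) might be::
+
+        tasks:
+        - ceph:
+        - mds_creation_failure: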
+ """
+ # Grab handles to the teuthology objects of interest
+ mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+ if len(mdslist) != 1:
+ # Require exactly one MDS, the code path for creation failure when
+ # a standby is available is different
+ raise RuntimeError("This task requires exactly one MDS")
+
+ mds_id = mdslist[0]
+ (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.iterkeys()
+ manager = ceph_manager.CephManager(
+ mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
+ )
+
+ # Stop MDS
+ manager.raw_cluster_cmd('mds', 'set', "max_mds", "0")
+ mds = ctx.daemons.get_daemon('mds', mds_id)
+ mds.stop()
+ manager.raw_cluster_cmd('mds', 'fail', mds_id)
+
+ # Reset the filesystem so that next start will go into CREATING
+ manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
+ manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
+
+ # Start the MDS with mds_kill_create_at set, it will crash during creation
+ mds.restart_with_args(["--mds_kill_create_at=1"])
+ try:
+ mds.wait_for_exit()
+ except CommandFailedError as e:
+ if e.exitstatus == 1:
+ log.info("MDS creation killed as expected")
+ else:
+ log.error("Unexpected status code %s" % e.exitstatus)
+ raise
+
+ # Since I have intentionally caused a crash, I will clean up the resulting core
+ # file to avoid task.internal.coredump seeing it as a failure.
+ log.info("Removing core file from synthetic MDS failure")
+ mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])
+
+ # It should have left the MDS map state still in CREATING
+ status = manager.get_mds_status(mds_id)
+ assert status['state'] == 'up:creating'
+
+ # Start the MDS again without the kill flag set, it should proceed with creation successfully
+ mds.restart()
+
+ # Wait for state ACTIVE
+ t = 0
+ create_timeout = 120
+ while True:
+ status = manager.get_mds_status(mds_id)
+ if status['state'] == 'up:active':
+ log.info("MDS creation completed successfully")
+ break
+ elif status['state'] == 'up:creating':
+ log.info("MDS still in creating state")
+ if t > create_timeout:
+ log.error("Creating did not complete within %ss" % create_timeout)
+ raise RuntimeError("Creating did not complete within %ss" % create_timeout)
+ t += 1
+ time.sleep(1)
+ else:
+ log.error("Unexpected MDS state: %s" % status['state'])
+ assert(status['state'] in ['up:active', 'up:creating'])
+
+ # The system should be back up in a happy healthy state, go ahead and run any further tasks
+ # inside this context.
+ yield
--- /dev/null
+"""
+Thrash mds by simulating failures
+"""
+import logging
+import contextlib
+import ceph_manager
+import random
+import time
+
+from gevent.greenlet import Greenlet
+from gevent.event import Event
+from teuthology import misc as teuthology
+
+from tasks.cephfs.filesystem import MDSCluster, Filesystem
+
+log = logging.getLogger(__name__)
+
+
+class MDSThrasher(Greenlet):
+ """
+ MDSThrasher::
+
+ The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).
+
+    The config is optional. Many of the config parameters are a maximum value
+    to use when selecting a random value from a range. To always use the maximum
+    value, set randomize to false. The config is a dict containing some or all of:
+
+ max_thrash: [default: 1] the maximum number of active MDSs per FS that will be thrashed at
+ any given time.
+
+ max_thrash_delay: [default: 30] maximum number of seconds to delay before
+ thrashing again.
+
+ max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
+ the replay state before thrashing.
+
+ max_revive_delay: [default: 10] maximum number of seconds to delay before
+ bringing back a thrashed MDS.
+
+    randomize: [default: true] enables randomization of delays; when false, the
+      maximum values are always used.
+
+ seed: [no default] seed the random number generator
+
+ thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
+ during replay. Value should be between 0.0 and 1.0.
+
+ thrash_max_mds: [default: 0.25] likelihood that the max_mds of the mds
+ cluster will be modified to a value [1, current) or (current, starting
+ max_mds]. When reduced, randomly selected MDSs other than rank 0 will be
+ deactivated to reach the new max_mds. Value should be between 0.0 and 1.0.
+
+ thrash_weights: allows specific MDSs to be thrashed more/less frequently.
+ This option overrides anything specified by max_thrash. This option is a
+ dict containing mds.x: weight pairs. For example, [mds.a: 0.7, mds.b:
+ 0.3, mds.c: 0.0]. Each weight is a value from 0.0 to 1.0. Any MDSs not
+ specified will be automatically given a weight of 0.0 (not thrashed).
+      For a given MDS, by default the thrasher delays for up to
+      max_thrash_delay, thrashes, waits for the MDS to recover, and iterates.
+ If a non-zero weight is specified for an MDS, for each iteration the
+ thrasher chooses whether to thrash during that iteration based on a
+ random value [0-1] not exceeding the weight of that MDS.
+
+ Examples::
+
+
+ The following example sets the likelihood that mds.a will be thrashed
+ to 80%, mds.b to 20%, and other MDSs will not be thrashed. It also sets the
+ likelihood that an MDS will be thrashed in replay to 40%.
+ Thrash weights do not have to sum to 1.
+
+ tasks:
+ - ceph:
+ - mds_thrash:
+ thrash_weights:
+ - mds.a: 0.8
+ - mds.b: 0.2
+ thrash_in_replay: 0.4
+ - ceph-fuse:
+ - workunit:
+ clients:
+ all: [suites/fsx.sh]
+
+ The following example disables randomization, and uses the max delay values:
+
+ tasks:
+ - ceph:
+ - mds_thrash:
+ max_thrash_delay: 10
+ max_revive_delay: 1
+ max_replay_thrash_delay: 4
+
+ """
+
+ def __init__(self, ctx, manager, config, logger, fs, max_mds):
+ super(MDSThrasher, self).__init__()
+
+ self.ctx = ctx
+ self.manager = manager
+ assert self.manager.is_clean()
+ self.config = config
+ self.logger = logger
+ self.fs = fs
+ self.max_mds = max_mds
+
+ self.stopping = Event()
+
+ self.randomize = bool(self.config.get('randomize', True))
+ self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.25))
+ self.max_thrash = int(self.config.get('max_thrash', 1))
+        self.max_thrash_delay = float(self.config.get('max_thrash_delay', 30.0))
+        self.thrash_in_replay = float(self.config.get('thrash_in_replay', 0.0))
+ assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
+ v=self.thrash_in_replay)
+ self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))
+ self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))
+
+ def _run(self):
+ try:
+ self.do_thrash()
+ except:
+ # Log exceptions here so we get the full backtrace (it's lost
+ # by the time someone does a .get() on this greenlet)
+ self.logger.exception("Exception in do_thrash:")
+ raise
+
+ def log(self, x):
+ """Write data to logger assigned to this MDThrasher"""
+ self.logger.info(x)
+
+ def stop(self):
+ self.stopping.set()
+
+ def kill_mds(self, mds):
+ if self.config.get('powercycle'):
+ (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
+ remotes.iterkeys())
+ self.log('kill_mds on mds.{m} doing powercycle of {s}'.
+ format(m=mds, s=remote.name))
+ self._assert_ipmi(remote)
+ remote.console.power_off()
+ else:
+ self.ctx.daemons.get_daemon('mds', mds).stop()
+
+ @staticmethod
+ def _assert_ipmi(remote):
+ assert remote.console.has_ipmi_credentials, (
+ "powercycling requested but RemoteConsole is not "
+ "initialized. Check ipmi config.")
+
+ def revive_mds(self, mds, standby_for_rank=None):
+ """
+        Revive mds -- do an ipmi powercycle (if indicated by the config)
+        and then restart (using --hot-standby if specified).
+ """
+ if self.config.get('powercycle'):
+ (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
+ remotes.iterkeys())
+ self.log('revive_mds on mds.{m} doing powercycle of {s}'.
+ format(m=mds, s=remote.name))
+ self._assert_ipmi(remote)
+ remote.console.power_on()
+ self.manager.make_admin_daemon_dir(self.ctx, remote)
+ args = []
+ if standby_for_rank:
+ args.extend(['--hot-standby', standby_for_rank])
+ self.ctx.daemons.get_daemon('mds', mds).restart(*args)
+
+ def wait_for_stable(self, rank = None, gid = None):
+ self.log('waiting for mds cluster to stabilize...')
+ status = self.fs.status()
+ itercount = 0
+ while True:
+ max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
+ if rank is not None:
+ try:
+ info = status.get_rank(self.fs.id, rank)
+ if info['gid'] != gid:
+ self.log('mds.{name} has gained rank={rank}, replacing gid={gid}'.format(name = info['name'], rank = rank, gid = gid))
+ return status, info['name']
+ except:
+ pass # no rank present
+ else:
+ ranks = filter(lambda info: "up:active" == info['state'] and "laggy_since" not in info, list(status.get_ranks(self.fs.id)))
+ count = len(ranks)
+ if count >= max_mds:
+ self.log('mds cluster has {count} alive and active, now stable!'.format(count = count))
+ return status, None
+ itercount = itercount + 1
+ if itercount > 10:
+ self.log('mds map: {status}'.format(status=self.fs.status()))
+ time.sleep(2)
+ status = self.fs.status()
+
+ def do_thrash(self):
+ """
+ Perform the random thrashing action
+ """
+
+ self.log('starting mds_do_thrash for fs {fs}'.format(fs = self.fs.name))
+ stats = {
+ "max_mds": 0,
+ "deactivate": 0,
+ "kill": 0,
+ }
+
+ while not self.stopping.is_set():
+ delay = self.max_thrash_delay
+ if self.randomize:
+                delay = random.uniform(0, self.max_thrash_delay)
+
+ if delay > 0.0:
+ self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
+ self.stopping.wait(delay)
+ if self.stopping.is_set():
+ continue
+
+ status = self.fs.status()
+
+            if random.random() <= self.thrash_max_mds:
+ max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
+ options = range(1, max_mds)+range(max_mds+1, self.max_mds+1)
+ if len(options) > 0:
+ sample = random.sample(options, 1)
+ new_max_mds = sample[0]
+ self.log('thrashing max_mds: %d -> %d' % (max_mds, new_max_mds))
+ self.fs.set_max_mds(new_max_mds)
+ stats['max_mds'] += 1
+
+ # Now randomly deactivate mds if we shrank
+ for rank in random.sample(range(1, max_mds), max(0, max_mds-new_max_mds)):
+ self.fs.deactivate(rank)
+ stats['deactivate'] += 1
+
+ status = self.wait_for_stable()[0]
+
+ count = 0
+ for info in status.get_ranks(self.fs.id):
+ name = info['name']
+ label = 'mds.' + name
+ rank = info['rank']
+ gid = info['gid']
+
+ # if thrash_weights isn't specified and we've reached max_thrash,
+ # we're done
+ count = count + 1
+ if 'thrash_weights' not in self.config and count > self.max_thrash:
+ break
+
+ weight = 1.0
+ if 'thrash_weights' in self.config:
+                    weight = float(self.config['thrash_weights'].get(label, 0.0))
+                skip = random.random()
+                if weight <= skip:
+                    self.log('skipping thrash iteration with skip ({skip}) >= weight ({weight})'.format(skip=skip, weight=weight))
+ continue
+
+ self.log('kill {label} (rank={rank})'.format(label=label, rank=rank))
+ self.kill_mds(name)
+ stats['kill'] += 1
+
+ # wait for mon to report killed mds as crashed
+ last_laggy_since = None
+ itercount = 0
+ while True:
+ status = self.fs.status()
+ info = status.get_mds(name)
+ if not info:
+ break
+ if 'laggy_since' in info:
+ last_laggy_since = info['laggy_since']
+ break
+ if any([(f == name) for f in status.get_fsmap(self.fs.id)['mdsmap']['failed']]):
+ break
+ self.log(
+ 'waiting till mds map indicates {label} is laggy/crashed, in failed state, or {label} is removed from mdsmap'.format(
+ label=label))
+ itercount = itercount + 1
+ if itercount > 10:
+ self.log('mds map: {status}'.format(status=status))
+ time.sleep(2)
+
+ if last_laggy_since:
+ self.log(
+ '{label} reported laggy/crashed since: {since}'.format(label=label, since=last_laggy_since))
+ else:
+                    self.log('{label} down, removed from mdsmap'.format(label=label))
+
+ # wait for a standby mds to takeover and become active
+ status, takeover_mds = self.wait_for_stable(rank, gid)
+ self.log('New active mds is mds.{_id}'.format(_id=takeover_mds))
+
+ # wait for a while before restarting old active to become new
+ # standby
+ delay = self.max_revive_delay
+ if self.randomize:
+                    delay = random.uniform(0, self.max_revive_delay)
+
+ self.log('waiting for {delay} secs before reviving {label}'.format(
+ delay=delay, label=label))
+ time.sleep(delay)
+
+ self.log('reviving {label}'.format(label=label))
+ self.revive_mds(name)
+
+ while True:
+ status = self.fs.status()
+ info = status.get_mds(name)
+ if info and info['state'] in ('up:standby', 'up:standby-replay'):
+ self.log('{label} reported in {state} state'.format(label=label, state=info['state']))
+ break
+ self.log(
+ 'waiting till mds map indicates {label} is in standby or standby-replay'.format(label=label))
+ time.sleep(2)
+
+ for stat in stats:
+ self.log("stat['{key}'] = {value}".format(key = stat, value = stats[stat]))
+
+ # don't do replay thrashing right now
+# for info in status.get_replays(self.fs.id):
+# # this might race with replay -> active transition...
+# if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay:
+# delay = self.max_replay_thrash_delay
+# if self.randomize:
+# delay = random.randrange(0.0, self.max_replay_thrash_delay)
+# time.sleep(delay)
+# self.log('kill replaying mds.{id}'.format(id=self.to_kill))
+# self.kill_mds(self.to_kill)
+#
+# delay = self.max_revive_delay
+# if self.randomize:
+# delay = random.randrange(0.0, self.max_revive_delay)
+#
+# self.log('waiting for {delay} secs before reviving mds.{id}'.format(
+# delay=delay, id=self.to_kill))
+# time.sleep(delay)
+#
+# self.log('revive mds.{id}'.format(id=self.to_kill))
+# self.revive_mds(self.to_kill)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Stress test the mds by thrashing while another task/workunit
+ is running.
+
+ Please refer to MDSThrasher class for further information on the
+ available options.
+ """
+
+ mds_cluster = MDSCluster(ctx)
+
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'mds_thrash task only accepts a dict for configuration'
+ mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
+ assert len(mdslist) > 1, \
+ 'mds_thrash task requires at least 2 metadata servers'
+
+ # choose random seed
+ if 'seed' in config:
+ seed = int(config['seed'])
+ else:
+ seed = int(time.time())
+ log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
+ random.seed(seed)
+
+ (first,) = ctx.cluster.only('mds.{_id}'.format(_id=mdslist[0])).remotes.iterkeys()
+ manager = ceph_manager.CephManager(
+ first, ctx=ctx, logger=log.getChild('ceph_manager'),
+ )
+
+ # make sure everyone is in active, standby, or standby-replay
+ log.info('Wait for all MDSs to reach steady state...')
+ status = mds_cluster.status()
+ while True:
+ steady = True
+ for info in status.get_all():
+ state = info['state']
+ if state not in ('up:active', 'up:standby', 'up:standby-replay'):
+ steady = False
+ break
+ if steady:
+ break
+ time.sleep(2)
+ status = mds_cluster.status()
+ log.info('Ready to start thrashing')
+
+ manager.wait_for_clean()
+ thrashers = {}
+ for fs in status.get_filesystems():
+ name = fs['mdsmap']['fs_name']
+ log.info('Running thrasher against FS {f}'.format(f = name))
+ thrasher = MDSThrasher(
+ ctx, manager, config,
+ log.getChild('fs.[{f}]'.format(f = name)),
+ Filesystem(ctx, fs['id']), fs['mdsmap']['max_mds']
+ )
+ thrasher.start()
+ thrashers[name] = thrasher
+
+ try:
+ log.debug('Yielding')
+ yield
+ finally:
+ log.info('joining mds_thrashers')
+ for name in thrashers:
+ log.info('join thrasher mds_thrasher.fs.[{f}]'.format(f=name))
+ thrashers[name].stop()
+ thrashers[name].get() # Raise any exception from _run()
+ thrashers[name].join()
+ log.info('done joining')
--- /dev/null
+instance-id: test
+local-hostname: test
--- /dev/null
+
+from unittest import case
+import json
+
+from teuthology import misc
+from tasks.ceph_test_case import CephTestCase
+
+# TODO move definition of CephCluster
+from tasks.cephfs.filesystem import CephCluster
+
+
+class MgrCluster(CephCluster):
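+    """
+    Thin wrapper around the mgr daemons of a cluster: keeps handles to the
+    daemon objects, drives them (stop/fail/restart) via the mon, and offers
+    helpers to read the active/standby layout from the mgr map.
+    """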
+ def __init__(self, ctx):
+ super(MgrCluster, self).__init__(ctx)
+ self.mgr_ids = list(misc.all_roles_of_type(ctx.cluster, 'mgr'))
+
+ if len(self.mgr_ids) == 0:
+ raise RuntimeError(
+ "This task requires at least one manager daemon")
+
+ self.mgr_daemons = dict(
+ [(mgr_id, self._ctx.daemons.get_daemon('mgr', mgr_id)) for mgr_id
+ in self.mgr_ids])
+
+ def mgr_stop(self, mgr_id):
+ self.mgr_daemons[mgr_id].stop()
+
+ def mgr_fail(self, mgr_id):
+ self.mon_manager.raw_cluster_cmd("mgr", "fail", mgr_id)
+
+ def mgr_restart(self, mgr_id):
+ self.mgr_daemons[mgr_id].restart()
+
+ def get_mgr_map(self):
+ status = json.loads(
+ self.mon_manager.raw_cluster_cmd("status", "--format=json-pretty"))
+
+ return status["mgrmap"]
+
+ def get_active_id(self):
+ return self.get_mgr_map()["active_name"]
+
+ def get_standby_ids(self):
+ return [s['name'] for s in self.get_mgr_map()["standbys"]]
+
+
+class MgrTestCase(CephTestCase):
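+    """
+    Base class for mgr test cases: skips the test if fewer than REQUIRE_MGRS
+    manager daemons are available, and restarts/fails all mgr daemons in
+    setUp() so each test starts from a fresh active/standby layout.
+    """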
+ REQUIRE_MGRS = 1
+
+ def setUp(self):
+ super(MgrTestCase, self).setUp()
+
+ # The test runner should have populated this
+ assert self.mgr_cluster is not None
+
+ if len(self.mgr_cluster.mgr_ids) < self.REQUIRE_MGRS:
+ raise case.SkipTest("Only have {0} manager daemons, "
+ "{1} are required".format(
+ len(self.mgr_cluster.mgr_ids), self.REQUIRE_MGRS))
+
+ # Restart all the daemons
+ for daemon in self.mgr_cluster.mgr_daemons.values():
+ daemon.stop()
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_fail(mgr_id)
+
+ for daemon in self.mgr_cluster.mgr_daemons.values():
+ daemon.restart()
+
+ # Wait for an active to come up
+ self.wait_until_true(lambda: self.mgr_cluster.get_active_id() != "",
+ timeout=20)
+
+ expect_standbys = set(self.mgr_cluster.mgr_ids) \
+ - {self.mgr_cluster.get_active_id()}
+ self.wait_until_true(
+ lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+ timeout=20)
--- /dev/null
+
+import logging
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(MgrTestCase):
+ REQUIRE_MGRS = 2
+
+ def test_timeout(self):
+ """
+ That when an active mgr stops responding, a standby is promoted
+ after mon_mgr_beacon_grace.
+ """
+
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ # Stop that daemon
+ self.mgr_cluster.mgr_stop(original_active)
+
+ # Assert that the other mgr becomes active
+ self.wait_until_true(
+ lambda: self.mgr_cluster.get_active_id() in original_standbys,
+ timeout=60
+ )
+
+ self.mgr_cluster.mgr_restart(original_active)
+ self.wait_until_true(
+ lambda: original_active in self.mgr_cluster.get_standby_ids(),
+ timeout=10
+ )
+
+ def test_explicit_fail(self):
+ """
+ That when a user explicitly fails a daemon, a standby immediately
+ replaces it.
+ :return:
+ """
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ self.mgr_cluster.mgr_fail(original_active)
+
+ # A standby should take over
+ self.wait_until_true(
+ lambda: self.mgr_cluster.get_active_id() in original_standbys,
+ timeout=60
+ )
+
+        # The one we failed should come back as a standby (it isn't
+        # really dead)
+ self.wait_until_true(
+ lambda: original_active in self.mgr_cluster.get_standby_ids(),
+ timeout=10
+ )
+
+ def test_standby_timeout(self):
+ """
+ That when a standby daemon stops sending beacons, it is
+ removed from the list of standbys
+ :return:
+ """
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ victim = original_standbys[0]
+ self.mgr_cluster.mgr_stop(victim)
+
+ expect_standbys = set(original_standbys) - {victim}
+
+ self.wait_until_true(
+ lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+ timeout=60
+ )
+ self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
--- /dev/null
+# mod_fastcgi config goes here
+
+# Set fastcgi environment variables.
+# Note that this is separate from Unix environment variables!
+SetEnv RGW_LOG_LEVEL 20
+SetEnv RGW_SHOULD_LOG yes
+SetEnv RGW_PRINT_CONTINUE {print_continue}
+
+<IfModule !fastcgi_module>
+ LoadModule fastcgi_module {mod_path}/mod_fastcgi.so
+</IfModule>
+
+FastCgiIPCDir {testdir}/apache/tmp.{client}/fastcgi_sock
+FastCgiExternalServer {testdir}/apache/htdocs.{client}/rgw.fcgi -socket rgw_sock -idle-timeout {idle_timeout}
+RewriteEngine On
+
+RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /rgw.fcgi?page=$1&params=$2&%{{QUERY_STRING}} [E=HTTP_AUTHORIZATION:%{{HTTP:Authorization}},L]
--- /dev/null
+# mod_proxy_fcgi config, using TCP
+
+<IfModule !proxy_module>
+ LoadModule proxy_module {mod_path}/mod_proxy.so
+</IfModule>
+<IfModule !proxy_fcgi_module>
+ LoadModule proxy_fcgi_module {mod_path}/mod_proxy_fcgi.so
+</IfModule>
+
+RewriteEngine On
+
+RewriteRule .* - [E=HTTP_AUTHORIZATION:%{{HTTP:Authorization}},L]
+
+SetEnv proxy-nokeepalive 1
+
+ProxyPass / fcgi://0.0.0.0:9000/
--- /dev/null
+# mod_proxy_fcgi config, using UDS
+
+<IfModule !proxy_module>
+ LoadModule proxy_module {mod_path}/mod_proxy.so
+</IfModule>
+<IfModule !proxy_fcgi_module>
+ LoadModule proxy_fcgi_module {mod_path}/mod_proxy_fcgi.so
+</IfModule>
+
+RewriteEngine On
+
+RewriteRule .* - [E=HTTP_AUTHORIZATION:%{{HTTP:Authorization}},L]
+
+ProxyPass / unix://{testdir}/apache/tmp.{client}/fastcgi_sock/rgw_sock|fcgi://localhost:9000/ disablereuse=On
--- /dev/null
+"""
+Handle clock skews in monitors.
+"""
+import logging
+import contextlib
+import ceph_manager
+import time
+import gevent
+from StringIO import StringIO
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+class ClockSkewCheck:
+ """
+ Periodically check if there are any clock skews among the monitors in the
+ quorum. By default, assume no skews are supposed to exist; that can be
+ changed using the 'expect-skew' option. If 'fail-on-skew' is set to false,
+ then we will always succeed and only report skews if any are found.
+
+ This class does not spawn a thread. It assumes that, if that is indeed
+ wanted, it should be done by a third party (for instance, the task using
+ this class). We intend it as such in order to reuse this class if need be.
+
+ This task accepts the following options:
+
+ interval amount of seconds to wait in-between checks. (default: 30.0)
+ max-skew maximum skew, in seconds, that is considered tolerable before
+ issuing a warning. (default: 0.05)
+ expect-skew 'true' or 'false', to indicate whether to expect a skew during
+ the run or not. If 'true', the test will fail if no skew is
+ found, and succeed if a skew is indeed found; if 'false', it's
+ the other way around. (default: false)
+ never-fail Don't fail the run if a skew is detected and we weren't
+ expecting it, or if no skew is detected and we were expecting
+ it. (default: False)
+
+ at-least-once Runs at least once, even if we are told to stop.
+ (default: True)
+ at-least-once-timeout If we were told to stop but we are attempting to
+ run at least once, timeout after this many seconds.
+ (default: 600)
+
+ Example:
+ Expect a skew higher than 0.05 seconds, but only report it without
+ failing the teuthology run.
+
+ - mon_clock_skew_check:
+ interval: 30
+ max-skew: 0.05
+        expect-skew: true
+ never-fail: true
+ """
+
+ def __init__(self, ctx, manager, config, logger):
+ self.ctx = ctx
+ self.manager = manager
+
+ self.stopping = False
+ self.logger = logger
+ self.config = config
+
+ if self.config is None:
+ self.config = dict()
+
+ self.check_interval = float(self.config.get('interval', 30.0))
+
+ first_mon = teuthology.get_first_mon(ctx, config)
+ remote = ctx.cluster.only(first_mon).remotes.keys()[0]
+ proc = remote.run(
+ args=[
+ 'sudo',
+ 'ceph-mon',
+ '-i', first_mon[4:],
+ '--show-config-value', 'mon_clock_drift_allowed'
+ ], stdout=StringIO(), wait=True
+ )
+ self.max_skew = self.config.get('max-skew', float(proc.stdout.getvalue()))
+
+ self.expect_skew = self.config.get('expect-skew', False)
+ self.never_fail = self.config.get('never-fail', False)
+ self.at_least_once = self.config.get('at-least-once', True)
+ self.at_least_once_timeout = self.config.get('at-least-once-timeout', 600.0)
+
+ def info(self, x):
+ """
+ locally define logger for info messages
+ """
+ self.logger.info(x)
+
+ def warn(self, x):
+ """
+ locally define logger for warnings
+ """
+ self.logger.warn(x)
+
+ def debug(self, x):
+ """
+ locally define logger for debug messages
+ """
+        self.logger.debug(x)
+
+ def finish(self):
+ """
+ Break out of the do_check loop.
+ """
+ self.stopping = True
+
+ def sleep_interval(self):
+ """
+ If a sleep interval is set, sleep for that amount of time.
+ """
+ if self.check_interval > 0.0:
+ self.debug('sleeping for {s} seconds'.format(
+ s=self.check_interval))
+ time.sleep(self.check_interval)
+
+ def print_skews(self, skews):
+ """
+ Display skew values.
+ """
+ total = len(skews)
+ if total > 0:
+ self.info('---------- found {n} skews ----------'.format(n=total))
+ for mon_id, values in skews.iteritems():
+ self.info('mon.{id}: {v}'.format(id=mon_id, v=values))
+ self.info('-------------------------------------')
+ else:
+ self.info('---------- no skews were found ----------')
+
+ def do_check(self):
+ """
+ Clock skew checker. Loops until finish() is called.
+ """
+ self.info('start checking for clock skews')
+ skews = dict()
+ ran_once = False
+
+ started_on = None
+
+ while not self.stopping or (self.at_least_once and not ran_once):
+
+ if self.at_least_once and not ran_once and self.stopping:
+ if started_on is None:
+ self.info('kicking-off timeout (if any)')
+ started_on = time.time()
+ elif self.at_least_once_timeout > 0.0:
+ assert time.time() - started_on < self.at_least_once_timeout, \
+ 'failed to obtain a timecheck before timeout expired'
+
+ quorum_size = len(teuthology.get_mon_names(self.ctx))
+ self.manager.wait_for_mon_quorum_size(quorum_size)
+
+ health = self.manager.get_mon_health(True)
+ timechecks = health['timechecks']
+
+ clean_check = False
+
+ if timechecks['round_status'] == 'finished':
+ assert (timechecks['round'] % 2) == 0, \
+ 'timecheck marked as finished but round ' \
+ 'disagrees (r {r})'.format(
+ r=timechecks['round'])
+ clean_check = True
+ else:
+ assert timechecks['round_status'] == 'on-going', \
+ 'timecheck status expected \'on-going\' ' \
+ 'but found \'{s}\' instead'.format(
+ s=timechecks['round_status'])
+ if 'mons' in timechecks.keys() and len(timechecks['mons']) > 1:
+ self.info('round still on-going, but there are available reports')
+ else:
+ self.info('no timechecks available just yet')
+ self.sleep_interval()
+ continue
+
+ assert len(timechecks['mons']) > 1, \
+ 'there are not enough reported timechecks; ' \
+ 'expected > 1 found {n}'.format(n=len(timechecks['mons']))
+
+ for check in timechecks['mons']:
+ mon_skew = float(check['skew'])
+ mon_health = check['health']
+ mon_id = check['name']
+ if abs(mon_skew) > self.max_skew:
+ assert mon_health == 'HEALTH_WARN', \
+ 'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format(
+ id=mon_id,health=mon_health,s=abs(mon_skew),ms=self.max_skew)
+
+ log_str = 'mon.{id} with skew {s} > max {ms}'.format(
+ id=mon_id,s=abs(mon_skew),ms=self.max_skew)
+
+                    # add to skew list
+ details = check['details']
+ skews[mon_id] = {'skew': mon_skew, 'details': details}
+
+ if self.expect_skew:
+ self.info('expected skew: {str}'.format(str=log_str))
+ else:
+ self.warn('unexpected skew: {str}'.format(str=log_str))
+
+ if clean_check or (self.expect_skew and len(skews) > 0):
+ ran_once = True
+ self.print_skews(skews)
+ self.sleep_interval()
+
+ total = len(skews)
+ self.print_skews(skews)
+
+ error_str = ''
+ found_error = False
+
+ if self.expect_skew:
+ if total == 0:
+ error_str = 'We were expecting a skew, but none was found!'
+ found_error = True
+ else:
+ if total > 0:
+ error_str = 'We were not expecting a skew, but we did find it!'
+ found_error = True
+
+ if found_error:
+ self.info(error_str)
+ if not self.never_fail:
+ assert False, error_str
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+    Use the ClockSkewCheck class to check for clock skews on the monitors.
+ This task will spawn a thread running ClockSkewCheck's do_check().
+
+ All the configuration will be directly handled by ClockSkewCheck,
+ so please refer to the class documentation for further information.
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'mon_clock_skew_check task only accepts a dict for configuration'
+ log.info('Beginning mon_clock_skew_check...')
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ skew_check = ClockSkewCheck(ctx,
+ manager, config,
+ logger=log.getChild('mon_clock_skew_check'))
+ skew_check_thread = gevent.spawn(skew_check.do_check)
+ try:
+ yield
+ finally:
+ log.info('joining mon_clock_skew_check')
+ skew_check.finish()
+ skew_check_thread.get()
+
+
--- /dev/null
+"""
+Monitor recovery
+"""
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Test monitor recovery.
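+
+    An illustrative suite fragment (assuming this module is wired up as the
+    ``mon_recovery`` task and the role layout provides at least three
+    monitors) might be::
+
+        tasks:
+        - ceph:
+        - mon_recovery: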
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)]
+ log.info("mon ids = %s" % mons)
+
+ manager.wait_for_mon_quorum_size(len(mons))
+
+ log.info('verifying all monitors are in the quorum')
+ for m in mons:
+ s = manager.get_mon_status(m)
+ assert s['state'] == 'leader' or s['state'] == 'peon'
+ assert len(s['quorum']) == len(mons)
+
+ log.info('restarting each monitor in turn')
+ for m in mons:
+ # stop a monitor
+ manager.kill_mon(m)
+ manager.wait_for_mon_quorum_size(len(mons) - 1)
+
+ # restart
+ manager.revive_mon(m)
+ manager.wait_for_mon_quorum_size(len(mons))
+
+ # in forward and reverse order,
+    rmons = list(mons)
+ rmons.reverse()
+ for mons in mons, rmons:
+ log.info('stopping all monitors')
+ for m in mons:
+ manager.kill_mon(m)
+
+ log.info('forming a minimal quorum for %s, then adding monitors' % mons)
+ qnum = (len(mons) / 2) + 1
+ num = 0
+ for m in mons:
+ manager.revive_mon(m)
+ num += 1
+ if num >= qnum:
+ manager.wait_for_mon_quorum_size(num)
+
+ # on both leader and non-leader ranks...
+ for rank in [0, 1]:
+ # take one out
+ log.info('removing mon %s' % mons[rank])
+ manager.kill_mon(mons[rank])
+ manager.wait_for_mon_quorum_size(len(mons) - 1)
+
+ log.info('causing some monitor log activity')
+ m = 30
+ for n in range(1, m):
+ manager.raw_cluster_cmd('log', '%d of %d' % (n, m))
+
+ log.info('adding mon %s back in' % mons[rank])
+ manager.revive_mon(mons[rank])
+ manager.wait_for_mon_quorum_size(len(mons))
--- /dev/null
+from cStringIO import StringIO
+
+import contextlib
+import logging
+import random
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+from ceph_manager import CephManager, write_conf
+
+
+log = logging.getLogger(__name__)
+
+
+def _get_mons(ctx):
+ return [name[len('mon.'):] for name in teuthology.get_mon_names(ctx)]
+
+
+# teuthology prepares the monitor IPs (and ports) in get_mons(), we can
+# enumerate all monitor ports ([6789..]), and find the next available one.
+def _get_next_port(ctx, ip, cluster):
+ # assuming we have only one cluster here.
+ used = []
+ for name in teuthology.get_mon_names(ctx, cluster):
+ addr = ctx.ceph[cluster].conf[name]['mon addr']
+ mon_ip, mon_port = addr.split(':')
+ if mon_ip != ip:
+ continue
+ used.append(int(mon_port))
+ port = 6789
+ used.sort()
+ for p in used:
+ if p != port:
+ break
+ port += 1
+ return port
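+
+# For example, with 'mon addr' entries 1.2.3.4:6789 and 1.2.3.4:6790 on the
+# same IP, the helper above returns 6791; if only 6789 and 6791 are taken it
+# returns the gap, 6790.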
+
+
+def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
+ # co-locate a new monitor on remote where an existing monitor is hosted
+ cluster = manager.cluster
+ remote.run(args=['sudo', 'mkdir', '-p', data_path])
+ keyring_path = '/etc/ceph/{cluster}.keyring'.format(
+ cluster=manager.cluster)
+ testdir = teuthology.get_testdir(ctx)
+ monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
+ cluster=cluster)
+ manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
+ if manager.controller != remote:
+ monmap = teuthology.get_file(manager.controller, monmap_path)
+ teuthology.write_file(remote, monmap_path, StringIO(monmap))
+ remote.run(
+ args=[
+ 'sudo',
+ 'ceph-mon',
+ '--cluster', cluster,
+ '--mkfs',
+ '-i', mon,
+ '--monmap', monmap_path,
+ '--keyring', keyring_path])
+ if manager.controller != remote:
+ teuthology.delete_file(remote, monmap_path)
+ # raw_cluster_cmd() is performed using sudo, so sudo here also.
+ teuthology.delete_file(manager.controller, monmap_path, sudo=True)
+ # update ceph.conf so that the ceph CLI is able to connect to the cluster
+ if conf_path:
+ ip = remote.ip_address
+ port = _get_next_port(ctx, ip, cluster)
+ mon_addr = '{ip}:{port}'.format(ip=ip, port=port)
+ ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr}
+ write_conf(ctx, conf_path, cluster)
+
+
+def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
+ cluster = manager.cluster
+ del ctx.ceph[cluster].conf[name]
+ write_conf(ctx, conf_path, cluster)
+ remote.run(args=['sudo', 'rm', '-rf', data_path])
+
+
+@contextlib.contextmanager
+def _prepare_mon(ctx, manager, remote, mon):
+ cluster = manager.cluster
+ data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
+ cluster=cluster, id=mon)
+ conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
+ name = 'mon.{0}'.format(mon)
+ _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
+ yield
+ _teardown_mon(ctx, manager, remote, name,
+ data_path, conf_path)
+
+
+# run_daemon() in ceph.py starts a herd of daemons of the same type, but
+# _run_daemon() starts only one instance.
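+# For example, _run_daemon(ctx, remote, 'ceph', 'mon', 'a_prime') runs,
+# roughly:
+#   sudo adjust-ulimits ceph-coverage <coverage dir> daemon-helper kill \
+#       ceph-mon -f --cluster ceph -i a_prime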
+@contextlib.contextmanager
+def _run_daemon(ctx, remote, cluster, type_, id_):
+ testdir = teuthology.get_testdir(ctx)
+ coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+ daemon_signal = 'kill'
+ run_cmd = [
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'daemon-helper',
+ daemon_signal,
+ ]
+ run_cmd_tail = [
+ 'ceph-%s' % (type_),
+ '-f',
+ '--cluster', cluster,
+ '-i', id_]
+ run_cmd.extend(run_cmd_tail)
+ ctx.daemons.add_daemon(remote, type_, id_,
+ cluster=cluster,
+ args=run_cmd,
+ logger=log.getChild(type_),
+ stdin=run.PIPE,
+ wait=False)
+ daemon = ctx.daemons.get_daemon(type_, id_, cluster)
+ yield daemon
+ daemon.stop()
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ replace a monitor with a newly added one, and then revert this change
+
+ How it works::
+ 1. add a new mon with the specified id (mon.${replacer})
+ 2. wait for quorum
+ 3. remove the mon with the specified id (mon.${victim}); it will commit
+ suicide once removed from the monmap
+ 4. wait for quorum
+ 5. <yield>
+ 6. add mon.${victim} back, and start it
+ 7. wait for quorum
+ 8. remove mon.${replacer}
+
+ Options::
+ victim the id of the mon to be removed (pick a random mon by default)
+ replacer the id of the new mon (use "${victim}_prime" if not specified)
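+
+ For example (a hypothetical fragment; the mon id below is only a
+ placeholder)::
+
+ tasks:
+ - ceph:
+ - mon_seesaw:
+ victim: a
+ - interactive: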
+ """
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager'))
+
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ "task ceph only supports a dictionary for configuration"
+ overrides = ctx.config.get('overrides', {})
+ teuthology.deep_merge(config, overrides.get('mon_seesaw', {}))
+ victim = config.get('victim', random.choice(_get_mons(ctx)))
+ replacer = config.get('replacer', '{0}_prime'.format(victim))
+ remote = manager.find_remote('mon', victim)
+ quorum = manager.get_mon_quorum()
+ cluster = manager.cluster
+ log.info('replacing {victim} with {replacer}'.format(victim=victim,
+ replacer=replacer))
+ with _prepare_mon(ctx, manager, remote, replacer):
+ with _run_daemon(ctx, remote, cluster, 'mon', replacer):
+ # replacer will join the quorum automatically
+ manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
+ # if we don't remove the victim from the monmap, we risk leaving the
+ # new joiner with a monmap of two mons; it would not be able to reach
+ # the other one and would keep probing forever.
+ log.info('removing {mon}'.format(mon=victim))
+ manager.raw_cluster_cmd('mon', 'remove', victim)
+ manager.wait_for_mon_quorum_size(len(quorum), 10)
+ # the victim will commit suicide after being removed from
+ # monmap, let's wait until it stops.
+ ctx.daemons.get_daemon('mon', victim, cluster).wait(10)
+ try:
+ # perform other tasks
+ yield
+ finally:
+ # bring the victim back online
+ # nuke the monstore of victim, otherwise it will refuse to boot
+ # with following message:
+ #
+ # not in monmap and have been in a quorum before; must have
+ # been removed
+ log.info('re-adding {mon}'.format(mon=victim))
+ data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
+ cluster=cluster, id=victim)
+ remote.run(args=['sudo', 'rm', '-rf', data_path])
+ name = 'mon.{0}'.format(victim)
+ _setup_mon(ctx, manager, remote, victim, name, data_path, None)
+ log.info('reviving {mon}'.format(mon=victim))
+ manager.revive_mon(victim)
+ manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
+ manager.raw_cluster_cmd('mon', 'remove', replacer)
+ manager.wait_for_mon_quorum_size(len(quorum), 10)
--- /dev/null
+"""
+Monitor thrash
+"""
+import logging
+import contextlib
+import ceph_manager
+import random
+import time
+import gevent
+import json
+import math
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def _get_mons(ctx):
+ """
+ Get monitor names from the context value.
+ """
+ mons = [f[len('mon.'):] for f in teuthology.get_mon_names(ctx)]
+ return mons
+
+class MonitorThrasher:
+ """
+ How it works::
+
+ - pick a monitor
+ - kill it
+ - wait for quorum to be formed
+ - sleep for 'revive_delay' seconds
+ - revive monitor
+ - wait for quorum to be formed
+ - sleep for 'thrash_delay' seconds
+
+ Options::
+
+ seed Seed to use on the RNG to reproduce a previous
+ behaviour (default: None; i.e., not set)
+ revive_delay Number of seconds to wait before reviving
+ the monitor (default: 10)
+ thrash_delay Number of seconds to wait in-between
+ test iterations (default: 0)
+ store_thrash Thrash the monitor store before killing the
+ monitor being thrashed (default: False)
+ store_thrash_probability Probability, in %, of thrashing a monitor's
+ store (default: 50)
+ thrash_many Thrash multiple monitors instead of just one. If
+ 'maintain_quorum' is set to False, then we will
+ thrash up to as many monitors as are available.
+ (default: False)
+ maintain_quorum Always maintain quorum, being careful about how
+ many monitors we kill during the thrashing. If we
+ happen to have only one or two monitors configured
+ and this option is set to True, we won't run this
+ task, as we cannot guarantee that quorum is
+ maintained. Setting it to False allows the task to
+ run with as few as a single monitor. (default: True)
+ freeze_mon_probability: how often to freeze the mon instead of killing it,
+ in % (default: 10)
+ freeze_mon_duration: how many seconds to freeze the mon (default: 15)
+ scrub Scrub after each iteration (default: True)
+
+ Note: if 'store_thrash' is set to True, then 'maintain_quorum' must also
+ be set to True.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
+ store_thrash: true
+ store_thrash_probability: 40
+ seed: 31337
+ maintain_quorum: true
+ thrash_many: true
+ - ceph-fuse:
+ - workunit:
+ clients:
+ all:
+ - mon/workloadgen.sh
+ """
+ def __init__(self, ctx, manager, config, logger):
+ self.ctx = ctx
+ self.manager = manager
+ self.manager.wait_for_clean()
+
+ self.stopping = False
+ self.logger = logger
+ self.config = config
+
+ if self.config is None:
+ self.config = dict()
+
+ """ Test reproducibility """
+ self.random_seed = self.config.get('seed', None)
+
+ if self.random_seed is None:
+ self.random_seed = int(time.time())
+
+ self.rng = random.Random()
+ self.rng.seed(int(self.random_seed))
+
+ """ Monitor thrashing """
+ self.revive_delay = float(self.config.get('revive_delay', 10.0))
+ self.thrash_delay = float(self.config.get('thrash_delay', 0.0))
+
+ self.thrash_many = self.config.get('thrash_many', False)
+ self.maintain_quorum = self.config.get('maintain_quorum', True)
+
+ self.scrub = self.config.get('scrub', True)
+
+ self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+ self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
+
+ assert self.max_killable() > 0, \
+ 'Unable to kill at least one monitor with the current config.'
+
+ """ Store thrashing """
+ self.store_thrash = self.config.get('store_thrash', False)
+ self.store_thrash_probability = int(
+ self.config.get('store_thrash_probability', 50))
+ if self.store_thrash:
+ assert self.store_thrash_probability > 0, \
+ 'store_thrash is set, probability must be > 0'
+ assert self.maintain_quorum, \
+ 'store_thrash = true must imply maintain_quorum = true'
+
+ self.thread = gevent.spawn(self.do_thrash)
+
+ def log(self, x):
+ """
+ locally log info messages
+ """
+ self.logger.info(x)
+
+ def do_join(self):
+ """
+ Break out of this process's thrashing loop.
+ """
+ self.stopping = True
+ self.thread.get()
+
+ def should_thrash_store(self):
+ """
+ If allowed, indicate that we should thrash a certain percentage of
+ the time as determined by the store_thrash_probability value.
+ """
+ if not self.store_thrash:
+ return False
+ return self.rng.randrange(0, 101) < self.store_thrash_probability
+
+ def thrash_store(self, mon):
+ """
+ Thrash the monitor specified.
+ :param mon: monitor to thrash
+ """
+ addr = self.ctx.ceph['ceph'].conf['mon.%s' % mon]['mon addr']
+ self.log('thrashing mon.{id}@{addr} store'.format(id=mon, addr=addr))
+ out = self.manager.raw_cluster_cmd('-m', addr, 'sync', 'force')
+ j = json.loads(out)
+ assert j['ret'] == 0, \
+ 'error forcing store sync on mon.{id}:\n{ret}'.format(
+ id=mon,ret=out)
+
+ def should_freeze_mon(self):
+ """
+ Indicate that we should freeze a certain percentage of the time
+ as determined by the freeze_mon_probability value.
+ """
+ return self.rng.randrange(0, 101) < self.freeze_mon_probability
+
+ def freeze_mon(self, mon):
+ """
+ Send STOP signal to freeze the monitor.
+ """
+ log.info('Sending STOP to mon %s', mon)
+ self.manager.signal_mon(mon, 19) # STOP
+
+ def unfreeze_mon(self, mon):
+ """
+ Send CONT signal to unfreeze the monitor.
+ """
+ log.info('Sending CONT to mon %s', mon)
+ self.manager.signal_mon(mon, 18) # CONT
+
+ def kill_mon(self, mon):
+ """
+ Kill the monitor specified
+ """
+ self.log('killing mon.{id}'.format(id=mon))
+ self.manager.kill_mon(mon)
+
+ def revive_mon(self, mon):
+ """
+ Revive the monitor specified
+ """
+ self.log('reviving mon.{id}'.format(id=mon))
+ self.manager.revive_mon(mon)
+
+ def max_killable(self):
+ """
+ Return the maximum number of monitors we can kill.
+ """
+ m = len(_get_mons(self.ctx))
+ if self.maintain_quorum:
+ return max(math.ceil(m/2.0)-1, 0)
+ else:
+ return m
+
+ def do_thrash(self):
+ """
+ Continuously loop and thrash the monitors.
+ """
+ self.log('start thrashing')
+ self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
+ 'thrash many: {tm}, maintain quorum: {mq} '\
+ 'store thrash: {st}, probability: {stp} '\
+ 'freeze mon: prob {fp} duration {fd}'.format(
+ s=self.random_seed,r=self.revive_delay,t=self.thrash_delay,
+ tm=self.thrash_many, mq=self.maintain_quorum,
+ st=self.store_thrash,stp=self.store_thrash_probability,
+ fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
+ ))
+
+ while not self.stopping:
+ mons = _get_mons(self.ctx)
+ self.manager.wait_for_mon_quorum_size(len(mons))
+ self.log('making sure all monitors are in the quorum')
+ for m in mons:
+ s = self.manager.get_mon_status(m)
+ assert s['state'] == 'leader' or s['state'] == 'peon'
+ assert len(s['quorum']) == len(mons)
+
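+ # choose how many monitors to kill this round: 1..max_killable(),
+ # inclusive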
+ kill_up_to = self.rng.randrange(1, self.max_killable()+1)
+ mons_to_kill = self.rng.sample(mons, kill_up_to)
+ self.log('monitors to thrash: {m}'.format(m=mons_to_kill))
+
+ mons_to_freeze = []
+ for mon in mons:
+ if mon in mons_to_kill:
+ continue
+ if self.should_freeze_mon():
+ mons_to_freeze.append(mon)
+ self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))
+
+ for mon in mons_to_kill:
+ self.log('thrashing mon.{m}'.format(m=mon))
+
+ """ we only thrash stores if we are maintaining quorum """
+ if self.should_thrash_store() and self.maintain_quorum:
+ self.thrash_store(mon)
+
+ self.kill_mon(mon)
+
+ if mons_to_freeze:
+ for mon in mons_to_freeze:
+ self.freeze_mon(mon)
+ self.log('waiting for {delay} secs to unfreeze mons'.format(
+ delay=self.freeze_mon_duration))
+ time.sleep(self.freeze_mon_duration)
+ for mon in mons_to_freeze:
+ self.unfreeze_mon(mon)
+
+ if self.maintain_quorum:
+ self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill))
+ for m in mons:
+ if m in mons_to_kill:
+ continue
+ s = self.manager.get_mon_status(m)
+ assert s['state'] == 'leader' or s['state'] == 'peon'
+ assert len(s['quorum']) == len(mons)-len(mons_to_kill)
+
+ self.log('waiting for {delay} secs before reviving monitors'.format(
+ delay=self.revive_delay))
+ time.sleep(self.revive_delay)
+
+ for mon in mons_to_kill:
+ self.revive_mon(mon)
+ # do more freezes
+ if mons_to_freeze:
+ for mon in mons_to_freeze:
+ self.freeze_mon(mon)
+ self.log('waiting for {delay} secs to unfreeze mons'.format(
+ delay=self.freeze_mon_duration))
+ time.sleep(self.freeze_mon_duration)
+ for mon in mons_to_freeze:
+ self.unfreeze_mon(mon)
+
+ self.manager.wait_for_mon_quorum_size(len(mons))
+ for m in mons:
+ s = self.manager.get_mon_status(m)
+ assert s['state'] == 'leader' or s['state'] == 'peon'
+ assert len(s['quorum']) == len(mons)
+
+ if self.scrub:
+ self.log('triggering scrub')
+ try:
+ self.manager.raw_cluster_cmd('scrub')
+ except Exception:
+ log.exception("Saw exception while triggering scrub")
+
+ if self.thrash_delay > 0.0:
+ self.log('waiting for {delay} secs before continuing thrashing'.format(
+ delay=self.thrash_delay))
+ time.sleep(self.thrash_delay)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Stress test the monitors by thrashing them while another task/workunit
+ is running.
+
+ Please refer to MonitorThrasher class for further information on the
+ available options.
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'mon_thrash task only accepts a dict for configuration'
+ assert len(_get_mons(ctx)) > 2, \
+ 'mon_thrash task requires at least 3 monitors'
+ log.info('Beginning mon_thrash...')
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+ thrash_proc = MonitorThrasher(ctx,
+ manager, config,
+ logger=log.getChild('mon_thrasher'))
+ try:
+ log.debug('Yielding')
+ yield
+ finally:
+ log.info('joining mon_thrasher')
+ thrash_proc.do_join()
+ mons = _get_mons(ctx)
+ manager.wait_for_mon_quorum_size(len(mons))
--- /dev/null
+"""
+Multibench testing
+"""
+import contextlib
+import logging
+import radosbench
+import time
+import copy
+import gevent
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run multibench
+
+ The config should be as follows:
+
+ multibench:
+ time: <seconds to run total>
+ segments: <number of concurrent benches>
+ radosbench: <config for radosbench>
+
+ example:
+
+ tasks:
+ - ceph:
+ - multibench:
+ clients: [client.0]
+ time: 360
+ - interactive:
+ """
+ log.info('Beginning multibench...')
+ assert isinstance(config, dict), \
+ "please list clients to run on"
+
+ def run_one(num):
+ """Run test spawn from gevent"""
+ start = time.time()
+ if not config.get('radosbench'):
+ benchcontext = {}
+ else:
+ benchcontext = copy.copy(config.get('radosbench'))
+ iterations = 0
+ while time.time() - start < int(config.get('time', 600)):
+ log.info("Starting iteration %s of segment %s"%(iterations, num))
+ benchcontext['pool'] = str(num) + "-" + str(iterations)
+ with radosbench.task(ctx, benchcontext):
+ time.sleep(1) # time.sleep() requires an argument; 1s is an arbitrary choice
+ iterations += 1
+ log.info("Starting %s threads"%(str(config.get('segments', 3)),))
+ segments = [
+ gevent.spawn(run_one, i)
+ for i in range(0, int(config.get('segments', 3)))]
+
+ try:
+ yield
+ finally:
+ [i.get() for i in segments]
--- /dev/null
+"""
+Test Object locations going down
+"""
+import logging
+import ceph_manager
+import time
+from teuthology import misc as teuthology
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Test handling of object location going down
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'this task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+ manager.wait_for_clean()
+
+ # something that is always there
+ dummyfile = '/etc/fstab'
+
+ # take 0, 1 out
+ manager.mark_out_osd(0)
+ manager.mark_out_osd(1)
+ manager.wait_for_clean()
+
+ # delay recovery, and make the pg log very long (to prevent backfill)
+ manager.raw_cluster_cmd(
+ 'tell', 'osd.0',
+ 'injectargs',
+ '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
+ )
+ # delay recovery, and make the pg log very long (to prevent backfill)
+ manager.raw_cluster_cmd(
+ 'tell', 'osd.1',
+ 'injectargs',
+ '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
+ )
+ # delay recovery, and make the pg log very long (to prevent backfill)
+ manager.raw_cluster_cmd(
+ 'tell', 'osd.2',
+ 'injectargs',
+ '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
+ )
+ # delay recovery, and make the pg log very long (to prevent backfill)
+ manager.raw_cluster_cmd(
+ 'tell', 'osd.3',
+ 'injectargs',
+ '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
+ )
+
+ # kludge to make sure they get a map
+ rados(ctx, mon, ['-p', 'data', 'put', 'dummy', dummyfile])
+
+ # create old objects
+ for f in range(1, 10):
+ rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile])
+
+ manager.mark_out_osd(3)
+ manager.wait_till_active()
+
+ manager.mark_in_osd(0)
+ manager.wait_till_active()
+
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+
+ manager.mark_out_osd(2)
+ manager.wait_till_active()
+
+ # bring up 1
+ manager.mark_in_osd(1)
+ manager.wait_till_active()
+
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ log.info("Getting unfound objects")
+ unfound = manager.get_num_unfound_objects()
+ assert not unfound
+
+ manager.kill_osd(2)
+ manager.mark_down_osd(2)
+ manager.kill_osd(3)
+ manager.mark_down_osd(3)
+
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ log.info("Getting unfound objects")
+ unfound = manager.get_num_unfound_objects()
+ assert unfound
--- /dev/null
+"""
+Run omapbench executable within teuthology
+"""
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run omapbench
+
+ The config should be as follows::
+
+ omapbench:
+ clients: [client list]
+ threads: <threads at once>
+ objects: <number of objects to write>
+ entries: <number of entries per object map>
+ keysize: <number of characters per object map key>
+ valsize: <number of characters per object map val>
+ increment: <interval to show in histogram (in ms)>
+ omaptype: <how the omaps should be generated>
+
+ example::
+
+ tasks:
+ - ceph:
+ - omapbench:
+ clients: [client.0]
+ threads: 30
+ objects: 1000
+ entries: 10
+ keysize: 10
+ valsize: 100
+ increment: 100
+ omaptype: uniform
+ - interactive:
+ """
+ log.info('Beginning omapbench...')
+ assert isinstance(config, dict), \
+ "please list clients to run on"
+ omapbench = {}
+ testdir = teuthology.get_testdir(ctx)
+ print(str(config.get('increment',-1)))
+ for role in config.get('clients', ['client.0']):
+ assert isinstance(role, basestring)
+ PREFIX = 'client.'
+ assert role.startswith(PREFIX)
+ id_ = role[len(PREFIX):]
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+ proc = remote.run(
+ args=[
+ "/bin/sh", "-c",
+ " ".join(['adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage',
+ 'omapbench',
+ '--name', role[len(PREFIX):],
+ '-t', str(config.get('threads', 30)),
+ '-o', str(config.get('objects', 1000)),
+ '--entries', str(config.get('entries',10)),
+ '--keysize', str(config.get('keysize',10)),
+ '--valsize', str(config.get('valsize',1000)),
+ '--inc', str(config.get('increment',10)),
+ '--omaptype', str(config.get('omaptype','uniform'))
+ ]).format(tdir=testdir),
+ ],
+ logger=log.getChild('omapbench.{id}'.format(id=id_)),
+ stdin=run.PIPE,
+ wait=False
+ )
+ omapbench[id_] = proc
+
+ try:
+ yield
+ finally:
+ log.info('joining omapbench')
+ run.wait(omapbench.itervalues())
--- /dev/null
+"""
+Osd backfill test
+"""
+import logging
+import ceph_manager
+import time
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+
+def rados_start(ctx, remote, cmd):
+ """
+ Run a remote rados command (currently used to only write data)
+ """
+ log.info("rados %s" % ' '.join(cmd))
+ testdir = teuthology.get_testdir(ctx)
+ pre = [
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rados',
+ ]
+ pre.extend(cmd)
+ proc = remote.run(
+ args=pre,
+ wait=False,
+ )
+ return proc
+
+def task(ctx, config):
+ """
+ Test backfill
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'this task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+ log.info('num_osds is %s' % num_osds)
+ assert num_osds == 3
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_clean()
+
+ # write some data
+ p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096',
+ '--no-cleanup'])
+ err = p.wait()
+ log.info('err is %d' % err)
+
+ # mark osd.0 out to trigger a rebalance/backfill
+ manager.mark_out_osd(0)
+
+ # also mark it down so it won't be included in pg_temps
+ manager.kill_osd(0)
+ manager.mark_down_osd(0)
+
+ # wait for everything to peer and be happy...
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_recovery()
+
+ # write some new data
+ p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096',
+ '--no-cleanup'])
+
+ time.sleep(15)
+
+ # blackhole + restart osd.1
+ # this triggers a divergent backfill target
+ manager.blackhole_kill_osd(1)
+ time.sleep(2)
+ manager.revive_osd(1)
+
+ # wait for our writes to complete + succeed
+ err = p.wait()
+ log.info('err is %d' % err)
+
+ # cluster must recover
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_recovery()
+
+ # re-add osd.0
+ manager.revive_osd(0)
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_clean()
+
+
--- /dev/null
+"""
+Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
+"""
+from cStringIO import StringIO
+import logging
+import time
+
+from teuthology.orchestra import run
+from util.rados import rados
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
+ configuration settings
+
+ In order for the test to pass, the log-whitelist must be set as follows::
+
+ tasks:
+ - chef:
+ - install:
+ - ceph:
+ log-whitelist: ['OSD near full', 'OSD full dropping all updates']
+ - osd_failsafe_enospc:
+
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'osd_failsafe_enospc task only accepts a dict for configuration'
+
+ # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
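+ # i.e. 2 + 30 + 2*6 + 6 = 50 seconds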
+ sleep_time = 50
+
+ # something that is always there
+ dummyfile = '/etc/fstab'
+ dummyfile2 = '/etc/resolv.conf'
+
+ manager = ctx.managers['ceph']
+
+ # create 1 pg pool with 1 rep which can only be on osd.0
+ osds = manager.get_osd_dump()
+ for osd in osds:
+ if osd['osd'] != 0:
+ manager.mark_out_osd(osd['osd'])
+
+ log.info('creating pool foo')
+ manager.create_pool("foo")
+ manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')
+
+ # State NONE -> NEAR
+ log.info('1. Verify warning messages when exceeding nearfull_ratio')
+
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ proc = mon.run(
+ args=[
+ 'sudo',
+ 'daemon-helper',
+ 'kill',
+ 'ceph', '-w'
+ ],
+ stdin=run.PIPE,
+ stdout=StringIO(),
+ wait=False,
+ )
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')
+
+ time.sleep(sleep_time)
+ proc.stdin.close() # causes daemon-helper to send SIGKILL to ceph -w
+ proc.wait()
+
+ lines = proc.stdout.getvalue().split('\n')
+
+ count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
+ assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
+ count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+ assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
+
+ # State NEAR -> FULL
+ log.info('2. Verify error messages when exceeding full_ratio')
+
+ proc = mon.run(
+ args=[
+ 'sudo',
+ 'daemon-helper',
+ 'kill',
+ 'ceph', '-w'
+ ],
+ stdin=run.PIPE,
+ stdout=StringIO(),
+ wait=False,
+ )
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
+
+ time.sleep(sleep_time)
+ proc.stdin.close() # causes daemon-helper to send SIGKILL to ceph -w
+ proc.wait()
+
+ lines = proc.stdout.getvalue().split('\n')
+
+ count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+ assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
+
+ log.info('3. Verify write failure when exceeding full_ratio')
+
+ # Write data should fail
+ ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
+ assert ret != 0, 'Expected write failure but it succeeded with exit status 0'
+
+ # Put back default
+ manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
+ time.sleep(10)
+
+ # State FULL -> NEAR
+ log.info('4. Verify write success when NOT exceeding full_ratio')
+
+ # Write should succeed
+ ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
+ assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret
+
+ log.info('5. Verify warning messages again when exceeding nearfull_ratio')
+
+ proc = mon.run(
+ args=[
+ 'sudo',
+ 'daemon-helper',
+ 'kill',
+ 'ceph', '-w'
+ ],
+ stdin=run.PIPE,
+ stdout=StringIO(),
+ wait=False,
+ )
+
+ time.sleep(sleep_time)
+ proc.stdin.close() # causes daemon-helper to send SIGKILL to ceph -w
+ proc.wait()
+
+ lines = proc.stdout.getvalue().split('\n')
+
+ count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
+ assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
+ count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+ assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
+ time.sleep(10)
+
+ # State NONE -> FULL
+ log.info('6. Verify error messages again when exceeding full_ratio')
+
+ proc = mon.run(
+ args=[
+ 'sudo',
+ 'daemon-helper',
+ 'kill',
+ 'ceph', '-w'
+ ],
+ stdin=run.PIPE,
+ stdout=StringIO(),
+ wait=False,
+ )
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
+
+ time.sleep(sleep_time)
+ proc.stdin.close() # causes daemon-helper to send SIGKILL to ceph -w
+ proc.wait()
+
+ lines = proc.stdout.getvalue().split('\n')
+
+ count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
+ assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
+ count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+ assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
+
+ # State FULL -> NONE
+ log.info('7. Verify no messages once settings are back to default')
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
+ time.sleep(10)
+
+ proc = mon.run(
+ args=[
+ 'sudo',
+ 'daemon-helper',
+ 'kill',
+ 'ceph', '-w'
+ ],
+ stdin=run.PIPE,
+ stdout=StringIO(),
+ wait=False,
+ )
+
+ time.sleep(sleep_time)
+ proc.stdin.close() # causes daemon-helper to send SIGKILL to ceph -w
+ proc.wait()
+
+ lines = proc.stdout.getvalue().split('\n')
+
+ count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
+ assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
+ count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+ assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
+
+ log.info('Test Passed')
+
+ # Bring all OSDs back in
+ manager.remove_pool("foo")
+ for osd in osds:
+ if osd['osd'] != 0:
+ manager.mark_in_osd(osd['osd'])
--- /dev/null
+"""
+osd recovery
+"""
+import logging
+import ceph_manager
+import time
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+
+def rados_start(testdir, remote, cmd):
+ """
+ Run a remote rados command (currently used to only write data)
+ """
+ log.info("rados %s" % ' '.join(cmd))
+ pre = [
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rados',
+ ]
+ pre.extend(cmd)
+ proc = remote.run(
+ args=pre,
+ wait=False,
+ )
+ return proc
+
+def task(ctx, config):
+ """
+ Test (non-backfill) recovery
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'task only accepts a dict for configuration'
+ testdir = teuthology.get_testdir(ctx)
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+ log.info('num_osds is %s' % num_osds)
+ assert num_osds == 3
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_clean()
+
+ # test some osdmap flags
+ manager.raw_cluster_cmd('osd', 'set', 'noin')
+ manager.raw_cluster_cmd('osd', 'set', 'noout')
+ manager.raw_cluster_cmd('osd', 'set', 'noup')
+ manager.raw_cluster_cmd('osd', 'set', 'nodown')
+ manager.raw_cluster_cmd('osd', 'unset', 'noin')
+ manager.raw_cluster_cmd('osd', 'unset', 'noout')
+ manager.raw_cluster_cmd('osd', 'unset', 'noup')
+ manager.raw_cluster_cmd('osd', 'unset', 'nodown')
+
+ # write some new data
+ p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
+ '--no-cleanup'])
+
+ time.sleep(15)
+
+ # trigger a divergent target:
+ # blackhole + restart osd.1 (shorter log)
+ manager.blackhole_kill_osd(1)
+ # kill osd.2 (longer log... we'll make it divergent below)
+ manager.kill_osd(2)
+ time.sleep(2)
+ manager.revive_osd(1)
+
+ # wait for our writes to complete + succeed
+ err = p.wait()
+ log.info('err is %d' % err)
+
+ # cluster must repeer
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.wait_for_active_or_down()
+
+ # write some more (make sure osd.2 really is divergent)
+ p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
+ p.wait()
+
+ # revive divergent osd
+ manager.revive_osd(2)
+
+ while len(manager.get_osd_status()['up']) < 3:
+ log.info('waiting a bit...')
+ time.sleep(2)
+ log.info('3 are up!')
+
+ # cluster must recover
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_clean()
+
+
+def test_incomplete_pgs(ctx, config):
+ """
+ Test handling of incomplete pgs. Requires 4 osds.
+ """
+ testdir = teuthology.get_testdir(ctx)
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+ log.info('num_osds is %s' % num_osds)
+ assert num_osds == 4
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ while len(manager.get_osd_status()['up']) < 4:
+ time.sleep(10)
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.wait_for_clean()
+
+ log.info('Testing incomplete pgs...')
+
+ for i in range(4):
+ manager.set_config(
+ i,
+ osd_recovery_delay_start=1000)
+
+ # move data off of osd.0, osd.1
+ manager.raw_cluster_cmd('osd', 'out', '0', '1')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ manager.wait_for_clean()
+
+ # lots of objects in rbd (no pg log, will backfill)
+ p = rados_start(testdir, mon,
+ ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
+ '--no-cleanup'])
+ p.wait()
+
+ # few objects in rbd pool (with pg log, normal recovery)
+ for f in range(1, 20):
+ p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
+ 'foo.%d' % f, '/etc/passwd'])
+ p.wait()
+
+ # move it back
+ manager.raw_cluster_cmd('osd', 'in', '0', '1')
+ manager.raw_cluster_cmd('osd', 'out', '2', '3')
+ time.sleep(10)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
+ time.sleep(10)
+ manager.wait_for_active()
+
+ assert not manager.is_clean()
+ assert not manager.is_recovered()
+
+ # kill 2 + 3
+ log.info('stopping 2,3')
+ manager.kill_osd(2)
+ manager.kill_osd(3)
+ log.info('...')
+ manager.raw_cluster_cmd('osd', 'down', '2', '3')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.wait_for_active_or_down()
+
+ assert manager.get_num_down() > 0
+
+ # revive 2 + 3
+ manager.revive_osd(2)
+ manager.revive_osd(3)
+ while len(manager.get_osd_status()['up']) < 4:
+ log.info('waiting a bit...')
+ time.sleep(2)
+ log.info('all are up!')
+
+ for i in range(4):
+ manager.kick_recovery_wq(i)
+
+ # cluster must recover
+ manager.wait_for_clean()
--- /dev/null
+"""
+Peer test (Single test, not much configurable here)
+"""
+import logging
+import json
+import time
+
+import ceph_manager
+from teuthology import misc as teuthology
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Test peering.
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'peer task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_clean()
+
+ for i in range(3):
+ manager.set_config(
+ i,
+ osd_recovery_delay_start=120)
+
+ # take one osd down
+ manager.kill_osd(2)
+ manager.mark_down_osd(2)
+
+ # kludge to make sure they get a map
+ rados(ctx, mon, ['-p', 'data', 'get', 'dummy', '-'])
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.wait_for_recovery()
+
+ # kill another and revive 2, so that some pgs can't peer.
+ manager.kill_osd(1)
+ manager.mark_down_osd(1)
+ manager.revive_osd(2)
+ manager.wait_till_osd_is_up(2)
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+
+ manager.wait_for_active_or_down()
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+
+ # look for down pgs
+ num_down_pgs = 0
+ pgs = manager.get_pg_stats()
+ for pg in pgs:
+ out = manager.raw_cluster_cmd('pg', pg['pgid'], 'query')
+ log.debug("out string %s",out)
+ j = json.loads(out)
+ log.info("pg is %s, query json is %s", pg, j)
+
+ if pg['state'].count('down'):
+ num_down_pgs += 1
+ # verify that it is blocked on osd.1
+ rs = j['recovery_state']
+ assert len(rs) > 0
+ assert rs[0]['name'] == 'Started/Primary/Peering/GetInfo'
+ assert rs[1]['name'] == 'Started/Primary/Peering'
+ assert rs[1]['blocked']
+ assert rs[1]['down_osds_we_would_probe'] == [1]
+ assert len(rs[1]['peering_blocked_by']) == 1
+ assert rs[1]['peering_blocked_by'][0]['osd'] == 1
+
+ assert num_down_pgs > 0
+
+ # bring it all back
+ manager.revive_osd(1)
+ manager.wait_till_osd_is_up(1)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_clean()
--- /dev/null
+"""
+Remotely run peering tests.
+"""
+import logging
+import time
+
+log = logging.getLogger(__name__)
+
+from args import argify
+
+POOLNAME = "POOLNAME"
+ARGS = [
+ ('num_pgs', 'number of pgs to create', 256, int),
+ ('max_time', 'seconds to complete peering', 0, int),
+ ('runs', 'trials to run', 10, int),
+ ('num_objects', 'objects to create', 256 * 1024, int),
+ ('object_size', 'size in bytes for objects', 64, int),
+ ('creation_time_limit', 'time limit for pool population', 60*60, int),
+ ('create_threads', 'concurrent writes for create', 256, int)
+ ]
+
+def setup(ctx, config):
+ """
+ Setup peering test on remotes.
+ """
+ manager = ctx.managers['ceph']
+ manager.clear_pools()
+ manager.create_pool(POOLNAME, config.num_pgs)
+ log.info("populating pool")
+ manager.rados_write_objects(
+ POOLNAME,
+ config.num_objects,
+ config.object_size,
+ config.creation_time_limit,
+ config.create_threads)
+ log.info("done populating pool")
+
+def do_run(ctx, config):
+ """
+ Perform the test.
+ """
+ start = time.time()
+ # mark in osd
+ manager = ctx.managers['ceph']
+ manager.mark_in_osd(0)
+ log.info("writing out objects")
+ manager.rados_write_objects(
+ POOLNAME,
+ config.num_pgs, # write 1 object per pg or so
+ 1,
+ config.creation_time_limit,
+ config.num_pgs, # lots of concurrency
+ cleanup = True)
+ peering_end = time.time()
+
+ log.info("peering done, waiting on recovery")
+ manager.wait_for_clean()
+
+ log.info("recovery done")
+ recovery_end = time.time()
+ if config.max_time:
+ assert(peering_end - start < config.max_time)
+ manager.mark_out_osd(0)
+ manager.wait_for_clean()
+ return {
+ 'time_to_active': peering_end - start,
+ 'time_to_clean': recovery_end - start
+ }
+
+@argify("peering_speed_test", ARGS)
+def task(ctx, config):
+ """
+ Peering speed test
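+
+ The knobs listed in ARGS above can be overridden from the job yaml.
+ Assuming argify maps yaml keys to those argument names, a hypothetical
+ fragment might look like::
+
+ tasks:
+ - ceph:
+ - peering_speed_test:
+ num_pgs: 128
+ runs: 5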
+ """
+ setup(ctx, config)
+ manager = ctx.managers['ceph']
+ manager.mark_out_osd(0)
+ manager.wait_for_clean()
+ ret = []
+ for i in range(config.runs):
+ log.info("Run {i}".format(i = i))
+ ret.append(do_run(ctx, config))
+
+ manager.mark_in_osd(0)
+ ctx.summary['recovery_times'] = {
+ 'runs': ret
+ }
--- /dev/null
+"""
+Populate rbd pools
+"""
+import contextlib
+import logging
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Populate <num_pools> pools (named with prefix <pool_prefix>) with
+ <num_images> rbd images each, taking <num_snaps> snapshots per image
+
+ The config could be as follows::
+
+ populate_rbd_pool:
+ client: <client>
+ pool_prefix: foo
+ num_pools: 5
+ num_images: 10
+ num_snaps: 3
+ image_size: 10737418240
+ """
+ if config is None:
+ config = {}
+ client = config.get("client", "client.0")
+ pool_prefix = config.get("pool_prefix", "foo")
+ num_pools = config.get("num_pools", 2)
+ num_images = config.get("num_images", 20)
+ num_snaps = config.get("num_snaps", 4)
+ image_size = config.get("image_size", 100)
+ write_size = config.get("write_size", 1024*1024)
+ write_threads = config.get("write_threads", 10)
+ write_total_per_snap = config.get("write_total_per_snap", 1024*1024*30)
+
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+
+ for poolid in range(num_pools):
+ poolname = "%s-%s" % (pool_prefix, str(poolid))
+ log.info("Creating pool %s" % (poolname,))
+ ctx.managers['ceph'].create_pool(poolname)
+ for imageid in range(num_images):
+ imagename = "rbd-%s" % (str(imageid),)
+ log.info("Creating imagename %s" % (imagename,))
+ remote.run(
+ args = [
+ "rbd",
+ "create",
+ imagename,
+ "--image-format", "1",
+ "--size", str(image_size),
+ "--pool", str(poolname)])
+ def bench_run():
+ remote.run(
+ args = [
+ "rbd",
+ "bench-write",
+ imagename,
+ "--pool", poolname,
+ "--io-size", str(write_size),
+ "--io-threads", str(write_threads),
+ "--io-total", str(write_total_per_snap),
+ "--io-pattern", "rand"])
+ log.info("imagename %s first bench" % (imagename,))
+ bench_run()
+ for snapid in range(num_snaps):
+ snapname = "snap-%s" % (str(snapid),)
+ log.info("imagename %s creating snap %s" % (imagename, snapname))
+ remote.run(
+ args = [
+ "rbd", "snap", "create",
+ "--pool", poolname,
+ "--snap", snapname,
+ imagename
+ ])
+ bench_run()
+
+ try:
+ yield
+ finally:
+ log.info('done')
--- /dev/null
+"""
+Qemu task
+"""
+from cStringIO import StringIO
+
+import contextlib
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from tasks import rbd
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+DEFAULT_NUM_RBD = 1
+DEFAULT_IMAGE_URL = 'http://ceph.com/qa/ubuntu-12.04.qcow2'
+DEFAULT_MEM = 4096 # in megabytes
+
+def create_images(ctx, config, managers):
+ for client, client_config in config.iteritems():
+ num_rbd = client_config.get('num_rbd', 1)
+ clone = client_config.get('clone', False)
+ assert num_rbd > 0, 'at least one rbd device must be used'
+ for i in xrange(num_rbd):
+ create_config = {
+ client: {
+ 'image_name': '{client}.{num}'.format(client=client, num=i),
+ 'image_format': 2 if clone else 1,
+ }
+ }
+ managers.append(
+ lambda create_config=create_config:
+ rbd.create_image(ctx=ctx, config=create_config)
+ )
+
+def create_clones(ctx, config, managers):
+ for client, client_config in config.iteritems():
+ num_rbd = client_config.get('num_rbd', 1)
+ clone = client_config.get('clone', False)
+ if clone:
+ for i in xrange(num_rbd):
+ create_config = {
+ client: {
+ 'image_name':
+ '{client}.{num}-clone'.format(client=client, num=i),
+ 'parent_name':
+ '{client}.{num}'.format(client=client, num=i),
+ }
+ }
+ managers.append(
+ lambda create_config=create_config:
+ rbd.clone_image(ctx=ctx, config=create_config)
+ )
+
+@contextlib.contextmanager
+def create_dirs(ctx, config):
+ """
+ Handle directory creation and cleanup
+ """
+ testdir = teuthology.get_testdir(ctx)
+ for client, client_config in config.iteritems():
+ assert 'test' in client_config, 'You must specify a test to run'
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ remote.run(
+ args=[
+ 'install', '-d', '-m0755', '--',
+ '{tdir}/qemu'.format(tdir=testdir),
+ '{tdir}/archive/qemu'.format(tdir=testdir),
+ ]
+ )
+ try:
+ yield
+ finally:
+ for client, client_config in config.iteritems():
+ assert 'test' in client_config, 'You must specify a test to run'
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ remote.run(
+ args=[
+ 'rmdir', '{tdir}/qemu'.format(tdir=testdir), run.Raw('||'), 'true',
+ ]
+ )
+
+@contextlib.contextmanager
+def generate_iso(ctx, config):
+ """Execute system commands to generate iso"""
+ log.info('generating iso...')
+ testdir = teuthology.get_testdir(ctx)
+ for client, client_config in config.iteritems():
+ assert 'test' in client_config, 'You must specify a test to run'
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ src_dir = os.path.dirname(__file__)
+ userdata_path = os.path.join(testdir, 'qemu', 'userdata.' + client)
+ metadata_path = os.path.join(testdir, 'qemu', 'metadata.' + client)
+
+ with file(os.path.join(src_dir, 'userdata_setup.yaml'), 'rb') as f:
+ test_setup = ''.join(f.readlines())
+ # configuring the commands to setup the nfs mount
+ mnt_dir = "/export/{client}".format(client=client)
+ test_setup = test_setup.format(
+ mnt_dir=mnt_dir
+ )
+
+ with file(os.path.join(src_dir, 'userdata_teardown.yaml'), 'rb') as f:
+ test_teardown = ''.join(f.readlines())
+
+ user_data = test_setup
+ if client_config.get('type', 'filesystem') == 'filesystem':
+ for i in xrange(0, client_config.get('num_rbd', DEFAULT_NUM_RBD)):
+ dev_letter = chr(ord('b') + i)
+ user_data += """
+- |
+ #!/bin/bash
+ mkdir /mnt/test_{dev_letter}
+ mkfs -t xfs /dev/vd{dev_letter}
+ mount -t xfs /dev/vd{dev_letter} /mnt/test_{dev_letter}
+""".format(dev_letter=dev_letter)
+
+ # this may change later to pass the directories as args to the
+ # script or something. xfstests needs that.
+ user_data += """
+- |
+ #!/bin/bash
+ test -d /mnt/test_b && cd /mnt/test_b
+ /mnt/cdrom/test.sh > /mnt/log/test.log 2>&1 && touch /mnt/log/success
+""" + test_teardown
+
+ teuthology.write_file(remote, userdata_path, StringIO(user_data))
+
+ with file(os.path.join(src_dir, 'metadata.yaml'), 'rb') as f:
+ teuthology.write_file(remote, metadata_path, f)
+
+ test_file = '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client)
+ remote.run(
+ args=[
+ 'wget', '-nv', '-O', test_file,
+ client_config['test'],
+ run.Raw('&&'),
+ 'chmod', '755', test_file,
+ ],
+ )
+ remote.run(
+ args=[
+ 'genisoimage', '-quiet', '-input-charset', 'utf-8',
+ '-volid', 'cidata', '-joliet', '-rock',
+ '-o', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+ '-graft-points',
+ 'user-data={userdata}'.format(userdata=userdata_path),
+ 'meta-data={metadata}'.format(metadata=metadata_path),
+ 'test.sh={file}'.format(file=test_file),
+ ],
+ )
+ try:
+ yield
+ finally:
+ for client in config.iterkeys():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ remote.run(
+ args=[
+ 'rm', '-f',
+ '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+ os.path.join(testdir, 'qemu', 'userdata.' + client),
+ os.path.join(testdir, 'qemu', 'metadata.' + client),
+ '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client),
+ ],
+ )
+
+@contextlib.contextmanager
+def download_image(ctx, config):
+ """Downland base image, remove image file when done"""
+ log.info('downloading base image')
+ testdir = teuthology.get_testdir(ctx)
+ for client, client_config in config.iteritems():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ base_file = '{tdir}/qemu/base.{client}.qcow2'.format(tdir=testdir, client=client)
+ remote.run(
+ args=[
+ 'wget', '-nv', '-O', base_file, DEFAULT_IMAGE_URL,
+ ]
+ )
+ try:
+ yield
+ finally:
+ log.debug('cleaning up base image files')
+ for client in config.iterkeys():
+ base_file = '{tdir}/qemu/base.{client}.qcow2'.format(
+ tdir=testdir,
+ client=client,
+ )
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ remote.run(
+ args=[
+ 'rm', '-f', base_file,
+ ],
+ )
+
+
+def _setup_nfs_mount(remote, client, mount_dir):
+ """
+ Sets up an nfs mount on the remote that the guest can use to
+ store logs. This nfs mount is also used to touch a file
+ at the end of the test to indicate whether the test was successful
+ or not.
+ """
+ export_dir = "/export/{client}".format(client=client)
+ log.info("Creating the nfs export directory...")
+ remote.run(args=[
+ 'sudo', 'mkdir', '-p', export_dir,
+ ])
+ log.info("Mounting the test directory...")
+ remote.run(args=[
+ 'sudo', 'mount', '--bind', mount_dir, export_dir,
+ ])
+ log.info("Adding mount to /etc/exports...")
+ export = "{dir} *(rw,no_root_squash,no_subtree_check,insecure)".format(
+ dir=export_dir
+ )
+ remote.run(args=[
+ 'sudo', 'sed', '-i', '/^\/export\//d', "/etc/exports",
+ ])
+ remote.run(args=[
+ 'echo', export, run.Raw("|"),
+ 'sudo', 'tee', '-a', "/etc/exports",
+ ])
+ log.info("Restarting NFS...")
+ if remote.os.package_type == "deb":
+ remote.run(args=['sudo', 'service', 'nfs-kernel-server', 'restart'])
+ else:
+ remote.run(args=['sudo', 'systemctl', 'restart', 'nfs'])
+
+
+def _teardown_nfs_mount(remote, client):
+ """
+ Tears down the nfs mount on the remote used for logging and reporting the
+ status of the tests being run in the guest.
+ """
+ log.info("Tearing down the nfs mount for {remote}".format(remote=remote))
+ export_dir = "/export/{client}".format(client=client)
+ log.info("Stopping NFS...")
+ if remote.os.package_type == "deb":
+ remote.run(args=[
+ 'sudo', 'service', 'nfs-kernel-server', 'stop'
+ ])
+ else:
+ remote.run(args=[
+ 'sudo', 'systemctl', 'stop', 'nfs'
+ ])
+ log.info("Unmounting exported directory...")
+ remote.run(args=[
+ 'sudo', 'umount', export_dir
+ ])
+ log.info("Deleting exported directory...")
+ remote.run(args=[
+ 'sudo', 'rm', '-r', '/export'
+ ])
+ log.info("Deleting export from /etc/exports...")
+ remote.run(args=[
+ 'sudo', 'sed', '-i', '$ d', '/etc/exports'
+ ])
+ log.info("Starting NFS...")
+ if remote.os.package_type == "deb":
+ remote.run(args=[
+ 'sudo', 'service', 'nfs-kernel-server', 'start'
+ ])
+ else:
+ remote.run(args=[
+ 'sudo', 'systemctl', 'start', 'nfs'
+ ])
+
+
+@contextlib.contextmanager
+def run_qemu(ctx, config):
+ """Setup kvm environment and start qemu"""
+ procs = []
+ testdir = teuthology.get_testdir(ctx)
+ for client, client_config in config.iteritems():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, client=client)
+ remote.run(
+ args=[
+ 'mkdir', log_dir, run.Raw('&&'),
+ 'sudo', 'modprobe', 'kvm',
+ ]
+ )
+
+ # make an nfs mount to use for logging and to
+ # allow the test to tell teuthology the test's outcome
+ _setup_nfs_mount(remote, client, log_dir)
+
+ base_file = '{tdir}/qemu/base.{client}.qcow2'.format(
+ tdir=testdir,
+ client=client
+ )
+ # Hack to make sure /dev/kvm permissions are set correctly
+ # See http://tracker.ceph.com/issues/17977 and
+ # https://bugzilla.redhat.com/show_bug.cgi?id=1333159
+ remote.run(args='sudo udevadm control --reload')
+ remote.run(args='sudo udevadm trigger /dev/kvm')
+ remote.run(args='ls -l /dev/kvm')
+
+ qemu_cmd = 'qemu-system-x86_64'
+ if remote.os.package_type == "rpm":
+ qemu_cmd = "/usr/libexec/qemu-kvm"
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'daemon-helper',
+ 'term',
+ qemu_cmd, '-enable-kvm', '-nographic',
+ '-m', str(client_config.get('memory', DEFAULT_MEM)),
+ # base OS device
+ '-drive',
+ 'file={base},format=qcow2,if=virtio'.format(base=base_file),
+ # cd holding metadata for cloud-init
+ '-cdrom', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+ ]
+
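+ # choose a qemu cache mode matching the librbd cache settings:
+ # writeback if the rbd cache may hold dirty data, writethrough if it
+ # may not, and none when the rbd cache is disabled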
+ cachemode = 'none'
+ ceph_config = ctx.ceph['ceph'].conf.get('global', {})
+ ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
+ ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
+ if ceph_config.get('rbd cache'):
+ if ceph_config.get('rbd cache max dirty', 1) > 0:
+ cachemode = 'writeback'
+ else:
+ cachemode = 'writethrough'
+
+ clone = client_config.get('clone', False)
+ for i in xrange(client_config.get('num_rbd', DEFAULT_NUM_RBD)):
+ suffix = '-clone' if clone else ''
+ args.extend([
+ '-drive',
+ 'file=rbd:rbd/{img}:id={id},format=raw,if=virtio,cache={cachemode}'.format(
+ img='{client}.{num}{suffix}'.format(client=client, num=i,
+ suffix=suffix),
+ id=client[len('client.'):],
+ cachemode=cachemode,
+ ),
+ ])
+
+ log.info('starting qemu...')
+ procs.append(
+ remote.run(
+ args=args,
+ logger=log.getChild(client),
+ stdin=run.PIPE,
+ wait=False,
+ )
+ )
+
+ try:
+ yield
+ finally:
+ log.info('waiting for qemu tests to finish...')
+ run.wait(procs)
+
+ log.debug('checking that qemu tests succeeded...')
+ for client in config.iterkeys():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ # teardown nfs mount
+ _teardown_nfs_mount(remote, client)
+ # check for test status
+ remote.run(
+ args=[
+ 'test', '-f',
+ '{tdir}/archive/qemu/{client}/success'.format(
+ tdir=testdir,
+ client=client
+ ),
+ ],
+ )
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run a test inside of QEMU on top of rbd. Only one test
+ is supported per client.
+
+ For example, you can specify which clients to run on::
+
+ tasks:
+ - ceph:
+ - qemu:
+ client.0:
+ test: http://ceph.com/qa/test.sh
+ client.1:
+ test: http://ceph.com/qa/test2.sh
+
+ Or use the same settings on all clients:
+
+ tasks:
+ - ceph:
+ - qemu:
+ all:
+ test: http://ceph.com/qa/test.sh
+
+ For tests that don't need a filesystem, set type to block::
+
+ tasks:
+ - ceph:
+ - qemu:
+ client.0:
+ test: http://ceph.com/qa/test.sh
+ type: block
+
+ The test should be configured to run on /dev/vdb and later
+ devices.
+
+ If you want to run a test that uses more than one rbd image,
+ specify how many images to use::
+
+ tasks:
+ - ceph:
+ - qemu:
+ client.0:
+ test: http://ceph.com/qa/test.sh
+ type: block
+ num_rbd: 2
+
+ You can set the amount of memory the VM has (the default is 4096 MB, per DEFAULT_MEM)::
+
+ tasks:
+ - ceph:
+ - qemu:
+ client.0:
+ test: http://ceph.com/qa/test.sh
+ memory: 512 # megabytes
+
+ If you want to run a test against a cloned rbd image, set clone to true::
+
+ tasks:
+ - ceph:
+ - qemu:
+ client.0:
+ test: http://ceph.com/qa/test.sh
+ clone: true
+ """
+ assert isinstance(config, dict), \
+ "task qemu only supports a dictionary for configuration"
+
+ config = teuthology.replace_all_with_clients(ctx.cluster, config)
+
+ managers = []
+ create_images(ctx=ctx, config=config, managers=managers)
+ managers.extend([
+ lambda: create_dirs(ctx=ctx, config=config),
+ lambda: generate_iso(ctx=ctx, config=config),
+ lambda: download_image(ctx=ctx, config=config),
+ ])
+ create_clones(ctx=ctx, config=config, managers=managers)
+ managers.append(
+ lambda: run_qemu(ctx=ctx, config=config),
+ )
+
+ with contextutil.nested(*managers):
+ yield
--- /dev/null
+"""
+RadosModel-based integration tests
+"""
+import contextlib
+import logging
+import gevent
+from teuthology import misc as teuthology
+
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run RadosModel-based integration tests.
+
+ The config should be as follows::
+
+ rados:
+ clients: [client list]
+ ops: <number of ops>
+ objects: <number of objects to use>
+ max_in_flight: <max number of operations in flight>
+ object_size: <size of objects in bytes>
+ min_stride_size: <minimum write stride size in bytes>
+ max_stride_size: <maximum write stride size in bytes>
+ op_weights: <dictionary mapping operation type to integer weight>
+ runs: <number of times to run> - the pool is remade between runs
+ ec_pool: use an ec pool
+ erasure_code_profile: profile to use with the erasure coded pool
+ pool_snaps: use pool snapshots instead of selfmanaged snapshots
+ write_fadvise_dontneed: write with LIBRADOS_OP_FLAG_FADVISE_DONTNEED set.
+ This hints that the data will not be accessed in the near
+ future, so the OSD backend need not keep it in cache.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - rados:
+ clients: [client.0]
+ ops: 1000
+ max_seconds: 0 # 0 for no limit
+ objects: 25
+ max_in_flight: 16
+ object_size: 4000000
+ min_stride_size: 1024
+ max_stride_size: 4096
+ op_weights:
+ read: 20
+ write: 10
+ delete: 2
+ snap_create: 3
+ rollback: 2
+ snap_remove: 0
+ ec_pool: create an ec pool, defaults to False
+ erasure_code_use_hacky_overwrites: use the experimental
+ whitebox-testing overwrites mode,
+ defaults to False
+ erasure_code_profile:
+ name: teuthologyprofile
+ k: 2
+ m: 1
+ ruleset-failure-domain: osd
+ pool_snaps: true
+ write_fadvise_dontneed: true
+ runs: 10
+ - interactive:
+
+ Optionally, you can provide the pool name to run against:
+
+ tasks:
+ - ceph:
+ - exec:
+ client.0:
+ - ceph osd pool create foo
+ - rados:
+ clients: [client.0]
+ pools: [foo]
+ ...
+
+ Alternatively, you can provide a pool prefix::
+
+ tasks:
+ - ceph:
+ - exec:
+ client.0:
+ - ceph osd pool create foo.client.0
+ - rados:
+ clients: [client.0]
+ pool_prefix: foo
+ ...
+
+ The tests are run asynchronously; they are not complete when the task
+ returns. For instance::
+
+ - rados:
+ clients: [client.0]
+ pools: [ecbase]
+ ops: 4000
+ objects: 500
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
+ - print: "**** done rados ec-cache-agent (part 2)"
+
+ will run the print task immediately after the rados task begins, not
+ after it completes. To make the rados task a blocking / sequential
+ task, use::
+
+ - sequential:
+ - rados:
+ clients: [client.0]
+ pools: [ecbase]
+ ops: 4000
+ objects: 500
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
+ - print: "**** done rados ec-cache-agent (part 2)"
+
+ """
+ log.info('Beginning rados...')
+ assert isinstance(config, dict), \
+ "please list clients to run on"
+
+ object_size = int(config.get('object_size', 4000000))
+ op_weights = config.get('op_weights', {})
+ testdir = teuthology.get_testdir(ctx)
+ args = [
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'ceph_test_rados']
+ if config.get('ec_pool', False):
+ args.extend(['--no-omap'])
+ if config.get('erasure_code_use_hacky_overwrites', False):
+ args.extend(['--no-sparse'])
+ else:
+ args.extend(['--ec-pool'])
+ if config.get('write_fadvise_dontneed', False):
+ args.extend(['--write-fadvise-dontneed'])
+ if config.get('pool_snaps', False):
+ args.extend(['--pool-snaps'])
+ args.extend([
+ '--max-ops', str(config.get('ops', 10000)),
+ '--objects', str(config.get('objects', 500)),
+ '--max-in-flight', str(config.get('max_in_flight', 16)),
+ '--size', str(object_size),
+ '--min-stride-size', str(config.get('min_stride_size', object_size / 10)),
+ '--max-stride-size', str(config.get('max_stride_size', object_size / 5)),
+ '--max-seconds', str(config.get('max_seconds', 0))
+ ])
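+
+ # For illustration only (mirrors the docstring example, not a real run):
+ # with ops: 1000, objects: 25, max_in_flight: 16, object_size: 4000000,
+ # min_stride_size: 1024 and max_stride_size: 4096, the portion assembled
+ # above reads
+ #   ceph_test_rados --max-ops 1000 --objects 25 --max-in-flight 16 \
+ #       --size 4000000 --min-stride-size 1024 --max-stride-size 4096 \
+ #       --max-seconds 0
+ # with the '--op <type> <weight>' pairs appended further below.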
+
+ weights = {}
+ weights['read'] = 100
+ weights['write'] = 100
+ weights['delete'] = 10
+ # Parallel of the op_types in test/osd/TestRados.cc
+ for field in [
+ # read handled above
+ # write handled above
+ # delete handled above
+ "snap_create",
+ "snap_remove",
+ "rollback",
+ "setattr",
+ "rmattr",
+ "watch",
+ "copy_from",
+ "hit_set_list",
+ "is_dirty",
+ "undirty",
+ "cache_flush",
+ "cache_try_flush",
+ "cache_evict",
+ "append",
+ "write",
+ "read",
+ "delete"
+ ]:
+ if field in op_weights:
+ weights[field] = op_weights[field]
+
+ if config.get('write_append_excl', True):
+ if 'write' in weights:
+ weights['write'] = weights['write'] / 2
+ weights['write_excl'] = weights['write']
+
+ if 'append' in weights:
+ weights['append'] = weights['append'] / 2
+ weights['append_excl'] = weights['append']
+
+ for op, weight in weights.iteritems():
+ args.extend([
+ '--op', op, str(weight)
+ ])
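+
+ # Note (illustrative, using the docstring example): because
+ # write_append_excl defaults to True, an op_weights entry of write: 10 is
+ # halved above into write: 5 plus write_excl: 5, so the loop emits
+ # '--op write 5 --op write_excl 5' (append is mirrored the same way).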
+
+
+ def thread():
+ """Thread spawned by gevent"""
+ clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ log.info('clients are %s' % clients)
+ manager = ctx.managers['ceph']
+ if config.get('ec_pool', False):
+ profile = config.get('erasure_code_profile', {})
+ profile_name = profile.get('name', 'teuthologyprofile')
+ manager.create_erasure_code_profile(profile_name, profile)
+ else:
+ profile_name = None
+ for i in range(int(config.get('runs', '1'))):
+ log.info("starting run %s out of %s", str(i), config.get('runs', '1'))
+ tests = {}
+ existing_pools = config.get('pools', [])
+ created_pools = []
+ for role in config.get('clients', clients):
+ assert isinstance(role, basestring)
+ PREFIX = 'client.'
+ assert role.startswith(PREFIX)
+ id_ = role[len(PREFIX):]
+
+ pool = config.get('pool', None)
+ if not pool and existing_pools:
+ pool = existing_pools.pop()
+ else:
+ pool = manager.create_pool_with_unique_name(
+ erasure_code_profile_name=profile_name,
+ erasure_code_use_hacky_overwrites=
+ config.get('erasure_code_use_hacky_overwrites', False)
+ )
+ created_pools.append(pool)
+ if config.get('fast_read', False):
+ manager.raw_cluster_cmd(
+ 'osd', 'pool', 'set', pool, 'fast_read', 'true')
+
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+ proc = remote.run(
+ args=["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args +
+ ["--pool", pool],
+ logger=log.getChild("rados.{id}".format(id=id_)),
+ stdin=run.PIPE,
+ wait=False
+ )
+ tests[id_] = proc
+ run.wait(tests.itervalues())
+
+ for pool in created_pools:
+ manager.remove_pool(pool)
+
+ running = gevent.spawn(thread)
+
+ try:
+ yield
+ finally:
+ log.info('joining rados')
+ running.get()
--- /dev/null
+"""
+Rados benchmarking
+"""
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run radosbench
+
+ The config should be as follows::
+
+ radosbench:
+ clients: [client list]
+ time: <seconds to run>
+ pool: <pool to use>
+ size: write size to use
+ unique_pool: use a unique pool, defaults to False
+ ec_pool: create an ec pool, defaults to False
+ create_pool: create pool, defaults to False
+ erasure_code_profile:
+ name: teuthologyprofile
+ k: 2
+ m: 1
+ ruleset-failure-domain: osd
+ cleanup: false (defaults to true)
+ For example::
+
+ tasks:
+ - ceph:
+ - radosbench:
+ clients: [client.0]
+ time: 360
+ - interactive:
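+
+ A sketch (illustrative values; anything not shown keeps its default) that
+ benchmarks a freshly created erasure-coded pool and keeps the written
+ objects around afterwards::
+
+ tasks:
+ - ceph:
+ - radosbench:
+ clients: [client.0]
+ time: 360
+ create_pool: true
+ ec_pool: true
+ erasure_code_profile:
+ name: teuthologyprofile
+ k: 2
+ m: 1
+ ruleset-failure-domain: osd
+ cleanup: false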
+ """
+ log.info('Beginning radosbench...')
+ assert isinstance(config, dict), \
+ "please list clients to run on"
+ radosbench = {}
+
+ testdir = teuthology.get_testdir(ctx)
+ manager = ctx.managers['ceph']
+
+ create_pool = config.get('create_pool', True)
+ for role in config.get('clients', ['client.0']):
+ assert isinstance(role, basestring)
+ PREFIX = 'client.'
+ assert role.startswith(PREFIX)
+ id_ = role[len(PREFIX):]
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+
+ if config.get('ec_pool', False):
+ profile = config.get('erasure_code_profile', {})
+ profile_name = profile.get('name', 'teuthologyprofile')
+ manager.create_erasure_code_profile(profile_name, profile)
+ else:
+ profile_name = None
+
+ cleanup = []
+ if not config.get('cleanup', True):
+ cleanup = ['--no-cleanup']
+
+ pool = config.get('pool', 'data')
+ if create_pool:
+ if pool != 'data':
+ manager.create_pool(pool, erasure_code_profile_name=profile_name)
+ else:
+ pool = manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name)
+
+ proc = remote.run(
+ args=[
+ "/bin/sh", "-c",
+ " ".join(['adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage',
+ 'rados',
+ '--no-log-to-stderr',
+ '--name', role,
+ '-b', str(config.get('size', 4<<20)),
+ '-p' , pool,
+ 'bench', str(config.get('time', 360)), 'write',
+ ] + cleanup).format(tdir=testdir),
+ ],
+ logger=log.getChild('radosbench.{id}'.format(id=id_)),
+ stdin=run.PIPE,
+ wait=False
+ )
+ radosbench[id_] = proc
+
+ try:
+ yield
+ finally:
+ timeout = config.get('time', 360) * 5 + 180
+ log.info('joining radosbench (timing out after %ss)', timeout)
+ run.wait(radosbench.itervalues(), timeout=timeout)
+
+ if pool != 'data' and create_pool:
+ manager.remove_pool(pool)
--- /dev/null
+"""
+Rados benchmarking sweep
+"""
+import contextlib
+import logging
+import re
+
+from cStringIO import StringIO
+from itertools import product
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Execute a radosbench parameter sweep
+
+ Puts radosbench in a loop, taking values from the given config at each
+ iteration. If given, the min and max values below create a range, e.g.
+ min_num_replicas=1 and max_num_replicas=3 implies executing with 1-3 replicas.
+
+ Parameters:
+
+ clients: [client list]
+ time: seconds to run (default=120)
+ sizes: [list of object sizes] (default=[4M])
+ mode: <write|read|seq> (default=write)
+ repetitions: execute the same configuration multiple times (default=1)
+ min_num_replicas: minimum number of replicas to use (default = 3)
+ max_num_replicas: maximum number of replicas to use (default = 3)
+ min_num_osds: the minimum number of OSDs in a pool (default=all)
+ max_num_osds: the maximum number of OSDs in a pool (default=all)
+ file: name of CSV-formatted output file (default='radosbench.csv')
+ columns: columns to include (default=all)
+ - rep: execution number (takes values from 'repetitions')
+ - num_osd: number of osds for pool
+ - num_replica: number of replicas
+ - avg_throughput: throughput
+ - avg_latency: latency
+ - stdev_throughput:
+ - stdev_latency:
+
+ Example::
+
+ - radosbenchsweep:
+ columns: [rep, num_osd, num_replica, avg_throughput, stdev_throughput]
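+
+ A fuller sketch (illustrative values only; unspecified parameters keep
+ the defaults listed above)::
+
+ - radosbenchsweep:
+ clients: [client.0]
+ time: 120
+ sizes: [4194304]
+ repetitions: 2
+ min_num_replicas: 1
+ max_num_replicas: 3
+ file: sweep.csv
+ columns: [rep, num_osd, num_replica, avg_throughput, stdev_throughput]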
+ """
+ log.info('Beginning radosbenchsweep...')
+ assert isinstance(config, dict), 'expecting dictionary for configuration'
+
+ # get and validate config values
+ # {
+
+ # only one client supported for now
+ if len(config.get('clients', [])) != 1:
+ raise Exception("Only one client can be specified")
+
+ # only write mode
+ if config.get('mode', 'write') != 'write':
+ raise Exception("Only 'write' mode supported for now.")
+
+ # OSDs
+ total_osds_in_cluster = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+ min_num_osds = config.get('min_num_osds', total_osds_in_cluster)
+ max_num_osds = config.get('max_num_osds', total_osds_in_cluster)
+
+ if max_num_osds > total_osds_in_cluster:
+ raise Exception('max_num_osds cannot be greater than total in cluster')
+ if min_num_osds < 1:
+ raise Exception('min_num_osds cannot be less than 1')
+ if min_num_osds > max_num_osds:
+ raise Exception('min_num_osds cannot be greater than max_num_osd')
+ osds = range(0, (total_osds_in_cluster + 1))
+
+ # replicas
+ min_num_replicas = config.get('min_num_replicas', 3)
+ max_num_replicas = config.get('max_num_replicas', 3)
+
+ if min_num_replicas < 1:
+ raise Exception('min_num_replicas cannot be less than 1')
+ if min_num_replicas > max_num_replicas:
+ raise Exception('min_num_replicas cannot be greater than max_replicas')
+ if max_num_replicas > max_num_osds:
+ raise Exception('max_num_replicas cannot be greater than max_num_osds')
+ replicas = range(min_num_replicas, (max_num_replicas + 1))
+
+ # object size
+ # documented above as 'sizes'; accept 'size' as well
+ sizes = config.get('sizes', config.get('size', [4 << 20]))
+
+ # repetitions
+ reps = range(config.get('repetitions', 1))
+
+ # file
+ fname = config.get('file', 'radosbench.csv')
+ f = open('{}/{}'.format(ctx.archive, fname), 'w')
+ f.write(get_csv_header(config) + '\n')
+ # }
+
+ # set default pools size=1 to avoid 'unhealthy' issues
+ ctx.manager.set_pool_property('data', 'size', 1)
+ ctx.manager.set_pool_property('metadata', 'size', 1)
+ ctx.manager.set_pool_property('rbd', 'size', 1)
+
+ current_osds_out = 0
+
+ # sweep through all parameters
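+ # Illustration (hypothetical 3-OSD cluster, sizes=[4M], replicas=[2, 3],
+ # a single repetition): product() walks (osds_out, size, replica, rep)
+ # through (0, 4M, 2, 0), (0, 4M, 3, 0), (1, 4M, 2, 0), (1, 4M, 3, 0), ...
+ # and the loop below stops once osds_in drops to 0.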
+ for osds_out, size, replica, rep in product(osds, sizes, replicas, reps):
+
+ osds_in = total_osds_in_cluster - osds_out
+
+ if osds_in == 0:
+ # we're done
+ break
+
+ if current_osds_out != osds_out:
+ # take an osd out
+ ctx.manager.raw_cluster_cmd(
+ 'osd', 'reweight', str(osds_out-1), '0.0')
+ wait_until_healthy(ctx, config)
+ current_osds_out = osds_out
+
+ if osds_in not in range(min_num_osds, (max_num_osds + 1)):
+ # no need to execute with a number of osds that wasn't requested
+ continue
+
+ if osds_in < replica:
+ # cannot execute with more replicas than available osds
+ continue
+
+ run_radosbench(ctx, config, f, osds_in, size, replica, rep)
+
+ f.close()
+
+ yield
+
+
+def get_csv_header(conf):
+ all_columns = [
+ 'rep', 'num_osd', 'num_replica', 'avg_throughput',
+ 'avg_latency', 'stdev_throughput', 'stdev_latency'
+ ]
+ given_columns = conf.get('columns', None)
+ if given_columns and len(given_columns) != 0:
+ for column in given_columns:
+ if column not in all_columns:
+ raise Exception('Unknown column ' + column)
+ return ','.join(conf['columns'])
+ else:
+ conf['columns'] = all_columns
+ return ','.join(all_columns)
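+
+ # Illustration: get_csv_header({'columns': ['rep', 'num_osd']}) returns
+ # 'rep,num_osd'; with no 'columns' key it fills conf['columns'] with all
+ # known columns and returns them joined.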
+
+
+def run_radosbench(ctx, config, f, num_osds, size, replica, rep):
+ pool = ctx.manager.create_pool_with_unique_name()
+
+ ctx.manager.set_pool_property(pool, 'size', replica)
+
+ wait_until_healthy(ctx, config)
+
+ log.info('Executing with parameters: ')
+ log.info(' num_osd =' + str(num_osds))
+ log.info(' size =' + str(size))
+ log.info(' num_replicas =' + str(replica))
+ log.info(' repetition =' + str(rep))
+
+ for role in config.get('clients', ['client.0']):
+ assert isinstance(role, basestring)
+ PREFIX = 'client.'
+ assert role.startswith(PREFIX)
+ id_ = role[len(PREFIX):]
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+
+ proc = remote.run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{}/archive/coverage'.format(teuthology.get_testdir(ctx)),
+ 'rados',
+ '--no-log-to-stderr',
+ '--name', role,
+ '-b', str(size),
+ '-p', pool,
+ 'bench', str(config.get('time', 120)), 'write',
+ ],
+ logger=log.getChild('radosbench.{id}'.format(id=id_)),
+ stdin=run.PIPE,
+ stdout=StringIO(),
+ wait=False
+ )
+
+ # parse output to get summary and format it as CSV
+ proc.wait()
+ out = proc.stdout.getvalue()
+ all_values = {
+ 'stdev_throughput': re.sub(r'Stddev Bandwidth: ', '', re.search(
+ r'Stddev Bandwidth:.*', out).group(0)),
+ 'stdev_latency': re.sub(r'Stddev Latency: ', '', re.search(
+ r'Stddev Latency:.*', out).group(0)),
+ 'avg_throughput': re.sub(r'Bandwidth \(MB/sec\): ', '', re.search(
+ r'Bandwidth \(MB/sec\):.*', out).group(0)),
+ 'avg_latency': re.sub(r'Average Latency: ', '', re.search(
+ r'Average Latency:.*', out).group(0)),
+ 'rep': str(rep),
+ 'num_osd': str(num_osds),
+ 'num_replica': str(replica)
+ }
+ values_to_write = []
+ for column in config['columns']:
+ values_to_write.extend([all_values[column]])
+ f.write(','.join(values_to_write) + '\n')
+
+ ctx.manager.remove_pool(pool)
+
+
+def wait_until_healthy(ctx, config):
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ teuthology.wait_until_healthy(ctx, mon_remote)
--- /dev/null
+"""
+Rgw admin testing against a running instance
+"""
+# The test cases in this file have been annotated for inventory.
+# To extract the inventory (in csv format) use the command:
+#
+# grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
+#
+
+import copy
+import json
+import logging
+import time
+import datetime
+
+from cStringIO import StringIO
+
+import boto.exception
+import boto.s3.connection
+import boto.s3.acl
+
+import httplib2
+
+import util.rgw as rgw_utils
+
+from teuthology import misc as teuthology
+from util.rgw import rgwadmin, get_user_summary, get_user_successful_ops
+
+log = logging.getLogger(__name__)
+
+def create_presigned_url(conn, method, bucket_name, key_name, expiration):
+ return conn.generate_url(expires_in=expiration,
+ method=method,
+ bucket=bucket_name,
+ key=key_name,
+ query_auth=True,
+ )
+
+def send_raw_http_request(conn, method, bucket_name, key_name, follow_redirects = False):
+ url = create_presigned_url(conn, method, bucket_name, key_name, 3600)
+ print url
+ h = httplib2.Http()
+ h.follow_redirects = follow_redirects
+ return h.request(url, method)
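+
+ # Usage sketch (mirrors the call made in the multi-region tests below):
+ #   r, content = send_raw_http_request(conn, 'DELETE', bucket_name, '',
+ #                                      follow_redirects=False)
+ # r.status then carries the HTTP status code (e.g. 301 when redirected to
+ # the master region).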
+
+
+def get_acl(key):
+ """
+ Helper function to get the xml acl from a key, ensuring that the xml
+ version tag is removed from the acl response
+ """
+ raw_acl = key.get_xml_acl()
+
+ def remove_version(string):
+ return string.split(
+ '<?xml version="1.0" encoding="UTF-8"?>'
+ )[-1]
+
+ def remove_newlines(string):
+ return string.strip('\n')
+
+ return remove_version(
+ remove_newlines(raw_acl)
+ )
+
+
+def task(ctx, config):
+ """
+ Test radosgw-admin functionality against a running rgw instance.
+ """
+ global log
+ assert config is None or isinstance(config, list) \
+ or isinstance(config, dict), \
+ "task s3tests only supports a list or dictionary for configuration"
+ all_clients = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ if config is None:
+ config = all_clients
+ if isinstance(config, list):
+ config = dict.fromkeys(config)
+ clients = config.keys()
+
+ multi_region_run = rgw_utils.multi_region_enabled(ctx)
+
+ client = clients[0]  # default choice; multi-region code may overwrite this
+ if multi_region_run:
+ client = rgw_utils.get_master_client(ctx, clients)
+
+ # once the client is chosen, pull the host name and assigned port out of
+ # the role_endpoints that were assigned by the rgw task
+ (remote_host, remote_port) = ctx.rgw.role_endpoints[client]
+
+ realm = ctx.rgw.realm
+ log.debug('radosgw-admin: realm %r', realm)
+
+ ##
+ user1='foo'
+ user2='fud'
+ subuser1='foo:foo1'
+ subuser2='foo:foo2'
+ display_name1='Foo'
+ display_name2='Fud'
+ email='foo@foo.com'
+ email2='bar@bar.com'
+ access_key='9te6NH5mcdcq0Tc5i8i1'
+ secret_key='Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu'
+ access_key2='p5YnriCv1nAtykxBrupQ'
+ secret_key2='Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh'
+ swift_secret1='gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL'
+ swift_secret2='ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy'
+
+ bucket_name='myfoo'
+ bucket_name2='mybar'
+
+ # connect to rgw
+ connection = boto.s3.connection.S3Connection(
+ aws_access_key_id=access_key,
+ aws_secret_access_key=secret_key,
+ is_secure=False,
+ port=remote_port,
+ host=remote_host,
+ calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+ )
+ connection2 = boto.s3.connection.S3Connection(
+ aws_access_key_id=access_key2,
+ aws_secret_access_key=secret_key2,
+ is_secure=False,
+ port=remote_port,
+ host=remote_host,
+ calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+ )
+
+ # legend (test cases can be easily grep-ed out)
+ # TESTCASE 'testname','object','method','operation','assertion'
+ # TESTCASE 'info-nosuch','user','info','non-existent user','fails'
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
+ assert err
+
+ # TESTCASE 'create-ok','user','create','w/all valid info','succeeds'
+ (err, out) = rgwadmin(ctx, client, [
+ 'user', 'create',
+ '--uid', user1,
+ '--display-name', display_name1,
+ '--email', email,
+ '--access-key', access_key,
+ '--secret', secret_key,
+ '--max-buckets', '4'
+ ],
+ check_status=True)
+
+ # TESTCASE 'duplicate email','user','create','existing user email','fails'
+ (err, out) = rgwadmin(ctx, client, [
+ 'user', 'create',
+ '--uid', user2,
+ '--display-name', display_name2,
+ '--email', email,
+ ])
+ assert err
+
+ # TESTCASE 'info-existing','user','info','existing user','returns correct info'
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+ assert out['user_id'] == user1
+ assert out['email'] == email
+ assert out['display_name'] == display_name1
+ assert len(out['keys']) == 1
+ assert out['keys'][0]['access_key'] == access_key
+ assert out['keys'][0]['secret_key'] == secret_key
+ assert not out['suspended']
+
+ # this whole block should only be run if regions have been configured
+ if multi_region_run:
+ rgw_utils.radosgw_agent_sync_all(ctx)
+ # post-sync, validate that user1 exists on the sync destination host
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ dest_client = c_config['dest']
+ (err, out) = rgwadmin(ctx, dest_client, ['metadata', 'list', 'user'])
+ (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user1], check_status=True)
+ assert out['user_id'] == user1
+ assert out['email'] == email
+ assert out['display_name'] == display_name1
+ assert len(out['keys']) == 1
+ assert out['keys'][0]['access_key'] == access_key
+ assert out['keys'][0]['secret_key'] == secret_key
+ assert not out['suspended']
+
+ # compare the metadata between different regions, make sure it matches
+ log.debug('compare the metadata between different regions, make sure it matches')
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ (err1, out1) = rgwadmin(ctx, source_client,
+ ['metadata', 'get', 'user:{uid}'.format(uid=user1)], check_status=True)
+ (err2, out2) = rgwadmin(ctx, dest_client,
+ ['metadata', 'get', 'user:{uid}'.format(uid=user1)], check_status=True)
+ assert out1 == out2
+
+ # suspend a user on the master, then check the status on the destination
+ log.debug('suspend a user on the master, then check the status on the destination')
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ (err, out) = rgwadmin(ctx, source_client, ['user', 'suspend', '--uid', user1])
+ rgw_utils.radosgw_agent_sync_all(ctx)
+ (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user1], check_status=True)
+ assert out['suspended']
+
+ # delete a user on the master, then check that it's gone on the destination
+ log.debug('delete a user on the master, then check that it\'s gone on the destination')
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ (err, out) = rgwadmin(ctx, source_client, ['user', 'rm', '--uid', user1], check_status=True)
+ rgw_utils.radosgw_agent_sync_all(ctx)
+ (err, out) = rgwadmin(ctx, source_client, ['user', 'info', '--uid', user1])
+ assert out is None
+ (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user1])
+ assert out is None
+
+ # then recreate it so later tests pass
+ (err, out) = rgwadmin(ctx, client, [
+ 'user', 'create',
+ '--uid', user1,
+ '--display-name', display_name1,
+ '--email', email,
+ '--access-key', access_key,
+ '--secret', secret_key,
+ '--max-buckets', '4'
+ ],
+ check_status=True)
+
+ # now do the multi-region bucket tests
+ log.debug('now do the multi-region bucket tests')
+
+ # Create a second user for the following tests
+ log.debug('Create a second user for the following tests')
+ (err, out) = rgwadmin(ctx, client, [
+ 'user', 'create',
+ '--uid', user2,
+ '--display-name', display_name2,
+ '--email', email2,
+ '--access-key', access_key2,
+ '--secret', secret_key2,
+ '--max-buckets', '4'
+ ],
+ check_status=True)
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user2], check_status=True)
+ assert out is not None
+
+ # create a bucket and do a sync
+ log.debug('create a bucket and do a sync')
+ bucket = connection.create_bucket(bucket_name2)
+ rgw_utils.radosgw_agent_sync_all(ctx)
+
+ # compare the metadata for the bucket between different regions, make sure it matches
+ log.debug('compare the metadata for the bucket between different regions, make sure it matches')
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ (err1, out1) = rgwadmin(ctx, source_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+ (err2, out2) = rgwadmin(ctx, dest_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+ log.debug('metadata 1 %r', out1)
+ log.debug('metadata 2 %r', out2)
+ assert out1 == out2
+
+ # get the bucket.instance info and compare that
+ src_bucket_id = out1['data']['bucket']['bucket_id']
+ dest_bucket_id = out2['data']['bucket']['bucket_id']
+ (err1, out1) = rgwadmin(ctx, source_client, ['metadata', 'get',
+ 'bucket.instance:{bucket_name}:{bucket_instance}'.format(
+ bucket_name=bucket_name2,bucket_instance=src_bucket_id)],
+ check_status=True)
+ (err2, out2) = rgwadmin(ctx, dest_client, ['metadata', 'get',
+ 'bucket.instance:{bucket_name}:{bucket_instance}'.format(
+ bucket_name=bucket_name2,bucket_instance=dest_bucket_id)],
+ check_status=True)
+ del out1['data']['bucket_info']['bucket']['pool']
+ del out1['data']['bucket_info']['bucket']['index_pool']
+ del out1['data']['bucket_info']['bucket']['data_extra_pool']
+ del out2['data']['bucket_info']['bucket']['pool']
+ del out2['data']['bucket_info']['bucket']['index_pool']
+ del out2['data']['bucket_info']['bucket']['data_extra_pool']
+ assert out1 == out2
+
+ same_region = 0
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+
+ source_region = rgw_utils.region_for_client(ctx, source_client)
+ dest_region = rgw_utils.region_for_client(ctx, dest_client)
+
+ # 301 is only returned for requests to something in a different region
+ if source_region == dest_region:
+ log.debug('301 is only returned for requests to something in a different region')
+ same_region += 1
+ continue
+
+ # Attempt to create a new connection with user1 to the destination RGW
+ log.debug('Attempt to create a new connection with user1 to the destination RGW')
+ # and use that to attempt a delete (that should fail)
+
+ (dest_remote_host, dest_remote_port) = ctx.rgw.role_endpoints[dest_client]
+ connection_dest = boto.s3.connection.S3Connection(
+ aws_access_key_id=access_key,
+ aws_secret_access_key=secret_key,
+ is_secure=False,
+ port=dest_remote_port,
+ host=dest_remote_host,
+ calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+ )
+
+ # this should fail
+ r, content = send_raw_http_request(connection_dest, 'DELETE', bucket_name2, '', follow_redirects = False)
+ assert r.status == 301
+
+ # now delete the bucket on the source RGW and do another sync
+ log.debug('now delete the bucket on the source RGW and do another sync')
+ bucket.delete()
+ rgw_utils.radosgw_agent_sync_all(ctx)
+
+ if same_region == len(ctx.radosgw_agent.config):
+ bucket.delete()
+ rgw_utils.radosgw_agent_sync_all(ctx)
+
+ # make sure that the bucket no longer exists in either region
+ log.debug('make sure that the bucket no longer exists in either region')
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ (err1, out1) = rgwadmin(ctx, source_client, ['metadata', 'get',
+ 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)])
+ (err2, out2) = rgwadmin(ctx, dest_client, ['metadata', 'get',
+ 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)])
+ # Both of the previous calls should have errors due to requesting
+ # metadata for non-existent buckets
+ assert err1
+ assert err2
+
+ # create a bucket and then sync it
+ log.debug('create a bucket and then sync it')
+ bucket = connection.create_bucket(bucket_name2)
+ rgw_utils.radosgw_agent_sync_all(ctx)
+
+ # compare the metadata for the bucket between different regions, make sure it matches
+ log.debug('compare the metadata for the bucket between different regions, make sure it matches')
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ (err1, out1) = rgwadmin(ctx, source_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+ (err2, out2) = rgwadmin(ctx, dest_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+ assert out1 == out2
+
+ # Now delete the bucket and recreate it with a different user
+ log.debug('Now delete the bucket and recreate it with a different user')
+ # within the same window of time and then sync.
+ bucket.delete()
+ bucket = connection2.create_bucket(bucket_name2)
+ rgw_utils.radosgw_agent_sync_all(ctx)
+
+ # compare the metadata for the bucket between different regions, make sure it matches
+ log.debug('compare the metadata for the bucket between different regions, make sure it matches')
+ # user2 should own the bucket in both regions
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ (err1, out1) = rgwadmin(ctx, source_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+ (err2, out2) = rgwadmin(ctx, dest_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+ assert out1 == out2
+ assert out1['data']['owner'] == user2
+ assert out1['data']['owner'] != user1
+
+ # now we're going to use this bucket to test meta-data update propagation
+ log.debug('now we\'re going to use this bucket to test meta-data update propagation')
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+
+ # get the metadata so we can tweak it
+ log.debug('get the metadata so we can tweak it')
+ (err, orig_data) = rgwadmin(ctx, source_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+
+ # manually edit mtime for this bucket to be 300 seconds in the past
+ log.debug('manually edit mtime for this bucket to be 300 seconds in the past')
+ new_data = copy.deepcopy(orig_data)
+ mtime = datetime.datetime.strptime(orig_data['mtime'], "%Y-%m-%d %H:%M:%S.%fZ") - datetime.timedelta(seconds=300)
+ new_data['mtime'] = unicode(mtime.strftime("%Y-%m-%d %H:%M:%S.%fZ"))
+ log.debug("new mtime ", mtime)
+ assert new_data != orig_data
+ (err, out) = rgwadmin(ctx, source_client,
+ ['metadata', 'put', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ stdin=StringIO(json.dumps(new_data)),
+ check_status=True)
+
+ # get the metadata and make sure that the 'put' worked
+ log.debug('get the metadata and make sure that the \'put\' worked')
+ (err, out) = rgwadmin(ctx, source_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+ assert out == new_data
+
+ # sync to propagate the new metadata
+ log.debug('sync to propagate the new metadata')
+ rgw_utils.radosgw_agent_sync_all(ctx)
+
+ # get the metadata from the dest and compare it to what we just set
+ log.debug('get the metadata from the dest and compare it to what we just set')
+ # and what the source region has.
+ (err1, out1) = rgwadmin(ctx, source_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+ (err2, out2) = rgwadmin(ctx, dest_client,
+ ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
+ check_status=True)
+ # yeah for the transitive property
+ assert out1 == out2
+ assert out1 == new_data
+
+ # now we delete the bucket
+ log.debug('now we delete the bucket')
+ bucket.delete()
+
+ log.debug('sync to propagate the deleted bucket')
+ rgw_utils.radosgw_agent_sync_all(ctx)
+
+ # Delete user2 as later tests do not expect it to exist.
+ # Verify that it is gone on both regions
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ (err, out) = rgwadmin(ctx, source_client,
+ ['user', 'rm', '--uid', user2], check_status=True)
+ rgw_utils.radosgw_agent_sync_all(ctx)
+ # The two 'user info' calls should fail and not return any data
+ # since we just deleted this user.
+ (err, out) = rgwadmin(ctx, source_client, ['user', 'info', '--uid', user2])
+ assert out is None
+ (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user2])
+ assert out is None
+
+ # Test data sync
+
+ # First create a bucket for data sync test purpose
+ bucket = connection.create_bucket(bucket_name + 'data')
+
+ # Create a tiny file and check if in sync
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ if c_config.get('metadata-only'):
+ continue
+
+ for full in (True, False):
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ k = boto.s3.key.Key(bucket)
+ k.key = 'tiny_file'
+ k.set_contents_from_string("123456789")
+ safety_window = rgw_utils.radosgw_data_log_window(ctx, source_client)
+ time.sleep(safety_window)
+ rgw_utils.radosgw_agent_sync_all(ctx, data=True, full=full)
+ (dest_host, dest_port) = ctx.rgw.role_endpoints[dest_client]
+ dest_connection = boto.s3.connection.S3Connection(
+ aws_access_key_id=access_key,
+ aws_secret_access_key=secret_key,
+ is_secure=False,
+ port=dest_port,
+ host=dest_host,
+ calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+ )
+ dest_k = dest_connection.get_bucket(bucket_name + 'data').get_key('tiny_file')
+ assert k.get_contents_as_string() == dest_k.get_contents_as_string()
+
+ # check that deleting it removes it from the dest zone
+ k.delete()
+ time.sleep(safety_window)
+ # full sync doesn't handle deleted objects yet
+ rgw_utils.radosgw_agent_sync_all(ctx, data=True, full=False)
+
+ dest_bucket = dest_connection.get_bucket(bucket_name + 'data')
+ dest_k = dest_bucket.get_key('tiny_file')
+ assert dest_k == None, 'object not deleted from destination zone'
+
+ # finally we delete the bucket
+ bucket.delete()
+
+ bucket = connection.create_bucket(bucket_name + 'data2')
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ if c_config.get('metadata-only'):
+ continue
+
+ for full in (True, False):
+ source_client = c_config['src']
+ dest_client = c_config['dest']
+ (dest_host, dest_port) = ctx.rgw.role_endpoints[dest_client]
+ dest_connection = boto.s3.connection.S3Connection(
+ aws_access_key_id=access_key,
+ aws_secret_access_key=secret_key,
+ is_secure=False,
+ port=dest_port,
+ host=dest_host,
+ calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+ )
+ for i in range(20):
+ k = boto.s3.key.Key(bucket)
+ k.key = 'tiny_file_' + str(i)
+ k.set_contents_from_string(str(i) * 100)
+
+ safety_window = rgw_utils.radosgw_data_log_window(ctx, source_client)
+ time.sleep(safety_window)
+ rgw_utils.radosgw_agent_sync_all(ctx, data=True, full=full)
+
+ for i in range(20):
+ dest_k = dest_connection.get_bucket(bucket_name + 'data2').get_key('tiny_file_' + str(i))
+ assert (str(i) * 100) == dest_k.get_contents_as_string()
+ k = boto.s3.key.Key(bucket)
+ k.key = 'tiny_file_' + str(i)
+ k.delete()
+
+ # check that deleting removes the objects from the dest zone
+ time.sleep(safety_window)
+ # full sync doesn't delete deleted objects yet
+ rgw_utils.radosgw_agent_sync_all(ctx, data=True, full=False)
+
+ for i in range(20):
+ dest_bucket = dest_connection.get_bucket(bucket_name + 'data2')
+ dest_k = dest_bucket.get_key('tiny_file_' + str(i))
+ assert dest_k == None, 'object %d not deleted from destination zone' % i
+ bucket.delete()
+
+ # end of 'if multi_region_run:'
+
+ # TESTCASE 'suspend-ok','user','suspend','active user','succeeds'
+ (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
+ check_status=True)
+
+ # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory'
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+ assert out['suspended']
+
+ # TESTCASE 're-enable','user','enable','suspended user','succeeds'
+ (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1], check_status=True)
+
+ # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended'
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+ assert not out['suspended']
+
+ # TESTCASE 'add-keys','key','create','w/valid info','succeeds'
+ (err, out) = rgwadmin(ctx, client, [
+ 'key', 'create', '--uid', user1,
+ '--access-key', access_key2, '--secret', secret_key2,
+ ], check_status=True)
+
+ # TESTCASE 'info-new-key','user','info','after key addition','returns all keys'
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1],
+ check_status=True)
+ assert len(out['keys']) == 2
+ assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2
+ assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2
+
+ # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed'
+ (err, out) = rgwadmin(ctx, client, [
+ 'key', 'rm', '--uid', user1,
+ '--access-key', access_key2,
+ ], check_status=True)
+ assert len(out['keys']) == 1
+ assert out['keys'][0]['access_key'] == access_key
+ assert out['keys'][0]['secret_key'] == secret_key
+
+ # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
+ subuser_access = 'full'
+ subuser_perm = 'full-control'
+
+ (err, out) = rgwadmin(ctx, client, [
+ 'subuser', 'create', '--subuser', subuser1,
+ '--access', subuser_access
+ ], check_status=True)
+
+ # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
+ (err, out) = rgwadmin(ctx, client, [
+ 'subuser', 'modify', '--subuser', subuser1,
+ '--secret', swift_secret1,
+ '--key-type', 'swift',
+ ], check_status=True)
+
+ # TESTCASE 'subuser-perm-mask', 'subuser', 'info', 'test subuser perm mask durability', 'succeeds'
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
+
+ assert out['subusers'][0]['permissions'] == subuser_perm
+
+ # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys'
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+ assert len(out['swift_keys']) == 1
+ assert out['swift_keys'][0]['user'] == subuser1
+ assert out['swift_keys'][0]['secret_key'] == swift_secret1
+
+ # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds'
+ (err, out) = rgwadmin(ctx, client, [
+ 'subuser', 'create', '--subuser', subuser2,
+ '--secret', swift_secret2,
+ '--key-type', 'swift',
+ ], check_status=True)
+
+ # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys'
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+ assert len(out['swift_keys']) == 2
+ assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2
+ assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2
+
+ # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed'
+ (err, out) = rgwadmin(ctx, client, [
+ 'key', 'rm', '--subuser', subuser1,
+ '--key-type', 'swift',
+ ], check_status=True)
+ assert len(out['swift_keys']) == 1
+
+ # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed'
+ (err, out) = rgwadmin(ctx, client, [
+ 'subuser', 'rm', '--subuser', subuser1,
+ ], check_status=True)
+ assert len(out['subusers']) == 1
+
+ # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subuser and its keys are removed'
+ (err, out) = rgwadmin(ctx, client, [
+ 'subuser', 'rm', '--subuser', subuser2,
+ '--key-type', 'swift', '--purge-keys',
+ ], check_status=True)
+ assert len(out['swift_keys']) == 0
+ assert len(out['subusers']) == 0
+
+ # TESTCASE 'bucket-stats','bucket','stats','no session/buckets','succeeds, empty list'
+ (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1],
+ check_status=True)
+ assert len(out) == 0
+
+ if multi_region_run:
+ rgw_utils.radosgw_agent_sync_all(ctx)
+
+ # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list'
+ (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
+ assert len(out) == 0
+
+ # create a first bucket
+ bucket = connection.create_bucket(bucket_name)
+
+ # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
+ (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
+ assert len(out) == 1
+ assert out[0] == bucket_name
+
+ # TESTCASE 'bucket-list-all','bucket','list','all buckets','succeeds, expected list'
+ (err, out) = rgwadmin(ctx, client, ['bucket', 'list'], check_status=True)
+ assert len(out) >= 1
+ assert bucket_name in out
+
+ # TESTCASE 'max-bucket-limit','bucket','create','4 buckets','5th bucket fails due to max buckets == 4'
+ bucket2 = connection.create_bucket(bucket_name + '2')
+ bucket3 = connection.create_bucket(bucket_name + '3')
+ bucket4 = connection.create_bucket(bucket_name + '4')
+ # the 5th should fail.
+ failed = False
+ try:
+ connection.create_bucket(bucket_name + '5')
+ except Exception:
+ failed = True
+ assert failed
+
+ # delete the buckets
+ bucket2.delete()
+ bucket3.delete()
+ bucket4.delete()
+
+ # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
+ (err, out) = rgwadmin(ctx, client, [
+ 'bucket', 'stats', '--bucket', bucket_name], check_status=True)
+ assert out['owner'] == user1
+ bucket_id = out['id']
+
+ # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID'
+ (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1], check_status=True)
+ assert len(out) == 1
+ assert out[0]['id'] == bucket_id # does it return the same ID twice in a row?
+
+ # use some space
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('one')
+
+ # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
+ (err, out) = rgwadmin(ctx, client, [
+ 'bucket', 'stats', '--bucket', bucket_name], check_status=True)
+ assert out['id'] == bucket_id
+ assert out['usage']['rgw.main']['num_objects'] == 1
+ assert out['usage']['rgw.main']['size_kb'] > 0
+
+ # reclaim it
+ key.delete()
+
+ # TESTCASE 'bucket-unlink','bucket','unlink','unlink bucket from user','succeeds, bucket unlinked'
+ (err, out) = rgwadmin(ctx, client,
+ ['bucket', 'unlink', '--uid', user1, '--bucket', bucket_name],
+ check_status=True)
+
+ # create a second user to link the bucket to
+ (err, out) = rgwadmin(ctx, client, [
+ 'user', 'create',
+ '--uid', user2,
+ '--display-name', display_name2,
+ '--access-key', access_key2,
+ '--secret', secret_key2,
+ '--max-buckets', '1',
+ ],
+ check_status=True)
+
+ # try creating an object with the first user before the bucket is relinked
+ denied = False
+ key = boto.s3.key.Key(bucket)
+
+ try:
+ key.set_contents_from_string('two')
+ except boto.exception.S3ResponseError:
+ denied = True
+
+ assert not denied
+
+ # delete the object
+ key.delete()
+
+ # link the bucket to another user
+ (err, out) = rgwadmin(ctx, client, ['metadata', 'get', 'bucket:{n}'.format(n=bucket_name)],
+ check_status=True)
+
+ bucket_data = out['data']
+ assert bucket_data['bucket']['name'] == bucket_name
+
+ bucket_id = bucket_data['bucket']['bucket_id']
+
+ # link the bucket to another user
+ (err, out) = rgwadmin(ctx, client, ['bucket', 'link', '--uid', user2, '--bucket', bucket_name, '--bucket-id', bucket_id],
+ check_status=True)
+
+ # try to remove user, should fail (has a linked bucket)
+ (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2])
+ assert err
+
+ # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'succeeds, bucket unlinked'
+ (err, out) = rgwadmin(ctx, client, ['bucket', 'unlink', '--uid', user2, '--bucket', bucket_name],
+ check_status=True)
+
+ # relink the bucket to the first user and delete the second user
+ (err, out) = rgwadmin(ctx, client,
+ ['bucket', 'link', '--uid', user1, '--bucket', bucket_name, '--bucket-id', bucket_id],
+ check_status=True)
+
+ (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2],
+ check_status=True)
+
+ # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed'
+
+ # upload an object
+ object_name = 'four'
+ key = boto.s3.key.Key(bucket, object_name)
+ key.set_contents_from_string(object_name)
+
+ # now delete it
+ (err, out) = rgwadmin(ctx, client,
+ ['object', 'rm', '--bucket', bucket_name, '--object', object_name],
+ check_status=True)
+
+ # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists no objects'
+ (err, out) = rgwadmin(ctx, client, [
+ 'bucket', 'stats', '--bucket', bucket_name],
+ check_status=True)
+ assert out['id'] == bucket_id
+ assert out['usage']['rgw.main']['num_objects'] == 0
+
+ # list log objects
+ # TESTCASE 'log-list','log','list','after activity','succeeds, lists at least one object'
+ (err, out) = rgwadmin(ctx, client, ['log', 'list'], check_status=True)
+ assert len(out) > 0
+
+ for obj in out:
+ # TESTCASE 'log-show','log','show','after activity','returns expected info'
+ if obj[:4] == 'meta' or obj[:4] == 'data' or obj[:18] == 'obj_delete_at_hint':
+ continue
+
+ (err, rgwlog) = rgwadmin(ctx, client, ['log', 'show', '--object', obj],
+ check_status=True)
+ assert len(rgwlog) > 0
+
+ # exempt bucket_name2 from checking as it was only used for multi-region tests
+ assert rgwlog['bucket'].find(bucket_name) == 0 or rgwlog['bucket'].find(bucket_name2) == 0
+ assert rgwlog['bucket'] != bucket_name or rgwlog['bucket_id'] == bucket_id
+ assert rgwlog['bucket_owner'] == user1 or rgwlog['bucket'] == bucket_name + '5' or rgwlog['bucket'] == bucket_name2
+ for entry in rgwlog['log_entries']:
+ log.debug('checking log entry: %s', entry)
+ assert entry['bucket'] == rgwlog['bucket']
+ possible_buckets = [bucket_name + '5', bucket_name2]
+ user = entry['user']
+ assert user == user1 or user.endswith('system-user') or \
+ rgwlog['bucket'] in possible_buckets
+
+ # TESTCASE 'log-rm','log','rm','delete log objects','succeeds'
+ (err, out) = rgwadmin(ctx, client, ['log', 'rm', '--object', obj],
+ check_status=True)
+
+ # TODO: show log by bucket+date
+
+ # need to wait for all usage data to get flushed, should take up to 30 seconds
+ timestamp = time.time()
+ while time.time() - timestamp <= (20 * 60): # wait up to 20 minutes
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj']) # last operation we did is delete obj, wait for it to flush
+ if get_user_successful_ops(out, user1) > 0:
+ break
+ time.sleep(1)
+
+ assert time.time() - timestamp <= (20 * 60)
+
+ # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
+ assert len(out['entries']) > 0
+ assert len(out['summary']) > 0
+
+ user_summary = get_user_summary(out, user1)
+
+ total = user_summary['total']
+ assert total['successful_ops'] > 0
+
+ # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+ check_status=True)
+ assert len(out['entries']) > 0
+ assert len(out['summary']) > 0
+ user_summary = out['summary'][0]
+ for entry in user_summary['categories']:
+ assert entry['successful_ops'] > 0
+ assert user_summary['user'] == user1
+
+ # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
+ test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
+ for cat in test_categories:
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat],
+ check_status=True)
+ assert len(out['summary']) > 0
+ user_summary = out['summary'][0]
+ assert user_summary['user'] == user1
+ assert len(user_summary['categories']) == 1
+ entry = user_summary['categories'][0]
+ assert entry['category'] == cat
+ assert entry['successful_ops'] > 0
+
+ # the usage flush interval is 30 seconds; wait that much and then some
+ # to make sure everything has been flushed
+ time.sleep(35)
+
+ # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
+ (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1],
+ check_status=True)
+ (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+ check_status=True)
+ assert len(out['entries']) == 0
+ assert len(out['summary']) == 0
+
+ # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
+ (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
+ check_status=True)
+
+ # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
+ try:
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('five')
+ except boto.exception.S3ResponseError as e:
+ assert e.status == 403
+
+ # TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
+ (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1],
+ check_status=True)
+
+ # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('six')
+
+ # TESTCASE 'gc-list', 'gc', 'list', 'get list of objects ready for garbage collection'
+
+ # create an object large enough to be split into multiple parts
+ test_string = 'foo'*10000000
+
+ big_key = boto.s3.key.Key(bucket)
+ big_key.set_contents_from_string(test_string)
+
+ # now delete the head
+ big_key.delete()
+
+ # wait a bit to give the garbage collector time to cycle
+ time.sleep(15)
+
+ (err, out) = rgwadmin(ctx, client, ['gc', 'list'])
+
+ assert len(out) > 0
+
+ # TESTCASE 'gc-process', 'gc', 'process', 'manually collect garbage'
+ (err, out) = rgwadmin(ctx, client, ['gc', 'process'], check_status=True)
+
+ #confirm
+ (err, out) = rgwadmin(ctx, client, ['gc', 'list'])
+
+ assert len(out) == 0
+
+ # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets'
+ (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
+ assert err
+
+ # delete should fail because ``key`` still exists
+ try:
+ bucket.delete()
+ except boto.exception.S3ResponseError as e:
+ assert e.status == 409
+
+ key.delete()
+ bucket.delete()
+
+ # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
+ bucket = connection.create_bucket(bucket_name)
+
+ # create an object
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('seven')
+
+ # should be private already but guarantee it
+ key.set_acl('private')
+
+ (err, out) = rgwadmin(ctx, client,
+ ['policy', '--bucket', bucket.name, '--object', key.key],
+ check_status=True, format='xml')
+
+ acl = get_acl(key)
+
+ assert acl == out.strip('\n')
+
+ # add another grantee by making the object public read
+ key.set_acl('public-read')
+
+ (err, out) = rgwadmin(ctx, client,
+ ['policy', '--bucket', bucket.name, '--object', key.key],
+ check_status=True, format='xml')
+
+ acl = get_acl(key)
+
+ assert acl == out.strip('\n')
+
+ # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
+ bucket = connection.create_bucket(bucket_name)
+ key_name = ['eight', 'nine', 'ten', 'eleven']
+ for i in range(4):
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string(key_name[i])
+
+ (err, out) = rgwadmin(ctx, client,
+ ['bucket', 'rm', '--bucket', bucket_name, '--purge-objects'],
+ check_status=True)
+
+ # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds'
+ caps='user=read'
+ (err, out) = rgwadmin(ctx, client, ['caps', 'add', '--uid', user1, '--caps', caps])
+
+ assert out['caps'][0]['perm'] == 'read'
+
+ # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds'
+ (err, out) = rgwadmin(ctx, client, ['caps', 'rm', '--uid', user1, '--caps', caps])
+
+ assert not out['caps']
+
+ # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
+ bucket = connection.create_bucket(bucket_name)
+ key = boto.s3.key.Key(bucket)
+
+ (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
+ assert err
+
+ # TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds'
+ bucket = connection.create_bucket(bucket_name)
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('twelve')
+
+ (err, out) = rgwadmin(ctx, client,
+ ['user', 'rm', '--uid', user1, '--purge-data' ],
+ check_status=True)
+
+ # TESTCASE 'rm-user3','user','rm','deleted user','fails'
+ (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
+ assert err
+
+ # TESTCASE 'zone-info', 'zone', 'get', 'get zone info', 'succeeds, has default placement rule'
+ #
+
+ if realm is None:
+ (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default'])
+ else:
+ (err, out) = rgwadmin(ctx, client, ['zone', 'get'])
+ orig_placement_pools = len(out['placement_pools'])
+
+ # removed this test, it is not correct to assume that zone has default placement, it really
+ # depends on how we set it up before
+ #
+ # assert len(out) > 0
+ # assert len(out['placement_pools']) == 1
+
+ # default_rule = out['placement_pools'][0]
+ # assert default_rule['key'] == 'default-placement'
+
+ rule={'key': 'new-placement', 'val': {'data_pool': '.rgw.buckets.2', 'index_pool': '.rgw.buckets.index.2'}}
+
+ out['placement_pools'].append(rule)
+
+ (err, out) = rgwadmin(ctx, client, ['zone', 'set'],
+ stdin=StringIO(json.dumps(out)),
+ check_status=True)
+
+ if realm is None:
+ (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default'])
+ else:
+ (err, out) = rgwadmin(ctx, client, ['zone', 'get'])
+ assert len(out) > 0
+ assert len(out['placement_pools']) == orig_placement_pools + 1
--- /dev/null
+"""
+Run a series of rgw admin commands through the rest interface.
+
+The test cases in this file have been annotated for inventory.
+To extract the inventory (in csv format) use the command:
+
+ grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
+
+"""
+from cStringIO import StringIO
+import logging
+import json
+
+import boto.exception
+import boto.s3.connection
+import boto.s3.acl
+
+import requests
+import time
+
+from boto.connection import AWSAuthConnection
+from teuthology import misc as teuthology
+from util.rgw import get_user_summary, get_user_successful_ops
+
+log = logging.getLogger(__name__)
+
+def rgwadmin(ctx, client, cmd):
+ """
+ Perform rgw admin command
+
+ :param client: client
+ :param cmd: command to execute.
+ :return: command exit status, json result.
+ """
+ log.info('radosgw-admin: %s' % cmd)
+ testdir = teuthology.get_testdir(ctx)
+ pre = [
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'radosgw-admin',
+ '--log-to-stderr',
+ '--format', 'json',
+ ]
+ pre.extend(cmd)
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+ proc = remote.run(
+ args=pre,
+ check_status=False,
+ stdout=StringIO(),
+ stderr=StringIO(),
+ )
+ r = proc.exitstatus
+ out = proc.stdout.getvalue()
+ j = None
+ if not r and out != '':
+ try:
+ j = json.loads(out)
+ log.info(' json result: %s' % j)
+ except ValueError:
+ j = out
+ log.info(' raw result: %s' % j)
+ return (r, j)
+
+
+def rgwadmin_rest(connection, cmd, params=None, headers=None, raw=False):
+ """
+ perform a rest command
+ """
+ log.info('radosgw-admin-rest: %s %s' % (cmd, params))
+ put_cmds = ['create', 'link', 'add']
+ post_cmds = ['unlink', 'modify']
+ delete_cmds = ['trim', 'rm', 'process']
+ get_cmds = ['check', 'info', 'show', 'list']
+
+ bucket_sub_resources = ['object', 'policy', 'index']
+ user_sub_resources = ['subuser', 'key', 'caps']
+ zone_sub_resources = ['pool', 'log', 'garbage']
+
+ def get_cmd_method_and_handler(cmd):
+ """
+ Get the rest command and handler from information in cmd and
+ from the imported requests object.
+ """
+ if cmd[1] in put_cmds:
+ return 'PUT', requests.put
+ elif cmd[1] in delete_cmds:
+ return 'DELETE', requests.delete
+ elif cmd[1] in post_cmds:
+ return 'POST', requests.post
+ elif cmd[1] in get_cmds:
+ return 'GET', requests.get
+
+ def get_resource(cmd):
+ """
+ Get the name of the resource from information in cmd.
+ """
+ if cmd[0] == 'bucket' or cmd[0] in bucket_sub_resources:
+ if cmd[0] == 'bucket':
+ return 'bucket', ''
+ else:
+ return 'bucket', cmd[0]
+ elif cmd[0] == 'user' or cmd[0] in user_sub_resources:
+ if cmd[0] == 'user':
+ return 'user', ''
+ else:
+ return 'user', cmd[0]
+ elif cmd[0] == 'usage':
+ return 'usage', ''
+ elif cmd[0] == 'zone' or cmd[0] in zone_sub_resources:
+ if cmd[0] == 'zone':
+ return 'zone', ''
+ else:
+ return 'zone', cmd[0]
+
+ def build_admin_request(conn, method, resource = '', headers=None, data='',
+ query_args=None, params=None):
+ """
+ Build an administative request adapted from the build_request()
+ method of boto.connection
+ """
+
+ path = conn.calling_format.build_path_base('admin', resource)
+ auth_path = conn.calling_format.build_auth_path('admin', resource)
+ host = conn.calling_format.build_host(conn.server_name(), 'admin')
+ if query_args:
+ path += '?' + query_args
+ boto.log.debug('path=%s' % path)
+ auth_path += '?' + query_args
+ boto.log.debug('auth_path=%s' % auth_path)
+ return AWSAuthConnection.build_base_http_request(conn, method, path,
+ auth_path, params, headers, data, host)
+
+ method, handler = get_cmd_method_and_handler(cmd)
+ resource, query_args = get_resource(cmd)
+ request = build_admin_request(connection, method, resource,
+ query_args=query_args, headers=headers)
+
+ url = '{protocol}://{host}{path}'.format(protocol=request.protocol,
+ host=request.host, path=request.path)
+
+ request.authorize(connection=connection)
+ result = handler(url, params=params, headers=request.headers)
+
+ if raw:
+ log.info(' text result: %s' % result.text)
+ return result.status_code, result.text
+ else:
+ log.info(' json result: %s' % result.json())
+ return result.status_code, result.json()
+
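+ # Usage sketch (matches the calls made in task() below): a ['resource',
+ # 'action'] pair selects the admin endpoint and HTTP method, and params
+ # become query parameters, e.g.
+ #   (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid': 'foo'})
+ # issues GET /admin/user?uid=foo and returns (status code, parsed JSON).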
+
+def task(ctx, config):
+ """
+ Test radosgw-admin functionality through the RESTful interface
+ """
+ assert config is None or isinstance(config, list) \
+ or isinstance(config, dict), \
+ "task s3tests only supports a list or dictionary for configuration"
+ all_clients = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ if config is None:
+ config = all_clients
+ if isinstance(config, list):
+ config = dict.fromkeys(config)
+ clients = config.keys()
+
+ # just use the first client...
+ client = clients[0]
+
+ ##
+ admin_user = 'ada'
+ admin_display_name = 'Ms. Admin User'
+ admin_access_key = 'MH1WC2XQ1S8UISFDZC8W'
+ admin_secret_key = 'dQyrTPA0s248YeN5bBv4ukvKU0kh54LWWywkrpoG'
+ admin_caps = 'users=read, write; usage=read, write; buckets=read, write; zone=read, write'
+
+ user1 = 'foo'
+ user2 = 'fud'
+ subuser1 = 'foo:foo1'
+ subuser2 = 'foo:foo2'
+ display_name1 = 'Foo'
+ display_name2 = 'Fud'
+ email = 'foo@foo.com'
+ access_key = '9te6NH5mcdcq0Tc5i8i1'
+ secret_key = 'Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu'
+ access_key2 = 'p5YnriCv1nAtykxBrupQ'
+ secret_key2 = 'Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh'
+ swift_secret1 = 'gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL'
+ swift_secret2 = 'ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy'
+
+ bucket_name = 'myfoo'
+
+ # legend (test cases can be easily grep-ed out)
+ # TESTCASE 'testname','object','method','operation','assertion'
+ # TESTCASE 'create-admin-user','user','create','administrative user','succeeds'
+ (err, out) = rgwadmin(ctx, client, [
+ 'user', 'create',
+ '--uid', admin_user,
+ '--display-name', admin_display_name,
+ '--access-key', admin_access_key,
+ '--secret', admin_secret_key,
+ '--max-buckets', '0',
+ '--caps', admin_caps
+ ])
+ logging.error(out)
+ logging.error(err)
+ assert not err
+
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+ remote_host = remote.name.split('@')[1]
+ admin_conn = boto.s3.connection.S3Connection(
+ aws_access_key_id=admin_access_key,
+ aws_secret_access_key=admin_secret_key,
+ is_secure=False,
+ port=7280,
+ host=remote_host,
+ calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+ )
+
+ # TESTCASE 'info-nosuch','user','info','non-existent user','fails'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {"uid": user1})
+ assert ret == 404
+
+ # TESTCASE 'create-ok','user','create','w/all valid info','succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['user', 'create'],
+ {'uid' : user1,
+ 'display-name' : display_name1,
+ 'email' : email,
+ 'access-key' : access_key,
+ 'secret-key' : secret_key,
+ 'max-buckets' : '4'
+ })
+
+ assert ret == 200
+
+ # TESTCASE 'info-existing','user','info','existing user','returns correct info'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+
+ assert out['user_id'] == user1
+ assert out['email'] == email
+ assert out['display_name'] == display_name1
+ assert len(out['keys']) == 1
+ assert out['keys'][0]['access_key'] == access_key
+ assert out['keys'][0]['secret_key'] == secret_key
+ assert not out['suspended']
+
+ # TESTCASE 'suspend-ok','user','suspend','active user','succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True})
+ assert ret == 200
+
+ # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+ assert ret == 200
+ assert out['suspended']
+
+ # TESTCASE 're-enable','user','enable','suspended user','succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'})
+    assert ret == 200
+
+ # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+ assert ret == 200
+ assert not out['suspended']
+
+ # TESTCASE 'add-keys','key','create','w/valid info','succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['key', 'create'],
+ {'uid' : user1,
+ 'access-key' : access_key2,
+ 'secret-key' : secret_key2
+ })
+
+
+ assert ret == 200
+
+ # TESTCASE 'info-new-key','user','info','after key addition','returns all keys'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+ assert ret == 200
+ assert len(out['keys']) == 2
+ assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2
+ assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2
+
+ # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed'
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['key', 'rm'],
+ {'uid' : user1,
+ 'access-key' : access_key2
+ })
+
+ assert ret == 200
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+
+ assert len(out['keys']) == 1
+ assert out['keys'][0]['access_key'] == access_key
+ assert out['keys'][0]['secret_key'] == secret_key
+
+ # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['subuser', 'create'],
+ {'subuser' : subuser1,
+ 'secret-key' : swift_secret1,
+ 'key-type' : 'swift'
+ })
+
+ assert ret == 200
+
+ # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+ assert ret == 200
+ assert len(out['swift_keys']) == 1
+ assert out['swift_keys'][0]['user'] == subuser1
+ assert out['swift_keys'][0]['secret_key'] == swift_secret1
+
+ # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['subuser', 'create'],
+ {'subuser' : subuser2,
+ 'secret-key' : swift_secret2,
+ 'key-type' : 'swift'
+ })
+
+ assert ret == 200
+
+ # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+ assert ret == 200
+ assert len(out['swift_keys']) == 2
+ assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2
+ assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2
+
+ # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed'
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['key', 'rm'],
+ {'subuser' : subuser1,
+ 'key-type' :'swift'
+ })
+
+ assert ret == 200
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+ assert len(out['swift_keys']) == 1
+
+ # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed'
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['subuser', 'rm'],
+ {'subuser' : subuser1
+ })
+
+ assert ret == 200
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+ assert len(out['subusers']) == 1
+
+    # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subuser and key is removed'
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['subuser', 'rm'],
+ {'subuser' : subuser2,
+ 'key-type' : 'swift',
+             'purge-keys' : True
+ })
+
+ assert ret == 200
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+ assert len(out['swift_keys']) == 0
+ assert len(out['subusers']) == 0
+
+ # TESTCASE 'bucket-stats','bucket','info','no session/buckets','succeeds, empty list'
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1})
+ assert ret == 200
+ assert len(out) == 0
+
+ # connect to rgw
+ connection = boto.s3.connection.S3Connection(
+ aws_access_key_id=access_key,
+ aws_secret_access_key=secret_key,
+ is_secure=False,
+ port=7280,
+ host=remote_host,
+ calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+ )
+
+ # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list'
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True})
+ assert ret == 200
+ assert len(out) == 0
+
+ # create a first bucket
+ bucket = connection.create_bucket(bucket_name)
+
+ # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1})
+ assert ret == 200
+ assert len(out) == 1
+ assert out[0] == bucket_name
+
+ # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
+
+ assert ret == 200
+ assert out['owner'] == user1
+ bucket_id = out['id']
+
+ # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID'
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True})
+ assert ret == 200
+ assert len(out) == 1
+ assert out[0]['id'] == bucket_id # does it return the same ID twice in a row?
+
+ # use some space
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('one')
+
+ # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
+ assert ret == 200
+ assert out['id'] == bucket_id
+ assert out['usage']['rgw.main']['num_objects'] == 1
+ assert out['usage']['rgw.main']['size_kb'] > 0
+
+ # reclaim it
+ key.delete()
+
+    # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'unlink'], {'uid' : user1, 'bucket' : bucket_name})
+
+ assert ret == 200
+
+ # create a second user to link the bucket to
+ (ret, out) = rgwadmin_rest(admin_conn,
+ ['user', 'create'],
+ {'uid' : user2,
+ 'display-name' : display_name2,
+ 'access-key' : access_key2,
+ 'secret-key' : secret_key2,
+ 'max-buckets' : '1',
+ })
+
+ assert ret == 200
+
+ # try creating an object with the first user before the bucket is relinked
+ denied = False
+ key = boto.s3.key.Key(bucket)
+
+ try:
+ key.set_contents_from_string('two')
+ except boto.exception.S3ResponseError:
+ denied = True
+
+ assert not denied
+
+ # delete the object
+ key.delete()
+
+ # link the bucket to another user
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user2, 'bucket' : bucket_name})
+
+ assert ret == 200
+
+ # try creating an object with the first user which should cause an error
+ key = boto.s3.key.Key(bucket)
+
+ try:
+ key.set_contents_from_string('three')
+ except boto.exception.S3ResponseError:
+ denied = True
+
+ assert denied
+
+ # relink the bucket to the first user and delete the second user
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user1, 'bucket' : bucket_name})
+ assert ret == 200
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user2})
+ assert ret == 200
+
+ # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed'
+
+ # upload an object
+ object_name = 'four'
+ key = boto.s3.key.Key(bucket, object_name)
+ key.set_contents_from_string(object_name)
+
+ # now delete it
+ (ret, out) = rgwadmin_rest(admin_conn, ['object', 'rm'], {'bucket' : bucket_name, 'object' : object_name})
+ assert ret == 200
+
+    # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists no objects'
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
+ assert ret == 200
+ assert out['id'] == bucket_id
+ assert out['usage']['rgw.main']['num_objects'] == 0
+
+ # create a bucket for deletion stats
+ useless_bucket = connection.create_bucket('useless_bucket')
+ useless_key = useless_bucket.new_key('useless_key')
+ useless_key.set_contents_from_string('useless string')
+
+ # delete it
+ useless_key.delete()
+ useless_bucket.delete()
+
+ # wait for the statistics to flush
+ time.sleep(60)
+
+    # need to wait for all usage data to get flushed; this normally takes about 30 seconds, but allow a generous timeout
+ timestamp = time.time()
+ while time.time() - timestamp <= (20 * 60): # wait up to 20 minutes
+ (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'categories' : 'delete_obj'}) # last operation we did is delete obj, wait for it to flush
+
+ if get_user_successful_ops(out, user1) > 0:
+ break
+ time.sleep(1)
+
+ assert time.time() - timestamp <= (20 * 60)
+
+ # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'])
+ assert ret == 200
+ assert len(out['entries']) > 0
+ assert len(out['summary']) > 0
+ user_summary = get_user_summary(out, user1)
+ total = user_summary['total']
+ assert total['successful_ops'] > 0
+
+ # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1})
+ assert ret == 200
+ assert len(out['entries']) > 0
+ assert len(out['summary']) > 0
+ user_summary = out['summary'][0]
+ for entry in user_summary['categories']:
+ assert entry['successful_ops'] > 0
+ assert user_summary['user'] == user1
+
+ # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
+ test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
+ for cat in test_categories:
+ (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1, 'categories' : cat})
+ assert ret == 200
+ assert len(out['summary']) > 0
+ user_summary = out['summary'][0]
+ assert user_summary['user'] == user1
+ assert len(user_summary['categories']) == 1
+ entry = user_summary['categories'][0]
+ assert entry['category'] == cat
+ assert entry['successful_ops'] > 0
+
+ # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
+ (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'trim'], {'uid' : user1})
+ assert ret == 200
+ (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1})
+ assert ret == 200
+ assert len(out['entries']) == 0
+ assert len(out['summary']) == 0
+
+ # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True})
+ assert ret == 200
+
+ # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
+ try:
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('five')
+ except boto.exception.S3ResponseError as e:
+ assert e.status == 403
+
+ # TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'})
+ assert ret == 200
+
+ # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('six')
+
+ # TESTCASE 'garbage-list', 'garbage', 'list', 'get list of objects ready for garbage collection'
+
+ # create an object large enough to be split into multiple parts
+ test_string = 'foo'*10000000
+
+ big_key = boto.s3.key.Key(bucket)
+ big_key.set_contents_from_string(test_string)
+
+ # now delete the head
+ big_key.delete()
+
+ # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1})
+ assert ret == 409
+
+ # delete should fail because ``key`` still exists
+ try:
+ bucket.delete()
+ except boto.exception.S3ResponseError as e:
+ assert e.status == 409
+
+ key.delete()
+ bucket.delete()
+
+ # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
+ bucket = connection.create_bucket(bucket_name)
+
+ # create an object
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('seven')
+
+ # should be private already but guarantee it
+ key.set_acl('private')
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key})
+ assert ret == 200
+
+ acl = key.get_xml_acl()
+ assert acl == out.strip('\n')
+
+ # add another grantee by making the object public read
+ key.set_acl('public-read')
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key})
+ assert ret == 200
+
+ acl = key.get_xml_acl()
+ assert acl == out.strip('\n')
+
+ # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
+ bucket = connection.create_bucket(bucket_name)
+ key_name = ['eight', 'nine', 'ten', 'eleven']
+ for i in range(4):
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string(key_name[i])
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'rm'], {'bucket' : bucket_name, 'purge-objects' : True})
+ assert ret == 200
+
+ # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds'
+ caps = 'usage=read'
+ (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'add'], {'uid' : user1, 'user-caps' : caps})
+ assert ret == 200
+ assert out[0]['perm'] == 'read'
+
+ # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds'
+ (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'rm'], {'uid' : user1, 'user-caps' : caps})
+ assert ret == 200
+ assert not out
+
+ # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
+ bucket = connection.create_bucket(bucket_name)
+ key = boto.s3.key.Key(bucket)
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1})
+ assert ret == 409
+
+    # TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds'
+ bucket = connection.create_bucket(bucket_name)
+ key = boto.s3.key.Key(bucket)
+ key.set_contents_from_string('twelve')
+
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1, 'purge-data' : True})
+ assert ret == 200
+
+ # TESTCASE 'rm-user3','user','info','deleted user','fails'
+ (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+ assert ret == 404
+
--- /dev/null
+"""
+Run rados gateway agent in test mode
+"""
+import contextlib
+import logging
+import argparse
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+import util.rgw as rgw_utils
+
+log = logging.getLogger(__name__)
+
+def run_radosgw_agent(ctx, config):
+ """
+ Run a single radosgw-agent. See task() for config format.
+ """
+ return_list = list()
+ for (client, cconf) in config.items():
+ # don't process entries that are not clients
+ if not client.startswith('client.'):
+ log.debug('key {data} does not start with \'client.\', moving on'.format(
+ data=client))
+ continue
+
+ src_client = cconf['src']
+ dest_client = cconf['dest']
+
+ src_zone = rgw_utils.zone_for_client(ctx, src_client)
+ dest_zone = rgw_utils.zone_for_client(ctx, dest_client)
+
+ log.info("source is %s", src_zone)
+ log.info("dest is %s", dest_zone)
+
+ testdir = teuthology.get_testdir(ctx)
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ # figure out which branch to pull from
+ branch = cconf.get('force-branch', None)
+ if not branch:
+ branch = cconf.get('branch', 'master')
+ sha1 = cconf.get('sha1')
+ remote.run(
+ args=[
+ 'cd', testdir, run.Raw('&&'),
+ 'git', 'clone',
+ '-b', branch,
+# 'https://github.com/ceph/radosgw-agent.git',
+ 'git://git.ceph.com/radosgw-agent.git',
+ 'radosgw-agent.{client}'.format(client=client),
+ ]
+ )
+ if sha1 is not None:
+ remote.run(
+ args=[
+                    'cd', testdir, run.Raw('&&'),
+                    'cd', 'radosgw-agent.{client}'.format(client=client),
+                    run.Raw('&&'),
+                    'git', 'reset', '--hard', sha1,
+ ]
+ )
+ remote.run(
+ args=[
+ 'cd', testdir, run.Raw('&&'),
+ 'cd', 'radosgw-agent.{client}'.format(client=client),
+ run.Raw('&&'),
+ './bootstrap',
+ ]
+ )
+
+ src_host, src_port = rgw_utils.get_zone_host_and_port(ctx, src_client,
+ src_zone)
+ dest_host, dest_port = rgw_utils.get_zone_host_and_port(ctx, dest_client,
+ dest_zone)
+ src_access, src_secret = rgw_utils.get_zone_system_keys(ctx, src_client,
+ src_zone)
+ dest_access, dest_secret = rgw_utils.get_zone_system_keys(ctx, dest_client,
+ dest_zone)
+ sync_scope = cconf.get('sync-scope', None)
+ port = cconf.get('port', 8000)
+ daemon_name = '{host}.{port}.syncdaemon'.format(host=remote.name, port=port)
+ in_args=[
+ 'daemon-helper',
+ 'kill',
+ '{tdir}/radosgw-agent.{client}/radosgw-agent'.format(tdir=testdir,
+ client=client),
+ '-v',
+ '--src-access-key', src_access,
+ '--src-secret-key', src_secret,
+ '--source', "http://{addr}:{port}".format(addr=src_host, port=src_port),
+ '--dest-access-key', dest_access,
+ '--dest-secret-key', dest_secret,
+ '--max-entries', str(cconf.get('max-entries', 1000)),
+ '--log-file', '{tdir}/archive/rgw_sync_agent.{client}.log'.format(
+ tdir=testdir,
+ client=client),
+ '--object-sync-timeout', '30',
+ ]
+
+ if cconf.get('metadata-only', False):
+ in_args.append('--metadata-only')
+
+ # the test server and full/incremental flags are mutually exclusive
+ if sync_scope is None:
+ in_args.append('--test-server-host')
+ in_args.append('0.0.0.0')
+ in_args.append('--test-server-port')
+ in_args.append(str(port))
+ log.debug('Starting a sync test server on {client}'.format(client=client))
+ # Stash the radosgw-agent server / port # for use by subsequent tasks
+ ctx.radosgw_agent.endpoint = (client, str(port))
+ else:
+ in_args.append('--sync-scope')
+ in_args.append(sync_scope)
+ log.debug('Starting a {scope} sync on {client}'.format(scope=sync_scope,client=client))
+
+ # positional arg for destination must come last
+ in_args.append("http://{addr}:{port}".format(addr=dest_host,
+ port=dest_port))
+
+ return_list.append((client, remote.run(
+ args=in_args,
+ wait=False,
+ stdin=run.PIPE,
+ logger=log.getChild(daemon_name),
+ )))
+ return return_list
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run radosgw-agents in test mode.
+
+ Configuration is clients to run the agents on, with settings for
+ source client, destination client, and port to listen on. Binds
+ to 0.0.0.0. Port defaults to 8000. This must be run on clients
+ that have the correct zone root pools and rgw zone set in
+ ceph.conf, or the task cannot read the region information from the
+ cluster.
+
+ By default, this task will start an HTTP server that will trigger full
+ or incremental syncs based on requests made to it.
+ Alternatively, a single full sync can be triggered by
+ specifying 'sync-scope: full' or a loop of incremental syncs can be triggered
+ by specifying 'sync-scope: incremental' (the loop will sleep
+ '--incremental-sync-delay' seconds between each sync, default is 30 seconds).
+
+ By default, both data and metadata are synced. To only sync
+ metadata, for example because you want to sync between regions,
+ set metadata-only: true.
+
+ An example::
+
+ tasks:
+ - ceph:
+ conf:
+ client.0:
+ rgw zone = foo
+ rgw zone root pool = .root.pool
+ client.1:
+ rgw zone = bar
+ rgw zone root pool = .root.pool2
+ - rgw: # region configuration omitted for brevity
+ - radosgw-agent:
+ client.0:
+ branch: wip-next-feature-branch
+ src: client.0
+ dest: client.1
+ sync-scope: full
+ metadata-only: true
+ # port: 8000 (default)
+ client.1:
+ src: client.1
+ dest: client.0
+ port: 8001
+ """
+ assert isinstance(config, dict), 'rgw_sync_agent requires a dictionary config'
+ log.debug("config is %s", config)
+
+ overrides = ctx.config.get('overrides', {})
+ # merge each client section, but only if it exists in config since there isn't
+ # a sensible default action for this task
+ for client in config.iterkeys():
+ if config[client]:
+ log.debug('config[{client}]: {data}'.format(client=client, data=config[client]))
+ teuthology.deep_merge(config[client], overrides.get('radosgw-agent', {}))
+
+ ctx.radosgw_agent = argparse.Namespace()
+ ctx.radosgw_agent.config = config
+
+ procs = run_radosgw_agent(ctx, config)
+
+ ctx.radosgw_agent.procs = procs
+
+ try:
+ yield
+ finally:
+ testdir = teuthology.get_testdir(ctx)
+ try:
+ for client, proc in procs:
+ log.info("shutting down sync agent on %s", client)
+ proc.stdin.close()
+ proc.wait()
+ finally:
+ for client, proc in procs:
+ ctx.cluster.only(client).run(
+ args=[
+ 'rm', '-rf',
+ '{tdir}/radosgw-agent.{client}'.format(tdir=testdir,
+ client=client)
+ ]
+ )
--- /dev/null
+"""
+Rbd testing task
+"""
+import contextlib
+import logging
+import os
+
+from cStringIO import StringIO
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.parallel import parallel
+from teuthology.task.common_fs_utils import generic_mkfs
+from teuthology.task.common_fs_utils import generic_mount
+from teuthology.task.common_fs_utils import default_image_name
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def create_image(ctx, config):
+ """
+ Create an rbd image.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - rbd.create_image:
+ client.0:
+ image_name: testimage
+ image_size: 100
+ image_format: 1
+ client.1:
+
+ Image size is expressed as a number of megabytes; default value
+ is 10240.
+
+ Image format value must be either 1 or 2; default value is 1.
+
+ """
+ assert isinstance(config, dict) or isinstance(config, list), \
+ "task create_image only supports a list or dictionary for configuration"
+
+ if isinstance(config, dict):
+ images = config.items()
+ else:
+ images = [(role, None) for role in config]
+
+ testdir = teuthology.get_testdir(ctx)
+ for role, properties in images:
+ if properties is None:
+ properties = {}
+ name = properties.get('image_name', default_image_name(role))
+ size = properties.get('image_size', 10240)
+ fmt = properties.get('image_format', 1)
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ log.info('Creating image {name} with size {size}'.format(name=name,
+ size=size))
+ args = [
+ 'adjust-ulimits',
+            'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rbd',
+ '-p', 'rbd',
+ 'create',
+ '--size', str(size),
+ name,
+ ]
+ # omit format option if using the default (format 1)
+        # since old versions of rbd don't support it
+ if int(fmt) != 1:
+ args += ['--image-format', str(fmt)]
+ remote.run(args=args)
+ try:
+ yield
+ finally:
+ log.info('Deleting rbd images...')
+ for role, properties in images:
+ if properties is None:
+ properties = {}
+ name = properties.get('image_name', default_image_name(role))
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ remote.run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rbd',
+ '-p', 'rbd',
+ 'rm',
+ name,
+ ],
+ )
+
+@contextlib.contextmanager
+def clone_image(ctx, config):
+ """
+    Clones a parent image.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - rbd.clone_image:
+ client.0:
+ parent_name: testimage
+ image_name: cloneimage
+ """
+ assert isinstance(config, dict) or isinstance(config, list), \
+ "task clone_image only supports a list or dictionary for configuration"
+
+ if isinstance(config, dict):
+ images = config.items()
+ else:
+ images = [(role, None) for role in config]
+
+ testdir = teuthology.get_testdir(ctx)
+ for role, properties in images:
+ if properties is None:
+ properties = {}
+
+ name = properties.get('image_name', default_image_name(role))
+ parent_name = properties.get('parent_name')
+ assert parent_name is not None, \
+ "parent_name is required"
+ parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name)
+
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ log.info('Clone image {parent} to {child}'.format(parent=parent_name,
+ child=name))
+ for cmd in [('snap', 'create', parent_spec),
+ ('snap', 'protect', parent_spec),
+ ('clone', parent_spec, name)]:
+ args = [
+ 'adjust-ulimits',
+                'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rbd', '-p', 'rbd'
+ ]
+ args.extend(cmd)
+ remote.run(args=args)
+
+ try:
+ yield
+ finally:
+ log.info('Deleting rbd clones...')
+ for role, properties in images:
+ if properties is None:
+ properties = {}
+ name = properties.get('image_name', default_image_name(role))
+ parent_name = properties.get('parent_name')
+ parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name)
+
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+
+ for cmd in [('rm', name),
+ ('snap', 'unprotect', parent_spec),
+ ('snap', 'rm', parent_spec)]:
+ args = [
+ 'adjust-ulimits',
+                'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rbd', '-p', 'rbd'
+ ]
+ args.extend(cmd)
+ remote.run(args=args)
+
+@contextlib.contextmanager
+def modprobe(ctx, config):
+ """
+    Load the rbd kernel module.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - rbd.create_image: [client.0]
+ - rbd.modprobe: [client.0]
+ """
+ log.info('Loading rbd kernel module...')
+ for role in config:
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ remote.run(
+ args=[
+ 'sudo',
+ 'modprobe',
+ 'rbd',
+ ],
+ )
+ try:
+ yield
+ finally:
+ log.info('Unloading rbd kernel module...')
+ for role in config:
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ remote.run(
+ args=[
+ 'sudo',
+ 'modprobe',
+ '-r',
+ 'rbd',
+ # force errors to be ignored; necessary if more
+ # than one device was created, which may mean
+ # the module isn't quite ready to go the first
+ # time through.
+ run.Raw('||'),
+ 'true',
+ ],
+ )
+
+@contextlib.contextmanager
+def dev_create(ctx, config):
+ """
+ Map block devices to rbd images.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - rbd.create_image: [client.0]
+ - rbd.modprobe: [client.0]
+ - rbd.dev_create:
+ client.0: testimage.client.0
+ """
+ assert isinstance(config, dict) or isinstance(config, list), \
+ "task dev_create only supports a list or dictionary for configuration"
+
+ if isinstance(config, dict):
+ role_images = config.items()
+ else:
+ role_images = [(role, None) for role in config]
+
+ log.info('Creating rbd block devices...')
+
+ testdir = teuthology.get_testdir(ctx)
+
+ for role, image in role_images:
+ if image is None:
+ image = default_image_name(role)
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+
+ remote.run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rbd',
+ '--user', role.rsplit('.')[-1],
+ '-p', 'rbd',
+ 'map',
+ image,
+ run.Raw('&&'),
+ # wait for the symlink to be created by udev
+ 'while', 'test', '!', '-e', '/dev/rbd/rbd/{image}'.format(image=image), run.Raw(';'), 'do',
+ 'sleep', '1', run.Raw(';'),
+ 'done',
+ ],
+ )
+ try:
+ yield
+ finally:
+ log.info('Unmapping rbd devices...')
+ for role, image in role_images:
+ if image is None:
+ image = default_image_name(role)
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ remote.run(
+ args=[
+ 'LD_LIBRARY_PATH={tdir}/binary/usr/local/lib'.format(tdir=testdir),
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rbd',
+ '-p', 'rbd',
+ 'unmap',
+ '/dev/rbd/rbd/{imgname}'.format(imgname=image),
+ run.Raw('&&'),
+ # wait for the symlink to be deleted by udev
+ 'while', 'test', '-e', '/dev/rbd/rbd/{image}'.format(image=image),
+ run.Raw(';'),
+ 'do',
+ 'sleep', '1', run.Raw(';'),
+ 'done',
+ ],
+ )
+
+
+def rbd_devname_rtn(ctx, image):
+ return '/dev/rbd/rbd/{image}'.format(image=image)
+
+def canonical_path(ctx, role, path):
+ """
+ Determine the canonical path for a given path on the host
+ representing the given role. A canonical path contains no
+ . or .. components, and includes no symbolic links.
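+
+    For example (illustrative only), resolving the udev-created symlink for a
+    mapped image might look like::
+
+        canonical_path(ctx, 'client.0', '/dev/rbd/rbd/testimage.client.0')
+        # -> something like '/dev/rbd0'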
+ """
+ version_fp = StringIO()
+ ctx.cluster.only(role).run(
+ args=[ 'readlink', '-f', path ],
+ stdout=version_fp,
+ )
+ canonical_path = version_fp.getvalue().rstrip('\n')
+ version_fp.close()
+ return canonical_path
+
+@contextlib.contextmanager
+def run_xfstests(ctx, config):
+ """
+ Run xfstests over specified devices.
+
+ Warning: both the test and scratch devices specified will be
+ overwritten. Normally xfstests modifies (but does not destroy)
+ the test device, but for now the run script used here re-makes
+ both filesystems.
+
+ Note: Only one instance of xfstests can run on a single host at
+ a time, although this is not enforced.
+
+ This task in its current form needs some improvement. For
+ example, it assumes all roles provided in the config are
+ clients, and that the config provided is a list of key/value
+ pairs. For now please use the xfstests() interface, below.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - rbd.run_xfstests:
+ client.0:
+ count: 2
+ test_dev: 'test_dev'
+ scratch_dev: 'scratch_dev'
+ fs_type: 'xfs'
+ tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+ randomize: true
+ """
+ with parallel() as p:
+ for role, properties in config.items():
+ p.spawn(run_xfstests_one_client, ctx, role, properties)
+ yield
+
+def run_xfstests_one_client(ctx, role, properties):
+ """
+ Spawned routine to handle xfs tests for a single client
+ """
+ testdir = teuthology.get_testdir(ctx)
+ try:
+ count = properties.get('count')
+ test_dev = properties.get('test_dev')
+ assert test_dev is not None, \
+ "task run_xfstests requires test_dev to be defined"
+ test_dev = canonical_path(ctx, role, test_dev)
+
+ scratch_dev = properties.get('scratch_dev')
+ assert scratch_dev is not None, \
+ "task run_xfstests requires scratch_dev to be defined"
+ scratch_dev = canonical_path(ctx, role, scratch_dev)
+
+ fs_type = properties.get('fs_type')
+ tests = properties.get('tests')
+ randomize = properties.get('randomize')
+
+
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+
+ # Fetch the test script
+ test_root = teuthology.get_testdir(ctx)
+ test_script = 'run_xfstests_krbd.sh'
+ test_path = os.path.join(test_root, test_script)
+
+ xfstests_url = properties.get('xfstests_url')
+ assert xfstests_url is not None, \
+ "task run_xfstests requires xfstests_url to be defined"
+
+ xfstests_krbd_url = xfstests_url + '/' + test_script
+
+ log.info('Fetching {script} for {role} from {url}'.format(
+ script=test_script,
+ role=role,
+ url=xfstests_krbd_url))
+
+ args = [ 'wget', '-O', test_path, '--', xfstests_krbd_url ]
+ remote.run(args=args)
+
+ log.info('Running xfstests on {role}:'.format(role=role))
+ log.info(' iteration count: {count}:'.format(count=count))
+ log.info(' test device: {dev}'.format(dev=test_dev))
+ log.info(' scratch device: {dev}'.format(dev=scratch_dev))
+ log.info(' using fs_type: {fs_type}'.format(fs_type=fs_type))
+ log.info(' tests to run: {tests}'.format(tests=tests))
+ log.info(' randomize: {randomize}'.format(randomize=randomize))
+
+ # Note that the device paths are interpreted using
+ # readlink -f <path> in order to get their canonical
+ # pathname (so it matches what the kernel remembers).
+ args = [
+ '/usr/bin/sudo',
+ 'TESTDIR={tdir}'.format(tdir=testdir),
+ 'URL_BASE={url}'.format(url=xfstests_url),
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ '/bin/bash',
+ test_path,
+ '-c', str(count),
+ '-f', fs_type,
+ '-t', test_dev,
+ '-s', scratch_dev,
+ ]
+ if randomize:
+ args.append('-r')
+ if tests:
+ args.extend(['--', tests])
+ remote.run(args=args, logger=log.getChild(role))
+ finally:
+ log.info('Removing {script} on {role}'.format(script=test_script,
+ role=role))
+ remote.run(args=['rm', '-f', test_path])
+
+@contextlib.contextmanager
+def xfstests(ctx, config):
+ """
+ Run xfstests over rbd devices. This interface sets up all
+ required configuration automatically if not otherwise specified.
+ Note that only one instance of xfstests can run on a single host
+ at a time. By default, the set of tests specified is run once.
+ If a (non-zero) count value is supplied, the complete set of
+ tests will be run that number of times.
+
+ For example::
+
+ tasks:
+ - ceph:
+ # Image sizes are in MB
+ - rbd.xfstests:
+ client.0:
+ count: 3
+ test_image: 'test_image'
+ test_size: 250
+ test_format: 2
+ scratch_image: 'scratch_image'
+ scratch_size: 250
+ scratch_format: 1
+ fs_type: 'xfs'
+ tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+ randomize: true
+ xfstests_branch: master
+            xfstests_url: 'https://raw.github.com/ceph/ceph/master/qa'
+ """
+ if config is None:
+ config = { 'all': None }
+ assert isinstance(config, dict) or isinstance(config, list), \
+ "task xfstests only supports a list or dictionary for configuration"
+ if isinstance(config, dict):
+ config = teuthology.replace_all_with_clients(ctx.cluster, config)
+ runs = config.items()
+ else:
+ runs = [(role, None) for role in config]
+
+ running_xfstests = {}
+ for role, properties in runs:
+ assert role.startswith('client.'), \
+ "task xfstests can only run on client nodes"
+ for host, roles_for_host in ctx.cluster.remotes.items():
+ if role in roles_for_host:
+ assert host not in running_xfstests, \
+ "task xfstests allows only one instance at a time per host"
+ running_xfstests[host] = True
+
+ images_config = {}
+ scratch_config = {}
+ modprobe_config = {}
+ image_map_config = {}
+ scratch_map_config = {}
+ xfstests_config = {}
+ for role, properties in runs:
+ if properties is None:
+ properties = {}
+
+ test_image = properties.get('test_image', 'test_image.{role}'.format(role=role))
+ test_size = properties.get('test_size', 10000) # 10G
+ test_fmt = properties.get('test_format', 1)
+ scratch_image = properties.get('scratch_image', 'scratch_image.{role}'.format(role=role))
+ scratch_size = properties.get('scratch_size', 10000) # 10G
+ scratch_fmt = properties.get('scratch_format', 1)
+
+ images_config[role] = dict(
+ image_name=test_image,
+ image_size=test_size,
+ image_format=test_fmt,
+ )
+
+ scratch_config[role] = dict(
+ image_name=scratch_image,
+ image_size=scratch_size,
+ image_format=scratch_fmt,
+ )
+
+ xfstests_branch = properties.get('xfstests_branch', 'master')
+ xfstests_url = properties.get('xfstests_url', 'https://raw.github.com/ceph/ceph/{branch}/qa'.format(branch=xfstests_branch))
+
+ xfstests_config[role] = dict(
+ count=properties.get('count', 1),
+ test_dev='/dev/rbd/rbd/{image}'.format(image=test_image),
+ scratch_dev='/dev/rbd/rbd/{image}'.format(image=scratch_image),
+ fs_type=properties.get('fs_type', 'xfs'),
+ randomize=properties.get('randomize', False),
+ tests=properties.get('tests'),
+ xfstests_url=xfstests_url,
+ )
+
+ log.info('Setting up xfstests using RBD images:')
+ log.info(' test ({size} MB): {image}'.format(size=test_size,
+ image=test_image))
+ log.info(' scratch ({size} MB): {image}'.format(size=scratch_size,
+ image=scratch_image))
+ modprobe_config[role] = None
+ image_map_config[role] = test_image
+ scratch_map_config[role] = scratch_image
+
+ with contextutil.nested(
+ lambda: create_image(ctx=ctx, config=images_config),
+ lambda: create_image(ctx=ctx, config=scratch_config),
+ lambda: modprobe(ctx=ctx, config=modprobe_config),
+ lambda: dev_create(ctx=ctx, config=image_map_config),
+ lambda: dev_create(ctx=ctx, config=scratch_map_config),
+ lambda: run_xfstests(ctx=ctx, config=xfstests_config),
+ ):
+ yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Create and mount an rbd image.
+
+ For example, you can specify which clients to run on::
+
+ tasks:
+ - ceph:
+ - rbd: [client.0, client.1]
+
+ There are a few image options::
+
+ tasks:
+ - ceph:
+ - rbd:
+ client.0: # uses defaults
+ client.1:
+ image_name: foo
+ image_size: 2048
+ image_format: 2
+ fs_type: xfs
+
+ To use default options on all clients::
+
+ tasks:
+ - ceph:
+ - rbd:
+ all:
+
+ To create 20GiB images and format them with xfs on all clients::
+
+ tasks:
+ - ceph:
+ - rbd:
+ all:
+ image_size: 20480
+ fs_type: xfs
+ """
+ if config is None:
+ config = { 'all': None }
+ norm_config = config
+ if isinstance(config, dict):
+ norm_config = teuthology.replace_all_with_clients(ctx.cluster, config)
+ if isinstance(norm_config, dict):
+ role_images = {}
+ for role, properties in norm_config.iteritems():
+ if properties is None:
+ properties = {}
+ role_images[role] = properties.get('image_name')
+ else:
+ role_images = norm_config
+
+ log.debug('rbd config is: %s', norm_config)
+
+ with contextutil.nested(
+ lambda: create_image(ctx=ctx, config=norm_config),
+ lambda: modprobe(ctx=ctx, config=norm_config),
+ lambda: dev_create(ctx=ctx, config=role_images),
+ lambda: generic_mkfs(ctx=ctx, config=norm_config,
+ devname_rtn=rbd_devname_rtn),
+ lambda: generic_mount(ctx=ctx, config=role_images,
+ devname_rtn=rbd_devname_rtn),
+ ):
+ yield
--- /dev/null
+"""
+ Long-running fio tests on rbd-mapped devices for the formats/features provided in the config.
+ Many fio parameters can be configured, so this task can be used along with thrash/power-cut tests
+ and exercise IO on the full disk for all formats/features.
+ - This test should not be run on a VM due to its heavy resource use.
+
+"""
+import contextlib
+import json
+import logging
+import StringIO
+import re
+
+from teuthology.parallel import parallel
+from teuthology import misc as teuthology
+from tempfile import NamedTemporaryFile
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ client.0:
+ fio-io-size: 100g or 80% or 100m
+ fio-version: 2.2.9
+ formats: [2]
+ features: [[layering],[striping],[layering,exclusive-lock,object-map]]
+ test-clone-io: 1 #remove this option to not run create rbd clone and not run io on clone
+ io-engine: "sync or rbd or any io-engine"
+ rw: randrw
+ client.1:
+ fio-io-size: 100g
+ fio-version: 2.2.9
+ rw: read
+        image-size: 20480
+
+or
+ all:
+ fio-io-size: 400g
+ rw: randrw
+ formats: [2]
+ features: [[layering],[striping]]
+ io-engine: libaio
+
+    Create an rbd image + device and exercise IO for the formats/features provided in the config file.
+    Config can be per client, or one config can be used for all clients; fio jobs are run in parallel for the clients provided.
+
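+    A minimal sketch of a suite fragment using this task (the task name is
+    assumed from this module)::
+
+        tasks:
+        - ceph:
+        - rbd_fio:
+            all:
+              fio-io-size: 100m
+              formats: [2]
+              features: [[layering]]
+              io-engine: libaio
+              rw: randrw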
+ """
+ if config.get('all'):
+ client_config = config['all']
+ clients = ctx.cluster.only(teuthology.is_type('client'))
+ rbd_test_dir = teuthology.get_testdir(ctx) + "/rbd_fio_test"
+ for remote,role in clients.remotes.iteritems():
+ if 'client_config' in locals():
+ with parallel() as p:
+ p.spawn(run_fio, remote, client_config, rbd_test_dir)
+ else:
+ for client_config in config:
+ if client_config in role:
+ with parallel() as p:
+ p.spawn(run_fio, remote, config[client_config], rbd_test_dir)
+
+ yield
+
+
+def run_fio(remote, config, rbd_test_dir):
+ """
+ create fio config file with options based on above config
+ get the fio from github, generate binary, and use it to run on
+ the generated fio config file
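+
+    A job file produced by this function looks roughly like the following
+    (illustrative only; values depend on the config passed in, and this shows
+    the non-rbd ioengine path where the image is mapped to a /dev/rbdN device)::
+
+        [global]
+        ioengine=libaio
+        bs=4k
+        iodepth=2
+        size=100m
+        time_based
+        runtime=1800
+        allow_file_create=0
+        norandommap
+
+        [/dev/rbd0]
+        rw=randrw
+        filename=/dev/rbd0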
+ """
+ fio_config=NamedTemporaryFile(prefix='fio_rbd_', dir='/tmp/', delete=False)
+ fio_config.write('[global]\n')
+ if config.get('io-engine'):
+ ioengine=config['io-engine']
+ fio_config.write('ioengine={ioe}\n'.format(ioe=ioengine))
+    else:
+        # default to the sync engine; 'ioengine' is referenced again below
+        ioengine = 'sync'
+        fio_config.write('ioengine=sync\n')
+ if config.get('bs'):
+ bs=config['bs']
+ fio_config.write('bs={bs}\n'.format(bs=bs))
+ else:
+ fio_config.write('bs=4k\n')
+ fio_config.write('iodepth=2\n')
+ if config.get('fio-io-size'):
+ size=config['fio-io-size']
+ fio_config.write('size={size}\n'.format(size=size))
+ else:
+ fio_config.write('size=100m\n')
+
+ fio_config.write('time_based\n')
+ if config.get('runtime'):
+ runtime=config['runtime']
+ fio_config.write('runtime={runtime}\n'.format(runtime=runtime))
+ else:
+ fio_config.write('runtime=1800\n')
+ fio_config.write('allow_file_create=0\n')
+ image_size=10240
+ if config.get('image_size'):
+ image_size=config['image_size']
+
+ formats=[1,2]
+ features=[['layering'],['striping'],['exclusive-lock','object-map']]
+ fio_version='2.7'
+ if config.get('formats'):
+ formats=config['formats']
+ if config.get('features'):
+ features=config['features']
+ if config.get('fio-version'):
+ fio_version=config['fio-version']
+
+ fio_config.write('norandommap\n')
+ if ioengine == 'rbd':
+ fio_config.write('invalidate=0\n')
+ #handle package required for librbd engine
+ sn=remote.shortname
+ system_type= teuthology.get_system_type(remote)
+ if system_type == 'rpm' and ioengine == 'rbd':
+ log.info("Installing librbd1 devel package on {sn}".format(sn=sn))
+ remote.run(args=['sudo', 'yum' , 'install', 'librbd1-devel', '-y'])
+ elif ioengine == 'rbd':
+ log.info("Installing librbd devel package on {sn}".format(sn=sn))
+ remote.run(args=['sudo', 'apt-get', '-y',
+ '--force-yes',
+ 'install', 'librbd-dev'])
+ if ioengine == 'rbd':
+ fio_config.write('clientname=admin\n')
+ fio_config.write('pool=rbd\n')
+ for frmt in formats:
+ for feature in features:
+ log.info("Creating rbd images on {sn}".format(sn=sn))
+ feature_name = '-'.join(feature)
+ rbd_name = 'i{i}f{f}{sn}'.format(i=frmt,f=feature_name,sn=sn)
+ rbd_snap_name = 'i{i}f{f}{sn}@i{i}f{f}{sn}Snap'.format(i=frmt,f=feature_name,sn=sn)
+ rbd_clone_name = 'i{i}f{f}{sn}Clone'.format(i=frmt,f=feature_name,sn=sn)
+ create_args=['rbd', 'create',
+ '--size', '{size}'.format(size=image_size),
+ '--image', rbd_name,
+ '--image-format', '{f}'.format(f=frmt)]
+ map(lambda x: create_args.extend(['--image-feature', x]), feature)
+ remote.run(args=create_args)
+ remote.run(args=['rbd', 'info', rbd_name])
+ if ioengine != 'rbd':
+ out=StringIO.StringIO()
+ remote.run(args=['sudo', 'rbd', 'map', rbd_name ],stdout=out)
+ dev=re.search(r'(/dev/rbd\d+)',out.getvalue())
+ rbd_dev=dev.group(1)
+ if config.get('test-clone-io'):
+ log.info("Testing clones using fio")
+ remote.run(args=['rbd', 'snap', 'create', rbd_snap_name])
+ remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name])
+ remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name])
+ remote.run(args=['sudo', 'rbd', 'map', rbd_clone_name], stdout=out)
+ dev=re.search(r'(/dev/rbd\d+)',out.getvalue())
+ rbd_clone_dev=dev.group(1)
+ fio_config.write('[{rbd_dev}]\n'.format(rbd_dev=rbd_dev))
+ if config.get('rw'):
+ rw=config['rw']
+ fio_config.write('rw={rw}\n'.format(rw=rw))
+                else:
+                    rw = 'randrw'
+                    fio_config.write('rw=randrw\n')
+ fio_config.write('filename={rbd_dev}\n'.format(rbd_dev=rbd_dev))
+ if config.get('test-clone-io'):
+ fio_config.write('[{rbd_clone_dev}]\n'.format(rbd_clone_dev=rbd_clone_dev))
+ fio_config.write('rw={rw}\n'.format(rw=rw))
+ fio_config.write('filename={rbd_clone_dev}\n'.format(rbd_clone_dev=rbd_clone_dev))
+ else:
+ if config.get('test-clone-io'):
+ log.info("Testing clones using fio")
+ remote.run(args=['rbd', 'snap', 'create', rbd_snap_name])
+ remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name])
+ remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name])
+ fio_config.write('[{img_name}]\n'.format(img_name=rbd_name))
+ if config.get('rw'):
+ rw=config['rw']
+ fio_config.write('rw={rw}\n'.format(rw=rw))
+                else:
+                    rw = 'randrw'
+                    fio_config.write('rw=randrw\n')
+ fio_config.write('rbdname={img_name}\n'.format(img_name=rbd_name))
+ if config.get('test-clone-io'):
+ fio_config.write('[{clone_img_name}]\n'.format(clone_img_name=rbd_clone_name))
+ fio_config.write('rw={rw}\n'.format(rw=rw))
+ fio_config.write('rbdname={clone_img_name}\n'.format(clone_img_name=rbd_clone_name))
+
+
+ fio_config.close()
+ remote.put_file(fio_config.name,fio_config.name)
+ try:
+ log.info("Running rbd feature - fio test on {sn}".format(sn=sn))
+ fio = "https://github.com/axboe/fio/archive/fio-" + fio_version + ".tar.gz"
+ remote.run(args=['mkdir', run.Raw(rbd_test_dir),])
+ remote.run(args=['cd' , run.Raw(rbd_test_dir),
+ run.Raw(';'), 'wget' , fio , run.Raw(';'), run.Raw('tar -xvf fio*tar.gz'), run.Raw(';'),
+ run.Raw('cd fio-fio*'), 'configure', run.Raw(';') ,'make'])
+ remote.run(args=['ceph', '-s'])
+ remote.run(args=['sudo', run.Raw('{tdir}/fio-fio-{v}/fio {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))])
+ remote.run(args=['ceph', '-s'])
+ finally:
+ out=StringIO.StringIO()
+ remote.run(args=['rbd','showmapped', '--format=json'], stdout=out)
+ mapped_images = json.loads(out.getvalue())
+ if mapped_images:
+ log.info("Unmapping rbd images on {sn}".format(sn=sn))
+ for image in mapped_images.itervalues():
+ remote.run(args=['sudo', 'rbd', 'unmap', str(image['device'])])
+ log.info("Cleaning up fio install")
+ remote.run(args=['rm','-rf', run.Raw(rbd_test_dir)])
+ if system_type == 'rpm' and ioengine == 'rbd':
+ log.info("Uninstall librbd1 devel package on {sn}".format(sn=sn))
+ remote.run(args=['sudo', 'yum' , 'remove', 'librbd1-devel', '-y'])
+ elif ioengine == 'rbd':
+ log.info("Uninstall librbd devel package on {sn}".format(sn=sn))
+ remote.run(args=['sudo', 'apt-get', '-y', 'remove', 'librbd-dev'])
--- /dev/null
+"""
+Run fsx on an rbd image
+"""
+import contextlib
+import logging
+
+from teuthology.parallel import parallel
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run fsx on an rbd image.
+
+ Currently this requires running as client.admin
+ to create a pool.
+
+ Specify which clients to run on as a list::
+
+ tasks:
+ ceph:
+ rbd_fsx:
+ clients: [client.0, client.1]
+
+ You can optionally change some properties of fsx:
+
+ tasks:
+ ceph:
+ rbd_fsx:
+ clients: <list of clients>
+ seed: <random seed number, or 0 to use the time>
+ ops: <number of operations to do>
+ size: <maximum image size in bytes>
+ valgrind: [--tool=<valgrind tool>]
+ """
+ log.info('starting rbd_fsx...')
+ with parallel() as p:
+ for role in config['clients']:
+ p.spawn(_run_one_client, ctx, config, role)
+ yield
+
+def _run_one_client(ctx, config, role):
+ """Spawned task that runs the client"""
+ krbd = config.get('krbd', False)
+ nbd = config.get('nbd', False)
+ testdir = teuthology.get_testdir(ctx)
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+
+ args = []
+ if krbd or nbd:
+ args.append('sudo') # rbd(-nbd) map/unmap need privileges
+ args.extend([
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir)
+ ])
+
+ overrides = ctx.config.get('overrides', {})
+ teuthology.deep_merge(config, overrides.get('rbd_fsx', {}))
+
+ if config.get('valgrind'):
+ args = teuthology.get_valgrind_args(
+ testdir,
+ 'fsx_{id}'.format(id=role),
+ args,
+ config.get('valgrind')
+ )
+
+ args.extend([
+ 'ceph_test_librbd_fsx',
+ '-d', # debug output for all operations
+ '-W', '-R', # mmap doesn't work with rbd
+ '-p', str(config.get('progress_interval', 100)), # show progress
+ '-P', '{tdir}/archive'.format(tdir=testdir),
+ '-r', str(config.get('readbdy',1)),
+ '-w', str(config.get('writebdy',1)),
+ '-t', str(config.get('truncbdy',1)),
+ '-h', str(config.get('holebdy',1)),
+ '-l', str(config.get('size', 250000000)),
+ '-S', str(config.get('seed', 0)),
+ '-N', str(config.get('ops', 1000)),
+ ])
+ if krbd:
+ args.append('-K') # -K enables krbd mode
+ if nbd:
+ args.append('-M') # -M enables nbd mode
+ if config.get('direct_io', False):
+ args.append('-Z') # -Z use direct IO
+ if not config.get('randomized_striping', True):
+ args.append('-U') # -U disables randomized striping
+ if not config.get('punch_holes', True):
+ args.append('-H') # -H disables discard ops
+ if config.get('journal_replay', False):
+ args.append('-j') # -j replay all IO events from journal
+ args.extend([
+ 'pool_{pool}'.format(pool=role),
+ 'image_{image}'.format(image=role),
+ ])
+
+ remote.run(args=args)
--- /dev/null
+"""
+Task for running rbd mirroring daemons and configuring mirroring
+"""
+
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc
+from teuthology.exceptions import ConfigError
+from teuthology.task import Task
+from util import get_remote_for_role
+
+log = logging.getLogger(__name__)
+
+
+class RBDMirror(Task):
+ """
+ Run an rbd-mirror daemon to sync rbd images between clusters.
+
+ This requires two clients (one from each cluster) on the same host
+ to connect with. The pool configuration should be adjusted by later
+ test scripts to include the remote client and cluster name. This task
+ just needs to know how to connect to the local cluster.
+
+ For example:
+
+ roles:
+ - [primary.mon.a, primary.osd.0, primary.osd.1, primary.osd.2]
+ - [secondary.mon.a, secondary.osd.0, secondary.osd.1, secondary.osd.2]
+ - [primary.client.mirror, secondary.client.mirror]
+ tasks:
+ - ceph:
+ cluster: primary
+ - ceph:
+ cluster: secondary
+ - rbd-mirror:
+ client: primary.client.mirror
+
+    To mirror back to the primary cluster as well, add another
+    rbd-mirror instance:
+
+ - rbd-mirror:
+ client: secondary.client.mirror
+
+ Possible options for this task are:
+
+ client: role - ceph client to connect as
+ valgrind: [--tool=<valgrind tool>] - none by default
+ coverage: bool - whether this run may be collecting coverage data
+ """
+ def __init__(self, ctx, config):
+ super(RBDMirror, self).__init__(ctx, config)
+ self.log = log
+
+ def setup(self):
+ super(RBDMirror, self).setup()
+ try:
+ self.client = self.config['client']
+ except KeyError:
+ raise ConfigError('rbd-mirror requires a client to connect with')
+
+ self.cluster_name, type_, self.client_id = misc.split_role(self.client)
+
+ if type_ != 'client':
+ msg = 'client role ({0}) must be a client'.format(self.client)
+ raise ConfigError(msg)
+
+ self.remote = get_remote_for_role(self.ctx, self.client)
+
+ def begin(self):
+ super(RBDMirror, self).begin()
+ testdir = misc.get_testdir(self.ctx)
+ daemon_signal = 'kill'
+ if 'coverage' in self.config or 'valgrind' in self.config:
+ daemon_signal = 'term'
+
+ args = [
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'daemon-helper',
+ daemon_signal,
+ ]
+
+ if 'valgrind' in self.config:
+ args = misc.get_valgrind_args(
+ testdir,
+ 'rbd-mirror-{id}'.format(id=self.client),
+ args,
+ self.config.get('valgrind')
+ )
+
+ args.extend([
+ 'rbd-mirror',
+ '--cluster',
+ self.cluster_name,
+ '--id',
+ self.client_id,
+ ])
+
+ self.ctx.daemons.add_daemon(
+ self.remote, 'rbd-mirror', self.client,
+ cluster=self.cluster_name,
+ args=args,
+ logger=self.log.getChild(self.client),
+ stdin=run.PIPE,
+ wait=False,
+ )
+
+ def end(self):
+ mirror_daemon = self.ctx.daemons.get_daemon('rbd-mirror',
+ self.client,
+ self.cluster_name)
+ mirror_daemon.stop()
+ super(RBDMirror, self).end()
+
+task = RBDMirror
--- /dev/null
+"""
+Test if we can recover the mon store (leveldb) from the OSDs when all mon
+leveldbs are corrupted
+"""
+
+import logging
+import os.path
+import shutil
+import tempfile
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+def push_directory(path, remote, remote_dir):
+ """
+ local_temp_path=`mktemp`
+ tar czf $local_temp_path $path
+ ssh remote mkdir -p remote_dir
+ remote_temp_path=`mktemp`
+ scp $local_temp_path $remote_temp_path
+ rm $local_temp_path
+ tar xzf $remote_temp_path -C $remote_dir
+    ssh remote rm $remote_temp_path
+ """
+ fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
+ prefix='rebuild_mondb-')
+ os.close(fd)
+ cmd = ' '.join(['tar', 'cz',
+ '-f', local_temp_path,
+ '-C', path,
+ '--', '.'])
+ teuthology.sh(cmd)
+ _, fname = os.path.split(local_temp_path)
+ fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
+ prefix='rebuild_mondb-')
+ os.close(fd)
+ remote.put_file(local_temp_path, remote_temp_path)
+ os.remove(local_temp_path)
+ remote.run(args=['sudo',
+ 'tar', 'xz',
+ '-C', remote_dir,
+ '-f', remote_temp_path])
+ remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])
+
+
+def task(ctx, config):
+ """
+ Test monitor recovery from OSD
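+
+    A minimal sketch of a suite fragment using this task (the task name is
+    an assumption based on this module; it expects a cluster with mons and
+    osds already running)::
+
+        tasks:
+        - install:
+        - ceph:
+        - rebuild_mondb: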
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'task only accepts a dict for configuration'
+
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'))
+
+ mons = ctx.cluster.only(teuthology.is_type('mon'))
+ assert mons
+ # note down the first cluster_name and mon_id
+ # we will recover it later on
+ cluster_name = None
+ mon_id = None
+ for remote, roles in mons.remotes.iteritems():
+ is_mon = teuthology.is_type('mon')
+ for role in roles:
+ if not is_mon(role):
+ continue
+ cluster, _, m = teuthology.split_role(role)
+ if cluster_name is None:
+ cluster_name = cluster
+ mon_id = m
+ assert cluster_name == cluster
+ log.info('killing {cluster}:mon.{mon}'.format(
+ cluster=cluster,
+ mon=m))
+ manager.kill_mon(m)
+ mon_data = os.path.join('/var/lib/ceph/mon/',
+ '{0}-{1}'.format(cluster_name, m))
+ if m == mon_id:
+                # we only need to recreate the store.db for the first mon;
+                # that is easier than running mkfs on it and then replacing
+                # its store.db with the recovered one
+ store_dir = os.path.join(mon_data, 'store.db')
+ remote.run(args=['sudo', 'rm', '-r', store_dir])
+ else:
+ remote.run(args=['sudo', 'rm', '-r', mon_data])
+
+ local_mstore = tempfile.mkdtemp()
+
+ # collect the maps from all OSDs
+ osds = ctx.cluster.only(teuthology.is_type('osd'))
+ assert osds
+ for osd, roles in osds.remotes.iteritems():
+ is_osd = teuthology.is_type('osd')
+ for role in roles:
+ if not is_osd(role):
+ continue
+ cluster, _, osd_id = teuthology.split_role(role)
+ assert cluster_name == cluster
+ log.info('collecting maps from {cluster}:osd.{osd}'.format(
+ cluster=cluster,
+ osd=osd_id))
+ # push leveldb to OSD
+ osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
+ osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])
+
+ push_directory(local_mstore, osd, osd_mstore)
+ log.info('rm -rf {0}'.format(local_mstore))
+ shutil.rmtree(local_mstore)
+ # update leveldb with OSD data
+ options = '--op update-mon-db --mon-store-path {0}'
+ log.info('running ceph-objectstore-tool with mon-store-path {0}'.format(osd_mstore))
+ manager.objectstore_tool(pool=None,
+ options=options.format(osd_mstore),
+ args='',
+ osd=osd_id,
+ do_revive=False)
+ # pull the updated mon db
+ log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
+ local_mstore = tempfile.mkdtemp()
+ teuthology.pull_directory(osd, osd_mstore, local_mstore)
+ log.info('rm -rf osd:{0}'.format(osd_mstore))
+ osd.run(args=['sudo', 'rm', '-fr', osd_mstore])
+
+ # recover the first_mon with the rebuilt mon db:
+ # push the recovered leveldb from the local node to the mon
+ mon_store_dir = os.path.join('/var/lib/ceph/mon',
+ '{0}-{1}'.format(cluster_name, mon_id))
+ push_directory(local_mstore, mon, mon_store_dir)
+ mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
+ shutil.rmtree(local_mstore)
+ default_keyring = '/etc/ceph/{cluster}.keyring'.format(
+ cluster=cluster_name)
+ keyring_path = config.get('keyring_path', default_keyring)
+ # fill up the caps in the keyring file
+ mon.run(args=['sudo',
+ 'ceph-authtool', keyring_path,
+ '-n', 'mon.',
+ '--cap', 'mon', 'allow *'])
+ mon.run(args=['sudo',
+ 'ceph-authtool', keyring_path,
+ '-n', 'client.admin',
+ '--cap', 'mon', 'allow *',
+ '--cap', 'osd', 'allow *',
+ '--cap', 'mds', 'allow *'])
+ mon.run(args=['sudo', '-u', 'ceph',
+ 'ceph-monstore-tool', mon_store_dir,
+ 'rebuild', '--', '--keyring',
+ keyring_path])
+
+ # revive monitors
+ # the initial monmap is in the ceph.conf, so we are good.
+ n_mons = 0
+ for remote, roles in mons.remotes.iteritems():
+ is_mon = teuthology.is_type('mon')
+ for role in roles:
+ if not is_mon(role):
+ continue
+ cluster, _, m = teuthology.split_role(role)
+ assert cluster_name == cluster
+ if mon_id != m:
+ log.info('running mkfs on {cluster}:mon.{mon}'.format(
+ cluster=cluster,
+ mon=m))
+ remote.run(
+ args=[
+ 'sudo',
+ 'ceph-mon',
+ '--cluster', cluster,
+ '--mkfs',
+ '-i', m,
+ '--keyring', keyring_path])
+ manager.revive_mon(m)
+ n_mons += 1
+
+ manager.wait_for_mon_quorum_size(n_mons, timeout=30)
+ for osd, roles in osds.remotes.iteritems():
+ is_osd = teuthology.is_type('osd')
+ for role in roles:
+ if not is_osd(role):
+ continue
+ _, _, osd_id = teuthology.split_role(role)
+ log.info('reviving osd.{0}'.format(osd_id))
+ manager.revive_osd(osd_id)
--- /dev/null
+"""
+Recovery system benchmarking
+"""
+from cStringIO import StringIO
+
+import contextlib
+import gevent
+import json
+import logging
+import random
+import time
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Benchmark the recovery system.
+
+ Generates objects with smalliobench, runs it normally to get a
+ baseline performance measurement, then marks an OSD out and reruns
+ to measure performance during recovery.
+
+ The config should be as follows:
+
+ recovery_bench:
+ duration: <seconds for each measurement run>
+ num_objects: <number of objects>
+ io_size: <io size in bytes>
+
+ example:
+
+ tasks:
+ - ceph:
+ - recovery_bench:
+ duration: 60
+ num_objects: 500
+ io_size: 4096
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'recovery_bench task only accepts a dict for configuration'
+
+ log.info('Beginning recovery bench...')
+
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+ while len(manager.get_osd_status()['up']) < num_osds:
+ time.sleep(10)
+
+ bench_proc = RecoveryBencher(
+ manager,
+ config,
+ )
+ try:
+ yield
+ finally:
+ log.info('joining recovery bencher')
+ bench_proc.do_join()
+
+class RecoveryBencher:
+ """
+ RecoveryBencher
+ """
+ def __init__(self, manager, config):
+ self.ceph_manager = manager
+ self.ceph_manager.wait_for_clean()
+
+ osd_status = self.ceph_manager.get_osd_status()
+ self.osds = osd_status['up']
+
+ self.config = config
+ if self.config is None:
+ self.config = dict()
+
+ else:
+ def tmp(x):
+ """
+ Local wrapper to print value.
+ """
+ print x
+ self.log = tmp
+
+ log.info("spawning thread")
+
+ self.thread = gevent.spawn(self.do_bench)
+
+ def do_join(self):
+ """
+ Join the recovery bencher. This is called after the main
+ task exits.
+ """
+ self.thread.get()
+
+ def do_bench(self):
+ """
+ Do the benchmarking.
+ """
+ duration = self.config.get("duration", 60)
+ num_objects = self.config.get("num_objects", 500)
+ io_size = self.config.get("io_size", 4096)
+
+ osd = str(random.choice(self.osds))
+ (osd_remote,) = self.ceph_manager.ctx.cluster.only('osd.%s' % osd).remotes.iterkeys()
+
+ testdir = teuthology.get_testdir(self.ceph_manager.ctx)
+
+ # create the objects
+ osd_remote.run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'smalliobench',
+ '--use-prefix', 'recovery_bench',
+ '--init-only', '1',
+ '--num-objects', str(num_objects),
+ '--io-size', str(io_size),
+ ],
+ wait=True,
+ )
+
+ # baseline bench
+ log.info('non-recovery (baseline)')
+ p = osd_remote.run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'smalliobench',
+ '--use-prefix', 'recovery_bench',
+ '--do-not-init', '1',
+ '--duration', str(duration),
+ '--io-size', str(io_size),
+ ],
+ stdout=StringIO(),
+ stderr=StringIO(),
+ wait=True,
+ )
+ self.process_samples(p.stderr.getvalue())
+
+ self.ceph_manager.raw_cluster_cmd('osd', 'out', osd)
+ time.sleep(5)
+
+ # recovery bench
+ log.info('recovery active')
+ p = osd_remote.run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'smalliobench',
+ '--use-prefix', 'recovery_bench',
+ '--do-not-init', '1',
+ '--duration', str(duration),
+ '--io-size', str(io_size),
+ ],
+ stdout=StringIO(),
+ stderr=StringIO(),
+ wait=True,
+ )
+ self.process_samples(p.stderr.getvalue())
+
+ self.ceph_manager.raw_cluster_cmd('osd', 'in', osd)
+
+ def process_samples(self, input):
+ """
+ Extract samples from the input and process the results
+
+ :param input: input lines in JSON format
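+
+ Each parseable line is expected to look roughly like
+ {"type": "write", "latency": 0.0123} (field names taken from the
+ parsing below); lines that do not parse are ignored.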
+ """
+ lat = {}
+ for line in input.split('\n'):
+ try:
+ sample = json.loads(line)
+ samples = lat.setdefault(sample['type'], [])
+ samples.append(float(sample['latency']))
+ except Exception:
+ pass
+
+ for type in lat:
+ samples = lat[type]
+ samples.sort()
+
+ num = len(samples)
+
+ # median
+ if num & 1 == 1: # odd number of samples
+ median = samples[num / 2]
+ else:
+ median = (samples[num / 2] + samples[num / 2 - 1]) / 2
+
+ # 99%
+ ninety_nine = samples[int(num * 0.99)]
+
+ log.info("%s: median %f, 99%% %f" % (type, median, ninety_nine))
--- /dev/null
+"""
+Special regression test for tracker #11184
+
+Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid))
+
+This is accomplished by moving a pg that was not part of the split and still
+includes divergent priors.
+"""
+import logging
+import time
+from cStringIO import StringIO
+
+from teuthology import misc as teuthology
+from util.rados import rados
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+ """
+ Test handling of divergent entries during export / import,
+ as a regression test for tracker #11184
+
+ overrides:
+ ceph:
+ conf:
+ osd:
+ debug osd: 5
+
+ Requires 3 osds on a single test node.
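+
+ The export / import cycle exercised below amounts to running commands
+ of this shape (paths are illustrative, not the exact arguments built
+ by the code):
+
+ ceph-objectstore-tool --data-path <osd data> --journal-path <journal> --op export --pgid 1.0 --file <exp>
+ ceph-objectstore-tool --data-path <osd data> --journal-path <journal> --op remove --pgid 1.0
+ ceph-objectstore-tool --data-path <osd data> --journal-path <journal> --op import --file <exp>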
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'divergent_priors task only accepts a dict for configuration'
+
+ manager = ctx.managers['ceph']
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.raw_cluster_cmd('osd', 'set', 'noout')
+ manager.raw_cluster_cmd('osd', 'set', 'noin')
+ manager.raw_cluster_cmd('osd', 'set', 'nodown')
+ manager.wait_for_clean()
+
+ # something that is always there
+ dummyfile = '/etc/fstab'
+ dummyfile2 = '/etc/resolv.conf'
+ testdir = teuthology.get_testdir(ctx)
+
+ # create 1 pg pool
+ log.info('creating foo')
+ manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+
+ osds = [0, 1, 2]
+ for i in osds:
+ manager.set_config(i, osd_min_pg_log_entries=10)
+ manager.set_config(i, osd_max_pg_log_entries=10)
+ manager.set_config(i, osd_pg_log_trim_min=5)
+
+ # determine primary
+ divergent = manager.get_pg_primary('foo', 0)
+ log.info("primary and soon to be divergent is %d", divergent)
+ non_divergent = list(osds)
+ non_divergent.remove(divergent)
+
+ log.info('writing initial objects')
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ # write 100 objects
+ for i in range(100):
+ rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+ manager.wait_for_clean()
+
+ # blackhole non_divergent
+ log.info("blackholing osds %s", str(non_divergent))
+ for i in non_divergent:
+ manager.set_config(i, objectstore_blackhole=1)
+
+ DIVERGENT_WRITE = 5
+ DIVERGENT_REMOVE = 5
+ # Write some soon to be divergent
+ log.info('writing divergent objects')
+ for i in range(DIVERGENT_WRITE):
+ rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
+ dummyfile2], wait=False)
+ # Remove some soon to be divergent
+ log.info('remove divergent objects')
+ for i in range(DIVERGENT_REMOVE):
+ rados(ctx, mon, ['-p', 'foo', 'rm',
+ 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
+ time.sleep(10)
+ mon.run(
+ args=['killall', '-9', 'rados'],
+ wait=True,
+ check_status=False)
+
+ # kill all the osds but leave divergent in
+ log.info('killing all the osds')
+ for i in osds:
+ manager.kill_osd(i)
+ for i in osds:
+ manager.mark_down_osd(i)
+ for i in non_divergent:
+ manager.mark_out_osd(i)
+
+ # bring up non-divergent
+ log.info("bringing up non_divergent %s", str(non_divergent))
+ for i in non_divergent:
+ manager.revive_osd(i)
+ for i in non_divergent:
+ manager.mark_in_osd(i)
+
+ # write 1 non-divergent object (ensure that old divergent one is divergent)
+ objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
+ log.info('writing non-divergent object ' + objname)
+ rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
+
+ manager.wait_for_recovery()
+
+ # ensure no recovery of up osds first
+ log.info('delay recovery')
+ for i in non_divergent:
+ manager.wait_run_admin_socket(
+ 'osd', i, ['set_recovery_delay', '100000'])
+
+ # bring in our divergent friend
+ log.info("revive divergent %d", divergent)
+ manager.raw_cluster_cmd('osd', 'set', 'noup')
+ manager.revive_osd(divergent)
+
+ log.info('delay recovery divergent')
+ manager.wait_run_admin_socket(
+ 'osd', divergent, ['set_recovery_delay', '100000'])
+
+ manager.raw_cluster_cmd('osd', 'unset', 'noup')
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+
+ log.info('wait for peering')
+ rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+ # At this point the divergent_priors should have been detected
+
+ log.info("killing divergent %d", divergent)
+ manager.kill_osd(divergent)
+
+ # Split pgs for pool foo
+ manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2')
+ time.sleep(5)
+
+ # Export a pg
+ (exp_remote,) = ctx.\
+ cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
+ FSPATH = manager.get_filepath()
+ JPATH = os.path.join(FSPATH, "journal")
+ prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+ "--data-path {fpath} --journal-path {jpath} "
+ "--log-file="
+ "/var/log/ceph/objectstore_tool.$$.log ".
+ format(fpath=FSPATH, jpath=JPATH))
+ pid = os.getpid()
+ expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
+ cmd = ((prefix + "--op export --pgid 1.0 --file {file}").
+ format(id=divergent, file=expfile))
+ proc = exp_remote.run(args=cmd, wait=True,
+ check_status=False, stdout=StringIO())
+ assert proc.exitstatus == 0
+
+ # Remove the same pg that was exported
+ cmd = ((prefix + "--op remove --pgid 1.0").
+ format(id=divergent, file=expfile))
+ proc = exp_remote.run(args=cmd, wait=True,
+ check_status=False, stdout=StringIO())
+ assert proc.exitstatus == 0
+
+ # Kill one of non-divergent OSDs
+ log.info('killing osd.%d' % non_divergent[1])
+ manager.kill_osd(non_divergent[1])
+ manager.mark_down_osd(non_divergent[1])
+ # manager.mark_out_osd(non_divergent[1])
+
+ cmd = ((prefix + "--op import --file {file}").
+ format(id=non_divergent[1], file=expfile))
+ proc = exp_remote.run(args=cmd, wait=True,
+ check_status=False, stdout=StringIO())
+ assert proc.exitstatus == 0
+
+ # bring in our divergent friend and other node
+ log.info("revive divergent %d", divergent)
+ manager.revive_osd(divergent)
+ manager.mark_in_osd(divergent)
+ log.info("revive %d", non_divergent[1])
+ manager.revive_osd(non_divergent[1])
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+
+ log.info('delay recovery divergent')
+ manager.set_config(divergent, osd_recovery_delay_start=100000)
+ log.info('mark divergent in')
+ manager.mark_in_osd(divergent)
+
+ log.info('wait for peering')
+ rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+ log.info("killing divergent %d", divergent)
+ manager.kill_osd(divergent)
+ log.info("reviving divergent %d", divergent)
+ manager.revive_osd(divergent)
+ time.sleep(3)
+
+ log.info('allowing recovery')
+ # Set osd_recovery_delay_start back to 0 and kick the queue
+ for i in osds:
+ manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
+ 'kick_recovery_wq', ' 0')
+
+ log.info('reading divergent objects')
+ for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
+ exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
+ '/tmp/existing'])
+ assert exit_status == 0
+
+ (remote,) = ctx.\
+ cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
+ cmd = 'rm {file}'.format(file=expfile)
+ remote.run(args=cmd, wait=True)
+ log.info("success")
--- /dev/null
+"""
+Lost_unfound
+"""
+import logging
+from teuthology.orchestra import run
+import ceph_manager
+import time
+from teuthology import misc as teuthology
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Test handling of lost objects.
+
+ A pretty rigid cluster is brought up and tested by this task.
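+
+ The only option read from the config is 'parallel_bench' (default:
+ true), which runs a rados bench in parallel while the unfound objects
+ are being marked lost, e.g.:
+
+ parallel_bench: false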
+ """
+ POOL = 'unfounddel_pool'
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'lost_unfound task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_clean()
+
+ manager.create_pool(POOL)
+
+ # something that is always there
+ dummyfile = '/etc/fstab'
+
+ # take an osd out until the very end
+ manager.kill_osd(2)
+ manager.mark_down_osd(2)
+ manager.mark_out_osd(2)
+
+ # kludge to make sure they get a map
+ rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.wait_for_recovery()
+
+ # create old objects
+ for f in range(1, 10):
+ rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])
+
+ # delay recovery, and make the pg log very long (to prevent backfill)
+ manager.raw_cluster_cmd(
+ 'tell', 'osd.1',
+ 'injectargs',
+ '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+ )
+
+ manager.kill_osd(0)
+ manager.mark_down_osd(0)
+
+ for f in range(1, 10):
+ rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+ rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+
+ # bring osd.0 back up, let it peer, but don't replicate the new
+ # objects...
+ log.info('osd.0 command_args is %s',
+ ctx.daemons.get_daemon('osd', 0).command_args)
+ ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
+ '--osd-recovery-delay-start', '1000'
+ ])
+ manager.revive_osd(0)
+ manager.mark_in_osd(0)
+ manager.wait_till_osd_is_up(0)
+
+ manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.wait_till_active()
+
+ # take out osd.1 and the only copy of those objects.
+ manager.kill_osd(1)
+ manager.mark_down_osd(1)
+ manager.mark_out_osd(1)
+ manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+
+ # bring up osd.2 so that things would otherwise, in theory, recover fully
+ manager.revive_osd(2)
+ manager.mark_in_osd(2)
+ manager.wait_till_osd_is_up(2)
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_till_active()
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+
+ # verify that there are unfound objects
+ unfound = manager.get_num_unfound_objects()
+ log.info("there are %d unfound objects" % unfound)
+ assert unfound
+
+ testdir = teuthology.get_testdir(ctx)
+ procs = []
+ if config.get('parallel_bench', True):
+ procs.append(mon.run(
+ args=[
+ "/bin/sh", "-c",
+ " ".join(['adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage',
+ 'rados',
+ '--no-log-to-stderr',
+ '--name', 'client.admin',
+ '-b', str(4<<10),
+ '-p' , POOL,
+ '-t', '20',
+ 'bench', '240', 'write',
+ ]).format(tdir=testdir),
+ ],
+ logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
+ stdin=run.PIPE,
+ wait=False
+ ))
+ time.sleep(10)
+
+ # mark stuff lost
+ pgs = manager.get_pg_stats()
+ for pg in pgs:
+ if pg['stat_sum']['num_objects_unfound'] > 0:
+ primary = 'osd.%d' % pg['acting'][0]
+
+ # verify that we can list them directly from the osd
+ log.info('listing missing/lost in %s state %s', pg['pgid'],
+ pg['state'])
+ m = manager.list_pg_missing(pg['pgid'])
+ #log.info('%s' % m)
+ assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+ num_unfound=0
+ for o in m['objects']:
+ if len(o['locations']) == 0:
+ num_unfound += 1
+ assert m['num_unfound'] == num_unfound
+
+ log.info("reverting unfound in %s on %s", pg['pgid'], primary)
+ manager.raw_cluster_cmd('pg', pg['pgid'],
+ 'mark_unfound_lost', 'delete')
+ else:
+ log.info("no unfound in %s", pg['pgid'])
+
+ manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+ manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+ manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+ manager.wait_for_recovery()
+
+ # verify result
+ for f in range(1, 10):
+ err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
+ assert err
+ err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
+ assert err
+ err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
+ assert err
+
+ # see if osd.1 can cope
+ manager.revive_osd(1)
+ manager.mark_in_osd(1)
+ manager.wait_till_osd_is_up(1)
+ manager.wait_for_clean()
+ run.wait(procs)
+
--- /dev/null
+"""
+Test pool repairing after objects are damaged.
+"""
+import logging
+import time
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+def choose_primary(manager, pool, num):
+ """
+ Return primary to test on.
+ """
+ log.info("Choosing primary")
+ return manager.get_pg_primary(pool, num)
+
+
+def choose_replica(manager, pool, num):
+ """
+ Return replica to test on.
+ """
+ log.info("Choosing replica")
+ return manager.get_pg_replica(pool, num)
+
+
+def trunc(manager, osd, pool, obj):
+ """
+ truncate an object
+ """
+ log.info("truncating object")
+ return manager.osd_admin_socket(
+ osd,
+ ['truncobj', pool, obj, '1'])
+
+
+def dataerr(manager, osd, pool, obj):
+ """
+ cause an error in the data
+ """
+ log.info("injecting data err on object")
+ return manager.osd_admin_socket(
+ osd,
+ ['injectdataerr', pool, obj])
+
+
+def mdataerr(manager, osd, pool, obj):
+ """
+ cause an error in the mdata
+ """
+ log.info("injecting mdata err on object")
+ return manager.osd_admin_socket(
+ osd,
+ ['injectmdataerr', pool, obj])
+
+
+def omaperr(manager, osd, pool, obj):
+ """
+ Cause an omap error.
+ """
+ log.info("injecting omap err on object")
+ return manager.osd_admin_socket(osd, ['setomapval', pool, obj,
+ 'badkey', 'badval'])
+
+
+def repair_test_1(manager, corrupter, chooser, scrub_type):
+ """
+ Creates an object in the pool, corrupts it,
+ scrubs it, and verifies that the pool is inconsistent. It then repairs
+ the pool, rescrubs it, and verifies that the pool is consistent
+
+ :param corrupter: error generating function (truncate, data-error, or
+ meta-data error, for example).
+ :param chooser: osd type chooser (primary or replica)
+ :param scrub_type: regular scrub or deep-scrub
+ """
+ pool = "repair_pool_1"
+ manager.wait_for_clean()
+ with manager.pool(pool, 1):
+
+ log.info("starting repair test type 1")
+ victim_osd = chooser(manager, pool, 0)
+
+ # create object
+ log.info("doing put")
+ manager.do_put(pool, 'repair_test_obj', '/etc/hosts')
+
+ # corrupt object
+ log.info("corrupting object")
+ corrupter(manager, victim_osd, pool, 'repair_test_obj')
+
+ # verify inconsistent
+ log.info("scrubbing")
+ manager.do_pg_scrub(pool, 0, scrub_type)
+
+ assert manager.pg_inconsistent(pool, 0)
+
+ # repair
+ log.info("repairing")
+ manager.do_pg_scrub(pool, 0, "repair")
+
+ log.info("re-scrubbing")
+ manager.do_pg_scrub(pool, 0, scrub_type)
+
+ # verify consistent
+ assert not manager.pg_inconsistent(pool, 0)
+ log.info("done")
+
+
+def repair_test_2(ctx, manager, config, chooser):
+ """
+ First creates a set of objects and
+ sets the omap value. It then corrupts an object, does both a scrub
+ and a deep-scrub, and then corrupts more objects. After that, it
+ repairs the pool and makes sure that the pool is consistent some
+ time after a deep-scrub.
+
+ :param chooser: primary or replica selection routine.
+ """
+ pool = "repair_pool_2"
+ manager.wait_for_clean()
+ with manager.pool(pool, 1):
+ log.info("starting repair test type 2")
+ victim_osd = chooser(manager, pool, 0)
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ # create object
+ log.info("doing put and setomapval")
+ manager.do_put(pool, 'file1', '/etc/hosts')
+ manager.do_rados(mon, ['-p', pool, 'setomapval', 'file1',
+ 'key', 'val'])
+ manager.do_put(pool, 'file2', '/etc/hosts')
+ manager.do_put(pool, 'file3', '/etc/hosts')
+ manager.do_put(pool, 'file4', '/etc/hosts')
+ manager.do_put(pool, 'file5', '/etc/hosts')
+ manager.do_rados(mon, ['-p', pool, 'setomapval', 'file5',
+ 'key', 'val'])
+ manager.do_put(pool, 'file6', '/etc/hosts')
+
+ # corrupt object
+ log.info("corrupting object")
+ omaperr(manager, victim_osd, pool, 'file1')
+
+ # verify inconsistent
+ log.info("scrubbing")
+ manager.do_pg_scrub(pool, 0, 'deep-scrub')
+
+ assert manager.pg_inconsistent(pool, 0)
+
+ # Regression test for bug #4778, should still
+ # be inconsistent after scrub
+ manager.do_pg_scrub(pool, 0, 'scrub')
+
+ assert manager.pg_inconsistent(pool, 0)
+
+ # Additional corruptions including 2 types for file1
+ log.info("corrupting more objects")
+ dataerr(manager, victim_osd, pool, 'file1')
+ mdataerr(manager, victim_osd, pool, 'file2')
+ trunc(manager, victim_osd, pool, 'file3')
+ omaperr(manager, victim_osd, pool, 'file6')
+
+ # see still inconsistent
+ log.info("scrubbing")
+ manager.do_pg_scrub(pool, 0, 'deep-scrub')
+
+ assert manager.pg_inconsistent(pool, 0)
+
+ # repair
+ log.info("repairing")
+ manager.do_pg_scrub(pool, 0, "repair")
+
+ # Let repair clear inconsistent flag
+ time.sleep(10)
+
+ # verify consistent
+ assert not manager.pg_inconsistent(pool, 0)
+
+ # In the future repair might determine state of
+ # inconsistency itself, verify with a deep-scrub
+ log.info("scrubbing")
+ manager.do_pg_scrub(pool, 0, 'deep-scrub')
+
+ # verify consistent
+ assert not manager.pg_inconsistent(pool, 0)
+
+ log.info("done")
+
+
+def hinfoerr(manager, victim, pool, obj):
+ """
+ cause an error in the hinfo_key
+ """
+ log.info("remove the hinfo_key")
+ manager.objectstore_tool(pool,
+ options='',
+ args='rm-attr hinfo_key',
+ object_name=obj,
+ osd=victim)
+
+
+def repair_test_erasure_code(manager, corrupter, victim, scrub_type):
+ """
+ Creates an object in the pool, corrupts it,
+ scrubs it, and verifies that the pool is inconsistent. It then repairs
+ the pool, rescrubs it, and verifies that the pool is consistent
+
+ :param corrupter: error generating function.
+ :param victim: the osd to corrupt ('primary' or 'replica')
+ :param scrub_type: regular scrub or deep-scrub
+ """
+ pool = "repair_pool_3"
+ manager.wait_for_clean()
+ with manager.pool(pool_name=pool, pg_num=1,
+ erasure_code_profile_name='default'):
+
+ log.info("starting repair test for erasure code")
+
+ # create object
+ log.info("doing put")
+ manager.do_put(pool, 'repair_test_obj', '/etc/hosts')
+
+ # corrupt object
+ log.info("corrupting object")
+ corrupter(manager, victim, pool, 'repair_test_obj')
+
+ # verify inconsistent
+ log.info("scrubbing")
+ manager.do_pg_scrub(pool, 0, scrub_type)
+
+ assert manager.pg_inconsistent(pool, 0)
+
+ # repair
+ log.info("repairing")
+ manager.do_pg_scrub(pool, 0, "repair")
+
+ log.info("re-scrubbing")
+ manager.do_pg_scrub(pool, 0, scrub_type)
+
+ # verify consistent
+ assert not manager.pg_inconsistent(pool, 0)
+ log.info("done")
+
+
+def task(ctx, config):
+ """
+ Test [deep] repair in several situations:
+ Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]
+
+ The config should be as follows:
+
+ Must include the log-whitelist below
+ Must enable filestore_debug_inject_read_err config
+
+ example:
+
+ tasks:
+ - chef:
+ - install:
+ - ceph:
+ log-whitelist:
+ - 'candidate had a stat error'
+ - 'candidate had a read error'
+ - 'deep-scrub 0 missing, 1 inconsistent objects'
+ - 'deep-scrub 0 missing, 4 inconsistent objects'
+ - 'deep-scrub [0-9]+ errors'
+ - '!= omap_digest'
+ - '!= data_digest'
+ - 'repair 0 missing, 1 inconsistent objects'
+ - 'repair 0 missing, 4 inconsistent objects'
+ - 'repair [0-9]+ errors, [0-9]+ fixed'
+ - 'scrub 0 missing, 1 inconsistent objects'
+ - 'scrub [0-9]+ errors'
+ - 'size 1 != size'
+ - 'attr name mismatch'
+ conf:
+ osd:
+ filestore debug inject read err: true
+ - repair_test:
+
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'repair_test task only accepts a dict for config'
+
+ manager = ctx.managers['ceph']
+ manager.wait_for_all_up()
+
+ manager.raw_cluster_cmd('osd', 'set', 'noscrub')
+ manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')
+
+ repair_test_1(manager, mdataerr, choose_primary, "scrub")
+ repair_test_1(manager, mdataerr, choose_replica, "scrub")
+ repair_test_1(manager, dataerr, choose_primary, "deep-scrub")
+ repair_test_1(manager, dataerr, choose_replica, "deep-scrub")
+ repair_test_1(manager, trunc, choose_primary, "scrub")
+ repair_test_1(manager, trunc, choose_replica, "scrub")
+ repair_test_2(ctx, manager, config, choose_primary)
+ repair_test_2(ctx, manager, config, choose_replica)
+
+ repair_test_erasure_code(manager, hinfoerr, 'primary', "deep-scrub")
--- /dev/null
+"""
+Resolve stuck peering
+"""
+import logging
+import time
+
+from teuthology import misc as teuthology
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Test resolution of stuck peering
+
+ requires 3 osds on a single test node
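+
+ The recovery action exercised is, in essence:
+
+ ceph osd lost <primary id> --yes-i-really-mean-it
+
+ after which the pg is expected to go active+undersized+degraded and
+ then recover to active+clean.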
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'Resolve stuck peering only accepts a dict for config'
+
+ manager = ctx.managers['ceph']
+
+ while len(manager.get_osd_status()['up']) < 3:
+ time.sleep(10)
+
+
+ manager.wait_for_clean()
+
+ dummyfile = '/etc/fstab'
+ dummyfile1 = '/etc/resolv.conf'
+
+ #create 1 PG pool
+ pool='foo'
+ log.info('creating pool foo')
+ manager.raw_cluster_cmd('osd', 'pool', 'create', '%s' % pool, '1')
+
+ #set min_size of the pool to 1
+ #so that we can continue with I/O
+ #when 2 osds are down
+ manager.set_pool_property(pool, "min_size", 1)
+
+ osds = [0, 1, 2]
+
+ primary = manager.get_pg_primary('foo', 0)
+ log.info("primary osd is %d", primary)
+
+ others = list(osds)
+ others.remove(primary)
+
+ log.info('writing initial objects')
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ #create few objects
+ for i in range(100):
+ rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+ manager.wait_for_clean()
+
+ #kill other osds except primary
+ log.info('killing other osds except primary')
+ for i in others:
+ manager.kill_osd(i)
+ for i in others:
+ manager.mark_down_osd(i)
+
+
+ for i in range(100):
+ rados(ctx, mon, ['-p', 'foo', 'put', 'new_%d' % i, dummyfile1])
+
+ #kill primary osd
+ manager.kill_osd(primary)
+ manager.mark_down_osd(primary)
+
+ #revive other 2 osds
+ for i in others:
+ manager.revive_osd(i)
+
+ #make sure that pg is down
+ #Assuming pg number for single pg pool will start from 0
+ pgnum=0
+ pgstr = manager.get_pgid(pool, pgnum)
+ stats = manager.get_single_pg_stats(pgstr)
+ print stats['state']
+
+ timeout=60
+ start=time.time()
+
+ while 'down' not in stats['state']:
+ assert time.time() - start < timeout, \
+ 'failed to reach down state before timeout expired'
+ stats = manager.get_single_pg_stats(pgstr)
+
+ #mark primary as lost
+ manager.raw_cluster_cmd('osd', 'lost', '%d' % primary,\
+ '--yes-i-really-mean-it')
+
+
+ #expect the pg status to be active+undersized+degraded
+ #pg should recover and become active+clean within timeout
+ stats = manager.get_single_pg_stats(pgstr)
+ print stats['state']
+
+ timeout=10
+ start=time.time()
+
+ while manager.get_num_down():
+ assert time.time() - start < timeout, \
+ 'failed to recover before timeout expired'
--- /dev/null
+"""
+Rest Api
+"""
+import logging
+import contextlib
+import time
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+from teuthology.orchestra.daemon import DaemonGroup
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def run_rest_api_daemon(ctx, api_clients):
+ """
+ Wrapper that starts the rest api daemons
+ """
+ if not hasattr(ctx, 'daemons'):
+ ctx.daemons = DaemonGroup()
+ remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+ for rems, roles in remotes.iteritems():
+ for whole_id_ in roles:
+ if whole_id_ in api_clients:
+ id_ = whole_id_[len('client.'):]
+ run_cmd = [
+ 'sudo',
+ 'daemon-helper',
+ 'kill',
+ 'ceph-rest-api',
+ '-n',
+ 'client.rest{id}'.format(id=id_), ]
+ cl_rest_id = 'client.rest{id}'.format(id=id_)
+ ctx.daemons.add_daemon(rems, 'restapi',
+ cl_rest_id,
+ args=run_cmd,
+ logger=log.getChild(cl_rest_id),
+ stdin=run.PIPE,
+ wait=False,
+ )
+ for i in range(1, 12):
+ log.info('testing for ceph-rest-api try {0}'.format(i))
+ run_cmd = [
+ 'wget',
+ '-O',
+ '/dev/null',
+ '-q',
+ 'http://localhost:5000/api/v0.1/status'
+ ]
+ proc = rems.run(
+ args=run_cmd,
+ check_status=False
+ )
+ if proc.exitstatus == 0:
+ break
+ time.sleep(5)
+ if proc.exitstatus != 0:
+ raise RuntimeError('Cannot contact ceph-rest-api')
+ try:
+ yield
+
+ finally:
+ """
+ TO DO: destroy daemons started -- modify iter_daemons_of_role
+ """
+ teuthology.stop_daemons_of_type(ctx, 'restapi')
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Start up rest-api.
+
+ To start on all clients::
+
+ tasks:
+ - ceph:
+ - rest-api:
+
+ To only run on certain clients::
+
+ tasks:
+ - ceph:
+ - rest-api: [client.0, client.3]
+
+ or
+
+ tasks:
+ - ceph:
+ - rest-api:
+ client.0:
+ client.3:
+
+ The general flow of things here is:
+ 1. Find clients on which rest-api is supposed to run (api_clients)
+ 2. Generate keyring values
+ 3. Start up ceph-rest-api daemons
+ On cleanup:
+ 4. Stop the daemons
+ 5. Delete keyring value files.
+ """
+ api_clients = []
+ remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+ log.info(remotes)
+ if config is None:
+ api_clients = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ else:
+ api_clients = config
+ log.info(api_clients)
+ testdir = teuthology.get_testdir(ctx)
+ coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+ for rems, roles in remotes.iteritems():
+ for whole_id_ in roles:
+ if whole_id_ in api_clients:
+ id_ = whole_id_[len('client.'):]
+ keyring = '/etc/ceph/ceph.client.rest{id}.keyring'.format(
+ id=id_)
+ rems.run(
+ args=[
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ coverage_dir,
+ 'ceph-authtool',
+ '--create-keyring',
+ '--gen-key',
+ '--name=client.rest{id}'.format(id=id_),
+ '--set-uid=0',
+ '--cap', 'mon', 'allow *',
+ '--cap', 'osd', 'allow *',
+ '--cap', 'mds', 'allow',
+ keyring,
+ run.Raw('&&'),
+ 'sudo',
+ 'chmod',
+ '0644',
+ keyring,
+ ],
+ )
+ rems.run(
+ args=[
+ 'sudo',
+ 'sh',
+ '-c',
+ run.Raw("'"),
+ "echo",
+ '[client.rest{id}]'.format(id=id_),
+ run.Raw('>>'),
+ "/etc/ceph/ceph.conf",
+ run.Raw("'")
+ ]
+ )
+ rems.run(
+ args=[
+ 'sudo',
+ 'sh',
+ '-c',
+ run.Raw("'"),
+ 'echo',
+ 'restapi',
+ 'keyring',
+ '=',
+ '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_),
+ run.Raw('>>'),
+ '/etc/ceph/ceph.conf',
+ run.Raw("'"),
+ ]
+ )
+ rems.run(
+ args=[
+ 'sudo',
+ 'ceph',
+ 'auth',
+ 'import',
+ '-i',
+ '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_),
+ ]
+ )
+ with contextutil.nested(
+ lambda: run_rest_api_daemon(ctx=ctx, api_clients=api_clients),):
+ yield
+
--- /dev/null
+"""
+Daemon restart
+"""
+import logging
+import pipes
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run as tor
+
+from teuthology.orchestra import run
+log = logging.getLogger(__name__)
+
+def restart_daemon(ctx, config, role, id_, *args):
+ """
+ Handle restart (including the execution of the command parameters passed)
+ """
+ log.info('Restarting {r}.{i} daemon...'.format(r=role, i=id_))
+ daemon = ctx.daemons.get_daemon(role, id_)
+ log.debug('Waiting for exit of {r}.{i} daemon...'.format(r=role, i=id_))
+ try:
+ daemon.wait_for_exit()
+ except tor.CommandFailedError as e:
+ log.debug('Command Failed: {e}'.format(e=e))
+ if len(args) > 0:
+ confargs = ['--{k}={v}'.format(k=k, v=v) for k,v in zip(args[0::2], args[1::2])]
+ log.debug('Doing restart of {r}.{i} daemon with args: {a}...'.format(r=role, i=id_, a=confargs))
+ daemon.restart_with_args(confargs)
+ else:
+ log.debug('Doing restart of {r}.{i} daemon...'.format(r=role, i=id_))
+ daemon.restart()
+
+def get_tests(ctx, config, role, remote, testdir):
+ """Download restart tests"""
+ srcdir = '{tdir}/restart.{role}'.format(tdir=testdir, role=role)
+
+ refspec = config.get('branch')
+ if refspec is None:
+ refspec = config.get('sha1')
+ if refspec is None:
+ refspec = config.get('tag')
+ if refspec is None:
+ refspec = 'HEAD'
+ log.info('Pulling restart qa/workunits from ref %s', refspec)
+
+ remote.run(
+ logger=log.getChild(role),
+ args=[
+ 'mkdir', '--', srcdir,
+ run.Raw('&&'),
+ 'git',
+ 'archive',
+ '--remote=git://git.ceph.com/ceph.git',
+ '%s:qa/workunits' % refspec,
+ run.Raw('|'),
+ 'tar',
+ '-C', srcdir,
+ '-x',
+ '-f-',
+ run.Raw('&&'),
+ 'cd', '--', srcdir,
+ run.Raw('&&'),
+ 'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
+ run.Raw('&&'),
+ 'find', '-executable', '-type', 'f', '-printf', r'%P\0',
+ run.Raw('>{tdir}/restarts.list'.format(tdir=testdir)),
+ ],
+ )
+ restarts = sorted(teuthology.get_file(
+ remote,
+ '{tdir}/restarts.list'.format(tdir=testdir)).split('\0'))
+ return (srcdir, restarts)
+
+def task(ctx, config):
+ """
+ Execute commands and allow daemon restart with config options.
+ Each process executed can output to stdout restart commands of the form:
+ restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>
+ This will restart the daemon <role>.<id> with the specified config values once
+ by modifying the conf file with those values, and then replacing the old conf file
+ once the daemon is restarted.
+ This task does not kill a running daemon, it assumes the daemon will abort on an
+ assert specified in the config.
+
+ tasks:
+ - install:
+ - ceph:
+ - restart:
+ exec:
+ client.0:
+ - test_backtraces.py
+
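+ A script driven by this task might, for example, print a line like
+ (values illustrative):
+
+ restart osd 0 osd_recovery_delay_start 1000
+
+ and then wait for the task to write back 'restarted' on stdin before
+ continuing, or print 'done' to finish.
+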
+ """
+ assert isinstance(config, dict), "task kill got invalid config"
+
+ testdir = teuthology.get_testdir(ctx)
+
+ try:
+ assert 'exec' in config, "config requires exec key with <role>: <command> entries"
+ for role, task in config['exec'].iteritems():
+ log.info('restart for role {r}'.format(r=role))
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+ srcdir, restarts = get_tests(ctx, config, role, remote, testdir)
+ log.info('Running command on role %s host %s', role, remote.name)
+ spec = '{spec}'.format(spec=task[0])
+ log.info('Restarts list: %s', restarts)
+ log.info('Spec is %s', spec)
+ to_run = [w for w in restarts if w == task or w.find(spec) != -1]
+ log.info('To run: %s', to_run)
+ for c in to_run:
+ log.info('Running restart script %s...', c)
+ args = [
+ run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
+ ]
+ env = config.get('env')
+ if env is not None:
+ for var, val in env.iteritems():
+ quoted_val = pipes.quote(val)
+ env_arg = '{var}={val}'.format(var=var, val=quoted_val)
+ args.append(run.Raw(env_arg))
+ args.extend([
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ '{srcdir}/{c}'.format(
+ srcdir=srcdir,
+ c=c,
+ ),
+ ])
+ proc = remote.run(
+ args=args,
+ stdout=tor.PIPE,
+ stdin=tor.PIPE,
+ stderr=log,
+ wait=False,
+ )
+ log.info('waiting for a command from script')
+ while True:
+ l = proc.stdout.readline()
+ if not l or l == '':
+ break
+ log.debug('script command: {c}'.format(c=l))
+ ll = l.strip()
+ cmd = ll.split(' ')
+ if cmd[0] == "done":
+ break
+ assert cmd[0] == 'restart', "script sent invalid command request to kill task"
+ # cmd should be: restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>
+ # or to clear, just: restart <role> <id>
+ restart_daemon(ctx, config, cmd[1], cmd[2], *cmd[3:])
+ proc.stdin.writelines(['restarted\n'])
+ proc.stdin.flush()
+ try:
+ proc.wait()
+ except tor.CommandFailedError:
+ raise Exception('restart task got non-zero exit status from script: {s}'.format(s=c))
+ finally:
+ log.info('Finishing %s on %s...', task, role)
+ remote.run(
+ logger=log.getChild(role),
+ args=[
+ 'rm', '-rf', '--', '{tdir}/restarts.list'.format(tdir=testdir), srcdir,
+ ],
+ )
--- /dev/null
+"""
+rgw routines
+"""
+import argparse
+import contextlib
+import json
+import logging
+import os
+import errno
+import util.rgw as rgw_utils
+
+from requests.packages.urllib3 import PoolManager
+from requests.packages.urllib3.util import Retry
+
+from cStringIO import StringIO
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra.run import CommandFailedError
+from util.rgw import rgwadmin
+from util.rados import (rados, create_ec_pool,
+ create_replicated_pool,
+ create_cache_pool)
+
+log = logging.getLogger(__name__)
+
+def get_config_master_client(ctx, config, regions):
+
+ role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
+ for client, c_config in config.iteritems()])
+ log.debug('roles_zones = %r', role_zones)
+ region_info = dict([
+ (region_name, extract_region_info(region_name, r_config))
+ for region_name, r_config in regions.iteritems()])
+
+ # read master zonegroup and master_zone
+ for zonegroup, zg_info in region_info.iteritems():
+ if zg_info['is_master']:
+ master_zonegroup = zonegroup
+ master_zone = zg_info['master_zone']
+ break
+
+ for client in config.iterkeys():
+ (zonegroup, zone, zone_info) = role_zones[client]
+ if zonegroup == master_zonegroup and zone == master_zone:
+ return client
+
+ return None
+
+@contextlib.contextmanager
+def create_apache_dirs(ctx, config, on_client = None, except_client = None):
+ """
+ Remotely create apache directories. Delete when finished.
+ """
+ log.info('Creating apache directories...')
+ log.debug('client is %r', on_client)
+ testdir = teuthology.get_testdir(ctx)
+ clients_to_create_as = [on_client]
+ if on_client is None:
+ clients_to_create_as = config.keys()
+ for client in clients_to_create_as:
+ if client == except_client:
+ continue
+ ctx.cluster.only(client).run(
+ args=[
+ 'mkdir',
+ '-p',
+ '{tdir}/apache/htdocs.{client}'.format(tdir=testdir,
+ client=client),
+ '{tdir}/apache/tmp.{client}/fastcgi_sock'.format(
+ tdir=testdir,
+ client=client),
+ run.Raw('&&'),
+ 'mkdir',
+ '{tdir}/archive/apache.{client}'.format(tdir=testdir,
+ client=client),
+ ],
+ )
+ try:
+ yield
+ finally:
+ log.info('Cleaning up apache directories...')
+ for client in clients_to_create_as:
+ ctx.cluster.only(client).run(
+ args=[
+ 'rm',
+ '-rf',
+ '{tdir}/apache/tmp.{client}'.format(tdir=testdir,
+ client=client),
+ run.Raw('&&'),
+ 'rmdir',
+ '{tdir}/apache/htdocs.{client}'.format(tdir=testdir,
+ client=client),
+ ],
+ )
+ for client in clients_to_create_as:
+ ctx.cluster.only(client).run(
+ args=[
+ 'rmdir',
+ '{tdir}/apache'.format(tdir=testdir),
+ ],
+ check_status=False, # only need to remove once per host
+ )
+
+
+def _use_uds_with_fcgi(remote):
+ """
+ Returns true if this node supports the usage of
+ unix domain sockets with mod_proxy_fcgi.
+
+ FIXME: always returns False for now, until we know for
+ sure which distros will support UDS. RHEL 7.0 is the only one
+ I currently know of, but we can't install that version of apache
+ in the labs yet.
+ """
+ return False
+
+
+@contextlib.contextmanager
+def ship_apache_configs(ctx, config, role_endpoints, on_client = None,
+ except_client = None):
+ """
+ Ship apache config and rgw.fcgi to all clients. Clean up on termination
+ """
+ assert isinstance(config, dict)
+ assert isinstance(role_endpoints, dict)
+ testdir = teuthology.get_testdir(ctx)
+ log.info('Shipping apache config and rgw.fcgi...')
+ src = os.path.join(os.path.dirname(__file__), 'apache.conf.template')
+ clients_to_create_as = [on_client]
+ if on_client is None:
+ clients_to_create_as = config.keys()
+ for client in clients_to_create_as:
+ if client == except_client:
+ continue
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ system_type = teuthology.get_system_type(remote)
+ conf = config.get(client)
+ if not conf:
+ conf = {}
+ idle_timeout = conf.get('idle_timeout', ctx.rgw.default_idle_timeout)
+ if system_type == 'deb':
+ mod_path = '/usr/lib/apache2/modules'
+ print_continue = 'on'
+ user = 'www-data'
+ group = 'www-data'
+ apache24_modconfig = '''
+ IncludeOptional /etc/apache2/mods-available/mpm_event.conf
+ IncludeOptional /etc/apache2/mods-available/mpm_event.load
+'''
+ else:
+ mod_path = '/usr/lib64/httpd/modules'
+ print_continue = 'off'
+ user = 'apache'
+ group = 'apache'
+ apache24_modconfig = \
+ 'IncludeOptional /etc/httpd/conf.modules.d/00-mpm.conf'
+ host, port = role_endpoints[client]
+
+ # decide if we want to use mod_fastcgi or mod_proxy_fcgi
+ template_dir = os.path.dirname(__file__)
+ fcgi_config = os.path.join(template_dir,
+ 'mod_proxy_fcgi.tcp.conf.template')
+ if ctx.rgw.use_fastcgi:
+ log.info("Apache is configured to use mod_fastcgi")
+ fcgi_config = os.path.join(template_dir,
+ 'mod_fastcgi.conf.template')
+ elif _use_uds_with_fcgi(remote):
+ log.info("Apache is configured to use mod_proxy_fcgi with UDS")
+ fcgi_config = os.path.join(template_dir,
+ 'mod_proxy_fcgi.uds.conf.template')
+ else:
+ log.info("Apache is configured to use mod_proxy_fcgi with TCP")
+
+ with file(fcgi_config, 'rb') as f:
+ fcgi_config = f.read()
+ with file(src, 'rb') as f:
+ conf = f.read() + fcgi_config
+ conf = conf.format(
+ testdir=testdir,
+ mod_path=mod_path,
+ print_continue=print_continue,
+ host=host,
+ port=port,
+ client=client,
+ idle_timeout=idle_timeout,
+ user=user,
+ group=group,
+ apache24_modconfig=apache24_modconfig,
+ )
+ teuthology.write_file(
+ remote=remote,
+ path='{tdir}/apache/apache.{client}.conf'.format(
+ tdir=testdir,
+ client=client),
+ data=conf,
+ )
+ rgw_options = []
+ if ctx.rgw.use_fastcgi or _use_uds_with_fcgi(remote):
+ rgw_options = [
+ '--rgw-socket-path',
+ '{tdir}/apache/tmp.{client}/fastcgi_sock/rgw_sock'.format(
+ tdir=testdir,
+ client=client
+ ),
+ '--rgw-frontends',
+ 'fastcgi',
+ ]
+ else:
+ rgw_options = [
+ '--rgw-socket-path', '""',
+ '--rgw-print-continue', 'false',
+ '--rgw-frontends',
+ 'fastcgi socket_port=9000 socket_host=0.0.0.0',
+ ]
+
+ teuthology.write_file(
+ remote=remote,
+ path='{tdir}/apache/htdocs.{client}/rgw.fcgi'.format(
+ tdir=testdir,
+ client=client),
+ data="""#!/bin/sh
+ulimit -c unlimited
+exec radosgw -f -n {client} -k /etc/ceph/ceph.{client}.keyring {rgw_options}
+
+""".format(tdir=testdir, client=client, rgw_options=" ".join(rgw_options))
+ )
+ remote.run(
+ args=[
+ 'chmod',
+ 'a=rx',
+ '{tdir}/apache/htdocs.{client}/rgw.fcgi'.format(tdir=testdir,
+ client=client),
+ ],
+ )
+ try:
+ yield
+ finally:
+ log.info('Removing apache config...')
+ for client in clients_to_create_as:
+ ctx.cluster.only(client).run(
+ args=[
+ 'rm',
+ '-f',
+ '{tdir}/apache/apache.{client}.conf'.format(tdir=testdir,
+ client=client),
+ run.Raw('&&'),
+ 'rm',
+ '-f',
+ '{tdir}/apache/htdocs.{client}/rgw.fcgi'.format(
+ tdir=testdir,
+ client=client),
+ ],
+ )
+
+
+@contextlib.contextmanager
+def start_rgw(ctx, config, on_client = None, except_client = None):
+ """
+ Start rgw on remote sites.
+ """
+ log.info('Starting rgw...')
+ log.debug('client %r', on_client)
+ clients_to_run = [on_client]
+ if on_client is None:
+ clients_to_run = config.keys()
+ testdir = teuthology.get_testdir(ctx)
+ for client in clients_to_run:
+ if client == except_client:
+ continue
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+ zone = rgw_utils.zone_for_client(ctx, client)
+ log.debug('zone %s', zone)
+ client_config = config.get(client)
+ if client_config is None:
+ client_config = {}
+ log.info("rgw %s config is %s", client, client_config)
+ id_ = client.split('.', 1)[1]
+ log.info('client {client} is id {id}'.format(client=client, id=id_))
+ cmd_prefix = [
+ 'sudo',
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'daemon-helper',
+ 'term',
+ ]
+
+ rgw_cmd = ['radosgw']
+
+ if ctx.rgw.frontend == 'apache':
+ if ctx.rgw.use_fastcgi or _use_uds_with_fcgi(remote):
+ rgw_cmd.extend([
+ '--rgw-socket-path',
+ '{tdir}/apache/tmp.{client}/fastcgi_sock/rgw_sock'.format(
+ tdir=testdir,
+ client=client,
+ ),
+ '--rgw-frontends',
+ 'fastcgi',
+ ])
+ else:
+ # for mod_proxy_fcgi, using tcp
+ rgw_cmd.extend([
+ '--rgw-socket-path', '',
+ '--rgw-print-continue', 'false',
+ '--rgw-frontends',
+ 'fastcgi socket_port=9000 socket_host=0.0.0.0',
+ ])
+
+ elif ctx.rgw.frontend == 'civetweb':
+ host, port = ctx.rgw.role_endpoints[client]
+ rgw_cmd.extend([
+ '--rgw-frontends',
+ 'civetweb port={port}'.format(port=port),
+ ])
+
+ if zone is not None:
+ rgw_cmd.extend(['--rgw-zone', zone])
+
+ rgw_cmd.extend([
+ '-n', client,
+ '-k', '/etc/ceph/ceph.{client}.keyring'.format(client=client),
+ '--log-file',
+ '/var/log/ceph/rgw.{client}.log'.format(client=client),
+ '--rgw_ops_log_socket_path',
+ '{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir,
+ client=client),
+ '--foreground',
+ run.Raw('|'),
+ 'sudo',
+ 'tee',
+ '/var/log/ceph/rgw.{client}.stdout'.format(client=client),
+ run.Raw('2>&1'),
+ ])
+
+ if client_config.get('valgrind'):
+ cmd_prefix = teuthology.get_valgrind_args(
+ testdir,
+ client,
+ cmd_prefix,
+ client_config.get('valgrind')
+ )
+
+ run_cmd = list(cmd_prefix)
+ run_cmd.extend(rgw_cmd)
+
+ ctx.daemons.add_daemon(
+ remote, 'rgw', client,
+ args=run_cmd,
+ logger=log.getChild(client),
+ stdin=run.PIPE,
+ wait=False,
+ )
+
+ # XXX: add_daemon() doesn't let us wait until radosgw finishes startup
+ # use a connection pool with retry/backoff to poll each gateway until it starts listening
+ http = PoolManager(retries=Retry(connect=8, backoff_factor=1))
+ for client in clients_to_run:
+ if client == except_client:
+ continue
+ host, port = ctx.rgw.role_endpoints[client]
+ endpoint = 'http://{host}:{port}/'.format(host=host, port=port)
+ log.info('Polling {client} until it starts accepting connections on {endpoint}'.format(client=client, endpoint=endpoint))
+ http.request('GET', endpoint)
+
+ try:
+ yield
+ finally:
+ teuthology.stop_daemons_of_type(ctx, 'rgw')
+ for client in config.iterkeys():
+ ctx.cluster.only(client).run(
+ args=[
+ 'rm',
+ '-f',
+ '{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir,
+ client=client),
+ ],
+ )
+
+
+@contextlib.contextmanager
+def start_apache(ctx, config, on_client = None, except_client = None):
+ """
+ Start apache on remote sites.
+ """
+ log.info('Starting apache...')
+ testdir = teuthology.get_testdir(ctx)
+ apaches = {}
+ clients_to_run = [on_client]
+ if on_client is None:
+ clients_to_run = config.keys()
+ for client in clients_to_run:
+ if client == except_client:
+ continue
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ system_type = teuthology.get_system_type(remote)
+ if system_type == 'deb':
+ apache_name = 'apache2'
+ else:
+ try:
+ remote.run(
+ args=[
+ 'stat',
+ '/usr/sbin/httpd.worker',
+ ],
+ )
+ apache_name = '/usr/sbin/httpd.worker'
+ except CommandFailedError:
+ apache_name = '/usr/sbin/httpd'
+
+ proc = remote.run(
+ args=[
+ 'adjust-ulimits',
+ 'daemon-helper',
+ 'kill',
+ apache_name,
+ '-X',
+ '-f',
+ '{tdir}/apache/apache.{client}.conf'.format(tdir=testdir,
+ client=client),
+ ],
+ logger=log.getChild(client),
+ stdin=run.PIPE,
+ wait=False,
+ )
+ apaches[client] = proc
+
+ try:
+ yield
+ finally:
+ log.info('Stopping apache...')
+ for client, proc in apaches.iteritems():
+ proc.stdin.close()
+
+ run.wait(apaches.itervalues())
+
+
+def extract_user_info(client_config):
+ """
+ Extract user info from the client config specified. Returns a dict
+ that includes system key information.
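+
+ A hypothetical client config fragment with the keys this function
+ reads (key names taken from the lookups below; values are placeholders):
+
+ system user:
+ name: <system user name>
+ access key: <access key>
+ secret key: <secret key>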
+ """
+ # test if there isn't a system user or if there isn't a name for that
+ # user, return None
+ if ('system user' not in client_config or
+ 'name' not in client_config['system user']):
+ return None
+
+ user_info = dict()
+ user_info['system_key'] = dict(
+ user=client_config['system user']['name'],
+ access_key=client_config['system user']['access key'],
+ secret_key=client_config['system user']['secret key'],
+ )
+ return user_info
+
+
+def extract_zone_info(ctx, client, client_config):
+ """
+ Get zone information.
+ :param client: dictionary of client information
+ :param client_config: dictionary of client configuration information
+ :returns: zone extracted from client and client_config information
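+
+ A hypothetical ceph.conf fragment satisfying the keys asserted below
+ (section name and values are illustrative):
+
+ [client.0]
+ rgw zone = r1z1
+ rgw region = default
+ rgw zone root pool = .rgw.zone.r1z1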
+ """
+ ceph_config = ctx.ceph['ceph'].conf.get('global', {})
+ ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
+ ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
+ for key in ['rgw zone', 'rgw region', 'rgw zone root pool']:
+ assert key in ceph_config, \
+ 'ceph conf must contain {key} for {client}'.format(key=key,
+ client=client)
+ region = ceph_config['rgw region']
+ zone = ceph_config['rgw zone']
+ zone_info = dict()
+ for key in ['rgw control pool', 'rgw gc pool', 'rgw log pool',
+ 'rgw intent log pool', 'rgw usage log pool',
+ 'rgw user keys pool', 'rgw user email pool',
+ 'rgw user swift pool', 'rgw user uid pool',
+ 'rgw domain root']:
+ new_key = key.split(' ', 1)[1]
+ new_key = new_key.replace(' ', '_')
+
+ if key in ceph_config:
+ value = ceph_config[key]
+ log.debug('{key} specified in ceph_config ({val})'.format(
+ key=key, val=value))
+ zone_info[new_key] = value
+ else:
+ zone_info[new_key] = '.' + region + '.' + zone + '.' + new_key
+
+ index_pool = '.' + region + '.' + zone + '.' + 'index_pool'
+ data_pool = '.' + region + '.' + zone + '.' + 'data_pool'
+ data_extra_pool = '.' + region + '.' + zone + '.' + 'data_extra_pool'
+ compression_type = ceph_config.get('rgw compression type', '')
+
+ zone_info['placement_pools'] = [{'key': 'default_placement',
+ 'val': {'index_pool': index_pool,
+ 'data_pool': data_pool,
+ 'data_extra_pool': data_extra_pool,
+ 'compression': compression_type}
+ }]
+
+ # these keys are meant for the zones argument in the region info. We
+ # insert them into zone_info with a different format and then remove them
+ # in the fill_in_endpoints() method
+ for key in ['rgw log meta', 'rgw log data']:
+ if key in ceph_config:
+ zone_info[key] = ceph_config[key]
+
+ return region, zone, zone_info
+
+
+def extract_region_info(region, region_info):
+ """
+ Extract region information from the region_info parameter, using get
+ to set default values.
+
+ :param region: name of the region
+ :param region_info: region information (in dictionary form).
+ :returns: dictionary of region information set from region_info, using
+ default values for missing fields.
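+
+ A hypothetical region entry showing the keys read below (everything
+ except 'zones' has a default; names and values are illustrative):
+
+ regions:
+ default:
+ api name: default
+ is master: true
+ master zone: r1z1
+ zones: [r1z1]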
+ """
+ assert isinstance(region_info['zones'], list) and region_info['zones'], \
+ 'zones must be a non-empty list'
+ return dict(
+ name=region,
+ api_name=region_info.get('api name', region),
+ is_master=region_info.get('is master', False),
+ log_meta=region_info.get('log meta', False),
+ log_data=region_info.get('log data', False),
+ master_zone=region_info.get('master zone', region_info['zones'][0]),
+ placement_targets=region_info.get('placement targets',
+ [{'name': 'default_placement',
+ 'tags': []}]),
+ default_placement=region_info.get('default placement',
+ 'default_placement'),
+ )
+
+
+def assign_ports(ctx, config):
+ """
+ Assign port numbers starting with port 7280.
+ """
+ port = 7280
+ role_endpoints = {}
+ for remote, roles_for_host in ctx.cluster.remotes.iteritems():
+ for role in roles_for_host:
+ if role in config:
+ role_endpoints[role] = (remote.name.split('@')[1], port)
+ port += 1
+
+ return role_endpoints
+
+
+def fill_in_endpoints(region_info, role_zones, role_endpoints):
+ """
+ Iterate through the list of role_endpoints, filling in zone information
+
+ :param region_info: region data
+ :param role_zones: region and zone information.
+ :param role_endpoints: endpoints being used
+ """
+ for role, (host, port) in role_endpoints.iteritems():
+ region, zone, zone_info, _ = role_zones[role]
+ host, port = role_endpoints[role]
+ endpoint = 'http://{host}:{port}/'.format(host=host, port=port)
+ # check if the region specified under client actually exists
+ # in region_info (it should, if properly configured).
+ # If not, throw a reasonable error
+ if region not in region_info:
+ raise Exception(
+ 'Region: {region} was specified but no corresponding'
+ ' entry was found under \'regions\''.format(region=region))
+
+ region_conf = region_info[region]
+ region_conf.setdefault('endpoints', [])
+ region_conf['endpoints'].append(endpoint)
+
+ # this is the payload for the 'zones' field in the region field
+ zone_payload = dict()
+ zone_payload['endpoints'] = [endpoint]
+ zone_payload['name'] = zone
+
+ # Pull the log meta and log data settings out of zone_info, if they
+ # exist, then pop them as they don't actually belong in the zone info
+ for key in ['rgw log meta', 'rgw log data']:
+ new_key = key.split(' ', 1)[1]
+ new_key = new_key.replace(' ', '_')
+
+ if key in zone_info:
+ value = zone_info.pop(key)
+ else:
+ value = 'false'
+
+ zone_payload[new_key] = value
+
+ region_conf.setdefault('zones', [])
+ region_conf['zones'].append(zone_payload)
+
+
+@contextlib.contextmanager
+def configure_users_for_client(ctx, config, client, everywhere=False):
+ """
+ Create users by remotely running rgwadmin commands using extracted
+ user information.
+ """
+ log.info('Configuring users...')
+ log.info('for client %s', client)
+ log.info('everywhere %s', everywhere)
+
+ # For data sync the master zones and regions must have the
+ # system users of the secondary zones. To keep this simple,
+ # just create the system users on every client if regions are
+ # configured.
+ clients_to_create_as = [client]
+ if everywhere:
+ clients_to_create_as = config.keys()
+
+ # extract the user info and append it to the payload tuple for the given
+ # client
+ for client, c_config in config.iteritems():
+ if not c_config:
+ continue
+ user_info = extract_user_info(c_config)
+ if not user_info:
+ continue
+
+ for client_name in clients_to_create_as:
+ log.debug('Creating user {user} on {client}'.format(
+ user=user_info['system_key']['user'], client=client_name))
+ rgwadmin(ctx, client_name,
+ cmd=[
+ 'user', 'create',
+ '--uid', user_info['system_key']['user'],
+ '--access-key', user_info['system_key']['access_key'],
+ '--secret', user_info['system_key']['secret_key'],
+ '--display-name', user_info['system_key']['user'],
+ '--system',
+ ],
+ check_status=True,
+ )
+ yield
+
+@contextlib.contextmanager
+def configure_users(ctx, config, everywhere=False):
+ """
+ Create users by remotely running rgwadmin commands using extracted
+ user information.
+ """
+ log.info('Configuring users...')
+
+ # extract the user info and append it to the payload tuple for the given
+ # client
+ for client, c_config in config.iteritems():
+ if not c_config:
+ continue
+ user_info = extract_user_info(c_config)
+ if not user_info:
+ continue
+
+ # For data sync the master zones and regions must have the
+ # system users of the secondary zones. To keep this simple,
+ # just create the system users on every client if regions are
+ # configured.
+ clients_to_create_as = [client]
+ if everywhere:
+ clients_to_create_as = config.keys()
+ for client_name in clients_to_create_as:
+ log.debug('Creating user {user} on {client}'.format(
+ user=user_info['system_key']['user'], client=client))
+ rgwadmin(ctx, client_name,
+ cmd=[
+ 'user', 'create',
+ '--uid', user_info['system_key']['user'],
+ '--access-key', user_info['system_key']['access_key'],
+ '--secret', user_info['system_key']['secret_key'],
+ '--display-name', user_info['system_key']['user'],
+ '--system',
+ ],
+ check_status=True,
+ )
+
+ yield
+
+@contextlib.contextmanager
+def create_nonregion_pools(ctx, config, regions):
+ """Create replicated or erasure coded data pools for rgw."""
+ if regions:
+ yield
+ return
+
+ log.info('creating data pools')
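+    # with no regions configured, rgw stores objects in the default
+    # '.rgw.buckets' pool; create it here (erasure coded or replicated, 64
+    # PGs), optionally fronted by a cache tier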
+ for client in config.keys():
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+ data_pool = '.rgw.buckets'
+ if ctx.rgw.ec_data_pool:
+ create_ec_pool(remote, data_pool, client, 64,
+ ctx.rgw.erasure_code_profile)
+ else:
+ create_replicated_pool(remote, data_pool, 64)
+ if ctx.rgw.cache_pools:
+ create_cache_pool(remote, data_pool, data_pool + '.cache', 64,
+ 64*1024*1024)
+ yield
+
+@contextlib.contextmanager
+def configure_multisite_regions_and_zones(ctx, config, regions, role_endpoints, realm, master_client):
+ """
+ Configure multisite regions and zones from rados and rgw.
+ """
+ if not regions:
+ log.debug(
+ 'In rgw.configure_multisite_regions_and_zones() and regions is None. '
+ 'Bailing')
+ yield
+ return
+
+ if not realm:
+ log.debug(
+ 'In rgw.configure_multisite_regions_and_zones() and realm is None. '
+ 'Bailing')
+ yield
+ return
+
+ log.info('Configuring multisite regions and zones...')
+
+ log.debug('config is %r', config)
+ log.debug('regions are %r', regions)
+ log.debug('role_endpoints = %r', role_endpoints)
+ log.debug('realm is %r', realm)
+ # extract the zone info
+ role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
+ for client, c_config in config.iteritems()])
+ log.debug('role_zones = %r', role_zones)
+
+ # extract the user info and append it to the payload tuple for the given
+ # client
+ for client, c_config in config.iteritems():
+ if not c_config:
+ user_info = None
+ else:
+ user_info = extract_user_info(c_config)
+
+ (region, zone, zone_info) = role_zones[client]
+ role_zones[client] = (region, zone, zone_info, user_info)
+
+ region_info = dict([
+ (region_name, extract_region_info(region_name, r_config))
+ for region_name, r_config in regions.iteritems()])
+
+ fill_in_endpoints(region_info, role_zones, role_endpoints)
+
+ # clear out the old defaults
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ # read master zonegroup and master_zone
+ for zonegroup, zg_info in region_info.iteritems():
+ if zg_info['is_master']:
+ master_zonegroup = zonegroup
+ master_zone = zg_info['master_zone']
+ break
+
+    log.debug('master zonegroup = %r', master_zonegroup)
+ log.debug('master zone = %r', master_zone)
+ log.debug('master client = %r', master_client)
+
+ rgwadmin(ctx, master_client,
+ cmd=['realm', 'create', '--rgw-realm', realm, '--default'],
+ check_status=True)
+
+ for region, info in region_info.iteritems():
+ region_json = json.dumps(info)
+ log.debug('region info is: %s', region_json)
+ rgwadmin(ctx, master_client,
+ cmd=['zonegroup', 'set'],
+ stdin=StringIO(region_json),
+ check_status=True)
+
+ rgwadmin(ctx, master_client,
+ cmd=['zonegroup', 'default', '--rgw-zonegroup', master_zonegroup],
+ check_status=True)
+
+ for role, (zonegroup, zone, zone_info, user_info) in role_zones.iteritems():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ for pool_info in zone_info['placement_pools']:
+ remote.run(args=['sudo', 'ceph', 'osd', 'pool', 'create',
+ pool_info['val']['index_pool'], '64', '64'])
+ if ctx.rgw.ec_data_pool:
+ create_ec_pool(remote, pool_info['val']['data_pool'],
+ zone, 64, ctx.rgw.erasure_code_profile)
+ else:
+ create_replicated_pool(remote, pool_info['val']['data_pool'], 64)
+
+ (zonegroup, zone, zone_info, user_info) = role_zones[master_client]
+ zone_json = json.dumps(dict(zone_info.items() + user_info.items()))
+ log.debug("zone info is: %r", zone_json)
+ rgwadmin(ctx, master_client,
+ cmd=['zone', 'set', '--rgw-zonegroup', zonegroup,
+ '--rgw-zone', zone],
+ stdin=StringIO(zone_json),
+ check_status=True)
+
+ rgwadmin(ctx, master_client,
+ cmd=['-n', master_client, 'zone', 'default', zone],
+ check_status=True)
+
+ rgwadmin(ctx, master_client,
+ cmd=['-n', master_client, 'period', 'update', '--commit'],
+ check_status=True)
+
+ yield
+
+def configure_compression_in_default_zone(ctx, config):
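+    """
+    Enable compression in the default zone when 'rgw compression type' is set
+    in the global, client, or per-client sections of the ceph conf.
+    """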
+ ceph_config = ctx.ceph['ceph'].conf.get('global', {})
+ ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
+ for client, c_config in config.iteritems():
+ ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
+ key = 'rgw compression type'
+        if key not in ceph_config:
+ log.debug('No compression setting to enable')
+ break
+ compression = ceph_config[key]
+ log.debug('Configuring compression type = %s', compression)
+
+ # XXX: the 'default' zone and zonegroup aren't created until we run RGWRados::init_complete().
+ # issue a 'radosgw-admin user list' command to trigger this
+ rgwadmin(ctx, client, cmd=['user', 'list'], check_status=True)
+
+ rgwadmin(ctx, client,
+ cmd=['zone', 'placement', 'modify', '--rgw-zone', 'default',
+ '--placement-id', 'default-placement', '--compression', compression],
+ check_status=True)
+ break # only the first client
+
+@contextlib.contextmanager
+def configure_regions_and_zones(ctx, config, regions, role_endpoints, realm):
+ """
+ Configure regions and zones from rados and rgw.
+ """
+ if not regions:
+ log.debug(
+ 'In rgw.configure_regions_and_zones() and regions is None. '
+ 'Bailing')
+ configure_compression_in_default_zone(ctx, config)
+ yield
+ return
+
+ if not realm:
+ log.debug(
+ 'In rgw.configure_regions_and_zones() and realm is None. '
+ 'Bailing')
+ configure_compression_in_default_zone(ctx, config)
+ yield
+ return
+
+ log.info('Configuring regions and zones...')
+
+ log.debug('config is %r', config)
+ log.debug('regions are %r', regions)
+ log.debug('role_endpoints = %r', role_endpoints)
+ log.debug('realm is %r', realm)
+ # extract the zone info
+ role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
+ for client, c_config in config.iteritems()])
+    log.debug('role_zones = %r', role_zones)
+
+ # extract the user info and append it to the payload tuple for the given
+ # client
+ for client, c_config in config.iteritems():
+ if not c_config:
+ user_info = None
+ else:
+ user_info = extract_user_info(c_config)
+
+ (region, zone, zone_info) = role_zones[client]
+ role_zones[client] = (region, zone, zone_info, user_info)
+
+ region_info = dict([
+ (region_name, extract_region_info(region_name, r_config))
+ for region_name, r_config in regions.iteritems()])
+
+ fill_in_endpoints(region_info, role_zones, role_endpoints)
+
+ # clear out the old defaults
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+ # removing these objects from .rgw.root and the per-zone root pools
+ # may or may not matter
+ rados(ctx, mon,
+ cmd=['-p', '.rgw.root', 'rm', 'region_info.default'])
+ rados(ctx, mon,
+ cmd=['-p', '.rgw.root', 'rm', 'zone_info.default'])
+
+ # read master zonegroup and master_zone
+ for zonegroup, zg_info in region_info.iteritems():
+ if zg_info['is_master']:
+ master_zonegroup = zonegroup
+ master_zone = zg_info['master_zone']
+ break
+
+ for client in config.iterkeys():
+ (zonegroup, zone, zone_info, user_info) = role_zones[client]
+ if zonegroup == master_zonegroup and zone == master_zone:
+ master_client = client
+ break
+
+    log.debug('master zonegroup = %r', master_zonegroup)
+ log.debug('master zone = %r', master_zone)
+ log.debug('master client = %r', master_client)
+ log.debug('config %r ', config)
+
+    (ret, out) = rgwadmin(ctx, master_client,
+        cmd=['realm', 'create', '--rgw-realm', realm, '--default'])
+    log.debug('realm create returned %r (EEXIST is %r)', -ret, errno.EEXIST)
+    assert ret == 0 or ret == -errno.EEXIST
+    if ret == -errno.EEXIST:
+        log.debug('realm %r already exists', realm)
+
+ for client in config.iterkeys():
+ for role, (zonegroup, zone, zone_info, user_info) in role_zones.iteritems():
+ rados(ctx, mon,
+ cmd=['-p', zone_info['domain_root'],
+ 'rm', 'region_info.default'])
+ rados(ctx, mon,
+ cmd=['-p', zone_info['domain_root'],
+ 'rm', 'zone_info.default'])
+
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ for pool_info in zone_info['placement_pools']:
+ remote.run(args=['sudo', 'ceph', 'osd', 'pool', 'create',
+ pool_info['val']['index_pool'], '64', '64'])
+ if ctx.rgw.ec_data_pool:
+ create_ec_pool(remote, pool_info['val']['data_pool'],
+ zone, 64, ctx.rgw.erasure_code_profile)
+ else:
+ create_replicated_pool(
+ remote, pool_info['val']['data_pool'],
+ 64)
+ zone_json = json.dumps(dict(zone_info.items() + user_info.items()))
+ log.debug('zone info is: %r', zone_json)
+ rgwadmin(ctx, client,
+ cmd=['zone', 'set', '--rgw-zonegroup', zonegroup,
+ '--rgw-zone', zone],
+ stdin=StringIO(zone_json),
+ check_status=True)
+
+ for region, info in region_info.iteritems():
+ region_json = json.dumps(info)
+ log.debug('region info is: %s', region_json)
+ rgwadmin(ctx, client,
+ cmd=['zonegroup', 'set'],
+ stdin=StringIO(region_json),
+ check_status=True)
+ if info['is_master']:
+ rgwadmin(ctx, client,
+ cmd=['zonegroup', 'default', '--rgw-zonegroup', master_zonegroup],
+ check_status=True)
+
+ (zonegroup, zone, zone_info, user_info) = role_zones[client]
+ rgwadmin(ctx, client,
+ cmd=['zone', 'default', zone],
+ check_status=True)
+
+ rgwadmin(ctx, master_client,
+ cmd=['-n', master_client, 'period', 'update', '--commit'],
+ check_status=True)
+
+ yield
+
+@contextlib.contextmanager
+def pull_configuration(ctx, config, regions, role_endpoints, realm, master_client):
+ """
+    Pull the realm and zone configuration from the master client onto the
+    remaining clients.
+ """
+ if not regions:
+ log.debug(
+            'In rgw.pull_configuration() and regions is None. '
+ 'Bailing')
+ yield
+ return
+
+ if not realm:
+ log.debug(
+ 'In rgw.pull_configuration() and realm is None. '
+ 'Bailing')
+ yield
+ return
+
+ log.info('Pulling configuration...')
+
+ log.debug('config is %r', config)
+ log.debug('regions are %r', regions)
+ log.debug('role_endpoints = %r', role_endpoints)
+ log.debug('realm is %r', realm)
+ log.debug('master client = %r', master_client)
+
+ # extract the zone info
+ role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
+ for client, c_config in config.iteritems()])
+    log.debug('role_zones = %r', role_zones)
+
+ # extract the user info and append it to the payload tuple for the given
+ # client
+ for client, c_config in config.iteritems():
+ if not c_config:
+ user_info = None
+ else:
+ user_info = extract_user_info(c_config)
+
+ (region, zone, zone_info) = role_zones[client]
+ role_zones[client] = (region, zone, zone_info, user_info)
+
+ region_info = dict([
+ (region_name, extract_region_info(region_name, r_config))
+ for region_name, r_config in regions.iteritems()])
+
+ fill_in_endpoints(region_info, role_zones, role_endpoints)
+
+ for client in config.iterkeys():
+ if client != master_client:
+ host, port = role_endpoints[master_client]
+ endpoint = 'http://{host}:{port}/'.format(host=host, port=port)
+ log.debug("endpoint: %s", endpoint)
+ rgwadmin(ctx, client,
+ cmd=['-n', client, 'realm', 'pull', '--rgw-realm', realm, '--default', '--url',
+ endpoint, '--access_key',
+ user_info['system_key']['access_key'], '--secret',
+ user_info['system_key']['secret_key']],
+ check_status=True)
+
+ (zonegroup, zone, zone_info, zone_user_info) = role_zones[client]
+ zone_json = json.dumps(dict(zone_info.items() + zone_user_info.items()))
+ log.debug("zone info is: %r"), zone_json
+ rgwadmin(ctx, client,
+ cmd=['zone', 'set', '--rgw-zonegroup', zonegroup,
+ '--rgw-zone', zone],
+ stdin=StringIO(zone_json),
+ check_status=True)
+
+ rgwadmin(ctx, client,
+ cmd=['period', 'update', '--commit', '--url',
+ endpoint, '--access_key',
+ user_info['system_key']['access_key'], '--secret',
+ user_info['system_key']['secret_key']],
+ check_status=True)
+
+ yield
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+    Either configure apache to run a rados gateway, or use the built-in
+    civetweb server.
+ Only one should be run per machine, since it uses a hard-coded port for
+ now.
+
+ For example, to run rgw on all clients::
+
+ tasks:
+ - ceph:
+ - rgw:
+
+ To only run on certain clients::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0, client.3]
+
+ or
+
+ tasks:
+ - ceph:
+ - rgw:
+ client.0:
+ client.3:
+
+ You can adjust the idle timeout for fastcgi (default is 30 seconds):
+
+ tasks:
+ - ceph:
+ - rgw:
+ client.0:
+ idle_timeout: 90
+
+ To run radosgw through valgrind:
+
+ tasks:
+ - ceph:
+ - rgw:
+ client.0:
+ valgrind: [--tool=memcheck]
+ client.3:
+ valgrind: [--tool=memcheck]
+
+ To use civetweb instead of apache:
+
+ tasks:
+ - ceph:
+ - rgw:
+ - client.0
+ overrides:
+ rgw:
+ frontend: civetweb
+
+ Note that without a modified fastcgi module e.g. with the default
+ one on CentOS, you must have rgw print continue = false in ceph.conf::
+
+ tasks:
+ - ceph:
+ conf:
+ global:
+ rgw print continue: false
+ - rgw: [client.0]
+
+ To use mod_proxy_fcgi instead of mod_fastcgi:
+
+ overrides:
+ rgw:
+ use_fcgi: true
+
+ To run rgws for multiple regions or zones, describe the regions
+ and their zones in a regions section. The endpoints will be
+ generated by this task. Each client must have a region, zone,
+ and pools assigned in ceph.conf::
+
+ tasks:
+ - install:
+ - ceph:
+ conf:
+ client.0:
+ rgw region: foo
+ rgw zone: foo-1
+ rgw region root pool: .rgw.rroot.foo
+ rgw zone root pool: .rgw.zroot.foo
+ rgw log meta: true
+ rgw log data: true
+ client.1:
+ rgw region: bar
+ rgw zone: bar-master
+ rgw region root pool: .rgw.rroot.bar
+ rgw zone root pool: .rgw.zroot.bar
+ rgw log meta: true
+ rgw log data: true
+ client.2:
+ rgw region: bar
+ rgw zone: bar-secondary
+ rgw region root pool: .rgw.rroot.bar
+ rgw zone root pool: .rgw.zroot.bar-secondary
+ - rgw:
+ default_idle_timeout: 30
+ ec-data-pool: true
+ erasure_code_profile:
+ k: 2
+ m: 1
+ ruleset-failure-domain: osd
+ realm: foo
+ regions:
+ foo:
+ api name: api_name # default: region name
+ is master: true # default: false
+ master zone: foo-1 # default: first zone
+ zones: [foo-1]
+ log meta: true
+ log data: true
+ placement targets: [target1, target2] # default: []
+ default placement: target2 # default: ''
+ bar:
+ api name: bar-api
+ zones: [bar-master, bar-secondary]
+ client.0:
+ system user:
+ name: foo-system
+ access key: X2IYPSTY1072DDY1SJMC
+ secret key: YIMHICpPvT+MhLTbSsiBJ1jQF15IFvJA8tgwJEcm
+ client.1:
+ system user:
+ name: bar1
+ access key: Y2IYPSTY1072DDY1SJMC
+ secret key: XIMHICpPvT+MhLTbSsiBJ1jQF15IFvJA8tgwJEcm
+ client.2:
+ system user:
+ name: bar2
+ access key: Z2IYPSTY1072DDY1SJMC
+ secret key: ZIMHICpPvT+MhLTbSsiBJ1jQF15IFvJA8tgwJEcm
+ """
+ if config is None:
+ config = dict(('client.{id}'.format(id=id_), None)
+ for id_ in teuthology.all_roles_of_type(
+ ctx.cluster, 'client'))
+ elif isinstance(config, list):
+ config = dict((name, None) for name in config)
+
+ overrides = ctx.config.get('overrides', {})
+ teuthology.deep_merge(config, overrides.get('rgw', {}))
+
+ regions = {}
+ if 'regions' in config:
+ # separate region info so only clients are keys in config
+ regions = config['regions']
+ del config['regions']
+
+ role_endpoints = assign_ports(ctx, config)
+ ctx.rgw = argparse.Namespace()
+ ctx.rgw.role_endpoints = role_endpoints
+ # stash the region info for later, since it was deleted from the config
+ # structure
+ ctx.rgw.regions = regions
+
+ realm = None
+ if 'realm' in config:
+        # separate realm info so only clients are keys in config
+ realm = config['realm']
+ del config['realm']
+ ctx.rgw.realm = realm
+
+ ctx.rgw.ec_data_pool = False
+ if 'ec-data-pool' in config:
+ ctx.rgw.ec_data_pool = bool(config['ec-data-pool'])
+ del config['ec-data-pool']
+ ctx.rgw.erasure_code_profile = {}
+ if 'erasure_code_profile' in config:
+ ctx.rgw.erasure_code_profile = config['erasure_code_profile']
+ del config['erasure_code_profile']
+ ctx.rgw.default_idle_timeout = 30
+ if 'default_idle_timeout' in config:
+ ctx.rgw.default_idle_timeout = int(config['default_idle_timeout'])
+ del config['default_idle_timeout']
+ ctx.rgw.cache_pools = False
+ if 'cache-pools' in config:
+ ctx.rgw.cache_pools = bool(config['cache-pools'])
+ del config['cache-pools']
+
+ ctx.rgw.frontend = 'civetweb'
+ if 'frontend' in config:
+ ctx.rgw.frontend = config['frontend']
+ del config['frontend']
+
+ ctx.rgw.use_fastcgi = True
+ if "use_fcgi" in config:
+ ctx.rgw.use_fastcgi = False
+ log.info("Using mod_proxy_fcgi instead of mod_fastcgi...")
+ del config['use_fcgi']
+
+ subtasks = [
+ lambda: create_nonregion_pools(
+ ctx=ctx, config=config, regions=regions),
+ ]
+
+ multisite = len(regions) > 1
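+    # a single region can still make this a multisite run if it defines more
+    # than one zone; the loop below checks for that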
+
+ if not multisite:
+ for zonegroup, zonegroup_info in regions.iteritems():
+ log.debug("zonegroup_info =%r", zonegroup_info)
+ if len(zonegroup_info['zones']) > 1:
+ multisite = True
+ break
+
+ log.debug('multisite %s', multisite)
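+    # a multisite run that spans more than one role list is treated as a
+    # multi-cluster run: the master client is configured first and the
+    # remaining clients pull the realm/period from it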
+ multi_cluster = multisite and len(ctx.config['roles']) > 1
+ log.debug('multi_cluster %s', multi_cluster)
+ master_client = None
+
+ if multi_cluster:
+ log.debug('multi cluster run')
+
+ master_client = get_config_master_client(ctx=ctx,
+ config=config,
+ regions=regions)
+ log.debug('master_client %r', master_client)
+ subtasks.extend([
+ lambda: configure_multisite_regions_and_zones(
+ ctx=ctx,
+ config=config,
+ regions=regions,
+ role_endpoints=role_endpoints,
+ realm=realm,
+ master_client = master_client,
+ )
+ ])
+
+ subtasks.extend([
+ lambda: configure_users_for_client(
+ ctx=ctx,
+ config=config,
+ client=master_client,
+ everywhere=False,
+ ),
+ ])
+
+ if ctx.rgw.frontend == 'apache':
+ subtasks.insert(0,
+ lambda: create_apache_dirs(ctx=ctx, config=config,
+ on_client=master_client))
+ subtasks.extend([
+ lambda: ship_apache_configs(ctx=ctx, config=config,
+ role_endpoints=role_endpoints, on_client=master_client),
+ lambda: start_rgw(ctx=ctx, config=config, on_client=master_client),
+ lambda: start_apache(ctx=ctx, config=config, on_client=master_client),
+ ])
+ elif ctx.rgw.frontend == 'civetweb':
+ subtasks.extend([
+ lambda: start_rgw(ctx=ctx, config=config, on_client=master_client),
+ ])
+ else:
+ raise ValueError("frontend must be 'apache' or 'civetweb'")
+
+ subtasks.extend([
+ lambda: pull_configuration(ctx=ctx,
+ config=config,
+ regions=regions,
+ role_endpoints=role_endpoints,
+ realm=realm,
+ master_client=master_client
+ ),
+ ])
+
+ subtasks.extend([
+ lambda: configure_users_for_client(
+ ctx=ctx,
+ config=config,
+ client=master_client,
+ everywhere=True
+ ),
+ ])
+
+ if ctx.rgw.frontend == 'apache':
+ subtasks.insert(0,
+ lambda: create_apache_dirs(ctx=ctx, config=config,
+ on_client=None,
+ except_client = master_client))
+ subtasks.extend([
+ lambda: ship_apache_configs(ctx=ctx, config=config,
+ role_endpoints=role_endpoints,
+ on_client=None,
+ except_client = master_client,
+ ),
+ lambda: start_rgw(ctx=ctx,
+ config=config,
+ on_client=None,
+ except_client = master_client),
+ lambda: start_apache(ctx=ctx,
+ config = config,
+ on_client=None,
+ except_client = master_client,
+ ),
+ ])
+ elif ctx.rgw.frontend == 'civetweb':
+ subtasks.extend([
+ lambda: start_rgw(ctx=ctx,
+ config=config,
+ on_client=None,
+ except_client = master_client),
+ ])
+ else:
+ raise ValueError("frontend must be 'apache' or 'civetweb'")
+
+ else:
+ log.debug('single cluster run')
+ subtasks.extend([
+ lambda: configure_regions_and_zones(
+ ctx=ctx,
+ config=config,
+ regions=regions,
+ role_endpoints=role_endpoints,
+ realm=realm,
+ ),
+ lambda: configure_users(
+ ctx=ctx,
+ config=config,
+ everywhere=True,
+ ),
+ ])
+ if ctx.rgw.frontend == 'apache':
+ subtasks.insert(0, lambda: create_apache_dirs(ctx=ctx, config=config))
+ subtasks.extend([
+ lambda: ship_apache_configs(ctx=ctx, config=config,
+ role_endpoints=role_endpoints),
+ lambda: start_rgw(ctx=ctx,
+ config=config),
+ lambda: start_apache(ctx=ctx, config=config),
+ ])
+ elif ctx.rgw.frontend == 'civetweb':
+ subtasks.extend([
+ lambda: start_rgw(ctx=ctx,
+ config=config),
+ ])
+ else:
+ raise ValueError("frontend must be 'apache' or 'civetweb'")
+
+ log.info("Using %s as radosgw frontend", ctx.rgw.frontend)
+ with contextutil.nested(*subtasks):
+ yield
--- /dev/null
+"""
+rgw s3tests logging wrappers
+"""
+from cStringIO import StringIO
+from configobj import ConfigObj
+import contextlib
+import logging
+import s3tests
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Run the s3tests download function.
+    """
+    with s3tests.download(ctx, config):
+        yield
+
+def _config_user(s3tests_conf, section, user):
+ """
+ Run s3tests user config function
+ """
+ return s3tests._config_user(s3tests_conf, section, user)
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+    """
+    Run the s3tests user creation function.
+    """
+    with s3tests.create_users(ctx, config):
+        yield
+
+@contextlib.contextmanager
+def configure(ctx, config):
+    """
+    Run the s3tests configure function.
+    """
+    with s3tests.configure(ctx, config):
+        yield
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+ """
+    Run the selected s3tests, then read the rgw opslog socket with netcat
+    to verify that it returns data.
+ """
+ assert isinstance(config, dict)
+ testdir = teuthology.get_testdir(ctx)
+ for client, client_config in config.iteritems():
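+        # limit the s3tests run to a single test; the opslog socket is
+        # checked for output further below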
+ client_config['extra_args'] = [
+ 's3tests.functional.test_s3:test_bucket_list_return_data',
+ ]
+# args = [
+# 'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
+# '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir),
+# '-w',
+# '{tdir}/s3-tests'.format(tdir=testdir),
+# '-v',
+# 's3tests.functional.test_s3:test_bucket_list_return_data',
+# ]
+# if client_config is not None and 'extra_args' in client_config:
+# args.extend(client_config['extra_args'])
+#
+# ctx.cluster.only(client).run(
+# args=args,
+# )
+
+ s3tests.run_tests(ctx, config)
+
+ netcat_out = StringIO()
+
+ for client, client_config in config.iteritems():
+ ctx.cluster.only(client).run(
+ args = [
+ 'netcat',
+ '-w', '5',
+ '-U', '{tdir}/rgw.opslog.sock'.format(tdir=testdir),
+ ],
+ stdout = netcat_out,
+ )
+
+ out = netcat_out.getvalue()
+
+ assert len(out) > 100
+
+    log.info('Received: %s', out)
+
+ yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+    Run a subset of the s3-tests suite against rgw, and verify that the opslog socket returns data
+
+ Must restrict testing to a particular client::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3tests: [client.0]
+
+ To pass extra arguments to nose (e.g. to run a certain test)::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3tests:
+ client.0:
+ extra_args: ['test_s3:test_object_acl_grand_public_read']
+ client.1:
+ extra_args: ['--exclude', 'test_100_continue']
+ """
+ assert config is None or isinstance(config, list) \
+ or isinstance(config, dict), \
+ "task s3tests only supports a list or dictionary for configuration"
+ all_clients = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ if config is None:
+ config = all_clients
+ if isinstance(config, list):
+ config = dict.fromkeys(config)
+ clients = config.keys()
+
+ overrides = ctx.config.get('overrides', {})
+ # merge each client section, not the top level.
+ for (client, cconf) in config.iteritems():
+ teuthology.deep_merge(cconf, overrides.get('rgw-logsocket', {}))
+
+ log.debug('config is %s', config)
+
+ s3tests_conf = {}
+ for client in clients:
+ s3tests_conf[client] = ConfigObj(
+ indent_type='',
+ infile={
+ 'DEFAULT':
+ {
+ 'port' : 7280,
+ 'is_secure' : 'no',
+ },
+ 'fixtures' : {},
+ 's3 main' : {},
+ 's3 alt' : {},
+ }
+ )
+
+ with contextutil.nested(
+ lambda: download(ctx=ctx, config=config),
+ lambda: create_users(ctx=ctx, config=dict(
+ clients=clients,
+ s3tests_conf=s3tests_conf,
+ )),
+ lambda: configure(ctx=ctx, config=dict(
+ clients=config,
+ s3tests_conf=s3tests_conf,
+ )),
+ lambda: run_tests(ctx=ctx, config=config),
+ ):
+ yield
--- /dev/null
+"""
+Run rgw s3 readwrite tests
+"""
+from cStringIO import StringIO
+import base64
+import contextlib
+import logging
+import os
+import random
+import string
+import yaml
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+from teuthology.orchestra.connection import split_user
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+ """
+ Download the s3 tests from the git builder.
+ Remove downloaded s3 file upon exit.
+
+ The context passed in should be identical to the context
+ passed in to the main task.
+ """
+ assert isinstance(config, dict)
+ log.info('Downloading s3-tests...')
+ testdir = teuthology.get_testdir(ctx)
+ for (client, cconf) in config.items():
+ branch = cconf.get('force-branch', None)
+ if not branch:
+ branch = cconf.get('branch', 'master')
+ sha1 = cconf.get('sha1')
+ ctx.cluster.only(client).run(
+ args=[
+ 'git', 'clone',
+ '-b', branch,
+ teuth_config.ceph_git_base_url + 's3-tests.git',
+ '{tdir}/s3-tests'.format(tdir=testdir),
+ ],
+ )
+ if sha1 is not None:
+ ctx.cluster.only(client).run(
+ args=[
+ 'cd', '{tdir}/s3-tests'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'git', 'reset', '--hard', sha1,
+ ],
+ )
+ try:
+ yield
+ finally:
+ log.info('Removing s3-tests...')
+ testdir = teuthology.get_testdir(ctx)
+ for client in config:
+ ctx.cluster.only(client).run(
+ args=[
+ 'rm',
+ '-rf',
+ '{tdir}/s3-tests'.format(tdir=testdir),
+ ],
+ )
+
+
+def _config_user(s3tests_conf, section, user):
+ """
+ Configure users for this section by stashing away keys, ids, and
+ email addresses.
+ """
+ s3tests_conf[section].setdefault('user_id', user)
+ s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user))
+ s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user))
+ s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20)))
+ s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40)))
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+ """
+ Create a default s3 user.
+ """
+ assert isinstance(config, dict)
+ log.info('Creating rgw users...')
+ testdir = teuthology.get_testdir(ctx)
+ users = {'s3': 'foo'}
+ cached_client_user_names = dict()
+ for client in config['clients']:
+ cached_client_user_names[client] = dict()
+ s3tests_conf = config['s3tests_conf'][client]
+ s3tests_conf.setdefault('readwrite', {})
+ s3tests_conf['readwrite'].setdefault('bucket', 'rwtest-' + client + '-{random}-')
+ s3tests_conf['readwrite'].setdefault('readers', 10)
+ s3tests_conf['readwrite'].setdefault('writers', 3)
+ s3tests_conf['readwrite'].setdefault('duration', 300)
+ s3tests_conf['readwrite'].setdefault('files', {})
+ rwconf = s3tests_conf['readwrite']
+ rwconf['files'].setdefault('num', 10)
+ rwconf['files'].setdefault('size', 2000)
+ rwconf['files'].setdefault('stddev', 500)
+ for section, user in users.iteritems():
+ _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
+ log.debug('creating user {user} on {client}'.format(user=s3tests_conf[section]['user_id'],
+ client=client))
+
+ # stash the 'delete_user' flag along with user name for easier cleanup
+ delete_this_user = True
+ if 'delete_user' in s3tests_conf['s3']:
+ delete_this_user = s3tests_conf['s3']['delete_user']
+ log.debug('delete_user set to {flag} for {client}'.format(flag=delete_this_user, client=client))
+ cached_client_user_names[client][section+user] = (s3tests_conf[section]['user_id'], delete_this_user)
+
+ # skip actual user creation if the create_user flag is set to false for this client
+ if 'create_user' in s3tests_conf['s3'] and s3tests_conf['s3']['create_user'] == False:
+ log.debug('create_user set to False, skipping user creation for {client}'.format(client=client))
+ continue
+ else:
+ ctx.cluster.only(client).run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'radosgw-admin',
+ '-n', client,
+ 'user', 'create',
+ '--uid', s3tests_conf[section]['user_id'],
+ '--display-name', s3tests_conf[section]['display_name'],
+ '--access-key', s3tests_conf[section]['access_key'],
+ '--secret', s3tests_conf[section]['secret_key'],
+ '--email', s3tests_conf[section]['email'],
+ ],
+ )
+ try:
+ yield
+ finally:
+ for client in config['clients']:
+ for section, user in users.iteritems():
+ #uid = '{user}.{client}'.format(user=user, client=client)
+ real_uid, delete_this_user = cached_client_user_names[client][section+user]
+ if delete_this_user:
+ ctx.cluster.only(client).run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'radosgw-admin',
+ '-n', client,
+ 'user', 'rm',
+ '--uid', real_uid,
+ '--purge-data',
+ ],
+ )
+ else:
+ log.debug('skipping delete for user {uid} on {client}'.format(uid=real_uid, client=client))
+
+@contextlib.contextmanager
+def configure(ctx, config):
+ """
+ Configure the s3-tests. This includes the running of the
+ bootstrap code and the updating of local conf files.
+ """
+ assert isinstance(config, dict)
+ log.info('Configuring s3-readwrite-tests...')
+ for client, properties in config['clients'].iteritems():
+ s3tests_conf = config['s3tests_conf'][client]
+ if properties is not None and 'rgw_server' in properties:
+ host = None
+ for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
+ log.info('roles: ' + str(roles))
+ log.info('target: ' + str(target))
+ if properties['rgw_server'] in roles:
+ _, host = split_user(target)
+ assert host is not None, "Invalid client specified as the rgw_server"
+ s3tests_conf['s3']['host'] = host
+ else:
+ s3tests_conf['s3']['host'] = 'localhost'
+
+ def_conf = s3tests_conf['DEFAULT']
+ s3tests_conf['s3'].setdefault('port', def_conf['port'])
+ s3tests_conf['s3'].setdefault('is_secure', def_conf['is_secure'])
+
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ remote.run(
+ args=[
+ 'cd',
+ '{tdir}/s3-tests'.format(tdir=teuthology.get_testdir(ctx)),
+ run.Raw('&&'),
+ './bootstrap',
+ ],
+ )
+ conf_fp = StringIO()
+ conf = dict(
+ s3=s3tests_conf['s3'],
+ readwrite=s3tests_conf['readwrite'],
+ )
+ yaml.safe_dump(conf, conf_fp, default_flow_style=False)
+ teuthology.write_file(
+ remote=remote,
+ path='{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=teuthology.get_testdir(ctx), client=client),
+ data=conf_fp.getvalue(),
+ )
+ yield
+
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+ """
+ Run the s3readwrite tests after everything is set up.
+
+ :param ctx: Context passed to task
+ :param config: specific configuration information
+ """
+ assert isinstance(config, dict)
+ testdir = teuthology.get_testdir(ctx)
+ for client, client_config in config.iteritems():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ conf = teuthology.get_file(remote, '{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=testdir, client=client))
+ args = [
+ '{tdir}/s3-tests/virtualenv/bin/s3tests-test-readwrite'.format(tdir=testdir),
+ ]
+ if client_config is not None and 'extra_args' in client_config:
+ args.extend(client_config['extra_args'])
+
+ ctx.cluster.only(client).run(
+ args=args,
+ stdin=conf,
+ )
+ yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run the s3tests-test-readwrite suite against rgw.
+
+ To run all tests on all clients::
+
+ tasks:
+ - ceph:
+ - rgw:
+ - s3readwrite:
+
+ To restrict testing to particular clients::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3readwrite: [client.0]
+
+ To run against a server on client.1::
+
+ tasks:
+ - ceph:
+ - rgw: [client.1]
+ - s3readwrite:
+ client.0:
+ rgw_server: client.1
+
+    To pass extra test arguments::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3readwrite:
+ client.0:
+ readwrite:
+ bucket: mybucket
+ readers: 10
+ writers: 3
+ duration: 600
+ files:
+ num: 10
+ size: 2000
+ stddev: 500
+ client.1:
+ ...
+
+    To override s3 configuration::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3readwrite:
+ client.0:
+ s3:
+ user_id: myuserid
+ display_name: myname
+ email: my@email
+ access_key: myaccesskey
+ secret_key: mysecretkey
+
+ """
+ assert config is None or isinstance(config, list) \
+ or isinstance(config, dict), \
+ "task s3tests only supports a list or dictionary for configuration"
+ all_clients = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ if config is None:
+ config = all_clients
+ if isinstance(config, list):
+ config = dict.fromkeys(config)
+ clients = config.keys()
+
+ overrides = ctx.config.get('overrides', {})
+ # merge each client section, not the top level.
+ for client in config.iterkeys():
+ if not config[client]:
+ config[client] = {}
+ teuthology.deep_merge(config[client], overrides.get('s3readwrite', {}))
+
+ log.debug('in s3readwrite, config is %s', config)
+
+ s3tests_conf = {}
+ for client in clients:
+ if config[client] is None:
+ config[client] = {}
+ config[client].setdefault('s3', {})
+ config[client].setdefault('readwrite', {})
+
+ s3tests_conf[client] = ({
+ 'DEFAULT':
+ {
+ 'port' : 7280,
+ 'is_secure' : False,
+ },
+ 'readwrite' : config[client]['readwrite'],
+ 's3' : config[client]['s3'],
+ })
+
+ with contextutil.nested(
+ lambda: download(ctx=ctx, config=config),
+ lambda: create_users(ctx=ctx, config=dict(
+ clients=clients,
+ s3tests_conf=s3tests_conf,
+ )),
+ lambda: configure(ctx=ctx, config=dict(
+ clients=config,
+ s3tests_conf=s3tests_conf,
+ )),
+ lambda: run_tests(ctx=ctx, config=config),
+ ):
+ pass
+ yield
--- /dev/null
+"""
+Run rgw roundtrip message tests
+"""
+from cStringIO import StringIO
+import base64
+import contextlib
+import logging
+import os
+import random
+import string
+import yaml
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+from teuthology.orchestra.connection import split_user
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+ """
+ Download the s3 tests from the git builder.
+ Remove downloaded s3 file upon exit.
+
+ The context passed in should be identical to the context
+ passed in to the main task.
+ """
+ assert isinstance(config, dict)
+ log.info('Downloading s3-tests...')
+ testdir = teuthology.get_testdir(ctx)
+ for (client, cconf) in config.iteritems():
+ branch = cconf.get('force-branch', None)
+ if not branch:
+ branch = cconf.get('branch', 'master')
+ ctx.cluster.only(client).run(
+ args=[
+ 'git', 'clone',
+ '-b', branch,
+ teuth_config.ceph_git_base_url + 's3-tests.git',
+ '{tdir}/s3-tests'.format(tdir=testdir),
+ ],
+ )
+ try:
+ yield
+ finally:
+ log.info('Removing s3-tests...')
+ for client in config:
+ ctx.cluster.only(client).run(
+ args=[
+ 'rm',
+ '-rf',
+ '{tdir}/s3-tests'.format(tdir=testdir),
+ ],
+ )
+
+def _config_user(s3tests_conf, section, user):
+ """
+ Configure users for this section by stashing away keys, ids, and
+ email addresses.
+ """
+ s3tests_conf[section].setdefault('user_id', user)
+ s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user))
+ s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user))
+ s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20)))
+ s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40)))
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+ """
+ Create a default s3 user.
+ """
+ assert isinstance(config, dict)
+ log.info('Creating rgw users...')
+ testdir = teuthology.get_testdir(ctx)
+ users = {'s3': 'foo'}
+ for client in config['clients']:
+ s3tests_conf = config['s3tests_conf'][client]
+ s3tests_conf.setdefault('roundtrip', {})
+ s3tests_conf['roundtrip'].setdefault('bucket', 'rttest-' + client + '-{random}-')
+ s3tests_conf['roundtrip'].setdefault('readers', 10)
+ s3tests_conf['roundtrip'].setdefault('writers', 3)
+ s3tests_conf['roundtrip'].setdefault('duration', 300)
+ s3tests_conf['roundtrip'].setdefault('files', {})
+ rtconf = s3tests_conf['roundtrip']
+ rtconf['files'].setdefault('num', 10)
+ rtconf['files'].setdefault('size', 2000)
+ rtconf['files'].setdefault('stddev', 500)
+ for section, user in [('s3', 'foo')]:
+ _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
+ ctx.cluster.only(client).run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'radosgw-admin',
+ '-n', client,
+ 'user', 'create',
+ '--uid', s3tests_conf[section]['user_id'],
+ '--display-name', s3tests_conf[section]['display_name'],
+ '--access-key', s3tests_conf[section]['access_key'],
+ '--secret', s3tests_conf[section]['secret_key'],
+ '--email', s3tests_conf[section]['email'],
+ ],
+ )
+ try:
+ yield
+ finally:
+ for client in config['clients']:
+ for user in users.itervalues():
+ uid = '{user}.{client}'.format(user=user, client=client)
+ ctx.cluster.only(client).run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'radosgw-admin',
+ '-n', client,
+ 'user', 'rm',
+ '--uid', uid,
+ '--purge-data',
+ ],
+ )
+
+@contextlib.contextmanager
+def configure(ctx, config):
+ """
+ Configure the s3-tests. This includes the running of the
+ bootstrap code and the updating of local conf files.
+ """
+ assert isinstance(config, dict)
+ log.info('Configuring s3-roundtrip-tests...')
+ testdir = teuthology.get_testdir(ctx)
+ for client, properties in config['clients'].iteritems():
+ s3tests_conf = config['s3tests_conf'][client]
+ if properties is not None and 'rgw_server' in properties:
+ host = None
+ for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
+ log.info('roles: ' + str(roles))
+ log.info('target: ' + str(target))
+ if properties['rgw_server'] in roles:
+ _, host = split_user(target)
+ assert host is not None, "Invalid client specified as the rgw_server"
+ s3tests_conf['s3']['host'] = host
+ else:
+ s3tests_conf['s3']['host'] = 'localhost'
+
+ def_conf = s3tests_conf['DEFAULT']
+ s3tests_conf['s3'].setdefault('port', def_conf['port'])
+ s3tests_conf['s3'].setdefault('is_secure', def_conf['is_secure'])
+
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ remote.run(
+ args=[
+ 'cd',
+ '{tdir}/s3-tests'.format(tdir=testdir),
+ run.Raw('&&'),
+ './bootstrap',
+ ],
+ )
+ conf_fp = StringIO()
+ conf = dict(
+ s3=s3tests_conf['s3'],
+ roundtrip=s3tests_conf['roundtrip'],
+ )
+ yaml.safe_dump(conf, conf_fp, default_flow_style=False)
+ teuthology.write_file(
+ remote=remote,
+ path='{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client),
+ data=conf_fp.getvalue(),
+ )
+ yield
+
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+ """
+    Run the s3 roundtrip tests after everything is set up.
+
+ :param ctx: Context passed to task
+ :param config: specific configuration information
+ """
+ assert isinstance(config, dict)
+ testdir = teuthology.get_testdir(ctx)
+ for client, client_config in config.iteritems():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ conf = teuthology.get_file(remote, '{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client))
+ args = [
+ '{tdir}/s3-tests/virtualenv/bin/s3tests-test-roundtrip'.format(tdir=testdir),
+ ]
+ if client_config is not None and 'extra_args' in client_config:
+ args.extend(client_config['extra_args'])
+
+ ctx.cluster.only(client).run(
+ args=args,
+ stdin=conf,
+ )
+ yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run the s3tests-test-roundtrip suite against rgw.
+
+ To run all tests on all clients::
+
+ tasks:
+ - ceph:
+ - rgw:
+ - s3roundtrip:
+
+ To restrict testing to particular clients::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3roundtrip: [client.0]
+
+ To run against a server on client.1::
+
+ tasks:
+ - ceph:
+ - rgw: [client.1]
+ - s3roundtrip:
+ client.0:
+ rgw_server: client.1
+
+    To pass extra test arguments::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3roundtrip:
+ client.0:
+ roundtrip:
+ bucket: mybucket
+ readers: 10
+ writers: 3
+ duration: 600
+ files:
+ num: 10
+ size: 2000
+ stddev: 500
+ client.1:
+ ...
+
+    To override s3 configuration::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3roundtrip:
+ client.0:
+ s3:
+ user_id: myuserid
+ display_name: myname
+ email: my@email
+ access_key: myaccesskey
+ secret_key: mysecretkey
+
+ """
+ assert config is None or isinstance(config, list) \
+ or isinstance(config, dict), \
+ "task s3tests only supports a list or dictionary for configuration"
+ all_clients = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ if config is None:
+ config = all_clients
+ if isinstance(config, list):
+ config = dict.fromkeys(config)
+ clients = config.keys()
+
+ s3tests_conf = {}
+ for client in clients:
+ if config[client] is None:
+ config[client] = {}
+ config[client].setdefault('s3', {})
+ config[client].setdefault('roundtrip', {})
+
+ s3tests_conf[client] = ({
+ 'DEFAULT':
+ {
+ 'port' : 7280,
+ 'is_secure' : False,
+ },
+ 'roundtrip' : config[client]['roundtrip'],
+ 's3' : config[client]['s3'],
+ })
+
+ with contextutil.nested(
+ lambda: download(ctx=ctx, config=config),
+ lambda: create_users(ctx=ctx, config=dict(
+ clients=clients,
+ s3tests_conf=s3tests_conf,
+ )),
+ lambda: configure(ctx=ctx, config=dict(
+ clients=config,
+ s3tests_conf=s3tests_conf,
+ )),
+ lambda: run_tests(ctx=ctx, config=config),
+ ):
+ pass
+ yield
--- /dev/null
+"""
+Run a set of s3 tests on rgw.
+"""
+from cStringIO import StringIO
+from configobj import ConfigObj
+import base64
+import contextlib
+import logging
+import os
+import random
+import string
+
+import util.rgw as rgw_utils
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+from teuthology.orchestra.connection import split_user
+
+log = logging.getLogger(__name__)
+
+def extract_sync_client_data(ctx, client_name):
+ """
+ Extract synchronized client rgw zone and rgw region information.
+
+ :param ctx: Context passed to the s3tests task
+    :param client_name: name of the client that we are syncing with
+ """
+ return_region_name = None
+ return_dict = None
+ client = ctx.ceph['ceph'].conf.get(client_name, None)
+ if client:
+ current_client_zone = client.get('rgw zone', None)
+ if current_client_zone:
+ (endpoint_host, endpoint_port) = ctx.rgw.role_endpoints.get(client_name, (None, None))
+ # pull out the radosgw_agent stuff
+ regions = ctx.rgw.regions
+ for region in regions:
+                log.debug('region is {region}'.format(region=region))
+ region_data = ctx.rgw.regions[region]
+ log.debug('region data is {region}'.format(region=region_data))
+ zones = region_data['zones']
+ for zone in zones:
+ if current_client_zone in zone:
+ return_region_name = region
+ return_dict = dict()
+ return_dict['api_name'] = region_data['api name']
+ return_dict['is_master'] = region_data['is master']
+ return_dict['port'] = endpoint_port
+ return_dict['host'] = endpoint_host
+
+                        # The s3tests expect the sync_agent_[addr|port] to be
+                        # set on the non-master node for some reason
+ if not region_data['is master']:
+ (rgwagent_host, rgwagent_port) = ctx.radosgw_agent.endpoint
+ (return_dict['sync_agent_addr'], _) = ctx.rgw.role_endpoints[rgwagent_host]
+ return_dict['sync_agent_port'] = rgwagent_port
+
+ else: #if client_zone:
+ log.debug('No zone info for {host}'.format(host=client_name))
+ else: # if client
+ log.debug('No ceph conf for {host}'.format(host=client_name))
+
+ return return_region_name, return_dict
+
+def update_conf_with_region_info(ctx, config, s3tests_conf):
+ """
+    Scan for a client (passed in s3tests_conf) that is part of a
+    radosgw-agent sync relationship. Update information in the local conf
+    file if such a client is found.
+ """
+ for key in s3tests_conf.keys():
+ # we'll assume that there's only one sync relationship (source / destination) with client.X
+ # as the key for now
+
+ # Iterate through all of the radosgw_agent (rgwa) configs and see if a
+ # given client is involved in a relationship.
+ # If a given client isn't, skip it
+ this_client_in_rgwa_config = False
+ for rgwa in ctx.radosgw_agent.config.keys():
+ rgwa_data = ctx.radosgw_agent.config[rgwa]
+
+ if key in rgwa_data['src'] or key in rgwa_data['dest']:
+ this_client_in_rgwa_config = True
+                log.debug('{client} is in a radosgw-agent sync relationship'.format(client=key))
+ radosgw_sync_data = ctx.radosgw_agent.config[key]
+ break
+ if not this_client_in_rgwa_config:
+            log.debug('{client} is NOT in a radosgw-agent sync relationship'.format(client=key))
+ continue
+
+ source_client = radosgw_sync_data['src']
+ dest_client = radosgw_sync_data['dest']
+
+        # Extract the pertinent info for the source side
+ source_region_name, source_region_dict = extract_sync_client_data(ctx, source_client)
+ log.debug('\t{key} source_region {source_region} source_dict {source_dict}'.format
+ (key=key,source_region=source_region_name,source_dict=source_region_dict))
+
+ # The source *should* be the master region, but test anyway and then set it as the default region
+ if source_region_dict['is_master']:
+ log.debug('Setting {region} as default_region'.format(region=source_region_name))
+ s3tests_conf[key]['fixtures'].setdefault('default_region', source_region_name)
+
+ # Extract the pertinent info for the destination side
+ dest_region_name, dest_region_dict = extract_sync_client_data(ctx, dest_client)
+ log.debug('\t{key} dest_region {dest_region} dest_dict {dest_dict}'.format
+ (key=key,dest_region=dest_region_name,dest_dict=dest_region_dict))
+
+ # now add these regions to the s3tests_conf object
+ s3tests_conf[key]['region {region_name}'.format(region_name=source_region_name)] = source_region_dict
+ s3tests_conf[key]['region {region_name}'.format(region_name=dest_region_name)] = dest_region_dict
+
+@contextlib.contextmanager
+def download(ctx, config):
+ """
+ Download the s3 tests from the git builder.
+ Remove downloaded s3 file upon exit.
+
+ The context passed in should be identical to the context
+ passed in to the main task.
+ """
+ assert isinstance(config, dict)
+ log.info('Downloading s3-tests...')
+ testdir = teuthology.get_testdir(ctx)
+ s3_branches = [ 'giant', 'firefly', 'firefly-original', 'hammer' ]
+ for (client, cconf) in config.items():
+ branch = cconf.get('force-branch', None)
+ if not branch:
+ ceph_branch = ctx.config.get('branch')
+ suite_branch = ctx.config.get('suite_branch', ceph_branch)
+ if suite_branch in s3_branches:
+ branch = cconf.get('branch', suite_branch)
+ else:
+ branch = cconf.get('branch', 'ceph-' + suite_branch)
+ if not branch:
+ raise ValueError(
+ "Could not determine what branch to use for s3tests!")
+ else:
+ log.info("Using branch '%s' for s3tests", branch)
+ sha1 = cconf.get('sha1')
+ ctx.cluster.only(client).run(
+ args=[
+ 'git', 'clone',
+ '-b', branch,
+ teuth_config.ceph_git_base_url + 's3-tests.git',
+ '{tdir}/s3-tests'.format(tdir=testdir),
+ ],
+ )
+ if sha1 is not None:
+ ctx.cluster.only(client).run(
+ args=[
+ 'cd', '{tdir}/s3-tests'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'git', 'reset', '--hard', sha1,
+ ],
+ )
+ try:
+ yield
+ finally:
+ log.info('Removing s3-tests...')
+ testdir = teuthology.get_testdir(ctx)
+ for client in config:
+ ctx.cluster.only(client).run(
+ args=[
+ 'rm',
+ '-rf',
+ '{tdir}/s3-tests'.format(tdir=testdir),
+ ],
+ )
+
+
+def _config_user(s3tests_conf, section, user):
+ """
+ Configure users for this section by stashing away keys, ids, and
+ email addresses.
+ """
+ s3tests_conf[section].setdefault('user_id', user)
+ s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user))
+ s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user))
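+    # the remaining credentials are generated per run: a random 20-character
+    # uppercase access key and a base64-encoded 40-byte secret key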
+ s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20)))
+ s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40)))
+
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+ """
+ Create a main and an alternate s3 user.
+ """
+ assert isinstance(config, dict)
+ log.info('Creating rgw users...')
+ testdir = teuthology.get_testdir(ctx)
+ users = {'s3 main': 'foo', 's3 alt': 'bar'}
+ for client in config['clients']:
+ s3tests_conf = config['s3tests_conf'][client]
+ s3tests_conf.setdefault('fixtures', {})
+ s3tests_conf['fixtures'].setdefault('bucket prefix', 'test-' + client + '-{random}-')
+ for section, user in users.iteritems():
+ _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
+ log.debug('Creating user {user} on {host}'.format(user=s3tests_conf[section]['user_id'], host=client))
+ ctx.cluster.only(client).run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'radosgw-admin',
+ '-n', client,
+ 'user', 'create',
+ '--uid', s3tests_conf[section]['user_id'],
+ '--display-name', s3tests_conf[section]['display_name'],
+ '--access-key', s3tests_conf[section]['access_key'],
+ '--secret', s3tests_conf[section]['secret_key'],
+ '--email', s3tests_conf[section]['email'],
+ ],
+ )
+ try:
+ yield
+ finally:
+ for client in config['clients']:
+ for user in users.itervalues():
+ uid = '{user}.{client}'.format(user=user, client=client)
+ ctx.cluster.only(client).run(
+ args=[
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'radosgw-admin',
+ '-n', client,
+ 'user', 'rm',
+ '--uid', uid,
+ '--purge-data',
+ ],
+ )
+
+
+@contextlib.contextmanager
+def configure(ctx, config):
+ """
+ Configure the s3-tests. This includes the running of the
+ bootstrap code and the updating of local conf files.
+ """
+ assert isinstance(config, dict)
+ log.info('Configuring s3-tests...')
+ testdir = teuthology.get_testdir(ctx)
+ for client, properties in config['clients'].iteritems():
+ s3tests_conf = config['s3tests_conf'][client]
+ if properties is not None and 'rgw_server' in properties:
+ host = None
+ for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
+ log.info('roles: ' + str(roles))
+ log.info('target: ' + str(target))
+ if properties['rgw_server'] in roles:
+ _, host = split_user(target)
+ assert host is not None, "Invalid client specified as the rgw_server"
+ s3tests_conf['DEFAULT']['host'] = host
+ else:
+ s3tests_conf['DEFAULT']['host'] = 'localhost'
+
+ if properties is not None and 'slow_backend' in properties:
+ s3tests_conf['fixtures']['slow backend'] = properties['slow_backend']
+
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ remote.run(
+ args=[
+ 'cd',
+ '{tdir}/s3-tests'.format(tdir=testdir),
+ run.Raw('&&'),
+ './bootstrap',
+ ],
+ )
+ conf_fp = StringIO()
+ s3tests_conf.write(conf_fp)
+ teuthology.write_file(
+ remote=remote,
+ path='{tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
+ data=conf_fp.getvalue(),
+ )
+
+ log.info('Configuring boto...')
+ boto_src = os.path.join(os.path.dirname(__file__), 'boto.cfg.template')
+ for client, properties in config['clients'].iteritems():
+ with file(boto_src, 'rb') as f:
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ conf = f.read().format(
+ idle_timeout=config.get('idle_timeout', 30)
+ )
+ teuthology.write_file(
+ remote=remote,
+ path='{tdir}/boto.cfg'.format(tdir=testdir),
+ data=conf,
+ )
+
+ try:
+ yield
+
+ finally:
+ log.info('Cleaning up boto...')
+ for client, properties in config['clients'].iteritems():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ remote.run(
+ args=[
+ 'rm',
+ '{tdir}/boto.cfg'.format(tdir=testdir),
+ ],
+ )
+
+@contextlib.contextmanager
+def sync_users(ctx, config):
+ """
+    Run a full radosgw-agent sync if this is a multi-region configuration.
+ """
+ assert isinstance(config, dict)
+ # do a full sync if this is a multi-region test
+ if rgw_utils.multi_region_enabled(ctx):
+ log.debug('Doing a full sync')
+ rgw_utils.radosgw_agent_sync_all(ctx)
+ else:
+ log.debug('Not a multi-region config; skipping the metadata sync')
+
+ yield
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+ """
+ Run the s3tests after everything is set up.
+
+ :param ctx: Context passed to task
+ :param config: specific configuration information
+ """
+ assert isinstance(config, dict)
+ testdir = teuthology.get_testdir(ctx)
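+    # build the nose attribute filter: skip tests tagged as known rgw
+    # failures, and also skip mod_proxy_fcgi failures unless mod_fastcgi is
+    # in use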
+ attrs = ["!fails_on_rgw"]
+ if not ctx.rgw.use_fastcgi:
+ attrs.append("!fails_on_mod_proxy_fcgi")
+ for client, client_config in config.iteritems():
+ args = [
+ 'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
+ 'BOTO_CONFIG={tdir}/boto.cfg'.format(tdir=testdir),
+ '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir),
+ '-w',
+ '{tdir}/s3-tests'.format(tdir=testdir),
+ '-v',
+ '-a', ','.join(attrs),
+ ]
+ if client_config is not None and 'extra_args' in client_config:
+ args.extend(client_config['extra_args'])
+
+ ctx.cluster.only(client).run(
+ args=args,
+ label="s3 tests against rgw"
+ )
+ yield
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run the s3-tests suite against rgw.
+
+ To run all tests on all clients::
+
+ tasks:
+ - ceph:
+ - rgw:
+ - s3tests:
+
+ To restrict testing to particular clients::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3tests: [client.0]
+
+ To run against a server on client.1 and increase the boto timeout to 10m::
+
+ tasks:
+ - ceph:
+ - rgw: [client.1]
+ - s3tests:
+ client.0:
+ rgw_server: client.1
+ idle_timeout: 600
+
+ To pass extra arguments to nose (e.g. to run a certain test)::
+
+ tasks:
+ - ceph:
+ - rgw: [client.0]
+ - s3tests:
+ client.0:
+ extra_args: ['test_s3:test_object_acl_grand_public_read']
+ client.1:
+ extra_args: ['--exclude', 'test_100_continue']
+ """
+ assert config is None or isinstance(config, list) \
+ or isinstance(config, dict), \
+ "task s3tests only supports a list or dictionary for configuration"
+ all_clients = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ if config is None:
+ config = all_clients
+ if isinstance(config, list):
+ config = dict.fromkeys(config)
+ clients = config.keys()
+
+ overrides = ctx.config.get('overrides', {})
+ # merge each client section, not the top level.
+ for client in config.iterkeys():
+ if not config[client]:
+ config[client] = {}
+ teuthology.deep_merge(config[client], overrides.get('s3tests', {}))
+
+ log.debug('s3tests config is %s', config)
+
+ s3tests_conf = {}
+ for client in clients:
+ s3tests_conf[client] = ConfigObj(
+ indent_type='',
+ infile={
+ 'DEFAULT':
+ {
+ 'port' : 7280,
+ 'is_secure' : 'no',
+ },
+ 'fixtures' : {},
+ 's3 main' : {},
+ 's3 alt' : {},
+ }
+ )
+
+ # Only attempt to add in the region info if there's a radosgw_agent configured
+ if hasattr(ctx, 'radosgw_agent'):
+ update_conf_with_region_info(ctx, config, s3tests_conf)
+
+ with contextutil.nested(
+ lambda: download(ctx=ctx, config=config),
+ lambda: create_users(ctx=ctx, config=dict(
+ clients=clients,
+ s3tests_conf=s3tests_conf,
+ )),
+ lambda: sync_users(ctx=ctx, config=config),
+ lambda: configure(ctx=ctx, config=dict(
+ clients=config,
+ s3tests_conf=s3tests_conf,
+ )),
+ lambda: run_tests(ctx=ctx, config=config),
+ ):
+ pass
+ yield
--- /dev/null
+"""
+Samba
+"""
+import contextlib
+import logging
+import sys
+import time
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+from teuthology.orchestra.daemon import DaemonGroup
+
+log = logging.getLogger(__name__)
+
+
+def get_sambas(ctx, roles):
+ """
+ Scan for samba roles. Yield the id of each samba role
+ (samba.0, samba.1, ...) and the associated remote.
+
+ :param ctx: Context
+ :param roles: roles for this test (extracted from yaml files)
+ """
+ for role in roles:
+ assert isinstance(role, basestring)
+ PREFIX = 'samba.'
+ assert role.startswith(PREFIX)
+ id_ = role[len(PREFIX):]
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+ yield (id_, remote)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Setup samba smbd with ceph vfs module. This task assumes the samba
+ package has already been installed via the install task.
+
+ The config is optional and defaults to starting samba on all nodes.
+ If a config is given, it is expected to be a list of
+ samba nodes to start smbd servers on.
+
+ Example that starts smbd on all samba nodes::
+
+ tasks:
+ - install:
+ - install:
+ project: samba
+ extra_packages: ['samba']
+ - ceph:
+ - samba:
+ - interactive:
+
+ Example that starts smbd on just one of the samba nodes and cifs on the other::
+
+ tasks:
+ - samba: [samba.0]
+ - cifs: [samba.1]
+
+ An optional backend can be specified, and requires a path which smbd will
+ use as the backend storage location::
+
+ roles:
+ - [osd.0, osd.1, osd.2, mon.0, mon.1, mon.2, mds.a]
+ - [client.0, samba.0]
+
+ tasks:
+ - ceph:
+ - ceph-fuse: [client.0]
+ - samba:
+ samba.0:
+ cephfuse: "{testdir}/mnt.0"
+
+ This mounts ceph to {testdir}/mnt.0 using fuse, and starts smbd with
+ a UNC of //localhost/cephfuse. Access through that UNC will be on
+ the ceph fuse mount point.
+
+ If no arguments are specified in the samba
+ role, the default behavior is to enable the ceph UNC //localhost/ceph
+ and use the ceph vfs module as the smbd backend.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ log.info("Setting up smbd with ceph vfs...")
+ assert config is None or isinstance(config, list) or isinstance(config, dict), \
+ "task samba got invalid config"
+
+ if config is None:
+ config = dict(('samba.{id}'.format(id=id_), None)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba'))
+ elif isinstance(config, list):
+ config = dict((name, None) for name in config)
+
+ samba_servers = list(get_sambas(ctx=ctx, roles=config.keys()))
+
+ testdir = teuthology.get_testdir(ctx)
+
+ if not hasattr(ctx, 'daemons'):
+ ctx.daemons = DaemonGroup()
+
+ for id_, remote in samba_servers:
+
+ rolestr = "samba.{id_}".format(id_=id_)
+
+ confextras = """vfs objects = ceph
+ ceph:config_file = /etc/ceph/ceph.conf"""
+
+ unc = "ceph"
+ backend = "/"
+
+ if config[rolestr] is not None:
+ # verify that there's just one parameter in role
+ if len(config[rolestr]) != 1:
+ log.error("samba config for role samba.{id_} must have only one parameter".format(id_=id_))
+ raise Exception('invalid config')
+ confextras = ""
+ (unc, backendstr) = config[rolestr].items()[0]
+ backend = backendstr.format(testdir=testdir)
+
+ # on first samba role, set ownership and permissions of ceph root
+ # so that samba tests succeed
+ if config[rolestr] is None and id_ == samba_servers[0][0]:
+ remote.run(
+ args=[
+ 'mkdir', '-p', '/tmp/cmnt', run.Raw('&&'),
+ 'sudo', 'ceph-fuse', '/tmp/cmnt', run.Raw('&&'),
+ 'sudo', 'chown', 'ubuntu:ubuntu', '/tmp/cmnt/', run.Raw('&&'),
+ 'sudo', 'chmod', '1777', '/tmp/cmnt/', run.Raw('&&'),
+ 'sudo', 'umount', '/tmp/cmnt/', run.Raw('&&'),
+ 'rm', '-rf', '/tmp/cmnt',
+ ],
+ )
+ else:
+ remote.run(
+ args=[
+ 'sudo', 'chown', 'ubuntu:ubuntu', backend, run.Raw('&&'),
+ 'sudo', 'chmod', '1777', backend,
+ ],
+ )
+
+ teuthology.sudo_write_file(remote, "/usr/local/samba/etc/smb.conf", """
+[global]
+ workgroup = WORKGROUP
+ netbios name = DOMAIN
+
+[{unc}]
+ path = {backend}
+ {extras}
+ writeable = yes
+ valid users = ubuntu
+""".format(extras=confextras, unc=unc, backend=backend))
+
+ # create ubuntu user
+ remote.run(
+ args=[
+ 'sudo', '/usr/local/samba/bin/smbpasswd', '-e', 'ubuntu',
+ run.Raw('||'),
+ 'printf', run.Raw('"ubuntu\nubuntu\n"'),
+ run.Raw('|'),
+ 'sudo', '/usr/local/samba/bin/smbpasswd', '-s', '-a', 'ubuntu'
+ ])
+
+ smbd_cmd = [
+ 'sudo',
+ 'daemon-helper',
+ 'term',
+ 'nostdin',
+ '/usr/local/samba/sbin/smbd',
+ '-F',
+ ]
+ ctx.daemons.add_daemon(remote, 'smbd', id_,
+ args=smbd_cmd,
+ logger=log.getChild("smbd.{id_}".format(id_=id_)),
+ stdin=run.PIPE,
+ wait=False,
+ )
+
+ # let smbd initialize, probably a better way...
+ seconds_to_sleep = 100
+ log.info('Sleeping for %s seconds...' % seconds_to_sleep)
+ time.sleep(seconds_to_sleep)
+ log.info('Sleeping stopped...')
+
+ try:
+ yield
+ finally:
+ log.info('Stopping smbd processes...')
+ exc_info = (None, None, None)
+ for d in ctx.daemons.iter_daemons_of_role('smbd'):
+ try:
+ d.stop()
+ except (run.CommandFailedError,
+ run.CommandCrashedError,
+ run.ConnectionLostError):
+ exc_info = sys.exc_info()
+ log.exception('Saw exception from %s.%s', d.role, d.id_)
+ if exc_info != (None, None, None):
+ raise exc_info[0], exc_info[1], exc_info[2]
+
+ for id_, remote in samba_servers:
+ remote.run(
+ args=[
+ 'sudo',
+ 'rm', '-rf',
+ '/usr/local/samba/etc/smb.conf',
+ '/usr/local/samba/private/*',
+ '/usr/local/samba/var/run/',
+ '/usr/local/samba/var/locks',
+ '/usr/local/samba/var/lock',
+ ],
+ )
+ # make sure daemons are gone
+ try:
+ remote.run(
+ args=[
+ 'while',
+ 'sudo', 'killall', '-9', 'smbd',
+ run.Raw(';'),
+ 'do', 'sleep', '1',
+ run.Raw(';'),
+ 'done',
+ ],
+ )
+
+ remote.run(
+ args=[
+ 'sudo',
+ 'lsof',
+ backend,
+ ],
+ check_status=False
+ )
+ remote.run(
+ args=[
+ 'sudo',
+ 'fuser',
+ '-M',
+ backend,
+ ],
+ check_status=False
+ )
+ except Exception:
+ log.exception("Saw exception")
+ pass
--- /dev/null
+"""
+Scrub osds
+"""
+import contextlib
+import gevent
+import logging
+import random
+import time
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run scrub periodically. Randomly chooses an OSD to scrub.
+
+ The config should be as follows:
+
+ scrub:
+ frequency: <seconds between scrubs>
+ deep: <bool for deepness>
+
+ example:
+
+ tasks:
+ - ceph:
+ - scrub:
+ frequency: 30
+ deep: 0
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'scrub task only accepts a dict for configuration'
+
+ log.info('Beginning scrub...')
+
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
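+ # wait for all OSDs to report up before spawning the scrubber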
+ while len(manager.get_osd_status()['up']) < num_osds:
+ time.sleep(10)
+
+ scrub_proc = Scrubber(
+ manager,
+ config,
+ )
+ try:
+ yield
+ finally:
+ log.info('joining scrub')
+ scrub_proc.do_join()
+
+class Scrubber:
+ """
+ Scrubbing is actually started during initialization: the constructor
+ spawns the background scrubbing thread.
+ """
+ def __init__(self, manager, config):
+ """
+ Wait for the cluster to become clean, then spawn the scrubbing thread.
+ """
+ self.ceph_manager = manager
+ self.ceph_manager.wait_for_clean()
+
+ osd_status = self.ceph_manager.get_osd_status()
+ self.osds = osd_status['up']
+
+ self.config = config
+ if self.config is None:
+ self.config = dict()
+
+ else:
+ def tmp(x):
+ """Local display"""
+ print x
+ self.log = tmp
+
+ self.stopping = False
+
+ log.info("spawning thread")
+
+ self.thread = gevent.spawn(self.do_scrub)
+
+ def do_join(self):
+ """Scrubbing thread finished"""
+ self.stopping = True
+ self.thread.get()
+
+ def do_scrub(self):
+ """Perform the scrub operation"""
+ frequency = self.config.get("frequency", 30)
+ deep = self.config.get("deep", 0)
+
+ log.info("stopping %s" % self.stopping)
+
+ while not self.stopping:
+ osd = str(random.choice(self.osds))
+
+ if deep:
+ cmd = 'deep-scrub'
+ else:
+ cmd = 'scrub'
+
+ log.info('%sbing %s' % (cmd, osd))
+ self.ceph_manager.raw_cluster_cmd('osd', cmd, osd)
+
+ time.sleep(frequency)
--- /dev/null
+"""Scrub testing"""
+from cStringIO import StringIO
+
+import contextlib
+import json
+import logging
+import os
+import time
+import tempfile
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+def wait_for_victim_pg(manager):
+ """Return a PG with some data and its acting set"""
+ # wait for some PG to have data that we can mess with
+ victim = None
+ while victim is None:
+ stats = manager.get_pg_stats()
+ for pg in stats:
+ size = pg['stat_sum']['num_bytes']
+ if size > 0:
+ victim = pg['pgid']
+ acting = pg['acting']
+ return victim, acting
+ time.sleep(3)
+
+
+def find_victim_object(ctx, pg, osd):
+ """Return a file to be fuzzed"""
+ (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
+ data_path = os.path.join(
+ '/var/lib/ceph/osd',
+ 'ceph-{id}'.format(id=osd),
+ 'fuse',
+ '{pg}_head'.format(pg=pg),
+ 'all',
+ )
+
+ # fuzz time
+ with contextlib.closing(StringIO()) as ls_fp:
+ osd_remote.run(
+ args=['sudo', 'ls', data_path],
+ stdout=ls_fp,
+ )
+ ls_out = ls_fp.getvalue()
+
+ # find an object file we can mess with (and not the pg info object)
+ osdfilename = next(line for line in ls_out.split('\n')
+ if not line.endswith('::::head#'))
+ assert osdfilename is not None
+
+ # Get actual object name from osd stored filename
+ objname = osdfilename.split(':')[4]
+ return osd_remote, os.path.join(data_path, osdfilename), objname
+
+
+def corrupt_file(osd_remote, path):
+ # put a single \0 at the beginning of the file
+ osd_remote.run(
+ args=['sudo', 'dd',
+ 'if=/dev/zero',
+ 'of=%s/data' % path,
+ 'bs=1', 'count=1', 'conv=notrunc']
+ )
+
+
+def get_pgnum(pgid):
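+ # pgid looks like '<pool>.<pg>'; return the part after the dot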
+ pos = pgid.find('.')
+ assert pos != -1
+ return pgid[pos+1:]
+
+
+def deep_scrub(manager, victim, pool):
+ # scrub, verify inconsistent
+ pgnum = get_pgnum(victim)
+ manager.do_pg_scrub(pool, pgnum, 'deep-scrub')
+
+ stats = manager.get_single_pg_stats(victim)
+ inconsistent = stats['state'].find('+inconsistent') != -1
+ assert inconsistent
+
+
+def repair(manager, victim, pool):
+ # repair, verify no longer inconsistent
+ pgnum = get_pgnum(victim)
+ manager.do_pg_scrub(pool, pgnum, 'repair')
+
+ stats = manager.get_single_pg_stats(victim)
+ inconsistent = stats['state'].find('+inconsistent') != -1
+ assert not inconsistent
+
+
+def test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, pool):
+ corrupt_file(osd_remote, obj_path)
+ deep_scrub(manager, pg, pool)
+ repair(manager, pg, pool)
+
+
+def test_repair_bad_omap(ctx, manager, pg, osd, objname):
+ # Test deep-scrub with various omap modifications
+ # Modify omap on specific osd
+ log.info('fuzzing omap of %s' % objname)
+ manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
+ manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
+ 'badkey', 'badval'])
+ manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])
+
+ deep_scrub(manager, pg, 'rbd')
+ # please note, the repair here is erroneous: it rewrites the correct omap
+ # digest and data digest on the replicas with the corresponding digests
+ # from the primary osd which is hosting the victim object, see
+ # find_victim_object().
+ # so we need to either put this test at the end of this task, or undo the
+ # mess-up manually before the "repair()" below (which just ensures the
+ # cleanup is sane); otherwise the succeeding tests will fail if they try
+ # to set "badkey" in the hope of getting an "inconsistent" pg with a
+ # deep-scrub.
+ manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'hdr'])
+ manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'badkey'])
+ manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
+ 'key', 'val'])
+ repair(manager, pg, 'rbd')
+
+
+class MessUp:
+ def __init__(self, manager, osd_remote, pool, osd_id,
+ obj_name, obj_path, omap_key, omap_val):
+ self.manager = manager
+ self.osd = osd_remote
+ self.pool = pool
+ self.osd_id = osd_id
+ self.obj = obj_name
+ self.path = obj_path
+ self.omap_key = omap_key
+ self.omap_val = omap_val
+
+ @contextlib.contextmanager
+ def _test_with_file(self, messup_cmd, *checks):
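+ # back up the object's data file, apply the mess-up command, let the
+ # caller run its scrub checks, then restore the original data
+ # (recreating the directory in case the mess-up removed it)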
+ temp = tempfile.mktemp()
+ backup_cmd = ['sudo', 'cp', os.path.join(self.path, 'data'), temp]
+ self.osd.run(args=backup_cmd)
+ self.osd.run(args=messup_cmd.split())
+ yield checks
+ create_cmd = ['sudo', 'mkdir', self.path]
+ self.osd.run(args=create_cmd, check_status=False)
+ restore_cmd = ['sudo', 'cp', temp, os.path.join(self.path, 'data')]
+ self.osd.run(args=restore_cmd)
+
+ def remove(self):
+ cmd = 'sudo rmdir {path}'.format(path=self.path)
+ return self._test_with_file(cmd, 'missing')
+
+ def append(self):
+ cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
+ 'conv=notrunc oflag=append'.format(path=self.path)
+ return self._test_with_file(cmd,
+ 'data_digest_mismatch',
+ 'size_mismatch')
+
+ def truncate(self):
+ cmd = 'sudo dd if=/dev/null of={path}/data'.format(path=self.path)
+ return self._test_with_file(cmd,
+ 'data_digest_mismatch',
+ 'size_mismatch')
+
+ def change_obj(self):
+ cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
+ 'conv=notrunc'.format(path=self.path)
+ return self._test_with_file(cmd,
+ 'data_digest_mismatch')
+
+ @contextlib.contextmanager
+ def rm_omap(self):
+ cmd = ['rmomapkey', self.pool, self.obj, self.omap_key]
+ self.manager.osd_admin_socket(self.osd_id, cmd)
+ yield ('omap_digest_mismatch',)
+ cmd = ['setomapval', self.pool, self.obj,
+ self.omap_key, self.omap_val]
+ self.manager.osd_admin_socket(self.osd_id, cmd)
+
+ @contextlib.contextmanager
+ def add_omap(self):
+ cmd = ['setomapval', self.pool, self.obj, 'badkey', 'badval']
+ self.manager.osd_admin_socket(self.osd_id, cmd)
+ yield ('omap_digest_mismatch',)
+ cmd = ['rmomapkey', self.pool, self.obj, 'badkey']
+ self.manager.osd_admin_socket(self.osd_id, cmd)
+
+ @contextlib.contextmanager
+ def change_omap(self):
+ cmd = ['setomapval', self.pool, self.obj, self.omap_key, 'badval']
+ self.manager.osd_admin_socket(self.osd_id, cmd)
+ yield ('omap_digest_mismatch',)
+ cmd = ['setomapval', self.pool, self.obj, self.omap_key, self.omap_val]
+ self.manager.osd_admin_socket(self.osd_id, cmd)
+
+
+class InconsistentObjChecker:
+ """Check the returned inconsistents/inconsistent info"""
+
+ def __init__(self, osd, acting, obj_name):
+ self.osd = osd
+ self.acting = acting
+ self.obj = obj_name
+ assert self.osd in self.acting
+
+ def basic_checks(self, inc):
+ assert inc['object']['name'] == self.obj
+ assert inc['object']['snap'] == "head"
+ assert len(inc['shards']) == len(self.acting), \
+ "the number of returned shard does not match with the acting set"
+
+ def run(self, check, inc):
+ func = getattr(self, check)
+ func(inc)
+
+ def _check_errors(self, inc, err_name):
+ bad_found = False
+ good_found = False
+ for shard in inc['shards']:
+ log.info('shard = %r' % shard)
+ log.info('err = %s' % err_name)
+ assert 'osd' in shard
+ osd = shard['osd']
+ err = err_name in shard['errors']
+ if osd == self.osd:
+ assert bad_found is False, \
+ "multiple entries found for the given OSD"
+ assert err is True, \
+ "Didn't find '{err}' in errors".format(err=err_name)
+ bad_found = True
+ else:
+ assert osd in self.acting, "shard not in acting set"
+ assert err is False, \
+ "Expected '{err}' in errors".format(err=err_name)
+ good_found = True
+ assert bad_found is True, \
+ "Shard for osd.{osd} not found".format(osd=self.osd)
+ assert good_found is True, \
+ "No other acting shards found"
+
+ def _check_attrs(self, inc, attr_name):
+ bad_attr = None
+ good_attr = None
+ for shard in inc['shards']:
+ log.info('shard = %r' % shard)
+ log.info('attr = %s' % attr_name)
+ assert 'osd' in shard
+ osd = shard['osd']
+ attr = shard.get(attr_name, False)
+ if osd == self.osd:
+ assert bad_attr is None, \
+ "multiple entries found for the given OSD"
+ bad_attr = attr
+ else:
+ assert osd in self.acting, "shard not in acting set"
+ assert good_attr is None or good_attr == attr, \
+ "multiple good attrs found"
+ good_attr = attr
+ assert bad_attr is not None, \
+ "bad {attr} not found".format(attr=attr_name)
+ assert good_attr is not None, \
+ "good {attr} not found".format(attr=attr_name)
+ assert good_attr != bad_attr, \
+ "bad attr is identical to the good ones: " \
+ "{0} == {1}".format(good_attr, bad_attr)
+
+ def data_digest_mismatch(self, inc):
+ assert 'data_digest_mismatch' in inc['errors']
+ self._check_attrs(inc, 'data_digest')
+
+ def missing(self, inc):
+ assert 'missing' in inc['union_shard_errors']
+ self._check_errors(inc, 'missing')
+
+ def size_mismatch(self, inc):
+ assert 'size_mismatch' in inc['errors']
+ self._check_attrs(inc, 'size')
+
+ def omap_digest_mismatch(self, inc):
+ assert 'omap_digest_mismatch' in inc['errors']
+ self._check_attrs(inc, 'omap_digest')
+
+
+def test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd_id,
+ obj_name, obj_path):
+ mon = manager.controller
+ pool = 'rbd'
+ omap_key = 'key'
+ omap_val = 'val'
+ manager.do_rados(mon, ['-p', pool, 'setomapval', obj_name,
+ omap_key, omap_val])
+ # Update missing digests, requires "osd deep scrub update digest min age: 0"
+ pgnum = get_pgnum(pg)
+ manager.do_pg_scrub(pool, pgnum, 'deep-scrub')
+
+ messup = MessUp(manager, osd_remote, pool, osd_id, obj_name, obj_path,
+ omap_key, omap_val)
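+ # apply each mess-up in turn, deep-scrub, and verify that
+ # list-inconsistent-pg/obj report the expected inconsistencies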
+ for test in [messup.rm_omap, messup.add_omap, messup.change_omap,
+ messup.append, messup.truncate, messup.change_obj,
+ messup.remove]:
+ with test() as checks:
+ deep_scrub(manager, pg, pool)
+ cmd = 'rados list-inconsistent-pg {pool} ' \
+ '--format=json'.format(pool=pool)
+ with contextlib.closing(StringIO()) as out:
+ mon.run(args=cmd.split(), stdout=out)
+ pgs = json.loads(out.getvalue())
+ assert pgs == [pg]
+
+ cmd = 'rados list-inconsistent-obj {pg} ' \
+ '--format=json'.format(pg=pg)
+ with contextlib.closing(StringIO()) as out:
+ mon.run(args=cmd.split(), stdout=out)
+ objs = json.loads(out.getvalue())
+ assert len(objs['inconsistents']) == 1
+
+ checker = InconsistentObjChecker(osd_id, acting, obj_name)
+ inc_obj = objs['inconsistents'][0]
+ log.info('inc = %r', inc_obj)
+ checker.basic_checks(inc_obj)
+ for check in checks:
+ checker.run(check, inc_obj)
+
+
+def task(ctx, config):
+ """
+ Test [deep] scrub
+
+ tasks:
+ - chef:
+ - install:
+ - ceph:
+ log-whitelist:
+ - '!= data_digest'
+ - '!= omap_digest'
+ - '!= size'
+ - deep-scrub 0 missing, 1 inconsistent objects
+ - deep-scrub [0-9]+ errors
+ - repair 0 missing, 1 inconsistent objects
+ - repair [0-9]+ errors, [0-9]+ fixed
+ - shard [0-9]+ missing
+ - deep-scrub 1 missing, 1 inconsistent objects
+ - does not match object info size
+ - attr name mismatch
+ - deep-scrub 1 missing, 0 inconsistent objects
+ - failed to pick suitable auth object
+ conf:
+ osd:
+ osd deep scrub update digest min age: 0
+ - scrub_test:
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'scrub_test task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+ num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+ log.info('num_osds is %s' % num_osds)
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ while len(manager.get_osd_status()['up']) < num_osds:
+ time.sleep(10)
+
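+ # expose each OSD's on-disk objects through the objectstore fuse mount
+ # (/var/lib/ceph/osd/ceph-<id>/fuse) so the tests can corrupt them directly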
+ for i in range(num_osds):
+ manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
+ '--', '--osd-objectstore-fuse')
+ for i in range(num_osds):
+ manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
+ manager.wait_for_clean()
+
+ # write some data
+ p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
+ 'write', '-b', '4096'])
+ log.info('err is %d' % p.exitstatus)
+
+ # wait for some PG to have data that we can mess with
+ pg, acting = wait_for_victim_pg(manager)
+ osd = acting[0]
+
+ osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
+ manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
+ log.info('err is %d' % p.exitstatus)
+ manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
+ log.info('err is %d' % p.exitstatus)
+
+ # Update missing digests, requires "osd deep scrub update digest min age: 0"
+ pgnum = get_pgnum(pg)
+ manager.do_pg_scrub('rbd', pgnum, 'deep-scrub')
+
+ log.info('messing with PG %s on osd %d' % (pg, osd))
+ test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd')
+ test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
+ test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
+ obj_name, obj_path)
+ log.info('test successful!')
+
+ # shut down fuse mount
+ for i in range(num_osds):
+ manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
+ '--', '--no-osd-objectstore-fuse')
+ time.sleep(5)
+ log.info('done')
--- /dev/null
+"""
+Systemd test
+"""
+import contextlib
+import logging
+import re
+import time
+
+from cStringIO import StringIO
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ tasks:
+ - ceph-deploy:
+ - systemd:
+
+ Test that the ceph systemd services can start, stop and restart, and
+ check for any failed services and report back errors.
+ """
+ for remote, roles in ctx.cluster.remotes.iteritems():
+ remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
+ 'grep', 'ceph'])
+ r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
+ 'grep', 'ceph'], stdout=StringIO(),
+ check_status=False)
+ log.info(r.stdout.getvalue())
+ if 'failed' in r.stdout.getvalue():
+ log.info("Ceph services in failed state")
+
+ # test overall service stop and start using ceph.target
+ # ceph.target tests are meant for ceph systemd tests
+ # and not actual process testing using 'ps'
+ log.info("Stopping all Ceph services")
+ remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+ r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
+ stdout=StringIO(), check_status=False)
+ log.info(r.stdout.getvalue())
+ log.info("Checking process status")
+ r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
+ 'grep', 'ceph'], stdout=StringIO())
+ if r.stdout.getvalue().find('Active: inactive'):
+ log.info("Sucessfully stopped all ceph services")
+ else:
+ log.info("Failed to stop ceph services")
+
+ log.info("Starting all Ceph services")
+ remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
+ r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
+ stdout=StringIO())
+ log.info(r.stdout.getvalue())
+ if 'Active: active' in r.stdout.getvalue():
+ log.info("Successfully started all Ceph services")
+ else:
+ log.info("Failed to start Ceph services")
+ r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
+ 'grep', 'ceph'], stdout=StringIO())
+ log.info(r.stdout.getvalue())
+ time.sleep(4)
+
+ # test individual services start stop
+ name = remote.shortname
+ mon_name = 'ceph-mon@' + name + '.service'
+ mds_name = 'ceph-mds@' + name + '.service'
+ mgr_name = 'ceph-mgr@' + name + '.service'
+ mon_role_name = 'mon.' + name
+ mds_role_name = 'mds.' + name
+ mgr_role_name = 'mgr.' + name
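+ # pull the osd id out of the earlier 'ps' output (e.g. '--id 3 --setuser
+ # ceph') so we can exercise that osd's ceph-osd@<id>.service unit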
+ m_osd = re.search('--id (\d+) --setuser ceph', r.stdout.getvalue())
+ if m_osd:
+ osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
+ remote.run(args=['sudo', 'systemctl', 'status',
+ osd_service])
+ remote.run(args=['sudo', 'systemctl', 'stop',
+ osd_service])
+ time.sleep(4) # immediate check will result in deactivating state
+ r = remote.run(args=['sudo', 'systemctl', 'status', osd_service],
+ stdout=StringIO(), check_status=False)
+ log.info(r.stdout.getvalue())
+ if 'Active: inactive' in r.stdout.getvalue():
+ log.info("Successfully stopped single osd ceph service")
+ else:
+ log.info("Failed to stop ceph osd services")
+ remote.run(args=['sudo', 'systemctl', 'start',
+ osd_service])
+ time.sleep(4)
+ if mon_role_name in roles:
+ remote.run(args=['sudo', 'systemctl', 'status', mon_name])
+ remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
+ time.sleep(4) # immediate check will result in deactivating state
+ r = remote.run(args=['sudo', 'systemctl', 'status', mon_name],
+ stdout=StringIO(), check_status=False)
+ if 'Active: inactive' in r.stdout.getvalue():
+ log.info("Successfully stopped single mon ceph service")
+ else:
+ log.info("Failed to stop ceph mon service")
+ remote.run(args=['sudo', 'systemctl', 'start', mon_name])
+ time.sleep(4)
+ if mgr_role_name in roles:
+ remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
+ remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
+ time.sleep(4) # immediate check will result in deactivating state
+ r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name],
+ stdout=StringIO(), check_status=False)
+ if 'Active: inactive' in r.stdout.getvalue():
+ log.info("Successfully stopped single ceph mgr service")
+ else:
+ log.info("Failed to stop ceph mgr service")
+ remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
+ time.sleep(4)
+ if mds_role_name in roles:
+ remote.run(args=['sudo', 'systemctl', 'status', mds_name])
+ remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
+ time.sleep(4) # immediate check will result in deactivating state
+ r = remote.run(args=['sudo', 'systemctl', 'status', mds_name],
+ stdout=StringIO(), check_status=False)
+ if 'Active: inactive' in r.stdout.getvalue():
+ log.info("Successfully stopped single ceph mds service")
+ else:
+ log.info("Failed to stop ceph mds service")
+ remote.run(args=['sudo', 'systemctl', 'start', mds_name])
+ time.sleep(4)
+ yield
--- /dev/null
+# py.test -v -s tests/test_buildpackages.py
+
+from mock import patch, Mock
+
+from .. import buildpackages
+from teuthology import packaging
+
+def test_get_tag_branch_sha1():
+ gitbuilder = packaging.GitbuilderProject(
+ 'ceph',
+ {
+ 'os_type': 'centos',
+ 'os_version': '7.0',
+ })
+ (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
+ assert tag == None
+ assert branch == None
+ assert sha1 is not None
+
+ gitbuilder = packaging.GitbuilderProject(
+ 'ceph',
+ {
+ 'os_type': 'centos',
+ 'os_version': '7.0',
+ 'sha1': 'asha1',
+ })
+ (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
+ assert tag == None
+ assert branch == None
+ assert sha1 == 'asha1'
+
+ remote = Mock
+ remote.arch = 'x86_64'
+ remote.os = Mock
+ remote.os.name = 'ubuntu'
+ remote.os.version = '14.04'
+ remote.os.codename = 'trusty'
+ remote.system_type = 'deb'
+ ctx = Mock
+ ctx.cluster = Mock
+ ctx.cluster.remotes = {remote: ['client.0']}
+
+ expected_tag = 'v0.94.1'
+ expected_sha1 = 'expectedsha1'
+ def check_output(cmd, shell):
+ assert shell == True
+ return expected_sha1 + " refs/tags/" + expected_tag
+ with patch.multiple(
+ buildpackages,
+ check_output=check_output,
+ ):
+ gitbuilder = packaging.GitbuilderProject(
+ 'ceph',
+ {
+ 'os_type': 'centos',
+ 'os_version': '7.0',
+ 'sha1': 'asha1',
+ 'all': {
+ 'tag': tag,
+ },
+ },
+ ctx = ctx,
+ remote = remote)
+ (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
+ assert tag == expected_tag
+ assert branch == None
+ assert sha1 == expected_sha1
+
+ expected_branch = 'hammer'
+ expected_sha1 = 'otherexpectedsha1'
+ def check_output(cmd, shell):
+ assert shell == True
+ return expected_sha1 + " refs/heads/" + expected_branch
+ with patch.multiple(
+ buildpackages,
+ check_output=check_output,
+ ):
+ gitbuilder = packaging.GitbuilderProject(
+ 'ceph',
+ {
+ 'os_type': 'centos',
+ 'os_version': '7.0',
+ 'sha1': 'asha1',
+ 'all': {
+ 'branch': branch,
+ },
+ },
+ ctx = ctx,
+ remote = remote)
+ (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
+ assert tag == None
+ assert branch == expected_branch
+ assert sha1 == expected_sha1
+
+def test_lookup_configs():
+ expected_system_type = 'deb'
+ def make_remote():
+ remote = Mock()
+ remote.arch = 'x86_64'
+ remote.os = Mock()
+ remote.os.name = 'ubuntu'
+ remote.os.version = '14.04'
+ remote.os.codename = 'trusty'
+ remote.system_type = expected_system_type
+ return remote
+ ctx = Mock()
+ class cluster:
+ remote1 = make_remote()
+ remote2 = make_remote()
+ remotes = {
+ remote1: ['client.0'],
+ remote2: ['mon.a','osd.0'],
+ }
+ def only(self, role):
+ result = Mock()
+ if role in ('client.0',):
+ result.remotes = { cluster.remote1: None }
+ elif role in ('osd.0', 'mon.a'):
+ result.remotes = { cluster.remote2: None }
+ else:
+ result.remotes = None
+ return result
+ ctx.cluster = cluster()
+ ctx.config = {
+ 'roles': [ ['client.0'], ['mon.a','osd.0'] ],
+ }
+
+ # nothing -> nothing
+ assert buildpackages.lookup_configs(ctx, {}) == []
+ assert buildpackages.lookup_configs(ctx, {1:[1,2,3]}) == []
+ assert buildpackages.lookup_configs(ctx, [[1,2,3]]) == []
+ assert buildpackages.lookup_configs(ctx, None) == []
+
+ #
+ # the overrides applies to install and to install.upgrade
+ # that have no tag, branch or sha1
+ #
+ config = {
+ 'overrides': {
+ 'install': {
+ 'ceph': {
+ 'sha1': 'overridesha1',
+ 'tag': 'overridetag',
+ 'branch': 'overridebranch',
+ },
+ },
+ },
+ 'tasks': [
+ {
+ 'install': {
+ 'sha1': 'installsha1',
+ },
+ },
+ {
+ 'install.upgrade': {
+ 'osd.0': {
+ },
+ 'client.0': {
+ 'sha1': 'client0sha1',
+ },
+ },
+ }
+ ],
+ }
+ ctx.config = config
+ expected_configs = [{'branch': 'overridebranch', 'sha1': 'overridesha1', 'tag': 'overridetag'},
+ {'project': 'ceph', 'branch': 'overridebranch', 'sha1': 'overridesha1', 'tag': 'overridetag'},
+ {'project': 'ceph', 'sha1': 'client0sha1'}]
+
+ assert buildpackages.lookup_configs(ctx, config) == expected_configs
--- /dev/null
+from textwrap import dedent
+
+from .. import devstack
+
+
+class TestDevstack(object):
+ def test_parse_os_table(self):
+ table_str = dedent("""
+ +---------------------+--------------------------------------+
+ | Property | Value |
+ +---------------------+--------------------------------------+
+ | attachments | [] |
+ | availability_zone | nova |
+ | bootable | false |
+ | created_at | 2014-02-21T17:14:47.548361 |
+ | display_description | None |
+ | display_name | NAME |
+ | id | ffdbd1bb-60dc-4d95-acfe-88774c09ad3e |
+ | metadata | {} |
+ | size | 1 |
+ | snapshot_id | None |
+ | source_volid | None |
+ | status | creating |
+ | volume_type | None |
+ +---------------------+--------------------------------------+
+ """).strip()
+ expected = {
+ 'Property': 'Value',
+ 'attachments': '[]',
+ 'availability_zone': 'nova',
+ 'bootable': 'false',
+ 'created_at': '2014-02-21T17:14:47.548361',
+ 'display_description': 'None',
+ 'display_name': 'NAME',
+ 'id': 'ffdbd1bb-60dc-4d95-acfe-88774c09ad3e',
+ 'metadata': '{}',
+ 'size': '1',
+ 'snapshot_id': 'None',
+ 'source_volid': 'None',
+ 'status': 'creating',
+ 'volume_type': 'None'}
+
+ vol_info = devstack.parse_os_table(table_str)
+ assert vol_info == expected
+
+
+
+
--- /dev/null
+from mock import Mock
+
+from .. import radosgw_admin
+
+acl_with_version = """<?xml version="1.0" encoding="UTF-8"?><AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy>
+""" # noqa
+
+
+acl_without_version = """<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy>
+""" # noqa
+
+
+class TestGetAcl(object):
+
+ def setup(self):
+ self.key = Mock()
+
+ def test_removes_xml_version(self):
+ self.key.get_xml_acl = Mock(return_value=acl_with_version)
+ result = radosgw_admin.get_acl(self.key)
+ assert result.startswith('<AccessControlPolicy')
+
+ def test_xml_version_is_already_removed(self):
+ self.key.get_xml_acl = Mock(return_value=acl_without_version)
+ result = radosgw_admin.get_acl(self.key)
+ assert result.startswith('<AccessControlPolicy')
+
+ def test_newline_gets_trimmed(self):
+ self.key.get_xml_acl = Mock(return_value=acl_without_version)
+ result = radosgw_admin.get_acl(self.key)
+ assert result.endswith('\n') is False
--- /dev/null
+import logging
+from teuthology import misc
+from teuthology.task import Task
+
+log = logging.getLogger(__name__)
+
+
+class TeuthologyIntegration(Task):
+
+ def begin(self):
+ misc.sh("""
+ set -x
+ pip install tox
+ tox
+ # tox -e py27-integration
+ tox -e openstack-integration
+ """)
+
+task = TeuthologyIntegration
--- /dev/null
+"""
+Task to handle tgt
+
+Assumptions made:
+ The ceph-extras tgt package may need to get installed.
+ The open-iscsi package needs to get installed.
+"""
+import logging
+import contextlib
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def start_tgt_remotes(ctx, start_tgtd):
+ """
+ This subtask starts up a tgtd on the clients specified
+ """
+ remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+ tgtd_list = []
+ for rem, roles in remotes.iteritems():
+ for _id in roles:
+ if _id in start_tgtd:
+ if not rem in tgtd_list:
+ tgtd_list.append(rem)
+ size = ctx.config.get('image_size', 10240)
+ rem.run(
+ args=[
+ 'rbd',
+ 'create',
+ 'iscsi-image',
+ '--size',
+ str(size),
+ ])
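+ # expose the image over iscsi: create target 1 named 'rbd', attach the
+ # rbd-backed logical unit, then bind the target to all initiators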
+ rem.run(
+ args=[
+ 'sudo',
+ 'tgtadm',
+ '--lld',
+ 'iscsi',
+ '--mode',
+ 'target',
+ '--op',
+ 'new',
+ '--tid',
+ '1',
+ '--targetname',
+ 'rbd',
+ ])
+ rem.run(
+ args=[
+ 'sudo',
+ 'tgtadm',
+ '--lld',
+ 'iscsi',
+ '--mode',
+ 'logicalunit',
+ '--op',
+ 'new',
+ '--tid',
+ '1',
+ '--lun',
+ '1',
+ '--backing-store',
+ 'iscsi-image',
+ '--bstype',
+ 'rbd',
+ ])
+ rem.run(
+ args=[
+ 'sudo',
+ 'tgtadm',
+ '--lld',
+ 'iscsi',
+ '--op',
+ 'bind',
+ '--mode',
+ 'target',
+ '--tid',
+ '1',
+ '-I',
+ 'ALL',
+ ])
+ try:
+ yield
+
+ finally:
+ for rem in tgtd_list:
+ rem.run(
+ args=[
+ 'sudo',
+ 'tgtadm',
+ '--lld',
+ 'iscsi',
+ '--mode',
+ 'target',
+ '--op',
+ 'delete',
+ '--force',
+ '--tid',
+ '1',
+ ])
+ rem.run(
+ args=[
+ 'rbd',
+ 'snap',
+ 'purge',
+ 'iscsi-image',
+ ])
+ rem.run(
+ args=[
+ 'sudo',
+ 'rbd',
+ 'rm',
+ 'iscsi-image',
+ ])
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Start up tgt.
+
+ To start on all clients::
+
+ tasks:
+ - ceph:
+ - tgt:
+
+ To start on certain clients::
+
+ tasks:
+ - ceph:
+ - tgt: [client.0, client.3]
+
+ or
+
+ tasks:
+ - ceph:
+ - tgt:
+ client.0:
+ client.3:
+
+ An image size can also be specified::
+
+ tasks:
+ - ceph:
+ - tgt:
+ image_size: 20480
+
+ The general flow of things here is:
+ 1. Find clients on which tgt is supposed to run (start_tgtd)
+ 2. Remotely start up tgt daemon
+ On cleanup:
+ 3. Stop tgt daemon
+
+ The iscsi administration is handled by the iscsi task.
+ """
+ if config:
+ config = {key : val for key, val in config.items()
+ if key.startswith('client')}
+ # config at this point should only contain keys starting with 'client'
+ start_tgtd = []
+ remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+ log.info(remotes)
+ if not config:
+ start_tgtd = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ else:
+ start_tgtd = config
+ log.info(start_tgtd)
+ with contextutil.nested(
+ lambda: start_tgt_remotes(ctx=ctx, start_tgtd=start_tgtd),):
+ yield
--- /dev/null
+"""
+Thrash -- Simulate random pool snapshot creation and removal.
+"""
+import contextlib
+import logging
+import gevent
+import time
+import random
+
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ "Thrash" snap creation and removal on the listed pools
+
+ Example:
+
+ thrash_pool_snaps:
+ pools: [.rgw.buckets, .rgw.buckets.index]
+ max_snaps: 10
+ min_snaps: 5
+ period: 10
+ """
+ stopping = False
+ def do_thrash():
+ pools = config.get('pools', [])
+ max_snaps = config.get('max_snaps', 10)
+ min_snaps = config.get('min_snaps', 5)
+ period = config.get('period', 30)
+ snaps = []
+ manager = ctx.managers['ceph']
+ def remove_snap():
+ assert len(snaps) > 0
+ snap = random.choice(snaps)
+ log.info("Removing snap %s" % (snap,))
+ for pool in pools:
+ manager.remove_pool_snap(pool, str(snap))
+ snaps.remove(snap)
+ def add_snap(snap):
+ log.info("Adding snap %s" % (snap,))
+ for pool in pools:
+ manager.add_pool_snap(pool, str(snap))
+ snaps.append(snap)
+ index = 0
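+ # every `period` seconds: add a snap while at or below min_snaps, remove
+ # one while at or above max_snaps, otherwise randomly do either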
+ while not stopping:
+ index += 1
+ time.sleep(period)
+ if len(snaps) <= min_snaps:
+ add_snap(index)
+ elif len(snaps) >= max_snaps:
+ remove_snap()
+ else:
+ random.choice([lambda: add_snap(index), remove_snap])()
+ log.info("Stopping")
+ thread = gevent.spawn(do_thrash)
+ yield
+ stopping = True
+ thread.join()
+
--- /dev/null
+"""
+Thrash -- Simulate random osd failures.
+"""
+import contextlib
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ "Thrash" the OSDs by randomly marking them out/down (and then back
+ in) until the task is ended. This loops, and every op_delay
+ seconds it randomly chooses to add or remove an OSD (even odds)
+ unless there are fewer than min_out OSDs out of the cluster, or
+ more than min_in OSDs in the cluster.
+
+ All commands are run on mon0 and it stops when __exit__ is called.
+
+ The config is optional, and is a dict containing some or all of:
+
+ cluster: (default 'ceph') the name of the cluster to thrash
+
+ min_in: (default 3) the minimum number of OSDs to keep in the
+ cluster
+
+ min_out: (default 0) the minimum number of OSDs to keep out of the
+ cluster
+
+ op_delay: (5) the length of time to sleep between changing an
+ OSD's status
+
+ min_dead: (0) minimum number of osds to leave down/dead.
+
+ max_dead: (0) maximum number of osds to leave down/dead before waiting
+ for clean. This should probably be num_replicas - 1.
+
+ clean_interval: (60) the approximate length of time to loop before
+ waiting until the cluster goes clean. (In reality this is used
+ to probabilistically choose when to wait, and the method used
+ makes it closer to -- but not identical to -- the half-life.)
+
+ scrub_interval: (-1) the approximate length of time to loop before
+ waiting until a scrub is performed while cleaning. (In reality
+ this is used to probabilistically choose when to wait, and it
+ only applies to the cases where cleaning is being performed).
+ -1 is used to indicate that no scrubbing will be done.
+
+ chance_down: (0.4) the probability that the thrasher will mark an
+ OSD down rather than marking it out. (The thrasher will not
+ consider that OSD out of the cluster, since presently an OSD
+ wrongly marked down will mark itself back up again.) This value
+ can be either an integer (eg, 75) or a float probability (eg
+ 0.75).
+
+ chance_test_min_size: (0) chance to run test_pool_min_size,
+ which:
+ - kills all but one osd
+ - waits
+ - kills that osd
+ - revives all other osds
+ - verifies that the osds fully recover
+
+ timeout: (360) the number of seconds to wait for the cluster
+ to become clean after each cluster change. If this doesn't
+ happen within the timeout, an exception will be raised.
+
+ revive_timeout: (150) number of seconds to wait for an osd asok to
+ appear after attempting to revive the osd
+
+ thrash_primary_affinity: (true) randomly adjust primary-affinity
+
+ chance_pgnum_grow: (0) chance to increase a pool's size
+ chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
+ pool_grow_by: (10) amount to increase pgnum by
+ max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd
+
+ pause_short: (3) duration of short pause
+ pause_long: (80) duration of long pause
+ pause_check_after: (50) assert osd down after this long
+ chance_inject_pause_short: (1) chance of injecting short stall
+ chance_inject_pause_long: (0) chance of injecting long stall
+
+ clean_wait: (0) duration to wait before resuming thrashing once clean
+
+ sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a
+ random live osd
+
+ powercycle: (false) whether to power cycle the node instead
+ of just the osd process. Note that this assumes that a single
+ osd is the only important process on the node.
+
+ bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash.
+ the delay lets the BlockDevice "accept" more aio operations but blocks
+ any flush, and then eventually crashes (losing some or all ios). If 0,
+ no bdev failure injection is enabled.
+
+ bdev_inject_crash_probability: (.5) probability of doing a bdev failure
+ injection crash vs a normal OSD kill.
+
+ chance_test_backfill_full: (0) chance to simulate full disks stopping
+ backfill
+
+ chance_test_map_discontinuity: (0) chance to test map discontinuity
+ map_discontinuity_sleep_time: (40) time to wait for map trims
+
+ ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down
+ chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%)
+
+ optrack_toggle_delay: (2.0) duration to delay between toggling op tracker
+ enablement to all osds
+
+ dump_ops_enable: (true) continuously dump ops on all live osds
+
+ noscrub_toggle_delay: (2.0) duration to delay between toggling noscrub
+
+ disable_objectstore_tool_tests: (false) disable ceph_objectstore_tool based
+ tests
+
+ example:
+
+ tasks:
+ - ceph:
+ - thrashosds:
+ cluster: ceph
+ chance_down: 10
+ op_delay: 3
+ min_in: 1
+ timeout: 600
+ - interactive:
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'thrashosds task only accepts a dict for configuration'
+ # add default value for sighup_delay
+ config['sighup_delay'] = config.get('sighup_delay', 0.1)
+ # add default value for optrack_toggle_delay
+ config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0)
+ # add default value for dump_ops_enable
+ config['dump_ops_enable'] = config.get('dump_ops_enable', "true")
+ # add default value for noscrub_toggle_delay
+ config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0)
+
+ log.info("config is {config}".format(config=str(config)))
+
+ overrides = ctx.config.get('overrides', {})
+ log.info("overrides is {overrides}".format(overrides=str(overrides)))
+ teuthology.deep_merge(config, overrides.get('thrashosds', {}))
+ cluster = config.get('cluster', 'ceph')
+
+ log.info("config is {config}".format(config=str(config)))
+
+ if 'powercycle' in config:
+
+ # sync everyone first to avoid collateral damage to / etc.
+ log.info('Doing preliminary sync to avoid collateral damage...')
+ ctx.cluster.run(args=['sync'])
+
+ if 'ipmi_user' in ctx.teuthology_config:
+ for remote in ctx.cluster.remotes.keys():
+ log.debug('checking console status of %s' % remote.shortname)
+ if not remote.console.check_status():
+ log.warn('Failed to get console status for %s',
+ remote.shortname)
+
+ # check that all osd remotes have a valid console
+ osds = ctx.cluster.only(teuthology.is_type('osd', cluster))
+ for remote in osds.remotes.keys():
+ if not remote.console.has_ipmi_credentials:
+ raise Exception(
+ 'IPMI console required for powercycling, '
+ 'but not available on osd role: {r}'.format(
+ r=remote.name))
+
+ cluster_manager = ctx.managers[cluster]
+ for f in ['powercycle', 'bdev_inject_crash']:
+ if config.get(f):
+ cluster_manager.config[f] = config.get(f)
+
+ log.info('Beginning thrashosds...')
+ thrash_proc = ceph_manager.Thrasher(
+ cluster_manager,
+ config,
+ logger=log.getChild('thrasher')
+ )
+ try:
+ yield
+ finally:
+ log.info('joining thrashosds')
+ thrash_proc.do_join()
+ cluster_manager.wait_for_recovery(config.get('timeout', 360))
--- /dev/null
+#cloud-config-archive
+
+- type: text/cloud-config
+ content: |
+ output:
+ all: '| tee -a /var/log/cloud-init-output.log'
+
+# allow passwordless access for debugging
+- |
+ #!/bin/bash
+ exec passwd -d ubuntu
+
+- |
+ #!/bin/bash
+
+ # mount a NFS share for storing logs
+ apt-get update
+ apt-get -y install nfs-common
+ mkdir /mnt/log
+ # 10.0.2.2 is the host
+ mount -v -t nfs -o proto=tcp 10.0.2.2:{mnt_dir} /mnt/log
+
+ # mount the iso image that has the test script
+ mkdir /mnt/cdrom
+ mount -t auto /dev/cdrom /mnt/cdrom
--- /dev/null
+- |
+ #!/bin/bash
+ cp /var/log/cloud-init-output.log /mnt/log
+
+- |
+ #!/bin/bash
+ umount /mnt/log
+
+- |
+ #!/bin/bash
+ shutdown -h -P now
--- /dev/null
+from teuthology import misc
+
+def get_remote(ctx, cluster, service_type, service_id):
+ """
+ Get the Remote for the host where a particular role runs.
+
+ :param cluster: name of the cluster the service is part of
+ :param service_type: e.g. 'mds', 'osd', 'client'
+ :param service_id: The third part of a role, e.g. '0' for
+ the role 'ceph.client.0'
+ :return: a Remote instance for the host where the
+ requested role is placed
+ """
+ def _is_instance(role):
+ role_tuple = misc.split_role(role)
+ return role_tuple == (cluster, service_type, str(service_id))
+ try:
+ (remote,) = ctx.cluster.only(_is_instance).remotes.keys()
+ except ValueError:
+ raise KeyError("Service {0}.{1}.{2} not found".format(cluster,
+ service_type,
+ service_id))
+ return remote
+
+def get_remote_for_role(ctx, role):
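+ """
+ Convenience wrapper around get_remote(); e.g. get_remote_for_role(ctx,
+ 'ceph.osd.0') returns the Remote hosting osd.0 of the 'ceph' cluster.
+ """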
+ return get_remote(ctx, *misc.split_role(role))
--- /dev/null
+import logging
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def rados(ctx, remote, cmd, wait=True, check_status=False):
+ testdir = teuthology.get_testdir(ctx)
+ log.info("rados %s" % ' '.join(cmd))
+ pre = [
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'rados',
+ ]
+ pre.extend(cmd)
+ proc = remote.run(
+ args=pre,
+ check_status=check_status,
+ wait=wait,
+ )
+ if wait:
+ return proc.exitstatus
+ else:
+ return proc
+
+def create_ec_pool(remote, name, profile_name, pgnum, profile={}):
+ remote.run(args=['sudo', 'ceph'] +
+ cmd_erasure_code_profile(profile_name, profile))
+ remote.run(args=[
+ 'sudo', 'ceph', 'osd', 'pool', 'create', name,
+ str(pgnum), str(pgnum), 'erasure', profile_name,
+ ])
+
+def create_replicated_pool(remote, name, pgnum):
+ remote.run(args=[
+ 'sudo', 'ceph', 'osd', 'pool', 'create', name, str(pgnum), str(pgnum),
+ ])
+
+def create_cache_pool(remote, base_name, cache_name, pgnum, size):
+ remote.run(args=[
+ 'sudo', 'ceph', 'osd', 'pool', 'create', cache_name, str(pgnum)
+ ])
+ remote.run(args=[
+ 'sudo', 'ceph', 'osd', 'tier', 'add-cache', base_name, cache_name,
+ str(size),
+ ])
+
+def cmd_erasure_code_profile(profile_name, profile):
+ """
+ Return the shell command to run to create the erasure code profile
+ described by the profile parameter.
+
+ :param profile_name: a string matching [A-Za-z0-9-_.]+
+ :param profile: a map whose semantic depends on the erasure code plugin
+ :returns: a shell command as an array suitable for Remote.run
+
+ If profile is {}, it is replaced with
+
+ { 'k': '2', 'm': '1', 'ruleset-failure-domain': 'osd'}
+
+ for backward compatibility. In previous versions of teuthology,
+ these values were hardcoded as function arguments and some yaml
+ files were designed with these implicit values. The teuthology
+ code should not know anything about the erasure code profile
+ content or semantic. The valid values and parameters are outside
+ its scope.
+ """
+
+ if profile == {}:
+ profile = {
+ 'k': '2',
+ 'm': '1',
+ 'ruleset-failure-domain': 'osd'
+ }
+ return [
+ 'osd', 'erasure-code-profile', 'set',
+ profile_name
+ ] + [ str(key) + '=' + str(value) for key, value in profile.iteritems() ]
--- /dev/null
+from cStringIO import StringIO
+import logging
+import json
+import requests
+from requests.packages.urllib3.util import Retry
+from urlparse import urlparse
+
+from teuthology.orchestra.connection import split_user
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+# simple test to indicate if multi-region testing should occur
+def multi_region_enabled(ctx):
+ # this is populated by the radosgw-agent task, seems reasonable to
+ # use that as an indicator that we're testing multi-region sync
+ return 'radosgw_agent' in ctx
+
+def rgwadmin(ctx, client, cmd, stdin=StringIO(), check_status=False,
+ format='json'):
+ log.info('rgwadmin: {client} : {cmd}'.format(client=client,cmd=cmd))
+ testdir = teuthology.get_testdir(ctx)
+ pre = [
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir),
+ 'radosgw-admin',
+ '--log-to-stderr',
+ '--format', format,
+ '-n', client,
+ ]
+ pre.extend(cmd)
+ log.info('rgwadmin: cmd=%s' % pre)
+ (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+ proc = remote.run(
+ args=pre,
+ check_status=check_status,
+ stdout=StringIO(),
+ stderr=StringIO(),
+ stdin=stdin,
+ )
+ r = proc.exitstatus
+ out = proc.stdout.getvalue()
+ j = None
+ if not r and out != '':
+ try:
+ j = json.loads(out)
+ log.info(' json result: %s' % j)
+ except ValueError:
+ j = out
+ log.info(' raw result: %s' % j)
+ return (r, j)
+
+def get_user_summary(out, user):
+ """Extract the summary for a given user"""
+ user_summary = None
+ for summary in out['summary']:
+ if summary.get('user') == user:
+ user_summary = summary
+
+ if not user_summary:
+ raise AssertionError('No summary info found for user: %s' % user)
+
+ return user_summary
+
+def get_user_successful_ops(out, user):
+ summary = out['summary']
+ if len(summary) == 0:
+ return 0
+ return get_user_summary(out, user)['total']['successful_ops']
+
+def get_zone_host_and_port(ctx, client, zone):
+ _, region_map = rgwadmin(ctx, client, check_status=True,
+ cmd=['-n', client, 'region-map', 'get'])
+ regions = region_map['zonegroups']
+ for region in regions:
+ for zone_info in region['val']['zones']:
+ if zone_info['name'] == zone:
+ endpoint = urlparse(zone_info['endpoints'][0])
+ host, port = endpoint.hostname, endpoint.port
+ if port is None:
+ port = 80
+ return host, port
+ assert False, 'no endpoint for zone {zone} found'.format(zone=zone)
+
+def get_master_zone(ctx, client):
+ _, region_map = rgwadmin(ctx, client, check_status=True,
+ cmd=['-n', client, 'region-map', 'get'])
+ regions = region_map['zonegroups']
+ for region in regions:
+ is_master = (region['val']['is_master'] == "true")
+ log.info('region={r} is_master={ism}'.format(r=region, ism=is_master))
+ if not is_master:
+ continue
+ master_zone = region['val']['master_zone']
+ log.info('master_zone=%s' % master_zone)
+ for zone_info in region['val']['zones']:
+ if zone_info['name'] == master_zone:
+ return master_zone
+ log.info('couldn\'t find master zone')
+ return None
+
+def get_master_client(ctx, clients):
+ master_zone = get_master_zone(ctx, clients[0]) # can use any client for this as long as system configured correctly
+ if not master_zone:
+ return None
+
+ for client in clients:
+ zone = zone_for_client(ctx, client)
+ if zone == master_zone:
+ return client
+
+ return None
+
+def get_zone_system_keys(ctx, client, zone):
+ _, zone_info = rgwadmin(ctx, client, check_status=True,
+ cmd=['-n', client,
+ 'zone', 'get', '--rgw-zone', zone])
+ system_key = zone_info['system_key']
+ return system_key['access_key'], system_key['secret_key']
+
+def zone_for_client(ctx, client):
+ ceph_config = ctx.ceph['ceph'].conf.get('global', {})
+ ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
+ ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
+ return ceph_config.get('rgw zone')
+
+def region_for_client(ctx, client):
+ ceph_config = ctx.ceph['ceph'].conf.get('global', {})
+ ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
+ ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
+ return ceph_config.get('rgw region')
+
+def radosgw_data_log_window(ctx, client):
+ ceph_config = ctx.ceph['ceph'].conf.get('global', {})
+ ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
+ ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
+ return ceph_config.get('rgw data log window', 30)
+
+def radosgw_agent_sync_data(ctx, agent_host, agent_port, full=False):
+ log.info('sync agent {h}:{p}'.format(h=agent_host, p=agent_port))
+ # use retry with backoff to tolerate slow startup of radosgw-agent
+ s = requests.Session()
+ s.mount('http://{addr}:{port}/'.format(addr = agent_host, port = agent_port),
+ requests.adapters.HTTPAdapter(max_retries=Retry(total=5, backoff_factor=1)))
+ method = "full" if full else "incremental"
+ return s.post('http://{addr}:{port}/data/{method}'.format(addr = agent_host, port = agent_port, method = method))
+
+def radosgw_agent_sync_metadata(ctx, agent_host, agent_port, full=False):
+ log.info('sync agent {h}:{p}'.format(h=agent_host, p=agent_port))
+ # use retry with backoff to tolerate slow startup of radosgw-agent
+ s = requests.Session()
+ s.mount('http://{addr}:{port}/'.format(addr = agent_host, port = agent_port),
+ requests.adapters.HTTPAdapter(max_retries=Retry(total=5, backoff_factor=1)))
+ method = "full" if full else "incremental"
+ return s.post('http://{addr}:{port}/metadata/{method}'.format(addr = agent_host, port = agent_port, method = method))
+
+def radosgw_agent_sync_all(ctx, full=False, data=False):
+ if ctx.radosgw_agent.procs:
+ for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
+ zone_for_client(ctx, agent_client)
+ sync_host, sync_port = get_sync_agent(ctx, agent_client)
+ log.debug('doing a sync via {host1}'.format(host1=sync_host))
+ radosgw_agent_sync_metadata(ctx, sync_host, sync_port, full)
+ if data:
+ radosgw_agent_sync_data(ctx, sync_host, sync_port, full)
+
+def host_for_role(ctx, role):
+ for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
+ if role in roles:
+ _, host = split_user(target)
+ return host
+
+def get_sync_agent(ctx, source):
+ for task in ctx.config['tasks']:
+ if 'radosgw-agent' not in task:
+ continue
+ for client, conf in task['radosgw-agent'].iteritems():
+ if conf['src'] == source:
+ return host_for_role(ctx, source), conf.get('port', 8000)
+ return None, None
--- /dev/null
+#
+# The MIT License
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+from .. import rados
+
+class TestRados(object):
+
+ def test_cmd_erasure_code_profile(self):
+ name = 'NAME'
+ cmd = rados.cmd_erasure_code_profile(name, {})
+ assert 'k=2' in cmd
+ assert name in cmd
+ cmd = rados.cmd_erasure_code_profile(name, { 'k': '88' })
+ assert 'k=88' in cmd
+ assert name in cmd
--- /dev/null
+"""
+vstart_runner: override Filesystem and Mount interfaces to run a CephFSTestCase against a vstart
+ceph instance instead of a packaged/installed cluster. Use this to turn around test cases
+quickly during development.
+
+Usage (assuming teuthology, ceph, ceph-qa-suite checked out in ~/git):
+
+ # Activate the teuthology virtualenv
+ source ~/git/teuthology/virtualenv/bin/activate
+ # Go into your ceph build directory
+ cd ~/git/ceph/build
+ # Start a vstart cluster
+ MDS=2 MON=1 OSD=3 ../src/vstart.sh -n
+ # Invoke a test using this script, with PYTHONPATH set appropriately
+ python ~/git/ceph-qa-suite/tasks/vstart_runner.py
+
+ # Alternatively, if you use different paths, specify them as follows:
+ LD_LIBRARY_PATH=`pwd`/lib PYTHONPATH=~/git/teuthology:~/git/ceph-qa-suite:`pwd`/../src/pybind:`pwd`/lib/cython_modules/lib.2 python ~/git/ceph-qa-suite/tasks/vstart_runner.py
+
+ # If you wish to drop to a python shell on failures, use --interactive:
+ python ~/git/ceph-qa-suite/tasks/vstart_runner.py --interactive
+
+ # If you wish to run a named test case, pass it as an argument:
+ python ~/git/ceph-qa-suite/tasks/vstart_runner.py tasks.cephfs.test_data_scan
+
+"""
+
+from StringIO import StringIO
+from collections import defaultdict
+import getpass
+import signal
+import tempfile
+import threading
+import datetime
+import shutil
+import re
+import os
+import time
+import json
+import sys
+import errno
+from unittest import suite
+import unittest
+import platform
+from teuthology.orchestra.run import Raw, quote
+from teuthology.orchestra.daemon import DaemonGroup
+from teuthology.config import config as teuth_config
+
+import logging
+
+log = logging.getLogger(__name__)
+
+handler = logging.FileHandler("./vstart_runner.log")
+formatter = logging.Formatter(
+ fmt=u'%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s',
+ datefmt='%Y-%m-%dT%H:%M:%S')
+handler.setFormatter(formatter)
+log.addHandler(handler)
+log.setLevel(logging.INFO)
+
+
+def respawn_in_path(lib_path, python_paths):
+ execv_cmd = ['python']
+ if platform.system() == "Darwin":
+ lib_path_var = "DYLD_LIBRARY_PATH"
+ else:
+ lib_path_var = "LD_LIBRARY_PATH"
+
+ py_binary = os.environ.get("PYTHON", "python")
+
+ if lib_path_var in os.environ:
+ if lib_path not in os.environ[lib_path_var]:
+ os.environ[lib_path_var] += ':' + lib_path
+ os.execvp(py_binary, execv_cmd + sys.argv)
+ else:
+ os.environ[lib_path_var] = lib_path
+ os.execvp(py_binary, execv_cmd + sys.argv)
+
+ for p in python_paths:
+ sys.path.insert(0, p)
+
+
+# Let's use some sensible defaults
+if os.path.exists("./CMakeCache.txt") and os.path.exists("./bin"):
+
+ # A list of candidate paths for each package we need
+ guesses = [
+ ["~/git/teuthology", "~/scm/teuthology", "~/teuthology"],
+ ["~/git/ceph-qa-suite", "~/scm/ceph-qa-suite", "~/ceph-qa-suite"],
+ ["lib/cython_modules/lib.2"],
+ ["../src/pybind"],
+ ]
+
+ python_paths = []
+ for package_guesses in guesses:
+ for g in package_guesses:
+ g_exp = os.path.abspath(os.path.expanduser(g))
+ if os.path.exists(g_exp):
+ python_paths.append(g_exp)
+
+ ld_path = os.path.join(os.getcwd(), "lib/")
+ print "Using guessed paths {0} {1}".format(ld_path, python_paths)
+ respawn_in_path(ld_path, python_paths)
+
+
+try:
+ from teuthology.exceptions import CommandFailedError
+ from tasks.ceph_manager import CephManager
+ from tasks.cephfs.fuse_mount import FuseMount
+ from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster
+ from mgr.mgr_test_case import MgrCluster
+ from teuthology.contextutil import MaxWhileTries
+ from teuthology.task import interactive
+except ImportError:
+ sys.stderr.write("***\nError importing packages, have you activated your teuthology virtualenv "
+ "and set PYTHONPATH to point to teuthology and ceph-qa-suite?\n***\n\n")
+ raise
+
+# Must import after teuthology because of gevent monkey patching
+import subprocess
+
+if os.path.exists("./CMakeCache.txt"):
+ # Running in build dir of a cmake build
+ BIN_PREFIX = "./bin/"
+else:
+ # Running in src/ of an autotools build
+ BIN_PREFIX = "./"
+
+
+class LocalRemoteProcess(object):
+ def __init__(self, args, subproc, check_status, stdout, stderr):
+ self.args = args
+ self.subproc = subproc
+ if stdout is None:
+ self.stdout = StringIO()
+ else:
+ self.stdout = stdout
+
+ if stderr is None:
+ self.stderr = StringIO()
+ else:
+ self.stderr = stderr
+
+ self.check_status = check_status
+ self.exitstatus = self.returncode = None
+
+ def wait(self):
+ if self.finished:
+ # Avoid calling communicate() on a dead process because it'll
+ # give you stick about std* already being closed
+ if self.exitstatus != 0:
+ raise CommandFailedError(self.args, self.exitstatus)
+ else:
+ return
+
+ out, err = self.subproc.communicate()
+ self.stdout.write(out)
+ self.stderr.write(err)
+
+ self.exitstatus = self.returncode = self.subproc.returncode
+
+ if self.exitstatus != 0:
+ sys.stderr.write(out)
+ sys.stderr.write(err)
+
+ if self.check_status and self.exitstatus != 0:
+ raise CommandFailedError(self.args, self.exitstatus)
+
+ @property
+ def finished(self):
+ if self.exitstatus is not None:
+ return True
+
+ if self.subproc.poll() is not None:
+ out, err = self.subproc.communicate()
+ self.stdout.write(out)
+ self.stderr.write(err)
+ self.exitstatus = self.returncode = self.subproc.returncode
+ return True
+ else:
+ return False
+
+ def kill(self):
+ log.info("kill ")
+ if self.subproc.pid and not self.finished:
+ log.info("kill: killing pid {0} ({1})".format(
+ self.subproc.pid, self.args))
+ safe_kill(self.subproc.pid)
+ else:
+ log.info("kill: already terminated ({0})".format(self.args))
+
+ @property
+ def stdin(self):
+ class FakeStdIn(object):
+ def __init__(self, mount_daemon):
+ self.mount_daemon = mount_daemon
+
+ def close(self):
+ self.mount_daemon.kill()
+
+ return FakeStdIn(self)
+
+
+class LocalRemote(object):
+ """
+ Amusingly named class to present the teuthology RemoteProcess interface when we are really
+ running things locally for vstart
+
+ Run this inside your src/ dir!
+ """
+
+ def __init__(self):
+ self.name = "local"
+ self.hostname = "localhost"
+ self.user = getpass.getuser()
+
+ def get_file(self, path, sudo, dest_dir):
+ tmpfile = tempfile.NamedTemporaryFile(delete=False).name
+ shutil.copy(path, tmpfile)
+ return tmpfile
+
+ def put_file(self, src, dst, sudo=False):
+ shutil.copy(src, dst)
+
+ def run(self, args, check_status=True, wait=True,
+ stdout=None, stderr=None, cwd=None, stdin=None,
+ logger=None, label=None):
+ log.info("run args={0}".format(args))
+
+ # We don't need no stinkin' sudo
+ args = [a for a in args if a != "sudo"]
+
+ # We have to use shell=True if any run.Raw was present, e.g. &&
+ shell = any([a for a in args if isinstance(a, Raw)])
+
+ if shell:
+ filtered = []
+ i = 0
+ while i < len(args):
+ if args[i] == 'adjust-ulimits':
+ i += 1
+ elif args[i] == 'ceph-coverage':
+ i += 2
+ elif args[i] == 'timeout':
+ i += 2
+ else:
+ filtered.append(args[i])
+ i += 1
+
+ args = quote(filtered)
+ log.info("Running {0}".format(args))
+
+ subproc = subprocess.Popen(args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ stdin=subprocess.PIPE,
+ cwd=cwd,
+ shell=True)
+ else:
+ log.info("Running {0}".format(args))
+
+ for arg in args:
+ if not isinstance(arg, basestring):
+ raise RuntimeError("Oops, can't handle arg {0} type {1}".format(
+ arg, arg.__class__
+ ))
+
+ subproc = subprocess.Popen(args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ stdin=subprocess.PIPE,
+ cwd=cwd)
+
+ if stdin:
+ if not isinstance(stdin, basestring):
+ raise RuntimeError("Can't handle non-string stdins on a vstart cluster")
+
+ # Hack: writing to stdin is not deadlock-safe, but it "always" works
+ # as long as the input buffer is "small"
+ subproc.stdin.write(stdin)
+
+ proc = LocalRemoteProcess(
+ args, subproc, check_status,
+ stdout, stderr
+ )
+
+ if wait:
+ proc.wait()
+
+ return proc
+
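+# A minimal usage sketch (assuming a vstart cluster is already running in the
+# current build directory). LocalRemote.run() mirrors the teuthology
+# RemoteProcess interface, so captured output is read back the same way:
+#
+#   remote = LocalRemote()
+#   proc = remote.run(args=["ls", "-l"], stdout=StringIO())
+#   print proc.stdout.getvalue()
+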
+
+class LocalDaemon(object):
+ def __init__(self, daemon_type, daemon_id):
+ self.daemon_type = daemon_type
+ self.daemon_id = daemon_id
+ self.controller = LocalRemote()
+ self.proc = None
+
+ @property
+ def remote(self):
+ return LocalRemote()
+
+ def running(self):
+ return self._get_pid() is not None
+
+ def _get_pid(self):
+ """
+ Return PID as an integer or None if not found
+ """
+ ps_txt = self.controller.run(
+ args=["ps", "-xwwu"+str(os.getuid())]
+ ).stdout.getvalue().strip()
+ lines = ps_txt.split("\n")[1:]
+
+ for line in lines:
+ if line.find("ceph-{0} -i {1}".format(self.daemon_type, self.daemon_id)) != -1:
+ log.info("Found ps line for daemon: {0}".format(line))
+ return int(line.split()[1])
+ log.info("No match for {0} {1}: {2}".format(
+ self.daemon_type, self.daemon_id, ps_txt
+ ))
+ return None
+
+ def wait(self, timeout):
+ waited = 0
+ while self._get_pid() is not None:
+ if waited > timeout:
+ raise MaxWhileTries("Timed out waiting for daemon {0}.{1}".format(self.daemon_type, self.daemon_id))
+ time.sleep(1)
+ waited += 1
+
+ def stop(self, timeout=300):
+ if not self.running():
+ log.error('tried to stop a non-running daemon')
+ return
+
+ pid = self._get_pid()
+ log.info("Killing PID {0} for {1}.{2}".format(pid, self.daemon_type, self.daemon_id))
+ os.kill(pid, signal.SIGKILL)
+ self.wait(timeout=timeout)
+
+ def restart(self):
+ if self._get_pid() is not None:
+ self.stop()
+
+ self.proc = self.controller.run([os.path.join(BIN_PREFIX, "./ceph-{0}".format(self.daemon_type)), "-i", self.daemon_id])
+
+
+def safe_kill(pid):
+ """
+ os.kill annoyingly raises exception if process already dead. Ignore it.
+ """
+ try:
+ return os.kill(pid, signal.SIGKILL)
+ except OSError as e:
+ if e.errno == errno.ESRCH:
+ # Raced with process termination
+ pass
+ else:
+ raise
+
+
+class LocalFuseMount(FuseMount):
+ def __init__(self, test_dir, client_id):
+ super(LocalFuseMount, self).__init__(None, test_dir, client_id, LocalRemote())
+
+ @property
+ def config_path(self):
+ return "./ceph.conf"
+
+ def get_keyring_path(self):
+ # This is going to end up in a config file, so use an absolute path
+ # to avoid assumptions about daemons' pwd
+ return os.path.abspath("./client.{0}.keyring".format(self.client_id))
+
+ def run_shell(self, args, wait=True):
+        # FIXME maybe should add a pwd arg to teuthology.orchestra so that
+        # the "cd foo && bar" shenanigans aren't needed to begin with and
+        # we wouldn't have to special-case this
+ return self.client_remote.run(
+ args, wait=wait, cwd=self.mountpoint
+ )
+
+ @property
+ def _prefix(self):
+ return BIN_PREFIX
+
+ def _asok_path(self):
+        # In teuthology, the asok is named after the PID of the ceph-fuse process, because
+        # ceph-fuse runs in the foreground. When it is run daemonized, however, the asok is
+        # named after the PID of the launching process, not the long-running ceph-fuse
+        # process. Therefore we need to give an exact path here, as the logic that checks
+        # /proc/ to see which asok is alive does not work.
+ path = "./out/client.{0}.{1}.asok".format(self.client_id, self.fuse_daemon.subproc.pid)
+ log.info("I think my launching pid was {0}".format(self.fuse_daemon.subproc.pid))
+ return path
+
+ def umount(self):
+ if self.is_mounted():
+ super(LocalFuseMount, self).umount()
+
+ def mount(self, mount_path=None, mount_fs_name=None):
+ self.client_remote.run(
+ args=[
+ 'mkdir',
+ '--',
+ self.mountpoint,
+ ],
+ )
+
+ def list_connections():
+ self.client_remote.run(
+ args=["mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
+ check_status=False
+ )
+ p = self.client_remote.run(
+ args=["ls", "/sys/fs/fuse/connections"],
+ check_status=False
+ )
+ if p.exitstatus != 0:
+ log.warn("ls conns failed with {0}, assuming none".format(p.exitstatus))
+ return []
+
+ ls_str = p.stdout.getvalue().strip()
+ if ls_str:
+ return [int(n) for n in ls_str.split("\n")]
+ else:
+ return []
+
+ # Before starting ceph-fuse process, note the contents of
+ # /sys/fs/fuse/connections
+ pre_mount_conns = list_connections()
+ log.info("Pre-mount connections: {0}".format(pre_mount_conns))
+
+ prefix = [os.path.join(BIN_PREFIX, "ceph-fuse")]
+ if os.getuid() != 0:
+ prefix += ["--client-die-on-failed-remount=false"]
+
+ if mount_path is not None:
+ prefix += ["--client_mountpoint={0}".format(mount_path)]
+
+ if mount_fs_name is not None:
+ prefix += ["--client_mds_namespace={0}".format(mount_fs_name)]
+
+ self.fuse_daemon = self.client_remote.run(args=
+ prefix + [
+ "-f",
+ "--name",
+ "client.{0}".format(self.client_id),
+ self.mountpoint
+ ], wait=False)
+
+ log.info("Mounting client.{0} with pid {1}".format(self.client_id, self.fuse_daemon.subproc.pid))
+
+ # Wait for the connection reference to appear in /sys
+ waited = 0
+ post_mount_conns = list_connections()
+ while len(post_mount_conns) <= len(pre_mount_conns):
+ if self.fuse_daemon.finished:
+ # Did mount fail? Raise the CommandFailedError instead of
+ # hitting the "failed to populate /sys/" timeout
+ self.fuse_daemon.wait()
+ time.sleep(1)
+ waited += 1
+ if waited > 30:
+ raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format(
+ waited
+ ))
+ post_mount_conns = list_connections()
+
+ log.info("Post-mount connections: {0}".format(post_mount_conns))
+
+ # Record our fuse connection number so that we can use it when
+ # forcing an unmount
+ new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
+ if len(new_conns) == 0:
+ raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
+ elif len(new_conns) > 1:
+ raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
+ else:
+ self._fuse_conn = new_conns[0]
+
+ def _run_python(self, pyscript):
+ """
+ Override this to remove the daemon-helper prefix that is used otherwise
+ to make the process killable.
+ """
+ return self.client_remote.run(args=[
+ 'python', '-c', pyscript
+ ], wait=False)
+
+
+class LocalCephManager(CephManager):
+ def __init__(self):
+ # Deliberately skip parent init, only inheriting from it to get
+ # util methods like osd_dump that sit on top of raw_cluster_cmd
+ self.controller = LocalRemote()
+
+        # Only a minority of CephManager functions actually take this lock; it exists
+        # for the cases where teuthology tests want to run tasks in parallel.
+ self.lock = threading.RLock()
+
+ def find_remote(self, daemon_type, daemon_id):
+ """
+ daemon_type like 'mds', 'osd'
+ daemon_id like 'a', '0'
+ """
+ return LocalRemote()
+
+ def run_ceph_w(self):
+ proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph"), "-w"], wait=False, stdout=StringIO())
+ return proc
+
+ def raw_cluster_cmd(self, *args):
+ """
+        args like ["osd", "dump"]
+ return stdout string
+ """
+ proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args))
+ return proc.stdout.getvalue()
+
+ def raw_cluster_cmd_result(self, *args):
+ """
+ like raw_cluster_cmd but don't check status, just return rc
+ """
+ proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args), check_status=False)
+ return proc.exitstatus
+
+ def admin_socket(self, daemon_type, daemon_id, command, check_status=True):
+ return self.controller.run(
+ args=[os.path.join(BIN_PREFIX, "ceph"), "daemon", "{0}.{1}".format(daemon_type, daemon_id)] + command, check_status=check_status
+ )
+
+ # FIXME: copypasta
+ def get_mds_status(self, mds):
+ """
+ Run cluster commands for the mds in order to get mds information
+ """
+ out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
+ j = json.loads(' '.join(out.splitlines()[1:]))
+ # collate; for dup ids, larger gid wins.
+ for info in j['info'].itervalues():
+ if info['name'] == mds:
+ return info
+ return None
+
+ # FIXME: copypasta
+ def get_mds_status_by_rank(self, rank):
+ """
+ Run cluster commands for the mds in order to get mds information
+ check rank.
+ """
+ j = self.get_mds_status_all()
+ # collate; for dup ids, larger gid wins.
+ for info in j['info'].itervalues():
+ if info['rank'] == rank:
+ return info
+ return None
+
+ def get_mds_status_all(self):
+ """
+ Run cluster command to extract all the mds status.
+ """
+ out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
+ j = json.loads(' '.join(out.splitlines()[1:]))
+ return j
+
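+# Minimal sketch (assumes the ceph CLI from the vstart build is runnable at
+# BIN_PREFIX): LocalCephManager shells out to that binary, so e.g.
+#
+#   manager = LocalCephManager()
+#   print manager.raw_cluster_cmd("osd", "dump")
+#
+# returns the dump output as a string, while raw_cluster_cmd_result() returns
+# only the command's exit status.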
+
+class LocalCephCluster(CephCluster):
+ def __init__(self, ctx):
+ # Deliberately skip calling parent constructor
+ self._ctx = ctx
+ self.mon_manager = LocalCephManager()
+ self._conf = defaultdict(dict)
+
+ def get_config(self, key, service_type=None):
+ if service_type is None:
+ service_type = 'mon'
+
+ # FIXME hardcoded vstart service IDs
+ service_id = {
+ 'mon': 'a',
+ 'mds': 'a',
+ 'osd': '0'
+ }[service_type]
+
+ return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
+ def _write_conf(self):
+ # In teuthology, we have the honour of writing the entire ceph.conf, but
+ # in vstart land it has mostly already been written and we need to carefully
+ # append to it.
+ conf_path = "./ceph.conf"
+ banner = "\n#LOCAL_TEST\n"
+ existing_str = open(conf_path).read()
+
+ if banner in existing_str:
+ existing_str = existing_str[0:existing_str.find(banner)]
+
+ existing_str += banner
+
+ for subsys, kvs in self._conf.items():
+ existing_str += "\n[{0}]\n".format(subsys)
+ for key, val in kvs.items():
+ # Comment out existing instance if it exists
+ log.info("Searching for existing instance {0}/{1}".format(
+ key, subsys
+ ))
+ existing_section = re.search("^\[{0}\]$([\n]|[^\[])+".format(
+ subsys
+ ), existing_str, re.MULTILINE)
+
+ if existing_section:
+ section_str = existing_str[existing_section.start():existing_section.end()]
+ existing_val = re.search("^\s*[^#]({0}) =".format(key), section_str, re.MULTILINE)
+ if existing_val:
+ start = existing_section.start() + existing_val.start(1)
+ log.info("Found string to replace at {0}".format(
+ start
+ ))
+ existing_str = existing_str[0:start] + "#" + existing_str[start:]
+
+ existing_str += "{0} = {1}\n".format(key, val)
+
+ open(conf_path, "w").write(existing_str)
+
+ def set_ceph_conf(self, subsys, key, value):
+ self._conf[subsys][key] = value
+ self._write_conf()
+
+ def clear_ceph_conf(self, subsys, key):
+ del self._conf[subsys][key]
+ self._write_conf()
+
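+# For illustration (hypothetical values): set_ceph_conf() appends settings to
+# ./ceph.conf under a "#LOCAL_TEST" banner and comments out any pre-existing
+# value for the same key, e.g.
+#
+#   cluster = LocalCephCluster(ctx)
+#   cluster.set_ceph_conf("mds", "mds log max segments", "10")
+#
+# Re-running with a new value rewrites everything below the banner, so the
+# file does not grow without bound.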
+
+class LocalMDSCluster(LocalCephCluster, MDSCluster):
+ def __init__(self, ctx):
+ super(LocalMDSCluster, self).__init__(ctx)
+
+ self.mds_ids = ctx.daemons.daemons['mds'].keys()
+ if not self.mds_ids:
+ raise RuntimeError("No MDSs found in ceph.conf!")
+
+ self.mds_daemons = dict([(id_, LocalDaemon("mds", id_)) for id_ in self.mds_ids])
+
+ def clear_firewall(self):
+ # FIXME: unimplemented
+ pass
+
+ def newfs(self, name):
+ return LocalFilesystem(self._ctx, create=name)
+
+
+class LocalMgrCluster(LocalCephCluster, MgrCluster):
+ def __init__(self, ctx):
+ super(LocalMgrCluster, self).__init__(ctx)
+
+ self.mgr_ids = ctx.daemons.daemons['mgr'].keys()
+ if not self.mgr_ids:
+            raise RuntimeError("No manager daemons found in ceph.conf!")
+
+ self.mgr_daemons = dict([(id_, LocalDaemon("mgr", id_)) for id_ in self.mgr_ids])
+
+
+class LocalFilesystem(Filesystem, LocalMDSCluster):
+ @property
+ def admin_remote(self):
+ return LocalRemote()
+
+ def __init__(self, ctx, fscid=None, create=None):
+ # Deliberately skip calling parent constructor
+ self._ctx = ctx
+
+ self.id = None
+ self.name = None
+ self.metadata_pool_name = None
+ self.data_pools = None
+
+ # Hack: cheeky inspection of ceph.conf to see what MDSs exist
+ self.mds_ids = set()
+ for line in open("ceph.conf").readlines():
+ match = re.match("^\[mds\.(.+)\]$", line)
+ if match:
+ self.mds_ids.add(match.group(1))
+
+ if not self.mds_ids:
+ raise RuntimeError("No MDSs found in ceph.conf!")
+
+ self.mds_ids = list(self.mds_ids)
+
+ log.info("Discovered MDS IDs: {0}".format(self.mds_ids))
+
+ self.mon_manager = LocalCephManager()
+
+ self.mds_daemons = dict([(id_, LocalDaemon("mds", id_)) for id_ in self.mds_ids])
+
+ self.client_remote = LocalRemote()
+
+ self._conf = defaultdict(dict)
+
+ if create is not None:
+ if fscid is not None:
+ raise RuntimeError("cannot specify fscid when creating fs")
+ if create is True:
+ self.name = 'cephfs'
+ else:
+ self.name = create
+ self.create()
+ elif fscid is not None:
+ self.id = fscid
+ self.getinfo(refresh=True)
+
+ # Stash a reference to the first created filesystem on ctx, so
+ # that if someone drops to the interactive shell they can easily
+ # poke our methods.
+ if not hasattr(self._ctx, "filesystem"):
+ self._ctx.filesystem = self
+
+ @property
+ def _prefix(self):
+ return BIN_PREFIX
+
+ def set_clients_block(self, blocked, mds_id=None):
+ raise NotImplementedError()
+
+ def get_pgs_per_fs_pool(self):
+ # FIXME: assuming there are 3 OSDs
+ return 3 * int(self.get_config('mon_pg_warn_min_per_osd'))
+
+
+class InteractiveFailureResult(unittest.TextTestResult):
+ """
+ Specialization that implements interactive-on-error style
+ behavior.
+ """
+ def addFailure(self, test, err):
+ super(InteractiveFailureResult, self).addFailure(test, err)
+ log.error(self._exc_info_to_string(err, test))
+ log.error("Failure in test '{0}', going interactive".format(
+ self.getDescription(test)
+ ))
+ interactive.task(ctx=None, config=None)
+
+ def addError(self, test, err):
+ super(InteractiveFailureResult, self).addError(test, err)
+ log.error(self._exc_info_to_string(err, test))
+ log.error("Error in test '{0}', going interactive".format(
+ self.getDescription(test)
+ ))
+ interactive.task(ctx=None, config=None)
+
+
+def exec_test():
+ # Help developers by stopping up-front if their tree isn't built enough for all the
+ # tools that the tests might want to use (add more here if needed)
+ require_binaries = ["ceph-dencoder", "cephfs-journal-tool", "cephfs-data-scan",
+ "cephfs-table-tool", "ceph-fuse", "rados"]
+ missing_binaries = [b for b in require_binaries if not os.path.exists(os.path.join(BIN_PREFIX, b))]
+ if missing_binaries:
+ log.error("Some ceph binaries missing, please build them: {0}".format(" ".join(missing_binaries)))
+ sys.exit(-1)
+
+ test_dir = tempfile.mkdtemp()
+
+ # Create as many of these as the biggest test requires
+ clients = ["0", "1", "2", "3"]
+
+ remote = LocalRemote()
+
+ # Tolerate no MDSs or clients running at start
+ ps_txt = remote.run(
+ args=["ps", "-u"+str(os.getuid())]
+ ).stdout.getvalue().strip()
+ lines = ps_txt.split("\n")[1:]
+
+ for line in lines:
+ if 'ceph-fuse' in line or 'ceph-mds' in line:
+ pid = int(line.split()[0])
+ log.warn("Killing stray process {0}".format(line))
+ os.kill(pid, signal.SIGKILL)
+
+ class LocalCluster(object):
+ def __init__(self, rolename="placeholder"):
+ self.remotes = {
+ remote: [rolename]
+ }
+
+ def only(self, requested):
+ return self.__class__(rolename=requested)
+
+ teuth_config['test_path'] = test_dir
+
+ class LocalContext(object):
+ def __init__(self):
+ self.config = {}
+ self.teuthology_config = teuth_config
+ self.cluster = LocalCluster()
+ self.daemons = DaemonGroup()
+
+ # Shove some LocalDaemons into the ctx.daemons DaemonGroup instance so that any
+ # tests that want to look these up via ctx can do so.
+ # Inspect ceph.conf to see what roles exist
+ for conf_line in open("ceph.conf").readlines():
+ for svc_type in ["mon", "osd", "mds", "mgr"]:
+ if svc_type not in self.daemons.daemons:
+ self.daemons.daemons[svc_type] = {}
+ match = re.match("^\[{0}\.(.+)\]$".format(svc_type), conf_line)
+ if match:
+ svc_id = match.group(1)
+ self.daemons.daemons[svc_type][svc_id] = LocalDaemon(svc_type, svc_id)
+
+ def __del__(self):
+ shutil.rmtree(self.teuthology_config['test_path'])
+
+ ctx = LocalContext()
+
+ mounts = []
+ for client_id in clients:
+ # Populate client keyring (it sucks to use client.admin for test clients
+ # because it's awkward to find the logs later)
+ client_name = "client.{0}".format(client_id)
+
+ if client_name not in open("./keyring").read():
+ p = remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "auth", "get-or-create", client_name,
+ "osd", "allow rw",
+ "mds", "allow",
+ "mon", "allow r"])
+
+ open("./keyring", "a").write(p.stdout.getvalue())
+
+ mount = LocalFuseMount(test_dir, client_id)
+ mounts.append(mount)
+ if mount.is_mounted():
+ log.warn("unmounting {0}".format(mount.mountpoint))
+ mount.umount_wait()
+ else:
+ if os.path.exists(mount.mountpoint):
+ os.rmdir(mount.mountpoint)
+ ceph_cluster = LocalCephCluster(ctx)
+ mds_cluster = LocalMDSCluster(ctx)
+ mgr_cluster = LocalMgrCluster(ctx)
+
+ from tasks.cephfs_test_runner import DecoratingLoader
+
+ class LogStream(object):
+ def __init__(self):
+ self.buffer = ""
+
+ def write(self, data):
+ self.buffer += data
+ if "\n" in self.buffer:
+ lines = self.buffer.split("\n")
+                for line in lines[:-1]:
+                    # sys.stderr.write(line + "\n")
+                    log.info(line)
+ self.buffer = lines[-1]
+
+ def flush(self):
+ pass
+
+ decorating_loader = DecoratingLoader({
+ "ctx": ctx,
+ "mounts": mounts,
+ "ceph_cluster": ceph_cluster,
+ "mds_cluster": mds_cluster,
+ "mgr_cluster": mgr_cluster,
+ })
+
+ # For the benefit of polling tests like test_full -- in teuthology land we set this
+ # in a .yaml, here it's just a hardcoded thing for the developer's pleasure.
+ remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "tell", "osd.*", "injectargs", "--osd-mon-report-interval-max", "5"])
+ ceph_cluster.set_ceph_conf("osd", "osd_mon_report_interval_max", "5")
+
+ # Vstart defaults to two segments, which very easily gets a "behind on trimming" health warning
+    # from normal IO latency. Increase it for running tests.
+ ceph_cluster.set_ceph_conf("mds", "mds log max segments", "10")
+
+ # Make sure the filesystem created in tests has uid/gid that will let us talk to
+ # it after mounting it (without having to go root). Set in 'global' not just 'mds'
+ # so that cephfs-data-scan will pick it up too.
+ ceph_cluster.set_ceph_conf("global", "mds root ino uid", "%s" % os.getuid())
+ ceph_cluster.set_ceph_conf("global", "mds root ino gid", "%s" % os.getgid())
+
+ # Monkeypatch get_package_version to avoid having to work out what kind of distro we're on
+ def _get_package_version(remote, pkg_name):
+ # Used in cephfs tests to find fuse version. Your development workstation *does* have >=2.9, right?
+ return "2.9"
+
+ import teuthology.packaging
+ teuthology.packaging.get_package_version = _get_package_version
+
+ def enumerate_methods(s):
+ for t in s._tests:
+ if isinstance(t, suite.BaseTestSuite):
+ for sub in enumerate_methods(t):
+ yield sub
+ else:
+ yield s, t
+
+ interactive_on_error = False
+
+ args = sys.argv[1:]
+ flags = [a for a in args if a.startswith("-")]
+ modules = [a for a in args if not a.startswith("-")]
+ for f in flags:
+ if f == "--interactive":
+ interactive_on_error = True
+ else:
+ log.error("Unknown option '{0}'".format(f))
+ sys.exit(-1)
+
+ if modules:
+ log.info("Executing modules: {0}".format(modules))
+ module_suites = []
+ for mod_name in modules:
+ # Test names like cephfs.test_auto_repair
+ module_suites.append(decorating_loader.loadTestsFromName(mod_name))
+ log.info("Loaded: {0}".format(list(module_suites)))
+ overall_suite = suite.TestSuite(module_suites)
+ else:
+ log.info("Executing all cephfs tests")
+ overall_suite = decorating_loader.discover(
+ os.path.join(os.path.dirname(os.path.abspath(__file__)), "cephfs")
+ )
+
+    # Filter out tests that don't lend themselves to interactive running.
+ victims = []
+ for case, method in enumerate_methods(overall_suite):
+ fn = getattr(method, method._testMethodName)
+
+ drop_test = False
+
+ if hasattr(fn, 'is_for_teuthology') and getattr(fn, 'is_for_teuthology') is True:
+ drop_test = True
+            log.warn("Dropping test because long running: {0}".format(method.id()))
+
+ if getattr(fn, "needs_trimming", False) is True:
+ drop_test = (os.getuid() != 0)
+            log.warn("Dropping test because client trim unavailable: {0}".format(method.id()))
+
+ if drop_test:
+ # Don't drop the test if it was explicitly requested in arguments
+ is_named = False
+ for named in modules:
+ if named.endswith(method.id()):
+ is_named = True
+ break
+
+ if not is_named:
+ victims.append((case, method))
+
+ log.info("Disabling {0} tests because of is_for_teuthology or needs_trimming".format(len(victims)))
+ for s, method in victims:
+ s._tests.remove(method)
+
+ if interactive_on_error:
+ result_class = InteractiveFailureResult
+ else:
+ result_class = unittest.TextTestResult
+ fail_on_skip = False
+
+ class LoggingResult(result_class):
+ def startTest(self, test):
+ log.info("Starting test: {0}".format(self.getDescription(test)))
+ test.started_at = datetime.datetime.utcnow()
+ return super(LoggingResult, self).startTest(test)
+
+ def stopTest(self, test):
+ log.info("Stopped test: {0} in {1}s".format(
+ self.getDescription(test),
+ (datetime.datetime.utcnow() - test.started_at).total_seconds()
+ ))
+
+ def addSkip(self, test, reason):
+ if fail_on_skip:
+ # Don't just call addFailure because that requires a traceback
+ self.failures.append((test, reason))
+ else:
+ super(LoggingResult, self).addSkip(test, reason)
+
+ # Execute!
+ result = unittest.TextTestRunner(
+ stream=LogStream(),
+ resultclass=LoggingResult,
+ verbosity=2,
+ failfast=True).run(overall_suite)
+
+ if not result.wasSuccessful():
+ result.printErrors() # duplicate output at end for convenience
+
+ bad_tests = []
+ for test, error in result.errors:
+ bad_tests.append(str(test))
+ for test, failure in result.failures:
+ bad_tests.append(str(test))
+
+ sys.exit(-1)
+ else:
+ sys.exit(0)
+
+
+if __name__ == "__main__":
+ exec_test()
--- /dev/null
+
+"""
+watch_notify_same_primary task
+"""
+from cStringIO import StringIO
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology.contextutil import safe_while
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run watch_notify_same_primary
+
+ The config should be as follows:
+
+ watch_notify_same_primary:
+ clients: [client list]
+
+ The client list should contain 1 client
+
+ The test requires 3 osds.
+
+ example:
+
+ tasks:
+ - ceph:
+ - watch_notify_same_primary:
+ clients: [client.0]
+ - interactive:
+ """
+ log.info('Beginning watch_notify_same_primary...')
+ assert isinstance(config, dict), \
+ "please list clients to run on"
+
+ clients = config.get('clients', ['client.0'])
+ assert len(clients) == 1
+ role = clients[0]
+ assert isinstance(role, basestring)
+ PREFIX = 'client.'
+ assert role.startswith(PREFIX)
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+ manager = ctx.managers['ceph']
+ manager.raw_cluster_cmd('osd', 'set', 'noout')
+
+ pool = manager.create_pool_with_unique_name()
+ def obj(n): return "foo-{num}".format(num=n)
+ def start_watch(n):
+ remote.run(
+ args = [
+ "rados",
+ "-p", pool,
+ "put",
+ obj(n),
+ "/etc/resolv.conf"],
+ logger=log.getChild('watch.{id}'.format(id=n)))
+ proc = remote.run(
+ args = [
+ "rados",
+ "-p", pool,
+ "watch",
+ obj(n)],
+ stdin=run.PIPE,
+ stdout=StringIO(),
+ stderr=StringIO(),
+ wait=False)
+ return proc
+
+ num = 20
+
+ watches = [start_watch(i) for i in range(num)]
+
+ # wait for them all to register
+ for i in range(num):
+ with safe_while() as proceed:
+ while proceed():
+ proc = remote.run(
+ args = [
+ "rados",
+ "-p", pool,
+ "listwatchers",
+ obj(i)],
+ stdout=StringIO())
+ lines = proc.stdout.getvalue()
+ num_watchers = lines.count('watcher=')
+ log.info('i see %d watchers for %s', num_watchers, obj(i))
+ if num_watchers >= 1:
+ break
+
+ def notify(n, msg):
+ remote.run(
+ args = [
+ "rados",
+ "-p", pool,
+ "notify",
+ obj(n),
+ msg],
+ logger=log.getChild('notify.{id}'.format(id=n)))
+
+ [notify(n, 'notify1') for n in range(len(watches))]
+
+ manager.kill_osd(0)
+ manager.mark_down_osd(0)
+
+ [notify(n, 'notify2') for n in range(len(watches))]
+
+ try:
+ yield
+ finally:
+        log.info('joining watch_notify_same_primary')
+ for watch in watches:
+ watch.stdin.write("\n")
+
+ run.wait(watches)
+
+ for watch in watches:
+ lines = watch.stdout.getvalue().split("\n")
+ got1 = False
+ got2 = False
+ for l in lines:
+ if 'notify1' in l:
+ got1 = True
+ if 'notify2' in l:
+ got2 = True
+ log.info(lines)
+ assert got1 and got2
+
+ manager.revive_osd(0)
+ manager.remove_pool(pool)
--- /dev/null
+"""
+test_stress_watch task
+"""
+import contextlib
+import logging
+import proc_thrasher
+
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Run test_stress_watch
+
+ The config should be as follows:
+
+ test_stress_watch:
+ clients: [client list]
+
+ example:
+
+ tasks:
+ - ceph:
+ - test_stress_watch:
+ clients: [client.0]
+ - interactive:
+ """
+ log.info('Beginning test_stress_watch...')
+ assert isinstance(config, dict), \
+ "please list clients to run on"
+ testwatch = {}
+
+ remotes = []
+
+ for role in config.get('clients', ['client.0']):
+ assert isinstance(role, basestring)
+ PREFIX = 'client.'
+ assert role.startswith(PREFIX)
+ id_ = role[len(PREFIX):]
+ (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+ remotes.append(remote)
+
+ args =['CEPH_CLIENT_ID={id_}'.format(id_=id_),
+ 'CEPH_ARGS="{flags}"'.format(flags=config.get('flags', '')),
+ 'daemon-helper',
+ 'kill',
+ 'multi_stress_watch foo foo'
+ ]
+
+ log.info("args are %s" % (args,))
+
+ proc = proc_thrasher.ProcThrasher({}, remote,
+ args=[run.Raw(i) for i in args],
+ logger=log.getChild('testwatch.{id}'.format(id=id_)),
+ stdin=run.PIPE,
+ wait=False
+ )
+ proc.start()
+ testwatch[id_] = proc
+
+ try:
+ yield
+ finally:
+        log.info('joining test_stress_watch')
+ for i in testwatch.itervalues():
+ i.join()
--- /dev/null
+"""
+Workunit task -- Run ceph on sets of specific clients
+"""
+import logging
+import pipes
+import os
+
+from util import get_remote_for_role
+
+from teuthology import misc
+from teuthology.config import config as teuth_config
+from teuthology.orchestra.run import CommandFailedError
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+ """
+ Run ceph on all workunits found under the specified path.
+
+ For example::
+
+ tasks:
+ - ceph:
+ - ceph-fuse: [client.0]
+ - workunit:
+ clients:
+ client.0: [direct_io, xattrs.sh]
+ client.1: [snaps]
+ branch: foo
+
+ You can also run a list of workunits on all clients:
+ tasks:
+ - ceph:
+ - ceph-fuse:
+ - workunit:
+ tag: v0.47
+ clients:
+ all: [direct_io, xattrs.sh, snaps]
+
+ If you have an "all" section it will run all the workunits
+ on each client simultaneously, AFTER running any workunits specified
+ for individual clients. (This prevents unintended simultaneous runs.)
+
+ To customize tests, you can specify environment variables as a dict. You
+ can also specify a time limit for each work unit (defaults to 3h):
+
+ tasks:
+ - ceph:
+ - ceph-fuse:
+ - workunit:
+ sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6
+ clients:
+ all: [snaps]
+ env:
+ FOO: bar
+ BAZ: quux
+ timeout: 3h
+
+ This task supports roles that include a ceph cluster, e.g.::
+
+ tasks:
+ - ceph:
+ - workunit:
+ clients:
+ backup.client.0: [foo]
+ client.1: [bar] # cluster is implicitly 'ceph'
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ assert isinstance(config, dict)
+ assert isinstance(config.get('clients'), dict), \
+ 'configuration must contain a dictionary of clients'
+
+ overrides = ctx.config.get('overrides', {})
+ misc.deep_merge(config, overrides.get('workunit', {}))
+
+ refspec = config.get('branch')
+ if refspec is None:
+ refspec = config.get('tag')
+ if refspec is None:
+ refspec = config.get('sha1')
+ if refspec is None:
+ refspec = 'HEAD'
+
+ timeout = config.get('timeout', '3h')
+
+ log.info('Pulling workunits from ref %s', refspec)
+
+ created_mountpoint = {}
+
+ if config.get('env') is not None:
+ assert isinstance(config['env'], dict), 'env must be a dictionary'
+ clients = config['clients']
+
+ # Create scratch dirs for any non-all workunits
+ log.info('Making a separate scratch dir for every client...')
+ for role in clients.iterkeys():
+ assert isinstance(role, basestring)
+ if role == "all":
+ continue
+
+ assert 'client' in role
+ created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir'))
+ created_mountpoint[role] = created_mnt_dir
+
+ # Execute any non-all workunits
+ with parallel() as p:
+ for role, tests in clients.iteritems():
+ if role != "all":
+ p.spawn(_run_tests, ctx, refspec, role, tests,
+ config.get('env'), timeout=timeout)
+
+ # Clean up dirs from any non-all workunits
+ for role, created in created_mountpoint.items():
+ _delete_dir(ctx, role, created)
+
+ # Execute any 'all' workunits
+ if 'all' in clients:
+ all_tasks = clients["all"]
+ _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
+ config.get('subdir'), timeout=timeout)
+
+
+def _client_mountpoint(ctx, cluster, id_):
+ """
+ Returns the path to the expected mountpoint for workunits running
+ on some kind of filesystem.
+ """
+ # for compatibility with tasks like ceph-fuse that aren't cluster-aware yet,
+ # only include the cluster name in the dir if the cluster is not 'ceph'
+ if cluster == 'ceph':
+ dir_ = 'mnt.{0}'.format(id_)
+ else:
+ dir_ = 'mnt.{0}.{1}'.format(cluster, id_)
+ return os.path.join(misc.get_testdir(ctx), dir_)
+
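+# For example (illustrative paths only): with the default 'ceph' cluster,
+# client.0 maps to "<testdir>/mnt.0", while client.0 of a secondary cluster
+# named 'backup' maps to "<testdir>/mnt.backup.0", where <testdir> comes from
+# misc.get_testdir().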
+
+def _delete_dir(ctx, role, created_mountpoint):
+ """
+ Delete file used by this role, and delete the directory that this
+ role appeared in.
+
+ :param ctx: Context
+ :param role: "role.#" where # is used for the role id.
+ """
+ cluster, _, id_ = misc.split_role(role)
+ remote = get_remote_for_role(ctx, role)
+ mnt = _client_mountpoint(ctx, cluster, id_)
+ client = os.path.join(mnt, 'client.{id}'.format(id=id_))
+
+ # Remove the directory inside the mount where the workunit ran
+ remote.run(
+ args=[
+ 'sudo',
+ 'rm',
+ '-rf',
+ '--',
+ client,
+ ],
+ )
+ log.info("Deleted dir {dir}".format(dir=client))
+
+ # If the mount was an artificially created dir, delete that too
+ if created_mountpoint:
+ remote.run(
+ args=[
+ 'rmdir',
+ '--',
+ mnt,
+ ],
+ )
+        log.info("Deleted artificial mount point {dir}".format(dir=mnt))
+
+
+def _make_scratch_dir(ctx, role, subdir):
+ """
+ Make scratch directories for this role. This also makes the mount
+ point if that directory does not exist.
+
+ :param ctx: Context
+ :param role: "role.#" where # is used for the role id.
+ :param subdir: use this subdir (False if not used)
+ """
+ created_mountpoint = False
+ cluster, _, id_ = misc.split_role(role)
+ remote = get_remote_for_role(ctx, role)
+ dir_owner = remote.user
+ mnt = _client_mountpoint(ctx, cluster, id_)
+ # if neither kclient nor ceph-fuse are required for a workunit,
+ # mnt may not exist. Stat and create the directory if it doesn't.
+ try:
+ remote.run(
+ args=[
+ 'stat',
+ '--',
+ mnt,
+ ],
+ )
+ log.info('Did not need to create dir {dir}'.format(dir=mnt))
+ except CommandFailedError:
+ remote.run(
+ args=[
+ 'mkdir',
+ '--',
+ mnt,
+ ],
+ )
+ log.info('Created dir {dir}'.format(dir=mnt))
+ created_mountpoint = True
+
+ if not subdir:
+ subdir = 'client.{id}'.format(id=id_)
+
+ if created_mountpoint:
+ remote.run(
+ args=[
+ 'cd',
+ '--',
+ mnt,
+ run.Raw('&&'),
+ 'mkdir',
+ '--',
+ subdir,
+ ],
+ )
+ else:
+ remote.run(
+ args=[
+ # cd first so this will fail if the mount point does
+ # not exist; pure install -d will silently do the
+ # wrong thing
+ 'cd',
+ '--',
+ mnt,
+ run.Raw('&&'),
+ 'sudo',
+ 'install',
+ '-d',
+ '-m', '0755',
+ '--owner={user}'.format(user=dir_owner),
+ '--',
+ subdir,
+ ],
+ )
+
+ return created_mountpoint
+
+
+def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None):
+ """
+ Make a scratch directory for each client in the cluster, and then for each
+ test spawn _run_tests() for each role.
+
+ See run_tests() for parameter documentation.
+ """
+ is_client = misc.is_type('client')
+ client_remotes = {}
+ created_mountpoint = {}
+ for remote, roles_for_host in ctx.cluster.remotes.items():
+ for role in roles_for_host:
+ if is_client(role):
+ client_remotes[role] = remote
+ created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir)
+
+ for unit in tests:
+ with parallel() as p:
+ for role, remote in client_remotes.items():
+ p.spawn(_run_tests, ctx, refspec, role, [unit], env, subdir,
+ timeout=timeout)
+
+ # cleanup the generated client directories
+ for role, _ in client_remotes.items():
+ _delete_dir(ctx, role, created_mountpoint[role])
+
+def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None):
+ """
+ Run the individual test. Create a scratch directory and then extract the
+ workunits from git. Make the executables, and then run the tests.
+ Clean up (remove files created) after the tests are finished.
+
+ :param ctx: Context
+ :param refspec: branch, sha1, or version tag used to identify this
+ build
+ :param tests: specific tests specified.
+ :param env: environment set in yaml file. Could be None.
+ :param subdir: subdirectory set in yaml file. Could be None
+ :param timeout: If present, use the 'timeout' command on the remote host
+ to limit execution time. Must be specified by a number
+ followed by 's' for seconds, 'm' for minutes, 'h' for
+ hours, or 'd' for days. If '0' or anything that evaluates
+ to False is passed, the 'timeout' command is not used.
+ """
+ testdir = misc.get_testdir(ctx)
+ assert isinstance(role, basestring)
+ cluster, type_, id_ = misc.split_role(role)
+ assert type_ == 'client'
+ remote = get_remote_for_role(ctx, role)
+ mnt = _client_mountpoint(ctx, cluster, id_)
+ # subdir so we can remove and recreate this a lot without sudo
+ if subdir is None:
+ scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp')
+ else:
+ scratch_tmp = os.path.join(mnt, subdir)
+ srcdir = '{tdir}/workunit.{role}'.format(tdir=testdir, role=role)
+ clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role)
+
+ git_url = teuth_config.get_ceph_git_url()
+ remote.run(
+ logger=log.getChild(role),
+ args=[
+ 'git',
+ 'clone',
+ git_url,
+ clonedir,
+ run.Raw(';'),
+ 'cd', '--', clonedir,
+ run.Raw('&&'),
+ 'git', 'checkout', refspec,
+ run.Raw('&&'),
+ 'mv', 'qa/workunits', srcdir,
+ ],
+ )
+
+ remote.run(
+ logger=log.getChild(role),
+ args=[
+ 'cd', '--', srcdir,
+ run.Raw('&&'),
+ 'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
+ run.Raw('&&'),
+            'find', '-executable', '-type', 'f', '-printf', r'%P\0',
+ run.Raw('>{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)),
+ ],
+ )
+
+ workunits_file = '{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)
+ workunits = sorted(misc.get_file(remote, workunits_file).split('\0'))
+ assert workunits
+
+ try:
+ assert isinstance(tests, list)
+ for spec in tests:
+ log.info('Running workunits matching %s on %s...', spec, role)
+ prefix = '{spec}/'.format(spec=spec)
+ to_run = [w for w in workunits if w == spec or w.startswith(prefix)]
+ if not to_run:
+ raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec))
+ for workunit in to_run:
+ log.info('Running workunit %s...', workunit)
+ args = [
+ 'mkdir', '-p', '--', scratch_tmp,
+ run.Raw('&&'),
+ 'cd', '--', scratch_tmp,
+ run.Raw('&&'),
+ run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'),
+ run.Raw('CEPH_REF={ref}'.format(ref=refspec)),
+ run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
+ run.Raw('CEPH_ARGS="--cluster {0}"'.format(cluster)),
+ run.Raw('CEPH_ID="{id}"'.format(id=id_)),
+ run.Raw('PATH=$PATH:/usr/sbin')
+ ]
+ if env is not None:
+ for var, val in env.iteritems():
+ quoted_val = pipes.quote(val)
+ env_arg = '{var}={val}'.format(var=var, val=quoted_val)
+ args.append(run.Raw(env_arg))
+ args.extend([
+ 'adjust-ulimits',
+ 'ceph-coverage',
+ '{tdir}/archive/coverage'.format(tdir=testdir)])
+ if timeout and timeout != '0':
+ args.extend(['timeout', timeout])
+ args.extend([
+ '{srcdir}/{workunit}'.format(
+ srcdir=srcdir,
+ workunit=workunit,
+ ),
+ ])
+ remote.run(
+ logger=log.getChild(role),
+ args=args,
+ label="workunit test {workunit}".format(workunit=workunit)
+ )
+ remote.run(
+ logger=log.getChild(role),
+ args=['sudo', 'rm', '-rf', '--', scratch_tmp],
+ )
+ finally:
+ log.info('Stopping %s on %s...', tests, role)
+ remote.run(
+ logger=log.getChild(role),
+ args=[
+ 'rm', '-rf', '--', workunits_file, srcdir, clonedir,
+ ],
+ )
--- /dev/null
+tasks:
+- exec:
+ all:
+ - echo America/New_York | sudo tee /etc/timezone
--- /dev/null
+tasks:
+- exec:
+ all:
+ - echo America/Los_Angeles | sudo tee /etc/timezone
--- /dev/null
+tasks:
+- exec:
+ all:
+ - echo America/Los_Angeles | sudo tee /etc/timezone
+      - '[ $RANDOM -gt 32000 ] && echo America/New_York | sudo tee /etc/timezone'
--- /dev/null
+[tox]
+envlist = flake8
+skipsdist = True
+
+[testenv:flake8]
+deps=
+ flake8
+commands=flake8 --select=F,E9 --exclude=venv,.tox
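+
+# Typical local usage (assuming tox is installed): run `tox` from the checkout
+# root to lint the suite with flake8; only F and E9 checks are selected, and
+# the venv and .tox directories are skipped.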
+++ /dev/null
-tasks:
-- exec:
- osd.0:
- - ceph osd set sortbitwise
- - for p in `ceph osd pool ls` ; do ceph osd pool set $p use_gmt_hitset true ; done
+++ /dev/null
-tasks:
-- exec:
- osd.0:
- - ceph osd set sortbitwise
- - ceph osd set require_jewel_osds
- - for p in `ceph osd pool ls` ; do ceph osd pool set $p use_gmt_hitset true ; done
+++ /dev/null
-tasks:
-- exec:
- osd.0:
- - ceph osd set require_kraken_osds
+++ /dev/null
-overrides:
- rgw:
- ec-data-pool: true
- cache-pools: true
- s3tests:
- slow_backend: true
+++ /dev/null
-overrides:
- rgw:
- ec-data-pool: true
- erasure_code_profile:
- name: testprofile
- k: 3
- m: 1
- ruleset-failure-domain: osd
- s3tests:
- slow_backend: true
+++ /dev/null
-overrides:
- rgw:
- ec-data-pool: true
- s3tests:
- slow_backend: true
+++ /dev/null
-overrides:
- rgw:
- ec-data-pool: false
+++ /dev/null
-tasks:
-- install:
-- ceph:
+++ /dev/null
-roles:
-- [osd.0, osd.1, osd.2, client.0, mon.a]
-- [osd.3, osd.4, osd.5, client.1, mon.b]
-- [osd.6, osd.7, osd.8, client.2, mon.c]
-- [osd.9, osd.10, osd.11, client.3, mon.d]
-- [osd.12, osd.13, osd.14, client.4, mon.e]
-- [osd.15, osd.16, osd.17, client.5]
-- [osd.18, osd.19, osd.20, client.6]
-- [osd.21, osd.22, osd.23, client.7]
-- [osd.24, osd.25, osd.26, client.8]
-- [osd.27, osd.28, osd.29, client.9]
-- [osd.30, osd.31, osd.32, client.10]
-- [osd.33, osd.34, osd.35, client.11]
-- [osd.36, osd.37, osd.38, client.12]
-- [osd.39, osd.40, osd.41, client.13]
-- [osd.42, osd.43, osd.44, client.14]
-- [osd.45, osd.46, osd.47, client.15]
-- [osd.48, osd.49, osd.50, client.16]
-- [osd.51, osd.52, osd.53, client.17]
-- [osd.54, osd.55, osd.56, client.18]
-- [osd.57, osd.58, osd.59, client.19]
-- [osd.60, osd.61, osd.62, client.20]
-- [osd.63, osd.64, osd.65, client.21]
-- [osd.66, osd.67, osd.68, client.22]
-- [osd.69, osd.70, osd.71, client.23]
-- [osd.72, osd.73, osd.74, client.24]
-- [osd.75, osd.76, osd.77, client.25]
-- [osd.78, osd.79, osd.80, client.26]
-- [osd.81, osd.82, osd.83, client.27]
-- [osd.84, osd.85, osd.86, client.28]
-- [osd.87, osd.88, osd.89, client.29]
-- [osd.90, osd.91, osd.92, client.30]
-- [osd.93, osd.94, osd.95, client.31]
-- [osd.96, osd.97, osd.98, client.32]
-- [osd.99, osd.100, osd.101, client.33]
-- [osd.102, osd.103, osd.104, client.34]
-- [osd.105, osd.106, osd.107, client.35]
-- [osd.108, osd.109, osd.110, client.36]
-- [osd.111, osd.112, osd.113, client.37]
-- [osd.114, osd.115, osd.116, client.38]
-- [osd.117, osd.118, osd.119, client.39]
-- [osd.120, osd.121, osd.122, client.40]
-- [osd.123, osd.124, osd.125, client.41]
-- [osd.126, osd.127, osd.128, client.42]
-- [osd.129, osd.130, osd.131, client.43]
-- [osd.132, osd.133, osd.134, client.44]
-- [osd.135, osd.136, osd.137, client.45]
-- [osd.138, osd.139, osd.140, client.46]
-- [osd.141, osd.142, osd.143, client.47]
-- [osd.144, osd.145, osd.146, client.48]
-- [osd.147, osd.148, osd.149, client.49]
-- [osd.150, osd.151, osd.152, client.50]
-#- [osd.153, osd.154, osd.155, client.51]
-#- [osd.156, osd.157, osd.158, client.52]
-#- [osd.159, osd.160, osd.161, client.53]
-#- [osd.162, osd.163, osd.164, client.54]
-#- [osd.165, osd.166, osd.167, client.55]
-#- [osd.168, osd.169, osd.170, client.56]
-#- [osd.171, osd.172, osd.173, client.57]
-#- [osd.174, osd.175, osd.176, client.58]
-#- [osd.177, osd.178, osd.179, client.59]
-#- [osd.180, osd.181, osd.182, client.60]
-#- [osd.183, osd.184, osd.185, client.61]
-#- [osd.186, osd.187, osd.188, client.62]
-#- [osd.189, osd.190, osd.191, client.63]
-#- [osd.192, osd.193, osd.194, client.64]
-#- [osd.195, osd.196, osd.197, client.65]
-#- [osd.198, osd.199, osd.200, client.66]
+++ /dev/null
-roles:
-- [osd.0, osd.1, osd.2, client.0, mon.a]
-- [osd.3, osd.4, osd.5, client.1, mon.b]
-- [osd.6, osd.7, osd.8, client.2, mon.c]
-- [osd.9, osd.10, osd.11, client.3, mon.d]
-- [osd.12, osd.13, osd.14, client.4, mon.e]
-- [osd.15, osd.16, osd.17, client.5]
-- [osd.18, osd.19, osd.20, client.6]
-- [osd.21, osd.22, osd.23, client.7]
-- [osd.24, osd.25, osd.26, client.8]
-- [osd.27, osd.28, osd.29, client.9]
-- [osd.30, osd.31, osd.32, client.10]
-- [osd.33, osd.34, osd.35, client.11]
-- [osd.36, osd.37, osd.38, client.12]
-- [osd.39, osd.40, osd.41, client.13]
-- [osd.42, osd.43, osd.44, client.14]
-- [osd.45, osd.46, osd.47, client.15]
-- [osd.48, osd.49, osd.50, client.16]
-- [osd.51, osd.52, osd.53, client.17]
-- [osd.54, osd.55, osd.56, client.18]
-- [osd.57, osd.58, osd.59, client.19]
-- [osd.60, osd.61, osd.62, client.20]
+++ /dev/null
-roles:
-- [osd.0, osd.1, osd.2, client.0, mon.a]
-- [osd.3, osd.4, osd.5, client.1, mon.b]
-- [osd.6, osd.7, osd.8, client.2, mon.c]
-- [osd.9, osd.10, osd.11, client.3, mon.d]
-- [osd.12, osd.13, osd.14, client.4, mon.e]
+++ /dev/null
-overrides:
- ceph:
- fs: btrfs
- conf:
- osd:
- osd sloppy crc: true
- osd op thread timeout: 60
+++ /dev/null
-overrides:
- ceph:
- fs: xfs
- conf:
- osd:
- osd sloppy crc: true
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-tasks:
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
+++ /dev/null
-tasks:
-- rados:
- ops: 4000
- max_seconds: 3600
- objects: 50
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
+++ /dev/null
-../../../distros/all
\ No newline at end of file
+++ /dev/null
-# --suite buildpackages/any --ceph v10.0.1 --filter centos_7,ubuntu_14.04
-roles:
- - [client.0]
-tasks:
- - install:
- - exec:
- client.0:
- - ceph --version | grep 'version '
+++ /dev/null
-../../../distros/all
\ No newline at end of file
+++ /dev/null
-# --suite buildpackages/tests --ceph v10.0.1 --filter centos_7.2,ubuntu_14.04
-overrides:
- ansible.cephlab:
- playbook: users.yml
- buildpackages:
- good_machine:
- disk: 20 # GB
- ram: 2000 # MB
- cpus: 2
- min_machine:
- disk: 10 # GB
- ram: 1000 # MB
- cpus: 1
-roles:
- - [client.0]
-tasks:
- - install:
- - exec:
- client.0:
- - ceph --version | grep 'version '
+++ /dev/null
-meta:
-- desc: "3-node cluster"
-roles:
-- [mon.a, mds.a, osd.0, osd.1, client.0]
-- [mon.b, mds.b, osd.2, osd.3]
-- [mon.c, mds.c, osd.4, osd.5]
+++ /dev/null
-meta:
-- desc: "1-node cluster"
-roles:
- - [mon.a, osd.0, client.0]
+++ /dev/null
-../../../distros/supported
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: "Build the cluster using ceph-ansible; then check health and make the keyring readable"
-tasks:
-- ceph_ansible:
-- exec:
- mon.a:
- - "sudo ceph health"
-- exec:
- all:
- - "KEYRING=/etc/ceph/ceph.client.admin.keyring; test -f $KEYRING && sudo chmod o+r $KEYRING"
-- install.ship_utilities:
-overrides:
- ceph_ansible:
- vars:
- ceph_test: true
-openstack:
- - volumes:
- count: 3
- size: 20 # GB
+++ /dev/null
-meta:
-- desc: "Set os_tuning_params to values that are safe for VMs"
-overrides:
- ceph_ansible:
- vars:
- os_tuning_params: '[{"name": "kernel.pid_max", "value": 4194303},{"name": "fs.file-max", "value": 26234859}]'
+++ /dev/null
-meta:
-- desc: "Use a stable upstream Ceph release"
-overrides:
- ceph_ansible:
- vars:
- ceph_origin: upstream
- ceph_stable: true
+++ /dev/null
-meta:
-- desc: "Have teuthology tell ceph-ansible which OSD devices to use"
-overrides:
- ceph_ansible:
- vars:
- osd_auto_discovery: false
+++ /dev/null
-meta:
-- desc: "Tell ceph-ansible to discover OSD devices automatically"
-overrides:
- ceph_ansible:
- vars:
- osd_auto_discovery: true
+++ /dev/null
-meta:
-- desc: "Use a collocated journal"
-overrides:
- ceph_ansible:
- vars:
- journal_collocation: true
- journal_size: 1024
+++ /dev/null
-meta:
-- desc: "Run ceph-admin-commands.sh"
-tasks:
-- workunit:
- clients:
- client.0:
- - ceph-tests/ceph-admin-commands.sh
+++ /dev/null
-meta:
-- desc: "Run the rados cls tests"
-tasks:
-- workunit:
- clients:
- client.0:
- - cls
+++ /dev/null
-meta:
-- desc: "Run the rbd cli tests"
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/run_cli_tests.sh
-
+++ /dev/null
-meta:
-- desc: "Run the rbd import/export tests"
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/import_export.sh
+++ /dev/null
-../../../ceph-deploy-overrides
\ No newline at end of file
+++ /dev/null
-../../../config_options
\ No newline at end of file
+++ /dev/null
-../../../distros/supported
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph-deploy:
- python_version: "2"
+++ /dev/null
-overrides:
- ceph-deploy:
- python_version: "3"
+++ /dev/null
-overrides:
- ceph-deploy:
- conf:
- global:
- debug ms: 1
- osd:
- debug osd: 10
- mon:
- debug mon: 10
-roles:
-- - mon.a
- - mds.0
- - osd.0
-- - osd.1
- - mon.b
- - client.0
-openstack:
- - machine:
- disk: 10 # GB
- ram: 2000 # MB
- cpus: 1
- volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- ssh_keys:
-- print: "**** done ssh_keys"
-- ceph-deploy:
-- print: "**** done ceph-deploy"
-- workunit:
- clients:
- client.0:
- - ceph-tests/ceph-admin-commands.sh
-- print: "**** done ceph-tests/ceph-admin-commands.sh"
+++ /dev/null
-../../../distros/supported
\ No newline at end of file
+++ /dev/null
-roles:
-- - mon.a
- - client.0
-- - osd.0
- - osd.1
-openstack:
- - machine:
- disk: 20 # GB
- ram: 2000 # MB
- cpus: 1
- volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- fs: xfs # this implicitly means /dev/vd? are used instead of directories
- wait-for-scrub: false
- conf:
- global:
- mon pg warn min per osd: 2
- osd pool default size: 2
- osd crush chooseleaf type: 0 # failure domain == osd
- osd pg bits: 2
- osd pgp bits: 2
-#
-# Keep this around for debugging purposes. If uncommented the target
-# will pause and the workunit can be run and debug manually.
-#
-# - exec:
-# client.0:
-# - sleep 1000000000 # forever
-#
-- workunit:
- clients:
- all:
- - ceph-disk/ceph-disk.sh
+++ /dev/null
-overrides:
- ansible.cephlab:
- playbook: users.yml
-roles:
- - [mon.a, mds.a, osd.0, osd.1, client.0]
-
-tasks:
- - nop:
-
+++ /dev/null
-roles:
-- [mon.a, mds.a, mds.a-s]
-- [mon.b, mds.b, mds.b-s]
-- [mon.c, mds.c, mds.c-s]
-- [osd.0]
-- [osd.1]
-- [osd.2]
-- [client.0]
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- mds:
- mds thrash exports: 1
- mds debug subtrees: 1
- mds debug scatterstat: 1
- mds verify scatter: 1
-- ceph-fuse:
-- workunit:
- clients:
- client.0:
- - suites/fsstress.sh
-
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- client use faked inos: true
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse set user groups: true
- fuse default permissions: false
-tasks:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph mds set inline_data true --yes-i-really-mean-it
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- cephfs_test_runner:
- modules:
- - tasks.cephfs.test_dump_tree
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - Scrub error on inode
- conf:
- mds:
- mds log max segments: 1
- mds cache max size: 1000
-tasks:
-- cephfs_test_runner:
- modules:
- - tasks.cephfs.test_scrub_checks
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse_default_permissions: 0
-tasks:
-- workunit:
- clients:
- all:
- - kernel_untar_build.sh
+++ /dev/null
-tasks:
-- workunit:
- timeout: 6h
- clients:
- all:
- - fs/misc
-
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - fs/test_o_trunc.sh
+++ /dev/null
-tasks:
-- workunit:
- timeout: 6h
- clients:
- all:
- - fs/norstats
-
-overrides:
- ceph:
- conf:
- client:
- client dirsize rbytes: false
+++ /dev/null
-tasks:
-- workunit:
- timeout: 6h
- clients:
- all:
- - fs/quota
-
-overrides:
- ceph:
- conf:
- client:
- client quota: true
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_blogbench.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_dbench.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_ffsb.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/fsx.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/fsync-tester.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/iogen.sh
-
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/iozone.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- debug ms: 1
- debug client: 20
- fuse set user groups: true
- fuse default permissions: false
- mds:
- debug ms: 1
- debug mds: 20
-tasks:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- ms_inject_delay_probability: 1
- ms_inject_delay_type: osd
- ms_inject_delay_max: 5
- client_oc_max_dirty_age: 1
-tasks:
-- exec:
- client.0:
- - cd $TESTDIR/mnt.* && dd if=/dev/zero of=./foo count=100
- - sleep 2
- - cd $TESTDIR/mnt.* && truncate --size 0 ./foo
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_trivial_sync.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/libcephfs_interface_tests.yaml
\ No newline at end of file
+++ /dev/null
-
-os_type: ubuntu
-os_version: "14.04"
-
-overrides:
- ceph-fuse:
- disabled: true
- kclient:
- disabled: true
-tasks:
-- workunit:
- clients:
- client.0:
- - libcephfs-java/test.sh
+++ /dev/null
-overrides:
- ceph-fuse:
- disabled: true
- kclient:
- disabled: true
-tasks:
-- workunit:
- clients:
- client.0:
- - fs/test_python.sh
+++ /dev/null
-tasks:
-- mds_creation_failure:
-- workunit:
- clients:
- all: [fs/misc/trivial_sync.sh]
-
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1, osd.2]
-- [client.2]
-- [client.1]
-- [client.0]
-
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
-
+++ /dev/null
-roles:
-- [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1, osd.2]
-- [client.1]
-- [client.0]
-
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
-
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-tasks:
-- kclient:
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- cephfs_test_runner:
- modules:
- - tasks.cephfs.test_misc
+++ /dev/null
-# make sure we get the same MPI version on all hosts
-os_type: ubuntu
-os_version: "14.04"
-
-tasks:
-- pexec:
- clients:
- - cd $TESTDIR
- - wget http://ceph.com/qa/fsx-mpi.c
- - mpicc fsx-mpi.c -o fsx-mpi
- - rm fsx-mpi.c
- - ln -s $TESTDIR/mnt.* $TESTDIR/gmnt
-- ssh_keys:
-- mpi:
- exec: sudo $TESTDIR/fsx-mpi -o 1MB -N 50000 -p 10000 -l 1048576 $TESTDIR/gmnt/test
- workdir: $TESTDIR/gmnt
-- pexec:
- all:
- - rm $TESTDIR/gmnt
- - rm $TESTDIR/fsx-mpi
+++ /dev/null
-# make sure we get the same MPI version on all hosts
-os_type: ubuntu
-os_version: "14.04"
-
-tasks:
-- pexec:
- clients:
- - cd $TESTDIR
- - wget http://ceph.com/qa/ior.tbz2
- - tar xvfj ior.tbz2
- - cd ior
- - ./configure
- - make
- - make install DESTDIR=$TESTDIR/binary/
- - cd $TESTDIR/
- - rm ior.tbz2
- - rm -r ior
- - ln -s $TESTDIR/mnt.* $TESTDIR/gmnt
-- ssh_keys:
-- mpi:
- exec: $TESTDIR/binary/usr/local/bin/ior -e -w -r -W -b 10m -a POSIX -o $TESTDIR/gmnt/ior.testfile
-- pexec:
- all:
- - rm -f $TESTDIR/gmnt/ior.testfile
- - rm -f $TESTDIR/gmnt
- - rm -rf $TESTDIR/binary
+++ /dev/null
-# make sure we get the same MPI version on all hosts
-os_type: ubuntu
-os_version: "14.04"
-
-tasks:
-- pexec:
- clients:
- - cd $TESTDIR
- - wget http://ceph.com/qa/mdtest-1.9.3.tgz
- - mkdir mdtest-1.9.3
- - cd mdtest-1.9.3
- - tar xvfz $TESTDIR/mdtest-1.9.3.tgz
- - rm $TESTDIR/mdtest-1.9.3.tgz
- - MPI_CC=mpicc make
- - ln -s $TESTDIR/mnt.* $TESTDIR/gmnt
-- ssh_keys:
-- mpi:
- exec: $TESTDIR/mdtest-1.9.3/mdtest -d $TESTDIR/gmnt -I 20 -z 5 -b 2 -R
-- pexec:
- all:
- - rm -f $TESTDIR/gmnt
- - rm -rf $TESTDIR/mdtest-1.9.3
- - rm -rf $TESTDIR/._mdtest-1.9.3
\ No newline at end of file
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, osd.0, mon.b, mds.a, mds.b, client.1]
-- [mds.c, mds.d, mon.c, client.0, osd.1, osd.2]
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- debug mon: 20
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph-fuse:
- disabled: true
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_failover
-
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse default permissions: false
- client acl type: posix_acl
-tasks:
-- workunit:
- clients:
- all:
- - fs/misc/acl.sh
- - fs/misc/chmod.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse set user groups: true
- fuse default permissions: false
- client acl type: posix_acl
-tasks:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, osd.0, mds.a, mds.b, client.1, client.2, client.3]
-- [client.0, osd.1, osd.2]
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph-fuse:
- disabled: true
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - force file system read-only
- - bad backtrace
-
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_auto_repair
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_backtrace
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_cap_flush
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - responding to mclientcaps\(revoke\)
- - not advance its oldest_client_tid
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_client_limits
+++ /dev/null
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_readahead
+++ /dev/null
-
-# The task interferes with the network, so we need
-# to permit OSDs to complain about that.
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- - slow request
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_client_recovery
+++ /dev/null
-
-overrides:
- ceph:
- conf:
- global:
- lockdep: true
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_config_commands
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - bad backtrace
- - object missing on disk
- - error reading table object
- - error reading sessionmap
- - Error loading MDS rank
- - missing journal object
- - Error recovering journal
- - error decoding table object
- - failed to read JournalPointer
- - Corrupt directory entry
- - Corrupt fnode header
- - corrupt sessionmap header
- - Corrupt dentry
- - Scrub error on inode
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_damage
-
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - bad backtrace
- - object missing on disk
- - error reading table object
- - error reading sessionmap
- - unmatched fragstat
- - was unreadable, recreating it now
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_data_scan
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - inode wrongly marked free
- - bad backtrace on inode
- - inode table repaired for inode
- - Scrub error on inode
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_forward_scrub
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_fragment
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - bad backtrace on dir ino
- - error reading table object
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_journal_repair
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_flush
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - OSD full dropping all updates
- - OSD near full
- - is full \(reached quota
- conf:
- osd:
- osd mon report interval max: 5
- osd objectstore: memstore
- memstore device bytes: 100000000
- client.0:
- debug client: 20
- debug objecter: 20
- debug objectcacher: 20
- client.1:
- debug client: 20
- debug objecter: 20
- debug objectcacher: 20
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_full
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_pool_perm
+++ /dev/null
-
-overrides:
- ceph:
- conf:
- global:
- ms type: simple
- log-whitelist:
- - client session with invalid root
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_sessionmap
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_strays
+++ /dev/null
-
-overrides:
- ceph:
- conf:
- global:
- ms type: simple
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_volume_client
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - fs/snaps
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-
-overrides:
- ceph:
- conf:
- mds:
- mds standby replay: true
-
-roles:
-- [mon.a, mds.a, mds.b-s-0, osd.0, osd.1, client.0]
-- [mon.b, mds.c-s-0, mds.d-s-0, mon.c, osd.2, osd.3]
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-
-tasks:
-- cephfs_test_runner:
- modules:
- - tasks.cephfs.test_journal_migration
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- mds_thrash:
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2, mds.b-s-a]
-- [mon.b, mds.a, osd.3, osd.4, osd.5, client.0]
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 2500
- ms inject delay type: osd mds
- ms inject delay probability: .005
- ms inject delay max: 1
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - fs/snaps
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse set user groups: true
- fuse default permissions: false
-tasks:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_trivial_sync.yaml
\ No newline at end of file
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_blogbench.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_dbench.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_ffsb.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- mds:
- mds inject traceless reply probability: .5
+++ /dev/null
-../../../cephfs/begin.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/fixed-2-ucephfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- debug ms: 1
- debug mon: 20
+++ /dev/null
-../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_dbench.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/cfuse_workunit_suites_fsstress.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/tasks/libcephfs_interface_tests.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- lockdep: true
+++ /dev/null
-os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
-overrides:
- install:
- ceph:
- flavor: notcmalloc
- debuginfo: true
- ceph:
- conf:
- global:
- osd heartbeat grace: 40
- valgrind:
- mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
- osd: [--tool=memcheck]
- mds: [--tool=memcheck]
- ceph-fuse:
- client.0:
- valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
+++ /dev/null
-
-os_type: ubuntu
-os_version: "14.04"
-
-overrides:
- ceph:
- conf:
- client:
- client permissions: false
-roles:
-- [mon.0, mds.0, osd.0, hadoop.master.0]
-- [mon.1, osd.1, hadoop.slave.0]
-- [mon.2, hadoop.slave.1, client.0]
-
+++ /dev/null
-tasks:
-- ssh_keys:
-- install:
-- ceph:
-- hadoop:
-- workunit:
- clients:
- client.0: [hadoop/repl.sh]
+++ /dev/null
-tasks:
-- ssh_keys:
-- install:
-- ceph:
-- hadoop:
-- workunit:
- clients:
- client.0: [hadoop/terasort.sh]
- env:
- NUM_RECORDS: "10000000"
+++ /dev/null
-tasks:
-- ssh_keys:
-- install:
-- ceph:
-- hadoop:
-- workunit:
- clients:
- client.0: [hadoop/wordcount.sh]
+++ /dev/null
-../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
- mds:
- debug mds: 20
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- exec:
- client.0:
- - sudo ceph mds set inline_data true --yes-i-really-mean-it
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - direct_io
-
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - kernel_untar_build.sh
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - fs/misc
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - fs/test_o_trunc.sh
-
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - fs/snaps
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/dbench.sh
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/ffsb.sh
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/fsx.sh
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/fsync-tester.sh
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/iozone.sh
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all: [fs/misc/trivial_sync.sh]
+++ /dev/null
-roles:
-- [mon.a, mds.a, osd.0, osd.1]
-- [mon.b, mon.c, osd.2, osd.3]
-- [client.0]
-- [client.1]
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
- mds:
- debug mds: 20
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- parallel:
- - user-workload
- - kclient-workload
-user-workload:
- sequential:
- - ceph-fuse: [client.0]
- - workunit:
- clients:
- client.0:
- - suites/iozone.sh
-kclient-workload:
- sequential:
- - kclient: [client.1]
- - workunit:
- clients:
- client.1:
- - suites/dbench.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- parallel:
- - user-workload
- - kclient-workload
-user-workload:
- sequential:
- - ceph-fuse: [client.0]
- - workunit:
- clients:
- client.0:
- - suites/blogbench.sh
-kclient-workload:
- sequential:
- - kclient: [client.1]
- - workunit:
- clients:
- client.1:
- - kernel_untar_build.sh
+++ /dev/null
-roles:
-- [mon.a, osd.0, mds.a, mds.c, client.2]
-- [osd.1, osd.2, mds.b, mds.d, client.3]
-- [client.0]
-- [client.1]
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
+++ /dev/null
-overrides:
- ceph:
- conf:
- mds:
- debug ms: 1
- debug mds: 20
- client.0:
- debug ms: 1
- debug client: 20
- client.1:
- debug ms: 1
- debug client: 20
+++ /dev/null
-
-overrides:
- ceph:
- conf:
- mds:
- mds bal frag: true
- mds bal fragment size max: 10000
- mds bal split size: 100
- mds bal merge size: 5
- mds bal split bits: 3
-
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- kclient:
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - force file system read-only
- - bad backtrace
-
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_auto_repair
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_backtrace
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - responding to mclientcaps\(revoke\)
- - not advance its oldest_client_tid
-
-tasks:
- - cephfs_test_runner:
- fail_on_skip: false
- modules:
- - tasks.cephfs.test_client_limits
+++ /dev/null
-
-# The task interferes with the network, so we need
-# to permit OSDs to complain about that.
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- - slow request
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_client_recovery
+++ /dev/null
-
-overrides:
- ceph:
- conf:
- global:
- lockdep: true
-
-tasks:
- - cephfs_test_runner:
- fail_on_skip: false
- modules:
- - tasks.cephfs.test_config_commands
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - bad backtrace
- - object missing on disk
- - error reading table object
- - error reading sessionmap
- - Error loading MDS rank
- - missing journal object
- - Error recovering journal
- - error decoding table object
- - failed to read JournalPointer
- - Corrupt directory entry
- - Corrupt fnode header
- - corrupt sessionmap header
- - Corrupt dentry
- - Scrub error on inode
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_damage
-
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - bad backtrace
- - object missing on disk
- - error reading table object
- - error reading sessionmap
- - unmatched fragstat
- - was unreadable, recreating it now
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_data_scan
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- fail_on_skip: false
- modules:
- - tasks.cephfs.test_failover
-
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - inode wrongly marked free
- - bad backtrace on inode
- - inode table repaired for inode
- - Scrub error on inode
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_forward_scrub
+++ /dev/null
-
-overrides:
- ceph:
- log-whitelist:
- - bad backtrace on dir ino
- - error reading table object
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_journal_repair
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_flush
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_pool_perm
+++ /dev/null
-
-overrides:
- ceph:
- conf:
- global:
- ms type: simple
- log-whitelist:
- - client session with invalid root
-
-tasks:
- - cephfs_test_runner:
- fail_on_skip: false
- modules:
- - tasks.cephfs.test_sessionmap
+++ /dev/null
-
-tasks:
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_strays
+++ /dev/null
-
-overrides:
- ceph:
- conf:
- global:
- ms type: simple
-
-tasks:
- - cephfs_test_runner:
- fail_on_skip: false
- modules:
- - tasks.cephfs.test_volume_client
+++ /dev/null
-../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
- mds:
- debug mds: 20
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- mds_thrash:
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- mon_thrash:
- revive_delay: 20
- thrash_delay: 1
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- filestore flush min: 0
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/ffsb.sh
+++ /dev/null
-tasks:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/iozone.sh
+++ /dev/null
-
-os_type: ubuntu
-
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-
-tasks:
-- install:
-- ceph:
-- kclient: [client.0]
-- knfsd:
- client.0:
- options: [rw,no_root_squash,async]
+++ /dev/null
-../../../../clusters/extra-client.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- nfs:
- client.1:
- server: client.0
- options: [rw,hard,intr,nfsvers=3]
+++ /dev/null
-tasks:
-- nfs:
- client.1:
- server: client.0
- options: [rw,hard,intr,nfsvers=4]
+++ /dev/null
-tasks:
-- workunit:
- timeout: 6h
- clients:
- client.1:
- - kernel_untar_build.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.1:
- - fs/misc/chmod.sh
- - fs/misc/i_complete_vs_rename.sh
- - fs/misc/trivial_sync.sh
- #- fs/misc/multiple_rsync.sh
- #- fs/misc/xattrs.sh
-# Once we can run multiple_rsync.sh and xattrs.sh we can change to this
-# - misc
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.1:
- - suites/blogbench.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.1:
- - suites/dbench-short.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- filestore flush min: 0
-tasks:
-- workunit:
- clients:
- client.1:
- - suites/ffsb.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.1:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.1:
- - suites/iozone.sh
+++ /dev/null
-../../../../clusters/fixed-3.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
- client:
- rbd default features: 5
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 500
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - rbd/concurrent.sh
-# Options for rbd/concurrent.sh (default values shown)
-# env:
-# RBD_CONCURRENT_ITER: 100
-# RBD_CONCURRENT_COUNT: 5
-# RBD_CONCURRENT_DELAY: 5
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - rbd/huge-tickets.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - rbd/image_read.sh
-# Options for rbd/image_read.sh (default values shown)
-# env:
-# IMAGE_READ_LOCAL_FILES: 'false'
-# IMAGE_READ_FORMAT: '2'
-# IMAGE_READ_VERBOSE: 'true'
-# IMAGE_READ_PAGE_SIZE: '4096'
-# IMAGE_READ_OBJECT_ORDER: '22'
-# IMAGE_READ_TEST_CLONES: 'true'
-# IMAGE_READ_DOUBLE_ORDER: 'true'
-# IMAGE_READ_HALF_ORDER: 'false'
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - rbd/kernel.sh
+++ /dev/null
-tasks:
-- rbd_fsx:
- clients: [client.0]
- ops: 10000
- krbd: true
- readbdy: 512
- writebdy: 512
- truncbdy: 512
- holebdy: 512
- punch_holes: true
- randomized_striping: false
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - rbd/map-snapshot-io.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - rbd/map-unmap.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - rbd/simple_big.sh
-
+++ /dev/null
-../../../../clusters/fixed-3.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
- client:
- rbd default features: 5
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 500
+++ /dev/null
-tasks:
-- install:
-- ceph: null
-- rbd_fio:
- client.0:
- fio-io-size: 90%
- formats: [2]
- features: [[layering,exclusive-lock]]
- io-engine: sync
- rw: randrw
- runtime: 900
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rbd:
- all:
-- workunit:
- clients:
- all:
- - kernel_untar_build.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rbd:
- all:
-- workunit:
- clients:
- all:
- - suites/dbench.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rbd:
- all:
- image_size: 20480
-- workunit:
- clients:
- all:
- - suites/ffsb.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rbd:
- all:
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rbd:
- all:
- fs_type: btrfs
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rbd:
- all:
- fs_type: ext4
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rbd:
- all:
-- workunit:
- clients:
- all:
- - suites/fsx.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rbd:
- all:
- image_size: 20480
-- workunit:
- clients:
- all:
- - suites/iozone.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rbd:
- all:
-- workunit:
- clients:
- all: [fs/misc/trivial_sync.sh]
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
- client:
- rbd default features: 5
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 500
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2]
-- [mon.b, mds.a, osd.3, osd.4, osd.5]
-- [client.0]
-- [client.1]
-tasks:
-- install:
-- ceph:
-- rbd.xfstests:
- client.0:
- test_image: 'test_image-0'
- scratch_image: 'scratch_image-0'
- tests: '-g auto'
- randomize: true
- client.1:
- test_image: 'test_image-1'
- scratch_image: 'scratch_image-1'
- tests: '-g auto'
- randomize: true
+++ /dev/null
-../../../../clusters/fixed-3.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
- client:
- rbd default features: 5
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- mon_thrash:
- revive_delay: 20
- thrash_delay: 1
+++ /dev/null
-tasks:
-- rbd_fio:
- client.0:
- fio-io-size: 90%
- formats: [2]
- features: [[layering,exclusive-lock]]
- io-engine: sync
- rw: randrw
- runtime: 1200
+++ /dev/null
-tasks:
-- rbd:
- all:
- image_size: 20480
-- workunit:
- clients:
- all:
- - suites/ffsb.sh
+++ /dev/null
-tasks:
-- rbd:
- all:
- image_size: 20480
-- workunit:
- clients:
- all:
- - suites/iozone.sh
+++ /dev/null
-overrides:
- ceph:
- crush_tunables: bobtail
-tasks:
-- install:
-- ceph:
+++ /dev/null
-# fixed-1.yaml, but with client.0 on a separate target
-overrides:
- ceph-deploy:
- conf:
- global:
- osd pool default size: 2
- osd crush chooseleaf type: 0
- osd pool default pg num: 128
- osd pool default pgp num: 128
-roles:
-- [mon.a, osd.0, osd.1, osd.2]
-- [client.0]
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd default features: 1 # pre-single-major is v3.13, so layering only
+++ /dev/null
-overrides:
- kernel:
- client.0:
- branch: nightly_pre-single-major # v3.12.z
-tasks:
-- exec:
- client.0:
- - "modprobe -r rbd"
- - "modprobe --first-time rbd"
- - "test ! -f /sys/module/rbd/parameters/single_major"
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - "modprobe -r rbd"
- - "modprobe --first-time rbd single_major=0"
- - "grep -q N /sys/module/rbd/parameters/single_major"
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - "modprobe -r rbd"
- - "modprobe --first-time rbd single_major=1"
- - "grep -q Y /sys/module/rbd/parameters/single_major"
+++ /dev/null
-tasks:
-- cram:
- clients:
- client.0:
- - http://git.ceph.com/?p=ceph.git;a=blob_plain;hb={branch};f=src/test/cli-integration/rbd/unmap.t
+++ /dev/null
-../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2]
-- [mon.b, mds.a, osd.3, osd.4, osd.5]
-- [client.0]
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/blogbench.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/fsx.sh
+++ /dev/null
-roles:
-- [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1, osd.2]
-- [client.1]
-- [client.0]
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- kclient:
-- locktest: [client.0, client.1]
+++ /dev/null
-roles:
-- [mon.a, mon.b, mon.c, osd.0, osd.1, osd.2]
-- [mds.a]
-- [client.0]
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- mds:
- mds log segment size: 16384
- mds log max segments: 1
-- restart:
- exec:
- client.0:
- - test-backtraces.py
+++ /dev/null
-roles:
-- [mon.a, mon.c, mds.a, osd.0, osd.1, osd.2]
-- [mon.b, mds.b, mds.c, osd.3, osd.4, osd.5]
-- [client.0]
-- [client.1]
+++ /dev/null
-roles:
-- [mon.a, mon.c, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2]
-- [mon.b, mds.e, mds.f, mds.g, mds.h, mds.i, osd.3, osd.4, osd.5]
-- [client.0]
-- [client.1]
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- client:
- fuse_default_permissions: 0
-- ceph-fuse:
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- kclient:
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - fs/misc
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/blogbench.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/dbench.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- all:
- - suites/fsync-tester.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse default permissions: false
- fuse set user groups: true
-tasks:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- client:
- ms_inject_delay_probability: 1
- ms_inject_delay_type: osd
- ms_inject_delay_max: 5
- client_oc_max_dirty_age: 1
-- ceph-fuse:
-- exec:
- client.0:
- - dd if=/dev/zero of=./foo count=100
- - sleep 2
- - truncate --size 0 ./foo
+++ /dev/null
-overrides:
- ceph:
- conf:
- mds:
- mds thrash exports: 1
+++ /dev/null
-roles:
-- [mgr.x, mon.a, mds.a, mds.c, osd.0, client.0]
-- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1]
-log-rotate:
- ceph-mds: 10G
- ceph-osd: 10G
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- debug mon: 20
- mgr:
- debug mgr: 20
- debug ms: 1
- client:
- debug client: 20
- debug mgrc: 20
- debug ms: 1
- osd:
- debug mgrc: 20
- mds:
- debug mgrc: 20
+++ /dev/null
-overrides:
- ceph:
- fs: xfs
- conf:
- osd:
- osd sloppy crc: true
+++ /dev/null
-
-tasks:
- - install:
- - ceph:
- - cephfs_test_runner:
- modules:
- - tasks.mgr.test_failover
+++ /dev/null
-roles:
-- [mon.a, mds.a, osd.0, osd.1]
-- [mon.b, mon.c, osd.2, osd.3, client.0]
-- [client.1]
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-tasks:
-- install:
- branch: dumpling
-- ceph:
-- parallel:
- - user-workload
- - kclient-workload
-user-workload:
- sequential:
- - ceph-fuse: [client.0]
- - workunit:
- clients:
- client.0:
- - suites/iozone.sh
-kclient-workload:
- sequential:
- - kclient: [client.1]
- - workunit:
- clients:
- client.1:
- - suites/dbench.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-tasks:
-- install:
- branch: dumpling
-- ceph:
-- parallel:
- - user-workload
- - kclient-workload
-user-workload:
- sequential:
- - ceph-fuse: [client.0]
- - workunit:
- clients:
- client.0:
- - suites/blogbench.sh
-kclient-workload:
- sequential:
- - kclient: [client.1]
- - workunit:
- clients:
- client.1:
- - kernel_untar_build.sh
+++ /dev/null
-../../fs/basic/begin.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/3-mds.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/9-mds.yaml
\ No newline at end of file
+++ /dev/null
-../../fs/basic/fs/
\ No newline at end of file
+++ /dev/null
-../../fs/basic/inline/
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/kclient.yaml
\ No newline at end of file
+++ /dev/null
-../../../fs/basic/overrides/
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/fuse/default-perm/no.yaml
\ No newline at end of file
+++ /dev/null
-../../fs/basic/tasks/
\ No newline at end of file
+++ /dev/null
-../../fs/verify/begin.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/3-mds.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/clusters/9-mds.yaml
\ No newline at end of file
+++ /dev/null
-../../fs/verify/fs/
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/fuse.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/mount/kclient.yaml
\ No newline at end of file
+++ /dev/null
-../../../../cephfs/overrides/fuse/default-perm/no.yaml
\ No newline at end of file
+++ /dev/null
-../../../fs/verify/overrides/
\ No newline at end of file
+++ /dev/null
-../../fs/verify/tasks/
\ No newline at end of file
+++ /dev/null
-../../fs/verify/validater/
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.0, mon.1, mon.2, mds.0, client.0]
-- [osd.0]
-- [osd.1]
-- [osd.2]
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- thrashosds:
- chance_down: 1.0
- powercycle: true
- timeout: 600
+++ /dev/null
-overrides:
- ceph:
- conf:
- client.0:
- admin socket: /var/run/ceph/ceph-$name.asok
-tasks:
-- radosbench:
- clients: [client.0]
- time: 60
-- admin_socket:
- client.0:
- objecter_requests:
- test: "http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}"
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse_default_permissions: 0
-tasks:
-- ceph-fuse:
-- workunit:
- timeout: 6h
- clients:
- all:
- - kernel_untar_build.sh
+++ /dev/null
-tasks:
-- ceph-fuse:
-- workunit:
- timeout: 6h
- clients:
- all:
- - fs/misc
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- filestore flush min: 0
- mds:
- debug ms: 1
- debug mds: 20
-tasks:
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - suites/ffsb.sh
+++ /dev/null
-tasks:
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- ceph-fuse:
-- workunit:
- timeout: 6h
- clients:
- all:
- - suites/fsx.sh
+++ /dev/null
-tasks:
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - suites/fsync-tester.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- fuse default permissions: false
- fuse set user groups: true
-tasks:
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- ms_inject_delay_probability: 1
- ms_inject_delay_type: osd
- ms_inject_delay_max: 5
- client_oc_max_dirty_age: 1
-tasks:
-- ceph-fuse:
-- exec:
- client.0:
- - dd if=/dev/zero of=./foo count=100
- - sleep 2
- - truncate --size 0 ./foo
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - reached quota
-tasks:
-- ceph-fuse:
-- workunit:
- clients:
- client.0:
- - rados/test.sh
+++ /dev/null
-tasks:
-- full_sequential:
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- op_weights:
- read: 45
- write: 45
- delete: 10
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
+++ /dev/null
-../../../../clusters/fixed-2.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../mon_kv_backend
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 1500
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms type: async
- enable experimental unrecoverable data corrupting features: '*'
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms type: random
- enable experimental unrecoverable data corrupting features: '*'
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms type: simple
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - reached quota
- - wrongly marked me down
-tasks:
-- install:
-- ceph:
-- workunit:
- clients:
- client.0:
- - rados/test.sh
- - rados/test_pool_quota.sh
-
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- workunit:
- clients:
- client.0:
- - cls
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
-- workunit:
- clients:
- client.0:
- - rados/test_python.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- workunit:
- clients:
- client.0:
- - rados/stress_watch.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- exec:
- client.0:
- - ceph_test_rados_striper_api_io
- - ceph_test_rados_striper_api_aio
- - ceph_test_rados_striper_api_striping
-
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
-tasks:
-- install:
-- ceph:
-- workunit:
- clients:
- all:
- - rados/load-gen-big.sh
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
-tasks:
-- install:
-- ceph:
-- workunit:
- clients:
- all:
- - rados/load-gen-mix.sh
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
-tasks:
-- install:
-- ceph:
-- workunit:
- clients:
- all:
- - rados/load-gen-mostlyread.sh
+++ /dev/null
-overrides:
- ceph:
- crush_tunables: optimal
- conf:
- osd:
- osd_discard_disconnected_ops: false
-tasks:
-- install:
-- ceph:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- op_weights:
- read: 45
- write: 45
- delete: 10
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - candidate had a stat error
- - candidate had a read error
- - deep-scrub 0 missing, 1 inconsistent objects
- - deep-scrub 0 missing, 4 inconsistent objects
- - deep-scrub [0-9]+ errors
- - '!= omap_digest'
- - '!= data_digest'
- - repair 0 missing, 1 inconsistent objects
- - repair 0 missing, 4 inconsistent objects
- - repair [0-9]+ errors, [0-9]+ fixed
- - scrub 0 missing, 1 inconsistent objects
- - scrub [0-9]+ errors
- - 'size 1 != size'
- - attr name mismatch
- conf:
- osd:
- filestore debug inject read err: true
- bluestore debug inject read err: true
-tasks:
-- install:
-- ceph:
-- repair_test:
-
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- debug rgw: 20
- debug ms: 1
-tasks:
-- install:
-- ceph:
-- rgw:
- default_idle_timeout: 3600
- client.0: null
-- thrash_pool_snaps:
- pools:
- - .rgw.buckets
- - .rgw.root
- - .rgw.control
- - .rgw
- - .users.uid
- - .users.email
- - .users
-- s3readwrite:
- client.0:
- rgw_server: client.0
- readwrite:
- bucket: rwtest
- readers: 10
- writers: 3
- duration: 300
- files:
- num: 10
- size: 2000
- stddev: 500
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - '!= data_digest'
- - '!= omap_digest'
- - '!= size'
- - 'deep-scrub 0 missing, 1 inconsistent objects'
- - 'deep-scrub [0-9]+ errors'
- - 'repair 0 missing, 1 inconsistent objects'
- - 'repair [0-9]+ errors, [0-9]+ fixed'
- - 'shard [0-9]+ missing'
- - 'deep-scrub 1 missing, 1 inconsistent objects'
- - 'does not match object info size'
- - 'attr name mistmatch'
- - 'deep-scrub 1 missing, 0 inconsistent objects'
- - 'failed to pick suitable auth object'
- conf:
- osd:
- osd deep scrub update digest min age: 0
-tasks:
-- install:
-- ceph:
-- scrub_test:
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- mon min osdmap epochs: 25
- paxos service trim min: 5
-tasks:
-- install:
-- ceph:
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2]
-- [mon.b, osd.3, osd.4, osd.5, client.0]
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
+++ /dev/null
-roles:
-- [mon.a, mon.b, mon.c, mon.d, mon.e, osd.0, osd.1, osd.2]
-- [mon.f, mon.g, mon.h, mon.i, osd.3, osd.4, osd.5, client.0]
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../mon_kv_backend
\ No newline at end of file
+++ /dev/null
-../basic/msgr
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 2500
- ms inject delay type: mon
- ms inject delay probability: .005
- ms inject delay max: 1
- ms inject internal delays: .002
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- mon_thrash:
- revive_delay: 90
- thrash_delay: 1
- thrash_store: true
- thrash_many: true
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- mon client ping interval: 4
- mon client ping timeout: 12
-tasks:
-- mon_thrash:
- revive_delay: 20
- thrash_delay: 1
- thrash_many: true
- freeze_mon_duration: 20
- freeze_mon_probability: 10
+++ /dev/null
-tasks:
-- mon_thrash:
- revive_delay: 20
- thrash_delay: 1
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- paxos min: 10
- paxos trim min: 10
-tasks:
-- mon_thrash:
- revive_delay: 90
- thrash_delay: 1
- thrash_many: true
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- paxos min: 10
- paxos trim min: 10
-tasks:
-- mon_thrash:
- revive_delay: 90
- thrash_delay: 1
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - slow request
-tasks:
-- exec:
- client.0:
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
- - ceph_test_rados_delete_pools_parallel
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - ceph_test_rados_delete_pools_parallel --debug_objecter 20 --debug_ms 1 --debug_rados 20 --debug_monc 20
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - reached quota
- conf:
- global:
- debug objecter: 20
- debug rados: 20
- debug ms: 1
-tasks:
-- workunit:
- clients:
- client.0:
- - rados/test.sh
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
-tasks:
-- workunit:
- clients:
- client.0:
- - mon/pool_ops.sh
- - mon/crush_ops.sh
- - mon/osd.sh
- - mon/caps.sh
-
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
+++ /dev/null
-roles:
-- [mon.a, mon.d, mon.g, mon.j, mon.m, mon.p, mon.s, osd.0]
-- [mon.b, mon.e, mon.h, mon.k, mon.n, mon.q, mon.t]
-- [mon.c, mon.f, mon.i, mon.l, mon.o, mon.r, mon.u, osd.1]
-openstack:
-- volumes: # attached to each instance
- count: 1
- size: 10 # GB
+++ /dev/null
-roles:
-- [mon.a, mon.b, mon.c, osd.0, osd.1]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
+++ /dev/null
-roles:
-- [mon.a, mon.c, mon.e, osd.0]
-- [mon.b, mon.d, mon.f, osd.1]
-openstack:
-- volumes: # attached to each instance
- count: 1
- size: 10 # GB
+++ /dev/null
-roles:
-- [mon.a, mon.d, mon.g, osd.0]
-- [mon.b, mon.e, mon.h]
-- [mon.c, mon.f, mon.i, osd.1]
-openstack:
-- volumes: # attached to each instance
- count: 1
- size: 10 # GB
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../mon_kv_backend
\ No newline at end of file
+++ /dev/null
-../basic/msgr
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 500
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - slow request
- - .*clock.*skew.*
- - clocks not synchronized
-- mon_clock_skew_check:
- expect-skew: false
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon.b:
- clock offset: 10
-tasks:
-- install:
-- ceph:
- wait-for-healthy: false
- log-whitelist:
- - slow request
- - .*clock.*skew.*
- - clocks not synchronized
-- mon_clock_skew_check:
- expect-skew: true
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- mon_recovery:
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, osd.2, client.0]
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-
-overrides:
- ceph:
- fs: xfs
- conf:
- osd:
- filestore xfs extsize: true
-
-tasks:
-- install:
-- ceph:
-- workunit:
- clients:
- all:
- - rados/test_alloc_hint.sh
+++ /dev/null
-roles:
-- [mon.0, osd.0, osd.1, osd.2, osd.3, osd.4, osd.5, client.0]
-openstack:
-- volumes: # attached to each instance
- count: 6
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- global:
- osd max object name len: 460
- osd max object namespace len: 64
-- ceph_objectstore_tool:
- objects: 20
+++ /dev/null
-roles:
-- [mon.0, osd.0, osd.1, client.0]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- fs: xfs
-- exec:
- client.0:
- - ceph_test_filejournal
+++ /dev/null
-roles:
-- [mon.0, osd.0, osd.1, client.0]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- global:
- journal aio: true
-- filestore_idempotent:
+++ /dev/null
-roles:
-- [mon.0, osd.0, osd.1, client.0]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- fs: xfs
-- filestore_idempotent:
+++ /dev/null
-roles:
-- [mon.0, osd.0, osd.1, client.0]
-tasks:
-- install:
-- workunit:
- clients:
- all:
- - objectstore/test_fuse.sh
-
+++ /dev/null
-roles:
-- [mon.0, osd.0, osd.1, client.0]
-tasks:
-- install:
-- exec:
- client.0:
- - mkdir $TESTDIR/kvtest && cd $TESTDIR/kvtest && ceph_test_keyvaluedb
- - rm -rf $TESTDIR/kvtest
+++ /dev/null
-roles:
-- [mon.0, osd.0, osd.1, client.0]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- fs: xfs
-- workunit:
- clients:
- all:
- - osdc/stress_objectcacher.sh
+++ /dev/null
-roles:
-- [mon.0, osd.0, osd.1, client.0]
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- exec:
- client.0:
- - mkdir $TESTDIR/ostest && cd $TESTDIR/ostest && ulimit -c 0 && ulimit -Sn 4096 && ceph_test_objectstore --gtest_filter=-*/3
- - rm -rf $TESTDIR/ostest
+++ /dev/null
-roles:
-- [mon.0, mds.a, osd.0, osd.1, osd.2, client.0, client.1]
-tasks:
-- install:
-- ceph:
- conf:
- global:
- osd max object name len: 460
- osd max object namespace len: 64
- debug client: 20
- debug mds: 20
- debug ms: 1
-- exec:
- client.0:
- - ceph osd pool create data_cache 4
- - ceph osd tier add cephfs_data data_cache
- - ceph osd tier cache-mode data_cache writeback
- - ceph osd tier set-overlay cephfs_data data_cache
- - ceph osd pool set data_cache hit_set_type bloom
- - ceph osd pool set data_cache hit_set_count 8
- - ceph osd pool set data_cache hit_set_period 3600
- - ceph osd pool set data_cache min_read_recency_for_promote 0
-- ceph-fuse:
-- exec:
- client.0:
- - sudo chmod 777 $TESTDIR/mnt.0/
- - dd if=/dev/urandom of=$TESTDIR/mnt.0/foo bs=1M count=5
- - ls -al $TESTDIR/mnt.0/foo
- - truncate --size 0 $TESTDIR/mnt.0/foo
- - ls -al $TESTDIR/mnt.0/foo
- - dd if=/dev/urandom of=$TESTDIR/mnt.0/foo bs=1M count=5
- - ls -al $TESTDIR/mnt.0/foo
- - cp $TESTDIR/mnt.0/foo /tmp/foo
- - sync
- - rados -p data_cache ls -
- - sleep 10
- - rados -p data_cache ls -
- - rados -p data_cache cache-flush-evict-all
- - rados -p data_cache ls -
- - sleep 1
-- exec:
- client.1:
- - hexdump -C /tmp/foo | head
- - hexdump -C $TESTDIR/mnt.1/foo | head
- - cmp $TESTDIR/mnt.1/foo /tmp/foo
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, osd.2, client.0]
-tasks:
-- install:
-- workunit:
- clients:
- all:
- - post-file.sh
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
- - osd.2
- - client.0
-tasks:
-- install:
-- ceph:
- conf:
- global:
- osd max object name len: 460
- osd max object namespace len: 64
-- exec:
- client.0:
- - ceph osd pool create base-pool 4
- - ceph osd pool create cache-pool 4
- - ceph osd tier add base-pool cache-pool
- - ceph osd tier cache-mode cache-pool writeback
- - ceph osd tier set-overlay base-pool cache-pool
- - dd if=/dev/urandom of=$TESTDIR/foo bs=1M count=1
- - rbd import --image-format 2 $TESTDIR/foo base-pool/bar
- - rbd snap create base-pool/bar@snap
- - rados -p base-pool cache-flush-evict-all
- - rbd export base-pool/bar $TESTDIR/bar
- - rbd export base-pool/bar@snap $TESTDIR/snap
- - cmp $TESTDIR/foo $TESTDIR/bar
- - cmp $TESTDIR/foo $TESTDIR/snap
- - rm $TESTDIR/foo $TESTDIR/bar $TESTDIR/snap
+++ /dev/null
-# verify #13098 fix
-roles:
-- [mon.a, osd.0, osd.1, osd.2, client.0]
-overrides:
- ceph:
- log-whitelist:
- - is full
-tasks:
-- install:
-- ceph:
- conf:
- global:
- osd max object name len: 460
- osd max object namespace len: 64
-- exec:
- client.0:
- - ceph osd pool create ec-ca 1 1
- - ceph osd pool create ec 1 1 erasure default
- - ceph osd tier add ec ec-ca
- - ceph osd tier cache-mode ec-ca readproxy
- - ceph osd tier set-overlay ec ec-ca
- - ceph osd pool set ec-ca hit_set_type bloom
- - ceph osd pool set-quota ec-ca max_bytes 20480000
- - ceph osd pool set-quota ec max_bytes 20480000
- - ceph osd pool set ec-ca target_max_bytes 20480000
- - timeout 30 rados -p ec-ca bench 30 write || true
+++ /dev/null
-roles:
-- [mon.0, osd.0, osd.1, client.0]
-tasks:
-- install:
-- exec:
- client.0:
- - ceph_test_async_driver
- - ceph_test_msgr
-openstack:
- - machine:
- disk: 40 # GB
- ram: 15000 # MB
- cpus: 1
- volumes: # attached to each instance
- count: 0
- size: 1 # GB
-overrides:
- ceph:
- conf:
- client:
- debug ms: 20
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
- - osd.2
- - client.0
-- - osd.3
- - osd.4
- - osd.5
-tasks:
-- install:
-- ceph:
- conf:
- osd:
- osd debug reject backfill probability: .3
- osd min pg log entries: 25
- osd max pg log entries: 100
- osd max object name len: 460
- osd max object namespace len: 64
-- exec:
- client.0:
- - sudo ceph osd pool create foo 64
- - rados -p foo bench 60 write -b 1024 --no-cleanup
- - sudo ceph osd pool set foo size 3
- - sudo ceph osd out 0 1
-- sleep:
- duration: 60
-- exec:
- client.0:
- - sudo ceph osd in 0 1
-- sleep:
- duration: 60
+++ /dev/null
-os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
-overrides:
- install:
- ceph:
- flavor: notcmalloc
- debuginfo: true
- ceph:
- conf:
- global:
- osd heartbeat grace: 40
- debug deliberately leak memory: true
- osd max object name len: 460
- osd max object namespace len: 64
- valgrind:
- mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
- osd: [--tool=memcheck]
-roles:
-- [mon.0, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- expect_valgrind_errors: true
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
- - client.a
-openstack:
- - volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- ceph:
-- admin_socket:
- osd.0:
- version:
- git_version:
- help:
- config show:
- config set filestore_dump_file /tmp/foo:
- perf dump:
- perf schema:
- get_heap_property tcmalloc.max_total_thread_cache_bytes:
- set_heap_property tcmalloc.max_total_thread_cache_bytes 67108864:
- set_heap_property tcmalloc.max_total_thread_cache_bytes 33554432:
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - had wrong client addr
- - had wrong cluster addr
- - must scrub before tier agent can activate
-- workunit:
- clients:
- all:
- - cephtool
- - mon/pool_ops.sh
+++ /dev/null
-roles:
-- - mon.0
- - osd.0
- - osd.1
- - osd.2
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-
-overrides:
- ceph:
- conf:
- osd:
- debug osd: 5
-
-tasks:
-- install:
-- ceph:
-- divergent_priors:
+++ /dev/null
-roles:
-- - mon.0
- - osd.0
- - osd.1
- - osd.2
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-
-overrides:
- ceph:
- conf:
- osd:
- debug osd: 5
-
-tasks:
-- install:
-- ceph:
-- divergent_priors2:
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
-openstack:
- - volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
-- dump_stuck:
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
- - osd.3
-openstack:
- - volumes: # attached to each instance
- count: 4
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - objects unfound and apparently lost
-- ec_lost_unfound:
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - objects unfound and apparently lost
-- rep_lost_unfound_delete:
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - objects unfound and apparently lost
-- lost_unfound:
+++ /dev/null
-roles:
-- - mon.0
- - mon.1
- - mon.2
- - osd.0
- - osd.1
- - osd.2
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
-- workunit:
- clients:
- all:
- - mon/test_mon_config_key.py
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
- - osd.2
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- config:
- global:
- osd pool default min size : 1
- osd:
- debug monc: 1
- debug ms: 1
-- mon_seesaw:
-- ceph_manager.create_pool:
- kwargs:
- pool_name: test
- pg_num: 1
-- ceph_manager.wait_for_clean:
- kwargs:
- timeout: 60
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- ceph:
-- mon_thrash:
- revive_delay: 20
- thrash_delay: 1
-- workunit:
- clients:
- all:
- - mon/workloadgen.sh
- env:
- LOADGEN_NUM_OSDS: "5"
- VERBOSE: "1"
- DURATION: "600"
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- conf:
- osd:
- osd min pg log entries: 5
-- osd_backfill:
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
- - osd.3
-openstack:
- - volumes: # attached to each instance
- count: 4
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- conf:
- osd:
- osd min pg log entries: 5
- osd_fast_fail_on_connection_refused: false
-- osd_recovery.test_incomplete_pgs:
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- conf:
- osd:
- osd min pg log entries: 5
- osd_fast_fail_on_connection_refused: false
-- osd_recovery:
+++ /dev/null
-roles:
-- - mon.0
- - mon.1
- - mon.2
- - osd.0
- - osd.1
- - osd.2
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- config:
- global:
- osd pool default min size : 1
- log-whitelist:
- - objects unfound and apparently lost
-- peer:
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
- - osd.2
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - slow request
-- exec:
- client.0:
- - sudo ceph osd pool create foo 128 128
- - sleep 5
- - sudo ceph tell osd.0 injectargs -- --osd-inject-failure-on-pg-removal
- - sudo ceph osd pool delete foo foo --yes-i-really-really-mean-it
-- ceph.wait_for_failure: [osd.0]
-- exec:
- client.0:
- - sudo ceph osd down 0
-- ceph.restart: [osd.0]
-- ceph.healthy:
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
- - osd.2
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - had wrong client addr
- - had wrong cluster addr
- - reached quota
-- workunit:
- clients:
- all:
- - rados/test_rados_tool.sh
+++ /dev/null
-roles:
-- - mon.0
- - mon.1
- - mon.2
- - osd.0
- - osd.1
- - osd.2
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - no reply from
-- full_sequential:
- - radosbench:
- clients: [client.0]
- time: 30
- - rebuild_mondb:
- - radosbench:
- clients: [client.0]
- time: 30
+++ /dev/null
-roles:
-- - mon.0
- - osd.0
- - osd.1
- - osd.2
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-
-overrides:
- ceph:
- conf:
- osd:
- debug osd: 5
-
-tasks:
-- install:
-- ceph:
-- reg11184:
+++ /dev/null
-roles:
-- [mon.0]
-- [osd.0, osd.1, osd.2, client.0]
-
-tasks:
-- install:
-- ceph:
- fs: xfs
-- resolve_stuck_peering:
-
+++ /dev/null
-roles:
-- - mon.0
- - mon.1
- - mon.2
- - osd.0
- - osd.1
- - osd.2
- - mds.a
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - had wrong client addr
- conf:
- client.rest0:
- debug ms: 1
- debug objecter: 20
- debug rados: 20
-- rest-api: [client.0]
-- workunit:
- clients:
- all:
- - rest/test.py
+++ /dev/null
-overrides:
- ceph:
- fs: ext4
- conf:
- global:
- osd max object name len: 460
- osd max object namespace len: 64
-roles:
-- [mon.a, osd.0, osd.1, osd.2, client.0]
-tasks:
-- install:
-- ceph:
-- workunit:
- clients:
- all:
- - rados/test_envlibrados_for_rocksdb.sh
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
- - osd.2
-- - osd.3
- - osd.4
- - osd.5
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
-- thrashosds:
- op_delay: 30
- clean_interval: 120
- chance_down: .5
-- workunit:
- clients:
- all:
- - rados/load-gen-mix-small.sh
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
- - osd.2
-- - osd.3
- - osd.4
- - osd.5
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - slow request
-- exec:
- client.0:
- - sudo ceph osd pool create base 4
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add base cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay base cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 60
- - sudo ceph osd pool set cache target_max_objects 500
-- background_exec:
- mon.a:
- - while true
- - do sleep 30
- - echo proxy
- - sudo ceph osd tier cache-mode cache proxy
- - sleep 10
- - sudo ceph osd pool set cache cache_target_full_ratio .001
- - echo cache-try-flush-evict-all
- - rados -p cache cache-try-flush-evict-all
- - sleep 5
- - echo cache-flush-evict-all
- - rados -p cache cache-flush-evict-all
- - sleep 5
- - echo remove overlay
- - sudo ceph osd tier remove-overlay base
- - sleep 20
- - echo add writeback overlay
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd pool set cache cache_target_full_ratio .8
- - sudo ceph osd tier set-overlay base cache
- - sleep 30
- - sudo ceph osd tier cache-mode cache readproxy
- - done
-- rados:
- clients: [client.0]
- pools: [base]
- max_seconds: 600
- ops: 400000
- objects: 10000
- size: 1024
- op_weights:
- read: 100
- write: 100
- delete: 50
- copy_from: 50
+++ /dev/null
-roles:
-- - mon.0
- - mon.1
- - mon.2
- - osd.0
- - osd.1
- - osd.2
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- config:
- global:
- osd pool default min size : 1
- client:
- debug ms: 1
- debug objecter: 20
- debug rados: 20
- log-whitelist:
- - objects unfound and apparently lost
-- watch_notify_same_primary:
- clients: [client.0]
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../basic/msgr
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 500
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-roles:
-- [osd.0, osd.1, osd.2, client.0, mon.a]
-- [osd.3, osd.4, osd.5, mon.b]
-- [osd.6, osd.7, osd.8, mon.c]
-- [osd.9, osd.10, osd.11]
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
+++ /dev/null
-../thrash/fs
\ No newline at end of file
+++ /dev/null
-../../../mon_kv_backend/leveldb.yaml
\ No newline at end of file
+++ /dev/null
-../thrash/msgr-failures
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - slow request
- conf:
- osd:
- osd debug reject backfill probability: .3
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 6
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- min_in: 8
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- conf:
- mon:
- mon osd pool ec fast read: 1
- osd:
- osd debug reject backfill probability: .3
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 2
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- min_in: 4
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- mon min osdmap epochs: 2
- osd:
- osd map cache size: 1
- osd scrub min interval: 60
- osd scrub max interval: 120
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - osd_map_cache_size
-- thrashosds:
- timeout: 1800
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- chance_test_map_discontinuity: 0.5
- min_in: 8
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- osd:
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 9
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 3
- chance_pgpnum_fix: 1
- min_in: 8
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- conf:
- osd:
- osd scrub min interval: 60
- osd scrub max interval: 120
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 2
- chance_pgpnum_fix: 1
- min_in: 8
+++ /dev/null
-../../../../erasure-code/ec-rados-plugin=lrc-k=4-m=2-l=3.yaml
\ No newline at end of file
+++ /dev/null
-arch: x86_64
+++ /dev/null
-../thrash/clusters
\ No newline at end of file
+++ /dev/null
-../thrash/fs
\ No newline at end of file
+++ /dev/null
-../../../mon_kv_backend/leveldb.yaml
\ No newline at end of file
+++ /dev/null
-../thrash/msgr-failures
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-../../../distros/supported
\ No newline at end of file
+++ /dev/null
-../thrash/thrashers
\ No newline at end of file
+++ /dev/null
-../../../../erasure-code/ec-rados-plugin=isa-k=2-m=1.yaml
\ No newline at end of file
+++ /dev/null
-../../../../clusters/fixed-4.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 4
- size: 10 # GB
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../mon_kv_backend/leveldb.yaml
\ No newline at end of file
+++ /dev/null
-../thrash/msgr-failures
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - slow request
- conf:
- osd:
- osd debug reject backfill probability: .3
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 3
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- min_in: 8
+++ /dev/null
-../../../../erasure-code/ec-rados-plugin=shec-k=4-m=3-c=2.yaml
\ No newline at end of file
+++ /dev/null
-../thrash/clusters
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- mon osd pool ec fast read: true
+++ /dev/null
-../thrash/fs
\ No newline at end of file
+++ /dev/null
-../../../mon_kv_backend/leveldb.yaml
\ No newline at end of file
+++ /dev/null
-../thrash/msgr-failures
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- conf:
- osd:
- osd debug reject backfill probability: .3
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 2
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- min_in: 4
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- conf:
- mon:
- mon osd pool ec fast read: 1
- osd:
- osd debug reject backfill probability: .3
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 3
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- min_in: 4
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- mon min osdmap epochs: 2
- osd:
- osd map cache size: 1
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 5
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - osd_map_cache_size
-- thrashosds:
- timeout: 1800
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- chance_test_map_discontinuity: 0.5
- min_in: 4
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- osd:
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 9
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 3
- chance_pgpnum_fix: 1
- min_in: 4
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- conf:
- osd:
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 4
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 2
- chance_pgpnum_fix: 1
- min_in: 4
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- enable experimental unrecoverable data corrupting features: '*'
- thrashosds:
- disable_objectstore_tool_tests: true
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- pool_snaps: true
- ec_pool: true
- erasure_code_use_hacky_overwrites: true
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
+++ /dev/null
-../../../../erasure-code/ec-rados-plugin=jerasure-k=2-m=1.yaml
\ No newline at end of file
+++ /dev/null
-../../../../erasure-code/ec-rados-plugin=jerasure-k=3-m=1.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- full_sequential:
- - radosbench:
- clients: [client.0]
- time: 150
- unique_pool: true
- ec_pool: true
- - radosbench:
- clients: [client.0]
- time: 150
- unique_pool: true
- ec_pool: true
- - radosbench:
- clients: [client.0]
- time: 150
- unique_pool: true
- ec_pool: true
- - radosbench:
- clients: [client.0]
- time: 150
- unique_pool: true
- ec_pool: true
- - radosbench:
- clients: [client.0]
- time: 150
- unique_pool: true
- ec_pool: true
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- enable experimental unrecoverable data corrupting features: '*'
- thrashosds:
- disable_objectstore_tool_tests: true
-tasks:
-- rados:
- clients: [client.0]
- ops: 400000
- max_seconds: 600
- max_in_flight: 64
- objects: 1024
- size: 16384
- ec_pool: true
- erasure_code_use_hacky_overwrites: true
- fast_read: true
- op_weights:
- read: 100
- write: 100
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 400000
- max_seconds: 600
- max_in_flight: 64
- objects: 1024
- size: 16384
- ec_pool: true
- fast_read: true
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- enable experimental unrecoverable data corrupting features: '*'
- thrashosds:
- disable_objectstore_tool_tests: true
-tasks:
-- rados:
- clients: [client.0]
- ops: 400000
- max_seconds: 600
- max_in_flight: 64
- objects: 1024
- size: 16384
- ec_pool: true
- erasure_code_use_hacky_overwrites: true
- op_weights:
- read: 100
- write: 100
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 400000
- max_seconds: 600
- max_in_flight: 64
- objects: 1024
- size: 16384
- ec_pool: true
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- enable experimental unrecoverable data corrupting features: '*'
- thrashosds:
- disable_objectstore_tool_tests: true
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- erasure_code_use_hacky_overwrites: true
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
+++ /dev/null
-../../../../overrides/2-size-1-min-size.yaml
\ No newline at end of file
+++ /dev/null
-../../../../overrides/2-size-2-min-size.yaml
\ No newline at end of file
+++ /dev/null
-../../../../overrides/3-size-2-min-size.yaml
\ No newline at end of file
+++ /dev/null
-../../../../overrides/short_pg_log.yaml
\ No newline at end of file
+++ /dev/null
-../../../../clusters/fixed-2.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- osd debug randomize hobject sort order: true
+++ /dev/null
-../basic/msgr
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 2500
- ms tcp read timeout: 5
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
- osd:
- osd heartbeat use min delay socket: true
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 2500
- ms inject delay type: osd
- ms inject delay probability: .005
- ms inject delay max: 1
- ms inject internal delays: .002
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-../../../mon_kv_backend/rocksdb.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- conf:
- osd:
- osd debug reject backfill probability: .3
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd max backfills: 3
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- mon min osdmap epochs: 2
- osd:
- osd map cache size: 1
- osd scrub min interval: 60
- osd scrub max interval: 120
- osd scrub during recovery: false
- osd max backfills: 6
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - osd_map_cache_size
-- thrashosds:
- timeout: 1800
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- chance_test_map_discontinuity: 0.5
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- osd:
- osd scrub min interval: 60
- osd scrub max interval: 120
- journal throttle high multiple: 2
- journal throttle max multiple: 10
- filestore queue throttle high multiple: 2
- filestore queue throttle max multiple: 10
- osd max backfills: 9
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 3
- chance_pgpnum_fix: 1
-openstack:
-- volumes:
- size: 50
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- conf:
- osd:
- osd scrub min interval: 60
- osd scrub max interval: 120
- filestore odsync write: true
- osd max backfills: 2
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 2
- chance_pgpnum_fix: 1
+++ /dev/null
-overrides:
- ceph:
- conf:
- client.0:
- admin socket: /var/run/ceph/ceph-$name.asok
-tasks:
-- radosbench:
- clients: [client.0]
- time: 150
-- admin_socket:
- client.0:
- objecter_requests:
- test: "http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}"
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - must scrub before tier agent can activate
-tasks:
-- exec:
- client.0:
- - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
- - sudo ceph osd pool create base 4 4 erasure teuthologyprofile
- - sudo ceph osd pool set base min_size 2
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add base cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay base cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 60
- - sudo ceph osd pool set cache target_max_objects 5000
-- rados:
- clients: [client.0]
- pools: [base]
- ops: 10000
- objects: 6600
- max_seconds: 1200
- size: 1024
- op_weights:
- read: 100
- write: 100
- delete: 50
- copy_from: 50
+++ /dev/null
-overrides:
- ceph:
- crush_tunables: firefly
- log-whitelist:
- - must scrub before tier agent can activate
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create base 4
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add base cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay base cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 60
- - sudo ceph osd pool set cache target_max_objects 250
- - sudo ceph osd pool set cache min_read_recency_for_promote 2
- - sudo ceph osd pool set cache min_write_recency_for_promote 2
-- rados:
- clients: [client.0]
- pools: [base]
- ops: 4000
- objects: 500
- op_weights:
- read: 100
- write: 100
- delete: 50
- copy_from: 50
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - must scrub before tier agent can activate
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create base 4
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add base cache
- - sudo ceph osd tier cache-mode cache readproxy
- - sudo ceph osd tier set-overlay base cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 3600
- - sudo ceph osd pool set cache target_max_objects 250
-- rados:
- clients: [client.0]
- pools: [base]
- ops: 4000
- objects: 500
- pool_snaps: true
- op_weights:
- read: 100
- write: 100
- delete: 50
- copy_from: 50
- flush: 50
- try_flush: 50
- evict: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - must scrub before tier agent can activate
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create base 4
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add base cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay base cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 3600
- - sudo ceph osd pool set cache target_max_objects 250
- - sudo ceph osd pool set cache min_read_recency_for_promote 0
- - sudo ceph osd pool set cache min_write_recency_for_promote 0
-- rados:
- clients: [client.0]
- pools: [base]
- ops: 4000
- objects: 500
- pool_snaps: true
- op_weights:
- read: 100
- write: 100
- delete: 50
- copy_from: 50
- flush: 50
- try_flush: 50
- evict: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
-openstack:
- - machine:
- ram: 15000 # MB
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - must scrub before tier agent can activate
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create base 4
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add base cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay base cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 3600
- - sudo ceph osd pool set cache target_max_objects 250
- - sudo ceph osd pool set cache min_read_recency_for_promote 2
-- rados:
- clients: [client.0]
- pools: [base]
- ops: 4000
- objects: 500
- op_weights:
- read: 100
- write: 100
- delete: 50
- copy_from: 50
- flush: 50
- try_flush: 50
- evict: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - must scrub before tier agent can activate
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create base 4
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add base cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay base cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 3600
- - sudo ceph osd pool set cache min_read_recency_for_promote 0
- - sudo ceph osd pool set cache min_write_recency_for_promote 0
-- rados:
- clients: [client.0]
- pools: [base]
- ops: 4000
- objects: 500
- op_weights:
- read: 100
- write: 100
- delete: 50
- copy_from: 50
- flush: 50
- try_flush: 50
- evict: 50
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- osd deep scrub update digest min age: 0
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- pool_snaps: true
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - reached quota
- crush_tunables: hammer
- conf:
- client:
- debug ms: 1
- debug objecter: 20
- debug rados: 20
-tasks:
-- workunit:
- clients:
- client.0:
- - rados/test.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- client.0:
- debug ms: 1
- debug objecter: 20
- debug rados: 20
-tasks:
-- full_sequential:
- - radosbench:
- clients: [client.0]
- time: 120
- - radosbench:
- clients: [client.0]
- time: 120
- - radosbench:
- clients: [client.0]
- time: 120
- - radosbench:
- clients: [client.0]
- time: 120
- - radosbench:
- clients: [client.0]
- time: 120
- - radosbench:
- clients: [client.0]
- time: 120
+++ /dev/null
-overrides:
- ceph:
- crush_tunables: jewel
-tasks:
-- rados:
- clients: [client.0]
- ops: 400000
- max_seconds: 600
- max_in_flight: 64
- objects: 1024
- size: 16384
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- write_fadvise_dontneed: true
- op_weights:
- write: 100
+++ /dev/null
-openstack:
- - machine:
- disk: 100 # GB
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-meta:
-- desc: |
- Run ceph on two nodes,
- with a separate client-only node.
- Use xfs beneath the osds.
-overrides:
- ceph:
- conf:
- mon:
- mon warn on legacy crush tunables: false
- fs: xfs
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - mds.a
- - osd.0
- - osd.1
- - osd.2
-- - osd.3
- - osd.4
- - osd.5
-- - client.0
+++ /dev/null
-meta:
-- desc: install ceph/jewel latest
-tasks:
-- install:
- branch: jewel
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-- print: "**** done install jewel"
-- ceph:
-- print: "**** done ceph"
+++ /dev/null
-meta:
-- desc: |
- install upgrade ceph/-x on one node only
- 1st half
- restart : osd.0,1,2,3,4,5
-tasks:
-- install.upgrade:
- osd.0:
-- print: "**** done install.upgrade osd.0"
-- ceph.restart:
- daemons: [osd.0, osd.1, osd.2, osd.3, osd.4, osd.5]
-- print: "**** done ceph.restart 1st half"
+++ /dev/null
-meta:
-- desc: |
- randomly kill and revive osd
- small chance to increase the number of pgs
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - log bound mismatch
-tasks:
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
-- print: "**** done thrashosds 3-thrash"
+++ /dev/null
-meta:
-- desc: |
- restart mon.a so it is upgraded to -x
-tasks:
-- ceph.restart:
- daemons: [mon.a]
- wait-for-healthy: false
- wait-for-osds-up: true
-- print: "**** done ceph.restart mon.a"
+++ /dev/null
-meta:
-- desc: |
- run basic cls tests for rbd
-tasks:
-- workunit:
- branch: jewel
- clients:
- client.0:
- - cls/test_cls_rbd.sh
-- print: "**** done cls/test_cls_rbd.sh 5-workload"
+++ /dev/null
-meta:
-- desc: |
- run basic import/export cli tests for rbd
-tasks:
-- workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
-- print: "**** done rbd/import_export.sh 5-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool,
- using only reads, writes, and deletes
-tasks:
-- full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- write_append_excl: false
- op_weights:
- read: 45
- write: 45
- delete: 10
-- print: "**** done rados/readwrite 5-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshot operations
-tasks:
-- full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
-- print: "**** done rados/snaps-few-objects 5-workload"
+++ /dev/null
-meta:
-- desc: |
- restart mon.b so it is upgraded to -x
-tasks:
-- ceph.restart:
- daemons: [mon.b]
- wait-for-healthy: false
- wait-for-osds-up: true
-- print: "**** done ceph.restart mon.b 6-next-mon"
+++ /dev/null
-meta:
-- desc: |
- run randomized correctness test for rados operations
- generate write load with rados bench
-tasks:
-- full_sequential:
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
-- print: "**** done radosbench 7-workload"
+++ /dev/null
-meta:
-- desc: |
- librbd C and C++ api tests
-tasks:
-- workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/test_librbd.sh
-- print: "**** done rbd/test_librbd.sh 7-workload"
+++ /dev/null
-meta:
-- desc: |
- restart mon.c so it is upgraded to -x
- as all mons were upgraded, the ceph cluster is expected to reach quorum
-tasks:
-- ceph.restart:
- daemons: [mon.c]
- wait-for-healthy: false
- wait-for-osds-up: true
-- print: "**** done ceph.restart mon.c 8-next-mon"
-- ceph.wait_for_mon_quorum: [a, b, c]
-- print: "**** done wait_for_mon_quorum 8-next-mon"
+++ /dev/null
-meta:
-- desc: |
- librbd python api tests
-tasks:
-- workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/test_librbd_python.sh
-- print: "**** done rbd/test_librbd_python.sh 9-workload"
+++ /dev/null
-meta:
-- desc: |
- swift api tests for rgw
-tasks:
-- rgw:
- client.0:
- default_idle_timeout: 300
-- print: "**** done rgw 9-workload"
-- swift:
- client.0:
- rgw_server: client.0
-- print: "**** done swift 9-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshot operations
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
+++ /dev/null
-tasks:
-- install:
-- ceph:
+++ /dev/null
-../../../../clusters/fixed-2.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 10 # GB
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../mon_kv_backend
\ No newline at end of file
+++ /dev/null
-../basic/msgr
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../../../config/rados.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- mon_recovery:
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - reached quota
- conf:
- client:
- debug ms: 1
- debug objecter: 20
- debug rados: 20
- debug monc: 20
-tasks:
-- workunit:
- timeout: 6h
- clients:
- client.0:
- - rados/test.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - cls
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- lockdep: true
+++ /dev/null
-os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
-overrides:
- install:
- ceph:
- flavor: notcmalloc
- debuginfo: true
- ceph:
- conf:
- global:
- osd heartbeat grace: 40
- valgrind:
- mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
- osd: [--tool=memcheck]
- mds: [--tool=memcheck]
+++ /dev/null
-tasks:
-- install:
-- ceph:
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add rbd cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay rbd cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 60
- - sudo ceph osd pool set cache target_max_objects 250
+++ /dev/null
-../../../../clusters/fixed-1.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 500
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/run_cli_tests.sh
-
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - cls/test_cls_rbd.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_lock_fence.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd_python.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
+++ /dev/null
-../basic/clusters
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd default format: 2
- rbd default features: 61
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd default format: 1
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd default format: 2
- rbd default features: 125
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd default format: 2
- rbd default features: 1
+++ /dev/null
-../basic/fs
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 500
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
- - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile
- - sudo ceph osd pool set datapool debug_white_box_testing_ec_overwrites true
-
-overrides:
- ceph:
- conf:
- global:
- enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites
- client:
- rbd default data pool: datapool
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create datapool 4
-
-overrides:
- ceph:
- conf:
- client:
- rbd default data pool: datapool
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add rbd cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay rbd cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 60
- - sudo ceph osd pool set cache target_max_objects 250
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/copy.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/import_export.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- client:
- rbd cache: false
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- client:
- rbd cache: true
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- client:
- rbd cache: true
- rbd cache max dirty: 0
+++ /dev/null
-../../../../clusters/fixed-3.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd clone copy on read: true
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd skip partial discard: true
+++ /dev/null
-../basic/fs
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
- log-whitelist:
- - wrongly marked me down
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
- - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile
- - sudo ceph osd pool set datapool debug_white_box_testing_ec_overwrites true
-
-overrides:
- ceph:
- conf:
- global:
- enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites
- client:
- rbd default data pool: datapool
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create datapool 4
-
-overrides:
- ceph:
- conf:
- client:
- rbd default data pool: datapool
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add rbd cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay rbd cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 60
- - sudo ceph osd pool set cache target_max_objects 250
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "1"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "61"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "125"
+++ /dev/null
-tasks:
-- rbd_fsx:
- clients: [client.0]
- ops: 20000
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- env:
- RBD_FEATURES: "1"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- env:
- RBD_FEATURES: "61"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- env:
- RBD_FEATURES: "125"
+++ /dev/null
-tasks:
-- rbd_fio:
- client.0:
- fio-io-size: 80%
- formats: [2]
- features: [[layering],[layering,exclusive-lock,object-map]]
- io-engine: rbd
- test-clone-io: 1
- rw: randrw
- runtime: 900
+++ /dev/null
-tasks:
-- install:
-- ceph:
+++ /dev/null
-../../../../clusters/fixed-3.yaml
\ No newline at end of file
+++ /dev/null
-../../qemu/clusters/openstack.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-tasks:
-- parallel:
- - io_workload
- - op_workload
-io_workload:
- sequential:
- - qemu:
- client.0:
- clone: true
- type: block
- num_rbd: 2
- test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/run_xfstests_qemu.sh
-exclude_arch: armv7l
+++ /dev/null
-op_workload:
- sequential:
- - workunit:
- clients:
- client.0:
- - rbd/qemu_dynamic_features.sh
- env:
- IMAGE_NAME: client.0.1-clone
+++ /dev/null
-op_workload:
- sequential:
- - workunit:
- clients:
- client.0:
- - rbd/qemu_rebuild_object_map.sh
- env:
- IMAGE_NAME: client.0.1-clone
+++ /dev/null
-../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: run two ceph clusters and install rbd-mirror
-tasks:
-- install:
- extra_packages: [rbd-mirror]
-- ceph:
- cluster: cluster1
-- ceph:
- cluster: cluster2
+++ /dev/null
-meta:
-- desc: 2 ceph clusters with 3 mons and 3 osds each
-roles:
-- - cluster1.mon.a
- - cluster1.mon.b
- - cluster1.osd.0
- - cluster1.osd.1
- - cluster1.osd.2
- - cluster2.mon.c
- - cluster1.client.0
- - cluster2.client.0
-- - cluster1.mon.c
- - cluster2.mon.a
- - cluster2.mon.b
- - cluster2.osd.0
- - cluster2.osd.1
- - cluster2.osd.2
- - cluster1.client.mirror
- - cluster2.client.mirror
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-../basic/fs
\ No newline at end of file
+++ /dev/null
-../basic/msgr-failures
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: run one rbd-mirror daemon per cluster
-overrides:
- ceph:
- conf:
- client.mirror:
- # override to make these names predictable
- admin socket: /var/run/ceph/$cluster-$name.asok
- pid file: /var/run/ceph/$cluster-$name.pid
-tasks:
-- rbd-mirror:
- client: cluster1.client.mirror
-- rbd-mirror:
- client: cluster2.client.mirror
+++ /dev/null
-meta:
-- desc: run the rbd_mirror_stress.sh workunit to test the rbd-mirror daemon
-tasks:
-- workunit:
- clients:
- cluster1.client.mirror: [rbd/rbd_mirror_stress.sh]
- env:
- # override workunit setting of CEPH_ARGS='--cluster'
- CEPH_ARGS: ''
- RBD_MIRROR_USE_EXISTING_CLUSTER: '1'
- RBD_MIRROR_USE_RBD_MIRROR: '1'
- timeout: 6h
+++ /dev/null
-meta:
-- desc: run the rbd_mirror.sh workunit to test the rbd-mirror daemon
-tasks:
-- workunit:
- clients:
- cluster1.client.mirror: [rbd/rbd_mirror.sh]
- env:
- # override workunit setting of CEPH_ARGS='--cluster'
- CEPH_ARGS: ''
- RBD_MIRROR_USE_EXISTING_CLUSTER: '1'
- RBD_MIRROR_USE_RBD_MIRROR: '1'
+++ /dev/null
-../thrash/base
\ No newline at end of file
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2]
-- [mon.b, osd.3, osd.4, osd.5]
-- [client.0]
+++ /dev/null
-../../thrash/clusters/openstack.yaml
\ No newline at end of file
+++ /dev/null
-../thrash/fs
\ No newline at end of file
+++ /dev/null
-../thrash/msgr-failures
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-../thrash/thrashers
\ No newline at end of file
+++ /dev/null
-os_type: ubuntu
-overrides:
- install:
- ceph:
- extra_packages: [rbd-nbd]
-tasks:
-- rbd_fsx:
- clients: [client.0]
- ops: 6000
- nbd: True
- holebdy: 512
- punch_holes: true
- readbdy: 512
- truncbdy: 512
- writebdy: 512
+++ /dev/null
-os_type: ubuntu
-overrides:
- install:
- ceph:
- extra_packages: [rbd-nbd]
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/rbd-nbd.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- client:
- rbd cache: false
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- client:
- rbd cache: true
+++ /dev/null
-tasks:
-- install:
-- ceph:
- conf:
- client:
- rbd cache: true
- rbd cache max dirty: 0
+++ /dev/null
-../../../../clusters/fixed-3.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - machine:
- disk: 40 # GB
- ram: 30000 # MB
- cpus: 1
- volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd default format: 2
- rbd default features: 61
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd default format: 2
- rbd default features: 125
+++ /dev/null
-../basic/fs
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
- log-whitelist:
- - wrongly marked me down
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
- - sudo ceph osd pool delete rbd rbd --yes-i-really-really-mean-it
- - sudo ceph osd pool create rbd 4 4 erasure teuthologyprofile
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add rbd cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay rbd cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 60
- - sudo ceph osd pool set cache target_max_objects 250
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2
- - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile
- - sudo ceph osd pool set datapool debug_white_box_testing_ec_overwrites true
-
-overrides:
- ceph:
- conf:
- global:
- enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites
- client:
- rbd default data pool: datapool
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create datapool 4
-
-overrides:
- ceph:
- conf:
- client:
- rbd default data pool: datapool
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add rbd cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay rbd cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 60
- - sudo ceph osd pool set cache target_max_objects 250
+++ /dev/null
-tasks:
-- qemu:
- all:
- type: block
- num_rbd: 2
- test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/run_xfstests_qemu.sh
-exclude_arch: armv7l
+++ /dev/null
-tasks:
-- qemu:
- all:
- clone: true
- test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/workunits/suites/bonnie.sh
-exclude_arch: armv7l
+++ /dev/null
-tasks:
-- qemu:
- all:
- clone: true
- test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/workunits/suites/fsstress.sh
-exclude_arch: armv7l
+++ /dev/null
-tasks:
-- qemu:
- all:
- test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/workunits/suites/iozone.sh
- image_size: 20480
-exclude_arch: armv7l
+++ /dev/null
-tasks:
-- qemu:
- all:
- clone: true
- type: block
- num_rbd: 2
- test: http://git.ceph.com/?p=ceph.git;a=blob_plain;f=qa/run_xfstests_qemu.sh
-exclude_arch: armv7l
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
-- workunit:
- clients:
- all: [rbd/test_admin_socket.sh]
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
-- cram:
- clients:
- client.0:
- - http://git.ceph.com/?p=ceph.git;a=blob_plain;hb={branch};f=src/test/cli-integration/rbd/formatted-output.t
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
-- workunit:
- clients:
- all: [rbd/merge_diff.sh]
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
-- workunit:
- clients:
- all: [rbd/permissions.sh]
+++ /dev/null
-exclude_arch: armv7l
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- client:
- rbd cache: false
-- workunit:
- clients:
- all: [rbd/qemu-iotests.sh]
+++ /dev/null
-exclude_arch: armv7l
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- client:
- rbd cache: true
-- workunit:
- clients:
- all: [rbd/qemu-iotests.sh]
+++ /dev/null
-exclude_arch: armv7l
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- client:
- rbd cache: true
- rbd cache max dirty: 0
-- workunit:
- clients:
- all: [rbd/qemu-iotests.sh]
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- client:
- rbd validate pool: false
-- workunit:
- clients:
- all:
- - mon/rbd_snaps_ops.sh
-
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
-- workunit:
- clients:
- all: [rbd/test_rbd_mirror.sh]
+++ /dev/null
-roles:
-- [client.0]
-tasks:
-- install:
-- workunit:
- clients:
- all: [rbd/test_rbdmap_RBDMAPFILE.sh]
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- client:
- rbd cache: false
-- workunit:
- clients:
- all: [rbd/read-flags.sh]
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- client:
- rbd cache: true
-- workunit:
- clients:
- all: [rbd/read-flags.sh]
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- client:
- rbd cache: true
- rbd cache max dirty: 0
-- workunit:
- clients:
- all: [rbd/read-flags.sh]
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, client.0]
-tasks:
-- install:
-- ceph:
- fs: xfs
-- workunit:
- clients:
- all: [rbd/verify_pool.sh]
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 2
- size: 30 # GB
+++ /dev/null
-tasks:
-- install:
-- ceph:
+++ /dev/null
-../../../../clusters/fixed-2.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - machine:
- disk: 40 # GB
- ram: 8000 # MB
- cpus: 1
- volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-tasks:
-- exec:
- client.0:
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add rbd cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay rbd cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 60
- - sudo ceph osd pool set cache target_max_objects 250
-- thrashosds:
- timeout: 1200
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-tasks:
-- thrashosds:
- timeout: 1200
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/journal.sh
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "61"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "61"
-overrides:
- ceph:
- conf:
- client:
- rbd clone copy on read: true
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "125"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "1"
+++ /dev/null
-tasks:
-- rbd_fsx:
- clients: [client.0]
- ops: 6000
-overrides:
- ceph:
- conf:
- client:
- rbd cache: true
+++ /dev/null
-tasks:
-- rbd_fsx:
- clients: [client.0]
- ops: 6000
-overrides:
- ceph:
- conf:
- client:
- rbd cache: true
- rbd cache max dirty: 0
+++ /dev/null
-tasks:
-- rbd_fsx:
- clients: [client.0]
- ops: 6000
-overrides:
- ceph:
- conf:
- client:
- rbd cache: true
- rbd clone copy on read: true
+++ /dev/null
-tasks:
-- rbd_fsx:
- clients: [client.0]
- ops: 6000
- journal_replay: True
+++ /dev/null
-tasks:
-- rbd_fsx:
- clients: [client.0]
- ops: 6000
-overrides:
- ceph:
- conf:
- client:
- rbd cache: false
+++ /dev/null
-tasks:
-- install:
-- ceph:
+++ /dev/null
-../basic/clusters
\ No newline at end of file
+++ /dev/null
-../basic/fs
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
-overrides:
- install:
- ceph:
- flavor: notcmalloc
- debuginfo: true
- rbd_fsx:
- valgrind: ["--tool=memcheck"]
- workunit:
- env:
- VALGRIND: "memcheck"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "1"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "61"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "125"
+++ /dev/null
-tasks:
-- rbd_fsx:
- clients: [client.0]
- size: 134217728
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- env:
- RBD_FEATURES: "1"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- env:
- RBD_FEATURES: "61"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- env:
- RBD_FEATURES: "125"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/test_rbd_mirror.sh
+++ /dev/null
-roles:
-- - mon.a
- - mds.a
- - osd.0
- - osd.1
-- - mon.b
- - mon.c
- - osd.2
- - osd.3
- - client.0
-
-tasks:
-- install:
-- ceph:
- fs: xfs
- log-whitelist:
- - wrongly marked me down
- conf:
- client.rest0:
- debug ms: 1
- debug objecter: 20
- debug rados: 20
-- rest-api: [client.0]
-- workunit:
- clients:
- client.0:
- - rest/test.py
+++ /dev/null
-../../../../clusters/fixed-2.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- rgw:
- frontend: apache
+++ /dev/null
-overrides:
- rgw:
- frontend: civetweb
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- debug rgw: 20
- rgw:
- frontend: civetweb
+++ /dev/null
-../../../rgw_pool_type
\ No newline at end of file
+++ /dev/null
-# Amazon/S3.pm (cpan) not available as an rpm
-os_type: ubuntu
-tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- workunit:
- clients:
- client.0:
- - rgw/s3_bucket_quota.pl
+++ /dev/null
-# Amazon::S3 is not available on el7
-os_type: ubuntu
-tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- workunit:
- clients:
- client.0:
- - rgw/s3_multipart_upload.pl
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- s3readwrite:
- client.0:
- rgw_server: client.0
- readwrite:
- bucket: rwtest
- readers: 10
- writers: 3
- duration: 300
- files:
- num: 10
- size: 2000
- stddev: 500
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- s3roundtrip:
- client.0:
- rgw_server: client.0
- roundtrip:
- bucket: rttest
- readers: 10
- writers: 3
- duration: 300
- files:
- num: 10
- size: 2000
- stddev: 500
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- s3tests:
- client.0:
- force-branch: ceph-master
- rgw_server: client.0
-overrides:
- ceph:
- conf:
- client:
- rgw lc debug interval: 10
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- swift:
- client.0:
- rgw_server: client.0
+++ /dev/null
-# Amazon/S3.pm (cpan) not available as an rpm
-os_type: ubuntu
-tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- workunit:
- clients:
- client.0:
- - rgw/s3_user_quota.pl
+++ /dev/null
-roles:
-- [mon.a, osd.0, client.0]
-- [osd.1, osd.2, osd.3, client.1]
-tasks:
-- install:
-- ceph:
- conf:
- client:
- debug ms: 1
- rgw gc obj min wait: 15
- rgw data log window: 30
- osd:
- debug ms: 1
- debug objclass : 20
- client.0:
- rgw region: region0
- rgw zone: r0z0
- rgw region root pool: .rgw.region.0
- rgw zone root pool: .rgw.zone.0
- rgw gc pool: .rgw.gc.0
- rgw user uid pool: .users.uid.0
- rgw user keys pool: .users.0
- rgw log data: True
- rgw log meta: True
- client.1:
- rgw region: region0
- rgw zone: r0z1
- rgw region root pool: .rgw.region.0
- rgw zone root pool: .rgw.zone.1
- rgw gc pool: .rgw.gc.1
- rgw user uid pool: .users.uid.1
- rgw user keys pool: .users.1
- rgw log data: False
- rgw log meta: False
-- rgw:
- realm:
- realm0
- regions:
- region0:
- api name: api1
- is master: True
- master zone: r0z0
- zones: [r0z0, r0z1]
- client.0:
- system user:
- name: client0-system-user
- access key: 0te6NH5mcdcq0Tc5i8i2
- secret key: Oy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
- client.1:
- system user:
- name: client1-system-user
- access key: 1te6NH5mcdcq0Tc5i8i3
- secret key: Py4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXw
-- radosgw-agent:
- client.0:
- max-entries: 10
- src: client.0
- dest: client.1
-- sleep:
- duration: 30
-- radosgw-admin:
+++ /dev/null
-roles:
-- [mon.a, osd.0, osd.1, osd.2, client.0]
-- [mon.b, mon.c, osd.3, osd.4, osd.5, client.1]
-tasks:
-- install:
-- ceph:
- conf:
- client:
- debug ms: 1
- rgw gc obj min wait: 15
- osd:
- debug ms: 1
- debug objclass : 20
- client.0:
- rgw region: region0
- rgw zone: r0z1
- rgw region root pool: .rgw.region.0
- rgw zone root pool: .rgw.zone.0
- rgw gc pool: .rgw.gc.0
- rgw user uid pool: .users.uid.0
- rgw user keys pool: .users.0
- rgw log data: True
- rgw log meta: True
- client.1:
- rgw region: region1
- rgw zone: r1z1
- rgw region root pool: .rgw.region.1
- rgw zone root pool: .rgw.zone.1
- rgw gc pool: .rgw.gc.1
- rgw user uid pool: .users.uid.1
- rgw user keys pool: .users.1
- rgw log data: False
- rgw log meta: False
-- rgw:
- realm:
- realm0
- regions:
- region0:
- api name: api1
- is master: True
- master zone: r0z1
- zones: [r0z1]
- region1:
- api name: api1
- is master: False
- master zone: r1z1
- zones: [r1z1]
- client.0:
- system user:
- name: client0-system-user
- access key: 0te6NH5mcdcq0Tc5i8i2
- secret key: Oy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
- client.1:
- system user:
- name: client1-system-user
- access key: 1te6NH5mcdcq0Tc5i8i3
- secret key: Py4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXw
-- radosgw-agent:
- client.0:
- src: client.0
- dest: client.1
- metadata-only: true
-- radosgw-admin:
+++ /dev/null
-roles:
-- [mon.a, osd.0]
-- [client.0, osd.1, osd.2, osd.3]
-tasks:
-- install:
-- ceph:
- conf:
- client:
- debug ms: 1
- rgw gc obj min wait: 15
- osd:
- debug ms: 1
- debug objclass : 20
-- rgw:
- client.0:
-- radosgw-admin:
+++ /dev/null
-overrides:
- s3readwrite:
- s3:
- user_id: s3readwrite-test-user
- display_name: test user for the s3readwrite tests
- email: tester@inktank
- access_key: 2te6NH5mcdcq0Tc5i8i4
- secret_key: Qy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXx
- readwrite:
- deterministic_file_names: True
- duration: 30
- bucket: testbucket
- files:
- num: 10
- size: 2000
- stddev: 500
-roles:
-- [mon.a, osd.0, osd.1, osd.2, client.0]
-- [mon.b, mon.c, osd.3, osd.4, osd.5, client.1]
-
-tasks:
-- install:
-- ceph:
- conf:
- client:
- rgw region: default
- rgw zone: r1z1
- rgw region root pool: .rgw
- rgw zone root pool: .rgw
- rgw domain root: .rgw
- rgw gc pool: .rgw.gc
- rgw user uid pool: .users.uid
- rgw user keys pool: .users
-- rgw:
- realm:
- realm0
- regions:
- default:
- api name: api1
- is master: true
- master zone: r1z1
- zones: [r1z1]
- client.0:
- system user:
- name: nr-system
- access key: 0te6NH5mcdcq0Tc5i8i2
- secret key: Oy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
-- s3readwrite:
- client.0:
- extra_args: ['--no-cleanup']
- s3:
- delete_user: False
- readwrite:
- writers: 1
- readers: 0
-- rgw:
- realm:
- realm0
- regions:
- default:
- api name: api1
- is master: true
- master zone: r1z1
- zones: [r1z1]
- client.1:
- system user:
- name: r2-system
- access key: 1te6NH5mcdcq0Tc5i8i3
- secret key: Py4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXw
-- s3readwrite:
- client.1:
- s3:
- create_user: False
- readwrite:
- writers: 0
- readers: 2
-
+++ /dev/null
-overrides:
- rgw:
- frontend: apache
+++ /dev/null
-overrides:
- rgw:
- frontend: civetweb
+++ /dev/null
-overrides:
- ceph:
- fs: xfs
- conf:
- osd:
- osd sloppy crc: true
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- debug rgw: 20
- rgw:
- frontend: civetweb
+++ /dev/null
-../../../rgw_pool_type/
\ No newline at end of file
+++ /dev/null
-../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../clusters/fixed-2.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- rgw:
- frontend: apache
+++ /dev/null
-overrides:
- rgw:
- frontend: civetweb
+++ /dev/null
-overrides:
- ceph:
- fs: xfs
- conf:
- osd:
- osd sloppy crc: true
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-../../../objectstore
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- debug rgw: 20
- rgw compression type: random
- rgw:
- frontend: civetweb
+++ /dev/null
-../../../rgw_pool_type/
\ No newline at end of file
+++ /dev/null
-os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
-tasks:
-- install:
- flavor: notcmalloc
- debuginfo: true
-- ceph:
-- rgw:
- client.0:
- valgrind: [--tool=memcheck]
-- s3tests:
- client.0:
- force-branch: ceph-master
- rgw_server: client.0
-overrides:
- ceph:
- conf:
- global:
- osd_min_pg_log_entries: 10
- osd_max_pg_log_entries: 10
- client:
- rgw lc debug interval: 10
+++ /dev/null
-os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
-tasks:
-- install:
- flavor: notcmalloc
- debuginfo: true
-- ceph:
- conf:
- client.0:
- rgw region: zero
- rgw zone: r0z1
- rgw region root pool: .rgw.region.0
- rgw zone root pool: .rgw.zone.0
- rgw gc pool: .rgw.gc.0
- rgw user uid pool: .users.uid.0
- rgw user keys pool: .users.0
- rgw log data: True
- rgw log meta: True
- client.1:
- rgw region: one
- rgw zone: r1z1
- rgw region root pool: .rgw.region.1
- rgw zone root pool: .rgw.zone.1
- rgw gc pool: .rgw.gc.1
- rgw user uid pool: .users.uid.1
- rgw user keys pool: .users.1
- rgw log data: False
- rgw log meta: False
-- rgw:
- default_idle_timeout: 300
- realm:
- realm0
- regions:
- zero:
- api name: api1
- is master: True
- master zone: r0z1
- zones: [r0z1]
- one:
- api name: api1
- is master: False
- master zone: r1z1
- zones: [r1z1]
- client.0:
- valgrind: [--tool=memcheck]
- system user:
- name: client0-system-user
- access key: 1te6NH5mcdcq0Tc5i8i2
- secret key: 1y4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
- client.1:
- valgrind: [--tool=memcheck]
- system user:
- name: client1-system-user
- access key: 0te6NH5mcdcq0Tc5i8i2
- secret key: Oy4IOauQoL18Gp2zM7lC1vLmoawgqcYPbYGcWfXv
-- radosgw-agent:
- client.0:
- src: client.0
- dest: client.1
- metadata-only: true
-- s3tests:
- client.0:
- force-branch: ceph-master
- idle_timeout: 300
- rgw_server: client.0
-overrides:
- ceph:
- conf:
- client:
- rgw lc debug interval: 10
+++ /dev/null
-os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
-tasks:
-- install:
- flavor: notcmalloc
- debuginfo: true
-- ceph:
-- rgw:
- client.0:
- valgrind: [--tool=memcheck]
-- swift:
- client.0:
- rgw_server: client.0
+++ /dev/null
-overrides:
- ceph:
- conf:
- osd:
- lockdep: true
- mon:
- lockdep: true
+++ /dev/null
-os_type: centos # xenial valgrind buggy, see http://tracker.ceph.com/issues/18126
-overrides:
- install:
- ceph:
- flavor: notcmalloc
- debuginfo: true
- ceph:
- conf:
- global:
- osd heartbeat grace: 40
- valgrind:
- mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
- osd: [--tool=memcheck]
- mds: [--tool=memcheck]
+++ /dev/null
-roles:
-- [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1]
-- [samba.0, client.0, client.1]
+++ /dev/null
-../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-# we currently can't install Samba on RHEL; need a gitbuilder and code updates
-os_type: ubuntu
-
-tasks:
-- install:
-- install:
- project: samba
- extra_packages: ['samba']
-- ceph:
+++ /dev/null
-tasks:
-- ceph-fuse: [client.0]
-- samba:
- samba.0:
- ceph: "{testdir}/mnt.0"
-
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-kernel:
- client:
- branch: testing
-tasks:
-- kclient: [client.0]
-- samba:
- samba.0:
- ceph: "{testdir}/mnt.0"
-
+++ /dev/null
-tasks:
-- samba:
+++ /dev/null
-tasks:
-- localdir: [client.0]
-- samba:
- samba.0:
- ceph: "{testdir}/mnt.0"
+++ /dev/null
-tasks:
-- cifs-mount:
- client.1:
- share: ceph
-- workunit:
- clients:
- client.1:
- - suites/dbench.sh
+++ /dev/null
-tasks:
-- cifs-mount:
- client.1:
- share: ceph
-- workunit:
- clients:
- client.1:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- cifs-mount:
- client.1:
- share: ceph
-- workunit:
- clients:
- client.1:
- - kernel_untar_build.sh
-
+++ /dev/null
-tasks:
-- pexec:
- client.1:
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.lock
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.fdpass
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.unlink
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.attr
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.trans2
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.negnowait
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.dir1
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.deny1
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.deny2
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.deny3
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.denydos
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.ntdeny1
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.ntdeny2
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.tcon
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.tcondev
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.vuid
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.rw1
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.open
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.defer_open
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.xcopy
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.rename
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.properties
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.mangle
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.openattr
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.chkpath
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.secleak
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.disconnect
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.samba3error
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.smb
-# - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.bench-holdcon
-# - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.bench-holdopen
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.bench-readwrite
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.bench-torture
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.scan-pipe_number
- - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.scan-ioctl
-# - /usr/local/samba/bin/smbtorture --password=ubuntu //localhost/ceph base.scan-maxfid
+++ /dev/null
-../../../../clusters/fixed-1.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - machine:
- disk: 40 # GB
- ram: 8000 # MB
- cpus: 1
- volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-meta:
-- desc: |
- Run ceph-deploy cli tests on one node
- and verify that all the cli commands work and the cluster can reach
- HEALTH_OK state (implicitly verifying the daemons via init).
-tasks:
-- ceph_deploy.single_node_test: null
+++ /dev/null
-../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: xfs
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - suites/blogbench.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: btrfs
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: btrfs
-- ceph-fuse: [client.0]
-- workunit:
- clients:
- all:
- - suites/iozone.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: xfs
- conf:
- mds:
- debug mds: 20
- debug ms: 1
- client:
- debug client: 20
- debug ms: 1
- fuse default permissions: false
- fuse set user groups: true
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-tasks:
-- install:
-- ceph:
- fs: btrfs
-- kclient:
-- workunit:
- clients:
- all:
- - direct_io
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-tasks:
-- install:
-- ceph:
- fs: xfs
-- kclient:
-- workunit:
- clients:
- all:
- - suites/dbench.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-tasks:
-- install:
-- ceph:
- fs: xfs
-- kclient:
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
-tasks:
-- install:
-- ceph:
- fs: xfs
-- kclient:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- debug ms: 1
- debug client: 20
- mds:
- debug ms: 1
- debug mds: 20
-tasks:
-- install:
-- ceph:
- fs: btrfs
-- ceph-fuse:
-- workunit:
- clients:
- client.0:
- - libcephfs/test.sh
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - reached quota
- conf:
- global:
- ms inject delay max: 1
- ms inject delay probability: 0.005
- ms inject delay type: mon
- ms inject internal delays: 0.002
- ms inject socket failures: 2500
-tasks:
-- install: null
-- ceph:
- fs: xfs
-- mon_thrash:
- revive_delay: 90
- thrash_delay: 1
- thrash_many: true
-- workunit:
- clients:
- client.0:
- - rados/test.sh
+++ /dev/null
-tasks:
-- install: null
-- ceph:
- fs: ext4
- log-whitelist:
- - reached quota
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- chance_pgnum_grow: 2
- chance_pgpnum_fix: 1
- timeout: 1200
-- workunit:
- clients:
- client.0:
- - rados/test.sh
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject delay max: 1
- ms inject delay probability: 0.005
- ms inject delay type: osd
- ms inject internal delays: 0.002
- ms inject socket failures: 2500
-tasks:
-- install: null
-- ceph:
- fs: xfs
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- chance_pgnum_grow: 2
- chance_pgpnum_fix: 1
- timeout: 1200
-- full_sequential:
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
+++ /dev/null
-tasks:
-- install: null
-- ceph:
- fs: btrfs
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- chance_pgnum_grow: 2
- chance_pgpnum_fix: 1
- timeout: 1200
-- exec:
- client.0:
- - sudo ceph osd pool create base 4
- - sudo ceph osd pool create cache 4
- - sudo ceph osd tier add base cache
- - sudo ceph osd tier cache-mode cache writeback
- - sudo ceph osd tier set-overlay base cache
- - sudo ceph osd pool set cache hit_set_type bloom
- - sudo ceph osd pool set cache hit_set_count 8
- - sudo ceph osd pool set cache hit_set_period 3600
- - sudo ceph osd pool set cache target_max_objects 250
-- rados:
- clients:
- - client.0
- objects: 500
- op_weights:
- copy_from: 50
- delete: 50
- evict: 50
- flush: 50
- read: 100
- rollback: 50
- snap_create: 50
- snap_remove: 50
- try_flush: 50
- write: 100
- ops: 4000
- pool_snaps: true
- pools:
- - base
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: xfs
-- workunit:
- clients:
- client.0:
- - cls
+++ /dev/null
-tasks:
-- install: null
-- ceph:
- fs: xfs
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- chance_pgnum_grow: 3
- chance_pgpnum_fix: 1
- timeout: 1200
-- rados:
- clients:
- - client.0
- ec_pool: true
- max_in_flight: 64
- max_seconds: 600
- objects: 1024
- op_weights:
- append: 100
- copy_from: 50
- delete: 50
- read: 100
- rmattr: 25
- rollback: 50
- setattr: 25
- snap_create: 50
- snap_remove: 50
- write: 0
- ops: 400000
- size: 16384
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: btrfs
- log-whitelist:
- - wrongly marked me down
-- ceph-fuse:
-- workunit:
- clients:
- client.0:
- - rados/test_python.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: ext4
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - rados/load-gen-mix.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: xfs
-- ceph-fuse:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd.sh
- env:
- RBD_FEATURES: "1"
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: xfs
-- ceph-fuse:
-- workunit:
- clients:
- client.0:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd cache: true
- global:
- ms inject socket failures: 5000
-tasks:
-- install: null
-- ceph:
- fs: xfs
-- thrashosds:
- timeout: 1200
-- rbd_fsx:
- clients:
- - client.0
- ops: 2000
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: btrfs
-- ceph-fuse:
-- workunit:
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- env:
- RBD_FEATURES: "1"
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms die on skipped message: false
- client:
- rbd default features: 5
-tasks:
-- install:
-- ceph:
- fs: btrfs
-- rbd:
- all:
- image_size: 20480
-- workunit:
- clients:
- all:
- - suites/iozone.sh
+++ /dev/null
-overrides:
- rgw:
- ec-data-pool: true
- cache-pools: true
- frontend: civetweb
-tasks:
-- install:
-- ceph:
- fs: btrfs
-- rgw: [client.0]
-- s3tests:
- client.0:
- rgw_server: client.0
-overrides:
- ceph:
- conf:
- client:
- rgw lc debug interval: 10
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: xfs
-- rgw: [client.0]
-- s3tests:
- client.0:
- rgw_server: client.0
-overrides:
- ceph:
- conf:
- client:
- rgw lc debug interval: 10
+++ /dev/null
-tasks:
-- install:
-- ceph:
- fs: ext4
-- rgw: [client.0]
-- swift:
- client.0:
- rgw_server: client.0
+++ /dev/null
-../../../../distros/supported/centos_7.2.yaml
\ No newline at end of file
+++ /dev/null
-os_type: ubuntu
-os_version: "16.04"
+++ /dev/null
-roles:
-- [mon.a, osd.0]
-- [osd.1, osd.2]
-- [mds.a, osd.3]
-- [mon.b, client.0]
-tasks:
-- ssh-keys:
-- ceph-deploy:
-- systemd:
-- workunit:
- clients:
- all:
- - rados/load-gen-mix.sh
+++ /dev/null
-../../../../clusters/fixed-3-cephfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - snaps
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- kclient:
-- workunit:
- clients:
- all:
- - suites/fsx.sh
+++ /dev/null
-roles:
-- [mon.0, mds.a, osd.0]
-- [mon.1, osd.1]
-- [mon.2, osd.2]
-- [osd.3]
-- [osd.4]
-- [osd.5]
-- [osd.6]
-- [osd.7]
-- [osd.8]
-- [osd.9]
-- [osd.10]
-- [osd.11]
-- [osd.12]
-- [osd.13]
-- [osd.14]
-- [osd.15]
-- [client.0]
+++ /dev/null
-roles:
-- [mon.0, mds.a, osd.0, osd.1, osd.2]
-- [mon.1, mon.2, client.0]
+++ /dev/null
-roles:
-- [mon.0, mds.a, osd.0]
-- [mon.1, osd.1]
-- [mon.2, osd.2]
-- [osd.3]
-- [osd.4]
-- [osd.5]
-- [osd.6]
-- [osd.7]
-- [client.0]
+++ /dev/null
-../../../../fs/btrfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- op_delay: 1
- chance_down: 10
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
- chance_down: 50
+++ /dev/null
-tasks:
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - suites/bonnie.sh
+++ /dev/null
-tasks:
-- ceph-fuse:
-- workunit:
- clients:
- all:
- - suites/iozone.sh
+++ /dev/null
-tasks:
-- radosbench:
- clients: [client.0]
- time: 1800
+++ /dev/null
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- op_weights:
- read: 45
- write: 45
- delete: 10
+++ /dev/null
-../../../distros/supported/
\ No newline at end of file
+++ /dev/null
-roles:
- - [mon.0, client.0]
-tasks:
- - install:
- # branch has precedence over sha1
- branch: hammer
- sha1: e5b6eea91cc37434f78a987d2dd1d3edd4a23f3f # dumpling
- - exec:
- client.0:
- - ceph --version | grep 'version 0.94'
+++ /dev/null
-roles:
- - [client.0]
-tasks:
- - install:
- tag: v0.94.1
- - exec:
- client.0:
- - ceph --version | grep 'version 0.94.1'
- - install.upgrade:
- client.0:
- tag: v0.94.3
- - exec:
- client.0:
- - ceph --version | grep 'version 0.94.3'
+++ /dev/null
-roles:
- - [mon.0, client.0]
-tasks:
- - install:
- # tag has precedence over branch and sha1
- tag: v0.94.1
- branch: firefly
- sha1: e5b6eea91cc37434f78a987d2dd1d3edd4a23f3f # dumpling
- - exec:
- client.0:
- - ceph --version | grep 'version 0.94.1'
+++ /dev/null
-roles:
- - [mon.0, client.0]
+++ /dev/null
-../../../distros/supported/
\ No newline at end of file
+++ /dev/null
-tasks:
- - install:
- - tests:
+++ /dev/null
-tasks:
-- teuthology_integration:
+++ /dev/null
-roles:
-- - ceph.mon.a
- - ceph.mon.b
- - backup.osd.0
- - backup.osd.1
- - backup.osd.2
- - backup.client.0
-- - backup.mon.a
- - ceph.osd.0
- - ceph.osd.1
- - ceph.osd.2
- - ceph.client.0
- - client.1
- - osd.3
-tasks:
-- install:
-- ceph:
- cluster: backup
-- ceph:
-- workunit:
- clients:
- ceph.client.0: [true.sh]
- backup.client.0: [true.sh]
+++ /dev/null
-roles:
-- - backup.mon.a
- - backup.mon.b
- - backup.osd.0
- - backup.osd.1
- - backup.osd.2
-- - backup.mon.c
- - backup.osd.3
- - backup.osd.4
- - backup.osd.5
- - backup.client.0
-tasks:
-- install:
-- ceph:
- cluster: backup
-- thrashosds:
- cluster: backup
-- workunit:
- clients:
- all: [true.sh]
+++ /dev/null
-overrides:
- ceph:
- log-whitelist:
- - failed to encode map
- conf:
- mon:
- mon warn on legacy crush tunables: false
-roles:
-- - ceph.mon.a
- - ceph.mon.b
- - backup.osd.0
- - backup.osd.1
- - backup.osd.2
- - backup.client.0
-- - backup.mon.a
- - ceph.osd.0
- - ceph.osd.1
- - ceph.osd.2
- - ceph.client.0
- - client.1
- - osd.3
-tasks:
-- install:
- branch: infernalis
-- ceph:
- cluster: backup
-- ceph:
-- workunit:
- clients:
- backup.client.0: [true.sh]
- ceph.client.0: [true.sh]
-- install.upgrade:
- ceph.mon.a:
- branch: jewel
- backup.mon.a:
- branch: jewel
-- ceph.restart: [ceph.mon.a, ceph.mon.b, ceph.osd.0, ceph.osd.1, ceph.osd.2, osd.3]
-- exec:
- ceph.client.0:
- - ceph --version | grep -F 'version 10.'
- client.1:
- - ceph --cluster backup --version | grep -F 'version 10.'
- backup.client.0:
- # cli upgraded
- - ceph --cluster backup --id 0 --version | grep -F 'version 10.'
- - ceph --version | grep -F 'version 10.'
- # backup cluster mon not upgraded
- - ceph --cluster backup --id 0 tell mon.a version | grep -F 'version 9.2.'
- - ceph tell mon.a version | grep -F 'version 10.'
+++ /dev/null
-roles:
-- - backup.mon.a
- - osd.0
- - osd.1
- - osd.2
- - client.0
- - backup.client.0
-- - mon.a
- - backup.osd.0
- - backup.osd.1
- - backup.osd.2
- - client.1
- - backup.client.1
-tasks:
-- install:
-- workunit:
- clients:
- all: [true.sh]
-- workunit:
- clients:
- backup.client.1: [true.sh]
+++ /dev/null
-../../../../fs/xfs.yaml
\ No newline at end of file
+++ /dev/null
-roles:
- - [mon.0, client.0]
+++ /dev/null
-tasks:
- - tests:
+++ /dev/null
-tasks:
- - nop:
-
+++ /dev/null
-../../../distros/supported/
\ No newline at end of file
+++ /dev/null
-# this runs s3tests against rgw, using civetweb
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2, client.0]
-- [mon.b, osd.3, osd.4, osd.5, client.1]
-
-tasks:
-- install:
- branch: master
-- ceph:
-- rgw: [client.0]
-- s3tests:
- client.0:
- rgw_server: client.0
- force-branch: master
-overrides:
- ceph:
- fs: xfs
- conf:
- client:
- debug rgw: 20
- rgw lc debug interval: 10
- rgw:
- ec-data-pool: false
- frontend: civetweb
+++ /dev/null
-# this runs s3tests against rgw, using mod_fastcgi
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2, client.0]
-- [mon.b, osd.3, osd.4, osd.5, client.1]
-
-tasks:
-- install:
- branch: master
-- ceph:
-- rgw: [client.0]
-- s3tests:
- client.0:
- rgw_server: client.0
- force-branch: master
-overrides:
- ceph:
- fs: xfs
- conf:
- client:
- debug rgw: 20
- rgw lc debug interval: 10
- rgw:
- ec-data-pool: false
- frontend: apache
+++ /dev/null
-# this runs s3tests against rgw, using mod_proxy_fcgi
-# the choice between uds or tcp with mod_proxy_fcgi depends on the distro
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2, client.0]
-- [mon.b, osd.3, osd.4, osd.5, client.1]
-
-tasks:
-- install:
- branch: master
-- ceph:
-- rgw: [client.0]
-- s3tests:
- client.0:
- rgw_server: client.0
- force-branch: master
-overrides:
- ceph:
- fs: xfs
- conf:
- client:
- debug rgw: 20
- rgw lc debug interval: 10
- rgw:
- ec-data-pool: false
- frontend: apache
- use_fcgi: true
+++ /dev/null
-roles:
- - [client.0]
-tasks:
-- install:
-- workunit:
- clients:
- all:
- - true.sh
+++ /dev/null
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2]
-- [mon.b, mds.a, osd.3, osd.4, osd.5]
-- [client.0]
+++ /dev/null
-overrides:
- ceph:
- fs: btrfs
- conf:
- osd:
- osd op thread timeout: 60
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 5000
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- ms inject socket failures: 500
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/blogbench.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/bonnie.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/dbench-short.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/dbench.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/ffsb.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/fio.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/fsx.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/fsync-tester.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/iogen.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/iozone-sync.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/iozone.sh
+++ /dev/null
-tasks:
-- install:
-- ceph:
-- tgt:
-- iscsi:
-- workunit:
- clients:
- all:
- - suites/pjd.sh
+++ /dev/null
-roles:
-- - mon.a
- - osd.0
- - osd.1
-- - mon.b
- - mon.c
- - osd.2
- - osd.3
-- - client.0
-overrides:
- ceph:
- log-whitelist:
- - failed to encode map
- fs: xfs
+++ /dev/null
-tasks:
-- install:
- branch: hammer
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-- print: "**** done install hammer"
-upgrade_workload:
- sequential:
- - install.upgrade:
- exclude_packages: ['ceph-test-dbg']
- client.0:
- - print: "**** done install.upgrade client.0"
+++ /dev/null
-overrides:
- ceph:
- conf:
- client:
- rbd default features: 13
-tasks:
-- exec:
- client.0:
- - "cp $(which ceph_test_librbd_api) $TESTDIR/ceph_test_librbd_api"
-- sequential:
- - upgrade_workload
-- ceph:
-- print: "**** done ceph"
-- exec:
- client.0:
- - "cp --force $TESTDIR/ceph_test_librbd_api $(which ceph_test_librbd_api)"
- - "rm -rf $TESTDIR/ceph_test_librbd_api"
-- print: "**** done reverting to hammer ceph_test_librbd_api"
-- workunit:
- branch: hammer
- clients:
- client.0:
- - rbd/test_librbd_api.sh
- env:
- RBD_FEATURES: "13"
-- print: "**** done rbd/test_librbd_api.sh"
+++ /dev/null
-tasks:
-- sequential:
- - upgrade_workload
-- ceph:
-- print: "**** done ceph"
-- workunit:
- branch: hammer
- clients:
- client.0:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --image-feature layering,exclusive-lock,object-map
-- print: "**** done rbd/import_export.sh"
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
- - client.0
-- - client.1
-overrides:
- ceph:
- log-whitelist:
- - failed to encode map
- fs: xfs
- conf:
- client:
- rbd default features: 1
+++ /dev/null
-tasks:
-- install:
- branch: hammer
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-- print: "**** done install hammer"
-- install.upgrade:
- exclude_packages: ['ceph-test-dbg']
- client.1:
-- print: "**** done install.upgrade client.1"
-- ceph:
-- print: "**** done ceph"
+++ /dev/null
-tasks:
-- workunit:
- branch: hammer
- clients:
- client.0:
- - rbd/notify_master.sh
- client.1:
- - rbd/notify_slave.sh
- env:
- RBD_FEATURES: "13"
-- print: "**** done rbd: old librbd -> new librbd"
-- workunit:
- branch: hammer
- clients:
- client.0:
- - rbd/notify_slave.sh
- client.1:
- - rbd/notify_master.sh
- env:
- RBD_FEATURES: "13"
-- print: "**** done rbd: new librbd -> old librbd"
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
-- - client.0
-overrides:
- ceph:
- log-whitelist:
- - failed to encode map
- fs: xfs
+++ /dev/null
-tasks:
-- install:
- branch: jewel
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-- print: "**** done install jewel"
-upgrade_workload:
- sequential:
- - install.upgrade:
- exclude_packages: ['ceph-test', 'ceph-test-dbg']
- client.0:
- - print: "**** done install.upgrade to -x on client.0"
+++ /dev/null
-tasks:
-- exec:
- client.0:
- - "cp $(which ceph_test_librbd_api) $TESTDIR/ceph_test_librbd_api"
-- sequential:
- - upgrade_workload
-- ceph:
-- print: "**** done ceph"
-- exec:
- client.0:
- - "cp --force $TESTDIR/ceph_test_librbd_api $(which ceph_test_librbd_api)"
- - "rm -rf $TESTDIR/ceph_test_librbd_api"
-- print: "**** done reverting to jewel ceph_test_librbd_api"
-- workunit:
- branch: kraken
- clients:
- client.0:
- - rbd/test_librbd_api.sh
- env:
- RBD_FEATURES: "13"
-- print: "**** done rbd/test_librbd_api.sh"
+++ /dev/null
-tasks:
-- sequential:
- - upgrade_workload
-- ceph:
-- print: "**** done ceph"
-- workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --image-feature layering,exclusive-lock,object-map
-- print: "**** done rbd/import_export.sh"
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
- - client.0
-- - client.1
-overrides:
- ceph:
- log-whitelist:
- - failed to encode map
- fs: xfs
- conf:
- client:
- rbd default features: 1
-
+++ /dev/null
-tasks:
-- install:
- branch: jewel
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-- print: "**** done install jewel"
-- install.upgrade:
- exclude_packages: ['ceph-test', 'ceph-test-dbg']
- client.1:
-- print: "**** done install.upgrade to -x on client.0"
-- ceph:
-- print: "**** done ceph task"
+++ /dev/null
-tasks:
-- workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/notify_master.sh
- client.1:
- - rbd/notify_slave.sh
- env:
- RBD_FEATURES: "13"
-- print: "**** done rbd: old librbd -> new librbd"
-- workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/notify_slave.sh
- client.1:
- - rbd/notify_master.sh
- env:
- RBD_FEATURES: "13"
-- print: "**** done rbd: new librbd -> old librbd"
+++ /dev/null
-overrides:
- ceph:
- conf:
- mon:
- mon warn on legacy crush tunables: false
- mon debug unsafe allow tier with nonempty snaps: true
- log-whitelist:
- - wrongly marked me down
- - reached quota
-roles:
-- - mon.a
- - osd.0
- - osd.1
-- - mon.b
- - mon.c
- - osd.2
- - osd.3
-- - client.0
- - client.1
+++ /dev/null
-tasks:
-- install:
- branch: hammer
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-- print: "**** done hammer"
-- ceph:
- fs: xfs
-- install.upgrade:
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- osd.0:
- branch: jewel
- osd.2:
- branch: jewel
-- print: "*** client.0 upgraded packages to jewel"
-- parallel:
- - workload
- - upgrade-sequence
-- print: "**** done parallel"
+++ /dev/null
-workload:
- full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec task"
+++ /dev/null
-workload:
- full_sequential:
- - workunit:
- branch: hammer
- clients:
- client.0:
- - cls
- - print: "**** done cls 2-workload"
+++ /dev/null
-workload:
- full_sequential:
- - workunit:
- branch: hammer
- clients:
- client.0:
- - rados/load-gen-big.sh
- - print: "**** done rados/load-gen-big.sh 2-workload"
+++ /dev/null
-workload:
- full_sequential:
- - workunit:
- branch: hammer
- clients:
- client.0:
- - rbd/test_librbd.sh
- - print: "**** done rbd/test_librbd.sh 2-workload"
+++ /dev/null
-workload:
- full_sequential:
- - workunit:
- branch: hammer
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- - print: "**** done rbd/test_librbd_python.sh 2-workload"
+++ /dev/null
-upgrade-sequence:
- sequential:
- - ceph.restart:
- daemons: [osd.0, osd.1, osd.2, osd.3]
- wait-for-healthy: false
- wait-for-osds-up: true
- - ceph.restart:
- daemons: [mon.a, mon.b, mon.c]
- wait-for-healthy: false
- wait-for-osds-up: true
- - print: "**** done ceph.restart do not wait for healthy"
- - exec:
- mon.a:
- - sleep 300 # http://tracker.ceph.com/issues/17808
- - ceph osd set require_jewel_osds
- - ceph.healthy:
- - print: "**** done ceph.healthy"
+++ /dev/null
-upgrade-sequence:
- sequential:
- - ceph.restart:
- daemons: [osd.0, osd.1]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart:
- daemons: [osd.2, osd.3]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart:
- daemons: [mon.a]
- wait-for-healthy: false
- - sleep:
- duration: 60
- - print: "**** running mixed versions of osds and mons"
-#do we need to use "ceph osd crush tunables hammer" ?
- - exec:
- mon.b:
- - sudo ceph osd crush tunables hammer
- - print: "**** done ceph osd crush tunables hammer"
- - ceph.restart:
- daemons: [mon.b, mon.c]
- wait-for-healthy: false
- - sleep:
- duration: 30
- - exec:
- osd.0:
- - sleep 300 # http://tracker.ceph.com/issues/17808
- - ceph osd set require_jewel_osds
- - ceph.healthy:
- - sleep:
- duration: 60
+++ /dev/null
-tasks:
-- install.upgrade:
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- client.0:
- branch: jewel
+++ /dev/null
-../../../../releases/jewel.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
- - install.upgrade:
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- client.0:
- branch: jewel
- - print: "**** done install.upgrade client.0 to jewel"
- - install.upgrade:
- osd.0:
- osd.2:
- - print: "**** done install.upgrade daemons to x"
- - parallel:
- - workload2
- - upgrade-sequence2
- - print: "**** done parallel workload2 and upgrade-sequence2"
+++ /dev/null
-meta:
-- desc: |
- run randomized correctness test for rados operations
- on an erasure-coded pool
-workload2:
- full_sequential:
- - rados:
- erasure_code_profile:
- name: teuthologyprofile2
- k: 2
- m: 1
- ruleset-failure-domain: osd
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec task"
+++ /dev/null
-meta:
-- desc: |
- object class functional tests
-workload2:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.0:
- - cls
- - print: "**** done cls 2-workload"
+++ /dev/null
-meta:
-- desc: |
- generate read/write load with rados objects ranging from 1MB to 25MB
-workload2:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.0:
- - rados/load-gen-big.sh
- - print: "**** done rados/load-gen-big.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- librbd C and C++ api tests
-workload2:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/test_librbd.sh
- - print: "**** done rbd/test_librbd.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- librbd python api tests
-workload2:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- - print: "**** done rbd/test_librbd_python.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- upgrade the ceph cluster
-upgrade-sequence2:
- sequential:
- - ceph.restart:
- daemons: [mon.a, mon.b, mon.c, osd.0, osd.1, osd.2, osd.3]
- wait-for-healthy: false
- wait-for-osds-up: true
- - exec:
- mon.a:
- - ceph osd set require_kraken_osds
- - ceph.restart:
- daemons: [osd.0]
- - print: "**** done ceph.restart all"
+++ /dev/null
-meta:
-- desc: |
- upgrade the ceph cluster
- in two steps
- step one ordering: mon.a, osd.0, osd.1
- step two ordering: mon.b, mon.c, osd.2, osd.3
- ceph is expected to be in a healthy state after each step
-upgrade-sequence2:
- sequential:
- - ceph.restart:
- daemons: [mon.a]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart:
- daemons: [osd.0, osd.1]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - print: "**** running mixed versions of osds and mons"
- - exec:
- mon.b:
- - sudo ceph osd crush tunables jewel
- - print: "**** done ceph osd crush tunables jewel"
- - ceph.restart:
- daemons: [mon.b, mon.c]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart:
- daemons: [osd.2, osd.3]
- wait-for-healthy: false
- wait-for-osds-up: true
- - exec:
- mon.a:
- - ceph osd set require_kraken_osds
- - ceph.restart: [osd.3]
- - sleep:
- duration: 60
+++ /dev/null
-../../../../releases/kraken.yaml
\ No newline at end of file
+++ /dev/null
-tasks:
-- rados:
- clients: [client.1]
- ops: 4000
- objects: 50
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
-- print: "**** done 7-final-workload/rados-snaps-few-objects.yaml"
+++ /dev/null
-tasks:
- - workunit:
- clients:
- client.1:
- - rados/load-gen-mix.sh
- - print: "**** done 7-final-workload/rados_loadgenmix.yaml"
+++ /dev/null
-tasks:
- - sequential:
- - mon_thrash:
- revive_delay: 20
- thrash_delay: 1
- - workunit:
- branch: jewel
- clients:
- client.1:
- - rados/test-upgrade-v11.0.0.sh
- - print: "**** done rados/test-upgrade-v11.0.0.sh from 7-final-workload"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.1:
- - cls/test_cls_rbd.sh
-- print: "**** done 7-final-workload/rbd_cls.yaml"
+++ /dev/null
-tasks:
-- workunit:
- clients:
- client.1:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
-- print: "**** done rbd/import_export.sh from 7-final-workload"
+++ /dev/null
-tasks:
-- rgw: [client.1]
-- s3tests:
- client.1:
- rgw_server: client.1
-- print: "**** done rgw_server from 7-final-workload"
-overrides:
- ceph:
- conf:
- client:
- rgw lc debug interval: 10
+++ /dev/null
-os_type: centos
-os_version: "7.2"
+++ /dev/null
-os_type: ubuntu
-os_version: "14.04"
+++ /dev/null
-../../jewel-x/stress-split/0-cluster
\ No newline at end of file
+++ /dev/null
-tasks:
-- install:
- branch: hammer
- exclude_packages:
- - ceph-mgr
- - libcephfs2
- - libcephfs-devel
- - libcephfs-dev
-- print: '**** done hammer'
-- ceph:
- fs: xfs
-- install.upgrade:
- exclude_packages:
- - ceph-mgr
- - libcephfs2
- - libcephfs-devel
- - libcephfs-dev
- osd.0:
- branch: jewel
- osd.3:
- branch: jewel
-- print: '*** client.0 upgraded packages to jewel'
-- parallel:
- - workload-h-j
- - upgrade-sequence-h-j
-- print: '**** done parallel'
-- install.upgrade:
- client.0:
- branch: jewel
- exclude_packages:
- - ceph-mgr
- - libcephfs2
- - libcephfs-devel
- - libcephfs-dev
-- exec:
- osd.0:
- - ceph osd set sortbitwise
- - ceph osd set require_jewel_osds
- - for p in `ceph osd pool ls` ; do ceph osd pool set $p use_gmt_hitset true ;
- done
-- install.upgrade:
- client.0:
- branch: jewel
- exclude_packages:
- - ceph-mgr
- - libcephfs2
- - libcephfs-devel
- - libcephfs-dev
-- print: '**** done install.upgrade client.0 to jewel'
-upgrade-sequence-h-j:
- sequential:
- - ceph.restart:
- daemons:
- - osd.0
- - osd.1
- - osd.2
- - osd.3
- - osd.4
- - osd.5
- wait-for-healthy: false
- wait-for-osds-up: true
- - ceph.restart:
- daemons:
- - mon.a
- - mon.b
- - mon.c
- wait-for-healthy: false
- wait-for-osds-up: true
- - print: '**** done ceph.restart do not wait for healthy'
- - exec:
- mon.a:
- - sleep 300
- - ceph osd set require_jewel_osds
- - ceph.healthy: null
- - print: '**** done ceph.healthy'
-workload-h-j:
- full_sequential:
- - workunit:
- branch: hammer
- clients:
- client.0:
- - cls
- - print: "**** done cls 2-workload"
- - workunit:
- branch: hammer
- clients:
- client.0:
- - rbd/test_librbd.sh
- - print: "**** done rbd/test_librbd.sh 2-workload"
+++ /dev/null
-../../jewel-x/stress-split/2-partial-upgrade/
\ No newline at end of file
+++ /dev/null
-../../jewel-x/stress-split/3-thrash/
\ No newline at end of file
+++ /dev/null
-../../jewel-x/stress-split/4-mon/
\ No newline at end of file
+++ /dev/null
-../../jewel-x/stress-split/5-workload/
\ No newline at end of file
+++ /dev/null
-../../jewel-x/stress-split/6-next-mon/
\ No newline at end of file
+++ /dev/null
-../../jewel-x/stress-split/7-workload/
\ No newline at end of file
+++ /dev/null
-../../jewel-x/stress-split/8-next-mon/
\ No newline at end of file
+++ /dev/null
-../../jewel-x/stress-split/9-workload/
\ No newline at end of file
+++ /dev/null
-os_type: centos
-os_version: "7.2"
+++ /dev/null
-os_type: ubuntu
-os_version: "14.04"
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-meta:
-- desc: |
- Run ceph on two nodes,
- with clients 0-3 on a separate third node.
- Use xfs beneath the osds.
- CephFS tests run on clients 2 and 3.
-roles:
-- - mon.a
- - mds.a
- - osd.0
- - osd.1
-- - mon.b
- - mon.c
- - osd.2
- - osd.3
-- - client.0
- - client.1
- - client.2
- - client.3
-overrides:
- ceph:
- log-whitelist:
- - scrub mismatch
- - ScrubResult
- - wrongly marked
- conf:
- fs: xfs
+++ /dev/null
-meta:
-- desc: |
- install ceph/jewel latest
- run workload and upgrade-sequence in parallel
- upgrade the client node
-tasks:
-- install:
- branch: jewel
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-- print: "**** done installing jewel"
-- ceph:
-- print: "**** done ceph"
-- install.upgrade:
- mon.a:
- mon.b:
-- print: "**** done install.upgrade mon.a and mon.b"
-- parallel:
- - workload
- - upgrade-sequence
-- print: "**** done parallel"
-- install.upgrade:
- client.0:
-- print: "**** done install.upgrade on client.0"
+++ /dev/null
-meta:
-- desc: |
- run a cephfs stress test
- mount ceph-fuse on client.2 before running workunit
-workload:
- full_sequential:
- - sequential:
- - ceph-fuse:
- - print: "**** done ceph-fuse 2-workload"
- - workunit:
- clients:
- client.2:
- - suites/blogbench.sh
- - print: "**** done suites/blogbench.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- run randomized correctness test for rados operations
- on an erasure-coded pool
-workload:
- full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec task"
+++ /dev/null
-meta:
-- desc: |
- object class functional tests
-workload:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.0:
- - cls
- - print: "**** done cls 2-workload"
+++ /dev/null
-meta:
-- desc: |
- generate read/write load with rados objects ranging from 1MB to 25MB
-workload:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.0:
- - rados/load-gen-big.sh
- - print: "**** done rados/load-gen-big.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- librbd C and C++ api tests
-workload:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/test_librbd.sh
- - print: "**** done rbd/test_librbd.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- librbd python api tests
-workload:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- - print: "**** done rbd/test_librbd_python.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- upgrade the ceph cluster
-upgrade-sequence:
- sequential:
- - ceph.restart:
- daemons: [mon.a, mon.b, mon.c, mds.a, osd.0, osd.1, osd.2, osd.3]
- wait-for-healthy: false
- wait-for-osds-up: true
- - exec:
- mon.a:
- - ceph osd set require_kraken_osds
- - ceph.restart:
- daemons: [osd.0]
- - print: "**** done ceph.restart all"
+++ /dev/null
-meta:
-- desc: |
- upgrade the ceph cluster
- in two steps
- step one ordering: mon.a, osd.0, osd.1, mds.a
- step two ordering: mon.b, mon.c, osd.2, osd.3
- ceph is expected to be in a healthy state after each step
-upgrade-sequence:
- sequential:
- - ceph.restart:
- daemons: [mon.a]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart:
- daemons: [osd.0, osd.1]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart: [mds.a]
- - sleep:
- duration: 60
- - print: "**** running mixed versions of osds and mons"
- - exec:
- mon.b:
- - sudo ceph osd crush tunables jewel
- - print: "**** done ceph osd crush tunables jewel"
- - ceph.restart:
- daemons: [mon.b, mon.c]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart:
- daemons: [osd.2, osd.3]
- wait-for-healthy: false
- wait-for-osds-up: true
- - exec:
- mon.a:
- - ceph osd set require_kraken_osds
- - ceph.restart: [osd.3]
- - sleep:
- duration: 60
+++ /dev/null
-../../../../releases/kraken.yaml
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- run a cephfs stress test
- mount ceph-fuse on client.3 before running workunit
-tasks:
-- sequential:
- - ceph-fuse:
- - print: "**** done ceph-fuse 5-final-workload"
- - workunit:
- clients:
- client.3:
- - suites/blogbench.sh
- - print: "**** done suites/blogbench.sh 5-final-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshots
-tasks:
- - rados:
- clients: [client.1]
- ops: 4000
- objects: 50
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- - print: "**** done rados 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- generate read/write load with rados objects ranging from 1 byte to 1MB
-tasks:
- - workunit:
- clients:
- client.1:
- - rados/load-gen-mix.sh
- - print: "**** done rados/load-gen-mix.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- librados C and C++ api tests
-overrides:
- ceph:
- log-whitelist:
- - reached quota
-tasks:
- - mon_thrash:
- revive_delay: 20
- thrash_delay: 1
- - print: "**** done mon_thrash 4-final-workload"
- - workunit:
- branch: jewel
- clients:
- client.1:
- - rados/test-upgrade-v11.0.0.sh
- - print: "**** done rados/test-upgrade-v11.0.0.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- rbd object class functional tests
-tasks:
- - workunit:
- clients:
- client.1:
- - cls/test_cls_rbd.sh
- - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- run basic import/export cli tests for rbd
-tasks:
- - workunit:
- clients:
- client.1:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
- - print: "**** done rbd/import_export.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- swift api tests for rgw
-overrides:
- rgw:
- frontend: civetweb
-tasks:
- - rgw: [client.1]
- - print: "**** done rgw 4-final-workload"
- - swift:
- client.1:
- rgw_server: client.1
- - print: "**** done swift 4-final-workload"
+++ /dev/null
-../../../../distros/supported/
\ No newline at end of file
+++ /dev/null
-../../../../releases/kraken.yaml
\ No newline at end of file
+++ /dev/null
-../../../../../distros/all/centos_7.2.yaml
\ No newline at end of file
+++ /dev/null
-../../../../../distros/all/ubuntu_14.04.yaml
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- Run ceph on two nodes, using one of them as a client,
- with a separate client-only node.
- Use xfs beneath the osds.
- install ceph/jewel v10.2.0 point version
- run workload and upgrade-sequence in parallel
- install ceph/jewel latest version
- run workload and upgrade-sequence in parallel
- install ceph/-x version (jewel or kraken)
- run workload and upgrade-sequence in parallel
-overrides:
- ceph:
- log-whitelist:
- - reached quota
- - scrub
- - osd_map_max_advance
- - wrongly marked
- fs: xfs
- conf:
- mon:
- mon debug unsafe allow tier with nonempty snaps: true
- osd:
- osd map max advance: 1000
-roles:
-- - mon.a
- - mds.a
- - osd.0
- - osd.1
- - osd.2
-- - mon.b
- - mon.c
- - osd.3
- - osd.4
- - osd.5
- - client.0
-- - client.1
-openstack:
-- volumes: # attached to each instance
- count: 3
- size: 30 # GB
-tasks:
-- print: "**** v10.2.0 about to install"
-- install:
- tag: v10.2.0
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
-- print: "**** done v10.2.0 install"
-- ceph:
- fs: xfs
-- print: "**** done ceph xfs"
-- sequential:
- - workload
-- print: "**** done workload v10.2.0"
-- install.upgrade:
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- mon.a:
- branch: jewel
- mon.b:
- branch: jewel
- # Note that client.a IS NOT upgraded at this point
- #client.1:
- #branch: jewel
-- parallel:
- - workload_jewel
- - upgrade-sequence_jewel
-- print: "**** done parallel jewel branch"
-- install.upgrade:
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
- client.1:
- branch: jewel
-- print: "**** done branch: jewel install.upgrade on client.1"
-- install.upgrade:
- mon.a:
- mon.b:
-- print: "**** done branch: -x install.upgrade on mon.a and mon.b"
-- parallel:
- - workload_x
- - upgrade-sequence_x
-- print: "**** done parallel -x branch"
-# Run librados tests on the -x upgraded cluster
-- install.upgrade:
- client.1:
-- workunit:
- branch: jewel
- clients:
- client.1:
- - rados/test-upgrade-v11.0.0.sh
- - cls
-- print: "**** done final test on -x cluster"
-#######################
-workload:
- sequential:
- - workunit:
- clients:
- client.0:
- - suites/blogbench.sh
-workload_jewel:
- full_sequential:
- - workunit:
- branch: jewel
- clients:
- client.1:
- - rados/test.sh
- - cls
- env:
- CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_image'
- - print: "**** done rados/test.sh & cls workload_jewel"
- - sequential:
- - rgw: [client.0]
- - print: "**** done rgw workload_jewel"
- - s3tests:
- client.0:
- force-branch: ceph-jewel
- rgw_server: client.0
- - print: "**** done s3tests workload_jewel"
-upgrade-sequence_jewel:
- sequential:
- - print: "**** done branch: jewel install.upgrade"
- - ceph.restart: [mds.a]
- - sleep:
- duration: 60
- - ceph.restart: [osd.0]
- - sleep:
- duration: 30
- - ceph.restart: [osd.1]
- - sleep:
- duration: 30
- - ceph.restart: [osd.2]
- - sleep:
- duration: 30
- - ceph.restart: [osd.3]
- - sleep:
- duration: 30
- - ceph.restart: [osd.4]
- - sleep:
- duration: 30
- - ceph.restart: [osd.5]
- - sleep:
- duration: 60
- - ceph.restart: [mon.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.b]
- - sleep:
- duration: 60
- - ceph.restart: [mon.c]
- - sleep:
- duration: 60
- - print: "**** done ceph.restart all jewel branch mds/osd/mon"
-workload_x:
- sequential:
- - workunit:
- branch: jewel
- clients:
- client.1:
- - rados/test-upgrade-v11.0.0.sh
- - cls
- env:
- CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_image'
- - print: "**** done rados/test-upgrade-v11.0.0.sh & cls workload_x NOT upgraded client"
- - workunit:
- branch: jewel
- clients:
- client.0:
- - rados/test-upgrade-v11.0.0.sh
- - cls
- - print: "**** done rados/test-upgrade-v11.0.0.sh & cls workload_x upgraded client"
- - rgw: [client.1]
- - print: "**** done rgw workload_x"
- - s3tests:
- client.1:
- force-branch: ceph-jewel
- rgw_server: client.1
- - print: "**** done s3tests workload_x"
-upgrade-sequence_x:
- sequential:
- - ceph.restart: [mds.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.a]
- - sleep:
- duration: 60
- - ceph.restart: [mon.b]
- - sleep:
- duration: 60
- - ceph.restart: [mon.c]
- - sleep:
- duration: 60
- - ceph.restart: [osd.0]
- - sleep:
- duration: 30
- - ceph.restart: [osd.1]
- - sleep:
- duration: 30
- - ceph.restart: [osd.2]
- - sleep:
- duration: 30
- - ceph.restart: [osd.3]
- - sleep:
- duration: 30
- - ceph.restart: [osd.4]
- - sleep:
- duration: 30
- - ceph.restart:
- daemons: [osd.5]
- wait-for-healthy: false
- wait-for-osds-up: true
- - exec:
- mon.a:
- - ceph osd set require_kraken_osds
- - sleep:
- duration: 60
- - print: "**** done ceph.restart all -x branch mds/osd/mon"
+++ /dev/null
-../stress-split/0-cluster/
\ No newline at end of file
+++ /dev/null
-arch: x86_64
+++ /dev/null
-../stress-split/1-jewel-install/
\ No newline at end of file
+++ /dev/null
-../stress-split/2-partial-upgrade/
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- randomly kill and revive osd
- small chance of increasing the number of pgs
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - log bound mismatch
-tasks:
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- min_in: 4
-- print: "**** done thrashosds 3-thrash"
+++ /dev/null
-../stress-split/4-mon/
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on an erasure coded pool
-tasks:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec task"
+++ /dev/null
-../stress-split/6-next-mon/
\ No newline at end of file
+++ /dev/null
-../stress-split/8-next-mon/
\ No newline at end of file
+++ /dev/null
-#
-# k=3 implies a stripe_width of 1376*3 = 4128 which is different from
-# the default value of 4096. It is also not a multiple of 1024*1024 and
-# creates situations where rounding rules during recovery become
-# necessary.
-#
-meta:
-- desc: |
- randomized correctness test for rados operations on an erasure coded pool
- using the jerasure plugin with k=3 and m=1
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- erasure_code_profile:
- name: jerasure31profile
- plugin: jerasure
- k: 3
- m: 1
- technique: reed_sol_van
- ruleset-failure-domain: osd
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
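The op_weights map in the fragment above is a set of relative weights, not percentages. Assuming the harness simply draws each of the 4000 operations in proportion to its weight, the expected mix can be estimated with the short standalone Python sketch below (the normalization is only an illustration, not code from the harness):

# Illustration only: convert the relative op_weights above into an expected
# operation mix, assuming ops are drawn in proportion to their weight.
op_weights = {
    'read': 100, 'write': 0, 'append': 100, 'delete': 50,
    'snap_create': 50, 'snap_remove': 50, 'rollback': 50,
    'copy_from': 50, 'setattr': 25, 'rmattr': 25,
}
total = sum(op_weights.values())  # 500
for op, weight in sorted(op_weights.items(), key=lambda kv: -kv[1]):
    print('{:12s} {:5.1f}%'.format(op, 100.0 * weight / total))
# reads and appends each account for ~20% of the mix; plain writes never occur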
+++ /dev/null
-../stress-split/0-cluster/
\ No newline at end of file
+++ /dev/null
-../stress-split/1-jewel-install/
\ No newline at end of file
+++ /dev/null
-../stress-split/2-partial-upgrade/
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- randomly kill and revive osd
- small chance of increasing the number of pgs
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - log bound mismatch
-tasks:
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- min_in: 4
-- print: "**** done thrashosds 3-thrash"
+++ /dev/null
-../stress-split/4-mon/
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on an erasure coded pool
-tasks:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec task"
+++ /dev/null
-../stress-split/6-next-mon/
\ No newline at end of file
+++ /dev/null
-../stress-split/8-next-mon/
\ No newline at end of file
+++ /dev/null
-#
-# k=3 implies a stripe_width of 1376*3 = 4128 which is different from
-# the default value of 4096. It is also not a multiple of 1024*1024 and
-# creates situations where rounding rules during recovery become
-# necessary.
-#
-meta:
-- desc: |
- randomized correctness test for rados operations on an erasure coded pool
- using the jerasure plugin with k=3 and m=1
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- erasure_code_profile:
- name: jerasure31profile
- plugin: jerasure
- k: 3
- m: 1
- technique: reed_sol_van
- ruleset-failure-domain: osd
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-../../../../distros/supported/
\ No newline at end of file
+++ /dev/null
-openstack:
- - machine:
- disk: 100 # GB
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-meta:
-- desc: |
- Run ceph on two nodes,
- with a separate client-only node.
- Use xfs beneath the osds.
-overrides:
- ceph:
- conf:
- mon:
- mon warn on legacy crush tunables: false
- fs: xfs
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - osd.0
- - osd.1
- - osd.2
-- - osd.3
- - osd.4
- - osd.5
-- - client.0
+++ /dev/null
-meta:
-- desc: install ceph/jewel latest
-tasks:
-- install:
- branch: jewel
- exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-- print: "**** done install jewel"
-- ceph:
-- print: "**** done ceph"
+++ /dev/null
-meta:
-- desc: |
- install upgrade ceph/-x on one node only
- 1st half
- restart : osd.0,1,2,3,4,5
-tasks:
-- install.upgrade:
- osd.0:
-- print: "**** done install.upgrade osd.0"
-- ceph.restart:
- daemons: [osd.0, osd.1, osd.2, osd.3, osd.4, osd.5]
-- print: "**** done ceph.restart 1st half"
+++ /dev/null
-meta:
-- desc: |
- randomly kill and revive osd
- small chance of increasing the number of pgs
-overrides:
- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - log bound mismatch
-tasks:
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
-- print: "**** done thrashosds 3-thrash"
+++ /dev/null
-meta:
-- desc: |
- restart mon.a so it is upgraded to -x
-tasks:
-- ceph.restart:
- daemons: [mon.a]
- wait-for-healthy: false
- wait-for-osds-up: true
-- print: "**** done ceph.restart mon.a"
+++ /dev/null
-meta:
-- desc: |
- run basic cls tests for rbd
-tasks:
-- workunit:
- branch: jewel
- clients:
- client.0:
- - cls/test_cls_rbd.sh
-- print: "**** done cls/test_cls_rbd.sh 5-workload"
+++ /dev/null
-meta:
-- desc: |
- run basic import/export cli tests for rbd
-tasks:
-- workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
-- print: "**** done rbd/import_export.sh 5-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool,
- using only reads, writes, and deletes
-tasks:
-- full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- write_append_excl: false
- op_weights:
- read: 45
- write: 45
- delete: 10
-- print: "**** done rados/readwrite 5-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshot operations
-tasks:
-- full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
-- print: "**** done rados/snaps-few-objects 5-workload"
+++ /dev/null
-meta:
-- desc: |
- restart mon.b so it is upgraded to -x
-tasks:
-- ceph.restart:
- daemons: [mon.b]
- wait-for-healthy: false
- wait-for-osds-up: true
-- print: "**** done ceph.restart mon.b 6-next-mon"
+++ /dev/null
-meta:
-- desc: |
- run randomized correctness test for rados operations
- generate write load with rados bench
-tasks:
-- full_sequential:
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
-- print: "**** done radosbench 7-workload"
+++ /dev/null
-meta:
-- desc: |
- librbd C and C++ api tests
-tasks:
-- workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/test_librbd.sh
-- print: "**** done rbd/test_librbd.sh 7-workload"
+++ /dev/null
-meta:
-- desc: |
- restart mon.c so it is upgraded to -x
- as all mons are now upgraded, the ceph cluster is expected to reach quorum
-tasks:
-- ceph.restart:
- daemons: [mon.c]
- wait-for-healthy: false
- wait-for-osds-up: true
-- print: "**** done ceph.restart mon.c 8-next-mon"
-- ceph.wait_for_mon_quorum: [a, b, c]
-- print: "**** done wait_for_mon_quorum 8-next-mon"
+++ /dev/null
-meta:
-- desc: |
- librbd python api tests
-tasks:
-- workunit:
- branch: jewel
- clients:
- client.0:
- - rbd/test_librbd_python.sh
-- print: "**** done rbd/test_librbd_python.sh 9-workload"
+++ /dev/null
-meta:
-- desc: |
- swift api tests for rgw
-tasks:
-- rgw:
- client.0:
- default_idle_timeout: 300
-- print: "**** done rgw 9-workload"
-- swift:
- client.0:
- rgw_server: client.0
-- print: "**** done swift 9-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshot operations
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
+++ /dev/null
-../../../../distros/supported/
\ No newline at end of file
+++ /dev/null
-import logging
-
-# Inherit teuthology's log level
-teuthology_log = logging.getLogger('teuthology')
-log = logging.getLogger(__name__)
-log.setLevel(teuthology_log.level)
+++ /dev/null
-"""
-Admin Socket task -- used in rados, powercycle, and smoke testing
-"""
-from cStringIO import StringIO
-
-import json
-import logging
-import os
-import time
-
-from teuthology.orchestra import run
-from teuthology import misc as teuthology
-from teuthology.parallel import parallel
-
-log = logging.getLogger(__name__)
-
-
-def task(ctx, config):
- """
- Run an admin socket command, make sure the output is json, and run
- a test program on it. The test program should read json from
- stdin. This task succeeds if the test program exits with status 0.
-
- To run the same test on all clients::
-
- tasks:
- - ceph:
- - rados:
- - admin_socket:
- all:
- dump_requests:
- test: http://example.com/script
-
- To restrict it to certain clients::
-
- tasks:
- - ceph:
- - rados: [client.1]
- - admin_socket:
- client.1:
- dump_requests:
- test: http://example.com/script
-
- If an admin socket command has arguments, they can be specified as
- a list::
-
- tasks:
- - ceph:
- - rados: [client.0]
- - admin_socket:
- client.0:
- dump_requests:
- test: http://example.com/script
- help:
- test: http://example.com/test_help_version
- args: [version]
-
- Note that there must be a ceph client with an admin socket running
- before this task is run. The tests are parallelized at the client
- level. Tests for a single client are run serially.
-
- :param ctx: Context
- :param config: Configuration
- """
- assert isinstance(config, dict), \
- 'admin_socket task requires a dict for configuration'
- teuthology.replace_all_with_clients(ctx.cluster, config)
-
- with parallel() as ptask:
- for client, tests in config.iteritems():
- ptask.spawn(_run_tests, ctx, client, tests)
-
-
-def _socket_command(ctx, remote, socket_path, command, args):
- """
- Run an admin socket command and return the result as a string.
-
- :param ctx: Context
- :param remote: Remote site
- :param socket_path: path to socket
- :param command: command to be run remotely
- :param args: command arguments
-
- :returns: output of command in json format
- """
- json_fp = StringIO()
- testdir = teuthology.get_testdir(ctx)
- max_tries = 120
- while True:
- proc = remote.run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'ceph',
- '--admin-daemon', socket_path,
- ] + command.split(' ') + args,
- stdout=json_fp,
- check_status=False,
- )
- if proc.exitstatus == 0:
- break
- assert max_tries > 0
- max_tries -= 1
- log.info('ceph cli returned an error, command not registered yet?')
- log.info('sleeping and retrying ...')
- time.sleep(1)
- out = json_fp.getvalue()
- json_fp.close()
- log.debug('admin socket command %s returned %s', command, out)
- return json.loads(out)
-
-def _run_tests(ctx, client, tests):
- """
- Create a temp directory and wait for a client socket to be created.
- For each test, copy the executable locally and run the test.
- Remove temp directory when finished.
-
- :param ctx: Context
- :param client: client machine to run the test
- :param tests: list of tests to run
- """
- testdir = teuthology.get_testdir(ctx)
- log.debug('Running admin socket tests on %s', client)
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
- socket_path = '/var/run/ceph/ceph-{name}.asok'.format(name=client)
- overrides = ctx.config.get('overrides', {}).get('admin_socket', {})
-
- try:
- tmp_dir = os.path.join(
- testdir,
- 'admin_socket_{client}'.format(client=client),
- )
- remote.run(
- args=[
- 'mkdir',
- '--',
- tmp_dir,
- run.Raw('&&'),
- # wait for client process to create the socket
- 'while', 'test', '!', '-e', socket_path, run.Raw(';'),
- 'do', 'sleep', '1', run.Raw(';'), 'done',
- ],
- )
-
- for command, config in tests.iteritems():
- if config is None:
- config = {}
- teuthology.deep_merge(config, overrides)
- log.debug('Testing %s with config %s', command, str(config))
-
- test_path = None
- if 'test' in config:
- url = config['test'].format(
- branch=config.get('branch', 'master')
- )
- test_path = os.path.join(tmp_dir, command)
- remote.run(
- args=[
- 'wget',
- '-q',
- '-O',
- test_path,
- '--',
- url,
- run.Raw('&&'),
- 'chmod',
- 'u=rx',
- '--',
- test_path,
- ],
- )
-
- args = config.get('args', [])
- assert isinstance(args, list), \
- 'admin socket command args must be a list'
- sock_out = _socket_command(ctx, remote, socket_path, command, args)
- if test_path is not None:
- remote.run(
- args=[
- test_path,
- ],
- stdin=json.dumps(sock_out),
- )
-
- finally:
- remote.run(
- args=[
- 'rm', '-rf', '--', tmp_dir,
- ],
- )
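The retry loop in _socket_command above is the heart of this task: the daemon's admin socket can exist before the requested command is registered, so the task polls until the call succeeds. A minimal standalone sketch of the same idea (Python 3, outside teuthology; the socket path in the usage comment is a hypothetical example):

# Standalone sketch: poll a Ceph admin socket until the command succeeds,
# then parse its JSON output.  Assumes a local `ceph` binary is installed.
import json
import subprocess
import time

def admin_socket_json(socket_path, command, args=(), tries=120, delay=1):
    cmd = ['sudo', 'ceph', '--admin-daemon', socket_path] + command.split() + list(args)
    for _ in range(tries):
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode == 0:
            return json.loads(proc.stdout)
        time.sleep(delay)  # command may not be registered yet; retry
    raise RuntimeError('admin socket command {!r} never succeeded'.format(command))

# e.g. admin_socket_json('/var/run/ceph/ceph-client.0.asok', 'dump_requests')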
+++ /dev/null
-<IfModule !version_module>
- LoadModule version_module {mod_path}/mod_version.so
-</IfModule>
-<IfModule !env_module>
- LoadModule env_module {mod_path}/mod_env.so
-</IfModule>
-<IfModule !rewrite_module>
- LoadModule rewrite_module {mod_path}/mod_rewrite.so
-</IfModule>
-<IfModule !log_config_module>
- LoadModule log_config_module {mod_path}/mod_log_config.so
-</IfModule>
-
-Listen {port}
-ServerName {host}
-
-<IfVersion >= 2.4>
- <IfModule !unixd_module>
- LoadModule unixd_module {mod_path}/mod_unixd.so
- </IfModule>
- <IfModule !authz_core_module>
- LoadModule authz_core_module {mod_path}/mod_authz_core.so
- </IfModule>
- <IfModule !mpm_worker_module>
- LoadModule mpm_worker_module {mod_path}/mod_mpm_worker.so
- </IfModule>
- User {user}
- Group {group}
-</IfVersion>
-
-ServerRoot {testdir}/apache
-ErrorLog {testdir}/archive/apache.{client}/error.log
-LogFormat "%h %l %u %t \"%r\" %>s %b \"%{{Referer}}i\" \"%{{User-agent}}i\"" combined
-CustomLog {testdir}/archive/apache.{client}/access.log combined
-PidFile {testdir}/apache/tmp.{client}/apache.pid
-DocumentRoot {testdir}/apache/htdocs.{client}
-
-
-<Directory {testdir}/apache/htdocs.{client}>
- Options +ExecCGI
- AllowOverride All
- SetHandler fastcgi-script
-</Directory>
-
-AllowEncodedSlashes On
-ServerSignature Off
-MaxRequestsPerChild 0
-
+++ /dev/null
-"""
-Run an autotest test on the ceph cluster.
-"""
-import json
-import logging
-import os
-
-from teuthology import misc as teuthology
-from teuthology.parallel import parallel
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Run an autotest test on the ceph cluster.
-
- Only autotest client tests are supported.
-
- The config is a mapping from role name to list of tests to run on
- that client.
-
- For example::
-
- tasks:
- - ceph:
- - ceph-fuse: [client.0, client.1]
- - autotest:
- client.0: [dbench]
- client.1: [bonnie]
-
- You can also specify a list of tests to run on all clients::
-
- tasks:
- - ceph:
- - ceph-fuse:
- - autotest:
- all: [dbench]
- """
- assert isinstance(config, dict)
- config = teuthology.replace_all_with_clients(ctx.cluster, config)
- log.info('Setting up autotest...')
- testdir = teuthology.get_testdir(ctx)
- with parallel() as p:
- for role in config.iterkeys():
- (remote,) = ctx.cluster.only(role).remotes.keys()
- p.spawn(_download, testdir, remote)
-
- log.info('Making a separate scratch dir for every client...')
- for role in config.iterkeys():
- assert isinstance(role, basestring)
- PREFIX = 'client.'
- assert role.startswith(PREFIX)
- id_ = role[len(PREFIX):]
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
- mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
- scratch = os.path.join(mnt, 'client.{id}'.format(id=id_))
- remote.run(
- args=[
- 'sudo',
- 'install',
- '-d',
- '-m', '0755',
- '--owner={user}'.format(user='ubuntu'), #TODO
- '--',
- scratch,
- ],
- )
-
- with parallel() as p:
- for role, tests in config.iteritems():
- (remote,) = ctx.cluster.only(role).remotes.keys()
- p.spawn(_run_tests, testdir, remote, role, tests)
-
-def _download(testdir, remote):
- """
- Download. Does not explicitly support multiple tasks in a single run.
- """
- remote.run(
- args=[
- # explicitly does not support multiple autotest tasks
- # in a single run; the result archival would conflict
- 'mkdir', '{tdir}/archive/autotest'.format(tdir=testdir),
- run.Raw('&&'),
- 'mkdir', '{tdir}/autotest'.format(tdir=testdir),
- run.Raw('&&'),
- 'wget',
- '-nv',
- '--no-check-certificate',
- 'https://github.com/ceph/autotest/tarball/ceph',
- '-O-',
- run.Raw('|'),
- 'tar',
- '-C', '{tdir}/autotest'.format(tdir=testdir),
- '-x',
- '-z',
- '-f-',
- '--strip-components=1',
- ],
- )
-
-def _run_tests(testdir, remote, role, tests):
- """
- Spawned to run test on remote site
- """
- assert isinstance(role, basestring)
- PREFIX = 'client.'
- assert role.startswith(PREFIX)
- id_ = role[len(PREFIX):]
- mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
- scratch = os.path.join(mnt, 'client.{id}'.format(id=id_))
-
- assert isinstance(tests, list)
- for idx, testname in enumerate(tests):
- log.info('Running autotest client test #%d: %s...', idx, testname)
-
- tag = 'client.{id}.num{idx}.{testname}'.format(
- idx=idx,
- testname=testname,
- id=id_,
- )
- control = '{tdir}/control.{tag}'.format(tdir=testdir, tag=tag)
- teuthology.write_file(
- remote=remote,
- path=control,
- data='import json; data=json.loads({data!r}); job.run_test(**data)'.format(
- data=json.dumps(dict(
- url=testname,
- dir=scratch,
- # TODO perhaps tag
- # results will be in {testdir}/autotest/client/results/dbench
- # or {testdir}/autotest/client/results/dbench.{tag}
- )),
- ),
- )
- remote.run(
- args=[
- '{tdir}/autotest/client/bin/autotest'.format(tdir=testdir),
- '--verbose',
- '--harness=simple',
- '--tag={tag}'.format(tag=tag),
- control,
- run.Raw('3>&1'),
- ],
- )
-
- remote.run(
- args=[
- 'rm', '-rf', '--', control,
- ],
- )
-
- remote.run(
- args=[
- 'mv',
- '--',
- '{tdir}/autotest/client/results/{tag}'.format(tdir=testdir, tag=tag),
- '{tdir}/archive/autotest/{tag}'.format(tdir=testdir, tag=tag),
- ],
- )
-
- remote.run(
- args=[
- 'rm', '-rf', '--', '{tdir}/autotest'.format(tdir=testdir),
- ],
- )
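_run_tests writes a one-line autotest control file per test and then points the autotest binary at it. Rendering that format string locally shows exactly what lands on the remote node (the url/dir values below are made-up examples, not taken from a real run):

# Illustration only: render the control file content written by _run_tests.
import json

data = dict(url='dbench', dir='/tmp/cephtest/mnt.0/client.0')
control = 'import json; data=json.loads({data!r}); job.run_test(**data)'.format(
    data=json.dumps(data))
print(control)
# import json; data=json.loads('{"url": "dbench", "dir": "/tmp/cephtest/mnt.0/client.0"}'); job.run_test(**data)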
+++ /dev/null
-"""
-Aver wrapper task
-"""
-import contextlib
-import logging
-from subprocess import check_call, Popen, PIPE
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Execute an aver assertion
-
- Parameters:
-
- input: file containing data referred to by the assertions. File name is
- relative to the job's archive path
- validations: list of validations in the Aver language
-
- Example:
- - aver:
- input: bench_output.csv
- validations:
- - expect performance(alg='ceph') > performance(alg='raw')
- - for size > 3 expect avg_throughput > 2000
- """
- log.info('Beginning aver...')
- assert isinstance(config, dict), 'expecting dictionary for configuration'
-
- if 'input' not in config:
- raise Exception("Expecting 'input' option")
- if len(config.get('validations', [])) < 1:
- raise Exception("Expecting at least one entry in 'validations'")
-
- url = ('https://github.com/ivotron/aver/releases/download/'
- 'v0.3.0/aver-linux-amd64.tar.bz2')
-
- aver_path = ctx.archive + '/aver'
-
- # download binary
- check_call(['wget', '-O', aver_path + '.tbz', url])
- check_call(['tar', 'xfj', aver_path + '.tbz', '-C', ctx.archive])
-
- # print version
- process = Popen([aver_path, '-v'], stdout=PIPE)
- log.info(process.communicate()[0])
-
- # validate
- for validation in config['validations']:
- cmd = (aver_path + ' -s -i ' + (ctx.archive + '/' + config['input']) +
- ' "' + validation + '"')
- log.info("executing: " + cmd)
- process = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
- (stdout, stderr) = process.communicate()
- if stderr:
- log.info('aver stderr: ' + stderr)
- log.info('aver result: ' + stdout)
- if stdout.strip(' \t\n\r') != 'true':
- raise Exception('Failed validation: ' + validation)
-
- try:
- yield
- finally:
- log.info('Removing aver binary...')
- check_call(['rm', aver_path, aver_path + '.tbz'])
+++ /dev/null
-"""
-Run blktrace program through teuthology
-"""
-import contextlib
-import logging
-
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-blktrace = '/usr/sbin/blktrace'
-daemon_signal = 'term'
-
-@contextlib.contextmanager
-def setup(ctx, config):
- """
- Setup all the remotes
- """
- osds = ctx.cluster.only(teuthology.is_type('osd', config['cluster']))
- log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=teuthology.get_testdir(ctx))
-
- for remote, roles_for_host in osds.remotes.iteritems():
- log.info('Creating %s on %s' % (log_dir, remote.name))
- remote.run(
- args=['mkdir', '-p', '-m0755', '--', log_dir],
- wait=False,
- )
- yield
-
-@contextlib.contextmanager
-def execute(ctx, config):
- """
- Run the blktrace program on remote machines.
- """
- procs = []
- testdir = teuthology.get_testdir(ctx)
- log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=testdir)
-
- osds = ctx.cluster.only(teuthology.is_type('osd'))
- for remote, roles_for_host in osds.remotes.iteritems():
- roles_to_devs = ctx.disk_config.remote_to_roles_to_dev[remote]
- for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
- config['cluster']):
- if roles_to_devs.get(role):
- dev = roles_to_devs[role]
- log.info("running blktrace on %s: %s" % (remote.name, dev))
-
- proc = remote.run(
- args=[
- 'cd',
- log_dir,
- run.Raw(';'),
- 'daemon-helper',
- daemon_signal,
- 'sudo',
- blktrace,
- '-o',
- dev.rsplit("/", 1)[1],
- '-d',
- dev,
- ],
- wait=False,
- stdin=run.PIPE,
- )
- procs.append(proc)
- try:
- yield
- finally:
- osds = ctx.cluster.only(teuthology.is_type('osd'))
- log.info('stopping blktrace processes')
- for proc in procs:
- proc.stdin.close()
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Usage:
- blktrace:
-
- or:
- blktrace:
- cluster: backup
-
- Runs blktrace on all osds in the specified cluster (the 'ceph' cluster by
- default).
- """
- if config is None:
- config = {}
- config['cluster'] = config.get('cluster', 'ceph')
-
- with contextutil.nested(
- lambda: setup(ctx=ctx, config=config),
- lambda: execute(ctx=ctx, config=config),
- ):
- yield
+++ /dev/null
-[Boto]
-http_socket_timeout = {idle_timeout}
+++ /dev/null
-"""
-Build ceph packages
-
-Unit tests:
-
-py.test -v -s tests/test_buildpackages.py
-
-Integration tests:
-
-teuthology-openstack --verbose --key-name myself --key-filename ~/Downloads/myself --ceph infernalis --suite teuthology/buildpackages
-
-"""
-import copy
-import logging
-import os
-import types
-from teuthology import packaging
-from teuthology import misc
-from teuthology.config import config as teuth_config
-from teuthology.openstack import OpenStack
-
-log = logging.getLogger(__name__)
-
-class LocalGitbuilderProject(packaging.GitbuilderProject):
-
- def __init__(self):
- pass
-
-
-def get_pkg_type(os_type):
- if os_type in ('centos', 'fedora', 'opensuse', 'rhel', 'sles'):
- return 'rpm'
- else:
- return 'deb'
-
-def apply_overrides(ctx, config):
- if config is None:
- config = {}
- else:
- config = copy.deepcopy(config)
-
- assert isinstance(config, dict), \
- "task install only supports a dictionary for configuration"
-
- project, = config.get('project', 'ceph'),
- log.debug('project %s' % project)
- overrides = ctx.config.get('overrides')
- if overrides:
- install_overrides = overrides.get('install', {})
- misc.deep_merge(config, install_overrides.get(project, {}))
- return config
-
-def get_config_install(ctx, config):
- config = apply_overrides(ctx, config)
- log.debug('install config %s' % config)
- return [(config.get('flavor', 'basic'),
- config.get('tag', ''),
- config.get('branch', ''),
- config.get('sha1'))]
-
-def get_config_install_upgrade(ctx, config):
- log.debug('install.upgrade config before override %s' % config)
- configs = []
- for (role, role_config) in config.iteritems():
- if role_config is None:
- role_config = {}
- o = apply_overrides(ctx, role_config)
-
- log.debug('install.upgrade config ' + str(role_config) +
- ' and with overrides ' + str(o))
- # for install.upgrade overrides are actually defaults
- configs.append((o.get('flavor', 'basic'),
- role_config.get('tag', o.get('tag', '')),
- role_config.get('branch', o.get('branch', '')),
- role_config.get('sha1', o.get('sha1'))))
- return configs
-
-GET_CONFIG_FUNCTIONS = {
- 'install': get_config_install,
- 'install.upgrade': get_config_install_upgrade,
-}
-
-def lookup_configs(ctx, node):
- configs = []
- if type(node) is types.ListType:
- for leaf in node:
- configs.extend(lookup_configs(ctx, leaf))
- elif type(node) is types.DictType:
- for (key, value) in node.iteritems():
- if key in ('install', 'install.upgrade'):
- configs.extend(GET_CONFIG_FUNCTIONS[key](ctx, value))
- elif key in ('overrides',):
- pass
- else:
- configs.extend(lookup_configs(ctx, value))
- return configs
-
-def get_sha1(ref):
- url = teuth_config.get_ceph_git_url()
- ls_remote = misc.sh("git ls-remote " + url + " " + ref)
- return ls_remote.split()[0]
-
-def task(ctx, config):
- """
- Build Ceph packages. This task will automagically be run
- before the tasks that need to install packages (this is taken
- care of by the internal teuthology task).
-
- The config should be as follows:
-
- buildpackages:
- good_machine:
- disk: 40 # GB
- ram: 48000 # MB
- cpus: 16
- min_machine:
- disk: 40 # GB
- ram: 8000 # MB
- cpus: 1
-
- example:
-
- tasks:
- - buildpackages:
- good_machine:
- disk: 40 # GB
- ram: 15000 # MB
- cpus: 16
- min_machine:
- disk: 40 # GB
- ram: 8000 # MB
- cpus: 1
- - install:
-
- When a buildpackages task is already included, the values it contains can be
- overridden with:
-
- overrides:
- buildpackages:
- good_machine:
- disk: 20 # GB
- ram: 2000 # MB
- cpus: 2
- min_machine:
- disk: 10 # GB
- ram: 1000 # MB
- cpus: 1
-
- """
- log.info('Beginning buildpackages...')
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'task only accepts a dict for config not ' + str(config)
- overrides = ctx.config.get('overrides', {})
- misc.deep_merge(config, overrides.get('buildpackages', {}))
- d = os.path.join(os.path.dirname(__file__), 'buildpackages')
- os_type = misc.get_distro(ctx)
- os_version = misc.get_distro_version(ctx)
- arch = ctx.config.get('arch', OpenStack().get_default_arch())
- dist = LocalGitbuilderProject()._get_distro(distro=os_type,
- version=os_version)
- pkg_type = get_pkg_type(os_type)
- misc.sh(
- "flock --close /tmp/buildpackages " +
- "make -C " + d + " " + os.environ['HOME'] + "/.ssh_agent")
- for (flavor, tag, branch, sha1) in lookup_configs(ctx, ctx.config):
- if tag:
- sha1 = get_sha1(tag)
- elif branch:
- sha1 = get_sha1(branch)
- log.info("building flavor = " + flavor + "," +
- " tag = " + tag + "," +
- " branch = " + branch + "," +
- " sha1 = " + sha1)
- target = ('ceph-' +
- pkg_type + '-' +
- dist + '-' +
- arch + '-' +
- flavor + '-' +
- sha1)
- openstack = OpenStack()
- openstack.set_provider()
- if openstack.provider == 'ovh':
- select = '^(vps|hg)-.*ssd'
- else:
- select = ''
- network = openstack.net()
- if network != "":
- network = " OPENSTACK_NETWORK='" + network + "' "
- openstack.image(os_type, os_version, arch) # create if it does not exist
- build_flavor = openstack.flavor_range(
- config['min_machine'], config['good_machine'], arch, select)
- default_arch = openstack.get_default_arch()
- http_flavor = openstack.flavor({
- 'disk': 30, # GB
- 'ram': 1024, # MB
- 'cpus': 1,
- }, default_arch, select)
- lock = "/tmp/buildpackages-" + sha1 + "-" + os_type + "-" + os_version
- cmd = (". " + os.environ['HOME'] + "/.ssh_agent ; " +
- " flock --close " + lock +
- " make -C " + d +
- network +
- " CEPH_GIT_URL=" + teuth_config.get_ceph_git_url() +
- " CEPH_PKG_TYPE=" + pkg_type +
- " CEPH_OS_TYPE=" + os_type +
- " CEPH_OS_VERSION=" + os_version +
- " CEPH_DIST=" + dist +
- " CEPH_ARCH=" + arch +
- " CEPH_SHA1=" + sha1 +
- " CEPH_TAG=" + tag +
- " CEPH_BRANCH=" + branch +
- " CEPH_FLAVOR=" + flavor +
- " BUILD_FLAVOR=" + build_flavor +
- " HTTP_FLAVOR=" + http_flavor +
- " HTTP_ARCH=" + default_arch +
- " " + target +
- " ")
- log.info("buildpackages: " + cmd)
- misc.sh(cmd)
- teuth_config.gitbuilder_host = openstack.get_ip('packages-repository', '')
- log.info('Finished buildpackages')
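Both apply_overrides and the task() entry point above rely on misc.deep_merge so that values under overrides: buildpackages: win over the values written inline in the task. A self-contained sketch of that kind of recursive merge, with the precedence assumed here (override values win), reproduces the docstring's example; it is an illustration, not teuthology's actual deep_merge:

# Assumed re-implementation of an "overrides win" recursive merge, for
# illustration only.
def merge_overrides(base, overrides):
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            merge_overrides(base[key], value)
        else:
            base[key] = value
    return base

task_config = {'good_machine': {'disk': 40, 'ram': 15000, 'cpus': 16},
               'min_machine': {'disk': 40, 'ram': 8000, 'cpus': 1}}
overrides = {'good_machine': {'disk': 20, 'ram': 2000, 'cpus': 2}}
print(merge_overrides(task_config, overrides)['good_machine'])
# {'disk': 20, 'ram': 2000, 'cpus': 2}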
+++ /dev/null
-SHELL=/bin/bash
-D=/tmp/stampsdir
-VPATH=${D}
-TIMEOUT_SERVER_CREATE = 30m
-TIMEOUT_BUILD = 220m # 20 minutes short of 4 hours
-PKG_REPO=packages-repository
-PKG_REPO_OS_TYPE=ubuntu
-PKG_REPO_OS_VERSION=14.04
-PKG_REPO_USER_DATA=${PKG_REPO_OS_TYPE}-${PKG_REPO_OS_VERSION}-user-data.txt
-
-# We want to extract the first listed IPv4 address!
-# Openstack will provide the addresses field in this format:
-# "net1-name=ip(, ip)+(; net2-name=ip(, ip)+)+"
-# Each IP may be v4 or v6 (including shortened forms and IPv4-mapped-IPv6 forms)
-# 1.2.3.4
-# 2001:db8:6050:ed4d:f816:3eff:fe48:3b36
-# 2001:db8::fe48:3b36
-# 2001:db8::1.2.3.4
-# Example long-form input:
-# private-network=10.10.10.69, 2001:db8:6050:ed4d:f816:3eff:fed1:d9f8;net-name2=2001:db8::fe48:3b36, 2001:db8::1.2.3.4, 1.2.3.4;
-# TODO: allow selection of the network instead of taking the first network
-# TODO: Support IPv6 in future
-define get_ip
-$$(openstack server show -f value -c addresses $(1) |perl -pe 's/^[^=]+=([^;]+).*/\1/g; s/[ ,]/\n/g; ' |grep -v -e ':' -e '^$$' |head -n1)
-endef
-
-MY_IP=$(shell hostname -I | cut -f1 -d' ')
-
-${HOME}/.ssh_agent:
- ssh-agent -s > ${HOME}/.ssh_agent
- source ${HOME}/.ssh_agent ; ssh-add ; ssh-add -l
- grep -q ssh_agent ~/.bashrc_teuthology || echo 'source ${HOME}/.ssh_agent' >> ~/.bashrc_teuthology
-
-flock-${PKG_REPO}:
- timeout $(TIMEOUT_SERVER_CREATE) openstack server create --image 'teuthology-ubuntu-14.04-${HTTP_ARCH}' ${OPENSTACK_NETWORK} --flavor ${HTTP_FLAVOR} --key-name teuthology --security-group teuthology --property ownedby=${MY_IP} --user-data ${PKG_REPO_USER_DATA} --wait ${PKG_REPO}
- sleep 30
- set -ex ; \
- ip=$(call get_ip,${PKG_REPO}) ; \
- for delay in 1 2 4 8 8 8 8 8 8 8 8 8 16 16 16 16 16 32 32 32 64 128 256 512 ; do if ssh -o 'ConnectTimeout=3' $$ip bash -c '"grep -q READYTORUN /var/log/cloud-init*.log"' ; then break ; else sleep $$delay ; fi ; done ; \
- ssh $$ip sudo apt-get update ; \
- ssh $$ip sudo apt-get install -y nginx rsync && \
- ssh $$ip sudo chown -R ubuntu /usr/share/nginx/html && \
- ssh $$ip sudo rm /usr/share/nginx/html/\* && \
- ssh $$ip sudo perl -pi -e '"s|location / {|location / { autoindex on;|"' /etc/nginx/sites-available/default && \
- ssh $$ip sudo /etc/init.d/nginx restart && \
- perl -pi -e "s/^gitbuilder_host:.*/gitbuilder_host: $$ip/" ~/.teuthology.yaml
- touch ${D}/$@
-
-${PKG_REPO}:
- mkdir -p ${D}
- flock --close ${D}/flock-$@.lock ${MAKE} flock-$@
- touch ${D}/$@
-
-# Just because 'server create' returns success does not mean it actually succeeded!
-# Check the server status before we proceed.
-# If it's a weird status, bail out and let the delete fire
-# eg: ERROR status can happen if there is no VM host with enough capacity for the request.
-ceph-${CEPH_PKG_TYPE}-${CEPH_DIST}-${CEPH_ARCH}-${CEPH_FLAVOR}-${CEPH_SHA1}: ${PKG_REPO}
- timeout $(TIMEOUT_SERVER_CREATE) openstack server create --image 'teuthology-${CEPH_OS_TYPE}-${CEPH_OS_VERSION}-${CEPH_ARCH}' ${OPENSTACK_NETWORK} --flavor ${BUILD_FLAVOR} --key-name teuthology --security-group teuthology --property ownedby=${MY_IP} --user-data ${CEPH_OS_TYPE}-${CEPH_OS_VERSION}-user-data.txt --wait $@
- set -ex ; \
- trap "openstack server delete --wait $@" EXIT ; \
- for delay in 30 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 ; do \
- status=$$(openstack server show -c status -f value $@) ; \
- case $$status in \
- ACTIVE) break ;; \
- NOSTATE|*BUILD|*BOOT|*RESIZE) sleep $$delay ;; \
- *) exit 1 ;; \
- esac ; \
- done ; \
- ip=$(call get_ip,$@) ; \
- test -n "$$ip" || exit ; \
- for delay in 1 2 4 8 8 8 8 8 8 8 8 8 16 16 16 16 16 32 32 32 64 128 256 512 ; do if ssh -o 'ConnectTimeout=3' $$ip bash -c '"grep -q READYTORUN /var/log/cloud-init*.log"' ; then break ; else sleep $$delay ; fi ; done ; \
- scp make-${CEPH_PKG_TYPE}.sh common.sh ubuntu@$$ip: ; \
- packages_repository=$(call get_ip,${<F}) ; \
- timeout $(TIMEOUT_BUILD) ssh -tt -A ubuntu@$$ip bash ./make-${CEPH_PKG_TYPE}.sh $$packages_repository ${CEPH_DIST} ${CEPH_GIT_URL} ${CEPH_SHA1} ${CEPH_FLAVOR} ${CEPH_ARCH}
- mkdir -p ${D}/${@D} ; touch ${D}/$@
-
-clobber:
- pkill ssh-agent || true
- rm -f ${HOME}/.ssh_agent
- rm -fr ${D}
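The get_ip macro near the top of this Makefile reduces the OpenStack addresses field to the first IPv4 address with a perl/grep/head pipeline. The same transformation, written as a standalone Python sketch and run on the example input from the comment, makes the steps easier to follow (illustration only; the Makefile itself keeps using the shell pipeline):

# Standalone illustration of the get_ip extraction: keep the first network's
# address list, split on spaces/commas, return the first token without ':'
# (i.e. skip IPv6 addresses, as the pipeline's grep -v ':' does).
import re

def first_ipv4(addresses):
    first_net = re.sub(r'^[^=]+=([^;]+).*', r'\1', addresses)
    for token in re.split(r'[ ,]+', first_net):
        if token and ':' not in token:
            return token
    return None

example = ('private-network=10.10.10.69, 2001:db8:6050:ed4d:f816:3eff:fed1:d9f8;'
           'net-name2=2001:db8::fe48:3b36, 2001:db8::1.2.3.4, 1.2.3.4;')
print(first_ipv4(example))  # 10.10.10.69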
+++ /dev/null
-#cloud-config
-bootcmd:
- - yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/6/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 && rm /etc/yum.repos.d/dl.fedoraproject.org*
- - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config
- - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo
-preserve_hostname: true
-system_info:
- default_user:
- name: ubuntu
-packages:
- - dracut-modules-growroot
-runcmd:
- - mkinitrd --force /boot/initramfs-2.6.32-573.3.1.el6.x86_64.img 2.6.32-573.3.1.el6.x86_64
- - reboot
-final_message: "READYTORUN"
+++ /dev/null
-user-data.txt
\ No newline at end of file
+++ /dev/null
-user-data.txt
\ No newline at end of file
+++ /dev/null
-user-data.txt
\ No newline at end of file
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-function install_deps() {
- git archive --remote=git://git.ceph.com/ceph.git master install-deps.sh | tar -xvf -
- #
- # drop the following hack when trusty is not supported anymore
- # there is no other way as long as we maintain a debian directory that tries
- # to be the same for all distributions
- #
- if grep --quiet 14.04 /etc/issue 2>/dev/null && sudo apt-get install --force-yes -qq -y dpkg-dev && test "$(dpkg-architecture -qDEB_BUILD_GNU_CPU 2>/dev/null)" = aarch64 ; then
- sed -i -e '/libgoogle-perftools-dev/d' debian/control
- fi
- bash -x install-deps.sh
-}
-
-function git_submodules() {
- # see http://tracker.ceph.com/issues/13426
- perl -pi -e 's|git://ceph.com/git/ceph-object-corpus.git|https://github.com/ceph/ceph-object-corpus.git|' .gitmodules
- local force=$(if git submodule usage 2>&1 | grep --quiet 'update.*--force'; then echo --force ; fi)
- git submodule sync || return 1
- git submodule update $force --init --recursive || return 1
-}
-
-function get_ceph() {
- local git_ceph_url=$1
- local sha1=$2
-
- test -d ceph || git clone ${git_ceph_url} ceph
- cd ceph
- if test -d src ; then # so we don't try to fetch when using a fixture
- git fetch --tags http://github.com/ceph/ceph
- fi
- git fetch --tags ${git_ceph_url}
- git checkout ${sha1}
-}
-
-function init_ceph() {
- local git_ceph_url=$1
- local sha1=$2
- get_ceph $git_ceph_url $sha1 || return 1
- git_submodules || return 1
- install_deps || return 1
-}
-
-function flavor2configure() {
- local flavor=$1
-
- eval $(dpkg-architecture)
-
- if test $flavor = notcmalloc || test "$DEB_HOST_GNU_CPU" = aarch64 ; then
- echo --without-tcmalloc --without-cryptopp
- fi
-}
-
-#
-# for a given $sha1 in the $ceph_dir repository, look up all references
-# from the remote origin and tags matching the sha1. Add a symbolic
-# link in $ref_dir to the $sha1 for each reference found. If the
-# reference is a tag, also add a symbolic link to the commit to which
-# the tag points, if it is an annotated tag.
-#
-function link_same() {
- local ref_dir=$1
- local ceph_dir=$2
- local sha1=$3
-
- mkdir -p $ref_dir
- (
- cd ${ceph_dir}
- git for-each-ref refs/tags/** refs/remotes/origin/** | grep $sha1 | \
- while read sha1 type ref ; do
- if test $type = 'tag' ; then
- commit_sha1=$(git rev-parse $ref^{commit})
- if test $commit_sha1 != $sha1 ; then
- echo ../sha1/$sha1 ../sha1/$commit_sha1
- fi
- fi
- echo ../sha1/$sha1 $(basename $ref)
- done
- ) | while read from to ; do
- ( cd $ref_dir ; ln -sf $from $to )
- done
-}
-
-function test_link_same() {
- local d=/tmp/link_same$$
- mkdir -p $d/primary
- cd $d/primary
- git init
- touch a ; git add a ; git commit -m 'm' a
- git tag tag1
- tag1=$(git rev-parse HEAD)
- git branch branch1
- touch b ; git add b ; git commit -m 'm' b
- git tag --annotate -m 'a' tag2
- tag2=$(git rev-parse tag2)
- sha1_tag2=$(git rev-parse tag2^{commit})
- git branch branch2
- touch c ; git add c ; git commit -m 'm' c
- git branch branch3
- sha1_branch3=$(git rev-parse branch3)
-
- git clone $d/primary $d/secondary
- cd $d/secondary
- mkdir $d/ref $d/sha1
-
- touch $d/sha1/$sha1_branch3
- link_same $d/ref $d/secondary $sha1_branch3
- test $(readlink --canonicalize $d/ref/branch3) = $d/sha1/$sha1_branch3 || return 1
- test $(readlink --canonicalize $d/ref/master) = $d/sha1/$sha1_branch3 || return 1
-
- touch $d/sha1/$tag2
- link_same $d/ref $d/secondary $tag2
- test $(readlink --canonicalize $d/ref/tag2) = $d/sha1/$tag2 || return 1
- test $(readlink --canonicalize $d/sha1/$sha1_tag2) = $d/sha1/$tag2 || return 1
-
- touch $d/sha1/$tag1
- link_same $d/ref $d/secondary $tag1
- test $(readlink --canonicalize $d/ref/tag1) = $d/sha1/$tag1 || return 1
- test $(readlink --canonicalize $d/ref/branch1) = $d/sha1/$tag1 || return 1
-
- rm -fr $d
-}
-
-function maybe_parallel() {
- local nproc=$1
- local vers=$2
-
- if echo $vers | grep --quiet '0\.67' ; then
- return
- fi
-
- if test $nproc -gt 1 ; then
- echo -j${nproc}
- fi
-}
-
-function test_maybe_parallel() {
- test "$(maybe_parallel 1 0.72)" = "" || return 1
- test "$(maybe_parallel 8 0.67)" = "" || return 1
- test "$(maybe_parallel 8 0.72)" = "-j8" || return 1
-}
-
-if test "$1" = "TEST" ; then
- shopt -s -o xtrace
- PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
- test_link_same
- test_maybe_parallel
-fi
+++ /dev/null
-#cloud-config
-bootcmd:
- - echo 'APT::Get::AllowUnauthenticated "true";' | tee /etc/apt/apt.conf.d/99disablesigs
- - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver
-manage_etc_hosts: true
-preserve_hostname: true
-system_info:
- default_user:
- name: ubuntu
-runcmd:
- - echo 'ubuntu ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
-final_message: "READYTORUN"
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-#
-# Create and upload a deb repository with the same naming conventions
-# as https://github.com/ceph/autobuild-ceph/blob/master/build-ceph-deb.sh
-#
-set -xe
-
-base=/tmp/release
-gitbuilder_host=$1
-codename=$2
-git_ceph_url=$3
-sha1=$4
-flavor=$5
-arch=$6
-
-sudo apt-get update
-sudo apt-get install -y git
-
-source $(dirname $0)/common.sh
-
-init_ceph $git_ceph_url $sha1
-
-#codename=$(lsb_release -sc)
-releasedir=$base/$(lsb_release -si)/WORKDIR
-#
-# git describe provides a version that is
-# a) human readable
-# b) unique for each commit
-# c) compares higher than any previous commit
-# d) contains the short hash of the commit
-#
-vers=$(git describe --match "v*" | sed s/^v//)
-#
-# always set the debian version to 1 which is ok because the debian
-# directory is included in the sources and the upstream version will
-# change each time it is modified.
-#
-dvers="$vers-1"
-: ${NPROC:=$(nproc)}
-ceph_dir=$(pwd)
-
-function build_package() {
-
- rm -fr $releasedir
- mkdir -p $releasedir
- #
- # remove all files not under git so they are not
- # included in the distribution.
- #
- git clean -qdxff
-
- fileext="gz"
- # autotools only works in jewel and below
- if [[ ! -e "make-dist" ]] ; then
- #
-# creating the distribution tarball requires some configure
- # options (otherwise parts of the source tree will be left out).
- #
- ./autogen.sh
- # Building with LTTNG on Ubuntu Precise is not possible.
- # It fails the LTTNG-is-sane check (it misses headers)
- # And the Debian rules files leave it out anyway
- case $codename in
- precise) lttng_opt="--without-lttng" ;;
- *) lttng_opt="--with-lttng" ;;
- esac
- ./configure $(flavor2configure $flavor) \
- --with-rocksdb --with-ocf \
- --with-nss --with-debug --enable-cephfs-java \
- $lttng_opt --with-babeltrace
- #
- # use distdir= to set the name of the top level directory of the
-# tarball to match the desired version
- #
- make distdir=ceph-$vers dist
- else
- ./make-dist
- fileext="bz2"
- fi
- #
-# rename the tarball to match debian conventions and extract it
- #
- mv ceph-$vers.tar.$fileext $releasedir/ceph_$vers.orig.tar.$fileext
- tar -C $releasedir -xf $releasedir/ceph_$vers.orig.tar.$fileext
- #
- # copy the debian directory over
- #
- cp -a debian $releasedir/ceph-$vers/debian
- cd $releasedir
- #
- # uncomment to remove -dbg packages
- # because they are large and take time to build
- #
- #perl -ni -e 'print if(!(/^Package: .*-dbg$/../^$/))' ceph-$vers/debian/control
- #perl -pi -e 's/--dbg-package.*//' ceph-$vers/debian/rules
- #
- # update the changelog to match the desired version
- #
- cd ceph-$vers
- local chvers=$(head -1 debian/changelog | perl -ne 's/.*\(//; s/\).*//; print')
- if [ "$chvers" != "$dvers" ]; then
- DEBEMAIL="contact@ceph.com" dch -D $codename --force-distribution -b -v "$dvers" "new version"
- fi
- #
- # create the packages (with ccache)
- #
- export CEPH_EXTRA_CONFIGURE_ARGS=$(flavor2configure $flavor)
- j=$(maybe_parallel $NPROC $vers)
- PATH=/usr/lib/ccache:$PATH dpkg-buildpackage $j -uc -us -sa
-}
-
-function build_repo() {
- local gitbuilder_host=$1
-
- sudo apt-get install -y reprepro
- cd ${releasedir}/..
- #
- # Create a repository in a directory with a name structured
- # as
- #
- base=ceph-deb-$codename-$arch-$flavor
- sha1_dir=$codename/$base/sha1/$sha1
- mkdir -p $sha1_dir/conf
- cat > $sha1_dir/conf/distributions <<EOF
-Codename: $codename
-Suite: stable
-Components: main
-Architectures: i386 amd64 arm64 source
-EOF
- reprepro --basedir $sha1_dir include $codename WORKDIR/*.changes
- echo $dvers > $sha1_dir/version
- echo $sha1 > $sha1_dir/sha1
- link_same $codename/$base/ref $ceph_dir $sha1
- if test "$gitbuilder_host" ; then
- cd $codename
- sudo apt-get install -y rsync
- RSYNC_RSH='ssh -o StrictHostKeyChecking=false' rsync -av $base/ $gitbuilder_host:/usr/share/nginx/html/$base/
- fi
-}
-
-build_package
-build_repo $gitbuilder_host
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-#
-# Create and upload a RPM repository with the same naming conventions
-# as https://github.com/ceph/autobuild-ceph/blob/master/build-ceph-rpm.sh
-#
-
-set -xe
-
-base=/tmp/release
-gitbuilder_host=$1
-codename=$2
-git_ceph_url=$3
-sha1=$4
-flavor=$5
-arch=$6
-
-suse=false
-[[ $codename =~ suse ]] && suse=true
-
-if [ "$suse" = true ] ; then
- sudo zypper -n install git
-else
- sudo yum install -y git
-fi
-
-source $(dirname $0)/common.sh
-
-init_ceph $git_ceph_url $sha1
-
-distro=$( source /etc/os-release ; echo $ID )
-distro_version=$( source /etc/os-release ; echo $VERSION )
-releasedir=$base/$distro/WORKDIR
-#
-# git describe provides a version that is
-# a) human readable
-# b) unique for each commit
-# c) compares higher than any previous commit
-# d) contains the short hash of the commit
-#
-vers=$(git describe --match "v*" | sed s/^v//)
-ceph_dir=$(pwd)
-
-#
-# Create a repository in a directory with a name structured
-# as
-#
-base=ceph-rpm-$codename-$arch-$flavor
-
-function setup_rpmmacros() {
- if ! grep -q find_debuginfo_dwz_opts $HOME/.rpmmacros ; then
- echo '%_find_debuginfo_dwz_opts %{nil}' >> $HOME/.rpmmacros
- fi
- if [ "x${distro}x" = "xcentosx" ] && echo $distro_version | grep -q '7' ; then
- if ! grep -q '%dist .el7' $HOME/.rpmmacros ; then
- echo '%dist .el7' >> $HOME/.rpmmacros
- fi
- fi
-}
-
-function build_package() {
- rm -fr $releasedir
- mkdir -p $releasedir
- #
- # remove all files not under git so they are not
- # included in the distribution.
- #
- git clean -qdxff
- #
-# creating the distribution tarball requires some configure
- # options (otherwise parts of the source tree will be left out).
- #
- if [ "$suse" = true ] ; then
- sudo zypper -n install bzip2
- else
- sudo yum install -y bzip2
- fi
- # autotools only works in jewel and below
- if [[ ! -e "make-dist" ]] ; then
- ./autogen.sh
- ./configure $(flavor2configure $flavor) --with-debug --with-radosgw --with-fuse --with-libatomic-ops --with-gtk2 --with-nss
-
- #
- # use distdir= to set the name of the top level directory of the
-# tarball to match the desired version
- #
- make dist-bzip2
- else
- # kraken and above
- ./make-dist
- fi
- # Set up build area
- setup_rpmmacros
- if [ "$suse" = true ] ; then
- sudo zypper -n install rpm-build
- else
- sudo yum install -y rpm-build
- fi
- local buildarea=$releasedir
- mkdir -p ${buildarea}/SOURCES
- mkdir -p ${buildarea}/SRPMS
- mkdir -p ${buildarea}/SPECS
- cp ceph.spec ${buildarea}/SPECS
- mkdir -p ${buildarea}/RPMS
- mkdir -p ${buildarea}/BUILD
- CEPH_TARBALL=( ceph-*.tar.bz2 )
- cp -a $CEPH_TARBALL ${buildarea}/SOURCES/.
- cp -a rpm/*.patch ${buildarea}/SOURCES || true
- (
- cd ${buildarea}/SPECS
- ccache=$(echo /usr/lib*/ccache)
- # Build RPMs
- if [ "$suse" = true ]; then
- sed -i -e '0,/%package/s//%debug_package\n&/' \
- -e 's/%{epoch}://g' \
- -e '/^Epoch:/d' \
- -e 's/%bcond_with ceph_test_package/%bcond_without ceph_test_package/' \
- -e "s/^Source0:.*$/Source0: $CEPH_TARBALL/" \
- ceph.spec
- fi
- buildarea=`readlink -fn ${releasedir}` ### rpm wants absolute path
- PATH=$ccache:$PATH rpmbuild -ba --define "_unpackaged_files_terminate_build 0" --define "_topdir ${buildarea}" ceph.spec
- )
-}
-
-function build_rpm_release() {
- local buildarea=$1
- local sha1=$2
- local gitbuilder_host=$3
- local base=$4
-
- cat <<EOF > ${buildarea}/SPECS/ceph-release.spec
-Name: ceph-release
-Version: 1
-Release: 0%{?dist}
-Summary: Ceph repository configuration
-Group: System Environment/Base
-License: GPLv2
-URL: http://gitbuilder.ceph.com/$dist
-Source0: ceph.repo
-#Source0: RPM-GPG-KEY-CEPH
-#Source1: ceph.repo
-BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
-BuildArch: noarch
-
-%description
-This package contains the Ceph repository GPG key as well as configuration
-for yum and up2date.
-
-%prep
-
-%setup -q -c -T
-install -pm 644 %{SOURCE0} .
-#install -pm 644 %{SOURCE1} .
-
-%build
-
-%install
-rm -rf %{buildroot}
-#install -Dpm 644 %{SOURCE0} \
-# %{buildroot}/%{_sysconfdir}/pki/rpm-gpg/RPM-GPG-KEY-CEPH
-%if 0%{defined suse_version}
-install -dm 755 %{buildroot}/%{_sysconfdir}/zypp
-install -dm 755 %{buildroot}/%{_sysconfdir}/zypp/repos.d
-install -pm 644 %{SOURCE0} \
- %{buildroot}/%{_sysconfdir}/zypp/repos.d
-%else
-install -dm 755 %{buildroot}/%{_sysconfdir}/yum.repos.d
-install -pm 644 %{SOURCE0} \
- %{buildroot}/%{_sysconfdir}/yum.repos.d
-%endif
-
-%clean
-#rm -rf %{buildroot}
-
-%post
-
-%postun
-
-%files
-%defattr(-,root,root,-)
-#%doc GPL
-%if 0%{defined suse_version}
-/etc/zypp/repos.d/*
-%else
-/etc/yum.repos.d/*
-%endif
-#/etc/pki/rpm-gpg/*
-
-%changelog
-* Tue Mar 10 2013 Gary Lowell <glowell@inktank.com> - 1-0
-- Handle both yum and zypper
-- Use URL to ceph git repo for key
-- remove config attribute from repo file
-* Tue Aug 27 2012 Gary Lowell <glowell@inktank.com> - 1-0
-- Initial Package
-EOF
-
- cat <<EOF > $buildarea/SOURCES/ceph.repo
-[Ceph]
-name=Ceph packages for \$basearch
-baseurl=http://${gitbuilder_host}/${base}/sha1/${sha1}/\$basearch
-enabled=1
-gpgcheck=0
-type=rpm-md
-
-[Ceph-noarch]
-name=Ceph noarch packages
-baseurl=http://${gitbuilder_host}/${base}/sha1/${sha1}/noarch
-enabled=1
-gpgcheck=0
-type=rpm-md
-
-[ceph-source]
-name=Ceph source packages
-baseurl=http://${gitbuilder_host}/${base}/sha1/${sha1}/SRPMS
-enabled=1
-gpgcheck=0
-type=rpm-md
-EOF
-
- rpmbuild -bb --define "_topdir ${buildarea}" ${buildarea}/SPECS/ceph-release.spec
-}
-
-function build_rpm_repo() {
- local buildarea=$1
- local gitbuilder_host=$2
- local base=$3
-
- if [ "$suse" = true ] ; then
- sudo zypper -n install createrepo
- else
- sudo yum install -y createrepo
- fi
-
- for dir in ${buildarea}/SRPMS ${buildarea}/RPMS/*
- do
- createrepo ${dir}
- done
-
- local sha1_dir=${buildarea}/../$codename/$base/sha1/$sha1
- mkdir -p $sha1_dir
- echo $vers > $sha1_dir/version
- echo $sha1 > $sha1_dir/sha1
- echo ceph > $sha1_dir/name
-
- for dir in ${buildarea}/SRPMS ${buildarea}/RPMS/*
- do
- cp -fla ${dir} $sha1_dir
- done
-
- link_same ${buildarea}/../$codename/$base/ref $ceph_dir $sha1
- if test "$gitbuilder_host" ; then
- (
- cd ${buildarea}/../$codename
- RSYNC_RSH='ssh -o StrictHostKeyChecking=false' rsync -av $base/ ubuntu@$gitbuilder_host:/usr/share/nginx/html/$base/
- )
- fi
-}
-
-setup_rpmmacros
-build_package
-build_rpm_release $releasedir $sha1 $gitbuilder_host $base
-build_rpm_repo $releasedir $gitbuilder_host $base
+++ /dev/null
-#cloud-config
-bootcmd:
- - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver
-manage_etc_hosts: true
-preserve_hostname: true
-users:
- - name: ubuntu
- gecos: User
- sudo: ["ALL=(ALL) NOPASSWD:ALL"]
- groups: users
-runcmd:
- - ( MYHOME=/home/ubuntu ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R ubuntu.users $MYHOME/.ssh )
-final_message: "READYTORUN"
+++ /dev/null
-user-data.txt
\ No newline at end of file
+++ /dev/null
-user-data.txt
\ No newline at end of file
+++ /dev/null
-user-data.txt
\ No newline at end of file
+++ /dev/null
-#cloud-config
-bootcmd:
- - echo 'APT::Get::AllowUnauthenticated "true";' | tee /etc/apt/apt.conf.d/99disablesigs
- - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver
-manage_etc_hosts: true
-preserve_hostname: true
-system_info:
- default_user:
- name: ubuntu
-final_message: "READYTORUN"
+++ /dev/null
-import contextlib
-import logging
-import os
-import textwrap
-import yaml
-
-from cStringIO import StringIO
-from teuthology import contextutil
-from teuthology import misc
-from teuthology import packaging
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-# extra stuff we need to do our job here
-EXTRA_PKGS = [
- 'git',
-]
-
-# stuff that would be in a devmode install, but should be
-# installed in the system for running nosetests against
-# a production install.
-EXTRA_NOSETEST_PKGS = [
- 'python-psutil',
- 'python-mock',
-]
-
-
-def find_client0(cluster):
- ''' Find remote that has client.0 role, or None '''
- for rem, roles in cluster.remotes.iteritems():
- if 'client.0' in roles:
- return rem
- return None
-
-
-def pip(remote, package, venv=None, uninstall=False, force=False):
- ''' {un}install a package with pip, possibly in a virtualenv '''
- if venv:
- pip = os.path.join(venv, 'bin', 'pip')
- args = ['sudo', pip]
- else:
- args = ['sudo', 'pip']
-
- if uninstall:
- args.extend(['uninstall', '-y'])
- else:
- args.append('install')
- if force:
- args.append('-I')
-
- args.append(package)
- remote.run(args=args)
-
-
-@contextlib.contextmanager
-def install_epel(remote):
- ''' install a disabled-by-default epel repo config file '''
- remove = False
- try:
- if remote.os.package_type == 'deb':
- yield
- else:
- remove = True
- distromajor = remote.os.version.split('.')[0]
-
- repofiledata = textwrap.dedent('''
- [epel]
- name=epel{version}
- metalink=http://mirrors.fedoraproject.org/metalink?repo=epel-{version}&arch=$basearch
- enabled=0
- gpgcheck=0
- ''').format(version=distromajor)
-
- misc.create_file(remote, '/etc/yum.repos.d/epel.repo',
- data=repofiledata, sudo=True)
- remote.run(args='sudo yum clean all')
- yield
-
- finally:
- if remove:
- misc.delete_file(remote, '/etc/yum.repos.d/epel.repo', sudo=True)
-
-
-def enable_epel(remote, enable=True):
- ''' enable/disable the epel repo '''
- args = 'sudo sed -i'.split()
- if enable:
- args.extend(['s/enabled=0/enabled=1/'])
- else:
- args.extend(['s/enabled=1/enabled=0/'])
- args.extend(['/etc/yum.repos.d/epel.repo'])
-
- remote.run(args=args)
- remote.run(args='sudo yum clean all')
-
-
-@contextlib.contextmanager
-def install_extra_pkgs(client):
- ''' Install EXTRA_PKGS '''
- try:
- for pkg in EXTRA_PKGS:
- packaging.install_package(pkg, client)
- yield
-
- finally:
- for pkg in EXTRA_PKGS:
- packaging.remove_package(pkg, client)
-
-
-@contextlib.contextmanager
-def clone_calamari(config, client):
- ''' clone calamari source into current directory on remote '''
- branch = config.get('calamari_branch', 'master')
- url = config.get('calamari_giturl', 'git://github.com/ceph/calamari')
- try:
- out = StringIO()
- # ensure branch is present (clone -b will succeed even if
- # the branch doesn't exist, falling back to master)
- client.run(
- args='git ls-remote %s %s' % (url, branch),
- stdout=out,
- label='check for calamari branch %s existence' % branch
- )
- if len(out.getvalue()) == 0:
- raise RuntimeError("Calamari branch %s doesn't exist" % branch)
- client.run(args='git clone -b %s %s' % (branch, url))
- yield
- finally:
- # sudo python setup.py develop may have left some root files around
- client.run(args='sudo rm -rf calamari')
-
-
-@contextlib.contextmanager
-def write_info_yaml(cluster, client):
- ''' write info.yaml to client for nosetests '''
- try:
- info = {
- 'cluster': {
- rem.name: {'roles': roles}
- for rem, roles in cluster.remotes.iteritems()
- }
- }
- misc.create_file(client, 'calamari/info.yaml',
- data=yaml.safe_dump(info, default_flow_style=False))
- yield
- finally:
- misc.delete_file(client, 'calamari/info.yaml')
-
-
-@contextlib.contextmanager
-def write_test_conf(client):
- ''' write calamari/tests/test.conf to client for nosetests '''
- try:
- testconf = textwrap.dedent('''
- [testing]
-
- calamari_control = external
- ceph_control = external
- bootstrap = False
- api_username = admin
- api_password = admin
- embedded_timeout_factor = 1
- external_timeout_factor = 3
- external_cluster_path = info.yaml
- ''')
- misc.create_file(client, 'calamari/tests/test.conf', data=testconf)
- yield
-
- finally:
- misc.delete_file(client, 'calamari/tests/test.conf')
-
-
-@contextlib.contextmanager
-def prepare_nosetest_env(client):
- try:
- # extra dependencies that would be in the devmode venv
- if client.os.package_type == 'rpm':
- enable_epel(client, enable=True)
- for package in EXTRA_NOSETEST_PKGS:
- packaging.install_package(package, client)
- if client.os.package_type == 'rpm':
- enable_epel(client, enable=False)
-
- # install nose itself into the calamari venv, force it in case it's
- # already installed in the system, so we can invoke it by path without
- # fear that it's not present
- pip(client, 'nose', venv='/opt/calamari/venv', force=True)
-
- # install a later version of requests into the venv as well
- # (for precise)
- pip(client, 'requests', venv='/opt/calamari/venv', force=True)
-
- # link (setup.py develop) calamari/rest-api into the production venv
- # because production does not include calamari_rest.management, needed
- # for test_rest_api.py's ApiIntrospection
- args = 'cd calamari/rest-api'.split() + [run.Raw(';')] + \
- 'sudo /opt/calamari/venv/bin/python setup.py develop'.split()
- client.run(args=args)
-
- # because, at least in Python 2.6/Centos, site.py uses
- # 'os.path.exists()' to process .pth file entries, and exists() uses
- # access(2) to check for existence, all the paths leading up to
- # $HOME/calamari/rest-api need to be searchable by all users of
- # the package, which will include the WSGI/Django app, running
- # as the Apache user. So make them all world-read-and-execute.
- args = 'sudo chmod a+x'.split() + \
- ['.', './calamari', './calamari/rest-api']
- client.run(args=args)
-
- # make one dummy request just to get the WSGI app to do
- # all its log creation here, before the chmod below (I'm
- # looking at you, graphite -- /var/log/calamari/info.log and
- # /var/log/calamari/exception.log)
- client.run(args='wget -q -O /dev/null http://localhost')
-
- # /var/log/calamari/* is root-or-apache write-only
- client.run(args='sudo chmod a+w /var/log/calamari/*')
-
- yield
-
- finally:
- args = 'cd calamari/rest-api'.split() + [run.Raw(';')] + \
- 'sudo /opt/calamari/venv/bin/python setup.py develop -u'.split()
- client.run(args=args)
- for pkg in ('nose', 'requests'):
- pip(client, pkg, venv='/opt/calamari/venv', uninstall=True)
- for package in EXTRA_NOSETEST_PKGS:
- packaging.remove_package(package, client)
-
-
-@contextlib.contextmanager
-def run_nosetests(client):
- ''' Actually run the tests '''
- args = [
- 'cd',
- 'calamari',
- run.Raw(';'),
- 'CALAMARI_CONFIG=/etc/calamari/calamari.conf',
- '/opt/calamari/venv/bin/nosetests',
- '-v',
- 'tests/',
- ]
- client.run(args=args)
- yield
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run Calamari tests against an instance set up by 'calamari_server'.
-
- -- clone the Calamari source into $HOME (see options)
- -- write calamari/info.yaml describing the cluster
- -- write calamari/tests/test.conf containing
- 'external' for calamari_control and ceph_control
- 'bootstrap = False' to disable test bootstrapping (installing minions)
- no api_url necessary (inferred from client.0)
- 'external_cluster_path = info.yaml'
- -- modify the production Calamari install to allow test runs:
- install nose in the venv
- install EXTRA_NOSETEST_PKGS
- link in, with setup.py develop, calamari_rest (for ApiIntrospection)
- -- set CALAMARI_CONFIG to point to /etc/calamari/calamari.conf
- -- nosetests -v tests/
-
- Options are:
- calamari_giturl: url from which to git clone calamari
- (default: git://github.com/ceph/calamari)
- calamari_branch: git branch of calamari to check out
- (default: master)
-
- Note: the tests must find a clean cluster, so don't forget to
- set the crush default type appropriately, or to provision at least min_size OSD hosts
- """
- client0 = find_client0(ctx.cluster)
- if client0 is None:
- raise RuntimeError("must have client.0 role")
-
- with contextutil.nested(
- lambda: install_epel(client0),
- lambda: install_extra_pkgs(client0),
- lambda: clone_calamari(config, client0),
- lambda: write_info_yaml(ctx.cluster, client0),
- lambda: write_test_conf(client0),
- lambda: prepare_nosetest_env(client0),
- lambda: run_nosetests(client0),
- ):
- yield
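-
-# A minimal sketch (not from the original file) of how this task might appear
-# in a suite yaml, assuming it is registered as 'calamari_nosetests' -- the
-# registered task name is not shown in this file and is an assumption here:
-#
-#   tasks:
-#   - calamari_nosetests:
-#       calamari_giturl: git://github.com/ceph/calamari
-#       calamari_branch: wip-some-branch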
+++ /dev/null
-"""
-Calamari setup task
-"""
-import contextlib
-import logging
-import os
-import requests
-import shutil
-import webbrowser
-
-from cStringIO import StringIO
-from teuthology.orchestra import run
-from teuthology import contextutil
-from teuthology import misc
-
-log = logging.getLogger(__name__)
-
-
-DEFAULTS = {
- 'version': 'v0.80.9',
- 'test_image': None,
- 'start_browser': False,
- 'email': 'x@y.com',
- 'no_epel': True,
- 'calamari_user': 'admin',
- 'calamari_password': 'admin',
-}
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Do the setup of a calamari server.
-
- - calamari_setup:
- version: 'v80.1'
- test_image: <path to tarball or iso>
-
- Options are (see DEFAULTS above):
-
- version -- ceph version we are testing against
- test_image -- Can be an HTTP URL, in which case it is fetched from that
- path; it can also be a local path
- start_browser -- If True, start a browser. To be used by runs that will
- bring up a browser quickly for human use. Set to False
- for overnight suites that are testing for problems in
- the installation itself
- email -- email address for the user
- no_epel -- indicates if we should remove epel files prior to yum
- installations.
- calamari_user -- user name to log into gui
- calamari_password -- calamari user password
- """
- local_config = DEFAULTS
- local_config.update(config)
- config = local_config
- cal_svr = None
- for remote_, roles in ctx.cluster.remotes.items():
- if 'client.0' in roles:
- cal_svr = remote_
- break
- if not cal_svr:
- raise RuntimeError('client.0 not found in roles')
- with contextutil.nested(
- lambda: adjust_yum_repos(ctx, cal_svr, config['no_epel']),
- lambda: calamari_install(config, cal_svr),
- lambda: ceph_install(ctx, cal_svr),
- # do it again because ceph-deploy installed epel for centos
- lambda: remove_epel(ctx, config['no_epel']),
- lambda: calamari_connect(ctx, cal_svr),
- lambda: browser(config['start_browser'], cal_svr.hostname),
- ):
- yield
-
-
-@contextlib.contextmanager
-def adjust_yum_repos(ctx, cal_svr, no_epel):
- """
- For each remote machine, fix the repos if yum is used.
- """
- ice_distro = str(cal_svr.os)
- if ice_distro.startswith('rhel') or ice_distro.startswith('centos'):
- if no_epel:
- for remote in ctx.cluster.remotes:
- fix_yum_repos(remote, ice_distro)
- try:
- yield
- finally:
- if ice_distro.startswith('rhel') or ice_distro.startswith('centos'):
- if no_epel:
- for remote in ctx.cluster.remotes:
- restore_yum_repos(remote)
-
-
-def restore_yum_repos(remote):
- """
- Copy the old saved repo back in.
- """
- if remote.run(args=['sudo', 'rm', '-rf', '/etc/yum.repos.d']).exitstatus:
- return False
- if remote.run(args=['sudo', 'mv', '/etc/yum.repos.d.old',
- '/etc/yum.repos.d']).exitstatus:
- return False
-
-
-def fix_yum_repos(remote, distro):
- """
- For yum calamari installations, the repos.d directory should only
- contain a repo file named rhel<version-number>.repo
- """
- if distro.startswith('centos'):
- # hack alert: detour: install lttng for ceph
- # this works because epel is preinstalled on the vpms
- # this is not a generic solution
- # this is here solely to test the one-off 1.3.0 release for centos6
- remote.run(args="sudo yum -y install lttng-tools")
- cmds = [
- 'sudo mkdir /etc/yum.repos.d.old'.split(),
- ['sudo', 'cp', run.Raw('/etc/yum.repos.d/*'),
- '/etc/yum.repos.d.old'],
- ['sudo', 'rm', run.Raw('/etc/yum.repos.d/epel*')],
- ]
- for cmd in cmds:
- if remote.run(args=cmd).exitstatus:
- return False
- else:
- cmds = [
- 'sudo mv /etc/yum.repos.d /etc/yum.repos.d.old'.split(),
- 'sudo mkdir /etc/yum.repos.d'.split(),
- ]
- for cmd in cmds:
- if remote.run(args=cmd).exitstatus:
- return False
-
- # map "distroversion" from Remote.os to a tuple of
- # (repo title, repo name descriptor, apt-mirror repo path chunk)
- yum_repo_params = {
- 'rhel 6.4': ('rhel6-server', 'RHEL', 'rhel6repo-server'),
- 'rhel 6.5': ('rhel6-server', 'RHEL', 'rhel6repo-server'),
- 'rhel 7.0': ('rhel7-server', 'RHEL', 'rhel7repo/server'),
- }
- repotitle, reponame, path = yum_repo_params[distro]
- repopath = '/etc/yum.repos.d/%s.repo' % repotitle
- # TO DO: Make this data configurable too
- repo_contents = '\n'.join(
- ('[%s]' % repotitle,
- 'name=%s $releasever - $basearch' % reponame,
- 'baseurl=http://apt-mirror.front.sepia.ceph.com/' + path,
- 'gpgcheck=0',
- 'enabled=1')
- )
- misc.sudo_write_file(remote, repopath, repo_contents)
- cmds = [
- 'sudo yum clean all'.split(),
- 'sudo yum makecache'.split(),
- ]
- for cmd in cmds:
- if remote.run(args=cmd).exitstatus:
- return False
- return True
-
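-# For illustration: with distro 'rhel 7.0' the code above writes
-# /etc/yum.repos.d/rhel7-server.repo with the following contents:
-#
-#   [rhel7-server]
-#   name=RHEL $releasever - $basearch
-#   baseurl=http://apt-mirror.front.sepia.ceph.com/rhel7repo/server
-#   gpgcheck=0
-#   enabled=1
-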
-
-@contextlib.contextmanager
-def remove_epel(ctx, no_epel):
- """
- Just remove epel. No undo; assumed to be used after
- adjust_yum_repos, and relies on its state-save/restore.
- """
- if no_epel:
- for remote in ctx.cluster.remotes:
- if remote.os.name.startswith('centos'):
- remote.run(args=[
- 'sudo', 'rm', '-f', run.Raw('/etc/yum.repos.d/epel*')
- ])
- try:
- yield
- finally:
- pass
-
-
-def get_iceball_with_http(url, destdir):
- '''
- Fetch the iceball over http into destdir (either a .tar.gz or an .iso).
- '''
- # stream=True means we don't download until copyfileobj below,
- # and don't need a temp file
- r = requests.get(url, stream=True)
- if not r.ok:
- raise RuntimeError("Failed to download %s", str(url))
- filename = os.path.join(destdir, url.split('/')[-1])
- with open(filename, 'w') as f:
- shutil.copyfileobj(r.raw, f)
- log.info('saved %s as %s' % (url, filename))
- return filename
-
-
-@contextlib.contextmanager
-def calamari_install(config, cal_svr):
- """
- Install calamari
-
- The steps here are:
- -- Get the iceball, locally or from http
- -- Copy the iceball to the calamari server, and untar/mount it.
- -- Run ice-setup on the calamari server.
- -- Run calamari-ctl initialize.
- """
- client_id = str(cal_svr)
- at_loc = client_id.find('@')
- if at_loc > 0:
- client_id = client_id[at_loc + 1:]
-
- test_image = config['test_image']
-
- if not test_image:
- raise RuntimeError('Must supply test image')
- log.info('calamari test image: %s' % test_image)
- delete_iceball = False
-
- if test_image.startswith('http'):
- iceball_file = get_iceball_with_http(test_image, '/tmp')
- delete_iceball = True
- else:
- iceball_file = test_image
-
- remote_iceball_file = os.path.join('/tmp', os.path.split(iceball_file)[1])
- cal_svr.put_file(iceball_file, remote_iceball_file)
- if iceball_file.endswith('.tar.gz'): # XXX specify tar/iso in config?
- icetype = 'tarball'
- elif iceball_file.endswith('.iso'):
- icetype = 'iso'
- else:
- raise RuntimeError("Can't handle iceball {0}".format(iceball_file))
-
- if icetype == 'tarball':
- ret = cal_svr.run(args=['gunzip', run.Raw('<'), remote_iceball_file,
- run.Raw('|'), 'tar', 'xvf', run.Raw('-')])
- if ret.exitstatus:
- raise RuntimeError('remote iceball untar failed')
- elif icetype == 'iso':
- mountpoint = '/mnt/' # XXX create?
- ret = cal_svr.run(
- args=['sudo', 'mount', '-o', 'loop', '-r',
- remote_iceball_file, mountpoint]
- )
-
- # install ice_setup package
- args = {
- 'deb': 'sudo dpkg -i /mnt/ice-setup*deb',
- 'rpm': 'sudo yum -y localinstall /mnt/ice_setup*rpm'
- }.get(cal_svr.system_type, None)
- if not args:
- raise RuntimeError('{0}: unknown system type'.format(cal_svr))
- ret = cal_svr.run(args=args)
- if ret.exitstatus:
- raise RuntimeError('ice_setup package install failed')
-
- # Run ice_setup
- icesetdata = 'yes\n\n%s\nhttp\n' % client_id
- ice_in = StringIO(icesetdata)
- ice_out = StringIO()
- if icetype == 'tarball':
- args = 'sudo python ice_setup.py'
- else:
- args = 'sudo ice_setup -d /mnt'
- ret = cal_svr.run(args=args, stdin=ice_in, stdout=ice_out)
- log.debug(ice_out.getvalue())
- if ret.exitstatus:
- raise RuntimeError('ice_setup failed')
-
- # Run calamari-ctl initialize.
- icesetdata = '%s\n%s\n%s\n%s\n' % (
- config['calamari_user'],
- config['email'],
- config['calamari_password'],
- config['calamari_password'],
- )
- ice_in = StringIO(icesetdata)
- ret = cal_svr.run(args=['sudo', 'calamari-ctl', 'initialize'],
- stdin=ice_in, stdout=ice_out)
- log.debug(ice_out.getvalue())
- if ret.exitstatus:
- raise RuntimeError('calamari-ctl initialize failed')
- try:
- yield
- finally:
- log.info('Cleaning up after Calamari installation')
- if icetype == 'iso':
- cal_svr.run(args=['sudo', 'umount', mountpoint])
- if delete_iceball:
- os.unlink(iceball_file)
-
-
-@contextlib.contextmanager
-def ceph_install(ctx, cal_svr):
- """
- Install ceph if ceph was not previously installed by teuthology. This
- code tests the case where calamari is installed on a brand new system.
- """
- loc_inst = False
- if 'install' not in [x.keys()[0] for x in ctx.config['tasks']]:
- loc_inst = True
- ret = deploy_ceph(ctx, cal_svr)
- if ret:
- raise RuntimeError('ceph installs failed')
- try:
- yield
- finally:
- if loc_inst:
- if not undeploy_ceph(ctx, cal_svr):
- log.error('Cleanup of Ceph installed by Calamari-setup failed')
-
-
-def deploy_ceph(ctx, cal_svr):
- """
- Perform the ceph-deploy actions needed to bring up a Ceph cluster. This
- test is needed to check the ceph-deploy that comes with the calamari
- package.
- """
- osd_to_name = {}
- all_machines = set()
- all_mons = set()
- all_osds = set()
-
- # collect which remotes are osds and which are mons
- for remote in ctx.cluster.remotes:
- all_machines.add(remote.shortname)
- roles = ctx.cluster.remotes[remote]
- for role in roles:
- daemon_type, number = role.split('.')
- if daemon_type == 'osd':
- all_osds.add(remote.shortname)
- osd_to_name[number] = remote.shortname
- if daemon_type == 'mon':
- all_mons.add(remote.shortname)
-
- # figure out whether we're in "1.3+" mode: prior to 1.3, there was
- # only one Ceph repo, and it was all installed on every Ceph host.
- # with 1.3, we've split that into MON and OSD repos (in order to
- # be able to separately track subscriptions per-node). This
- # requires new switches to ceph-deploy to select which locally-served
- # repo is connected to which cluster host.
- #
- # (TODO: A further issue is that the installation/setup may not have
- # created local repos at all, but that is the subject of a future
- # change.)
-
- r = cal_svr.run(args='/usr/bin/test -d /mnt/MON', check_status=False)
- use_install_repo = (r.exitstatus == 0)
-
- # pre-1.3:
- # ceph-deploy new <all_mons>
- # ceph-deploy install <all_machines>
- # ceph-deploy mon create-initial
- #
- # 1.3 and later:
- # ceph-deploy new <all_mons>
- # ceph-deploy install --repo --release=ceph-mon <all_mons>
- # ceph-deploy install <all_mons>
- # ceph-deploy install --repo --release=ceph-osd <all_osds>
- # ceph-deploy install <all_osds>
- # ceph-deploy mon create-initial
- #
- # one might think the install <all_mons> and install <all_osds>
- # commands would need --mon and --osd, but #12147 has not yet
- # made it into RHCS 1.3.0; since the package split also hasn't
- # landed, we can avoid using the flag and avoid the bug.
-
- cmds = ['ceph-deploy new ' + ' '.join(all_mons)]
-
- if use_install_repo:
- cmds.append('ceph-deploy repo ceph-mon ' +
- ' '.join(all_mons))
- cmds.append('ceph-deploy install --no-adjust-repos --mon ' +
- ' '.join(all_mons))
- cmds.append('ceph-deploy repo ceph-osd ' +
- ' '.join(all_osds))
- cmds.append('ceph-deploy install --no-adjust-repos --osd ' +
- ' '.join(all_osds))
- # We tell users to use `hostname` in our docs. Do the same here.
- cmds.append('ceph-deploy install --no-adjust-repos --cli `hostname`')
- else:
- cmds.append('ceph-deploy install ' + ' '.join(all_machines))
-
- cmds.append('ceph-deploy mon create-initial')
-
- for cmd in cmds:
- cal_svr.run(args=cmd).exitstatus
-
- disk_labels = '_dcba'
- # NEEDS WORK assumes disks start with vd (need to check this somewhere)
- for cmd_pts in [['disk', 'zap'], ['osd', 'prepare'], ['osd', 'activate']]:
- mach_osd_cnt = {}
- for osdn in osd_to_name:
- osd_mac = osd_to_name[osdn]
- mach_osd_cnt[osd_mac] = mach_osd_cnt.get(osd_mac, 0) + 1
- arg_list = ['ceph-deploy']
- arg_list.extend(cmd_pts)
- disk_id = '%s:vd%s' % (osd_to_name[osdn],
- disk_labels[mach_osd_cnt[osd_mac]])
- if 'activate' in cmd_pts:
- disk_id += '1'
- arg_list.append(disk_id)
- cal_svr.run(args=arg_list).exitstatus
-
-
-def undeploy_ceph(ctx, cal_svr):
- """
- Cleanup deployment of ceph.
- """
- all_machines = []
- ret = True
- for remote in ctx.cluster.remotes:
- roles = ctx.cluster.remotes[remote]
- if (
- not any('osd' in role for role in roles) and
- not any('mon' in role for role in roles)
- ):
- continue
- ret &= remote.run(
- args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
- 'sudo', 'service', 'ceph', 'stop']
- ).exitstatus
- all_machines.append(remote.shortname)
- all_machines = set(all_machines)
- cmd1 = ['ceph-deploy', 'uninstall']
- cmd1.extend(all_machines)
- ret &= cal_svr.run(args=cmd1).exitstatus
- cmd2 = ['ceph-deploy', 'purge']
- cmd2.extend(all_machines)
- ret &= cal_svr.run(args=cmd2).exitstatus
- for remote in ctx.cluster.remotes:
- ret &= remote.run(args=['sudo', 'rm', '-rf',
- '.ssh/known_hosts']).exitstatus
- return ret
-
-
-@contextlib.contextmanager
-def calamari_connect(ctx, cal_svr):
- """
- Connect calamari to the ceph nodes.
- """
- connects = ['ceph-deploy', 'calamari', 'connect']
- for machine_info in ctx.cluster.remotes:
- if 'client.0' not in ctx.cluster.remotes[machine_info]:
- connects.append(machine_info.shortname)
- ret = cal_svr.run(args=connects)
- if ret.exitstatus:
- raise RuntimeError('calamari connect failed')
- try:
- yield
- finally:
- log.info('Calamari test terminating')
-
-
-@contextlib.contextmanager
-def browser(start_browser, web_page):
- """
- Bring up a browser, if wanted.
- """
- if start_browser:
- webbrowser.open('http://%s' % web_page)
- try:
- yield
- finally:
- if start_browser:
- log.info('Web browser support terminating')
+++ /dev/null
-"""
-Ceph cluster task.
-
-Handle the setup, starting, and clean-up of a Ceph cluster.
-"""
-from cStringIO import StringIO
-
-import argparse
-import contextlib
-import errno
-import logging
-import os
-import json
-import time
-import gevent
-import socket
-
-from ceph_manager import CephManager, write_conf
-from tasks.cephfs.filesystem import Filesystem
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology import exceptions
-from teuthology.orchestra import run
-import ceph_client as cclient
-from teuthology.orchestra.daemon import DaemonGroup
-
-CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
-
-log = logging.getLogger(__name__)
-
-
-def generate_caps(type_):
- """
- Each call will return the next capability for each system type
- (essentially a subset of possible role values). Valid types are osd,
- mgr, mds and client.
- """
- defaults = dict(
- osd=dict(
- mon='allow *',
- osd='allow *',
- ),
- mgr=dict(
- mon='allow *',
- ),
- mds=dict(
- mon='allow *',
- osd='allow *',
- mds='allow',
- ),
- client=dict(
- mon='allow rw',
- osd='allow rwx',
- mds='allow',
- ),
- )
- for subsystem, capability in defaults[type_].items():
- yield '--cap'
- yield subsystem
- yield capability
-
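-# For example, list(generate_caps('client')) yields the argument list
-# ['--cap', 'mon', 'allow rw', '--cap', 'osd', 'allow rwx',
-#  '--cap', 'mds', 'allow'] (dict ordering aside), ready to be appended
-# to a ceph-authtool command line as done further below.
-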
-
-@contextlib.contextmanager
-def ceph_log(ctx, config):
- """
- Create /var/log/ceph log directory that is open to everyone.
- Add valgrind and profiling-logger directories.
-
- :param ctx: Context
- :param config: Configuration
- """
- log.info('Making ceph log dir writeable by non-root...')
- run.wait(
- ctx.cluster.run(
- args=[
- 'sudo',
- 'chmod',
- '777',
- '/var/log/ceph',
- ],
- wait=False,
- )
- )
- log.info('Disabling ceph logrotate...')
- run.wait(
- ctx.cluster.run(
- args=[
- 'sudo',
- 'rm', '-f', '--',
- '/etc/logrotate.d/ceph',
- ],
- wait=False,
- )
- )
- log.info('Creating extra log directories...')
- run.wait(
- ctx.cluster.run(
- args=[
- 'sudo',
- 'install', '-d', '-m0777', '--',
- '/var/log/ceph/valgrind',
- '/var/log/ceph/profiling-logger',
- ],
- wait=False,
- )
- )
-
- class Rotater(object):
- stop_event = gevent.event.Event()
-
- def invoke_logrotate(self):
- # 1) install ceph-test.conf in /etc/logrotate.d
- # 2) continuously loop over logrotate invocation with ceph-test.conf
- while not self.stop_event.is_set():
- self.stop_event.wait(timeout=30)
- try:
- run.wait(
- ctx.cluster.run(
- args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
- ],
- wait=False,
- )
- )
- except exceptions.ConnectionLostError as e:
- # Some tests may power off nodes during test, in which
- # case we will see connection errors that we should ignore.
- log.debug("Missed logrotate, node '{0}' is offline".format(
- e.node))
- except EOFError as e:
- # Paramiko sometimes raises this when it fails to
- # connect to a node during open_session. As with
- # ConnectionLostError, we ignore this because nodes
- # are allowed to get power cycled during tests.
- log.debug("Missed logrotate, EOFError")
- except socket.error as e:
- if e.errno == errno.EHOSTUNREACH:
- log.debug("Missed logrotate, host unreachable")
- else:
- raise
-
- def begin(self):
- self.thread = gevent.spawn(self.invoke_logrotate)
-
- def end(self):
- self.stop_event.set()
- self.thread.get()
-
- def write_rotate_conf(ctx, daemons):
- testdir = teuthology.get_testdir(ctx)
- rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
- with file(rotate_conf_path, 'rb') as f:
- conf = ""
- for daemon, size in daemons.iteritems():
- log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
- conf += f.read().format(daemon_type=daemon, max_size=size)
- f.seek(0, 0)
-
- for remote in ctx.cluster.remotes.iterkeys():
- teuthology.write_file(remote=remote,
- path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
- data=StringIO(conf)
- )
- remote.run(
- args=[
- 'sudo',
- 'mv',
- '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
- '/etc/logrotate.d/ceph-test.conf',
- run.Raw('&&'),
- 'sudo',
- 'chmod',
- '0644',
- '/etc/logrotate.d/ceph-test.conf',
- run.Raw('&&'),
- 'sudo',
- 'chown',
- 'root.root',
- '/etc/logrotate.d/ceph-test.conf'
- ]
- )
- remote.chcon('/etc/logrotate.d/ceph-test.conf',
- 'system_u:object_r:etc_t:s0')
-
- if ctx.config.get('log-rotate'):
- daemons = ctx.config.get('log-rotate')
- log.info('Setting up log rotation with ' + str(daemons))
- write_rotate_conf(ctx, daemons)
- logrotater = Rotater()
- logrotater.begin()
- try:
- yield
-
- finally:
- if ctx.config.get('log-rotate'):
- log.info('Shutting down logrotate')
- logrotater.end()
- ctx.cluster.run(
- args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
- ]
- )
- if ctx.archive is not None and \
- not (ctx.config.get('archive-on-error') and ctx.summary['success']):
- # and logs
- log.info('Compressing logs...')
- run.wait(
- ctx.cluster.run(
- args=[
- 'sudo',
- 'find',
- '/var/log/ceph',
- '-name',
- '*.log',
- '-print0',
- run.Raw('|'),
- 'sudo',
- 'xargs',
- '-0',
- '--no-run-if-empty',
- '--',
- 'gzip',
- '--',
- ],
- wait=False,
- ),
- )
-
- log.info('Archiving logs...')
- path = os.path.join(ctx.archive, 'remote')
- os.makedirs(path)
- for remote in ctx.cluster.remotes.iterkeys():
- sub = os.path.join(path, remote.shortname)
- os.makedirs(sub)
- teuthology.pull_directory(remote, '/var/log/ceph',
- os.path.join(sub, 'log'))
-
-
-def assign_devs(roles, devs):
- """
- Create a dictionary of devs indexed by roles
-
- :param roles: List of roles
- :param devs: Corresponding list of devices.
- :returns: Dictionary of devs indexed by roles.
- """
- return dict(zip(roles, devs))
-
-
-@contextlib.contextmanager
-def valgrind_post(ctx, config):
- """
- After the tests run, look through all the valgrind logs. Exceptions are raised
- if textual errors occurred in the logs, or if valgrind exceptions were detected in
- the logs.
-
- :param ctx: Context
- :param config: Configuration
- """
- try:
- yield
- finally:
- lookup_procs = list()
- log.info('Checking for errors in any valgrind logs...')
- for remote in ctx.cluster.remotes.iterkeys():
- # look at valgrind logs for each node
- proc = remote.run(
- args=[
- 'sudo',
- 'zgrep',
- '<kind>',
- run.Raw('/var/log/ceph/valgrind/*'),
- '/dev/null', # include a second file so that we always get a filename prefix on the output
- run.Raw('|'),
- 'sort',
- run.Raw('|'),
- 'uniq',
- ],
- wait=False,
- check_status=False,
- stdout=StringIO(),
- )
- lookup_procs.append((proc, remote))
-
- valgrind_exception = None
- for (proc, remote) in lookup_procs:
- proc.wait()
- out = proc.stdout.getvalue()
- for line in out.split('\n'):
- if line == '':
- continue
- try:
- (file, kind) = line.split(':')
- except Exception:
- log.error('failed to split line %s', line)
- raise
- log.debug('file %s kind %s', file, kind)
- if (file.find('mds') >= 0) and kind.find('Lost') > 0:
- continue
- log.error('saw valgrind issue %s in %s', kind, file)
- valgrind_exception = Exception('saw valgrind issues')
-
- if config.get('expect_valgrind_errors'):
- if not valgrind_exception:
- raise Exception('expected valgrind issues and found none')
- else:
- if valgrind_exception:
- raise valgrind_exception
-
-
-@contextlib.contextmanager
-def crush_setup(ctx, config):
- cluster_name = config['cluster']
- first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
- (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- profile = config.get('crush_tunables', 'default')
- log.info('Setting crush tunables to %s', profile)
- mon_remote.run(
- args=['sudo', 'ceph', '--cluster', cluster_name,
- 'osd', 'crush', 'tunables', profile])
- yield
-
-
-@contextlib.contextmanager
-def cephfs_setup(ctx, config):
- cluster_name = config['cluster']
- testdir = teuthology.get_testdir(ctx)
- coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
-
- first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
- (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
- # If there are any MDSs, then create a filesystem for them to use
- # Do this last because requires mon cluster to be up and running
- if mdss.remotes:
- log.info('Setting up CephFS filesystem...')
-
- Filesystem(ctx, create='cephfs') # TODO: make Filesystem cluster-aware
-
- is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
- all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
- num_active = len([r for r in all_roles if is_active_mds(r)])
- mon_remote.run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph', 'mds', 'set', 'allow_multimds', 'true',
- '--yes-i-really-mean-it'],
- check_status=False, # probably old version, upgrade test
- )
- mon_remote.run(args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph',
- '--cluster', cluster_name,
- 'mds', 'set_max_mds', str(num_active)])
- mon_remote.run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph', 'mds', 'set', 'allow_dirfrags', 'true',
- '--yes-i-really-mean-it'],
- check_status=False, # probably old version, upgrade test
- )
-
- yield
-
-
-@contextlib.contextmanager
-def cluster(ctx, config):
- """
- Handle the creation and removal of a ceph cluster.
-
- On startup:
- Create directories needed for the cluster.
- Create remote journals for all osds.
- Create and set keyring.
- Copy the monmap to the test systems.
- Setup mon nodes.
- Setup mds nodes.
- Mkfs osd nodes.
- Add keyring information to monmaps
- Mkfs mon nodes.
-
- On exit:
- If errors occurred, extract a failure message and store in ctx.summary.
- Unmount all test files and temporary journaling files.
- Save the monitor information and archive all ceph logs.
- Cleanup the keyring setup, and remove all monitor map and data files left over.
-
- :param ctx: Context
- :param config: Configuration
- """
- if ctx.config.get('use_existing_cluster', False) is True:
- log.info("'use_existing_cluster' is true; skipping cluster creation")
- yield
- return
-
- testdir = teuthology.get_testdir(ctx)
- cluster_name = config['cluster']
- data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
- log.info('Creating ceph cluster %s...', cluster_name)
- run.wait(
- ctx.cluster.run(
- args=[
- 'install', '-d', '-m0755', '--',
- data_dir,
- ],
- wait=False,
- )
- )
-
- run.wait(
- ctx.cluster.run(
- args=[
- 'sudo',
- 'install', '-d', '-m0777', '--', '/var/run/ceph',
- ],
- wait=False,
- )
- )
-
- devs_to_clean = {}
- remote_to_roles_to_devs = {}
- remote_to_roles_to_journals = {}
- osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
- for remote, roles_for_host in osds.remotes.iteritems():
- devs = teuthology.get_scratch_devices(remote)
- roles_to_devs = {}
- roles_to_journals = {}
- if config.get('fs'):
- log.info('fs option selected, checking for scratch devs')
- log.info('found devs: %s' % (str(devs),))
- devs_id_map = teuthology.get_wwn_id_map(remote, devs)
- iddevs = devs_id_map.values()
- roles_to_devs = assign_devs(
- teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
- )
- if len(roles_to_devs) < len(iddevs):
- iddevs = iddevs[len(roles_to_devs):]
- devs_to_clean[remote] = []
-
- if config.get('block_journal'):
- log.info('block journal enabled')
- roles_to_journals = assign_devs(
- teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
- )
- log.info('journal map: %s', roles_to_journals)
-
- if config.get('tmpfs_journal'):
- log.info('tmpfs journal enabled')
- roles_to_journals = {}
- remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
- for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
- tmpfs = '/mnt/' + role
- roles_to_journals[role] = tmpfs
- remote.run(args=['truncate', '-s', '1500M', tmpfs])
- log.info('journal map: %s', roles_to_journals)
-
- log.info('dev map: %s' % (str(roles_to_devs),))
- remote_to_roles_to_devs[remote] = roles_to_devs
- remote_to_roles_to_journals[remote] = roles_to_journals
-
- log.info('Generating config...')
- remotes_and_roles = ctx.cluster.remotes.items()
- roles = [role_list for (remote, role_list) in remotes_and_roles]
- ips = [host for (host, port) in
- (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
- conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
- for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
- for role, journal in roles_to_journals.iteritems():
- name = teuthology.ceph_role(role)
- if name not in conf:
- conf[name] = {}
- conf[name]['osd journal'] = journal
- for section, keys in config['conf'].iteritems():
- for key, value in keys.iteritems():
- log.info("[%s] %s = %s" % (section, key, value))
- if section not in conf:
- conf[section] = {}
- conf[section][key] = value
-
- if config.get('tmpfs_journal'):
- conf['journal dio'] = False
-
- if not hasattr(ctx, 'ceph'):
- ctx.ceph = {}
- ctx.ceph[cluster_name] = argparse.Namespace()
- ctx.ceph[cluster_name].conf = conf
-
- default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
- keyring_path = config.get('keyring_path', default_keyring)
-
- coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
-
- firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
-
- log.info('Setting up %s...' % firstmon)
- ctx.cluster.only(firstmon).run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-authtool',
- '--create-keyring',
- keyring_path,
- ],
- )
- ctx.cluster.only(firstmon).run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-authtool',
- '--gen-key',
- '--name=mon.',
- keyring_path,
- ],
- )
- ctx.cluster.only(firstmon).run(
- args=[
- 'sudo',
- 'chmod',
- '0644',
- keyring_path,
- ],
- )
- (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
- monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
- cluster=cluster_name)
- fsid = teuthology.create_simple_monmap(
- ctx,
- remote=mon0_remote,
- conf=conf,
- path=monmap_path,
- )
- if not 'global' in conf:
- conf['global'] = {}
- conf['global']['fsid'] = fsid
-
- default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
- conf_path = config.get('conf_path', default_conf_path)
- log.info('Writing %s for FSID %s...' % (conf_path, fsid))
- write_conf(ctx, conf_path, cluster_name)
-
- log.info('Creating admin key on %s...' % firstmon)
- ctx.cluster.only(firstmon).run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-authtool',
- '--gen-key',
- '--name=client.admin',
- '--set-uid=0',
- '--cap', 'mon', 'allow *',
- '--cap', 'osd', 'allow *',
- '--cap', 'mds', 'allow *',
- keyring_path,
- ],
- )
-
- log.info('Copying monmap to all nodes...')
- keyring = teuthology.get_file(
- remote=mon0_remote,
- path=keyring_path,
- )
- monmap = teuthology.get_file(
- remote=mon0_remote,
- path=monmap_path,
- )
-
- for rem in ctx.cluster.remotes.iterkeys():
- # copy mon key and initial monmap
- log.info('Sending monmap to node {remote}'.format(remote=rem))
- teuthology.sudo_write_file(
- remote=rem,
- path=keyring_path,
- data=keyring,
- perms='0644'
- )
- teuthology.write_file(
- remote=rem,
- path=monmap_path,
- data=monmap,
- )
-
- log.info('Setting up mon nodes...')
- mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
- osdmap_path = '{tdir}/{cluster}.osdmap'.format(tdir=testdir,
- cluster=cluster_name)
- run.wait(
- mons.run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'osdmaptool',
- '-c', conf_path,
- '--clobber',
- '--createsimple', '{num:d}'.format(
- num=teuthology.num_instances_of_type(ctx.cluster, 'osd',
- cluster_name),
- ),
- osdmap_path,
- '--pg_bits', '2',
- '--pgp_bits', '4',
- ],
- wait=False,
- ),
- )
-
- log.info('Setting up mgr nodes...')
- mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
- for remote, roles_for_host in mgrs.remotes.iteritems():
- for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
- cluster_name):
- _, _, id_ = teuthology.split_role(role)
- mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
- cluster=cluster_name,
- id=id_,
- )
- remote.run(
- args=[
- 'sudo',
- 'mkdir',
- '-p',
- mgr_dir,
- run.Raw('&&'),
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-authtool',
- '--create-keyring',
- '--gen-key',
- '--name=mgr.{id}'.format(id=id_),
- mgr_dir + '/keyring',
- ],
- )
-
- log.info('Setting up mds nodes...')
- mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
- for remote, roles_for_host in mdss.remotes.iteritems():
- for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
- cluster_name):
- _, _, id_ = teuthology.split_role(role)
- mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
- cluster=cluster_name,
- id=id_,
- )
- remote.run(
- args=[
- 'sudo',
- 'mkdir',
- '-p',
- mds_dir,
- run.Raw('&&'),
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-authtool',
- '--create-keyring',
- '--gen-key',
- '--name=mds.{id}'.format(id=id_),
- mds_dir + '/keyring',
- ],
- )
-
- cclient.create_keyring(ctx, cluster_name)
- log.info('Running mkfs on osd nodes...')
-
- if not hasattr(ctx, 'disk_config'):
- ctx.disk_config = argparse.Namespace()
- if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
- ctx.disk_config.remote_to_roles_to_dev = {}
- if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
- ctx.disk_config.remote_to_roles_to_journals = {}
- if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
- ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
- if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
- ctx.disk_config.remote_to_roles_to_dev_fstype = {}
-
- teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
- teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
-
- log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
- for remote, roles_for_host in osds.remotes.iteritems():
- roles_to_devs = remote_to_roles_to_devs[remote]
- roles_to_journals = remote_to_roles_to_journals[remote]
-
- for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
- _, _, id_ = teuthology.split_role(role)
- mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
- remote.run(
- args=[
- 'sudo',
- 'mkdir',
- '-p',
- mnt_point,
- ])
- log.info(str(roles_to_journals))
- log.info(role)
- if roles_to_devs.get(role):
- dev = roles_to_devs[role]
- fs = config.get('fs')
- package = None
- mkfs_options = config.get('mkfs_options')
- mount_options = config.get('mount_options')
- if fs == 'btrfs':
- # package = 'btrfs-tools'
- if mount_options is None:
- mount_options = ['noatime', 'user_subvol_rm_allowed']
- if mkfs_options is None:
- mkfs_options = ['-m', 'single',
- '-l', '32768',
- '-n', '32768']
- if fs == 'xfs':
- # package = 'xfsprogs'
- if mount_options is None:
- mount_options = ['noatime']
- if mkfs_options is None:
- mkfs_options = ['-f', '-i', 'size=2048']
- if fs == 'ext4' or fs == 'ext3':
- if mount_options is None:
- mount_options = ['noatime', 'user_xattr']
-
- if mount_options is None:
- mount_options = []
- if mkfs_options is None:
- mkfs_options = []
- mkfs = ['mkfs.%s' % fs] + mkfs_options
- log.info('%s on %s on %s' % (mkfs, dev, remote))
- if package is not None:
- remote.run(
- args=[
- 'sudo',
- 'apt-get', 'install', '-y', package
- ],
- stdout=StringIO(),
- )
-
- try:
- remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
- except run.CommandFailedError:
- # Newer btrfs-tools doesn't prompt for overwrite, use -f
- if '-f' not in mkfs_options:
- mkfs_options.append('-f')
- mkfs = ['mkfs.%s' % fs] + mkfs_options
- log.info('%s on %s on %s' % (mkfs, dev, remote))
- remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
-
- log.info('mount %s on %s -o %s' % (dev, remote,
- ','.join(mount_options)))
- remote.run(
- args=[
- 'sudo',
- 'mount',
- '-t', fs,
- '-o', ','.join(mount_options),
- dev,
- mnt_point,
- ]
- )
- remote.run(
- args=[
- 'sudo', '/sbin/restorecon', mnt_point,
- ],
- check_status=False,
- )
- if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
- ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
- ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
- if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
- ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
- ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
- devs_to_clean[remote].append(mnt_point)
-
- for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
- _, _, id_ = teuthology.split_role(role)
- remote.run(
- args=[
- 'sudo',
- 'MALLOC_CHECK_=3',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-osd',
- '--cluster',
- cluster_name,
- '--mkfs',
- '--mkkey',
- '-i', id_,
- '--monmap', monmap_path,
- ],
- )
-
- log.info('Reading keys from all nodes...')
- keys_fp = StringIO()
- keys = []
- for remote, roles_for_host in ctx.cluster.remotes.iteritems():
- for type_ in ['mgr', 'mds', 'osd']:
- for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
- _, _, id_ = teuthology.split_role(role)
- data = teuthology.get_file(
- remote=remote,
- path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
- type=type_,
- id=id_,
- cluster=cluster_name,
- ),
- sudo=True,
- )
- keys.append((type_, id_, data))
- keys_fp.write(data)
- for remote, roles_for_host in ctx.cluster.remotes.iteritems():
- for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
- _, _, id_ = teuthology.split_role(role)
- data = teuthology.get_file(
- remote=remote,
- path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
- )
- keys.append(('client', id_, data))
- keys_fp.write(data)
-
- log.info('Adding keys to all mons...')
- writes = mons.run(
- args=[
- 'sudo', 'tee', '-a',
- keyring_path,
- ],
- stdin=run.PIPE,
- wait=False,
- stdout=StringIO(),
- )
- keys_fp.seek(0)
- teuthology.feed_many_stdins_and_close(keys_fp, writes)
- run.wait(writes)
- for type_, id_, data in keys:
- run.wait(
- mons.run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-authtool',
- keyring_path,
- '--name={type}.{id}'.format(
- type=type_,
- id=id_,
- ),
- ] + list(generate_caps(type_)),
- wait=False,
- ),
- )
-
- log.info('Running mkfs on mon nodes...')
- for remote, roles_for_host in mons.remotes.iteritems():
- for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
- _, _, id_ = teuthology.split_role(role)
- remote.run(
- args=[
- 'sudo',
- 'mkdir',
- '-p',
- '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
- ],
- )
- remote.run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-mon',
- '--cluster', cluster_name,
- '--mkfs',
- '-i', id_,
- '--monmap', monmap_path,
- '--osdmap', osdmap_path,
- '--keyring', keyring_path,
- ],
- )
-
- run.wait(
- mons.run(
- args=[
- 'rm',
- '--',
- monmap_path,
- osdmap_path,
- ],
- wait=False,
- ),
- )
-
- try:
- yield
- except Exception:
- # we need to know this below
- ctx.summary['success'] = False
- raise
- finally:
- (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
-
- log.info('Checking cluster log for badness...')
-
- def first_in_ceph_log(pattern, excludes):
- """
- Find the first occurrence of the pattern specified in the Ceph log.
- Returns None if none found.
-
- :param pattern: Pattern scanned for.
- :param excludes: Patterns to ignore.
- :return: First line of text (or None if not found)
- """
- args = [
- 'sudo',
- 'egrep', pattern,
- '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
- ]
- for exclude in excludes:
- args.extend([run.Raw('|'), 'egrep', '-v', exclude])
- args.extend([
- run.Raw('|'), 'head', '-n', '1',
- ])
- r = mon0_remote.run(
- stdout=StringIO(),
- args=args,
- )
- stdout = r.stdout.getvalue()
- if stdout != '':
- return stdout
- return None
-
- if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
- config['log_whitelist']) is not None:
- log.warning('Found errors (ERR|WRN|SEC) in cluster log')
- ctx.summary['success'] = False
- # use the most severe problem as the failure reason
- if 'failure_reason' not in ctx.summary:
- for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
- match = first_in_ceph_log(pattern, config['log_whitelist'])
- if match is not None:
- ctx.summary['failure_reason'] = \
- '"{match}" in cluster log'.format(
- match=match.rstrip('\n'),
- )
- break
-
- for remote, dirs in devs_to_clean.iteritems():
- for dir_ in dirs:
- log.info('Unmounting %s on %s' % (dir_, remote))
- try:
- remote.run(
- args=[
- 'sync',
- run.Raw('&&'),
- 'sudo',
- 'umount',
- '-f',
- dir_
- ]
- )
- except Exception as e:
- remote.run(args=[
- 'sudo',
- run.Raw('PATH=/usr/sbin:$PATH'),
- 'lsof',
- run.Raw(';'),
- 'ps', 'auxf',
- ])
- raise e
-
- if config.get('tmpfs_journal'):
- log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
- for remote, roles_for_host in osds.remotes.iteritems():
- remote.run(
- args=['sudo', 'umount', '-f', '/mnt'],
- check_status=False,
- )
-
- if ctx.archive is not None and \
- not (ctx.config.get('archive-on-error') and ctx.summary['success']):
-
- # archive mon data, too
- log.info('Archiving mon data...')
- path = os.path.join(ctx.archive, 'data')
- try:
- os.makedirs(path)
- except OSError as e:
- if e.errno == errno.EEXIST:
- pass
- else:
- raise
- for remote, roles in mons.remotes.iteritems():
- for role in roles:
- is_mon = teuthology.is_type('mon', cluster_name)
- if is_mon(role):
- _, _, id_ = teuthology.split_role(role)
- mon_dir = '/var/lib/ceph/mon/' + \
- '{0}-{1}'.format(cluster_name, id_)
- teuthology.pull_directory_tarball(
- remote,
- mon_dir,
- path + '/' + role + '.tgz')
-
- log.info('Cleaning ceph cluster...')
- run.wait(
- ctx.cluster.run(
- args=[
- 'sudo',
- 'rm',
- '-rf',
- '--',
- conf_path,
- keyring_path,
- data_dir,
- monmap_path,
- osdmap_path,
- run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
- ],
- wait=False,
- ),
- )
-
-
-def osd_scrub_pgs(ctx, config):
- """
- Scrub pgs when we exit.
-
- First make sure all pgs are active and clean.
- Next scrub all osds.
- Then periodically check until all pgs have scrub time stamps that
- indicate the last scrub completed. Time out if no progress is made
- here after two minutes.
- """
- retries = 12
- delays = 10
- cluster_name = config['cluster']
- manager = ctx.managers[cluster_name]
- all_clean = False
- for _ in range(0, retries):
- stats = manager.get_pg_stats()
- states = [stat['state'] for stat in stats]
- if len(set(states)) == 1 and states[0] == 'active+clean':
- all_clean = True
- break
- log.info("Waiting for all osds to be active and clean.")
- time.sleep(delays)
- if not all_clean:
- log.info("Scrubbing terminated -- not all pgs were active and clean.")
- return
- check_time_now = time.localtime()
- time.sleep(1)
- all_roles = teuthology.all_roles(ctx.cluster)
- for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
- log.info("Scrubbing {osd}".format(osd=role))
- _, _, id_ = teuthology.split_role(role)
- manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
- prev_good = 0
- gap_cnt = 0
- loop = True
- while loop:
- stats = manager.get_pg_stats()
- timez = [stat['last_scrub_stamp'] for stat in stats]
- loop = False
- thiscnt = 0
- for tmval in timez:
- pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
- if pgtm > check_time_now:
- thiscnt += 1
- else:
- loop = True
- if thiscnt > prev_good:
- prev_good = thiscnt
- gap_cnt = 0
- else:
- gap_cnt += 1
- if gap_cnt > retries:
- log.info('Exiting scrub checking -- not all pgs scrubbed.')
- return
- if loop:
- log.info('Still waiting for all pgs to be scrubbed.')
- time.sleep(delays)
-
-
-@contextlib.contextmanager
-def run_daemon(ctx, config, type_):
- """
- Run daemons for a role type. Handle the startup and termination of a daemon.
- On startup -- set coverages, cpu_profile, valgrind values for all remotes,
- and a max_mds value for one mds.
- On cleanup -- Stop all existing daemons of this type.
-
- :param ctx: Context
- :param config: Configuration
- :param type_: Role type
- """
- cluster_name = config['cluster']
- log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
- testdir = teuthology.get_testdir(ctx)
- daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
-
- # check whether any daemons of this type are configured
- if daemons is None:
- return
- coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
-
- daemon_signal = 'kill'
- if config.get('coverage') or config.get('valgrind') is not None:
- daemon_signal = 'term'
-
- for remote, roles_for_host in daemons.remotes.iteritems():
- is_type_ = teuthology.is_type(type_, cluster_name)
- for role in roles_for_host:
- if not is_type_(role):
- continue
- _, _, id_ = teuthology.split_role(role)
-
- run_cmd = [
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'daemon-helper',
- daemon_signal,
- ]
- run_cmd_tail = [
- 'ceph-%s' % (type_),
- '-f',
- '--cluster', cluster_name,
- '-i', id_]
-
- if type_ in config.get('cpu_profile', []):
- profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
- run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
-
- if config.get('valgrind') is not None:
- valgrind_args = None
- if type_ in config['valgrind']:
- valgrind_args = config['valgrind'][type_]
- if role in config['valgrind']:
- valgrind_args = config['valgrind'][role]
- run_cmd = teuthology.get_valgrind_args(testdir, role,
- run_cmd,
- valgrind_args)
-
- run_cmd.extend(run_cmd_tail)
-
- ctx.daemons.add_daemon(remote, type_, id_,
- cluster=cluster_name,
- args=run_cmd,
- logger=log.getChild(role),
- stdin=run.PIPE,
- wait=False,
- )
-
- try:
- yield
- finally:
- teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
-
-
-def healthy(ctx, config):
- """
- Wait for all osds to be up, and for the ceph health check to return HEALTH_OK.
-
- :param ctx: Context
- :param config: Configuration
- """
- config = config if isinstance(config, dict) else dict()
- cluster_name = config.get('cluster', 'ceph')
- log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
- firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
- (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
- teuthology.wait_until_osds_up(
- ctx,
- cluster=ctx.cluster,
- remote=mon0_remote,
- ceph_cluster=cluster_name,
- )
- teuthology.wait_until_healthy(
- ctx,
- remote=mon0_remote,
- ceph_cluster=cluster_name,
- )
-
- if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
- # Some MDSs exist, wait for them to be healthy
- ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
- ceph_fs.wait_for_daemons(timeout=300)
-
-
-def wait_for_osds_up(ctx, config):
- """
- Wait for all osds to come up.
-
- :param ctx: Context
- :param config: Configuration
- """
- log.info('Waiting until ceph osds are all up...')
- cluster_name = config.get('cluster', 'ceph')
- firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
- (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
- teuthology.wait_until_osds_up(
- ctx,
- cluster=ctx.cluster,
- remote=mon0_remote
- )
-
-
-def wait_for_mon_quorum(ctx, config):
- """
- Check remote ceph status until all of the specified monitors are in quorum.
-
- :param ctx: Context
- :param config: Configuration
- """
- if isinstance(config, dict):
- mons = config['daemons']
- cluster_name = config.get('cluster', 'ceph')
- else:
- assert isinstance(config, list)
- mons = config
- cluster_name = 'ceph'
- firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
- (remote,) = ctx.cluster.only(firstmon).remotes.keys()
- with contextutil.safe_while(sleep=10, tries=60,
- action='wait for monitor quorum') as proceed:
- while proceed():
- r = remote.run(
- args=[
- 'sudo',
- 'ceph',
- 'quorum_status',
- ],
- stdout=StringIO(),
- logger=log.getChild('quorum_status'),
- )
- j = json.loads(r.stdout.getvalue())
- q = j.get('quorum_names', [])
- log.debug('Quorum: %s', q)
- if sorted(q) == sorted(mons):
- break
-
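-# A minimal sketch of the two config forms this function accepts, assuming the
-# usual 'ceph.<subtask>' naming used by the other subtasks in this file
-# (the exact registered name is an assumption here):
-#
-#   tasks:
-#   - ceph.wait_for_mon_quorum: [a, b, c]
-#
-#   tasks:
-#   - ceph.wait_for_mon_quorum:
-#       daemons: [a, b, c]
-#       cluster: backup
-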
-
-def created_pool(ctx, config):
- """
- Add new pools to the dictionary of pools that the ceph-manager
- knows about.
- """
- for new_pool in config:
- if new_pool not in ctx.managers['ceph'].pools:
- ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
- new_pool, 'pg_num')
-
-
-@contextlib.contextmanager
-def restart(ctx, config):
- """
- restart ceph daemons
-
- For example::
- tasks:
- - ceph.restart: [all]
-
- For example::
- tasks:
- - ceph.restart: [osd.0, mon.1, mds.*]
-
- or::
-
- tasks:
- - ceph.restart:
- daemons: [osd.0, mon.1]
- wait-for-healthy: false
- wait-for-osds-up: true
-
- :param ctx: Context
- :param config: Configuration
- """
- if config is None:
- config = {}
- elif isinstance(config, list):
- config = {'daemons': config}
-
- daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
- clusters = set()
- for role in daemons:
- cluster, type_, id_ = teuthology.split_role(role)
- ctx.daemons.get_daemon(type_, id_, cluster).restart()
- clusters.add(cluster)
-
- if config.get('wait-for-healthy', True):
- for cluster in clusters:
- healthy(ctx=ctx, config=dict(cluster=cluster))
- if config.get('wait-for-osds-up', False):
- for cluster in clusters:
- wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
- manager = ctx.managers['ceph']
- for dmon in daemons:
- if '.' in dmon:
- dm_parts = dmon.split('.')
- if dm_parts[1].isdigit():
- if dm_parts[0] == 'osd':
- manager.mark_down_osd(int(dm_parts[1]))
- yield
-
-
-@contextlib.contextmanager
-def stop(ctx, config):
- """
- Stop ceph daemons
-
- For example::
- tasks:
- - ceph.stop: [mds.*]
-
- tasks:
- - ceph.stop: [osd.0, osd.2]
-
- tasks:
- - ceph.stop:
- daemons: [osd.0, osd.2]
-
- """
- if config is None:
- config = {}
- elif isinstance(config, list):
- config = {'daemons': config}
-
- daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
- for role in daemons:
- cluster, type_, id_ = teuthology.split_role(role)
- ctx.daemons.get_daemon(type_, id_, cluster).stop()
-
- yield
-
-
-@contextlib.contextmanager
-def wait_for_failure(ctx, config):
- """
- Wait for a failure of a ceph daemon
-
- For example::
- tasks:
- - ceph.wait_for_failure: [mds.*]
-
- tasks:
- - ceph.wait_for_failure: [osd.0, osd.2]
-
- tasks:
- - ceph.wait_for_failure:
- daemons: [osd.0, osd.2]
-
- """
- if config is None:
- config = {}
- elif isinstance(config, list):
- config = {'daemons': config}
-
- daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
- for role in daemons:
- cluster, type_, id_ = teuthology.split_role(role)
- try:
- ctx.daemons.get_daemon(type_, id_, cluster).wait()
- except:
- log.info('Saw expected daemon failure. Continuing.')
- pass
- else:
- raise RuntimeError('daemon %s did not fail' % role)
-
- yield
-
-
-def validate_config(ctx, config):
- """
- Perform some simple validation on task configuration.
- Raises exceptions.ConfigError if an error is found.
- """
- # check for osds from multiple clusters on the same host
- for remote, roles_for_host in ctx.cluster.remotes.items():
- last_cluster = None
- last_role = None
- for role in roles_for_host:
- role_cluster, role_type, _ = teuthology.split_role(role)
- if role_type != 'osd':
- continue
- if last_cluster and last_cluster != role_cluster:
- msg = "Host should not have osds (%s and %s) from multiple clusters" % (
- last_role, role)
- raise exceptions.ConfigError(msg)
- last_cluster = role_cluster
- last_role = role
-
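-# For example, a host carrying roles [cluster1.osd.0, cluster2.osd.1] would
-# raise exceptions.ConfigError here, while colocating cluster1.osd.0 with
-# cluster2.client.0 is accepted, since only osd roles are checked.
-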
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Set up and tear down a Ceph cluster.
-
- For example::
-
- tasks:
- - ceph:
- - interactive:
-
- You can also specify what branch to run::
-
- tasks:
- - ceph:
- branch: foo
-
- Or a tag::
-
- tasks:
- - ceph:
- tag: v0.42.13
-
- Or a sha1::
-
- tasks:
- - ceph:
- sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
-
- Or a local source dir::
-
- tasks:
- - ceph:
- path: /home/sage/ceph
-
- To capture code coverage data, use::
-
- tasks:
- - ceph:
- coverage: true
-
- To use btrfs, ext4, or xfs on the target's scratch disks, use::
-
- tasks:
- - ceph:
- fs: xfs
- mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
- mount_options: [nobarrier, inode64]
-
- Note, this will cause the task to check the /scratch_devs file on each node
- for available devices. If no such file is found, /dev/sdb will be used.
-
- To run some daemons under valgrind, include their names
- and the tool/args to use in a valgrind section::
-
- tasks:
- - ceph:
- valgrind:
- mds.1: --tool=memcheck
- osd.1: [--tool=memcheck, --leak-check=no]
-
- Those nodes which are using memcheck or valgrind will get
- checked for bad results.
-
- To adjust or modify config options, use::
-
- tasks:
- - ceph:
- conf:
- section:
- key: value
-
- For example::
-
- tasks:
- - ceph:
- conf:
- mds.0:
- some option: value
- other key: other value
- client.0:
- debug client: 10
- debug ms: 1
-
- By default, the cluster log is checked for errors and warnings,
- and the run marked failed if any appear. You can ignore log
- entries by giving a list of egrep-compatible regexes, e.g.:
-
- tasks:
- - ceph:
- log-whitelist: ['foo.*bar', 'bad message']
-
- To run multiple ceph clusters, use multiple ceph tasks, and roles
- with a cluster name prefix, e.g. cluster1.client.0. Roles with no
- cluster use the default cluster name, 'ceph'. OSDs from separate
- clusters must be on separate hosts. Clients and non-osd daemons
- from multiple clusters may be colocated. For each cluster, add an
- instance of the ceph task with the cluster name specified, e.g.::
-
- roles:
- - [mon.a, osd.0, osd.1]
- - [backup.mon.a, backup.osd.0, backup.osd.1]
- - [client.0, backup.client.0]
- tasks:
- - ceph:
- cluster: ceph
- - ceph:
- cluster: backup
-
- :param ctx: Context
- :param config: Configuration
-
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- "task ceph only supports a dictionary for configuration"
-
- overrides = ctx.config.get('overrides', {})
- teuthology.deep_merge(config, overrides.get('ceph', {}))
-
- first_ceph_cluster = False
- if not hasattr(ctx, 'daemons'):
- first_ceph_cluster = True
- ctx.daemons = DaemonGroup()
-
- testdir = teuthology.get_testdir(ctx)
- if config.get('coverage'):
- coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
- log.info('Creating coverage directory...')
- run.wait(
- ctx.cluster.run(
- args=[
- 'install', '-d', '-m0755', '--',
- coverage_dir,
- ],
- wait=False,
- )
- )
-
- if 'cluster' not in config:
- config['cluster'] = 'ceph'
-
- validate_config(ctx, config)
-
- subtasks = []
- if first_ceph_cluster:
- # these tasks handle general log setup and parsing on all hosts,
- # so they should only be run once
- subtasks = [
- lambda: ceph_log(ctx=ctx, config=None),
- lambda: valgrind_post(ctx=ctx, config=config),
- ]
-
- subtasks += [
- lambda: cluster(ctx=ctx, config=dict(
- conf=config.get('conf', {}),
- fs=config.get('fs', 'xfs'),
- mkfs_options=config.get('mkfs_options', None),
- mount_options=config.get('mount_options', None),
- block_journal=config.get('block_journal', None),
- tmpfs_journal=config.get('tmpfs_journal', None),
- log_whitelist=config.get('log-whitelist', []),
- cpu_profile=set(config.get('cpu_profile', []),),
- cluster=config['cluster'],
- )),
- lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
- lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
- lambda: crush_setup(ctx=ctx, config=config),
- lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
- lambda: cephfs_setup(ctx=ctx, config=config),
- lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
- ]
-
- with contextutil.nested(*subtasks):
- first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- if not hasattr(ctx, 'managers'):
- ctx.managers = {}
- ctx.managers[config['cluster']] = CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager.' + config['cluster']),
- cluster=config['cluster'],
- )
-
- try:
- if config.get('wait-for-healthy', True):
- healthy(ctx=ctx, config=dict(cluster=config['cluster']))
-
- yield
- finally:
- if config.get('wait-for-scrub', True):
- osd_scrub_pgs(ctx, config)
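
The subtask list above is handed to contextutil.nested(), which enters each
context manager in order and unwinds them in reverse when the with-block
exits, even on error. A minimal standalone sketch of the same pattern using
only the standard library (contextlib.ExitStack, Python 3); this illustrates
the idea and is not the teuthology helper::

    import contextlib

    @contextlib.contextmanager
    def subtask(name):
        # Stand-in for one of the lambdas above (ceph_log, cluster, run_daemon, ...).
        print('setting up %s' % name)
        try:
            yield
        finally:
            print('tearing down %s' % name)

    subtasks = [lambda: subtask('log'), lambda: subtask('cluster'),
                lambda: subtask('mon')]

    with contextlib.ExitStack() as stack:
        for factory in subtasks:
            stack.enter_context(factory())
        # Everything is set up here; teardown runs in reverse order on exit,
        # whether the body succeeds or raises.
        print('cluster usable here')
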
+++ /dev/null
-"""
-Set up client keyring
-"""
-import logging
-
-from teuthology import misc as teuthology
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-def create_keyring(ctx, cluster_name):
- """
-    Set up client keyrings on the remote client nodes.
- """
- log.info('Setting up client nodes...')
- clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
- testdir = teuthology.get_testdir(ctx)
- coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
- for remote, roles_for_host in clients.remotes.iteritems():
- for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
- cluster_name):
- name = teuthology.ceph_role(role)
- client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name, name)
- remote.run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-authtool',
- '--create-keyring',
- '--gen-key',
- # TODO this --name= is not really obeyed, all unknown "types" are munged to "client"
- '--name={name}'.format(name=name),
- client_keyring,
- run.Raw('&&'),
- 'sudo',
- 'chmod',
- '0644',
- client_keyring,
- ],
- )
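
The keyring path built above follows a '<cluster>.<name>.keyring' layout
under /etc/ceph; a tiny illustration with hypothetical cluster and role
names::

    cluster_name = 'backup'
    name = 'client.0'
    client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name, name)
    assert client_keyring == '/etc/ceph/backup.client.0.keyring'
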
+++ /dev/null
-"""
-Execute ceph-deploy as a task
-"""
-from cStringIO import StringIO
-
-import contextlib
-import os
-import time
-import logging
-import traceback
-
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology.config import config as teuth_config
-from teuthology.task import install as install_fn
-from teuthology.orchestra import run
-from tasks.cephfs.filesystem import Filesystem
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def download_ceph_deploy(ctx, config):
- """
- Downloads ceph-deploy from the ceph.com git mirror and (by default)
- switches to the master branch. If the `ceph-deploy-branch` is specified, it
-    will use that instead. The `bootstrap` script is run with the argument
- obtained from `python_version`, if specified.
- """
- ceph_admin = ctx.cluster.only(teuthology.get_first_mon(ctx, config))
-
- try:
- py_ver = str(config['python_version'])
- except KeyError:
- pass
- else:
- supported_versions = ['2', '3']
- if py_ver not in supported_versions:
- raise ValueError("python_version must be: {}, not {}".format(
- ' or '.join(supported_versions), py_ver
- ))
-
- log.info("Installing Python")
- for admin in ceph_admin.remotes:
- system_type = teuthology.get_system_type(admin)
-
- if system_type == 'rpm':
- package = 'python34' if py_ver == '3' else 'python'
- ctx.cluster.run(args=[
- 'sudo', 'yum', '-y', 'install',
- package, 'python-virtualenv'
- ])
- else:
- package = 'python3' if py_ver == '3' else 'python'
- ctx.cluster.run(args=[
- 'sudo', 'apt-get', '-y', '--force-yes', 'install',
- package, 'python-virtualenv'
- ])
-
- log.info('Downloading ceph-deploy...')
- testdir = teuthology.get_testdir(ctx)
- ceph_deploy_branch = config.get('ceph-deploy-branch', 'master')
-
- ceph_admin.run(
- args=[
- 'git', 'clone', '-b', ceph_deploy_branch,
- teuth_config.ceph_git_base_url + 'ceph-deploy.git',
- '{tdir}/ceph-deploy'.format(tdir=testdir),
- ],
- )
- args = [
- 'cd',
- '{tdir}/ceph-deploy'.format(tdir=testdir),
- run.Raw('&&'),
- './bootstrap',
- ]
- try:
- args.append(str(config['python_version']))
- except KeyError:
- pass
- ceph_admin.run(args=args)
-
- try:
- yield
- finally:
- log.info('Removing ceph-deploy ...')
- ceph_admin.run(
- args=[
- 'rm',
- '-rf',
- '{tdir}/ceph-deploy'.format(tdir=testdir),
- ],
- )
-
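The two knobs mentioned in the docstring of download_ceph_deploy() above are
plain keys in the task config. As the Python dict the function receives, a
hypothetical configuration looks like::

    config = {
        'ceph-deploy-branch': 'master',  # branch of ceph-deploy.git to clone
        'python_version': '3',           # forwarded to ./bootstrap; '2' or '3'
    }
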
-
-def is_healthy(ctx, config):
- """Wait until a Ceph cluster is healthy."""
- testdir = teuthology.get_testdir(ctx)
- ceph_admin = teuthology.get_first_mon(ctx, config)
- (remote,) = ctx.cluster.only(ceph_admin).remotes.keys()
- max_tries = 90 # 90 tries * 10 secs --> 15 minutes
- tries = 0
- while True:
- tries += 1
- if tries >= max_tries:
- msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes"
- remote.run(
- args=[
- 'cd',
- '{tdir}'.format(tdir=testdir),
- run.Raw('&&'),
- 'sudo', 'ceph',
- 'report',
- ],
- )
- raise RuntimeError(msg)
-
- r = remote.run(
- args=[
- 'cd',
- '{tdir}'.format(tdir=testdir),
- run.Raw('&&'),
- 'sudo', 'ceph',
- 'health',
- ],
- stdout=StringIO(),
- logger=log.getChild('health'),
- )
- out = r.stdout.getvalue()
- log.info('Ceph health: %s', out.rstrip('\n'))
- if out.split(None, 1)[0] == 'HEALTH_OK':
- break
- time.sleep(10)
-
-
-def get_nodes_using_role(ctx, target_role):
- """
- Extract the names of nodes that match a given role from a cluster, and modify the
- cluster's service IDs to match the resulting node-based naming scheme that ceph-deploy
- uses, such that if "mon.a" is on host "foo23", it'll be renamed to "mon.foo23".
- """
-
- # Nodes containing a service of the specified role
- nodes_of_interest = []
-
- # Prepare a modified version of cluster.remotes with ceph-deploy-ized names
- modified_remotes = {}
-
- for _remote, roles_for_host in ctx.cluster.remotes.iteritems():
- modified_remotes[_remote] = []
- for svc_id in roles_for_host:
- if svc_id.startswith("{0}.".format(target_role)):
- fqdn = str(_remote).split('@')[-1]
- nodename = str(str(_remote).split('.')[0]).split('@')[1]
- if target_role == 'mon':
- nodes_of_interest.append(fqdn)
- else:
- nodes_of_interest.append(nodename)
-
- modified_remotes[_remote].append(
- "{0}.{1}".format(target_role, nodename))
- else:
- modified_remotes[_remote].append(svc_id)
-
- ctx.cluster.remotes = modified_remotes
-
- return nodes_of_interest
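
A toy illustration of the renaming described in the docstring above, with
hypothetical role and host names::

    target_role = 'mon'
    nodename = 'foo23'
    roles_for_host = ['mon.a', 'osd.0', 'client.0']
    # Entries matching the target role are rewritten to the node-based
    # names that ceph-deploy expects; everything else is left alone.
    renamed = ['{0}.{1}'.format(target_role, nodename)
               if svc_id.startswith(target_role + '.') else svc_id
               for svc_id in roles_for_host]
    assert renamed == ['mon.foo23', 'osd.0', 'client.0']
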
-
-
-def get_dev_for_osd(ctx, config):
- """Get a list of all osd device names."""
- osd_devs = []
- for remote, roles_for_host in ctx.cluster.remotes.iteritems():
- host = remote.name.split('@')[-1]
- shortname = host.split('.')[0]
- devs = teuthology.get_scratch_devices(remote)
- num_osd_per_host = list(
- teuthology.roles_of_type(
- roles_for_host, 'osd'))
- num_osds = len(num_osd_per_host)
- if config.get('separate_journal_disk') is not None:
- num_devs_reqd = 2 * num_osds
- assert num_devs_reqd <= len(
- devs), 'fewer data and journal disks than required ' + shortname
- for dindex in range(0, num_devs_reqd, 2):
- jd_index = dindex + 1
- dev_short = devs[dindex].split('/')[-1]
- jdev_short = devs[jd_index].split('/')[-1]
- osd_devs.append((shortname, dev_short, jdev_short))
- else:
- assert num_osds <= len(devs), 'fewer disks than osds ' + shortname
- for dev in devs[:num_osds]:
- dev_short = dev.split('/')[-1]
- osd_devs.append((shortname, dev_short))
- return osd_devs
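
When separate_journal_disk is set, each OSD consumes one data device and the
next scratch device as its journal; a worked example with hypothetical device
names::

    # Two OSDs on a host with four scratch devices: even indices become the
    # data devices, the following odd indices their journal devices.
    devs = ['/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde']
    num_osds = 2
    pairs = [(devs[i].split('/')[-1], devs[i + 1].split('/')[-1])
             for i in range(0, 2 * num_osds, 2)]
    assert pairs == [('sdb', 'sdc'), ('sdd', 'sde')]
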
-
-
-def get_all_nodes(ctx, config):
- """Return a string of node names separated by blanks"""
- nodelist = []
- for t, k in ctx.config['targets'].iteritems():
- host = t.split('@')[-1]
- simple_host = host.split('.')[0]
- nodelist.append(simple_host)
- nodelist = " ".join(nodelist)
- return nodelist
-
-
-@contextlib.contextmanager
-def build_ceph_cluster(ctx, config):
- """Build a ceph cluster"""
-
- # Expect to find ceph_admin on the first mon by ID, same place that the download task
- # puts it. Remember this here, because subsequently IDs will change from those in
- # the test config to those that ceph-deploy invents.
- (ceph_admin,) = ctx.cluster.only(
- teuthology.get_first_mon(ctx, config)).remotes.iterkeys()
-
- def execute_ceph_deploy(cmd):
- """Remotely execute a ceph_deploy command"""
- return ceph_admin.run(
- args=[
- 'cd',
- '{tdir}/ceph-deploy'.format(tdir=testdir),
- run.Raw('&&'),
- run.Raw(cmd),
- ],
- check_status=False,
- ).exitstatus
-
- try:
- log.info('Building ceph cluster using ceph-deploy...')
- testdir = teuthology.get_testdir(ctx)
- ceph_branch = None
- if config.get('branch') is not None:
- cbranch = config.get('branch')
- for var, val in cbranch.iteritems():
- ceph_branch = '--{var}={val}'.format(var=var, val=val)
- all_nodes = get_all_nodes(ctx, config)
- mds_nodes = get_nodes_using_role(ctx, 'mds')
- mds_nodes = " ".join(mds_nodes)
- mon_node = get_nodes_using_role(ctx, 'mon')
- mon_nodes = " ".join(mon_node)
- new_mon = './ceph-deploy new' + " " + mon_nodes
- mon_hostname = mon_nodes.split(' ')[0]
- mon_hostname = str(mon_hostname)
- gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
- deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
- no_of_osds = 0
-
- if mon_nodes is None:
- raise RuntimeError("no monitor nodes in the config file")
-
- estatus_new = execute_ceph_deploy(new_mon)
- if estatus_new != 0:
- raise RuntimeError("ceph-deploy: new command failed")
-
- log.info('adding config inputs...')
- testdir = teuthology.get_testdir(ctx)
- conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)
-
- if config.get('conf') is not None:
- confp = config.get('conf')
- for section, keys in confp.iteritems():
- lines = '[{section}]\n'.format(section=section)
- teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
- sudo=True)
- for key, value in keys.iteritems():
- log.info("[%s] %s = %s" % (section, key, value))
- lines = '{key} = {value}\n'.format(key=key, value=value)
- teuthology.append_lines_to_file(
- ceph_admin, conf_path, lines, sudo=True)
-
- # install ceph
- ceph_sha = ctx.config['sha1']
- devcommit = '--dev-commit={sha}'.format(sha=ceph_sha)
- if ceph_branch:
- option = ceph_branch
- else:
- option = devcommit
- install_nodes = './ceph-deploy install ' + option + " " + all_nodes
- estatus_install = execute_ceph_deploy(install_nodes)
- if estatus_install != 0:
- raise RuntimeError("ceph-deploy: Failed to install ceph")
- # install ceph-test package too
- install_nodes2 = './ceph-deploy install --tests ' + option + \
- " " + all_nodes
- estatus_install = execute_ceph_deploy(install_nodes2)
- if estatus_install != 0:
- raise RuntimeError("ceph-deploy: Failed to install ceph-test")
-
- mon_create_nodes = './ceph-deploy mon create-initial'
- # If the following fails, it is OK, it might just be that the monitors
-        # are taking way more than a minute/monitor to form quorum, so let's
- # try the next block which will wait up to 15 minutes to gatherkeys.
- execute_ceph_deploy(mon_create_nodes)
-
- estatus_gather = execute_ceph_deploy(gather_keys)
- max_gather_tries = 90
- gather_tries = 0
- while (estatus_gather != 0):
- gather_tries += 1
- if gather_tries >= max_gather_tries:
- msg = 'ceph-deploy was not able to gatherkeys after 15 minutes'
- raise RuntimeError(msg)
- estatus_gather = execute_ceph_deploy(gather_keys)
- time.sleep(10)
-
- if mds_nodes:
- estatus_mds = execute_ceph_deploy(deploy_mds)
- if estatus_mds != 0:
- raise RuntimeError("ceph-deploy: Failed to deploy mds")
-
- if config.get('test_mon_destroy') is not None:
- for d in range(1, len(mon_node)):
- mon_destroy_nodes = './ceph-deploy mon destroy' + \
- " " + mon_node[d]
- estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
- if estatus_mon_d != 0:
- raise RuntimeError("ceph-deploy: Failed to delete monitor")
-
- node_dev_list = get_dev_for_osd(ctx, config)
- for d in node_dev_list:
- node = d[0]
- for disk in d[1:]:
- zap = './ceph-deploy disk zap ' + node + ':' + disk
- estatus = execute_ceph_deploy(zap)
- if estatus != 0:
- raise RuntimeError("ceph-deploy: Failed to zap osds")
- osd_create_cmd = './ceph-deploy osd create '
- if config.get('dmcrypt') is not None:
- osd_create_cmd += '--dmcrypt '
- osd_create_cmd += ":".join(d)
- estatus_osd = execute_ceph_deploy(osd_create_cmd)
- if estatus_osd == 0:
- log.info('successfully created osd')
- no_of_osds += 1
- else:
- raise RuntimeError("ceph-deploy: Failed to create osds")
-
- if config.get('wait-for-healthy', True) and no_of_osds >= 2:
- is_healthy(ctx=ctx, config=None)
-
- log.info('Setting up client nodes...')
- conf_path = '/etc/ceph/ceph.conf'
- admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
- conf_data = teuthology.get_file(
- remote=mon0_remote,
- path=conf_path,
- sudo=True,
- )
- admin_keyring = teuthology.get_file(
- remote=mon0_remote,
- path=admin_keyring_path,
- sudo=True,
- )
-
- clients = ctx.cluster.only(teuthology.is_type('client'))
- for remot, roles_for_host in clients.remotes.iteritems():
- for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
- client_keyring = \
- '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
- mon0_remote.run(
- args=[
- 'cd',
- '{tdir}'.format(tdir=testdir),
- run.Raw('&&'),
- 'sudo', 'bash', '-c',
- run.Raw('"'), 'ceph',
- 'auth',
- 'get-or-create',
- 'client.{id}'.format(id=id_),
- 'mds', 'allow',
- 'mon', 'allow *',
- 'osd', 'allow *',
- run.Raw('>'),
- client_keyring,
- run.Raw('"'),
- ],
- )
- key_data = teuthology.get_file(
- remote=mon0_remote,
- path=client_keyring,
- sudo=True,
- )
- teuthology.sudo_write_file(
- remote=remot,
- path=client_keyring,
- data=key_data,
- perms='0644'
- )
- teuthology.sudo_write_file(
- remote=remot,
- path=admin_keyring_path,
- data=admin_keyring,
- perms='0644'
- )
- teuthology.sudo_write_file(
- remote=remot,
- path=conf_path,
- data=conf_data,
- perms='0644'
- )
-
- if mds_nodes:
- log.info('Configuring CephFS...')
- ceph_fs = Filesystem(ctx)
- if not ceph_fs.legacy_configured():
- ceph_fs.create()
- elif not config.get('only_mon'):
- raise RuntimeError(
- "The cluster is NOT operational due to insufficient OSDs")
- yield
-
- except Exception:
- log.info(
- "Error encountered, logging exception before tearing down ceph-deploy")
- log.info(traceback.format_exc())
- raise
- finally:
- if config.get('keep_running'):
- return
- log.info('Stopping ceph...')
- ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
- 'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
- 'sudo', 'systemctl', 'stop', 'ceph.target'])
-
- # Are you really not running anymore?
- # try first with the init tooling
- # ignoring the status so this becomes informational only
- ctx.cluster.run(
- args=[
- 'sudo', 'status', 'ceph-all', run.Raw('||'),
- 'sudo', 'service', 'ceph', 'status', run.Raw('||'),
- 'sudo', 'systemctl', 'status', 'ceph.target'],
- check_status=False)
-
- # and now just check for the processes themselves, as if upstart/sysvinit
- # is lying to us. Ignore errors if the grep fails
- ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
- 'grep', '-v', 'grep', run.Raw('|'),
- 'grep', 'ceph'], check_status=False)
-
- if ctx.archive is not None:
- # archive mon data, too
- log.info('Archiving mon data...')
- path = os.path.join(ctx.archive, 'data')
- os.makedirs(path)
- mons = ctx.cluster.only(teuthology.is_type('mon'))
- for remote, roles in mons.remotes.iteritems():
- for role in roles:
- if role.startswith('mon.'):
- teuthology.pull_directory_tarball(
- remote,
- '/var/lib/ceph/mon',
- path + '/' + role + '.tgz')
-
- log.info('Compressing logs...')
- run.wait(
- ctx.cluster.run(
- args=[
- 'sudo',
- 'find',
- '/var/log/ceph',
- '-name',
- '*.log',
- '-print0',
- run.Raw('|'),
- 'sudo',
- 'xargs',
- '-0',
- '--no-run-if-empty',
- '--',
- 'gzip',
- '--',
- ],
- wait=False,
- ),
- )
-
- log.info('Archiving logs...')
- path = os.path.join(ctx.archive, 'remote')
- os.makedirs(path)
- for remote in ctx.cluster.remotes.iterkeys():
- sub = os.path.join(path, remote.shortname)
- os.makedirs(sub)
- teuthology.pull_directory(remote, '/var/log/ceph',
- os.path.join(sub, 'log'))
-
- # Prevent these from being undefined if the try block fails
- all_nodes = get_all_nodes(ctx, config)
- purge_nodes = './ceph-deploy purge' + " " + all_nodes
- purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes
-
- log.info('Purging package...')
- execute_ceph_deploy(purge_nodes)
- log.info('Purging data...')
- execute_ceph_deploy(purgedata_nodes)
-
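The gatherkeys loop above and is_healthy() both implement the same bounded
retry idea (cli_test below uses contextutil.safe_while for it). A
self-contained sketch of the pattern; the commented usage refers to names
from build_ceph_cluster() and is only illustrative::

    import time

    def retry(attempt, tries, sleep, what):
        # Call attempt() until it returns 0 (success), at most `tries` times,
        # sleeping `sleep` seconds between attempts; fail loudly otherwise.
        for _ in range(tries):
            if attempt() == 0:
                return
            time.sleep(sleep)
        raise RuntimeError('%s did not succeed after %d tries' % (what, tries))

    # retry(lambda: execute_ceph_deploy(gather_keys),
    #       tries=90, sleep=10, what='ceph-deploy gatherkeys')
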
-
-@contextlib.contextmanager
-def cli_test(ctx, config):
- """
-    Exercise the most commonly used ceph-deploy CLI commands, ensure
-    they all work, and start up the init system.
-
- """
- log.info('Ceph-deploy Test')
- if config is None:
- config = {}
- test_branch = ''
- conf_dir = teuthology.get_testdir(ctx) + "/cdtest"
-
- def execute_cdeploy(admin, cmd, path):
-        """Execute ceph-deploy commands, using either the git path or the repo path."""
- args = ['cd', conf_dir, run.Raw(';')]
- if path:
-            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
- else:
- args.append('ceph-deploy')
- args.append(run.Raw(cmd))
- ec = admin.run(args=args, check_status=False).exitstatus
- if ec != 0:
- raise RuntimeError(
- "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec))
-
- if config.get('rhbuild'):
- path = None
- else:
- path = teuthology.get_testdir(ctx)
-        # test on a branch from the config, e.g. wip-*, master or next
-        # packages for all distros should exist for wip* branches
- if ctx.config.get('branch'):
- branch = ctx.config.get('branch')
- test_branch = ' --dev={branch} '.format(branch=branch)
- mons = ctx.cluster.only(teuthology.is_type('mon'))
- for node, role in mons.remotes.iteritems():
- admin = node
- admin.run(args=['mkdir', conf_dir], check_status=False)
- nodename = admin.shortname
- system_type = teuthology.get_system_type(admin)
- if config.get('rhbuild'):
- admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y'])
- log.info('system type is %s', system_type)
- osds = ctx.cluster.only(teuthology.is_type('osd'))
-
- for remote, roles in osds.remotes.iteritems():
- devs = teuthology.get_scratch_devices(remote)
- log.info("roles %s", roles)
- if (len(devs) < 3):
- log.error(
- 'Test needs minimum of 3 devices, only found %s',
- str(devs))
- raise RuntimeError("Needs minimum of 3 devices ")
-
- conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir)
- new_cmd = 'new ' + nodename
- execute_cdeploy(admin, new_cmd, path)
- if config.get('conf') is not None:
- confp = config.get('conf')
- for section, keys in confp.iteritems():
- lines = '[{section}]\n'.format(section=section)
- teuthology.append_lines_to_file(admin, conf_path, lines,
- sudo=True)
- for key, value in keys.iteritems():
- log.info("[%s] %s = %s" % (section, key, value))
- lines = '{key} = {value}\n'.format(key=key, value=value)
- teuthology.append_lines_to_file(admin, conf_path, lines,
- sudo=True)
- new_mon_install = 'install {branch} --mon '.format(
- branch=test_branch) + nodename
- new_osd_install = 'install {branch} --osd '.format(
- branch=test_branch) + nodename
- new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
- create_initial = 'mon create-initial '
- execute_cdeploy(admin, new_mon_install, path)
- execute_cdeploy(admin, new_osd_install, path)
- execute_cdeploy(admin, new_admin, path)
- execute_cdeploy(admin, create_initial, path)
-
- for i in range(3):
- zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
- prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i])
- execute_cdeploy(admin, zap_disk, path)
- execute_cdeploy(admin, prepare, path)
-
- log.info("list files for debugging purpose to check file permissions")
- admin.run(args=['ls', run.Raw('-lt'), conf_dir])
- remote.run(args=['sudo', 'ceph', '-s'], check_status=False)
- r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
- out = r.stdout.getvalue()
- log.info('Ceph health: %s', out.rstrip('\n'))
- log.info("Waiting for cluster to become healthy")
- with contextutil.safe_while(sleep=10, tries=6,
- action='check health') as proceed:
- while proceed():
- r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
- out = r.stdout.getvalue()
-            if out.split(None, 1)[0] == 'HEALTH_OK':
- break
- rgw_install = 'install {branch} --rgw {node}'.format(
- branch=test_branch,
- node=nodename,
- )
- rgw_create = 'rgw create ' + nodename
- execute_cdeploy(admin, rgw_install, path)
- execute_cdeploy(admin, rgw_create, path)
- log.info('All ceph-deploy cli tests passed')
- try:
- yield
- finally:
- log.info("cleaning up")
- ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
- 'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
- 'sudo', 'systemctl', 'stop', 'ceph.target'],
- check_status=False)
- time.sleep(4)
- for i in range(3):
- umount_dev = "{d}1".format(d=devs[i])
- r = remote.run(args=['sudo', 'umount', run.Raw(umount_dev)])
- cmd = 'purge ' + nodename
- execute_cdeploy(admin, cmd, path)
- cmd = 'purgedata ' + nodename
- execute_cdeploy(admin, cmd, path)
- log.info("Removing temporary dir")
- admin.run(
- args=[
- 'rm',
- run.Raw('-rf'),
- run.Raw(conf_dir)],
- check_status=False)
- if config.get('rhbuild'):
- admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])
-
-
-@contextlib.contextmanager
-def single_node_test(ctx, config):
- """
- - ceph-deploy.single_node_test: null
-
- #rhbuild testing
- - ceph-deploy.single_node_test:
- rhbuild: 1.2.3
-
- """
- log.info("Testing ceph-deploy on single node")
- if config is None:
- config = {}
- overrides = ctx.config.get('overrides', {})
- teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))
-
- if config.get('rhbuild'):
- log.info("RH Build, Skip Download")
- with contextutil.nested(
- lambda: cli_test(ctx=ctx, config=config),
- ):
- yield
- else:
- with contextutil.nested(
- lambda: install_fn.ship_utilities(ctx=ctx, config=None),
- lambda: download_ceph_deploy(ctx=ctx, config=config),
- lambda: cli_test(ctx=ctx, config=config),
- ):
- yield
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Set up and tear down a Ceph cluster.
-
- For example::
-
- tasks:
- - install:
- extras: yes
- - ssh_keys:
- - ceph-deploy:
- branch:
- stable: bobtail
- mon_initial_members: 1
- only_mon: true
- keep_running: true
-
- tasks:
- - install:
- extras: yes
- - ssh_keys:
- - ceph-deploy:
- branch:
- dev: master
- conf:
- mon:
- debug mon = 20
-
- tasks:
- - install:
- extras: yes
- - ssh_keys:
- - ceph-deploy:
- branch:
- testing:
- dmcrypt: yes
- separate_journal_disk: yes
-
- """
- if config is None:
- config = {}
-
- assert isinstance(config, dict), \
- "task ceph-deploy only supports a dictionary for configuration"
-
- overrides = ctx.config.get('overrides', {})
- teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))
-
- if config.get('branch') is not None:
- assert isinstance(
- config['branch'], dict), 'branch must be a dictionary'
-
- log.info('task ceph-deploy with config ' + str(config))
-
- with contextutil.nested(
- lambda: install_fn.ship_utilities(ctx=ctx, config=None),
- lambda: download_ceph_deploy(ctx=ctx, config=config),
- lambda: build_ceph_cluster(ctx=ctx, config=config),
- ):
- yield
+++ /dev/null
-"""
-Ceph FUSE client task
-"""
-
-import contextlib
-import logging
-
-from teuthology import misc as teuthology
-from cephfs.fuse_mount import FuseMount
-
-log = logging.getLogger(__name__)
-
-
-def get_client_configs(ctx, config):
- """
- Get a map of the configuration for each FUSE client in the configuration by
- combining the configuration of the current task with any global overrides.
-
- :param ctx: Context instance
- :param config: configuration for this task
- :return: dict of client name to config or to None
- """
- if config is None:
- config = dict(('client.{id}'.format(id=id_), None)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client'))
- elif isinstance(config, list):
- config = dict((name, None) for name in config)
-
- overrides = ctx.config.get('overrides', {})
- teuthology.deep_merge(config, overrides.get('ceph-fuse', {}))
-
- return config
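
A toy illustration of the normalization above: a bare list of clients becomes
a dict of per-client configs, onto which any ceph-fuse overrides are merged
(a shallow dict.update() stands in for teuthology's recursive deep_merge
here, and the override values are hypothetical)::

    config = ['client.0', 'client.1']
    config = dict((name, None) for name in config)
    assert config == {'client.0': None, 'client.1': None}

    overrides = {'client.0': {'mounted': False}}
    config.update(overrides)  # simplified stand-in for deep_merge
    assert config['client.0'] == {'mounted': False}
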
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Mount/unmount a ``ceph-fuse`` client.
-
- The config is optional and defaults to mounting on all clients. If
- a config is given, it is expected to be a list of clients to do
- this operation on. This lets you e.g. set up one client with
- ``ceph-fuse`` and another with ``kclient``.
-
- Example that mounts all clients::
-
- tasks:
- - ceph:
- - ceph-fuse:
- - interactive:
-
-    Example that uses both ``kclient`` and ``ceph-fuse``::
-
- tasks:
- - ceph:
- - ceph-fuse: [client.0]
- - kclient: [client.1]
- - interactive:
-
-    Example that enables valgrind::
-
- tasks:
- - ceph:
- - ceph-fuse:
- client.0:
- valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
- - interactive:
-
- Example that stops an already-mounted client:
-
- ::
-
- tasks:
- - ceph:
- - ceph-fuse: [client.0]
- - ... do something that requires the FS mounted ...
- - ceph-fuse:
- client.0:
- mounted: false
- - ... do something that requires the FS unmounted ...
-
-    Example that adds more generous wait time for mount (for virtual machines)::
-
- tasks:
- - ceph:
- - ceph-fuse:
- client.0:
- mount_wait: 60 # default is 0, do not wait before checking /sys/
- mount_timeout: 120 # default is 30, give up if /sys/ is not populated
- - interactive:
-
- :param ctx: Context
- :param config: Configuration
- """
- log.info('Mounting ceph-fuse clients...')
-
- testdir = teuthology.get_testdir(ctx)
- config = get_client_configs(ctx, config)
-
- # List clients we will configure mounts for, default is all clients
- clients = list(teuthology.get_clients(ctx=ctx, roles=filter(lambda x: 'client.' in x, config.keys())))
-
- all_mounts = getattr(ctx, 'mounts', {})
- mounted_by_me = {}
-
- # Construct any new FuseMount instances
- for id_, remote in clients:
- client_config = config.get("client.%s" % id_)
- if client_config is None:
- client_config = {}
-
- if id_ not in all_mounts:
- fuse_mount = FuseMount(client_config, testdir, id_, remote)
- all_mounts[id_] = fuse_mount
- else:
- # Catch bad configs where someone has e.g. tried to use ceph-fuse and kcephfs for the same client
- assert isinstance(all_mounts[id_], FuseMount)
-
- if not config.get("disabled", False) and client_config.get('mounted', True):
- mounted_by_me[id_] = all_mounts[id_]
-
- ctx.mounts = all_mounts
-
- # Mount any clients we have been asked to (default to mount all)
- for mount in mounted_by_me.values():
- mount.mount()
-
- for mount in mounted_by_me.values():
- mount.wait_until_mounted()
-
- # Umount any pre-existing clients that we have not been asked to mount
- for client_id in set(all_mounts.keys()) - set(mounted_by_me.keys()):
- mount = all_mounts[client_id]
- if mount.is_mounted():
- mount.umount_wait()
-
- try:
- yield all_mounts
- finally:
- log.info('Unmounting ceph-fuse clients...')
-
- for mount in mounted_by_me.values():
- # Conditional because an inner context might have umounted it
- if mount.is_mounted():
- mount.umount_wait()
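
A toy sketch of the bookkeeping above: ctx.mounts survives across ceph-fuse
task instances, so a later instance can unmount clients it was not asked to
keep mounted (the ``mounted: false`` case in the docstring). Client ids and
states here are hypothetical::

    # Mount registry left behind by an earlier ceph-fuse task instance.
    all_mounts = {'0': 'mounted', '1': 'mounted'}
    # Clients this task instance was asked to mount or keep mounted.
    mounted_by_me = {'0'}
    for client_id in set(all_mounts) - mounted_by_me:
        all_mounts[client_id] = 'unmounted'
    assert all_mounts == {'0': 'mounted', '1': 'unmounted'}
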
+++ /dev/null
-"""
-ceph manager -- Thrasher and CephManager objects
-"""
-from cStringIO import StringIO
-from functools import wraps
-import contextlib
-import random
-import signal
-import time
-import gevent
-import base64
-import json
-import logging
-import threading
-import traceback
-import os
-from teuthology import misc as teuthology
-from tasks.scrub import Scrubber
-from util.rados import cmd_erasure_code_profile
-from util import get_remote
-from teuthology.contextutil import safe_while
-from teuthology.orchestra.remote import Remote
-from teuthology.orchestra import run
-from teuthology.exceptions import CommandFailedError
-
-try:
- from subprocess import DEVNULL # py3k
-except ImportError:
- DEVNULL = open(os.devnull, 'r+')
-
-DEFAULT_CONF_PATH = '/etc/ceph/ceph.conf'
-
-log = logging.getLogger(__name__)
-
-
-def write_conf(ctx, conf_path=DEFAULT_CONF_PATH, cluster='ceph'):
- conf_fp = StringIO()
- ctx.ceph[cluster].conf.write(conf_fp)
- conf_fp.seek(0)
- writes = ctx.cluster.run(
- args=[
- 'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'),
- 'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'),
- 'sudo', 'python',
- '-c',
- ('import shutil, sys; '
- 'shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))'),
- conf_path,
- run.Raw('&&'),
- 'sudo', 'chmod', '0644', conf_path,
- ],
- stdin=run.PIPE,
- wait=False)
- teuthology.feed_many_stdins_and_close(conf_fp, writes)
- run.wait(writes)
-
-
-def mount_osd_data(ctx, remote, cluster, osd):
- """
- Mount a remote OSD
-
- :param ctx: Context
- :param remote: Remote site
- :param cluster: name of ceph cluster
- :param osd: Osd name
- """
- log.debug('Mounting data for osd.{o} on {r}'.format(o=osd, r=remote))
- role = "{0}.osd.{1}".format(cluster, osd)
- alt_role = role if cluster != 'ceph' else "osd.{0}".format(osd)
- if remote in ctx.disk_config.remote_to_roles_to_dev:
- if alt_role in ctx.disk_config.remote_to_roles_to_dev[remote]:
- role = alt_role
- if role not in ctx.disk_config.remote_to_roles_to_dev[remote]:
- return
- dev = ctx.disk_config.remote_to_roles_to_dev[remote][role]
- mount_options = ctx.disk_config.\
- remote_to_roles_to_dev_mount_options[remote][role]
- fstype = ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role]
- mnt = os.path.join('/var/lib/ceph/osd', '{0}-{1}'.format(cluster, osd))
-
-    log.info('Mounting osd.{o}: dev: {n}, cluster: {c}, '
-             'mountpoint: {p}, type: {t}, options: {v}'.format(
- o=osd, n=remote.name, p=mnt, t=fstype, v=mount_options,
- c=cluster))
-
- remote.run(
- args=[
- 'sudo',
- 'mount',
- '-t', fstype,
- '-o', ','.join(mount_options),
- dev,
- mnt,
- ]
- )
-
-
-class Thrasher:
- """
- Object used to thrash Ceph
- """
- def __init__(self, manager, config, logger=None):
- self.ceph_manager = manager
- self.cluster = manager.cluster
- self.ceph_manager.wait_for_clean()
- osd_status = self.ceph_manager.get_osd_status()
- self.in_osds = osd_status['in']
- self.live_osds = osd_status['live']
- self.out_osds = osd_status['out']
- self.dead_osds = osd_status['dead']
- self.stopping = False
- self.logger = logger
- self.config = config
- self.revive_timeout = self.config.get("revive_timeout", 150)
- if self.config.get('powercycle'):
- self.revive_timeout += 120
- self.clean_wait = self.config.get('clean_wait', 0)
- self.minin = self.config.get("min_in", 3)
- self.chance_move_pg = self.config.get('chance_move_pg', 1.0)
- self.sighup_delay = self.config.get('sighup_delay')
- self.optrack_toggle_delay = self.config.get('optrack_toggle_delay')
- self.dump_ops_enable = self.config.get('dump_ops_enable')
- self.noscrub_toggle_delay = self.config.get('noscrub_toggle_delay')
-
- num_osds = self.in_osds + self.out_osds
- self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
- if self.logger is not None:
- self.log = lambda x: self.logger.info(x)
- else:
- def tmp(x):
- """
- Implement log behavior
- """
- print x
- self.log = tmp
- if self.config is None:
- self.config = dict()
- # prevent monitor from auto-marking things out while thrasher runs
- # try both old and new tell syntax, in case we are testing old code
- try:
- manager.raw_cluster_cmd('--', 'tell', 'mon.*', 'injectargs',
- '--mon-osd-down-out-interval 0')
- except Exception:
- manager.raw_cluster_cmd('--', 'mon', 'tell', '*', 'injectargs',
- '--mon-osd-down-out-interval 0')
- self.thread = gevent.spawn(self.do_thrash)
- if self.sighup_delay:
- self.sighup_thread = gevent.spawn(self.do_sighup)
- if self.optrack_toggle_delay:
- self.optrack_toggle_thread = gevent.spawn(self.do_optrack_toggle)
- if self.dump_ops_enable == "true":
- self.dump_ops_thread = gevent.spawn(self.do_dump_ops)
- if self.noscrub_toggle_delay:
- self.noscrub_toggle_thread = gevent.spawn(self.do_noscrub_toggle)
- if (self.config.get('powercycle') or
- not self.cmd_exists_on_osds("ceph-objectstore-tool") or
- self.config.get('disable_objectstore_tool_tests', False)):
- self.ceph_objectstore_tool = False
- self.test_rm_past_intervals = False
- if self.config.get('powercycle'):
- self.log("Unable to test ceph-objectstore-tool, "
- "powercycle testing")
- else:
- self.log("Unable to test ceph-objectstore-tool, "
- "not available on all OSD nodes")
- else:
- self.ceph_objectstore_tool = \
- self.config.get('ceph_objectstore_tool', True)
- self.test_rm_past_intervals = \
- self.config.get('test_rm_past_intervals', True)
-
- def cmd_exists_on_osds(self, cmd):
- allremotes = self.ceph_manager.ctx.cluster.only(\
- teuthology.is_type('osd', self.cluster)).remotes.keys()
- allremotes = list(set(allremotes))
- for remote in allremotes:
- proc = remote.run(args=['type', cmd], wait=True,
- check_status=False, stdout=StringIO(),
- stderr=StringIO())
- if proc.exitstatus != 0:
-                return False
-        return True
-
- def kill_osd(self, osd=None, mark_down=False, mark_out=False):
- """
- :param osd: Osd to be killed.
- :mark_down: Mark down if true.
- :mark_out: Mark out if true.
- """
- if osd is None:
- osd = random.choice(self.live_osds)
- self.log("Killing osd %s, live_osds are %s" % (str(osd),
- str(self.live_osds)))
- self.live_osds.remove(osd)
- self.dead_osds.append(osd)
- self.ceph_manager.kill_osd(osd)
- if mark_down:
- self.ceph_manager.mark_down_osd(osd)
- if mark_out and osd in self.in_osds:
- self.out_osd(osd)
- if self.ceph_objectstore_tool:
- self.log("Testing ceph-objectstore-tool on down osd")
- remote = self.ceph_manager.find_remote('osd', osd)
- FSPATH = self.ceph_manager.get_filepath()
- JPATH = os.path.join(FSPATH, "journal")
- exp_osd = imp_osd = osd
- exp_remote = imp_remote = remote
- # If an older osd is available we'll move a pg from there
- if (len(self.dead_osds) > 1 and
- random.random() < self.chance_move_pg):
- exp_osd = random.choice(self.dead_osds[:-1])
- exp_remote = self.ceph_manager.find_remote('osd', exp_osd)
- if ('keyvaluestore_backend' in
- self.ceph_manager.ctx.ceph[self.cluster].conf['osd']):
- prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
- "--data-path {fpath} --journal-path {jpath} "
- "--type keyvaluestore "
- "--log-file="
- "/var/log/ceph/objectstore_tool.\\$pid.log ".
- format(fpath=FSPATH, jpath=JPATH))
- else:
- prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
- "--data-path {fpath} --journal-path {jpath} "
- "--log-file="
- "/var/log/ceph/objectstore_tool.\\$pid.log ".
- format(fpath=FSPATH, jpath=JPATH))
- cmd = (prefix + "--op list-pgs").format(id=exp_osd)
-
- # ceph-objectstore-tool might be temporarily absent during an
- # upgrade - see http://tracker.ceph.com/issues/18014
- with safe_while(sleep=15, tries=40, action="type ceph-objectstore-tool") as proceed:
- while proceed():
- proc = exp_remote.run(args=['type', 'ceph-objectstore-tool'],
- wait=True, check_status=False, stdout=StringIO(),
- stderr=StringIO())
- if proc.exitstatus == 0:
- break
- log.debug("ceph-objectstore-tool binary not present, trying again")
-
- proc = exp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- if proc.exitstatus:
- raise Exception("ceph-objectstore-tool: "
- "exp list-pgs failure with status {ret}".
- format(ret=proc.exitstatus))
- pgs = proc.stdout.getvalue().split('\n')[:-1]
- if len(pgs) == 0:
- self.log("No PGs found for osd.{osd}".format(osd=exp_osd))
- return
- pg = random.choice(pgs)
- exp_path = teuthology.get_testdir(self.ceph_manager.ctx)
- exp_path = os.path.join(exp_path, '{0}.data'.format(self.cluster))
- exp_path = os.path.join(exp_path,
- "exp.{pg}.{id}".format(
- pg=pg,
- id=exp_osd))
- # export
- cmd = prefix + "--op export --pgid {pg} --file {file}"
- cmd = cmd.format(id=exp_osd, pg=pg, file=exp_path)
- proc = exp_remote.run(args=cmd)
- if proc.exitstatus:
- raise Exception("ceph-objectstore-tool: "
- "export failure with status {ret}".
- format(ret=proc.exitstatus))
- # remove
- cmd = prefix + "--op remove --pgid {pg}"
- cmd = cmd.format(id=exp_osd, pg=pg)
- proc = exp_remote.run(args=cmd)
- if proc.exitstatus:
- raise Exception("ceph-objectstore-tool: "
- "remove failure with status {ret}".
- format(ret=proc.exitstatus))
- # If there are at least 2 dead osds we might move the pg
- if exp_osd != imp_osd:
- # If pg isn't already on this osd, then we will move it there
- cmd = (prefix + "--op list-pgs").format(id=imp_osd)
- proc = imp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- if proc.exitstatus:
- raise Exception("ceph-objectstore-tool: "
- "imp list-pgs failure with status {ret}".
- format(ret=proc.exitstatus))
- pgs = proc.stdout.getvalue().split('\n')[:-1]
- if pg not in pgs:
- self.log("Moving pg {pg} from osd.{fosd} to osd.{tosd}".
- format(pg=pg, fosd=exp_osd, tosd=imp_osd))
- if imp_remote != exp_remote:
- # Copy export file to the other machine
- self.log("Transfer export file from {srem} to {trem}".
- format(srem=exp_remote, trem=imp_remote))
- tmpexport = Remote.get_file(exp_remote, exp_path)
- Remote.put_file(imp_remote, tmpexport, exp_path)
- os.remove(tmpexport)
- else:
- # Can't move the pg after all
- imp_osd = exp_osd
- imp_remote = exp_remote
- # import
- cmd = (prefix + "--op import --file {file}")
- cmd = cmd.format(id=imp_osd, file=exp_path)
- proc = imp_remote.run(args=cmd, wait=True, check_status=False)
- if proc.exitstatus == 10:
- self.log("Pool went away before processing an import"
- "...ignored")
- elif proc.exitstatus == 11:
- self.log("Attempt to import an incompatible export"
- "...ignored")
- elif proc.exitstatus:
- raise Exception("ceph-objectstore-tool: "
- "import failure with status {ret}".
- format(ret=proc.exitstatus))
- cmd = "rm -f {file}".format(file=exp_path)
- exp_remote.run(args=cmd)
- if imp_remote != exp_remote:
- imp_remote.run(args=cmd)
-
- # apply low split settings to each pool
- for pool in self.ceph_manager.list_pools():
- no_sudo_prefix = prefix[5:]
- cmd = ("CEPH_ARGS='--filestore-merge-threshold 1 "
- "--filestore-split-multiple 1' sudo -E "
- + no_sudo_prefix + "--op apply-layout-settings --pool " + pool).format(id=osd)
- proc = remote.run(args=cmd, wait=True, check_status=False, stderr=StringIO())
- output = proc.stderr.getvalue()
- if 'Couldn\'t find pool' in output:
- continue
- if proc.exitstatus:
- raise Exception("ceph-objectstore-tool apply-layout-settings"
- " failed with {status}".format(status=proc.exitstatus))
-
- def rm_past_intervals(self, osd=None):
- """
- :param osd: Osd to find pg to remove past intervals
- """
- if self.test_rm_past_intervals:
- if osd is None:
- osd = random.choice(self.dead_osds)
- self.log("Use ceph_objectstore_tool to remove past intervals")
- remote = self.ceph_manager.find_remote('osd', osd)
- FSPATH = self.ceph_manager.get_filepath()
- JPATH = os.path.join(FSPATH, "journal")
- if ('keyvaluestore_backend' in
- self.ceph_manager.ctx.ceph[self.cluster].conf['osd']):
- prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
- "--data-path {fpath} --journal-path {jpath} "
- "--type keyvaluestore "
- "--log-file="
- "/var/log/ceph/objectstore_tool.\\$pid.log ".
- format(fpath=FSPATH, jpath=JPATH))
- else:
- prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
- "--data-path {fpath} --journal-path {jpath} "
- "--log-file="
- "/var/log/ceph/objectstore_tool.\\$pid.log ".
- format(fpath=FSPATH, jpath=JPATH))
- cmd = (prefix + "--op list-pgs").format(id=osd)
- proc = remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- if proc.exitstatus:
- raise Exception("ceph_objectstore_tool: "
- "exp list-pgs failure with status {ret}".
- format(ret=proc.exitstatus))
- pgs = proc.stdout.getvalue().split('\n')[:-1]
- if len(pgs) == 0:
- self.log("No PGs found for osd.{osd}".format(osd=osd))
- return
- pg = random.choice(pgs)
- cmd = (prefix + "--op rm-past-intervals --pgid {pg}").\
- format(id=osd, pg=pg)
- proc = remote.run(args=cmd)
- if proc.exitstatus:
- raise Exception("ceph_objectstore_tool: "
- "rm-past-intervals failure with status {ret}".
- format(ret=proc.exitstatus))
-
- def blackhole_kill_osd(self, osd=None):
- """
- If all else fails, kill the osd.
- :param osd: Osd to be killed.
- """
- if osd is None:
- osd = random.choice(self.live_osds)
- self.log("Blackholing and then killing osd %s, live_osds are %s" %
- (str(osd), str(self.live_osds)))
- self.live_osds.remove(osd)
- self.dead_osds.append(osd)
- self.ceph_manager.blackhole_kill_osd(osd)
-
- def revive_osd(self, osd=None, skip_admin_check=False):
- """
- Revive the osd.
- :param osd: Osd to be revived.
- """
- if osd is None:
- osd = random.choice(self.dead_osds)
- self.log("Reviving osd %s" % (str(osd),))
- self.ceph_manager.revive_osd(
- osd,
- self.revive_timeout,
- skip_admin_check=skip_admin_check)
- self.dead_osds.remove(osd)
- self.live_osds.append(osd)
-
- def out_osd(self, osd=None):
- """
- Mark the osd out
- :param osd: Osd to be marked.
- """
- if osd is None:
- osd = random.choice(self.in_osds)
- self.log("Removing osd %s, in_osds are: %s" %
- (str(osd), str(self.in_osds)))
- self.ceph_manager.mark_out_osd(osd)
- self.in_osds.remove(osd)
- self.out_osds.append(osd)
-
- def in_osd(self, osd=None):
- """
-        Mark the osd in, reviving it first if it is currently dead.
- :param osd: Osd to be marked.
- """
- if osd is None:
- osd = random.choice(self.out_osds)
- if osd in self.dead_osds:
- return self.revive_osd(osd)
- self.log("Adding osd %s" % (str(osd),))
- self.out_osds.remove(osd)
- self.in_osds.append(osd)
- self.ceph_manager.mark_in_osd(osd)
- self.log("Added osd %s" % (str(osd),))
-
- def reweight_osd_or_by_util(self, osd=None):
- """
-        Reweight a random "in" osd, or reweight all osds by utilization.
-        :param osd: Osd to be reweighted.
- """
- if osd is not None or random.choice([True, False]):
- if osd is None:
- osd = random.choice(self.in_osds)
- val = random.uniform(.1, 1.0)
- self.log("Reweighting osd %s to %s" % (str(osd), str(val)))
- self.ceph_manager.raw_cluster_cmd('osd', 'reweight',
- str(osd), str(val))
- else:
- # do it several times, the option space is large
- for i in range(5):
- options = {
- 'max_change': random.choice(['0.05', '1.0', '3.0']),
- 'overage': random.choice(['110', '1000']),
- 'type': random.choice([
- 'reweight-by-utilization',
- 'test-reweight-by-utilization']),
- }
- self.log("Reweighting by: %s"%(str(options),))
- self.ceph_manager.raw_cluster_cmd(
- 'osd',
- options['type'],
- options['overage'],
- options['max_change'])
-
- def primary_affinity(self, osd=None):
- if osd is None:
- osd = random.choice(self.in_osds)
- if random.random() >= .5:
- pa = random.random()
- elif random.random() >= .5:
- pa = 1
- else:
- pa = 0
- self.log('Setting osd %s primary_affinity to %f' % (str(osd), pa))
- self.ceph_manager.raw_cluster_cmd('osd', 'primary-affinity',
- str(osd), str(pa))
-
- def all_up(self):
- """
- Make sure all osds are up and not out.
- """
- while len(self.dead_osds) > 0:
- self.log("reviving osd")
- self.revive_osd()
- while len(self.out_osds) > 0:
- self.log("inning osd")
- self.in_osd()
-
- def do_join(self):
- """
-        Stop the thrashing loops and join their greenlets.
- """
- self.stopping = True
- self.thread.get()
- if self.sighup_delay:
- self.log("joining the do_sighup greenlet")
- self.sighup_thread.get()
- if self.optrack_toggle_delay:
- self.log("joining the do_optrack_toggle greenlet")
- self.optrack_toggle_thread.join()
- if self.dump_ops_enable == "true":
- self.log("joining the do_dump_ops greenlet")
- self.dump_ops_thread.join()
- if self.noscrub_toggle_delay:
- self.log("joining the do_noscrub_toggle greenlet")
- self.noscrub_toggle_thread.join()
-
- def grow_pool(self):
- """
-        Increase the number of placement groups in a pool.
- """
- pool = self.ceph_manager.get_pool()
- self.log("Growing pool %s" % (pool,))
- self.ceph_manager.expand_pool(pool,
- self.config.get('pool_grow_by', 10),
- self.max_pgs)
-
- def fix_pgp_num(self):
- """
-        Fix a pool's pgp_num to match its pg_num.
- """
- pool = self.ceph_manager.get_pool()
- self.log("fixing pg num pool %s" % (pool,))
- self.ceph_manager.set_pool_pgpnum(pool)
-
- def test_pool_min_size(self):
- """
- Kill and revive all osds except one.
- """
- self.log("test_pool_min_size")
- self.all_up()
- self.ceph_manager.wait_for_recovery(
- timeout=self.config.get('timeout')
- )
- the_one = random.choice(self.in_osds)
-        self.log("Killing everyone but %s" % (the_one,))
- to_kill = filter(lambda x: x != the_one, self.in_osds)
- [self.kill_osd(i) for i in to_kill]
- [self.out_osd(i) for i in to_kill]
- time.sleep(self.config.get("test_pool_min_size_time", 10))
- self.log("Killing %s" % (the_one,))
- self.kill_osd(the_one)
- self.out_osd(the_one)
- self.log("Reviving everyone but %s" % (the_one,))
- [self.revive_osd(i) for i in to_kill]
- [self.in_osd(i) for i in to_kill]
- self.log("Revived everyone but %s" % (the_one,))
- self.log("Waiting for clean")
- self.ceph_manager.wait_for_recovery(
- timeout=self.config.get('timeout')
- )
-
- def inject_pause(self, conf_key, duration, check_after, should_be_down):
- """
- Pause injection testing. Check for osd being down when finished.
- """
- the_one = random.choice(self.live_osds)
- self.log("inject_pause on {osd}".format(osd=the_one))
- self.log(
- "Testing {key} pause injection for duration {duration}".format(
- key=conf_key,
- duration=duration
- ))
- self.log(
- "Checking after {after}, should_be_down={shouldbedown}".format(
- after=check_after,
- shouldbedown=should_be_down
- ))
- self.ceph_manager.set_config(the_one, **{conf_key: duration})
- if not should_be_down:
- return
- time.sleep(check_after)
- status = self.ceph_manager.get_osd_status()
- assert the_one in status['down']
- time.sleep(duration - check_after + 20)
- status = self.ceph_manager.get_osd_status()
-        assert the_one not in status['down']
-
- def test_backfill_full(self):
- """
- Test backfills stopping when the replica fills up.
-
- First, use osd_backfill_full_ratio to simulate a now full
- osd by setting it to 0 on all of the OSDs.
-
- Second, on a random subset, set
- osd_debug_skip_full_check_in_backfill_reservation to force
- the more complicated check in do_scan to be exercised.
-
- Then, verify that all backfills stop.
- """
- self.log("injecting osd_backfill_full_ratio = 0")
- for i in self.live_osds:
- self.ceph_manager.set_config(
- i,
- osd_debug_skip_full_check_in_backfill_reservation=
- random.choice(['false', 'true']),
- osd_backfill_full_ratio=0)
- for i in range(30):
- status = self.ceph_manager.compile_pg_status()
- if 'backfill' not in status.keys():
- break
- self.log(
- "waiting for {still_going} backfills".format(
- still_going=status.get('backfill')))
- time.sleep(1)
- assert('backfill' not in self.ceph_manager.compile_pg_status().keys())
- for i in self.live_osds:
- self.ceph_manager.set_config(
- i,
- osd_debug_skip_full_check_in_backfill_reservation='false',
- osd_backfill_full_ratio=0.85)
-
- def test_map_discontinuity(self):
- """
- 1) Allows the osds to recover
- 2) kills an osd
- 3) allows the remaining osds to recover
- 4) waits for some time
- 5) revives the osd
-        This sequence should cause the revived osd to have to handle
-        a map gap, since the mons will have trimmed old osdmaps.
- """
- while len(self.in_osds) < (self.minin + 1):
- self.in_osd()
- self.log("Waiting for recovery")
- self.ceph_manager.wait_for_all_up(
- timeout=self.config.get('timeout')
- )
- # now we wait 20s for the pg status to change, if it takes longer,
- # the test *should* fail!
- time.sleep(20)
- self.ceph_manager.wait_for_clean(
- timeout=self.config.get('timeout')
- )
-
- # now we wait 20s for the backfill replicas to hear about the clean
- time.sleep(20)
- self.log("Recovered, killing an osd")
- self.kill_osd(mark_down=True, mark_out=True)
- self.log("Waiting for clean again")
- self.ceph_manager.wait_for_clean(
- timeout=self.config.get('timeout')
- )
- self.log("Waiting for trim")
- time.sleep(int(self.config.get("map_discontinuity_sleep_time", 40)))
- self.revive_osd()
-
- def choose_action(self):
- """
- Random action selector.
- """
- chance_down = self.config.get('chance_down', 0.4)
- chance_test_min_size = self.config.get('chance_test_min_size', 0)
- chance_test_backfill_full = \
- self.config.get('chance_test_backfill_full', 0)
- if isinstance(chance_down, int):
- chance_down = float(chance_down) / 100
- minin = self.minin
- minout = self.config.get("min_out", 0)
- minlive = self.config.get("min_live", 2)
- mindead = self.config.get("min_dead", 0)
-
- self.log('choose_action: min_in %d min_out '
- '%d min_live %d min_dead %d' %
- (minin, minout, minlive, mindead))
- actions = []
- if len(self.in_osds) > minin:
- actions.append((self.out_osd, 1.0,))
- if len(self.live_osds) > minlive and chance_down > 0:
- actions.append((self.kill_osd, chance_down,))
- if len(self.dead_osds) > 1:
- actions.append((self.rm_past_intervals, 1.0,))
- if len(self.out_osds) > minout:
- actions.append((self.in_osd, 1.7,))
- if len(self.dead_osds) > mindead:
- actions.append((self.revive_osd, 1.0,))
- if self.config.get('thrash_primary_affinity', True):
- actions.append((self.primary_affinity, 1.0,))
- actions.append((self.reweight_osd_or_by_util,
- self.config.get('reweight_osd', .5),))
- actions.append((self.grow_pool,
- self.config.get('chance_pgnum_grow', 0),))
- actions.append((self.fix_pgp_num,
- self.config.get('chance_pgpnum_fix', 0),))
- actions.append((self.test_pool_min_size,
- chance_test_min_size,))
- actions.append((self.test_backfill_full,
- chance_test_backfill_full,))
- for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
- for scenario in [
- (lambda:
- self.inject_pause(key,
- self.config.get('pause_short', 3),
- 0,
- False),
- self.config.get('chance_inject_pause_short', 1),),
- (lambda:
- self.inject_pause(key,
- self.config.get('pause_long', 80),
- self.config.get('pause_check_after', 70),
- True),
- self.config.get('chance_inject_pause_long', 0),)]:
- actions.append(scenario)
-
- total = sum([y for (x, y) in actions])
- val = random.uniform(0, total)
- for (action, prob) in actions:
- if val < prob:
- return action
- val -= prob
- return None
-
- def log_exc(func):
- @wraps(func)
- def wrapper(self):
- try:
- return func(self)
- except:
- self.log(traceback.format_exc())
- raise
- return wrapper
-
- @log_exc
- def do_sighup(self):
- """
- Loops and sends signal.SIGHUP to a random live osd.
-
- Loop delay is controlled by the config value sighup_delay.
- """
- delay = float(self.sighup_delay)
- self.log("starting do_sighup with a delay of {0}".format(delay))
- while not self.stopping:
- osd = random.choice(self.live_osds)
- self.ceph_manager.signal_osd(osd, signal.SIGHUP, silent=True)
- time.sleep(delay)
-
- @log_exc
- def do_optrack_toggle(self):
- """
-        Loops and toggles op tracking on all osds.
-
- Loop delay is controlled by the config value optrack_toggle_delay.
- """
- delay = float(self.optrack_toggle_delay)
- osd_state = "true"
- self.log("starting do_optrack_toggle with a delay of {0}".format(delay))
- while not self.stopping:
- if osd_state == "true":
- osd_state = "false"
- else:
- osd_state = "true"
- self.ceph_manager.raw_cluster_cmd_result('tell', 'osd.*',
- 'injectargs', '--osd_enable_op_tracker=%s' % osd_state)
- gevent.sleep(delay)
-
- @log_exc
- def do_dump_ops(self):
- """
-        Loops and dumps in-flight, blocked and historic ops on all osds.
- """
- self.log("starting do_dump_ops")
- while not self.stopping:
- for osd in self.live_osds:
- # Ignore errors because live_osds is in flux
- self.ceph_manager.osd_admin_socket(osd, command=['dump_ops_in_flight'],
- check_status=False, timeout=30, stdout=DEVNULL)
- self.ceph_manager.osd_admin_socket(osd, command=['dump_blocked_ops'],
- check_status=False, timeout=30, stdout=DEVNULL)
- self.ceph_manager.osd_admin_socket(osd, command=['dump_historic_ops'],
- check_status=False, timeout=30, stdout=DEVNULL)
- gevent.sleep(0)
-
- @log_exc
- def do_noscrub_toggle(self):
- """
-        Loops and toggles the noscrub and nodeep-scrub flags.
-
- Loop delay is controlled by the config value noscrub_toggle_delay.
- """
- delay = float(self.noscrub_toggle_delay)
- scrub_state = "none"
- self.log("starting do_noscrub_toggle with a delay of {0}".format(delay))
- while not self.stopping:
- if scrub_state == "none":
- self.ceph_manager.raw_cluster_cmd('osd', 'set', 'noscrub')
- scrub_state = "noscrub"
- elif scrub_state == "noscrub":
- self.ceph_manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')
- scrub_state = "both"
- elif scrub_state == "both":
- self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
- scrub_state = "nodeep-scrub"
- else:
- self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
- scrub_state = "none"
- gevent.sleep(delay)
- self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
- self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
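
The toggle above walks the cluster flags through none -> noscrub ->
noscrub+nodeep-scrub -> nodeep-scrub -> none. A standalone sketch of that
cycle; commands are printed rather than issued::

    import itertools

    steps = itertools.cycle([('set', 'noscrub'),
                             ('set', 'nodeep-scrub'),
                             ('unset', 'noscrub'),
                             ('unset', 'nodeep-scrub')])
    for _ in range(8):  # two full revolutions
        op, flag = next(steps)
        print('ceph osd %s %s' % (op, flag))
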
-
- @log_exc
- def do_thrash(self):
- """
- Loop to select random actions to thrash ceph manager with.
- """
- cleanint = self.config.get("clean_interval", 60)
- scrubint = self.config.get("scrub_interval", -1)
- maxdead = self.config.get("max_dead", 0)
- delay = self.config.get("op_delay", 5)
- self.log("starting do_thrash")
- while not self.stopping:
- to_log = [str(x) for x in ["in_osds: ", self.in_osds,
- "out_osds: ", self.out_osds,
- "dead_osds: ", self.dead_osds,
- "live_osds: ", self.live_osds]]
- self.log(" ".join(to_log))
- if random.uniform(0, 1) < (float(delay) / cleanint):
- while len(self.dead_osds) > maxdead:
- self.revive_osd()
- for osd in self.in_osds:
- self.ceph_manager.raw_cluster_cmd('osd', 'reweight',
- str(osd), str(1))
- if random.uniform(0, 1) < float(
- self.config.get('chance_test_map_discontinuity', 0)):
- self.test_map_discontinuity()
- else:
- self.ceph_manager.wait_for_recovery(
- timeout=self.config.get('timeout')
- )
- time.sleep(self.clean_wait)
- if scrubint > 0:
- if random.uniform(0, 1) < (float(delay) / scrubint):
- self.log('Scrubbing while thrashing being performed')
- Scrubber(self.ceph_manager, self.config)
- self.choose_action()()
- time.sleep(delay)
- self.all_up()
-
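choose_action() above draws one action with probability proportional to its
weight: sample uniformly over the total weight and subtract entry weights
until the sample falls inside one. A standalone sketch of that weighted draw;
the action names are hypothetical::

    import random

    def weighted_choice(actions):
        # actions: list of (value, weight) pairs; weights need not sum to 1.
        total = sum(weight for _, weight in actions)
        sample = random.uniform(0, total)
        for value, weight in actions:
            if sample < weight:
                return value
            sample -= weight
        return None  # only if the sample lands exactly on the total

    print(weighted_choice([('out_osd', 1.0),
                           ('kill_osd', 0.4),
                           ('grow_pool', 0.1)]))
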
-
-class ObjectStoreTool:
-
- def __init__(self, manager, pool, **kwargs):
- self.manager = manager
- self.pool = pool
- self.osd = kwargs.get('osd', None)
- self.object_name = kwargs.get('object_name', None)
- self.do_revive = kwargs.get('do_revive', True)
- if self.osd and self.pool and self.object_name:
- if self.osd == "primary":
- self.osd = self.manager.get_object_primary(self.pool,
- self.object_name)
- assert self.osd
- if self.object_name:
- self.pgid = self.manager.get_object_pg_with_shard(self.pool,
- self.object_name,
- self.osd)
- self.remote = self.manager.ctx.\
- cluster.only('osd.{o}'.format(o=self.osd)).remotes.keys()[0]
- path = self.manager.get_filepath().format(id=self.osd)
- self.paths = ("--data-path {path} --journal-path {path}/journal".
- format(path=path))
-
- def build_cmd(self, options, args, stdin):
- lines = []
- if self.object_name:
- lines.append("object=$(sudo adjust-ulimits ceph-objectstore-tool "
- "{paths} --pgid {pgid} --op list |"
- "grep '\"oid\":\"{name}\"')".
- format(paths=self.paths,
- pgid=self.pgid,
- name=self.object_name))
- args = '"$object" ' + args
- options += " --pgid {pgid}".format(pgid=self.pgid)
- cmd = ("sudo adjust-ulimits ceph-objectstore-tool {paths} {options} {args}".
- format(paths=self.paths,
- args=args,
- options=options))
- if stdin:
- cmd = ("echo {payload} | base64 --decode | {cmd}".
-                   format(payload=base64.b64encode(stdin),
- cmd=cmd))
- lines.append(cmd)
- return "\n".join(lines)
-
- def run(self, options, args, stdin=None, stdout=None):
- if stdout is None:
- stdout = StringIO()
- self.manager.kill_osd(self.osd)
- cmd = self.build_cmd(options, args, stdin)
- self.manager.log(cmd)
- try:
- proc = self.remote.run(args=['bash', '-e', '-x', '-c', cmd],
- check_status=False,
- stdout=stdout,
- stderr=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- self.manager.log("failed with " + str(proc.exitstatus))
- error = proc.stdout.getvalue() + " " + proc.stderr.getvalue()
- raise Exception(error)
- finally:
- if self.do_revive:
- self.manager.revive_osd(self.osd)
-
-
-class CephManager:
- """
- Ceph manager object.
-    Contains several local functions that form the bulk of this module.
-
- Note: this class has nothing to do with the Ceph daemon (ceph-mgr) of
- the same name.
- """
-
- REPLICATED_POOL = 1
- ERASURE_CODED_POOL = 3
-
- def __init__(self, controller, ctx=None, config=None, logger=None,
- cluster='ceph'):
- self.lock = threading.RLock()
- self.ctx = ctx
- self.config = config
- self.controller = controller
- self.next_pool_id = 0
- self.cluster = cluster
- if (logger):
- self.log = lambda x: logger.info(x)
- else:
- def tmp(x):
- """
- implement log behavior.
- """
- print x
- self.log = tmp
- if self.config is None:
- self.config = dict()
- pools = self.list_pools()
- self.pools = {}
- for pool in pools:
- # we may race with a pool deletion; ignore failures here
- try:
- self.pools[pool] = self.get_pool_property(pool, 'pg_num')
- except CommandFailedError:
- self.log('Failed to get pg_num from pool %s, ignoring' % pool)
-
- def raw_cluster_cmd(self, *args):
- """
-        Run a ceph command against the cluster and return its stdout.
- """
- testdir = teuthology.get_testdir(self.ctx)
- ceph_args = [
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'timeout',
- '120',
- 'ceph',
- '--cluster',
- self.cluster,
- ]
- ceph_args.extend(args)
- proc = self.controller.run(
- args=ceph_args,
- stdout=StringIO(),
- )
- return proc.stdout.getvalue()
-
- def raw_cluster_cmd_result(self, *args):
- """
-        Run a ceph command against the cluster and return its exit status.
- """
- testdir = teuthology.get_testdir(self.ctx)
- ceph_args = [
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'timeout',
- '120',
- 'ceph',
- '--cluster',
- self.cluster,
- ]
- ceph_args.extend(args)
- proc = self.controller.run(
- args=ceph_args,
- check_status=False,
- )
- return proc.exitstatus
-
- def run_ceph_w(self):
- """
- Execute "ceph -w" in the background with stdout connected to a StringIO,
- and return the RemoteProcess.
- """
- return self.controller.run(
- args=["sudo",
- "daemon-helper",
- "kill",
- "ceph",
- '--cluster',
- self.cluster,
- "-w"],
- wait=False, stdout=StringIO(), stdin=run.PIPE)
-
- def do_rados(self, remote, cmd, check_status=True):
- """
- Execute a remote rados command.
- """
- testdir = teuthology.get_testdir(self.ctx)
- pre = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rados',
- '--cluster',
- self.cluster,
- ]
- pre.extend(cmd)
- proc = remote.run(
- args=pre,
- wait=True,
- check_status=check_status
- )
- return proc
-
- def rados_write_objects(self, pool, num_objects, size,
- timelimit, threads, cleanup=False):
- """
- Write rados objects
- Threads not used yet.
- """
- args = [
- '-p', pool,
- '--num-objects', num_objects,
- '-b', size,
- 'bench', timelimit,
- 'write'
- ]
- if not cleanup:
- args.append('--no-cleanup')
- return self.do_rados(self.controller, map(str, args))
-
- def do_put(self, pool, obj, fname, namespace=None):
- """
- Implement rados put operation
- """
- args = ['-p', pool]
- if namespace is not None:
- args += ['-N', namespace]
- args += [
- 'put',
- obj,
- fname
- ]
- return self.do_rados(
- self.controller,
- args,
- check_status=False
- ).exitstatus
-
- def do_get(self, pool, obj, fname='/dev/null', namespace=None):
- """
- Implement rados get operation
- """
- args = ['-p', pool]
- if namespace is not None:
- args += ['-N', namespace]
- args += [
- 'get',
- obj,
- fname
- ]
- return self.do_rados(
- self.controller,
- args,
- check_status=False
- ).exitstatus
-
- def do_rm(self, pool, obj, namespace=None):
- """
- Implement rados rm operation
- """
- args = ['-p', pool]
- if namespace is not None:
- args += ['-N', namespace]
- args += [
- 'rm',
- obj
- ]
- return self.do_rados(
- self.controller,
- args,
- check_status=False
- ).exitstatus
-
- def osd_admin_socket(self, osd_id, command, check_status=True, timeout=0, stdout=None):
- if stdout is None:
- stdout = StringIO()
- return self.admin_socket('osd', osd_id, command, check_status, timeout, stdout)
-
- def find_remote(self, service_type, service_id):
- """
- Get the Remote for the host where a particular service runs.
-
- :param service_type: 'mds', 'osd', 'client'
- :param service_id: The second part of a role, e.g. '0' for
- the role 'client.0'
- :return: a Remote instance for the host where the
- requested role is placed
- """
- return get_remote(self.ctx, self.cluster,
- service_type, service_id)
-
- def admin_socket(self, service_type, service_id,
- command, check_status=True, timeout=0, stdout=None):
- """
-        Remotely run a ceph command against the daemon's admin socket
- :param command: a list of words to use as the command
- to the admin socket
- """
- if stdout is None:
- stdout = StringIO()
- testdir = teuthology.get_testdir(self.ctx)
- remote = self.find_remote(service_type, service_id)
- args = [
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'timeout',
- str(timeout),
- 'ceph',
- '--cluster',
- self.cluster,
- '--admin-daemon',
- '/var/run/ceph/{cluster}-{type}.{id}.asok'.format(
- cluster=self.cluster,
- type=service_type,
- id=service_id),
- ]
- args.extend(command)
- return remote.run(
- args=args,
- stdout=stdout,
- wait=True,
- check_status=check_status
- )
-
- def objectstore_tool(self, pool, options, args, **kwargs):
- return ObjectStoreTool(self, pool, **kwargs).run(options, args)
-
- def get_pgid(self, pool, pgnum):
- """
- :param pool: pool name
- :param pgnum: pg number
- :returns: a string representing this pg.
- """
- poolnum = self.get_pool_num(pool)
- pg_str = "{poolnum}.{pgnum}".format(
- poolnum=poolnum,
- pgnum=pgnum)
- return pg_str
-
- def get_pg_replica(self, pool, pgnum):
- """
-        get a replica osd for pool, pgnum (e.g. (data, 0) -> 0)
- """
- output = self.raw_cluster_cmd("pg", "dump", '--format=json')
- j = json.loads('\n'.join(output.split('\n')[1:]))
- pg_str = self.get_pgid(pool, pgnum)
- for pg in j['pg_stats']:
- if pg['pgid'] == pg_str:
- return int(pg['acting'][-1])
- assert False
-
- def get_pg_primary(self, pool, pgnum):
- """
-        get the primary osd for pool, pgnum (e.g. (data, 0) -> 0)
- """
- output = self.raw_cluster_cmd("pg", "dump", '--format=json')
- j = json.loads('\n'.join(output.split('\n')[1:]))
- pg_str = self.get_pgid(pool, pgnum)
- for pg in j['pg_stats']:
- if pg['pgid'] == pg_str:
- return int(pg['acting'][0])
- assert False
-
- def get_pool_num(self, pool):
- """
- get number for pool (e.g., data -> 2)
- """
- return int(self.get_pool_dump(pool)['pool'])
-
- def list_pools(self):
- """
- list all pool names
- """
- osd_dump = self.get_osd_dump_json()
- self.log(osd_dump['pools'])
- return [str(i['pool_name']) for i in osd_dump['pools']]
-
- def clear_pools(self):
- """
- remove all pools
- """
- [self.remove_pool(i) for i in self.list_pools()]
-
- def kick_recovery_wq(self, osdnum):
- """
- Run kick_recovery_wq on cluster.
- """
- return self.raw_cluster_cmd(
- 'tell', "osd.%d" % (int(osdnum),),
- 'debug',
- 'kick_recovery_wq',
- '0')
-
- def wait_run_admin_socket(self, service_type,
- service_id, args=['version'], timeout=75, stdout=None):
- """
-        If the admin_socket call succeeds, return. Otherwise wait
- five seconds and try again.
- """
- if stdout is None:
- stdout = StringIO()
- tries = 0
- while True:
- proc = self.admin_socket(service_type, service_id,
- args, check_status=False, stdout=stdout)
-            if proc.exitstatus == 0:
- break
- else:
- tries += 1
- if (tries * 5) > timeout:
- raise Exception('timed out waiting for admin_socket '
- 'to appear after {type}.{id} restart'.
- format(type=service_type,
- id=service_id))
- self.log("waiting on admin_socket for {type}-{id}, "
- "{command}".format(type=service_type,
- id=service_id,
- command=args))
- time.sleep(5)
-
- def get_pool_dump(self, pool):
- """
-        get the pool's entry from the osd dump
- """
- osd_dump = self.get_osd_dump_json()
- for i in osd_dump['pools']:
- if i['pool_name'] == pool:
- return i
- assert False
-
- def set_config(self, osdnum, **argdict):
- """
- :param osdnum: osd number
- :param argdict: dictionary containing values to set.
- """
- for k, v in argdict.iteritems():
- self.wait_run_admin_socket(
- 'osd', osdnum,
- ['config', 'set', str(k), str(v)])
-
- def raw_cluster_status(self):
- """
- Get status from cluster
- """
- status = self.raw_cluster_cmd('status', '--format=json-pretty')
- return json.loads(status)
-
- def raw_osd_status(self):
- """
- Get osd status from cluster
- """
- return self.raw_cluster_cmd('osd', 'dump')
-
- def get_osd_status(self):
- """
-        Get osd ids grouped by the state each osd is in.
- """
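-        # up/down/in/out are parsed from the text output of 'ceph osd dump';
-        # dead/live come from the locally tracked osd daemon handles.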
- osd_lines = filter(
- lambda x: x.startswith('osd.') and (("up" in x) or ("down" in x)),
- self.raw_osd_status().split('\n'))
- self.log(osd_lines)
- in_osds = [int(i[4:].split()[0])
- for i in filter(lambda x: " in " in x, osd_lines)]
- out_osds = [int(i[4:].split()[0])
- for i in filter(lambda x: " out " in x, osd_lines)]
- up_osds = [int(i[4:].split()[0])
- for i in filter(lambda x: " up " in x, osd_lines)]
- down_osds = [int(i[4:].split()[0])
- for i in filter(lambda x: " down " in x, osd_lines)]
- dead_osds = [int(x.id_)
- for x in filter(lambda x:
- not x.running(),
- self.ctx.daemons.
- iter_daemons_of_role('osd', self.cluster))]
- live_osds = [int(x.id_) for x in
- filter(lambda x:
- x.running(),
- self.ctx.daemons.iter_daemons_of_role('osd',
- self.cluster))]
- return {'in': in_osds, 'out': out_osds, 'up': up_osds,
- 'down': down_osds, 'dead': dead_osds, 'live': live_osds,
- 'raw': osd_lines}
-
- def get_num_pgs(self):
- """
- Check cluster status for the number of pgs
- """
- status = self.raw_cluster_status()
- self.log(status)
- return status['pgmap']['num_pgs']
-
- def create_erasure_code_profile(self, profile_name, profile):
- """
-        Create an erasure code profile that can be referenced by name
-        when creating an erasure coded pool.
- """
- with self.lock:
- args = cmd_erasure_code_profile(profile_name, profile)
- self.raw_cluster_cmd(*args)
-
- def create_pool_with_unique_name(self, pg_num=16,
- erasure_code_profile_name=None,
- min_size=None,
- erasure_code_use_hacky_overwrites=False):
- """
- Create a pool named unique_pool_X where X is unique.
- """
- name = ""
- with self.lock:
- name = "unique_pool_%s" % (str(self.next_pool_id),)
- self.next_pool_id += 1
- self.create_pool(
- name,
- pg_num,
- erasure_code_profile_name=erasure_code_profile_name,
- min_size=min_size,
- erasure_code_use_hacky_overwrites=erasure_code_use_hacky_overwrites)
- return name
-
- @contextlib.contextmanager
- def pool(self, pool_name, pg_num=16, erasure_code_profile_name=None):
- self.create_pool(pool_name, pg_num, erasure_code_profile_name)
- yield
- self.remove_pool(pool_name)
-
- def create_pool(self, pool_name, pg_num=16,
- erasure_code_profile_name=None,
- min_size=None,
- erasure_code_use_hacky_overwrites=False):
- """
- Create a pool named from the pool_name parameter.
- :param pool_name: name of the pool being created.
- :param pg_num: initial number of pgs.
-        :param erasure_code_profile_name: if set (not None), create an
- erasure coded pool using the profile
- :param erasure_code_use_hacky_overwrites: if true, use the hacky
- overwrites mode
- """
- with self.lock:
- assert isinstance(pool_name, basestring)
- assert isinstance(pg_num, int)
- assert pool_name not in self.pools
- self.log("creating pool_name %s" % (pool_name,))
- if erasure_code_profile_name:
- self.raw_cluster_cmd('osd', 'pool', 'create',
- pool_name, str(pg_num), str(pg_num),
- 'erasure', erasure_code_profile_name)
- else:
- self.raw_cluster_cmd('osd', 'pool', 'create',
- pool_name, str(pg_num))
- if min_size is not None:
- self.raw_cluster_cmd(
- 'osd', 'pool', 'set', pool_name,
- 'min_size',
- str(min_size))
- if erasure_code_use_hacky_overwrites:
- self.raw_cluster_cmd(
- 'osd', 'pool', 'set', pool_name,
- 'debug_white_box_testing_ec_overwrites',
- 'true')
- self.pools[pool_name] = pg_num
- time.sleep(1)
-
- def add_pool_snap(self, pool_name, snap_name):
- """
- Add pool snapshot
- :param pool_name: name of pool to snapshot
- :param snap_name: name of snapshot to take
- """
- self.raw_cluster_cmd('osd', 'pool', 'mksnap',
- str(pool_name), str(snap_name))
-
- def remove_pool_snap(self, pool_name, snap_name):
- """
- Remove pool snapshot
- :param pool_name: name of pool to snapshot
- :param snap_name: name of snapshot to remove
- """
- self.raw_cluster_cmd('osd', 'pool', 'rmsnap',
- str(pool_name), str(snap_name))
-
- def remove_pool(self, pool_name):
- """
- Remove the indicated pool
- :param pool_name: Pool to be removed
- """
- with self.lock:
- assert isinstance(pool_name, basestring)
- assert pool_name in self.pools
- self.log("removing pool_name %s" % (pool_name,))
- del self.pools[pool_name]
- self.do_rados(self.controller,
- ['rmpool', pool_name, pool_name,
- "--yes-i-really-really-mean-it"])
-
- def get_pool(self):
- """
- Pick a random pool
- """
- with self.lock:
- return random.choice(self.pools.keys())
-
- def get_pool_pg_num(self, pool_name):
- """
- Return the number of pgs in the pool specified.
- """
- with self.lock:
- assert isinstance(pool_name, basestring)
- if pool_name in self.pools:
- return self.pools[pool_name]
- return 0
-
- def get_pool_property(self, pool_name, prop):
- """
- :param pool_name: pool
- :param prop: property to be checked.
- :returns: property as an int value.
- """
- with self.lock:
- assert isinstance(pool_name, basestring)
- assert isinstance(prop, basestring)
- output = self.raw_cluster_cmd(
- 'osd',
- 'pool',
- 'get',
- pool_name,
- prop)
- return int(output.split()[1])
-
- def set_pool_property(self, pool_name, prop, val):
- """
- :param pool_name: pool
- :param prop: property to be set.
- :param val: value to set.
-
- This routine retries if set operation fails.
- """
- with self.lock:
- assert isinstance(pool_name, basestring)
- assert isinstance(prop, basestring)
- assert isinstance(val, int)
- tries = 0
- while True:
- r = self.raw_cluster_cmd_result(
- 'osd',
- 'pool',
- 'set',
- pool_name,
- prop,
- str(val))
- if r != 11: # EAGAIN
- break
- tries += 1
- if tries > 50:
- raise Exception('timed out getting EAGAIN '
- 'when setting pool property %s %s = %s' %
- (pool_name, prop, val))
- self.log('got EAGAIN setting pool property, '
- 'waiting a few seconds...')
- time.sleep(2)
-
- def expand_pool(self, pool_name, by, max_pgs):
- """
- Increase the number of pgs in a pool
- """
- with self.lock:
- assert isinstance(pool_name, basestring)
- assert isinstance(by, int)
- assert pool_name in self.pools
- if self.get_num_creating() > 0:
- return
- if (self.pools[pool_name] + by) > max_pgs:
- return
- self.log("increase pool size by %d" % (by,))
- new_pg_num = self.pools[pool_name] + by
- self.set_pool_property(pool_name, "pg_num", new_pg_num)
- self.pools[pool_name] = new_pg_num
-
- def set_pool_pgpnum(self, pool_name):
- """
- Set pgpnum property of pool_name pool.
- """
- with self.lock:
- assert isinstance(pool_name, basestring)
- assert pool_name in self.pools
- if self.get_num_creating() > 0:
- return
- self.set_pool_property(pool_name, 'pgp_num', self.pools[pool_name])
-
- def list_pg_missing(self, pgid):
- """
-        return the list of missing objects for the pg specified
- """
- r = None
- offset = {}
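-        # 'pg <pgid> list_missing' pages its results; keep requesting with
-        # the last oid seen as the offset until 'more' is exhausted.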
- while True:
- out = self.raw_cluster_cmd('--', 'pg', pgid, 'list_missing',
- json.dumps(offset))
- j = json.loads(out)
- if r is None:
- r = j
- else:
- r['objects'].extend(j['objects'])
-            if 'more' not in j:
- break
- if j['more'] == 0:
- break
- offset = j['objects'][-1]['oid']
- if 'more' in r:
- del r['more']
- return r
-
- def get_pg_stats(self):
- """
- Dump the cluster and get pg stats
- """
- out = self.raw_cluster_cmd('pg', 'dump', '--format=json')
- j = json.loads('\n'.join(out.split('\n')[1:]))
- return j['pg_stats']
-
- def compile_pg_status(self):
- """
- Return a histogram of pg state values
- """
- ret = {}
- j = self.get_pg_stats()
- for pg in j:
- for status in pg['state'].split('+'):
- if status not in ret:
- ret[status] = 0
- ret[status] += 1
- return ret
-
- def pg_scrubbing(self, pool, pgnum):
- """
- pg scrubbing wrapper
- """
- pgstr = self.get_pgid(pool, pgnum)
- stats = self.get_single_pg_stats(pgstr)
- return 'scrub' in stats['state']
-
- def pg_repairing(self, pool, pgnum):
- """
- pg repairing wrapper
- """
- pgstr = self.get_pgid(pool, pgnum)
- stats = self.get_single_pg_stats(pgstr)
- return 'repair' in stats['state']
-
- def pg_inconsistent(self, pool, pgnum):
- """
- pg inconsistent wrapper
- """
- pgstr = self.get_pgid(pool, pgnum)
- stats = self.get_single_pg_stats(pgstr)
- return 'inconsistent' in stats['state']
-
- def get_last_scrub_stamp(self, pool, pgnum):
- """
- Get the timestamp of the last scrub.
- """
- stats = self.get_single_pg_stats(self.get_pgid(pool, pgnum))
- return stats["last_scrub_stamp"]
-
- def do_pg_scrub(self, pool, pgnum, stype):
- """
- Scrub pg and wait for scrubbing to finish
- """
- init = self.get_last_scrub_stamp(pool, pgnum)
- RESEND_TIMEOUT = 120 # Must be a multiple of SLEEP_TIME
- FATAL_TIMEOUT = RESEND_TIMEOUT * 3
- SLEEP_TIME = 10
- timer = 0
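-        # Poll last_scrub_stamp until it changes; re-send the scrub request
-        # every RESEND_TIMEOUT seconds and give up after FATAL_TIMEOUT.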
- while init == self.get_last_scrub_stamp(pool, pgnum):
- assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype
- self.log("waiting for scrub type %s" % (stype,))
- if (timer % RESEND_TIMEOUT) == 0:
- self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
- # The first time in this loop is the actual request
- if timer != 0 and stype == "repair":
- self.log("WARNING: Resubmitted a non-idempotent repair")
- time.sleep(SLEEP_TIME)
- timer += SLEEP_TIME
-
- def get_single_pg_stats(self, pgid):
- """
- Return pg for the pgid specified.
- """
- all_stats = self.get_pg_stats()
-
- for pg in all_stats:
- if pg['pgid'] == pgid:
- return pg
-
- return None
-
- def get_object_pg_with_shard(self, pool, name, osdid):
-        """
-        Return the pg that the named object maps to on the given osd,
-        with a shard suffix appended for erasure coded pools.
-        """
- pool_dump = self.get_pool_dump(pool)
- object_map = self.get_object_map(pool, name)
- if pool_dump["type"] == CephManager.ERASURE_CODED_POOL:
- shard = object_map['acting'].index(osdid)
- return "{pgid}s{shard}".format(pgid=object_map['pgid'],
- shard=shard)
- else:
- return object_map['pgid']
-
- def get_object_primary(self, pool, name):
-        """
-        Return the acting primary osd for the named object.
-        """
- object_map = self.get_object_map(pool, name)
- return object_map['acting_primary']
-
- def get_object_map(self, pool, name):
- """
- osd map --format=json converted to a python object
- :returns: the python object
- """
- out = self.raw_cluster_cmd('--format=json', 'osd', 'map', pool, name)
- return json.loads('\n'.join(out.split('\n')[1:]))
-
- def get_osd_dump_json(self):
- """
- osd dump --format=json converted to a python object
- :returns: the python object
- """
- out = self.raw_cluster_cmd('osd', 'dump', '--format=json')
- return json.loads('\n'.join(out.split('\n')[1:]))
-
- def get_osd_dump(self):
- """
- Dump osds
- :returns: all osds
- """
- return self.get_osd_dump_json()['osds']
-
- def get_stuck_pgs(self, type_, threshold):
- """
- :returns: stuck pg information from the cluster
- """
- out = self.raw_cluster_cmd('pg', 'dump_stuck', type_, str(threshold),
- '--format=json')
- return json.loads(out)
-
- def get_num_unfound_objects(self):
- """
- Check cluster status to get the number of unfound objects
- """
- status = self.raw_cluster_status()
- self.log(status)
- return status['pgmap'].get('unfound_objects', 0)
-
- def get_num_creating(self):
- """
- Find the number of pgs in creating mode.
- """
- pgs = self.get_pg_stats()
- num = 0
- for pg in pgs:
- if 'creating' in pg['state']:
- num += 1
- return num
-
- def get_num_active_clean(self):
- """
- Find the number of active and clean pgs.
- """
- pgs = self.get_pg_stats()
- num = 0
- for pg in pgs:
- if (pg['state'].count('active') and
- pg['state'].count('clean') and
- not pg['state'].count('stale')):
- num += 1
- return num
-
- def get_num_active_recovered(self):
- """
- Find the number of active and recovered pgs.
- """
- pgs = self.get_pg_stats()
- num = 0
- for pg in pgs:
- if (pg['state'].count('active') and
- not pg['state'].count('recover') and
- not pg['state'].count('backfill') and
- not pg['state'].count('stale')):
- num += 1
- return num
-
- def get_is_making_recovery_progress(self):
- """
-        Return whether there is recovery progress discernible in the
- raw cluster status
- """
- status = self.raw_cluster_status()
- kps = status['pgmap'].get('recovering_keys_per_sec', 0)
- bps = status['pgmap'].get('recovering_bytes_per_sec', 0)
- ops = status['pgmap'].get('recovering_objects_per_sec', 0)
- return kps > 0 or bps > 0 or ops > 0
-
- def get_num_active(self):
- """
- Find the number of active pgs.
- """
- pgs = self.get_pg_stats()
- num = 0
- for pg in pgs:
- if pg['state'].count('active') and not pg['state'].count('stale'):
- num += 1
- return num
-
- def get_num_down(self):
- """
- Find the number of pgs that are down.
- """
- pgs = self.get_pg_stats()
- num = 0
- for pg in pgs:
- if ((pg['state'].count('down') and not
- pg['state'].count('stale')) or
- (pg['state'].count('incomplete') and not
- pg['state'].count('stale'))):
- num += 1
- return num
-
- def get_num_active_down(self):
- """
- Find the number of pgs that are either active or down.
- """
- pgs = self.get_pg_stats()
- num = 0
- for pg in pgs:
- if ((pg['state'].count('active') and not
- pg['state'].count('stale')) or
- (pg['state'].count('down') and not
- pg['state'].count('stale')) or
- (pg['state'].count('incomplete') and not
- pg['state'].count('stale'))):
- num += 1
- return num
-
- def is_clean(self):
- """
- True if all pgs are clean
- """
- return self.get_num_active_clean() == self.get_num_pgs()
-
- def is_recovered(self):
- """
- True if all pgs have recovered
- """
- return self.get_num_active_recovered() == self.get_num_pgs()
-
- def is_active_or_down(self):
- """
- True if all pgs are active or down
- """
- return self.get_num_active_down() == self.get_num_pgs()
-
- def wait_for_clean(self, timeout=None):
- """
-        Wait until all pgs are clean.
- """
- self.log("waiting for clean")
- start = time.time()
- num_active_clean = self.get_num_active_clean()
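-        # The timeout is restarted whenever recovery progress is observed or
-        # the number of active+clean pgs changes, so it only fires when the
-        # cluster appears genuinely stuck.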
- while not self.is_clean():
- if timeout is not None:
- if self.get_is_making_recovery_progress():
- self.log("making progress, resetting timeout")
- start = time.time()
- else:
- self.log("no progress seen, keeping timeout for now")
- if time.time() - start >= timeout:
- self.log('dumping pgs')
- out = self.raw_cluster_cmd('pg', 'dump')
- self.log(out)
- assert time.time() - start < timeout, \
- 'failed to become clean before timeout expired'
- cur_active_clean = self.get_num_active_clean()
- if cur_active_clean != num_active_clean:
- start = time.time()
- num_active_clean = cur_active_clean
- time.sleep(3)
- self.log("clean!")
-
- def are_all_osds_up(self):
- """
- Returns true if all osds are up.
- """
- x = self.get_osd_dump()
- return (len(x) == sum([(y['up'] > 0) for y in x]))
-
- def wait_for_all_up(self, timeout=None):
- """
- When this exits, either the timeout has expired, or all
- osds are up.
- """
- self.log("waiting for all up")
- start = time.time()
- while not self.are_all_osds_up():
- if timeout is not None:
- assert time.time() - start < timeout, \
- 'timeout expired in wait_for_all_up'
- time.sleep(3)
- self.log("all up!")
-
- def wait_for_recovery(self, timeout=None):
- """
-        Check peering. When this exits, we have recovered.
- """
- self.log("waiting for recovery to complete")
- start = time.time()
- num_active_recovered = self.get_num_active_recovered()
- while not self.is_recovered():
- now = time.time()
- if timeout is not None:
- if self.get_is_making_recovery_progress():
- self.log("making progress, resetting timeout")
- start = time.time()
- else:
- self.log("no progress seen, keeping timeout for now")
- if now - start >= timeout:
- self.log('dumping pgs')
- out = self.raw_cluster_cmd('pg', 'dump')
- self.log(out)
- assert now - start < timeout, \
- 'failed to recover before timeout expired'
- cur_active_recovered = self.get_num_active_recovered()
- if cur_active_recovered != num_active_recovered:
- start = time.time()
- num_active_recovered = cur_active_recovered
- time.sleep(3)
- self.log("recovered!")
-
- def wait_for_active(self, timeout=None):
- """
-        Check peering. When this exits, we are definitely active
- """
- self.log("waiting for peering to complete")
- start = time.time()
- num_active = self.get_num_active()
- while not self.is_active():
- if timeout is not None:
- if time.time() - start >= timeout:
- self.log('dumping pgs')
- out = self.raw_cluster_cmd('pg', 'dump')
- self.log(out)
- assert time.time() - start < timeout, \
- 'failed to recover before timeout expired'
- cur_active = self.get_num_active()
- if cur_active != num_active:
- start = time.time()
- num_active = cur_active
- time.sleep(3)
- self.log("active!")
-
- def wait_for_active_or_down(self, timeout=None):
- """
-        Check peering. When this exits, we are definitely either
- active or down
- """
- self.log("waiting for peering to complete or become blocked")
- start = time.time()
- num_active_down = self.get_num_active_down()
- while not self.is_active_or_down():
- if timeout is not None:
- if time.time() - start >= timeout:
- self.log('dumping pgs')
- out = self.raw_cluster_cmd('pg', 'dump')
- self.log(out)
- assert time.time() - start < timeout, \
- 'failed to recover before timeout expired'
- cur_active_down = self.get_num_active_down()
- if cur_active_down != num_active_down:
- start = time.time()
- num_active_down = cur_active_down
- time.sleep(3)
- self.log("active or down!")
-
- def osd_is_up(self, osd):
- """
- Wrapper for osd check
- """
- osds = self.get_osd_dump()
- return osds[osd]['up'] > 0
-
- def wait_till_osd_is_up(self, osd, timeout=None):
- """
- Loop waiting for osd.
- """
- self.log('waiting for osd.%d to be up' % osd)
- start = time.time()
- while not self.osd_is_up(osd):
- if timeout is not None:
- assert time.time() - start < timeout, \
- 'osd.%d failed to come up before timeout expired' % osd
- time.sleep(3)
- self.log('osd.%d is up' % osd)
-
- def is_active(self):
- """
- Wrapper to check if all pgs are active
- """
- return self.get_num_active() == self.get_num_pgs()
-
- def wait_till_active(self, timeout=None):
- """
- Wait until all pgs are active.
- """
- self.log("waiting till active")
- start = time.time()
- while not self.is_active():
- if timeout is not None:
- if time.time() - start >= timeout:
- self.log('dumping pgs')
- out = self.raw_cluster_cmd('pg', 'dump')
- self.log(out)
- assert time.time() - start < timeout, \
- 'failed to become active before timeout expired'
- time.sleep(3)
- self.log("active!")
-
- def mark_out_osd(self, osd):
- """
- Wrapper to mark osd out.
- """
- self.raw_cluster_cmd('osd', 'out', str(osd))
-
- def kill_osd(self, osd):
- """
- Kill osds by either power cycling (if indicated by the config)
- or by stopping.
- """
- if self.config.get('powercycle'):
- remote = self.find_remote('osd', osd)
- self.log('kill_osd on osd.{o} '
- 'doing powercycle of {s}'.format(o=osd, s=remote.name))
- self._assert_ipmi(remote)
- remote.console.power_off()
- elif self.config.get('bdev_inject_crash') and self.config.get('bdev_inject_crash_probability'):
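-            # With the configured probability, inject a crash into the osd's
-            # block device layer and expect the daemon to die on its own;
-            # otherwise fall back to a plain stop.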
- if random.uniform(0, 1) < self.config.get('bdev_inject_crash_probability', .5):
- self.raw_cluster_cmd(
- '--', 'tell', 'osd.%d' % osd,
- 'injectargs',
- '--bdev-inject-crash %d' % self.config.get('bdev_inject_crash'),
- )
- try:
- self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait()
- except:
- pass
- else:
- raise RuntimeError('osd.%s did not fail' % osd)
- else:
- self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
- else:
- self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
-
- @staticmethod
- def _assert_ipmi(remote):
- assert remote.console.has_ipmi_credentials, (
- "powercycling requested but RemoteConsole is not "
- "initialized. Check ipmi config.")
-
- def blackhole_kill_osd(self, osd):
- """
- Stop osd if nothing else works.
- """
- self.raw_cluster_cmd('--', 'tell', 'osd.%d' % osd,
- 'injectargs',
- '--objectstore-blackhole')
- time.sleep(2)
- self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
-
- def revive_osd(self, osd, timeout=150, skip_admin_check=False):
- """
- Revive osds by either power cycling (if indicated by the config)
- or by restarting.
- """
- if self.config.get('powercycle'):
- remote = self.find_remote('osd', osd)
-            self.log('revive_osd on osd.{o} doing powercycle of {s}'.
- format(o=osd, s=remote.name))
- self._assert_ipmi(remote)
- remote.console.power_on()
- if not remote.console.check_status(300):
- raise Exception('Failed to revive osd.{o} via ipmi'.
- format(o=osd))
- teuthology.reconnect(self.ctx, 60, [remote])
- mount_osd_data(self.ctx, remote, self.cluster, str(osd))
- self.make_admin_daemon_dir(remote)
- self.ctx.daemons.get_daemon('osd', osd, self.cluster).reset()
- self.ctx.daemons.get_daemon('osd', osd, self.cluster).restart()
-
- if not skip_admin_check:
- # wait for dump_ops_in_flight; this command doesn't appear
- # until after the signal handler is installed and it is safe
- # to stop the osd again without making valgrind leak checks
- # unhappy. see #5924.
- self.wait_run_admin_socket('osd', osd,
- args=['dump_ops_in_flight'],
- timeout=timeout, stdout=DEVNULL)
-
- def mark_down_osd(self, osd):
- """
- Cluster command wrapper
- """
- self.raw_cluster_cmd('osd', 'down', str(osd))
-
- def mark_in_osd(self, osd):
- """
- Cluster command wrapper
- """
- self.raw_cluster_cmd('osd', 'in', str(osd))
-
- def signal_osd(self, osd, sig, silent=False):
- """
- Wrapper to local get_daemon call which sends the given
- signal to the given osd.
- """
- self.ctx.daemons.get_daemon('osd', osd,
- self.cluster).signal(sig, silent=silent)
-
- ## monitors
- def signal_mon(self, mon, sig, silent=False):
- """
-        Wrapper to local get_daemon call
- """
- self.ctx.daemons.get_daemon('mon', mon,
- self.cluster).signal(sig, silent=silent)
-
- def kill_mon(self, mon):
- """
- Kill the monitor by either power cycling (if the config says so),
- or by doing a stop.
- """
- if self.config.get('powercycle'):
- remote = self.find_remote('mon', mon)
- self.log('kill_mon on mon.{m} doing powercycle of {s}'.
- format(m=mon, s=remote.name))
- self._assert_ipmi(remote)
- remote.console.power_off()
- else:
- self.ctx.daemons.get_daemon('mon', mon, self.cluster).stop()
-
- def revive_mon(self, mon):
- """
- Restart by either power cycling (if the config says so),
- or by doing a normal restart.
- """
- if self.config.get('powercycle'):
- remote = self.find_remote('mon', mon)
- self.log('revive_mon on mon.{m} doing powercycle of {s}'.
- format(m=mon, s=remote.name))
- self._assert_ipmi(remote)
- remote.console.power_on()
- self.make_admin_daemon_dir(remote)
- self.ctx.daemons.get_daemon('mon', mon, self.cluster).restart()
-
- def get_mon_status(self, mon):
- """
- Extract all the monitor status information from the cluster
- """
- addr = self.ctx.ceph[self.cluster].conf['mon.%s' % mon]['mon addr']
- out = self.raw_cluster_cmd('-m', addr, 'mon_status')
- return json.loads(out)
-
- def get_mon_quorum(self):
- """
- Extract monitor quorum information from the cluster
- """
- out = self.raw_cluster_cmd('quorum_status')
- j = json.loads(out)
- self.log('quorum_status is %s' % out)
- return j['quorum']
-
- def wait_for_mon_quorum_size(self, size, timeout=300):
- """
- Loop until quorum size is reached.
- """
- self.log('waiting for quorum size %d' % size)
- start = time.time()
- while not len(self.get_mon_quorum()) == size:
- if timeout is not None:
- assert time.time() - start < timeout, \
- ('failed to reach quorum size %d '
- 'before timeout expired' % size)
- time.sleep(3)
- self.log("quorum is size %d" % size)
-
- def get_mon_health(self, debug=False):
- """
- Extract all the monitor health information.
- """
- out = self.raw_cluster_cmd('health', '--format=json')
- if debug:
- self.log('health:\n{h}'.format(h=out))
- return json.loads(out)
-
- def get_mds_status(self, mds):
- """
- Run cluster commands for the mds in order to get mds information
- """
- out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
- j = json.loads(' '.join(out.splitlines()[1:]))
- # collate; for dup ids, larger gid wins.
- for info in j['info'].itervalues():
- if info['name'] == mds:
- return info
- return None
-
- def get_filepath(self):
- """
- Return path to osd data with {id} needing to be replaced
- """
- return '/var/lib/ceph/osd/' + self.cluster + '-{id}'
-
- def make_admin_daemon_dir(self, remote):
- """
- Create /var/run/ceph directory on remote site.
-
- :param ctx: Context
- :param remote: Remote site
- """
- remote.run(args=['sudo',
- 'install', '-d', '-m0777', '--', '/var/run/ceph', ], )
-
-
-def utility_task(name):
- """
- Generate ceph_manager subtask corresponding to ceph_manager
- method name
- """
- def task(ctx, config):
- if config is None:
- config = {}
- args = config.get('args', [])
- kwargs = config.get('kwargs', {})
- cluster = config.get('cluster', 'ceph')
- fn = getattr(ctx.managers[cluster], name)
- fn(*args, **kwargs)
- return task
-
-revive_osd = utility_task("revive_osd")
-revive_mon = utility_task("revive_mon")
-kill_osd = utility_task("kill_osd")
-kill_mon = utility_task("kill_mon")
-create_pool = utility_task("create_pool")
-remove_pool = utility_task("remove_pool")
-wait_for_clean = utility_task("wait_for_clean")
-set_pool_property = utility_task("set_pool_property")
-do_pg_scrub = utility_task("do_pg_scrub")
+++ /dev/null
-"""
-ceph_objectstore_tool - Simple test of ceph-objectstore-tool utility
-"""
-from cStringIO import StringIO
-import contextlib
-import logging
-import ceph_manager
-from teuthology import misc as teuthology
-import time
-import os
-import string
-from teuthology.orchestra import run
-import sys
-import tempfile
-import json
-from util.rados import (rados, create_replicated_pool, create_ec_pool)
-# from util.rados import (rados, create_ec_pool,
-# create_replicated_pool,
-# create_cache_pool)
-
-log = logging.getLogger(__name__)
-
-# Should get cluster name "ceph" from somewhere
-# and normal path from osd_data and osd_journal in conf
-FSPATH = "/var/lib/ceph/osd/ceph-{id}"
-JPATH = "/var/lib/ceph/osd/ceph-{id}/journal"
-
-
-def cod_setup_local_data(log, ctx, NUM_OBJECTS, DATADIR,
- BASE_NAME, DATALINECOUNT):
- objects = range(1, NUM_OBJECTS + 1)
- for i in objects:
- NAME = BASE_NAME + "{num}".format(num=i)
- LOCALNAME = os.path.join(DATADIR, NAME)
-
- dataline = range(DATALINECOUNT)
- fd = open(LOCALNAME, "w")
- data = "This is the data for " + NAME + "\n"
- for _ in dataline:
- fd.write(data)
- fd.close()
-
-
-def cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR,
- BASE_NAME, DATALINECOUNT):
-
- objects = range(1, NUM_OBJECTS + 1)
- for i in objects:
- NAME = BASE_NAME + "{num}".format(num=i)
- DDNAME = os.path.join(DATADIR, NAME)
-
- remote.run(args=['rm', '-f', DDNAME])
-
- dataline = range(DATALINECOUNT)
- data = "This is the data for " + NAME + "\n"
- DATA = ""
- for _ in dataline:
- DATA += data
- teuthology.write_file(remote, DDNAME, DATA)
-
-
-def cod_setup(log, ctx, remote, NUM_OBJECTS, DATADIR,
- BASE_NAME, DATALINECOUNT, POOL, db, ec):
- ERRORS = 0
- log.info("Creating {objs} objects in pool".format(objs=NUM_OBJECTS))
-
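-    # For every object: 'rados put' the data file and attach xattrs; for
-    # replicated pools also set an omap header (skipped on the first object)
-    # and omap keys.  Expected values are recorded in db for verification.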
- objects = range(1, NUM_OBJECTS + 1)
- for i in objects:
- NAME = BASE_NAME + "{num}".format(num=i)
- DDNAME = os.path.join(DATADIR, NAME)
-
- proc = rados(ctx, remote, ['-p', POOL, 'put', NAME, DDNAME],
- wait=False)
- # proc = remote.run(args=['rados', '-p', POOL, 'put', NAME, DDNAME])
- ret = proc.wait()
- if ret != 0:
- log.critical("Rados put failed with status {ret}".
- format(ret=proc.exitstatus))
- sys.exit(1)
-
- db[NAME] = {}
-
- keys = range(i)
- db[NAME]["xattr"] = {}
- for k in keys:
- if k == 0:
- continue
- mykey = "key{i}-{k}".format(i=i, k=k)
- myval = "val{i}-{k}".format(i=i, k=k)
- proc = remote.run(args=['rados', '-p', POOL, 'setxattr',
- NAME, mykey, myval])
- ret = proc.wait()
- if ret != 0:
- log.error("setxattr failed with {ret}".format(ret=ret))
- ERRORS += 1
- db[NAME]["xattr"][mykey] = myval
-
- # Erasure coded pools don't support omap
- if ec:
- continue
-
- # Create omap header in all objects but REPobject1
- if i != 1:
- myhdr = "hdr{i}".format(i=i)
- proc = remote.run(args=['rados', '-p', POOL, 'setomapheader',
- NAME, myhdr])
- ret = proc.wait()
- if ret != 0:
- log.critical("setomapheader failed with {ret}".format(ret=ret))
- ERRORS += 1
- db[NAME]["omapheader"] = myhdr
-
- db[NAME]["omap"] = {}
- for k in keys:
- if k == 0:
- continue
- mykey = "okey{i}-{k}".format(i=i, k=k)
- myval = "oval{i}-{k}".format(i=i, k=k)
- proc = remote.run(args=['rados', '-p', POOL, 'setomapval',
- NAME, mykey, myval])
- ret = proc.wait()
- if ret != 0:
- log.critical("setomapval failed with {ret}".format(ret=ret))
- db[NAME]["omap"][mykey] = myval
-
- return ERRORS
-
-
-def get_lines(filename):
- tmpfd = open(filename, "r")
- line = True
- lines = []
- while line:
- line = tmpfd.readline().rstrip('\n')
- if line:
- lines += [line]
- tmpfd.close()
- os.unlink(filename)
- return lines
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run ceph_objectstore_tool test
-
- The config should be as follows::
-
- ceph_objectstore_tool:
- objects: 20 # <number of objects>
- pgnum: 12
- """
-
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'ceph_objectstore_tool task only accepts a dict for configuration'
-
- log.info('Beginning ceph_objectstore_tool...')
-
- log.debug(config)
- log.debug(ctx)
- clients = ctx.cluster.only(teuthology.is_type('client'))
- assert len(clients.remotes) > 0, 'Must specify at least 1 client'
- (cli_remote, _) = clients.remotes.popitem()
- log.debug(cli_remote)
-
- # clients = dict(teuthology.get_clients(ctx=ctx, roles=config.keys()))
- # client = clients.popitem()
- # log.info(client)
- osds = ctx.cluster.only(teuthology.is_type('osd'))
- log.info("OSDS")
- log.info(osds)
- log.info(osds.remotes)
-
- manager = ctx.managers['ceph']
- while (len(manager.get_osd_status()['up']) !=
- len(manager.get_osd_status()['raw'])):
- time.sleep(10)
- while (len(manager.get_osd_status()['in']) !=
- len(manager.get_osd_status()['up'])):
- time.sleep(10)
- manager.raw_cluster_cmd('osd', 'set', 'noout')
- manager.raw_cluster_cmd('osd', 'set', 'nodown')
-
- PGNUM = config.get('pgnum', 12)
- log.info("pgnum: {num}".format(num=PGNUM))
-
- ERRORS = 0
-
- REP_POOL = "rep_pool"
- REP_NAME = "REPobject"
- create_replicated_pool(cli_remote, REP_POOL, PGNUM)
- ERRORS += test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME)
-
- EC_POOL = "ec_pool"
- EC_NAME = "ECobject"
- create_ec_pool(cli_remote, EC_POOL, 'default', PGNUM)
- ERRORS += test_objectstore(ctx, config, cli_remote,
- EC_POOL, EC_NAME, ec=True)
-
- if ERRORS == 0:
- log.info("TEST PASSED")
- else:
- log.error("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS))
-
- assert ERRORS == 0
-
- try:
- yield
- finally:
- log.info('Ending ceph_objectstore_tool')
-
-
-def test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME, ec=False):
- manager = ctx.managers['ceph']
-
- osds = ctx.cluster.only(teuthology.is_type('osd'))
-
- TEUTHDIR = teuthology.get_testdir(ctx)
- DATADIR = os.path.join(TEUTHDIR, "ceph.data")
- DATALINECOUNT = 10000
- ERRORS = 0
- NUM_OBJECTS = config.get('objects', 10)
- log.info("objects: {num}".format(num=NUM_OBJECTS))
-
- pool_dump = manager.get_pool_dump(REP_POOL)
- REPID = pool_dump['pool']
-
- log.debug("repid={num}".format(num=REPID))
-
- db = {}
-
- LOCALDIR = tempfile.mkdtemp("cod")
-
- cod_setup_local_data(log, ctx, NUM_OBJECTS, LOCALDIR,
- REP_NAME, DATALINECOUNT)
- allremote = []
- allremote.append(cli_remote)
- allremote += osds.remotes.keys()
- allremote = list(set(allremote))
- for remote in allremote:
- cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR,
- REP_NAME, DATALINECOUNT)
-
- ERRORS += cod_setup(log, ctx, cli_remote, NUM_OBJECTS, DATADIR,
- REP_NAME, DATALINECOUNT, REP_POOL, db, ec)
-
- pgs = {}
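-    # Map each osd id to the pgids of the test pool it holds; erasure coded
-    # pools get an explicit shard suffix ("<pgid>s<shard>") per acting osd.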
- for stats in manager.get_pg_stats():
- if stats["pgid"].find(str(REPID) + ".") != 0:
- continue
- if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL:
- for osd in stats["acting"]:
- pgs.setdefault(osd, []).append(stats["pgid"])
- elif pool_dump["type"] == ceph_manager.CephManager.ERASURE_CODED_POOL:
- shard = 0
- for osd in stats["acting"]:
- pgs.setdefault(osd, []).append("{pgid}s{shard}".
- format(pgid=stats["pgid"],
- shard=shard))
- shard += 1
- else:
- raise Exception("{pool} has an unexpected type {type}".
- format(pool=REP_POOL, type=pool_dump["type"]))
-
- log.info(pgs)
- log.info(db)
-
- for osd in manager.get_osd_status()['up']:
- manager.kill_osd(osd)
- time.sleep(5)
-
- pgswithobjects = set()
- objsinpg = {}
-
- # Test --op list and generate json for all objects
- log.info("Test --op list by generating json for all objects")
- prefix = ("sudo ceph-objectstore-tool "
- "--data-path {fpath} "
- "--journal-path {jpath} ").format(fpath=FSPATH, jpath=JPATH)
- for remote in osds.remotes.iterkeys():
- log.debug(remote)
- log.debug(osds.remotes[remote])
- for role in osds.remotes[remote]:
- if string.find(role, "osd.") != 0:
- continue
- osdid = int(role.split('.')[1])
- log.info("process osd.{id} on {remote}".
- format(id=osdid, remote=remote))
- cmd = (prefix + "--op list").format(id=osdid)
- proc = remote.run(args=cmd.split(), check_status=False,
- stdout=StringIO())
- if proc.exitstatus != 0:
- log.error("Bad exit status {ret} from --op list request".
- format(ret=proc.exitstatus))
- ERRORS += 1
- else:
- for pgline in proc.stdout.getvalue().splitlines():
- if not pgline:
- continue
- (pg, obj) = json.loads(pgline)
- name = obj['oid']
- if name in db:
- pgswithobjects.add(pg)
- objsinpg.setdefault(pg, []).append(name)
- db[name].setdefault("pg2json",
- {})[pg] = json.dumps(obj)
-
- log.info(db)
- log.info(pgswithobjects)
- log.info(objsinpg)
-
- if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL:
- # Test get-bytes
- log.info("Test get-bytes and set-bytes")
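-        # For each object, on every osd holding it: read the bytes back and
-        # diff against the local reference file, overwrite them with
-        # set-bytes, verify via 'get-bytes -', then restore the original data.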
- for basename in db.keys():
- file = os.path.join(DATADIR, basename)
- GETNAME = os.path.join(DATADIR, "get")
- SETNAME = os.path.join(DATADIR, "set")
-
- for remote in osds.remotes.iterkeys():
- for role in osds.remotes[remote]:
- if string.find(role, "osd.") != 0:
- continue
- osdid = int(role.split('.')[1])
- if osdid not in pgs:
- continue
-
- for pg, JSON in db[basename]["pg2json"].iteritems():
- if pg in pgs[osdid]:
- cmd = ((prefix + "--pgid {pg}").
- format(id=osdid, pg=pg).split())
- cmd.append(run.Raw("'{json}'".format(json=JSON)))
- cmd += ("get-bytes {fname}".
- format(fname=GETNAME).split())
- proc = remote.run(args=cmd, check_status=False)
- if proc.exitstatus != 0:
- remote.run(args="rm -f {getfile}".
- format(getfile=GETNAME).split())
- log.error("Bad exit status {ret}".
- format(ret=proc.exitstatus))
- ERRORS += 1
- continue
- cmd = ("diff -q {file} {getfile}".
- format(file=file, getfile=GETNAME))
- proc = remote.run(args=cmd.split())
- if proc.exitstatus != 0:
- log.error("Data from get-bytes differ")
- # log.debug("Got:")
- # cat_file(logging.DEBUG, GETNAME)
- # log.debug("Expected:")
- # cat_file(logging.DEBUG, file)
- ERRORS += 1
- remote.run(args="rm -f {getfile}".
- format(getfile=GETNAME).split())
-
- data = ("put-bytes going into {file}\n".
- format(file=file))
- teuthology.write_file(remote, SETNAME, data)
- cmd = ((prefix + "--pgid {pg}").
- format(id=osdid, pg=pg).split())
- cmd.append(run.Raw("'{json}'".format(json=JSON)))
- cmd += ("set-bytes {fname}".
- format(fname=SETNAME).split())
- proc = remote.run(args=cmd, check_status=False)
- proc.wait()
- if proc.exitstatus != 0:
- log.info("set-bytes failed for object {obj} "
- "in pg {pg} osd.{id} ret={ret}".
- format(obj=basename, pg=pg,
- id=osdid, ret=proc.exitstatus))
- ERRORS += 1
-
- cmd = ((prefix + "--pgid {pg}").
- format(id=osdid, pg=pg).split())
- cmd.append(run.Raw("'{json}'".format(json=JSON)))
- cmd += "get-bytes -".split()
- proc = remote.run(args=cmd, check_status=False,
- stdout=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- log.error("get-bytes after "
- "set-bytes ret={ret}".
- format(ret=proc.exitstatus))
- ERRORS += 1
- else:
- if data != proc.stdout.getvalue():
- log.error("Data inconsistent after "
- "set-bytes, got:")
- log.error(proc.stdout.getvalue())
- ERRORS += 1
-
- cmd = ((prefix + "--pgid {pg}").
- format(id=osdid, pg=pg).split())
- cmd.append(run.Raw("'{json}'".format(json=JSON)))
- cmd += ("set-bytes {fname}".
- format(fname=file).split())
- proc = remote.run(args=cmd, check_status=False)
- proc.wait()
- if proc.exitstatus != 0:
- log.info("set-bytes failed for object {obj} "
- "in pg {pg} osd.{id} ret={ret}".
- format(obj=basename, pg=pg,
- id=osdid, ret=proc.exitstatus))
- ERRORS += 1
-
- log.info("Test list-attrs get-attr")
- for basename in db.keys():
- file = os.path.join(DATADIR, basename)
- GETNAME = os.path.join(DATADIR, "get")
- SETNAME = os.path.join(DATADIR, "set")
-
- for remote in osds.remotes.iterkeys():
- for role in osds.remotes[remote]:
- if string.find(role, "osd.") != 0:
- continue
- osdid = int(role.split('.')[1])
- if osdid not in pgs:
- continue
-
- for pg, JSON in db[basename]["pg2json"].iteritems():
- if pg in pgs[osdid]:
- cmd = ((prefix + "--pgid {pg}").
- format(id=osdid, pg=pg).split())
- cmd.append(run.Raw("'{json}'".format(json=JSON)))
- cmd += ["list-attrs"]
- proc = remote.run(args=cmd, check_status=False,
- stdout=StringIO(), stderr=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- log.error("Bad exit status {ret}".
- format(ret=proc.exitstatus))
- ERRORS += 1
- continue
- keys = proc.stdout.getvalue().split()
- values = dict(db[basename]["xattr"])
-
- for key in keys:
- if (key == "_" or
- key == "snapset" or
- key == "hinfo_key"):
- continue
- key = key.strip("_")
- if key not in values:
- log.error("The key {key} should be present".
- format(key=key))
- ERRORS += 1
- continue
- exp = values.pop(key)
- cmd = ((prefix + "--pgid {pg}").
- format(id=osdid, pg=pg).split())
- cmd.append(run.Raw("'{json}'".format(json=JSON)))
- cmd += ("get-attr {key}".
- format(key="_" + key).split())
- proc = remote.run(args=cmd, check_status=False,
- stdout=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- log.error("get-attr failed with {ret}".
- format(ret=proc.exitstatus))
- ERRORS += 1
- continue
- val = proc.stdout.getvalue()
- if exp != val:
- log.error("For key {key} got value {got} "
- "instead of {expected}".
- format(key=key, got=val,
- expected=exp))
- ERRORS += 1
- if "hinfo_key" in keys:
- cmd_prefix = prefix.format(id=osdid)
- cmd = """
- expected=$({prefix} --pgid {pg} '{json}' get-attr {key} | base64)
- echo placeholder | {prefix} --pgid {pg} '{json}' set-attr {key} -
- test $({prefix} --pgid {pg} '{json}' get-attr {key}) = placeholder
- echo $expected | base64 --decode | \
- {prefix} --pgid {pg} '{json}' set-attr {key} -
- test $({prefix} --pgid {pg} '{json}' get-attr {key} | base64) = $expected
- """.format(prefix=cmd_prefix, pg=pg, json=JSON,
- key="hinfo_key")
- log.debug(cmd)
- proc = remote.run(args=['bash', '-e', '-x',
- '-c', cmd],
- check_status=False,
- stdout=StringIO(),
- stderr=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- log.error("failed with " +
- str(proc.exitstatus))
- log.error(proc.stdout.getvalue() + " " +
- proc.stderr.getvalue())
- ERRORS += 1
-
- if len(values) != 0:
- log.error("Not all keys found, remaining keys:")
- log.error(values)
-
- log.info("Test pg info")
- for remote in osds.remotes.iterkeys():
- for role in osds.remotes[remote]:
- if string.find(role, "osd.") != 0:
- continue
- osdid = int(role.split('.')[1])
- if osdid not in pgs:
- continue
-
- for pg in pgs[osdid]:
- cmd = ((prefix + "--op info --pgid {pg}").
- format(id=osdid, pg=pg).split())
- proc = remote.run(args=cmd, check_status=False,
- stdout=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- log.error("Failure of --op info command with {ret}".
-                              format(ret=proc.exitstatus))
- ERRORS += 1
- continue
- info = proc.stdout.getvalue()
- if not str(pg) in info:
- log.error("Bad data from info: {info}".format(info=info))
- ERRORS += 1
-
- log.info("Test pg logging")
- for remote in osds.remotes.iterkeys():
- for role in osds.remotes[remote]:
- if string.find(role, "osd.") != 0:
- continue
- osdid = int(role.split('.')[1])
- if osdid not in pgs:
- continue
-
- for pg in pgs[osdid]:
- cmd = ((prefix + "--op log --pgid {pg}").
- format(id=osdid, pg=pg).split())
- proc = remote.run(args=cmd, check_status=False,
- stdout=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- log.error("Getting log failed for pg {pg} "
- "from osd.{id} with {ret}".
- format(pg=pg, id=osdid, ret=proc.exitstatus))
- ERRORS += 1
- continue
- HASOBJ = pg in pgswithobjects
- MODOBJ = "modify" in proc.stdout.getvalue()
- if HASOBJ != MODOBJ:
- log.error("Bad log for pg {pg} from osd.{id}".
- format(pg=pg, id=osdid))
- MSG = (HASOBJ and [""] or ["NOT "])[0]
- log.error("Log should {msg}have a modify entry".
- format(msg=MSG))
- ERRORS += 1
-
- log.info("Test pg export")
- EXP_ERRORS = 0
- for remote in osds.remotes.iterkeys():
- for role in osds.remotes[remote]:
- if string.find(role, "osd.") != 0:
- continue
- osdid = int(role.split('.')[1])
- if osdid not in pgs:
- continue
-
- for pg in pgs[osdid]:
- fpath = os.path.join(DATADIR, "osd{id}.{pg}".
- format(id=osdid, pg=pg))
-
- cmd = ((prefix + "--op export --pgid {pg} --file {file}").
- format(id=osdid, pg=pg, file=fpath))
- proc = remote.run(args=cmd, check_status=False,
- stdout=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- log.error("Exporting failed for pg {pg} "
- "on osd.{id} with {ret}".
- format(pg=pg, id=osdid, ret=proc.exitstatus))
- EXP_ERRORS += 1
-
- ERRORS += EXP_ERRORS
-
- log.info("Test pg removal")
- RM_ERRORS = 0
- for remote in osds.remotes.iterkeys():
- for role in osds.remotes[remote]:
- if string.find(role, "osd.") != 0:
- continue
- osdid = int(role.split('.')[1])
- if osdid not in pgs:
- continue
-
- for pg in pgs[osdid]:
- cmd = ((prefix + "--op remove --pgid {pg}").
- format(pg=pg, id=osdid))
- proc = remote.run(args=cmd, check_status=False,
- stdout=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- log.error("Removing failed for pg {pg} "
- "on osd.{id} with {ret}".
- format(pg=pg, id=osdid, ret=proc.exitstatus))
- RM_ERRORS += 1
-
- ERRORS += RM_ERRORS
-
- IMP_ERRORS = 0
- if EXP_ERRORS == 0 and RM_ERRORS == 0:
- log.info("Test pg import")
-
- for remote in osds.remotes.iterkeys():
- for role in osds.remotes[remote]:
- if string.find(role, "osd.") != 0:
- continue
- osdid = int(role.split('.')[1])
- if osdid not in pgs:
- continue
-
- for pg in pgs[osdid]:
- fpath = os.path.join(DATADIR, "osd{id}.{pg}".
- format(id=osdid, pg=pg))
-
- cmd = ((prefix + "--op import --file {file}").
- format(id=osdid, file=fpath))
- proc = remote.run(args=cmd, check_status=False,
- stdout=StringIO())
- proc.wait()
- if proc.exitstatus != 0:
- log.error("Import failed from {file} with {ret}".
- format(file=fpath, ret=proc.exitstatus))
- IMP_ERRORS += 1
- else:
- log.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES")
-
- ERRORS += IMP_ERRORS
-
- if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
- log.info("Restarting OSDs....")
-        # They will still look to be up because nodown is set
- for osd in manager.get_osd_status()['up']:
- manager.revive_osd(osd)
- # Wait for health?
- time.sleep(5)
-        # Let a scrub after the test run verify consistency of all copies
- log.info("Verify replicated import data")
- objects = range(1, NUM_OBJECTS + 1)
- for i in objects:
- NAME = REP_NAME + "{num}".format(num=i)
- TESTNAME = os.path.join(DATADIR, "gettest")
- REFNAME = os.path.join(DATADIR, NAME)
-
- proc = rados(ctx, cli_remote,
- ['-p', REP_POOL, 'get', NAME, TESTNAME], wait=False)
-
- ret = proc.wait()
- if ret != 0:
- log.error("After import, rados get failed with {ret}".
- format(ret=proc.exitstatus))
- ERRORS += 1
- continue
-
- cmd = "diff -q {gettest} {ref}".format(gettest=TESTNAME,
- ref=REFNAME)
- proc = cli_remote.run(args=cmd, check_status=False)
- proc.wait()
- if proc.exitstatus != 0:
- log.error("Data comparison failed for {obj}".format(obj=NAME))
- ERRORS += 1
-
- return ERRORS
+++ /dev/null
-
-import unittest
-import time
-import logging
-
-from teuthology.orchestra.run import CommandFailedError
-
-log = logging.getLogger(__name__)
-
-
-class CephTestCase(unittest.TestCase):
- """
- For test tasks that want to define a structured set of
- tests implemented in python. Subclass this with appropriate
- helpers for the subsystem you're testing.
- """
-
- # Environment references
- mounts = None
- fs = None
- ceph_cluster = None
- mds_cluster = None
- mgr_cluster = None
- ctx = None
-
- mon_manager = None
-
- def setUp(self):
- self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
- "Starting test {0}".format(self.id()))
-
- def tearDown(self):
- self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
- "Ended test {0}".format(self.id()))
-
- def assert_cluster_log(self, expected_pattern, invert_match=False, timeout=10):
- """
-        Context manager. Assert that during execution, or within ``timeout``
-        seconds (plus a short grace period) afterwards, the Ceph cluster log
-        emits a message matching the expected pattern.
-
- :param expected_pattern: a string that you expect to see in the log output
- """
-
- ceph_manager = self.ceph_cluster.mon_manager
-
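-        # The returned context manager starts a background 'ceph -w' watcher
-        # on entry; on exit it waits briefly for the expected pattern to show
-        # up in the output (or to stay absent when invert_match is set).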
- class ContextManager(object):
- def match(self):
- found = expected_pattern in self.watcher_process.stdout.getvalue()
- if invert_match:
- return not found
-
- return found
-
- def __enter__(self):
- self.watcher_process = ceph_manager.run_ceph_w()
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- if not self.watcher_process.finished:
- # Check if we got an early match, wait a bit if we didn't
- if self.match():
- return
- else:
- log.debug("No log hits yet, waiting...")
- # Default monc tick interval is 10s, so wait that long and
- # then some grace
- time.sleep(5 + timeout)
-
- self.watcher_process.stdin.close()
- try:
- self.watcher_process.wait()
- except CommandFailedError:
- pass
-
- if not self.match():
- log.error("Log output: \n{0}\n".format(self.watcher_process.stdout.getvalue()))
- raise AssertionError("Expected log message not found: '{0}'".format(expected_pattern))
-
- return ContextManager()
-
- def wait_for_health(self, pattern, timeout):
- """
- Wait until 'ceph health' contains messages matching the pattern
- """
- def seen_health_warning():
- health = self.ceph_cluster.mon_manager.get_mon_health()
- summary_strings = [s['summary'] for s in health['summary']]
- if len(summary_strings) == 0:
-                log.debug("No health summary strings yet ({0})".format(summary_strings))
- return False
- else:
- for ss in summary_strings:
- if pattern in ss:
- return True
-
-                log.debug("Expected summary string not found yet ({0})".format(summary_strings))
- return False
-
- self.wait_until_true(seen_health_warning, timeout)
-
- def wait_for_health_clear(self, timeout):
- """
- Wait until `ceph health` returns no messages
- """
- def is_clear():
- health = self.ceph_cluster.mon_manager.get_mon_health()
- return len(health['summary']) == 0
-
- self.wait_until_true(is_clear, timeout)
-
- def wait_until_equal(self, get_fn, expect_val, timeout, reject_fn=None):
- period = 5
- elapsed = 0
- while True:
- val = get_fn()
- if val == expect_val:
- return
- elif reject_fn and reject_fn(val):
- raise RuntimeError("wait_until_equal: forbidden value {0} seen".format(val))
- else:
- if elapsed >= timeout:
- raise RuntimeError("Timed out after {0} seconds waiting for {1} (currently {2})".format(
- elapsed, expect_val, val
- ))
- else:
- log.debug("wait_until_equal: {0} != {1}, waiting...".format(val, expect_val))
- time.sleep(period)
- elapsed += period
-
- log.debug("wait_until_equal: success")
-
- def wait_until_true(self, condition, timeout):
- period = 5
- elapsed = 0
- while True:
- if condition():
- log.debug("wait_until_true: success in {0}s".format(elapsed))
- return
- else:
- if elapsed >= timeout:
- raise RuntimeError("Timed out after {0}s".format(elapsed))
- else:
- log.debug("wait_until_true: waiting...")
- time.sleep(period)
- elapsed += period
-
-
+++ /dev/null
-import json
-import logging
-from unittest import case
-from tasks.ceph_test_case import CephTestCase
-import os
-import re
-from StringIO import StringIO
-
-from tasks.cephfs.fuse_mount import FuseMount
-
-from teuthology.orchestra import run
-from teuthology.orchestra.run import CommandFailedError
-
-
-log = logging.getLogger(__name__)
-
-
-def for_teuthology(f):
- """
- Decorator that adds an "is_for_teuthology" attribute to the wrapped function
- """
- f.is_for_teuthology = True
- return f
-
-
-def needs_trimming(f):
- """
- Mark fn as requiring a client capable of trimming its cache (i.e. for ceph-fuse
- this means it needs to be able to run as root, currently)
- """
- f.needs_trimming = True
- return f
-
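-# Hypothetical usage sketch for the decorators above (class and test names are
-# illustrative only); a test runner can filter on the attributes they set:
-#
-#   class TestExample(CephFSTestCase):
-#       @for_teuthology
-#       def test_long_running(self):
-#           ...
-#
-#       @needs_trimming
-#       def test_client_cache_trim(self):
-#           ...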
-
-class CephFSTestCase(CephTestCase):
- """
-    Test case for CephFS; requires the caller to populate a Filesystem and Mounts
-    into the fs, mount_a and mount_b class attributes (setting mount_b is optional).
-
- Handles resetting the cluster under test between tests.
- """
-
- # FIXME weird explicit naming
- mount_a = None
- mount_b = None
-
- # Declarative test requirements: subclasses should override these to indicate
- # their special needs. If not met, tests will be skipped.
- CLIENTS_REQUIRED = 1
- MDSS_REQUIRED = 1
- REQUIRE_KCLIENT_REMOTE = False
- REQUIRE_ONE_CLIENT_REMOTE = False
- REQUIRE_MEMSTORE = False
-
- # Whether to create the default filesystem during setUp
- REQUIRE_FILESYSTEM = True
-
- LOAD_SETTINGS = []
-
- def setUp(self):
- super(CephFSTestCase, self).setUp()
-
- if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
- raise case.SkipTest("Only have {0} MDSs, require {1}".format(
- len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
- ))
-
- if len(self.mounts) < self.CLIENTS_REQUIRED:
- raise case.SkipTest("Only have {0} clients, require {1}".format(
- len(self.mounts), self.CLIENTS_REQUIRED
- ))
-
- if self.REQUIRE_KCLIENT_REMOTE:
- if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount):
- # kclient kill() power cycles nodes, so requires clients to each be on
- # their own node
- if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname:
- raise case.SkipTest("kclient clients must be on separate nodes")
-
- if self.REQUIRE_ONE_CLIENT_REMOTE:
- if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
- raise case.SkipTest("Require first client to be on separate server from MDSs")
-
- if self.REQUIRE_MEMSTORE:
- objectstore = self.mds_cluster.get_config("osd_objectstore", "osd")
- if objectstore != "memstore":
- # You certainly *could* run this on a real OSD, but you don't want to sit
- # here for hours waiting for the test to fill up a 1TB drive!
- raise case.SkipTest("Require `memstore` OSD backend to simulate full drives")
-
- # Unmount all surplus clients
- for i in range(self.CLIENTS_REQUIRED, len(self.mounts)):
- mount = self.mounts[i]
- log.info("Unmounting unneeded client {0}".format(mount.client_id))
- mount.umount_wait()
-
- # Create friendly mount_a, mount_b attrs
- for i in range(0, self.CLIENTS_REQUIRED):
- setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])
-
- self.mds_cluster.clear_firewall()
-
- # Unmount in order to start each test on a fresh mount, such
- # that test_barrier can have a firm expectation of what OSD
- # epoch the clients start with.
- if self.mount_a.is_mounted():
- self.mount_a.umount_wait()
-
- if self.mount_b:
- if self.mount_b.is_mounted():
- self.mount_b.umount_wait()
-
- # To avoid any issues with e.g. unlink bugs, we destroy and recreate
- # the filesystem rather than just doing a rm -rf of files
- self.mds_cluster.mds_stop()
- self.mds_cluster.delete_all_filesystems()
- self.fs = None # is now invalid!
-
-        # In case the previous filesystem had filled up the RADOS cluster, wait for
-        # the 'full' flag to clear.
- osd_mon_report_interval_max = int(self.mds_cluster.get_config("osd_mon_report_interval_max", service_type='osd'))
- self.wait_until_true(lambda: not self.mds_cluster.is_full(),
- timeout=osd_mon_report_interval_max * 5)
-
-        # In case anything is in the OSD blacklist, clear it out. This is to avoid
- # the OSD map changing in the background (due to blacklist expiry) while tests run.
- try:
- self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear")
- except CommandFailedError:
- # Fallback for older Ceph cluster
- blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
- "dump", "--format=json-pretty"))['blacklist']
- log.info("Removing {0} blacklist entries".format(len(blacklist)))
- for addr, blacklisted_at in blacklist.items():
- self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)
-
- client_mount_ids = [m.client_id for m in self.mounts]
- # In case the test changes the IDs of clients, stash them so that we can
- # reset in tearDown
- self._original_client_ids = client_mount_ids
- log.info(client_mount_ids)
-
- # In case there were any extra auth identities around from a previous
- # test, delete them
- for entry in self.auth_list():
- ent_type, ent_id = entry['entity'].split(".")
- if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin":
- self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])
-
- if self.REQUIRE_FILESYSTEM:
- self.fs = self.mds_cluster.newfs(True)
- self.fs.mds_restart()
-
- # In case some test messed with auth caps, reset them
- for client_id in client_mount_ids:
- self.mds_cluster.mon_manager.raw_cluster_cmd_result(
- 'auth', 'caps', "client.{0}".format(client_id),
- 'mds', 'allow',
- 'mon', 'allow r',
- 'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name()))
-
- # wait for mds restart to complete...
- self.fs.wait_for_daemons()
- if not self.mount_a.is_mounted():
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- if self.mount_b:
- if not self.mount_b.is_mounted():
- self.mount_b.mount()
- self.mount_b.wait_until_mounted()
-
-        # Load any config settings of interest
- for setting in self.LOAD_SETTINGS:
- setattr(self, setting, int(self.fs.mds_asok(
- ['config', 'get', setting], self.mds_cluster.mds_ids[0]
- )[setting]))
-
- self.configs_set = set()
-
- def tearDown(self):
- super(CephFSTestCase, self).tearDown()
-
- self.mds_cluster.clear_firewall()
- for m in self.mounts:
- m.teardown()
-
- for i, m in enumerate(self.mounts):
- m.client_id = self._original_client_ids[i]
-
- for subsys, key in self.configs_set:
- self.mds_cluster.clear_ceph_conf(subsys, key)
-
- def set_conf(self, subsys, key, value):
- self.configs_set.add((subsys, key))
- self.mds_cluster.set_ceph_conf(subsys, key, value)
-
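-    # Hypothetical usage sketch (the option shown is illustrative): settings made
-    # this way are recorded in configs_set and reverted by tearDown().
-    #
-    #   self.set_conf("mds", "mds_log_max_segments", "2")
-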
- def auth_list(self):
- """
- Convenience wrapper on "ceph auth list"
- """
- return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
- "auth", "list", "--format=json-pretty"
- ))['auth_dump']
-
- def assert_session_count(self, expected, ls_data=None, mds_id=None):
- if ls_data is None:
- ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)
-
- self.assertEqual(expected, len(ls_data), "Expected {0} sessions, found {1}".format(
- expected, len(ls_data)
- ))
-
- def assert_session_state(self, client_id, expected_state):
- self.assertEqual(
- self._session_by_id(
- self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
- expected_state)
-
- def get_session_data(self, client_id):
-        return self.get_session(client_id)
-
- def _session_list(self):
- ls_data = self.fs.mds_asok(['session', 'ls'])
- ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
- return ls_data
-
- def get_session(self, client_id, session_ls=None):
- if session_ls is None:
- session_ls = self.fs.mds_asok(['session', 'ls'])
-
- return self._session_by_id(session_ls)[client_id]
-
- def _session_by_id(self, session_ls):
- return dict([(s['id'], s) for s in session_ls])
-
- def wait_for_daemon_start(self, daemon_ids=None):
- """
- Wait until all the daemons appear in the FSMap, either assigned
- MDS ranks or in the list of standbys
- """
- def get_daemon_names():
- return [info['name'] for info in self.mds_cluster.status().get_all()]
-
- if daemon_ids is None:
- daemon_ids = self.mds_cluster.mds_ids
-
- try:
- self.wait_until_true(
- lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
- timeout=30
- )
- except RuntimeError:
- log.warn("Timeout waiting for daemons {0}, while we have {1}".format(
- daemon_ids, get_daemon_names()
- ))
- raise
-
- def assert_mds_crash(self, daemon_id):
- """
-        Assert that a particular MDS daemon crashes (block until
- it does)
- """
- try:
- self.mds_cluster.mds_daemons[daemon_id].proc.wait()
- except CommandFailedError as e:
- log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
- self.mds_cluster.mds_daemons[daemon_id].proc = None
-
- # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
- # catch it later and treat it as a failure.
- p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
- "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
- core_pattern = p.stdout.getvalue().strip()
- if os.path.dirname(core_pattern): # Non-default core_pattern with a directory in it
- # We have seen a core_pattern that looks like it's from teuthology's coredump
- # task, so proceed to clear out the core file
- log.info("Clearing core from pattern: {0}".format(core_pattern))
-
-                # Determine the PID of the crashed MDS by inspecting the MDSMap; it had
-                # to talk to the mons to get assigned a rank to reach the point of crashing
- addr = self.mds_cluster.mon_manager.get_mds_status(daemon_id)['addr']
- pid_str = addr.split("/")[1]
- log.info("Determined crasher PID was {0}".format(pid_str))
-
- # Substitute PID into core_pattern to get a glob
- core_glob = core_pattern.replace("%p", pid_str)
- core_glob = re.sub("%[a-z]", "*", core_glob) # Match all for all other % tokens
-
- # Verify that we see the expected single coredump matching the expected pattern
- ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
- "sudo", "ls", run.Raw(core_glob)
- ], stdout=StringIO())
- cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
- log.info("Enumerated cores: {0}".format(cores))
- self.assertEqual(len(cores), 1)
-
- log.info("Found core file {0}, deleting it".format(cores[0]))
-
- self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
- "sudo", "rm", "-f", cores[0]
- ])
- else:
- log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
-
- else:
- raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))
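-
-# Standalone sketch of the core_pattern -> glob translation performed in
-# assert_mds_crash above (illustrative pattern and PID, not captured from a real
-# node): %p is replaced with the crasher's PID and every other % token becomes a
-# wildcard.  Relies on this module's existing ``import re``.
-_EXAMPLE_CORE_PATTERN = "/var/crash/%e.%p.%t.core"
-_example_glob = re.sub("%[a-z]", "*", _EXAMPLE_CORE_PATTERN.replace("%p", "12345"))
-assert _example_glob == "/var/crash/*.12345.*.core"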
+++ /dev/null
-
-from StringIO import StringIO
-import json
-import logging
-from gevent import Greenlet
-import os
-import time
-import datetime
-import re
-import errno
-
-from teuthology.exceptions import CommandFailedError
-from teuthology import misc
-from teuthology.nuke import clear_firewall
-from teuthology.parallel import parallel
-from tasks.ceph_manager import write_conf
-from tasks import ceph_manager
-
-
-log = logging.getLogger(__name__)
-
-
-DAEMON_WAIT_TIMEOUT = 120
-ROOT_INO = 1
-
-
-class ObjectNotFound(Exception):
- def __init__(self, object_name):
- self._object_name = object_name
-
- def __str__(self):
- return "Object not found: '{0}'".format(self._object_name)
-
-class FSStatus(object):
- """
- Operations on a snapshot of the FSMap.
- """
- def __init__(self, mon_manager):
- self.mon = mon_manager
- self.map = json.loads(self.mon.raw_cluster_cmd("fs", "dump", "--format=json-pretty"))
-
- def __str__(self):
- return json.dumps(self.map, indent = 2, sort_keys = True)
-
- # Expose the fsmap for manual inspection.
- def __getitem__(self, key):
- """
- Get a field from the fsmap.
- """
- return self.map[key]
-
- def get_filesystems(self):
- """
- Iterator for all filesystems.
- """
- for fs in self.map['filesystems']:
- yield fs
-
- def get_all(self):
- """
- Iterator for all the mds_info components in the FSMap.
- """
- for info in self.get_standbys():
- yield info
- for fs in self.map['filesystems']:
- for info in fs['mdsmap']['info'].values():
- yield info
-
- def get_standbys(self):
- """
- Iterator for all standbys.
- """
- for info in self.map['standbys']:
- yield info
-
- def get_fsmap(self, fscid):
- """
- Get the fsmap for the given FSCID.
- """
- for fs in self.map['filesystems']:
- if fscid is None or fs['id'] == fscid:
- return fs
- raise RuntimeError("FSCID {0} not in map".format(fscid))
-
- def get_fsmap_byname(self, name):
- """
- Get the fsmap for the given file system name.
- """
- for fs in self.map['filesystems']:
- if name is None or fs['mdsmap']['fs_name'] == name:
- return fs
- raise RuntimeError("FS {0} not in map".format(name))
-
- def get_replays(self, fscid):
- """
- Get the standby:replay MDS for the given FSCID.
- """
- fs = self.get_fsmap(fscid)
- for info in fs['mdsmap']['info'].values():
- if info['state'] == 'up:standby-replay':
- yield info
-
- def get_ranks(self, fscid):
- """
- Get the ranks for the given FSCID.
- """
- fs = self.get_fsmap(fscid)
- for info in fs['mdsmap']['info'].values():
- if info['rank'] >= 0:
- yield info
-
- def get_rank(self, fscid, rank):
- """
- Get the rank for the given FSCID.
- """
- for info in self.get_ranks(fscid):
- if info['rank'] == rank:
- return info
- raise RuntimeError("FSCID {0} has no rank {1}".format(fscid, rank))
-
- def get_mds(self, name):
- """
- Get the info for the given MDS name.
- """
- for info in self.get_all():
- if info['name'] == name:
- return info
- return None
-
- def get_mds_addr(self, name):
- """
-        Return the instance addr as a string, like "10.214.133.138:6807/10825"
- """
- info = self.get_mds(name)
- if info:
- return info['addr']
- else:
- log.warn(json.dumps(list(self.get_all()), indent=2)) # dump for debugging
- raise RuntimeError("MDS id '{0}' not found in map".format(name))
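-
-# Standalone sketch of the filtering behind FSStatus.get_ranks above, applied to
-# a hand-written miniature 'info' table (illustrative data only, not a capture
-# from a real cluster): entries with rank >= 0 hold ranks, the rest do not.
-_example_info = {
-    'gid_4101': {'name': 'a', 'rank': 0},
-    'gid_4102': {'name': 'b', 'rank': -1},
-}
-assert [i['name'] for i in _example_info.values() if i['rank'] >= 0] == ['a']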
-
-class CephCluster(object):
- @property
- def admin_remote(self):
- first_mon = misc.get_first_mon(self._ctx, None)
- (result,) = self._ctx.cluster.only(first_mon).remotes.iterkeys()
- return result
-
- def __init__(self, ctx):
- self._ctx = ctx
- self.mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=ctx, logger=log.getChild('ceph_manager'))
-
- def get_config(self, key, service_type=None):
- """
- Get config from mon by default, or a specific service if caller asks for it
- """
- if service_type is None:
- service_type = 'mon'
-
- service_id = sorted(misc.all_roles_of_type(self._ctx.cluster, service_type))[0]
- return self.json_asok(['config', 'get', key], service_type, service_id)[key]
-
- def set_ceph_conf(self, subsys, key, value):
- if subsys not in self._ctx.ceph['ceph'].conf:
- self._ctx.ceph['ceph'].conf[subsys] = {}
- self._ctx.ceph['ceph'].conf[subsys][key] = value
- write_conf(self._ctx) # XXX because we don't have the ceph task's config object, if they
- # used a different config path this won't work.
-
- def clear_ceph_conf(self, subsys, key):
- del self._ctx.ceph['ceph'].conf[subsys][key]
- write_conf(self._ctx)
-
- def json_asok(self, command, service_type, service_id):
- proc = self.mon_manager.admin_socket(service_type, service_id, command)
- response_data = proc.stdout.getvalue()
- log.info("_json_asok output: {0}".format(response_data))
- if response_data.strip():
- return json.loads(response_data)
- else:
- return None
-
-
-class MDSCluster(CephCluster):
- """
- Collective operations on all the MDS daemons in the Ceph cluster. These
- daemons may be in use by various Filesystems.
-
- For the benefit of pre-multi-filesystem tests, this class is also
- a parent of Filesystem. The correct way to use MDSCluster going forward is
- as a separate instance outside of your (multiple) Filesystem instances.
- """
- def __init__(self, ctx):
- super(MDSCluster, self).__init__(ctx)
-
- self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
-
- if len(self.mds_ids) == 0:
- raise RuntimeError("This task requires at least one MDS")
-
- if hasattr(self._ctx, "daemons"):
- # Presence of 'daemons' attribute implies ceph task rather than ceph_deploy task
- self.mds_daemons = dict([(mds_id, self._ctx.daemons.get_daemon('mds', mds_id)) for mds_id in self.mds_ids])
-
- def _one_or_all(self, mds_id, cb, in_parallel=True):
- """
- Call a callback for a single named MDS, or for all.
-
-        Note that the parallelism here isn't for performance: it is so that we don't
-        go easy on the cluster by leaving an ssh-latency-sized pause between operations,
-        or by always executing them in the same order. However, some actions
-        don't cope with being done in parallel, so it's optional (`in_parallel`).
-
- :param mds_id: MDS daemon name, or None
- :param cb: Callback taking single argument of MDS daemon name
- :param in_parallel: whether to invoke callbacks concurrently (else one after the other)
- """
- if mds_id is None:
- if in_parallel:
- with parallel() as p:
- for mds_id in self.mds_ids:
- p.spawn(cb, mds_id)
- else:
- for mds_id in self.mds_ids:
- cb(mds_id)
- else:
- cb(mds_id)
-
- def mds_stop(self, mds_id=None):
- """
-        Stop the MDS daemon process(es). If it held a rank, that rank
- will eventually go laggy.
- """
- self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop())
-
- def mds_fail(self, mds_id=None):
- """
- Inform MDSMonitor of the death of the daemon process(es). If it held
- a rank, that rank will be relinquished.
- """
- self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_))
-
- def mds_restart(self, mds_id=None):
- self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart())
-
- def mds_fail_restart(self, mds_id=None):
- """
- Variation on restart that includes marking MDSs as failed, so that doing this
- operation followed by waiting for healthy daemon states guarantees that they
- have gone down and come up, rather than potentially seeing the healthy states
- that existed before the restart.
- """
- def _fail_restart(id_):
- self.mds_daemons[id_].stop()
- self.mon_manager.raw_cluster_cmd("mds", "fail", id_)
- self.mds_daemons[id_].restart()
-
- self._one_or_all(mds_id, _fail_restart)
-
- def newfs(self, name):
- return Filesystem(self._ctx, create=name)
-
- def status(self):
- return FSStatus(self.mon_manager)
-
- def delete_all_filesystems(self):
- """
- Remove all filesystems that exist, and any pools in use by them.
- """
- pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
- pool_id_name = {}
- for pool in pools:
- pool_id_name[pool['pool']] = pool['pool_name']
-
- # mark cluster down for each fs to prevent churn during deletion
- status = self.status()
- for fs in status.get_filesystems():
- self.mon_manager.raw_cluster_cmd("fs", "set", fs['mdsmap']['fs_name'], "cluster_down", "true")
-
- # get a new copy as actives may have since changed
- status = self.status()
- for fs in status.get_filesystems():
- mdsmap = fs['mdsmap']
- metadata_pool = pool_id_name[mdsmap['metadata_pool']]
-
- for gid in mdsmap['up'].values():
- self.mon_manager.raw_cluster_cmd('mds', 'fail', gid.__str__())
-
- self.mon_manager.raw_cluster_cmd('fs', 'rm', mdsmap['fs_name'], '--yes-i-really-mean-it')
- self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
- metadata_pool, metadata_pool,
- '--yes-i-really-really-mean-it')
- for data_pool in mdsmap['data_pools']:
- data_pool = pool_id_name[data_pool]
- self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
- data_pool, data_pool,
- '--yes-i-really-really-mean-it')
-
- def get_standby_daemons(self):
- return set([s['name'] for s in self.status().get_standbys()])
-
- def get_mds_hostnames(self):
- result = set()
- for mds_id in self.mds_ids:
- mds_remote = self.mon_manager.find_remote('mds', mds_id)
- result.add(mds_remote.hostname)
-
- return list(result)
-
- def set_clients_block(self, blocked, mds_id=None):
- """
- Block (using iptables) client communications to this MDS. Be careful: if
- other services are running on this MDS, or other MDSs try to talk to this
-        MDS, their communications may also be blocked as collateral damage.
-
- :param mds_id: Optional ID of MDS to block, default to all
- :return:
- """
- da_flag = "-A" if blocked else "-D"
-
- def set_block(_mds_id):
- remote = self.mon_manager.find_remote('mds', _mds_id)
- status = self.status()
-
- addr = status.get_mds_addr(_mds_id)
- ip_str, port_str, inst_str = re.match("(.+):(.+)/(.+)", addr).groups()
-
- remote.run(
- args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m",
- "comment", "--comment", "teuthology"])
- remote.run(
- args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m",
- "comment", "--comment", "teuthology"])
-
- self._one_or_all(mds_id, set_block, in_parallel=False)
-
- def clear_firewall(self):
- clear_firewall(self._ctx)
-
- def get_mds_info(self, mds_id):
- return FSStatus(self.mon_manager).get_mds(mds_id)
-
- def is_full(self):
- flags = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['flags']
- return 'full' in flags
-
- def is_pool_full(self, pool_name):
- pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
- for pool in pools:
- if pool['pool_name'] == pool_name:
- return 'full' in pool['flags_names'].split(",")
-
- raise RuntimeError("Pool not found '{0}'".format(pool_name))
-
-class Filesystem(MDSCluster):
- """
- This object is for driving a CephFS filesystem. The MDS daemons driven by
- MDSCluster may be shared with other Filesystems.
- """
- def __init__(self, ctx, fscid=None, create=None):
- super(Filesystem, self).__init__(ctx)
-
- self.id = None
- self.name = None
- self.metadata_pool_name = None
- self.data_pools = None
-
- client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
- self.client_id = client_list[0]
- self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1]
-
- if create is not None:
- if fscid is not None:
- raise RuntimeError("cannot specify fscid when creating fs")
- if create is True:
- self.name = 'cephfs'
- else:
- self.name = create
- if not self.legacy_configured():
- self.create()
- elif fscid is not None:
- self.id = fscid
- self.getinfo(refresh = True)
-
- # Stash a reference to the first created filesystem on ctx, so
- # that if someone drops to the interactive shell they can easily
- # poke our methods.
- if not hasattr(self._ctx, "filesystem"):
- self._ctx.filesystem = self
-
- def getinfo(self, refresh = False):
- status = self.status()
- if self.id is not None:
- fsmap = status.get_fsmap(self.id)
- elif self.name is not None:
- fsmap = status.get_fsmap_byname(self.name)
- else:
- fss = [fs for fs in status.get_filesystems()]
- if len(fss) == 1:
- fsmap = fss[0]
- elif len(fss) == 0:
- raise RuntimeError("no file system available")
- else:
- raise RuntimeError("more than one file system available")
- self.id = fsmap['id']
- self.name = fsmap['mdsmap']['fs_name']
- self.get_pool_names(status = status, refresh = refresh)
- return status
-
- def deactivate(self, rank):
- if rank < 0:
- raise RuntimeError("invalid rank")
- elif rank == 0:
- raise RuntimeError("cannot deactivate rank 0")
- self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))
-
- def set_max_mds(self, max_mds):
- self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", "%d" % max_mds)
-
- def get_pgs_per_fs_pool(self):
- """
- Calculate how many PGs to use when creating a pool, in order to avoid raising any
- health warnings about mon_pg_warn_min_per_osd
-
- :return: an integer number of PGs
- """
- pg_warn_min_per_osd = int(self.get_config('mon_pg_warn_min_per_osd'))
- osd_count = len(list(misc.all_roles_of_type(self._ctx.cluster, 'osd')))
- return pg_warn_min_per_osd * osd_count
-
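-    # Worked example of the sizing rule above (illustrative numbers, not taken
-    # from a real run): with mon_pg_warn_min_per_osd = 30 and 3 OSDs, each
-    # filesystem pool is created with 30 * 3 = 90 PGs.
-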
- def create(self):
- if self.name is None:
- self.name = "cephfs"
- if self.metadata_pool_name is None:
- self.metadata_pool_name = "{0}_metadata".format(self.name)
- data_pool_name = "{0}_data".format(self.name)
-
- log.info("Creating filesystem '{0}'".format(self.name))
-
- pgs_per_fs_pool = self.get_pgs_per_fs_pool()
-
- self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
- self.metadata_pool_name, pgs_per_fs_pool.__str__())
- self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
- data_pool_name, pgs_per_fs_pool.__str__())
- self.mon_manager.raw_cluster_cmd('fs', 'new',
- self.name, self.metadata_pool_name, data_pool_name)
-
- self.getinfo(refresh = True)
-
- def __del__(self):
- if getattr(self._ctx, "filesystem", None) == self:
- delattr(self._ctx, "filesystem")
-
- def exists(self):
- """
- Whether a filesystem exists in the mon's filesystem list
- """
- fs_list = json.loads(self.mon_manager.raw_cluster_cmd('fs', 'ls', '--format=json-pretty'))
- return self.name in [fs['name'] for fs in fs_list]
-
- def legacy_configured(self):
- """
- Check if a legacy (i.e. pre "fs new") filesystem configuration is present. If this is
- the case, the caller should avoid using Filesystem.create
- """
- try:
- out_text = self.mon_manager.raw_cluster_cmd('--format=json-pretty', 'osd', 'lspools')
- pools = json.loads(out_text)
- metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools]
- if metadata_pool_exists:
- self.metadata_pool_name = 'metadata'
- except CommandFailedError as e:
- # For use in upgrade tests, Ceph cuttlefish and earlier don't support
- # structured output (--format) from the CLI.
- if e.exitstatus == 22:
- metadata_pool_exists = True
- else:
- raise
-
- return metadata_pool_exists
-
- def _df(self):
- return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty"))
-
- def get_mds_map(self):
- return self.status().get_fsmap(self.id)['mdsmap']
-
- def add_data_pool(self, name):
- self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, self.get_pgs_per_fs_pool().__str__())
- self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name)
- self.get_pool_names(refresh = True)
- for poolid, fs_name in self.data_pools.items():
- if name == fs_name:
- return poolid
- raise RuntimeError("could not get just created pool '{0}'".format(name))
-
- def get_pool_names(self, refresh = False, status = None):
- if refresh or self.metadata_pool_name is None or self.data_pools is None:
- if status is None:
- status = self.status()
- fsmap = status.get_fsmap(self.id)
-
- osd_map = self.mon_manager.get_osd_dump_json()
- id_to_name = {}
- for p in osd_map['pools']:
- id_to_name[p['pool']] = p['pool_name']
-
- self.metadata_pool_name = id_to_name[fsmap['mdsmap']['metadata_pool']]
- self.data_pools = {}
- for data_pool in fsmap['mdsmap']['data_pools']:
- self.data_pools[data_pool] = id_to_name[data_pool]
-
- def get_data_pool_name(self, refresh = False):
- if refresh or self.data_pools is None:
- self.get_pool_names(refresh = True)
- assert(len(self.data_pools) == 1)
- return self.data_pools.values()[0]
-
- def get_data_pool_id(self, refresh = False):
- """
- Don't call this if you have multiple data pools
- :return: integer
- """
- if refresh or self.data_pools is None:
- self.get_pool_names(refresh = True)
- assert(len(self.data_pools) == 1)
- return self.data_pools.keys()[0]
-
- def get_data_pool_names(self, refresh = False):
- if refresh or self.data_pools is None:
- self.get_pool_names(refresh = True)
- return self.data_pools.values()
-
- def get_metadata_pool_name(self):
- return self.metadata_pool_name
-
- def get_namespace_id(self):
- return self.id
-
- def get_pool_df(self, pool_name):
- """
- Return a dict like:
- {u'bytes_used': 0, u'max_avail': 83848701, u'objects': 0, u'kb_used': 0}
- """
- for pool_df in self._df()['pools']:
- if pool_df['name'] == pool_name:
- return pool_df['stats']
-
- raise RuntimeError("Pool name '{0}' not found".format(pool_name))
-
- def get_usage(self):
- return self._df()['stats']['total_used_bytes']
-
- def are_daemons_healthy(self):
- """
- Return true if all daemons are in one of active, standby, standby-replay, and
- at least max_mds daemons are in 'active'.
-
- Unlike most of Filesystem, this function is tolerant of new-style `fs`
- commands being missing, because we are part of the ceph installation
- process during upgrade suites, so must fall back to old style commands
- when we get an EINVAL on a new style command.
-
- :return:
- """
-
- active_count = 0
- try:
- mds_map = self.get_mds_map()
- except CommandFailedError as cfe:
- # Old version, fall back to non-multi-fs commands
- if cfe.exitstatus == errno.EINVAL:
- mds_map = json.loads(
- self.mon_manager.raw_cluster_cmd('mds', 'dump', '--format=json'))
- else:
- raise
-
- log.info("are_daemons_healthy: mds map: {0}".format(mds_map))
-
- for mds_id, mds_status in mds_map['info'].items():
- if mds_status['state'] not in ["up:active", "up:standby", "up:standby-replay"]:
- log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state']))
- return False
- elif mds_status['state'] == 'up:active':
- active_count += 1
-
- log.info("are_daemons_healthy: {0}/{1}".format(
- active_count, mds_map['max_mds']
- ))
-
- if active_count >= mds_map['max_mds']:
- # The MDSMap says these guys are active, but let's check they really are
- for mds_id, mds_status in mds_map['info'].items():
- if mds_status['state'] == 'up:active':
- try:
- daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
- except CommandFailedError as cfe:
- if cfe.exitstatus == errno.EINVAL:
- # Old version, can't do this check
- continue
- else:
- # MDS not even running
- return False
-
- if daemon_status['state'] != 'up:active':
- # MDS hasn't taken the latest map yet
- return False
-
- return True
- else:
- return False
-
- def get_daemon_names(self, state=None):
- """
- Return MDS daemon names of those daemons in the given state
- :param state:
- :return:
- """
- status = self.get_mds_map()
- result = []
- for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
- if mds_status['state'] == state or state is None:
- result.append(mds_status['name'])
-
- return result
-
- def get_active_names(self):
- """
- Return MDS daemon names of those daemons holding ranks
- in state up:active
-
- :return: list of strings like ['a', 'b'], sorted by rank
- """
- return self.get_daemon_names("up:active")
-
- def get_all_mds_rank(self):
- status = self.get_mds_map()
- result = []
- for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
- if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
- result.append(mds_status['rank'])
-
- return result
-
- def get_rank_names(self):
- """
- Return MDS daemon names of those daemons holding a rank,
- sorted by rank. This includes e.g. up:replay/reconnect
- as well as active, but does not include standby or
- standby-replay.
- """
- status = self.get_mds_map()
- result = []
- for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
- if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
- result.append(mds_status['name'])
-
- return result
-
- def wait_for_daemons(self, timeout=None):
- """
- Wait until all daemons are healthy
- :return:
- """
-
- if timeout is None:
- timeout = DAEMON_WAIT_TIMEOUT
-
- elapsed = 0
- while True:
- if self.are_daemons_healthy():
- return
- else:
- time.sleep(1)
- elapsed += 1
-
- if elapsed > timeout:
- raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
-
- def get_lone_mds_id(self):
- """
- Get a single MDS ID: the only one if there is only one
- configured, else the only one currently holding a rank,
- else raise an error.
- """
- if len(self.mds_ids) != 1:
- alive = self.get_rank_names()
- if len(alive) == 1:
- return alive[0]
- else:
- raise ValueError("Explicit MDS argument required when multiple MDSs in use")
- else:
- return self.mds_ids[0]
-
- def recreate(self):
- log.info("Creating new filesystem")
- self.delete_all_filesystems()
- self.id = None
- self.create()
-
- def put_metadata_object_raw(self, object_id, infile):
- """
- Save an object to the metadata pool
- """
- temp_bin_path = infile
- self.client_remote.run(args=[
- 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'put', object_id, temp_bin_path
- ])
-
- def get_metadata_object_raw(self, object_id):
- """
- Retrieve an object from the metadata pool and store it in a file.
- """
- temp_bin_path = '/tmp/' + object_id + '.bin'
-
- self.client_remote.run(args=[
- 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path
- ])
-
- return temp_bin_path
-
- def get_metadata_object(self, object_type, object_id):
- """
- Retrieve an object from the metadata pool, pass it through
- ceph-dencoder to dump it to JSON, and return the decoded object.
- """
- temp_bin_path = '/tmp/out.bin'
-
- self.client_remote.run(args=[
- 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path
- ])
-
- stdout = StringIO()
- self.client_remote.run(args=[
- 'sudo', os.path.join(self._prefix, 'ceph-dencoder'), 'type', object_type, 'import', temp_bin_path, 'decode', 'dump_json'
- ], stdout=stdout)
- dump_json = stdout.getvalue().strip()
- try:
- dump = json.loads(dump_json)
- except (TypeError, ValueError):
- log.error("Failed to decode JSON: '{0}'".format(dump_json))
- raise
-
- return dump
-
- def get_journal_version(self):
- """
- Read the JournalPointer and Journal::Header objects to learn the version of
- encoding in use.
- """
- journal_pointer_object = '400.00000000'
- journal_pointer_dump = self.get_metadata_object("JournalPointer", journal_pointer_object)
- journal_ino = journal_pointer_dump['journal_pointer']['front']
-
- journal_header_object = "{0:x}.00000000".format(journal_ino)
- journal_header_dump = self.get_metadata_object('Journaler::Header', journal_header_object)
-
- version = journal_header_dump['journal_header']['stream_format']
- log.info("Read journal version {0}".format(version))
-
- return version
-
- def mds_asok(self, command, mds_id=None):
- if mds_id is None:
- mds_id = self.get_lone_mds_id()
-
- return self.json_asok(command, 'mds', mds_id)
-
- def read_cache(self, path, depth=None):
- cmd = ["dump", "tree", path]
- if depth is not None:
- cmd.append(depth.__str__())
- result = self.mds_asok(cmd)
- if len(result) == 0:
- raise RuntimeError("Path not found in cache: {0}".format(path))
-
- return result
-
- def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None):
- """
- Block until the MDS reaches a particular state, or a failure condition
- is met.
-
-        When there are multiple MDSs, succeed when exactly one MDS is in the
- goal state, or fail when any MDS is in the reject state.
-
- :param goal_state: Return once the MDS is in this state
- :param reject: Fail if the MDS enters this state before the goal state
- :param timeout: Fail if this many seconds pass before reaching goal
- :return: number of seconds waited, rounded down to integer
- """
-
- started_at = time.time()
- while True:
- status = self.status()
- if mds_id is not None:
- # mds_info is None if no daemon with this ID exists in the map
- mds_info = status.get_mds(mds_id)
- current_state = mds_info['state'] if mds_info else None
- log.info("Looked up MDS state for {0}: {1}".format(mds_id, current_state))
- else:
- # In general, look for a single MDS
- states = [m['state'] for m in status.get_ranks(self.id)]
- if [s for s in states if s == goal_state] == [goal_state]:
- current_state = goal_state
- elif reject in states:
- current_state = reject
- else:
- current_state = None
- log.info("mapped states {0} to {1}".format(states, current_state))
-
- elapsed = time.time() - started_at
- if current_state == goal_state:
- log.info("reached state '{0}' in {1}s".format(current_state, elapsed))
- return elapsed
- elif reject is not None and current_state == reject:
- raise RuntimeError("MDS in reject state {0}".format(current_state))
- elif timeout is not None and elapsed > timeout:
- log.error("MDS status at timeout: {0}".format(status.get_fsmap(self.id)))
- raise RuntimeError(
- "Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format(
- elapsed, goal_state, current_state
- ))
- else:
- time.sleep(1)
-
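-    # Hypothetical usage sketch (daemon name and timeout are illustrative):
-    #
-    #   self.wait_for_state("up:active", timeout=60, mds_id="a")
-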
- def _read_data_xattr(self, ino_no, xattr_name, type, pool):
- mds_id = self.mds_ids[0]
- remote = self.mds_daemons[mds_id].remote
- if pool is None:
- pool = self.get_data_pool_name()
-
- obj_name = "{0:x}.00000000".format(ino_no)
-
- args = [
- os.path.join(self._prefix, "rados"), "-p", pool, "getxattr", obj_name, xattr_name
- ]
- try:
- proc = remote.run(
- args=args,
- stdout=StringIO())
- except CommandFailedError as e:
- log.error(e.__str__())
- raise ObjectNotFound(obj_name)
-
- data = proc.stdout.getvalue()
-
- p = remote.run(
- args=[os.path.join(self._prefix, "ceph-dencoder"), "type", type, "import", "-", "decode", "dump_json"],
- stdout=StringIO(),
- stdin=data
- )
-
- return json.loads(p.stdout.getvalue().strip())
-
- def _write_data_xattr(self, ino_no, xattr_name, data, pool=None):
- """
- Write to an xattr of the 0th data object of an inode. Will
- succeed whether the object and/or xattr already exist or not.
-
- :param ino_no: integer inode number
- :param xattr_name: string name of the xattr
- :param data: byte array data to write to the xattr
- :param pool: name of data pool or None to use primary data pool
- :return: None
- """
- remote = self.mds_daemons[self.mds_ids[0]].remote
- if pool is None:
- pool = self.get_data_pool_name()
-
- obj_name = "{0:x}.00000000".format(ino_no)
- args = [
- os.path.join(self._prefix, "rados"), "-p", pool, "setxattr",
- obj_name, xattr_name, data
- ]
- remote.run(
- args=args,
- stdout=StringIO())
-
- def read_backtrace(self, ino_no, pool=None):
- """
- Read the backtrace from the data pool, return a dict in the format
- given by inode_backtrace_t::dump, which is something like:
-
- ::
-
- rados -p cephfs_data getxattr 10000000002.00000000 parent > out.bin
- ceph-dencoder type inode_backtrace_t import out.bin decode dump_json
-
- { "ino": 1099511627778,
- "ancestors": [
- { "dirino": 1,
- "dname": "blah",
- "version": 11}],
- "pool": 1,
- "old_pools": []}
-
- :param pool: name of pool to read backtrace from. If omitted, FS must have only
- one data pool and that will be used.
- """
- return self._read_data_xattr(ino_no, "parent", "inode_backtrace_t", pool)
-
- def read_layout(self, ino_no, pool=None):
- """
- Read 'layout' xattr of an inode and parse the result, returning a dict like:
- ::
- {
- "stripe_unit": 4194304,
- "stripe_count": 1,
- "object_size": 4194304,
- "pool_id": 1,
- "pool_ns": "",
- }
-
- :param pool: name of pool to read backtrace from. If omitted, FS must have only
- one data pool and that will be used.
- """
- return self._read_data_xattr(ino_no, "layout", "file_layout_t", pool)
-
- def _enumerate_data_objects(self, ino, size):
- """
- Get the list of expected data objects for a range, and the list of objects
- that really exist.
-
- :return a tuple of two lists of strings (expected, actual)
- """
- stripe_size = 1024 * 1024 * 4
-
- size = max(stripe_size, size)
-
- want_objects = [
- "{0:x}.{1:08x}".format(ino, n)
- for n in range(0, ((size - 1) / stripe_size) + 1)
- ]
-
- exist_objects = self.rados(["ls"], pool=self.get_data_pool_name()).split("\n")
-
- return want_objects, exist_objects
-
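-    # Worked example of the naming rule above (illustrative inode number): with
-    # the 4 MiB object size assumed here, an 8 MiB file with inode 0x10000000000
-    # maps to the two objects 10000000000.00000000 and 10000000000.00000001.
-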
- def data_objects_present(self, ino, size):
- """
- Check that *all* the expected data objects for an inode are present in the data pool
- """
-
- want_objects, exist_objects = self._enumerate_data_objects(ino, size)
- missing = set(want_objects) - set(exist_objects)
-
- if missing:
- log.info("Objects missing (ino {0}, size {1}): {2}".format(
- ino, size, missing
- ))
- return False
- else:
- log.info("All objects for ino {0} size {1} found".format(ino, size))
- return True
-
- def data_objects_absent(self, ino, size):
- want_objects, exist_objects = self._enumerate_data_objects(ino, size)
- present = set(want_objects) & set(exist_objects)
-
- if present:
- log.info("Objects not absent (ino {0}, size {1}): {2}".format(
- ino, size, present
- ))
- return False
- else:
- log.info("All objects for ino {0} size {1} are absent".format(ino, size))
- return True
-
- def rados(self, args, pool=None, namespace=None, stdin_data=None):
- """
- Call into the `rados` CLI from an MDS
- """
-
- if pool is None:
- pool = self.get_metadata_pool_name()
-
- # Doesn't matter which MDS we use to run rados commands, they all
- # have access to the pools
- mds_id = self.mds_ids[0]
- remote = self.mds_daemons[mds_id].remote
-
- # NB we could alternatively use librados pybindings for this, but it's a one-liner
- # using the `rados` CLI
- args = ([os.path.join(self._prefix, "rados"), "-p", pool] +
- (["--namespace", namespace] if namespace else []) +
- args)
- p = remote.run(
- args=args,
- stdin=stdin_data,
- stdout=StringIO())
- return p.stdout.getvalue().strip()
-
- def list_dirfrag(self, dir_ino):
- """
- Read the named object and return the list of omap keys
-
- :return a list of 0 or more strings
- """
-
- dirfrag_obj_name = "{0:x}.00000000".format(dir_ino)
-
- try:
- key_list_str = self.rados(["listomapkeys", dirfrag_obj_name])
- except CommandFailedError as e:
- log.error(e.__str__())
- raise ObjectNotFound(dirfrag_obj_name)
-
- return key_list_str.split("\n") if key_list_str else []
-
- def erase_metadata_objects(self, prefix):
- """
- For all objects in the metadata pool matching the prefix,
- erase them.
-
-        This is O(N) in the number of objects in the pool, so it is only suitable
- for use on toy test filesystems.
- """
- all_objects = self.rados(["ls"]).split("\n")
- matching_objects = [o for o in all_objects if o.startswith(prefix)]
- for o in matching_objects:
- self.rados(["rm", o])
-
- def erase_mds_objects(self, rank):
- """
- Erase all the per-MDS objects for a particular rank. This includes
- inotable, sessiontable, journal
- """
-
- def obj_prefix(multiplier):
- """
-            MDS object naming convention: e.g. rank 1's
-            journal objects are at 201.*
- """
- return "%x." % (multiplier * 0x100 + rank)
-
- # MDS_INO_LOG_OFFSET
- self.erase_metadata_objects(obj_prefix(2))
- # MDS_INO_LOG_BACKUP_OFFSET
- self.erase_metadata_objects(obj_prefix(3))
- # MDS_INO_LOG_POINTER_OFFSET
- self.erase_metadata_objects(obj_prefix(4))
- # MDSTables & SessionMap
- self.erase_metadata_objects("mds{rank:d}_".format(rank=rank))
-
- @property
- def _prefix(self):
- """
-        Override this to set a different path prefix for the Ceph binaries.
- """
- return ""
-
- def _run_tool(self, tool, args, rank=None, quiet=False):
- # Tests frequently have [client] configuration that jacks up
- # the objecter log level (unlikely to be interesting here)
- # and does not set the mds log level (very interesting here)
- if quiet:
- base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1']
- else:
- base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1']
-
- if rank is not None:
- base_args.extend(["--rank", "%d" % rank])
-
- t1 = datetime.datetime.now()
- r = self.tool_remote.run(
- args=base_args + args,
- stdout=StringIO()).stdout.getvalue().strip()
- duration = datetime.datetime.now() - t1
- log.info("Ran {0} in time {1}, result:\n{2}".format(
- base_args + args, duration, r
- ))
- return r
-
- @property
- def tool_remote(self):
- """
- An arbitrary remote to use when invoking recovery tools. Use an MDS host because
- it'll definitely have keys with perms to access cephfs metadata pool. This is public
- so that tests can use this remote to go get locally written output files from the tools.
- """
- mds_id = self.mds_ids[0]
- return self.mds_daemons[mds_id].remote
-
- def journal_tool(self, args, rank=None, quiet=False):
- """
- Invoke cephfs-journal-tool with the passed arguments, and return its stdout
- """
- return self._run_tool("cephfs-journal-tool", args, rank, quiet)
-
- def table_tool(self, args, quiet=False):
- """
- Invoke cephfs-table-tool with the passed arguments, and return its stdout
- """
- return self._run_tool("cephfs-table-tool", args, None, quiet)
-
- def data_scan(self, args, quiet=False, worker_count=1):
- """
- Invoke cephfs-data-scan with the passed arguments, and return its stdout
-
- :param worker_count: if greater than 1, multiple workers will be run
- in parallel and the return value will be None
- """
-
- workers = []
-
- for n in range(0, worker_count):
- if worker_count > 1:
- # data-scan args first token is a command, followed by args to it.
- # insert worker arguments after the command.
- cmd = args[0]
- worker_args = [cmd] + ["--worker_n", n.__str__(), "--worker_m", worker_count.__str__()] + args[1:]
- else:
- worker_args = args
-
- workers.append(Greenlet.spawn(lambda wargs=worker_args:
- self._run_tool("cephfs-data-scan", wargs, None, quiet)))
-
- for w in workers:
- w.get()
-
- if worker_count == 1:
- return workers[0].value
- else:
- return None
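-
-# Standalone sketch of the per-rank metadata object naming convention used by
-# erase_mds_objects above (illustrative ranks): rank r's journal objects are
-# named (0x200 + r).<offset>, so rank 1's journal lives at 201.*.
-def _example_log_prefix(rank):
-    return "%x." % (2 * 0x100 + rank)
-
-assert _example_log_prefix(0) == "200."
-assert _example_log_prefix(1) == "201."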
+++ /dev/null
-
-from StringIO import StringIO
-import json
-import time
-import logging
-from textwrap import dedent
-
-from teuthology import misc
-from teuthology.contextutil import MaxWhileTries
-from teuthology.orchestra import run
-from teuthology.orchestra.run import CommandFailedError
-from .mount import CephFSMount
-
-log = logging.getLogger(__name__)
-
-
-class FuseMount(CephFSMount):
- def __init__(self, client_config, test_dir, client_id, client_remote):
- super(FuseMount, self).__init__(test_dir, client_id, client_remote)
-
- self.client_config = client_config if client_config else {}
- self.fuse_daemon = None
- self._fuse_conn = None
-
- def mount(self, mount_path=None, mount_fs_name=None):
- log.info("Client client.%s config is %s" % (self.client_id, self.client_config))
-
- daemon_signal = 'kill'
- if self.client_config.get('coverage') or self.client_config.get('valgrind') is not None:
- daemon_signal = 'term'
-
- log.info('Mounting ceph-fuse client.{id} at {remote} {mnt}...'.format(
- id=self.client_id, remote=self.client_remote, mnt=self.mountpoint))
-
- self.client_remote.run(
- args=[
- 'mkdir',
- '--',
- self.mountpoint,
- ],
- )
-
- run_cmd = [
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=self.test_dir),
- 'daemon-helper',
- daemon_signal,
- ]
-
- fuse_cmd = ['ceph-fuse', "-f"]
-
- if mount_path is not None:
- fuse_cmd += ["--client_mountpoint={0}".format(mount_path)]
-
- if mount_fs_name is not None:
- fuse_cmd += ["--client_mds_namespace={0}".format(mount_fs_name)]
-
- fuse_cmd += [
- '--name', 'client.{id}'.format(id=self.client_id),
- # TODO ceph-fuse doesn't understand dash dash '--',
- self.mountpoint,
- ]
-
- if self.client_config.get('valgrind') is not None:
- run_cmd = misc.get_valgrind_args(
- self.test_dir,
- 'client.{id}'.format(id=self.client_id),
- run_cmd,
- self.client_config.get('valgrind'),
- )
-
- run_cmd.extend(fuse_cmd)
-
- def list_connections():
- self.client_remote.run(
- args=["sudo", "mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
- check_status=False
- )
- p = self.client_remote.run(
- args=["ls", "/sys/fs/fuse/connections"],
- stdout=StringIO(),
- check_status=False
- )
- if p.exitstatus != 0:
- return []
-
- ls_str = p.stdout.getvalue().strip()
- if ls_str:
- return [int(n) for n in ls_str.split("\n")]
- else:
- return []
-
- # Before starting ceph-fuse process, note the contents of
- # /sys/fs/fuse/connections
- pre_mount_conns = list_connections()
- log.info("Pre-mount connections: {0}".format(pre_mount_conns))
-
- proc = self.client_remote.run(
- args=run_cmd,
- logger=log.getChild('ceph-fuse.{id}'.format(id=self.client_id)),
- stdin=run.PIPE,
- wait=False,
- )
- self.fuse_daemon = proc
-
- # Wait for the connection reference to appear in /sys
- mount_wait = self.client_config.get('mount_wait', 0)
- if mount_wait > 0:
- log.info("Fuse mount waits {0} seconds before checking /sys/".format(mount_wait))
- time.sleep(mount_wait)
- timeout = int(self.client_config.get('mount_timeout', 30))
- waited = 0
-
- post_mount_conns = list_connections()
- while len(post_mount_conns) <= len(pre_mount_conns):
- if self.fuse_daemon.finished:
- # Did mount fail? Raise the CommandFailedError instead of
- # hitting the "failed to populate /sys/" timeout
- self.fuse_daemon.wait()
- time.sleep(1)
- waited += 1
- if waited > timeout:
- raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format(
- waited
- ))
- else:
- post_mount_conns = list_connections()
-
- log.info("Post-mount connections: {0}".format(post_mount_conns))
-
- # Record our fuse connection number so that we can use it when
- # forcing an unmount
- new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
- if len(new_conns) == 0:
- raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
- elif len(new_conns) > 1:
- raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
- else:
- self._fuse_conn = new_conns[0]
-
- def is_mounted(self):
- proc = self.client_remote.run(
- args=[
- 'stat',
- '--file-system',
- '--printf=%T\n',
- '--',
- self.mountpoint,
- ],
- stdout=StringIO(),
- stderr=StringIO(),
- wait=False
- )
- try:
- proc.wait()
- except CommandFailedError:
- if ("endpoint is not connected" in proc.stderr.getvalue()
- or "Software caused connection abort" in proc.stderr.getvalue()):
-                # This happens if fuse is killed without unmount
-                log.warn("Found stale mount point at {0}".format(self.mountpoint))
- return True
- else:
- # This happens if the mount directory doesn't exist
- log.info('mount point does not exist: %s', self.mountpoint)
- return False
-
- fstype = proc.stdout.getvalue().rstrip('\n')
- if fstype == 'fuseblk':
- log.info('ceph-fuse is mounted on %s', self.mountpoint)
- return True
- else:
- log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format(
- fstype=fstype))
- return False
-
- def wait_until_mounted(self):
- """
- Check to make sure that fuse is mounted on mountpoint. If not,
- sleep for 5 seconds and check again.
- """
-
- while not self.is_mounted():
- # Even if it's not mounted, it should at least
- # be running: catch simple failures where it has terminated.
- assert not self.fuse_daemon.poll()
-
- time.sleep(5)
-
- # Now that we're mounted, set permissions so that the rest of the test will have
- # unrestricted access to the filesystem mount.
- self.client_remote.run(
- args=['sudo', 'chmod', '1777', self.mountpoint])
-
- def _mountpoint_exists(self):
- return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False).exitstatus == 0
-
- def umount(self):
- try:
- log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name))
- self.client_remote.run(
- args=[
- 'sudo',
- 'fusermount',
- '-u',
- self.mountpoint,
- ],
- )
- except run.CommandFailedError:
- log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name))
-
- # abort the fuse mount, killing all hung processes
- if self._fuse_conn:
- self.run_python(dedent("""
- import os
- path = "/sys/fs/fuse/connections/{0}/abort"
- if os.path.exists(path):
- open(path, "w").write("1")
- """).format(self._fuse_conn))
- self._fuse_conn = None
-
- stderr = StringIO()
- try:
-            # make sure it's unmounted
- self.client_remote.run(
- args=[
- 'sudo',
- 'umount',
- '-l',
- '-f',
- self.mountpoint,
- ],
- stderr=stderr
- )
- except CommandFailedError:
- if self.is_mounted():
- raise
-
- assert not self.is_mounted()
- self._fuse_conn = None
-
- def umount_wait(self, force=False, require_clean=False):
- """
- :param force: Complete cleanly even if the MDS is offline
- """
- if force:
- assert not require_clean # mutually exclusive
-
- # When we expect to be forcing, kill the ceph-fuse process directly.
- # This should avoid hitting the more aggressive fallback killing
- # in umount() which can affect other mounts too.
- self.fuse_daemon.stdin.close()
-
- # However, we will still hit the aggressive wait if there is an ongoing
- # mount -o remount (especially if the remount is stuck because MDSs
- # are unavailable)
-
- self.umount()
-
- try:
- if self.fuse_daemon:
- # Permit a timeout, so that we do not block forever
- run.wait([self.fuse_daemon], 900)
- except MaxWhileTries:
-            log.error("process failed to terminate after unmount. This probably "
-                      "indicates a bug within ceph-fuse.")
- raise
- except CommandFailedError:
- if require_clean:
- raise
-
- self.cleanup()
-
- def cleanup(self):
- """
- Remove the mount point.
-
- Prerequisite: the client is not mounted.
- """
- stderr = StringIO()
- try:
- self.client_remote.run(
- args=[
- 'rmdir',
- '--',
- self.mountpoint,
- ],
- stderr=stderr
- )
- except CommandFailedError:
- if "No such file or directory" in stderr.getvalue():
- pass
- else:
- raise
-
- def kill(self):
- """
- Terminate the client without removing the mount point.
- """
- self.fuse_daemon.stdin.close()
- try:
- self.fuse_daemon.wait()
- except CommandFailedError:
- pass
-
- def kill_cleanup(self):
- """
- Follow up ``kill`` to get to a clean unmounted state.
- """
- self.umount()
- self.cleanup()
-
- def teardown(self):
- """
- Whatever the state of the mount, get it gone.
- """
- super(FuseMount, self).teardown()
-
- self.umount()
-
- if self.fuse_daemon and not self.fuse_daemon.finished:
- self.fuse_daemon.stdin.close()
- try:
- self.fuse_daemon.wait()
- except CommandFailedError:
- pass
-
- # Indiscriminate, unlike the touchier cleanup()
- self.client_remote.run(
- args=[
- 'rm',
- '-rf',
- self.mountpoint,
- ],
- )
-
- def _asok_path(self):
- return "/var/run/ceph/ceph-client.{0}.*.asok".format(self.client_id)
-
- @property
- def _prefix(self):
- return ""
-
- def admin_socket(self, args):
- pyscript = """
-import glob
-import re
-import os
-import subprocess
-
-def find_socket(client_name):
- asok_path = "{asok_path}"
- files = glob.glob(asok_path)
-
- # Given a non-glob path, it better be there
- if "*" not in asok_path:
- assert(len(files) == 1)
- return files[0]
-
- for f in files:
- pid = re.match(".*\.(\d+)\.asok$", f).group(1)
- if os.path.exists("/proc/{{0}}".format(pid)):
- return f
- raise RuntimeError("Client socket {{0}} not found".format(client_name))
-
-print find_socket("{client_name}")
-""".format(
- asok_path=self._asok_path(),
- client_name="client.{0}".format(self.client_id))
-
- # Find the admin socket
- p = self.client_remote.run(args=[
- 'python', '-c', pyscript
- ], stdout=StringIO())
- asok_path = p.stdout.getvalue().strip()
- log.info("Found client admin socket at {0}".format(asok_path))
-
- # Query client ID from admin socket
- p = self.client_remote.run(
- args=['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args,
- stdout=StringIO())
- return json.loads(p.stdout.getvalue())
-
- def get_global_id(self):
- """
- Look up the CephFS client ID for this mount
- """
-
- return self.admin_socket(['mds_sessions'])['id']
-
- def get_osd_epoch(self):
- """
- Return 2-tuple of osd_epoch, osd_epoch_barrier
- """
- status = self.admin_socket(['status'])
- return status['osd_epoch'], status['osd_epoch_barrier']
-
- def get_dentry_count(self):
- """
- Return 2-tuple of dentry_count, dentry_pinned_count
- """
- status = self.admin_socket(['status'])
- return status['dentry_count'], status['dentry_pinned_count']
-
- def set_cache_size(self, size):
- return self.admin_socket(['config', 'set', 'client_cache_size', str(size)])
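-
-# Standalone sketch of the PID extraction performed inside admin_socket() above
-# (the socket path shown is illustrative, not taken from a live client).
-import re as _re
-assert _re.match(r".*\.(\d+)\.asok$", "/var/run/ceph/ceph-client.0.12345.asok").group(1) == "12345"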
+++ /dev/null
-from StringIO import StringIO
-import json
-import logging
-from textwrap import dedent
-from teuthology.orchestra.run import CommandFailedError
-from teuthology import misc
-
-from teuthology.orchestra import remote as orchestra_remote
-from teuthology.orchestra import run
-from .mount import CephFSMount
-
-log = logging.getLogger(__name__)
-
-
-class KernelMount(CephFSMount):
- def __init__(self, mons, test_dir, client_id, client_remote,
- ipmi_user, ipmi_password, ipmi_domain):
- super(KernelMount, self).__init__(test_dir, client_id, client_remote)
- self.mons = mons
-
- self.mounted = False
- self.ipmi_user = ipmi_user
- self.ipmi_password = ipmi_password
- self.ipmi_domain = ipmi_domain
-
- def write_secret_file(self, remote, role, keyring, filename):
- """
- Stash the keyring in the filename specified.
- """
- remote.run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=self.test_dir),
- 'ceph-authtool',
- '--name={role}'.format(role=role),
- '--print-key',
- keyring,
- run.Raw('>'),
- filename,
- ],
- )
-
- def mount(self, mount_path=None, mount_fs_name=None):
- log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format(
- id=self.client_id, remote=self.client_remote, mnt=self.mountpoint))
-
- keyring = self.get_keyring_path()
- secret = '{tdir}/ceph.data/client.{id}.secret'.format(tdir=self.test_dir, id=self.client_id)
- self.write_secret_file(self.client_remote, 'client.{id}'.format(id=self.client_id),
- keyring, secret)
-
- self.client_remote.run(
- args=[
- 'mkdir',
- '--',
- self.mountpoint,
- ],
- )
-
- if mount_path is None:
- mount_path = "/"
-
- opts = 'name={id},secretfile={secret},norequire_active_mds'.format(id=self.client_id,
- secret=secret)
-
- if mount_fs_name is not None:
- opts += ",mds_namespace={0}".format(mount_fs_name)
-
- self.client_remote.run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=self.test_dir),
- '/sbin/mount.ceph',
- '{mons}:{mount_path}'.format(mons=','.join(self.mons), mount_path=mount_path),
- self.mountpoint,
- '-v',
- '-o',
- opts
- ],
- )
-
- self.client_remote.run(
- args=['sudo', 'chmod', '1777', self.mountpoint])
-
- self.mounted = True
-
- def umount(self):
- log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
- self.client_remote.run(
- args=[
- 'sudo',
- 'umount',
- self.mountpoint,
- ],
- )
- self.client_remote.run(
- args=[
- 'rmdir',
- '--',
- self.mountpoint,
- ],
- )
- self.mounted = False
-
- def cleanup(self):
- pass
-
- def umount_wait(self, force=False, require_clean=False):
- """
- Unlike the fuse client, the kernel client's umount is immediate
- """
- if not self.is_mounted():
- return
-
- try:
- self.umount()
- except CommandFailedError:
- if not force:
- raise
-
- self.kill()
- self.kill_cleanup()
-
- self.mounted = False
-
- def is_mounted(self):
- return self.mounted
-
- def wait_until_mounted(self):
- """
- Unlike the fuse client, the kernel client is up and running as soon
- as the initial mount() function returns.
- """
- assert self.mounted
-
- def teardown(self):
- super(KernelMount, self).teardown()
- if self.mounted:
- self.umount()
-
- def kill(self):
- """
- The Ceph kernel client doesn't have a mechanism to kill itself (doing
-        that inside the kernel would be weird anyway), so we reboot the whole node
- to get the same effect.
-
- We use IPMI to reboot, because we don't want the client to send any
- releases of capabilities.
- """
-
- con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
- self.ipmi_user,
- self.ipmi_password,
- self.ipmi_domain)
- con.power_off()
-
- self.mounted = False
-
- def kill_cleanup(self):
- assert not self.mounted
-
- con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
- self.ipmi_user,
- self.ipmi_password,
- self.ipmi_domain)
- con.power_on()
-
- # Wait for node to come back up after reboot
- misc.reconnect(None, 300, [self.client_remote])
-
- # Remove mount directory
- self.client_remote.run(
- args=[
- 'rmdir',
- '--',
- self.mountpoint,
- ],
- )
-
- def _find_debug_dir(self):
- """
- Find the debugfs folder for this mount
- """
- pyscript = dedent("""
- import glob
- import os
- import json
-
- def get_id_to_dir():
- result = {}
- for dir in glob.glob("/sys/kernel/debug/ceph/*"):
- mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines()
- client_id = mds_sessions_lines[1].split()[1].strip('"')
-
- result[client_id] = dir
- return result
-
- print json.dumps(get_id_to_dir())
- """)
-
- p = self.client_remote.run(args=[
- 'sudo', 'python', '-c', pyscript
- ], stdout=StringIO())
- client_id_to_dir = json.loads(p.stdout.getvalue())
-
- try:
- return client_id_to_dir[self.client_id]
- except KeyError:
- log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format(
- self.client_id, ",".join(client_id_to_dir.keys())
- ))
- raise
-
- def _read_debug_file(self, filename):
- debug_dir = self._find_debug_dir()
-
- pyscript = dedent("""
- import os
-
- print open(os.path.join("{debug_dir}", "{filename}")).read()
- """).format(debug_dir=debug_dir, filename=filename)
-
- p = self.client_remote.run(args=[
- 'sudo', 'python', '-c', pyscript
- ], stdout=StringIO())
- return p.stdout.getvalue()
-
- def get_global_id(self):
- """
- Look up the CephFS client ID for this mount, using debugfs.
- """
-
- assert self.mounted
-
- mds_sessions = self._read_debug_file("mds_sessions")
- lines = mds_sessions.split("\n")
- return int(lines[0].split()[1])
-
- def get_osd_epoch(self):
- """
- Return 2-tuple of osd_epoch, osd_epoch_barrier
- """
- osd_map = self._read_debug_file("osdmap")
- lines = osd_map.split("\n")
- epoch = int(lines[0].split()[1])
-
- mds_sessions = self._read_debug_file("mds_sessions")
- lines = mds_sessions.split("\n")
- epoch_barrier = int(lines[2].split()[1].strip('"'))
-
- return epoch, epoch_barrier
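An illustrative sketch of how a test drives this class, assuming kmount is an
already-constructed KernelMount for the test's client (none of these values come
from a real run):

    kmount.mount(mount_path="/")     # mount.ceph using the secretfile written above
    kmount.wait_until_mounted()      # returns immediately: kernel mounts are synchronous
    print kmount.get_global_id()     # parsed out of debugfs mds_sessions
    kmount.umount_wait()             # a plain umount; only kill() powers the node off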
+++ /dev/null
-from contextlib import contextmanager
-import json
-import logging
-import datetime
-import time
-from textwrap import dedent
-import os
-from StringIO import StringIO
-from teuthology.orchestra import run
-from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
-
-log = logging.getLogger(__name__)
-
-
-class CephFSMount(object):
- def __init__(self, test_dir, client_id, client_remote):
- """
- :param test_dir: Global teuthology test dir
- :param client_id: Client ID, the 'foo' in client.foo
- :param client_remote: Remote instance for the host where client will run
- """
-
- self.test_dir = test_dir
- self.client_id = client_id
- self.client_remote = client_remote
- self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id)
-
- self.test_files = ['a', 'b', 'c']
-
- self.background_procs = []
-
- @property
- def mountpoint(self):
- return os.path.join(
- self.test_dir, '{dir_name}'.format(dir_name=self.mountpoint_dir_name))
-
- def is_mounted(self):
- raise NotImplementedError()
-
- def mount(self, mount_path=None, mount_fs_name=None):
- raise NotImplementedError()
-
- def umount(self):
- raise NotImplementedError()
-
- def umount_wait(self, force=False, require_clean=False):
- """
-
-        :param force: Expect that the mount will not shut down cleanly: kill
- it hard.
- :param require_clean: Wait for the Ceph client associated with the
- mount (e.g. ceph-fuse) to terminate, and
- raise if it doesn't do so cleanly.
- :return:
- """
- raise NotImplementedError()
-
- def kill_cleanup(self):
- raise NotImplementedError()
-
- def kill(self):
- raise NotImplementedError()
-
- def cleanup(self):
- raise NotImplementedError()
-
- def wait_until_mounted(self):
- raise NotImplementedError()
-
- def get_keyring_path(self):
- return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)
-
- @property
- def config_path(self):
- """
- Path to ceph.conf: override this if you're not a normal systemwide ceph install
-        :return: string
- """
- return "/etc/ceph/ceph.conf"
-
- @contextmanager
- def mounted(self):
- """
-        A context manager: starting from an unmounted state, mount this,
-        yield, and then unmount and clean up on exit.
- """
- self.mount()
- self.wait_until_mounted()
- try:
- yield
- finally:
- self.umount_wait()
-
- def create_files(self):
- assert(self.is_mounted())
-
- for suffix in self.test_files:
- log.info("Creating file {0}".format(suffix))
- self.client_remote.run(args=[
- 'sudo', 'touch', os.path.join(self.mountpoint, suffix)
- ])
-
- def check_files(self):
- assert(self.is_mounted())
-
- for suffix in self.test_files:
- log.info("Checking file {0}".format(suffix))
- r = self.client_remote.run(args=[
- 'sudo', 'ls', os.path.join(self.mountpoint, suffix)
- ], check_status=False)
- if r.exitstatus != 0:
- raise RuntimeError("Expected file {0} not found".format(suffix))
-
- def create_destroy(self):
- assert(self.is_mounted())
-
- filename = "{0} {1}".format(datetime.datetime.now(), self.client_id)
- log.debug("Creating test file {0}".format(filename))
- self.client_remote.run(args=[
- 'sudo', 'touch', os.path.join(self.mountpoint, filename)
- ])
- log.debug("Deleting test file {0}".format(filename))
- self.client_remote.run(args=[
- 'sudo', 'rm', '-f', os.path.join(self.mountpoint, filename)
- ])
-
- def _run_python(self, pyscript):
- return self.client_remote.run(args=[
- 'sudo', 'adjust-ulimits', 'daemon-helper', 'kill', 'python', '-c', pyscript
- ], wait=False, stdin=run.PIPE, stdout=StringIO())
-
- def run_python(self, pyscript):
- p = self._run_python(pyscript)
- p.wait()
- return p.stdout.getvalue().strip()
-
- def run_shell(self, args, wait=True):
- args = ["cd", self.mountpoint, run.Raw('&&'), "sudo"] + args
- return self.client_remote.run(args=args, stdout=StringIO(), wait=wait)
-
- def open_no_data(self, basename):
- """
- A pure metadata operation
- """
- assert(self.is_mounted())
-
- path = os.path.join(self.mountpoint, basename)
-
- p = self._run_python(dedent(
- """
- f = open("{path}", 'w')
- """.format(path=path)
- ))
- p.wait()
-
- def open_background(self, basename="background_file"):
- """
- Open a file for writing, then block such that the client
- will hold a capability
- """
- assert(self.is_mounted())
-
- path = os.path.join(self.mountpoint, basename)
-
- pyscript = dedent("""
- import time
-
- f = open("{path}", 'w')
- f.write('content')
- f.flush()
- f.write('content2')
- while True:
- time.sleep(1)
- """).format(path=path)
-
- rproc = self._run_python(pyscript)
- self.background_procs.append(rproc)
- return rproc
-
- def wait_for_visible(self, basename="background_file", timeout=30):
- i = 0
- while i < timeout:
- r = self.client_remote.run(args=[
- 'sudo', 'ls', os.path.join(self.mountpoint, basename)
- ], check_status=False)
- if r.exitstatus == 0:
- log.debug("File {0} became visible from {1} after {2}s".format(
- basename, self.client_id, i))
- return
- else:
- time.sleep(1)
- i += 1
-
- raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
- i, basename, self.client_id))
-
- def lock_background(self, basename="background_file", do_flock=True):
- """
-        Open and lock files for writing; hold the locks in a background process
- """
- assert(self.is_mounted())
-
- path = os.path.join(self.mountpoint, basename)
-
- script_builder = """
- import time
- import fcntl
- import struct"""
- if do_flock:
- script_builder += """
- f1 = open("{path}-1", 'w')
- fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
- script_builder += """
- f2 = open("{path}-2", 'w')
- lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
- fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
- while True:
- time.sleep(1)
- """
-
- pyscript = dedent(script_builder).format(path=path)
-
- log.info("lock file {0}".format(basename))
- rproc = self._run_python(pyscript)
- self.background_procs.append(rproc)
- return rproc
-
- def check_filelock(self, basename="background_file", do_flock=True):
- assert(self.is_mounted())
-
- path = os.path.join(self.mountpoint, basename)
-
- script_builder = """
- import fcntl
- import errno
- import struct"""
- if do_flock:
- script_builder += """
- f1 = open("{path}-1", 'r')
- try:
- fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
- except IOError, e:
- if e.errno == errno.EAGAIN:
- pass
- else:
- raise RuntimeError("flock on file {path}-1 not found")"""
- script_builder += """
- f2 = open("{path}-2", 'r')
- try:
- lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
- fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
- except IOError, e:
- if e.errno == errno.EAGAIN:
- pass
- else:
- raise RuntimeError("posix lock on file {path}-2 not found")
- """
- pyscript = dedent(script_builder).format(path=path)
-
- log.info("check lock on file {0}".format(basename))
- self.client_remote.run(args=[
- 'sudo', 'python', '-c', pyscript
- ])
-
- def write_background(self, basename="background_file", loop=False):
- """
-        Open a file for writing; finish as soon as possible, or keep writing forever if loop is True
- :param basename:
- :return:
- """
- assert(self.is_mounted())
-
- path = os.path.join(self.mountpoint, basename)
-
- pyscript = dedent("""
- import os
- import time
-
- fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0644)
- try:
- while True:
- os.write(fd, 'content')
- time.sleep(1)
- if not {loop}:
- break
- except IOError, e:
- pass
- os.close(fd)
- """).format(path=path, loop=str(loop))
-
- rproc = self._run_python(pyscript)
- self.background_procs.append(rproc)
- return rproc
-
- def write_n_mb(self, filename, n_mb, seek=0, wait=True):
- """
- Write the requested number of megabytes to a file
- """
- assert(self.is_mounted())
-
- return self.run_shell(["dd", "if=/dev/urandom", "of={0}".format(filename),
- "bs=1M", "conv=fdatasync",
- "count={0}".format(n_mb),
- "seek={0}".format(seek)
- ], wait=wait)
-
- def write_test_pattern(self, filename, size):
- log.info("Writing {0} bytes to {1}".format(size, filename))
- return self.run_python(dedent("""
- import zlib
- path = "{path}"
- f = open(path, 'w')
- for i in range(0, {size}):
- val = zlib.crc32("%s" % i) & 7
- f.write(chr(val))
- f.close()
- """.format(
- path=os.path.join(self.mountpoint, filename),
- size=size
- )))
-
- def validate_test_pattern(self, filename, size):
- log.info("Validating {0} bytes from {1}".format(size, filename))
- return self.run_python(dedent("""
- import zlib
- path = "{path}"
- f = open(path, 'r')
- bytes = f.read()
- f.close()
- if len(bytes) != {size}:
- raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format(
- len(bytes), {size}
- ))
- for i, b in enumerate(bytes):
- val = zlib.crc32("%s" % i) & 7
- if b != chr(val):
- raise RuntimeError("Bad data at offset {{0}}".format(i))
- """.format(
- path=os.path.join(self.mountpoint, filename),
- size=size
- )))
-
- def open_n_background(self, fs_path, count):
- """
- Open N files for writing, hold them open in a background process
-
- :param fs_path: Path relative to CephFS root, e.g. "foo/bar"
- :return: a RemoteProcess
- """
- assert(self.is_mounted())
-
- abs_path = os.path.join(self.mountpoint, fs_path)
-
- pyscript = dedent("""
- import sys
- import time
- import os
-
- n = {count}
- abs_path = "{abs_path}"
-
- if not os.path.exists(os.path.dirname(abs_path)):
- os.makedirs(os.path.dirname(abs_path))
-
- handles = []
- for i in range(0, n):
- fname = "{{0}}_{{1}}".format(abs_path, i)
- handles.append(open(fname, 'w'))
-
- while True:
- time.sleep(1)
- """).format(abs_path=abs_path, count=count)
-
- rproc = self._run_python(pyscript)
- self.background_procs.append(rproc)
- return rproc
-
- def create_n_files(self, fs_path, count, sync=False):
- assert(self.is_mounted())
-
- abs_path = os.path.join(self.mountpoint, fs_path)
-
- pyscript = dedent("""
- import sys
- import time
- import os
-
- n = {count}
- abs_path = "{abs_path}"
-
- if not os.path.exists(os.path.dirname(abs_path)):
- os.makedirs(os.path.dirname(abs_path))
-
- for i in range(0, n):
- fname = "{{0}}_{{1}}".format(abs_path, i)
- h = open(fname, 'w')
- h.write('content')
- if {sync}:
- h.flush()
- os.fsync(h.fileno())
- h.close()
- """).format(abs_path=abs_path, count=count, sync=str(sync))
-
- self.run_python(pyscript)
-
- def teardown(self):
- for p in self.background_procs:
- log.info("Terminating background process")
- self._kill_background(p)
-
- self.background_procs = []
-
- def _kill_background(self, p):
- if p.stdin:
- p.stdin.close()
- try:
- p.wait()
- except (CommandFailedError, ConnectionLostError):
- pass
-
- def kill_background(self, p):
- """
- For a process that was returned by one of the _background member functions,
- kill it hard.
- """
- self._kill_background(p)
- self.background_procs.remove(p)
-
- def spam_dir_background(self, path):
- """
- Create directory `path` and do lots of metadata operations
- in it until further notice.
- """
- assert(self.is_mounted())
- abs_path = os.path.join(self.mountpoint, path)
-
- pyscript = dedent("""
- import sys
- import time
- import os
-
- abs_path = "{abs_path}"
-
- if not os.path.exists(abs_path):
- os.makedirs(abs_path)
-
- n = 0
- while True:
- file_path = os.path.join(abs_path, "tmp%d" % n)
- f = open(file_path, 'w')
- f.close()
- n = n + 1
- """).format(abs_path=abs_path)
-
- rproc = self._run_python(pyscript)
- self.background_procs.append(rproc)
- return rproc
-
- def get_global_id(self):
- raise NotImplementedError()
-
- def get_osd_epoch(self):
- raise NotImplementedError()
-
- def stat(self, fs_path, wait=True):
- """
- stat a file, and return the result as a dictionary like this:
- {
- "st_ctime": 1414161137.0,
- "st_mtime": 1414161137.0,
- "st_nlink": 33,
- "st_gid": 0,
- "st_dev": 16777218,
- "st_size": 1190,
- "st_ino": 2,
- "st_uid": 0,
- "st_mode": 16877,
- "st_atime": 1431520593.0
- }
-
- Raises exception on absent file.
- """
- abs_path = os.path.join(self.mountpoint, fs_path)
-
- pyscript = dedent("""
- import os
- import stat
- import json
- import sys
-
- try:
- s = os.stat("{path}")
- except OSError as e:
- sys.exit(e.errno)
-
- attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"]
- print json.dumps(
- dict([(a, getattr(s, a)) for a in attrs]),
- indent=2)
- """).format(path=abs_path)
- proc = self._run_python(pyscript)
- if wait:
- proc.wait()
- return json.loads(proc.stdout.getvalue().strip())
- else:
- return proc
-
- def touch(self, fs_path):
- """
- Create a dentry if it doesn't already exist. This python
- implementation exists because the usual command line tool doesn't
- pass through error codes like EIO.
-
- :param fs_path:
- :return:
- """
- abs_path = os.path.join(self.mountpoint, fs_path)
- pyscript = dedent("""
- import sys
- import errno
-
- try:
- f = open("{path}", "w")
- f.close()
- except IOError as e:
- sys.exit(errno.EIO)
- """).format(path=abs_path)
- proc = self._run_python(pyscript)
- proc.wait()
-
- def path_to_ino(self, fs_path, follow_symlinks=True):
- abs_path = os.path.join(self.mountpoint, fs_path)
-
- if follow_symlinks:
- pyscript = dedent("""
- import os
- import stat
-
- print os.stat("{path}").st_ino
- """).format(path=abs_path)
- else:
- pyscript = dedent("""
- import os
- import stat
-
- print os.lstat("{path}").st_ino
- """).format(path=abs_path)
-
- proc = self._run_python(pyscript)
- proc.wait()
- return int(proc.stdout.getvalue().strip())
-
- def path_to_nlink(self, fs_path):
- abs_path = os.path.join(self.mountpoint, fs_path)
-
- pyscript = dedent("""
- import os
- import stat
-
- print os.stat("{path}").st_nlink
- """).format(path=abs_path)
-
- proc = self._run_python(pyscript)
- proc.wait()
- return int(proc.stdout.getvalue().strip())
-
- def ls(self, path=None):
- """
- Wrap ls: return a list of strings
- """
- cmd = ["ls"]
- if path:
- cmd.append(path)
-
- ls_text = self.run_shell(cmd).stdout.getvalue().strip()
-
- if ls_text:
- return ls_text.split("\n")
- else:
- # Special case because otherwise split on empty string
- # gives you [''] instead of []
- return []
-
- def getfattr(self, path, attr):
- """
-        Wrap getfattr: return the value of a named xattr on one file.
-
- :return: a string
- """
- p = self.run_shell(["getfattr", "--only-values", "-n", attr, path])
- return p.stdout.getvalue()
-
- def df(self):
- """
- Wrap df: return a dict of usage fields in bytes
- """
-
- p = self.run_shell(["df", "-B1", "."])
- lines = p.stdout.getvalue().strip().split("\n")
- fs, total, used, avail = lines[1].split()[:4]
- log.warn(lines)
-
- return {
- "total": int(total),
- "used": int(used),
- "available": int(avail)
- }
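A short sketch of how these helpers compose in a test, where mount stands for
whichever CephFSMount subclass (FUSE or kernel) the test was given:

    with mount.mounted():                        # mount, wait, then always unmount on exit
        mount.run_shell(["mkdir", "subdir"])     # runs under sudo inside the mountpoint
        mount.write_n_mb("subdir/sixmegs", 6)    # dd from /dev/urandom with fdatasync
        ino = mount.path_to_ino("subdir/sixmegs")
        st = mount.stat("subdir/sixmegs")        # dict of st_* fields
        assert st["st_size"] == 6 * 1024 * 1024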
+++ /dev/null
-
-"""
-Exercise the MDS's auto repair functions
-"""
-
-import logging
-import time
-
-from teuthology.orchestra.run import CommandFailedError
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-
-
-log = logging.getLogger(__name__)
-
-
-# Arbitrary timeouts for operations involving restarting
-# an MDS or waiting for it to come up
-MDS_RESTART_GRACE = 60
-
-
-class TestMDSAutoRepair(CephFSTestCase):
- def test_backtrace_repair(self):
- """
- MDS should verify/fix backtrace on fetch dirfrag
- """
-
- self.mount_a.run_shell(["mkdir", "testdir1"])
- self.mount_a.run_shell(["touch", "testdir1/testfile"])
- dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino("testdir1"))
-
- # drop inodes caps
- self.mount_a.umount_wait()
-
- # flush journal entries to dirfrag objects, and expire journal
- self.fs.mds_asok(['flush', 'journal'])
-
- # Restart the MDS to drop the metadata cache (because we expired the journal,
- # nothing gets replayed into cache on restart)
- self.fs.mds_stop()
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- # remove testdir1's backtrace
- self.fs.rados(["rmxattr", dir_objname, "parent"])
-
- # readdir (fetch dirfrag) should fix testdir1's backtrace
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- self.mount_a.run_shell(["ls", "testdir1"])
-
- # flush journal entries to dirfrag objects
- self.fs.mds_asok(['flush', 'journal'])
-
- # check if backtrace exists
- self.fs.rados(["getxattr", dir_objname, "parent"])
-
- def test_mds_readonly(self):
- """
-        Test that the MDS behaves correctly when it is read-only
- """
-        # operations should succeed when the MDS is not read-only
- self.mount_a.run_shell(["touch", "test_file1"])
- writer = self.mount_a.write_background(loop=True)
-
- time.sleep(10)
- self.assertFalse(writer.finished)
-
- # force MDS to read-only mode
- self.fs.mds_asok(['force_readonly'])
- time.sleep(10)
-
- # touching test file should fail
- try:
- self.mount_a.run_shell(["touch", "test_file1"])
- except CommandFailedError:
- pass
- else:
- self.assertTrue(False)
-
- # background writer also should fail
- self.assertTrue(writer.finished)
-
- # The MDS should report its readonly health state to the mon
- self.wait_for_health("MDS in read-only mode", timeout=30)
-
- # restart mds to make it writable
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- self.wait_for_health_clear(timeout=30)
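The expected-failure check in test_mds_readonly could equally be expressed with
unittest's assertRaises, which states the intent more directly (a sketch, not the
code of this test):

    # touching the file must fail while the MDS is read-only
    with self.assertRaises(CommandFailedError):
        self.mount_a.run_shell(["touch", "test_file1"])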
+++ /dev/null
-
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-
-
-class TestBacktrace(CephFSTestCase):
- def test_backtrace(self):
- """
- That the 'parent' and 'layout' xattrs on the head objects of files
- are updated correctly.
- """
-
- old_data_pool_name = self.fs.get_data_pool_name()
- old_pool_id = self.fs.get_data_pool_id()
-
- # Create a file for subsequent checks
- self.mount_a.run_shell(["mkdir", "parent_a"])
- self.mount_a.run_shell(["touch", "parent_a/alpha"])
- file_ino = self.mount_a.path_to_ino("parent_a/alpha")
-
- # That backtrace and layout are written after initial flush
- self.fs.mds_asok(["flush", "journal"])
- backtrace = self.fs.read_backtrace(file_ino)
- self.assertEqual(['alpha', 'parent_a'], [a['dname'] for a in backtrace['ancestors']])
- layout = self.fs.read_layout(file_ino)
- self.assertDictEqual(layout, {
- "stripe_unit": 4194304,
- "stripe_count": 1,
- "object_size": 4194304,
- "pool_id": old_pool_id,
- "pool_ns": "",
- })
- self.assertEqual(backtrace['pool'], old_pool_id)
-
- # That backtrace is written after parentage changes
- self.mount_a.run_shell(["mkdir", "parent_b"])
- self.mount_a.run_shell(["mv", "parent_a/alpha", "parent_b/alpha"])
-
- self.fs.mds_asok(["flush", "journal"])
- backtrace = self.fs.read_backtrace(file_ino)
- self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace['ancestors']])
-
- # Create a new data pool
- new_pool_name = "data_new"
- new_pool_id = self.fs.add_data_pool(new_pool_name)
-
- # That an object which has switched pools gets its backtrace updated
- self.mount_a.run_shell(["setfattr", "-n", "ceph.file.layout.pool", "-v", new_pool_name, "./parent_b/alpha"])
- self.fs.mds_asok(["flush", "journal"])
- backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
- self.assertEqual(backtrace_old_pool['pool'], new_pool_id)
- backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
- self.assertEqual(backtrace_new_pool['pool'], new_pool_id)
- new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name)
- self.assertEqual(new_pool_layout['pool_id'], new_pool_id)
- self.assertEqual(new_pool_layout['pool_ns'], '')
-
- # That subsequent linkage changes are only written to new pool backtrace
- self.mount_a.run_shell(["mkdir", "parent_c"])
- self.mount_a.run_shell(["mv", "parent_b/alpha", "parent_c/alpha"])
- self.fs.mds_asok(["flush", "journal"])
- backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
- self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace_old_pool['ancestors']])
- backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
- self.assertEqual(['alpha', 'parent_c'], [a['dname'] for a in backtrace_new_pool['ancestors']])
-
- # That layout is written to new pool after change to other field in layout
- self.mount_a.run_shell(["setfattr", "-n", "ceph.file.layout.object_size", "-v", "8388608", "./parent_c/alpha"])
-
- self.fs.mds_asok(["flush", "journal"])
- new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name)
- self.assertEqual(new_pool_layout['object_size'], 8388608)
-
- # ...but not to the old pool: the old pool's backtrace points to the new pool, and that's enough,
- # we don't update the layout in all the old pools whenever it changes
- old_pool_layout = self.fs.read_layout(file_ino, pool=old_data_pool_name)
- self.assertEqual(old_pool_layout['object_size'], 4194304)
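The layout fields manipulated with setfattr above are also readable from the client
side; a small sketch using the getfattr helper on the mount (the ceph.file.layout.*
vxattr names are the same ones the test already sets):

    pool_name = self.mount_a.getfattr("./parent_c/alpha", "ceph.file.layout.pool")
    object_size = self.mount_a.getfattr("./parent_c/alpha", "ceph.file.layout.object_size")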
+++ /dev/null
-
-import os
-import time
-from textwrap import dedent
-from unittest import SkipTest
-from tasks.cephfs.fuse_mount import FuseMount
-from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
-
-class TestCapFlush(CephFSTestCase):
- @for_teuthology
- def test_replay_create(self):
- """
- MDS starts to handle client caps when it enters clientreplay stage.
-        When handling a client cap in the clientreplay stage, it's possible that
-        the corresponding inode does not exist because the client request which
-        creates the inode hasn't been replayed.
- """
-
- if not isinstance(self.mount_a, FuseMount):
- raise SkipTest("Require FUSE client to inject client release failure")
-
- dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
- py_script = dedent("""
- import os
- os.mkdir("{0}")
- fd = os.open("{0}", os.O_RDONLY)
- os.fchmod(fd, 0777)
- os.fsync(fd)
- """).format(dir_path)
- self.mount_a.run_python(py_script)
-
- self.fs.mds_asok(["flush", "journal"])
-
-        # client will only get the unsafe reply
- self.fs.mds_asok(["config", "set", "mds_log_pause", "1"])
-
- file_name = "testfile"
- file_path = dir_path + "/" + file_name
-
- # Create a file and modify its mode. ceph-fuse will mark Ax cap dirty
- py_script = dedent("""
- import os
- os.chdir("{0}")
- os.setgid(65534)
- os.setuid(65534)
- fd = os.open("{1}", os.O_CREAT | os.O_RDWR, 0644)
- os.fchmod(fd, 0640)
- """).format(dir_path, file_name)
- self.mount_a.run_python(py_script)
-
-        # Modify file mode by a different user. ceph-fuse will send a setattr request
- self.mount_a.run_shell(["chmod", "600", file_path], wait=False)
-
- time.sleep(10)
-
- # Restart mds. Client will re-send the unsafe request and cap flush
- self.fs.mds_stop()
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip()
-        # If the cap flush gets dropped, mode should be 0644.
- # (Ax cap stays in dirty state, which prevents setattr reply from updating file mode)
- self.assertEqual(mode, "600")
+++ /dev/null
-
-"""
-Exercise the MDS's behaviour when clients and the MDCache reach or
-exceed the limits of how many caps/inodes they should hold.
-"""
-
-import logging
-from textwrap import dedent
-from unittest import SkipTest
-from teuthology.orchestra.run import CommandFailedError
-from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
-from tasks.cephfs.fuse_mount import FuseMount
-import os
-
-
-log = logging.getLogger(__name__)
-
-
-# Arbitrary timeouts for operations involving restarting
-# an MDS or waiting for it to come up
-MDS_RESTART_GRACE = 60
-
-# Hardcoded values from Server::recall_client_state
-CAP_RECALL_RATIO = 0.8
-CAP_RECALL_MIN = 100
-
-
-class TestClientLimits(CephFSTestCase):
- REQUIRE_KCLIENT_REMOTE = True
- CLIENTS_REQUIRED = 2
-
- def _test_client_pin(self, use_subdir):
- """
- When a client pins an inode in its cache, for example because the file is held open,
- it should reject requests from the MDS to trim these caps. The MDS should complain
- to the user that it is unable to enforce its cache size limits because of this
- objectionable client.
-
- :param use_subdir: whether to put test files in a subdir or use root
- """
-
- cache_size = 100
- open_files = 200
-
- self.set_conf('mds', 'mds cache size', cache_size)
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- mount_a_client_id = self.mount_a.get_global_id()
- path = "subdir/mount_a" if use_subdir else "mount_a"
- open_proc = self.mount_a.open_n_background(path, open_files)
-
- # Client should now hold:
- # `open_files` caps for the open files
- # 1 cap for root
- # 1 cap for subdir
- self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'],
- open_files + (2 if use_subdir else 1),
- timeout=600,
- reject_fn=lambda x: x > open_files + 2)
-
- # MDS should not be happy about that, as the client is failing to comply
- # with the SESSION_RECALL messages it is being sent
- mds_recall_state_timeout = int(self.fs.get_config("mds_recall_state_timeout"))
- self.wait_for_health("failing to respond to cache pressure",
- mds_recall_state_timeout + 10)
-
- # We can also test that the MDS health warning for oversized
- # cache is functioning as intended.
- self.wait_for_health("Too many inodes in cache",
- mds_recall_state_timeout + 10)
-
- # When the client closes the files, it should retain only as many caps as allowed
- # under the SESSION_RECALL policy
- log.info("Terminating process holding files open")
- open_proc.stdin.close()
- try:
- open_proc.wait()
- except CommandFailedError:
- # We killed it, so it raises an error
- pass
-
- # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
- # which depend on the cache size and overall ratio
- self.wait_until_equal(
- lambda: self.get_session(mount_a_client_id)['num_caps'],
- int(cache_size * 0.8),
- timeout=600,
- reject_fn=lambda x: x < int(cache_size*.8))
-
- @needs_trimming
- def test_client_pin_root(self):
- self._test_client_pin(False)
-
- @needs_trimming
- def test_client_pin(self):
- self._test_client_pin(True)
-
- def test_client_release_bug(self):
- """
- When a client has a bug (which we will simulate) preventing it from releasing caps,
- the MDS should notice that releases are not being sent promptly, and generate a health
- metric to that effect.
- """
-
- # The debug hook to inject the failure only exists in the fuse client
- if not isinstance(self.mount_a, FuseMount):
- raise SkipTest("Require FUSE client to inject client release failure")
-
- self.set_conf('client.{0}'.format(self.mount_a.client_id), 'client inject release failure', 'true')
- self.mount_a.teardown()
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- mount_a_client_id = self.mount_a.get_global_id()
-
-        # Client A creates a file. It will hold the write caps on the file, and later (simulated bug) fail
-        # to comply with the MDS's request to release that cap
- self.mount_a.run_shell(["touch", "file1"])
-
-        # Client B tries to write to the file that client A created
- rproc = self.mount_b.write_background("file1")
-
- # After mds_revoke_cap_timeout, we should see a health warning (extra lag from
- # MDS beacon period)
- mds_revoke_cap_timeout = int(self.fs.get_config("mds_revoke_cap_timeout"))
- self.wait_for_health("failing to respond to capability release", mds_revoke_cap_timeout + 10)
-
- # Client B should still be stuck
- self.assertFalse(rproc.finished)
-
- # Kill client A
- self.mount_a.kill()
- self.mount_a.kill_cleanup()
-
- # Client B should complete
- self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
- rproc.wait()
-
- def test_client_oldest_tid(self):
- """
- When a client does not advance its oldest tid, the MDS should notice that
- and generate health warnings.
- """
-
- # num of requests client issues
- max_requests = 1000
-
- # The debug hook to inject the failure only exists in the fuse client
- if not isinstance(self.mount_a, FuseMount):
- raise SkipTest("Require FUSE client to inject client release failure")
-
- self.set_conf('client', 'client inject fixed oldest tid', 'true')
- self.mount_a.teardown()
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- self.fs.mds_asok(['config', 'set', 'mds_max_completed_requests', '{0}'.format(max_requests)])
-
- # Create lots of files
- self.mount_a.create_n_files("testdir/file1", max_requests + 100)
-
- # Create a few files synchronously. This makes sure previous requests are completed
- self.mount_a.create_n_files("testdir/file2", 5, True)
-
-        # Wait for the health warnings. Assume the MDS can handle at least 10 requests per second
- self.wait_for_health("failing to advance its oldest client/flush tid", max_requests / 10)
-
- def _test_client_cache_size(self, mount_subdir):
- """
-        Check that the client invalidates the kernel dcache according to its cache size config
- """
-
- # The debug hook to inject the failure only exists in the fuse client
- if not isinstance(self.mount_a, FuseMount):
- raise SkipTest("Require FUSE client to inject client release failure")
-
- if mount_subdir:
-            # fuse assigns a fixed inode number (1) to the root inode. But when mounting
-            # into a subdir, the actual inode number of the root is not 1. This mismatch
-            # confuses fuse_lowlevel_notify_inval_entry() when invalidating dentries
-            # in the root directory.
- self.mount_a.run_shell(["mkdir", "subdir"])
- self.mount_a.umount_wait()
- self.set_conf('client', 'client mountpoint', '/subdir')
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- root_ino = self.mount_a.path_to_ino(".")
-            self.assertEqual(root_ino, 1)
-
- dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
-
- mkdir_script = dedent("""
- import os
- os.mkdir("{path}")
- for n in range(0, {num_dirs}):
- os.mkdir("{path}/dir{{0}}".format(n))
- """)
-
- num_dirs = 1000
- self.mount_a.run_python(mkdir_script.format(path=dir_path, num_dirs=num_dirs))
- self.mount_a.run_shell(["sync"])
-
- dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
- self.assertGreaterEqual(dentry_count, num_dirs)
- self.assertGreaterEqual(dentry_pinned_count, num_dirs)
-
- cache_size = num_dirs / 10
- self.mount_a.set_cache_size(cache_size)
-
- def trimmed():
- dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
- log.info("waiting, dentry_count, dentry_pinned_count: {0}, {1}".format(
- dentry_count, dentry_pinned_count
- ))
- if dentry_count > cache_size or dentry_pinned_count > cache_size:
- return False
-
- return True
-
- self.wait_until_true(trimmed, 30)
-
- @needs_trimming
- def test_client_cache_size(self):
- self._test_client_cache_size(False)
- self._test_client_cache_size(True)
+++ /dev/null
-
-"""
-Teuthology task for exercising CephFS client recovery
-"""
-
-import logging
-from textwrap import dedent
-import time
-import distutils.version as version
-import re
-import os
-
-from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-from teuthology.packaging import get_package_version
-
-
-log = logging.getLogger(__name__)
-
-
-# Arbitrary timeouts for operations involving restarting
-# an MDS or waiting for it to come up
-MDS_RESTART_GRACE = 60
-
-
-class TestClientNetworkRecovery(CephFSTestCase):
- REQUIRE_KCLIENT_REMOTE = True
- REQUIRE_ONE_CLIENT_REMOTE = True
- CLIENTS_REQUIRED = 2
-
- LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
-
- # Environment references
- mds_session_timeout = None
- mds_reconnect_timeout = None
- ms_max_backoff = None
-
- def test_network_death(self):
- """
- Simulate software freeze or temporary network failure.
-
- Check that the client blocks I/O during failure, and completes
- I/O after failure.
- """
-
- # We only need one client
- self.mount_b.umount_wait()
-
- # Initially our one client session should be visible
- client_id = self.mount_a.get_global_id()
- ls_data = self._session_list()
- self.assert_session_count(1, ls_data)
- self.assertEqual(ls_data[0]['id'], client_id)
- self.assert_session_state(client_id, "open")
-
- # ...and capable of doing I/O without blocking
- self.mount_a.create_files()
-
- # ...but if we turn off the network
- self.fs.set_clients_block(True)
-
-        # ...and try to start an I/O
- write_blocked = self.mount_a.write_background()
-
- # ...then it should block
- self.assertFalse(write_blocked.finished)
- self.assert_session_state(client_id, "open")
- time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale
- self.assertFalse(write_blocked.finished)
- self.assert_session_state(client_id, "stale")
-
- # ...until we re-enable I/O
- self.fs.set_clients_block(False)
-
- # ...when it should complete promptly
- a = time.time()
- self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2)
- write_blocked.wait() # Already know we're finished, wait() to raise exception on errors
- recovery_time = time.time() - a
- log.info("recovery time: {0}".format(recovery_time))
- self.assert_session_state(client_id, "open")
-
-
-class TestClientRecovery(CephFSTestCase):
- REQUIRE_KCLIENT_REMOTE = True
- CLIENTS_REQUIRED = 2
-
- LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
-
- # Environment references
- mds_session_timeout = None
- mds_reconnect_timeout = None
- ms_max_backoff = None
-
- def test_basic(self):
-        # Check that two clients come up healthy and see each other's files
- # =====================================================
- self.mount_a.create_files()
- self.mount_a.check_files()
- self.mount_a.umount_wait()
-
- self.mount_b.check_files()
-
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- # Check that the admin socket interface is correctly reporting
- # two sessions
- # =====================================================
- ls_data = self._session_list()
- self.assert_session_count(2, ls_data)
-
- self.assertSetEqual(
- set([l['id'] for l in ls_data]),
- {self.mount_a.get_global_id(), self.mount_b.get_global_id()}
- )
-
- def test_restart(self):
- # Check that after an MDS restart both clients reconnect and continue
- # to handle I/O
- # =====================================================
- self.fs.mds_fail_restart()
- self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
-
- self.mount_a.create_destroy()
- self.mount_b.create_destroy()
-
- def _session_num_caps(self, client_id):
- ls_data = self.fs.mds_asok(['session', 'ls'])
- return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps'])
-
- def test_reconnect_timeout(self):
- # Reconnect timeout
- # =================
- # Check that if I stop an MDS and a client goes away, the MDS waits
- # for the reconnect period
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- mount_a_client_id = self.mount_a.get_global_id()
- self.mount_a.umount_wait(force=True)
-
- self.fs.mds_restart()
-
- self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
- # Check that the MDS locally reports its state correctly
- status = self.fs.mds_asok(['status'])
- self.assertIn("reconnect_status", status)
-
- ls_data = self._session_list()
- self.assert_session_count(2, ls_data)
-
- # The session for the dead client should have the 'reconnect' flag set
- self.assertTrue(self.get_session(mount_a_client_id)['reconnecting'])
-
- # Wait for the reconnect state to clear, this should take the
- # reconnect timeout period.
- in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2)
- # Check that the period we waited to enter active is within a factor
- # of two of the reconnect timeout.
- self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout / 2,
- "Should have been in reconnect phase for {0} but only took {1}".format(
- self.mds_reconnect_timeout, in_reconnect_for
- ))
-
- self.assert_session_count(1)
-
- # Check that the client that timed out during reconnect can
- # mount again and do I/O
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- self.mount_a.create_destroy()
-
- self.assert_session_count(2)
-
- def test_reconnect_eviction(self):
- # Eviction during reconnect
- # =========================
- mount_a_client_id = self.mount_a.get_global_id()
-
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- # The mount goes away while the MDS is offline
- self.mount_a.kill()
-
- self.fs.mds_restart()
-
- # Enter reconnect phase
- self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
- self.assert_session_count(2)
-
- # Evict the stuck client
- self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
- self.assert_session_count(1)
-
- # Observe that we proceed to active phase without waiting full reconnect timeout
- evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
- # Once we evict the troublemaker, the reconnect phase should complete
- # in well under the reconnect timeout.
- self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5,
- "reconnect did not complete soon enough after eviction, took {0}".format(
- evict_til_active
- ))
-
- # We killed earlier so must clean up before trying to use again
- self.mount_a.kill_cleanup()
-
- # Bring the client back
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- self.mount_a.create_destroy()
-
- def test_stale_caps(self):
- # Capability release from stale session
- # =====================================
- cap_holder = self.mount_a.open_background()
-
- # Wait for the file to be visible from another client, indicating
- # that mount_a has completed its network ops
- self.mount_b.wait_for_visible()
-
- # Simulate client death
- self.mount_a.kill()
-
- try:
- # Now, after mds_session_timeout seconds, the waiter should
- # complete their operation when the MDS marks the holder's
- # session stale.
- cap_waiter = self.mount_b.write_background()
- a = time.time()
- cap_waiter.wait()
- b = time.time()
-
- # Should have succeeded
- self.assertEqual(cap_waiter.exitstatus, 0)
-
- cap_waited = b - a
- log.info("cap_waiter waited {0}s".format(cap_waited))
- self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
- "Capability handover took {0}, expected approx {1}".format(
- cap_waited, self.mds_session_timeout
- ))
-
- cap_holder.stdin.close()
- try:
- cap_holder.wait()
- except (CommandFailedError, ConnectionLostError):
- # We killed it (and possibly its node), so it raises an error
- pass
- finally:
- # teardown() doesn't quite handle this case cleanly, so help it out
- self.mount_a.kill_cleanup()
-
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- def test_evicted_caps(self):
- # Eviction while holding a capability
- # ===================================
-
- # Take out a write capability on a file on client A,
- # and then immediately kill it.
- cap_holder = self.mount_a.open_background()
- mount_a_client_id = self.mount_a.get_global_id()
-
- # Wait for the file to be visible from another client, indicating
- # that mount_a has completed its network ops
- self.mount_b.wait_for_visible()
-
- # Simulate client death
- self.mount_a.kill()
-
- try:
- # The waiter should get stuck waiting for the capability
- # held on the MDS by the now-dead client A
- cap_waiter = self.mount_b.write_background()
- time.sleep(5)
- self.assertFalse(cap_waiter.finished)
-
- self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
- # Now, because I evicted the old holder of the capability, it should
- # immediately get handed over to the waiter
- a = time.time()
- cap_waiter.wait()
- b = time.time()
- cap_waited = b - a
- log.info("cap_waiter waited {0}s".format(cap_waited))
- # This is the check that it happened 'now' rather than waiting
- # for the session timeout
- self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
- "Capability handover took {0}, expected less than {1}".format(
- cap_waited, self.mds_session_timeout / 2.0
- ))
-
- cap_holder.stdin.close()
- try:
- cap_holder.wait()
- except (CommandFailedError, ConnectionLostError):
- # We killed it (and possibly its node), so it raises an error
- pass
- finally:
- self.mount_a.kill_cleanup()
-
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- def test_trim_caps(self):
- # Trim capability when reconnecting MDS
- # ===================================
-
- count = 500
- # Create lots of files
- for i in range(count):
- self.mount_a.run_shell(["touch", "f{0}".format(i)])
-
- # Populate mount_b's cache
- self.mount_b.run_shell(["ls"])
-
- client_id = self.mount_b.get_global_id()
- num_caps = self._session_num_caps(client_id)
- self.assertGreaterEqual(num_caps, count)
-
- # Restart MDS. client should trim its cache when reconnecting to the MDS
- self.fs.mds_fail_restart()
- self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
-
- num_caps = self._session_num_caps(client_id)
- self.assertLess(num_caps, count,
- "should have less than {0} capabilities, have {1}".format(
- count, num_caps
- ))
-
- def test_filelock(self):
- """
- Check that file lock doesn't get lost after an MDS restart
- """
- a_version_str = get_package_version(self.mount_a.client_remote, "fuse")
- b_version_str = get_package_version(self.mount_b.client_remote, "fuse")
- flock_version_str = "2.9"
-
- version_regex = re.compile(r"[0-9\.]+")
- a_result = version_regex.match(a_version_str)
- self.assertTrue(a_result)
- b_result = version_regex.match(b_version_str)
- self.assertTrue(b_result)
- a_version = version.StrictVersion(a_result.group())
- b_version = version.StrictVersion(b_result.group())
- flock_version=version.StrictVersion(flock_version_str)
-
- flockable = False
- if (a_version >= flock_version and b_version >= flock_version):
- log.info("testing flock locks")
- flockable = True
- else:
- log.info("not testing flock locks, machines have versions {av} and {bv}".format(
- av=a_version_str,bv=b_version_str))
-
- lock_holder = self.mount_a.lock_background(do_flock=flockable)
-
- self.mount_b.wait_for_visible("background_file-2")
- self.mount_b.check_filelock(do_flock=flockable)
-
- self.fs.mds_fail_restart()
- self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
-
- self.mount_b.check_filelock(do_flock=flockable)
-
- # Tear down the background process
- lock_holder.stdin.close()
- try:
- lock_holder.wait()
- except (CommandFailedError, ConnectionLostError):
- # We killed it, so it raises an error
- pass
-
- def test_dir_fsync(self):
-        self._test_fsync(True)
-
-    def test_create_fsync(self):
-        self._test_fsync(False)
-
- def _test_fsync(self, dirfsync):
- """
- That calls to fsync guarantee visibility of metadata to another
- client immediately after the fsyncing client dies.
- """
-
-        # Leave this client out until it's needed
- self.mount_b.umount_wait()
-
- # Create dir + child dentry on client A, and fsync the dir
- path = os.path.join(self.mount_a.mountpoint, "subdir")
- self.mount_a.run_python(
- dedent("""
- import os
- import time
-
- path = "{path}"
-
- print "Starting creation..."
- start = time.time()
-
- os.mkdir(path)
- dfd = os.open(path, os.O_DIRECTORY)
-
- fd = open(os.path.join(path, "childfile"), "w")
- print "Finished creation in {{0}}s".format(time.time() - start)
-
- print "Starting fsync..."
- start = time.time()
- if {dirfsync}:
- os.fsync(dfd)
- else:
- os.fsync(fd)
- print "Finished fsync in {{0}}s".format(time.time() - start)
- """.format(path=path,dirfsync=str(dirfsync)))
- )
-
- # Immediately kill the MDS and then client A
- self.fs.mds_stop()
- self.fs.mds_fail()
- self.mount_a.kill()
- self.mount_a.kill_cleanup()
-
- # Restart the MDS. Wait for it to come up, it'll have to time out in clientreplay
- self.fs.mds_restart()
- log.info("Waiting for reconnect...")
- self.fs.wait_for_state("up:reconnect")
- log.info("Waiting for active...")
- self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout)
- log.info("Reached active...")
-
- # Is the child dentry visible from mount B?
- self.mount_b.mount()
- self.mount_b.wait_until_mounted()
- self.mount_b.run_shell(["ls", "subdir/childfile"])
+++ /dev/null
-
-from unittest import case
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-from tasks.cephfs.fuse_mount import FuseMount
-
-
-class TestConfigCommands(CephFSTestCase):
- """
- Test that daemons and clients respond to the otherwise rarely-used
- runtime config modification operations.
- """
-
- CLIENTS_REQUIRED = 1
- MDSS_REQUIRED = 1
-
- def test_client_config(self):
- """
- That I can successfully issue asok "config set" commands
-
- :return:
- """
-
- if not isinstance(self.mount_a, FuseMount):
- raise case.SkipTest("Test only applies to FUSE clients")
-
- test_key = "client_cache_size"
- test_val = "123"
- self.mount_a.admin_socket(['config', 'set', test_key, test_val])
- out = self.mount_a.admin_socket(['config', 'get', test_key])
- self.assertEqual(out[test_key], test_val)
-
- self.mount_a.write_n_mb("file.bin", 1);
-
- # Implicitly asserting that things don't have lockdep error in shutdown
- self.mount_a.umount_wait(require_clean=True)
- self.fs.mds_stop()
-
- def test_mds_config_asok(self):
- test_key = "mds_max_purge_ops"
- test_val = "123"
- self.fs.mds_asok(['config', 'set', test_key, test_val])
- out = self.fs.mds_asok(['config', 'get', test_key])
- self.assertEqual(out[test_key], test_val)
-
- # Implicitly asserting that things don't have lockdep error in shutdown
- self.mount_a.umount_wait(require_clean=True)
- self.fs.mds_stop()
-
- def test_mds_config_tell(self):
- test_key = "mds_max_purge_ops"
- test_val = "123"
-
- mds_id = self.fs.get_lone_mds_id()
- self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "injectargs",
- "--{0}={1}".format(test_key, test_val))
-
- # Read it back with asok because there is no `tell` equivalent
- out = self.fs.mds_asok(['config', 'get', test_key])
- self.assertEqual(out[test_key], test_val)
-
- # Implicitly asserting that things don't have lockdep error in shutdown
- self.mount_a.umount_wait(require_clean=True)
- self.fs.mds_stop()
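All three tests share the same set/get round-trip; a sketch of the common shape
(purely illustrative, not a helper present in this file):

    def _assert_config_roundtrip(self, asok_fn, key, val):
        # asok_fn is an admin-socket callable such as self.fs.mds_asok or
        # self.mount_a.admin_socket
        asok_fn(['config', 'set', key, val])
        out = asok_fn(['config', 'get', key])
        self.assertEqual(out[key], val)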
+++ /dev/null
-import json
-import logging
-import errno
-import re
-from teuthology.contextutil import MaxWhileTries
-from teuthology.exceptions import CommandFailedError
-from teuthology.orchestra.run import wait
-from tasks.cephfs.fuse_mount import FuseMount
-from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
-
-DAMAGED_ON_START = "damaged_on_start"
-DAMAGED_ON_LS = "damaged_on_ls"
-CRASHED = "server crashed"
-NO_DAMAGE = "no damage"
-FAILED_CLIENT = "client failed"
-FAILED_SERVER = "server failed"
-
-# An EIO in response to a stat from the client
-EIO_ON_LS = "eio"
-
-# An EIO, but nothing in the damage table (never what we expect)
-EIO_NO_DAMAGE = "eio without damage entry"
-
-
-log = logging.getLogger(__name__)
-
-
-class TestDamage(CephFSTestCase):
- def _simple_workload_write(self):
- self.mount_a.run_shell(["mkdir", "subdir"])
- self.mount_a.write_n_mb("subdir/sixmegs", 6)
- return self.mount_a.stat("subdir/sixmegs")
-
- def is_marked_damaged(self, rank):
- mds_map = self.fs.get_mds_map()
- return rank in mds_map['damaged']
-
- @for_teuthology #459s
- def test_object_deletion(self):
- """
- That the MDS has a clean 'damaged' response to loss of any single metadata object
- """
-
- self._simple_workload_write()
-
- # Hmm, actually it would be nice to permute whether the metadata pool
- # state contains sessions or not, but for the moment close this session
- # to avoid waiting through reconnect on every MDS start.
- self.mount_a.umount_wait()
- for mds_name in self.fs.get_active_names():
- self.fs.mds_asok(["flush", "journal"], mds_name)
-
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- self.fs.rados(['export', '/tmp/metadata.bin'])
-
- def is_ignored(obj_id, dentry=None):
- """
- A filter to avoid redundantly mutating many similar objects (e.g.
- stray dirfrags) or similar dentries (e.g. stray dir dentries)
- """
- if re.match("60.\.00000000", obj_id) and obj_id != "600.00000000":
- return True
-
- if dentry and obj_id == "100.00000000":
- if re.match("stray.+_head", dentry) and dentry != "stray0_head":
- return True
-
- return False
-
- def get_path(obj_id, dentry=None):
- """
- What filesystem path does this object or dentry correspond to? i.e.
- what should I poke to see EIO after damaging it?
- """
-
- if obj_id == "1.00000000" and dentry == "subdir_head":
- return "./subdir"
- elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head":
- return "./subdir/sixmegs"
-
- # None means ls will do an "ls -R" in hope of seeing some errors
- return None
-
- objects = self.fs.rados(["ls"]).split("\n")
- objects = [o for o in objects if not is_ignored(o)]
-
- # Find all objects with an OMAP header
- omap_header_objs = []
- for o in objects:
- header = self.fs.rados(["getomapheader", o])
- # The rados CLI wraps the header output in a hex-printed style
- header_bytes = int(re.match("header \((.+) bytes\)", header).group(1))
- if header_bytes > 0:
- omap_header_objs.append(o)
-
- # Find all OMAP key/vals
- omap_keys = []
- for o in objects:
- keys_str = self.fs.rados(["listomapkeys", o])
- if keys_str:
- for key in keys_str.split("\n"):
- if not is_ignored(o, key):
- omap_keys.append((o, key))
-
- # Find objects that have data in their bodies
- data_objects = []
- for obj_id in objects:
- stat_out = self.fs.rados(["stat", obj_id])
- size = int(re.match(".+, size (.+)$", stat_out).group(1))
- if size > 0:
- data_objects.append(obj_id)
-
- # Define the various forms of damage we will inflict
- class MetadataMutation(object):
- def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None):
- self.obj_id = obj_id_
- self.desc = desc_
- self.mutate_fn = mutate_fn_
- self.expectation = expectation_
- if ls_path is None:
- self.ls_path = "."
- else:
- self.ls_path = ls_path
-
- def __eq__(self, other):
- return self.desc == other.desc
-
- def __hash__(self):
- return hash(self.desc)
-
- junk = "deadbeef" * 10
- mutations = []
-
- # Removals
- for obj_id in objects:
- if obj_id in [
- # JournalPointers are auto-replaced if missing (same path as upgrade)
- "400.00000000",
- # Missing dirfrags for non-system dirs result in empty directory
- "10000000000.00000000",
- ]:
- expectation = NO_DAMAGE
- else:
- expectation = DAMAGED_ON_START
-
- log.info("Expectation on rm '{0}' will be '{1}'".format(
- obj_id, expectation
- ))
-
- mutations.append(MetadataMutation(
- obj_id,
- "Delete {0}".format(obj_id),
- lambda o=obj_id: self.fs.rados(["rm", o]),
- expectation
- ))
-
- # Blatant corruptions
- mutations.extend([
- MetadataMutation(
- o,
- "Corrupt {0}".format(o),
- lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk),
- DAMAGED_ON_START
- ) for o in data_objects
- ])
-
- # Truncations
- mutations.extend([
- MetadataMutation(
- o,
- "Truncate {0}".format(o),
- lambda o=o: self.fs.rados(["truncate", o, "0"]),
- DAMAGED_ON_START
- ) for o in data_objects
- ])
-
- # OMAP value corruptions
- for o, k in omap_keys:
- if o.startswith("100."):
- # Anything in rank 0's 'mydir'
- expectation = DAMAGED_ON_START
- else:
- expectation = EIO_ON_LS
-
- mutations.append(
- MetadataMutation(
- o,
- "Corrupt omap key {0}:{1}".format(o, k),
- lambda o=o,k=k: self.fs.rados(["setomapval", o, k, junk]),
- expectation,
- get_path(o, k)
- )
- )
-
- # OMAP header corruptions
- for obj_id in omap_header_objs:
- if re.match("60.\.00000000", obj_id) \
- or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
- expectation = DAMAGED_ON_START
- else:
- expectation = NO_DAMAGE
-
- log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
- obj_id, expectation
- ))
-
- mutations.append(
- MetadataMutation(
- obj_id,
- "Corrupt omap header on {0}".format(obj_id),
- lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]),
- expectation
- )
- )
-
- results = {}
-
- for mutation in mutations:
- log.info("Applying mutation '{0}'".format(mutation.desc))
-
- # Reset MDS state
- self.mount_a.umount_wait(force=True)
- self.fs.mds_stop()
- self.fs.mds_fail()
- self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
-
- # Reset RADOS pool state
- self.fs.rados(['import', '/tmp/metadata.bin'])
-
- # Inject the mutation
- mutation.mutate_fn()
-
- # Try starting the MDS
- self.fs.mds_restart()
-
- # How long we'll wait between starting a daemon and expecting
- # it to make it through startup, and potentially declare itself
- # damaged to the mon cluster.
- startup_timeout = 60
-
- if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE):
- if mutation.expectation == DAMAGED_ON_START:
- # The MDS may pass through active before making it to damaged
- try:
- self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout)
- except RuntimeError:
- pass
-
- # Wait for MDS to either come up or go into damaged state
- try:
- self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout)
- except RuntimeError:
- crashed = False
- # Didn't make it to healthy or damaged, did it crash?
- for daemon_id, daemon in self.fs.mds_daemons.items():
- if daemon.proc and daemon.proc.finished:
- crashed = True
- log.error("Daemon {0} crashed!".format(daemon_id))
- daemon.proc = None # So that subsequent stop() doesn't raise error
- if not crashed:
-                        # Didn't go healthy, didn't go damaged, didn't crash, so what happened?
- raise
- else:
- log.info("Result: Mutation '{0}' led to crash".format(mutation.desc))
- results[mutation] = CRASHED
- continue
- if self.is_marked_damaged(0):
- log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc))
- results[mutation] = DAMAGED_ON_START
- continue
- else:
- log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc))
- else:
- try:
- self.wait_until_true(self.fs.are_daemons_healthy, 60)
- except RuntimeError:
- log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc))
- if self.is_marked_damaged(0):
- results[mutation] = DAMAGED_ON_START
- else:
- results[mutation] = FAILED_SERVER
- continue
- log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc))
-
- # MDS is up, should go damaged on ls or client mount
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- if mutation.ls_path == ".":
- proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False)
- else:
- proc = self.mount_a.stat(mutation.ls_path, wait=False)
-
- if mutation.expectation == DAMAGED_ON_LS:
- try:
- self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
- log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc))
- results[mutation] = DAMAGED_ON_LS
- except RuntimeError:
- if self.fs.are_daemons_healthy():
- log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format(
- mutation.desc))
- results[mutation] = NO_DAMAGE
- else:
- log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
- results[mutation] = FAILED_SERVER
-
- else:
- try:
- wait([proc], 20)
-                    log.info("Result: Mutation '{0}' did not cause DAMAGED state".format(mutation.desc))
- results[mutation] = NO_DAMAGE
- except MaxWhileTries:
- log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc))
- results[mutation] = FAILED_CLIENT
- except CommandFailedError as e:
- if e.exitstatus == errno.EIO:
- log.info("Result: EIO on client")
- results[mutation] = EIO_ON_LS
- else:
- log.info("Result: unexpected error {0} on client".format(e))
- results[mutation] = FAILED_CLIENT
-
- if mutation.expectation == EIO_ON_LS:
- # EIOs mean something handled by DamageTable: assert that it has
- # been populated
- damage = json.loads(
- self.fs.mon_manager.raw_cluster_cmd(
- 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty'))
- if len(damage) == 0:
- results[mutation] = EIO_NO_DAMAGE
-
- failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result]
- if failures:
- log.error("{0} mutations had unexpected outcomes:".format(len(failures)))
- for mutation, result in failures:
- log.error(" Expected '{0}' actually '{1}' from '{2}'".format(
- mutation.expectation, result, mutation.desc
- ))
- raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures)))
- else:
- log.info("All {0} mutations had expected outcomes".format(len(mutations)))
-
- def test_damaged_dentry(self):
-        # Damage to dentries is interesting because it leaves the
-        # directory's `complete` flag in a subtle state: we have marked
-        # the dir complete so that folks can access it, but in actual
-        # fact a dentry is missing.
- self.mount_a.run_shell(["mkdir", "subdir/"])
-
- self.mount_a.run_shell(["touch", "subdir/file_undamaged"])
- self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"])
-
- subdir_ino = self.mount_a.path_to_ino("subdir")
-
- self.mount_a.umount_wait()
- for mds_name in self.fs.get_active_names():
- self.fs.mds_asok(["flush", "journal"], mds_name)
-
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- # Corrupt a dentry
- junk = "deadbeef" * 10
- dirfrag_obj = "{0:x}.00000000".format(subdir_ino)
- self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
-
- # Start up and try to list it
- self.fs.mds_restart()
- self.fs.wait_for_daemons()
-
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- dentries = self.mount_a.ls("subdir/")
-
- # The damaged guy should have disappeared
- self.assertEqual(dentries, ["file_undamaged"])
-
- # I should get ENOENT if I try and read it normally, because
- # the dir is considered complete
- try:
- self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
- except CommandFailedError as e:
- self.assertEqual(e.exitstatus, errno.ENOENT)
- else:
- raise AssertionError("Expected ENOENT")
-
-        # The fact that there is damage should have been recorded
- damage = json.loads(
- self.fs.mon_manager.raw_cluster_cmd(
- 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
- "damage", "ls", '--format=json-pretty'))
- self.assertEqual(len(damage), 1)
- damage_id = damage[0]['id']
-
- # If I try to create a dentry with the same name as the damaged guy
- # then that should be forbidden
- try:
- self.mount_a.touch("subdir/file_to_be_damaged")
- except CommandFailedError as e:
- self.assertEqual(e.exitstatus, errno.EIO)
- else:
- raise AssertionError("Expected EIO")
-
-        # Attempting that touch will clear the client's complete flag; now
-        # when I stat it I'll get EIO instead of ENOENT
- try:
- self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
- except CommandFailedError as e:
- if isinstance(self.mount_a, FuseMount):
- self.assertEqual(e.exitstatus, errno.EIO)
- else:
- # Kernel client handles this case differently
- self.assertEqual(e.exitstatus, errno.ENOENT)
- else:
- raise AssertionError("Expected EIO")
-
- nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
- self.assertEqual(nfiles, "2")
-
- self.mount_a.umount_wait()
-
- # Now repair the stats
- scrub_json = self.fs.mds_asok(["scrub_path", "/subdir", "repair"])
- log.info(json.dumps(scrub_json, indent=2))
-
- self.assertEqual(scrub_json["passed_validation"], False)
- self.assertEqual(scrub_json["raw_stats"]["checked"], True)
- self.assertEqual(scrub_json["raw_stats"]["passed"], False)
-
- # Check that the file count is now correct
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
- self.assertEqual(nfiles, "1")
-
-        # Clean up the corrupt dentry key from the dirfrag's omap
-        self.fs.rados(["rmomapkey", dirfrag_obj, "file_to_be_damaged_head"])
-
- # Clean up the damagetable entry
- self.fs.mon_manager.raw_cluster_cmd(
- 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
- "damage", "rm", "{did}".format(did=damage_id))
-
- # Now I should be able to create a file with the same name as the
- # damaged guy if I want.
- self.mount_a.touch("subdir/file_to_be_damaged")
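-
-
-# A hedged illustration (not part of the original test module): the dentry
-# corruption above relies on the CephFS convention that a directory fragment
-# is a RADOS object named "<dir inode in hex>.<frag in 8 hex digits>" whose
-# omap keys are "<dentry name>_head". The helper below only composes those
-# names; frag 0 is an assumption matching the unfragmented directory used in
-# test_damaged_dentry.
-def example_dirfrag_names(dir_ino, dentry_name, frag=0):
-    dirfrag_obj = "{0:x}.{1:08x}".format(dir_ino, frag)
-    omap_key = "{0}_head".format(dentry_name)
-    return dirfrag_obj, omap_key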
+++ /dev/null
-
-"""
-Test our tools for recovering metadata from the data pool
-"""
-import json
-
-import logging
-import os
-from textwrap import dedent
-import traceback
-from collections import namedtuple, defaultdict
-
-from teuthology.orchestra.run import CommandFailedError
-from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
-
-log = logging.getLogger(__name__)
-
-
-ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
-
-
-class Workload(object):
- def __init__(self, filesystem, mount):
- self._mount = mount
- self._filesystem = filesystem
- self._initial_state = None
-
- # Accumulate backtraces for every failed validation, and return them. Backtraces
- # are rather verbose, but we only see them when something breaks, and they
- # let us see which check failed without having to decorate each check with
- # a string
- self._errors = []
-
- def assert_equal(self, a, b):
- try:
- if a != b:
- raise AssertionError("{0} != {1}".format(a, b))
- except AssertionError as e:
- self._errors.append(
- ValidationError(e, traceback.format_exc(3))
- )
-
- def write(self):
- """
- Write the workload files to the mount
- """
- raise NotImplementedError()
-
- def validate(self):
- """
- Read from the mount and validate that the workload files are present (i.e. have
- survived or been reconstructed from the test scenario)
- """
- raise NotImplementedError()
-
- def damage(self):
- """
- Damage the filesystem pools in ways that will be interesting to recover from. By
- default just wipe everything in the metadata pool
- """
- # Delete every object in the metadata pool
- objects = self._filesystem.rados(["ls"]).split("\n")
- for o in objects:
- self._filesystem.rados(["rm", o])
-
- def flush(self):
- """
- Called after client unmount, after write: flush whatever you want
- """
- self._filesystem.mds_asok(["flush", "journal"])
-
-
-class SimpleWorkload(Workload):
- """
- Single file, single directory, check that it gets recovered and so does its size
- """
- def write(self):
- self._mount.run_shell(["mkdir", "subdir"])
- self._mount.write_n_mb("subdir/sixmegs", 6)
- self._initial_state = self._mount.stat("subdir/sixmegs")
-
- def validate(self):
- self._mount.run_shell(["ls", "subdir"])
- st = self._mount.stat("subdir/sixmegs")
- self.assert_equal(st['st_size'], self._initial_state['st_size'])
- return self._errors
-
-
-class MovedFile(Workload):
- def write(self):
-        # Create a file whose backtrace disagrees with its eventual position
-        # in the metadata. We will see that it gets reconstructed in its
-        # original position according to its backtrace.
- self._mount.run_shell(["mkdir", "subdir_alpha"])
- self._mount.run_shell(["mkdir", "subdir_bravo"])
- self._mount.write_n_mb("subdir_alpha/sixmegs", 6)
- self._filesystem.mds_asok(["flush", "journal"])
- self._mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"])
- self._initial_state = self._mount.stat("subdir_bravo/sixmegs")
-
- def flush(self):
- pass
-
- def validate(self):
- self.assert_equal(self._mount.ls(), ["subdir_alpha"])
- st = self._mount.stat("subdir_alpha/sixmegs")
- self.assert_equal(st['st_size'], self._initial_state['st_size'])
- return self._errors
-
-
-class BacktracelessFile(Workload):
- def write(self):
- self._mount.run_shell(["mkdir", "subdir"])
- self._mount.write_n_mb("subdir/sixmegs", 6)
- self._initial_state = self._mount.stat("subdir/sixmegs")
-
- def flush(self):
- # Never flush metadata, so backtrace won't be written
- pass
-
- def validate(self):
- ino_name = "%x" % self._initial_state["st_ino"]
-
- # The inode should be linked into lost+found because we had no path for it
- self.assert_equal(self._mount.ls(), ["lost+found"])
- self.assert_equal(self._mount.ls("lost+found"), [ino_name])
- st = self._mount.stat("lost+found/{ino_name}".format(ino_name=ino_name))
-
- # We might not have got the name or path, but we should still get the size
- self.assert_equal(st['st_size'], self._initial_state['st_size'])
-
- return self._errors
-
-
-class StripedStashedLayout(Workload):
- def __init__(self, fs, m):
- super(StripedStashedLayout, self).__init__(fs, m)
-
- # Nice small stripes so we can quickly do our writes+validates
- self.sc = 4
- self.ss = 65536
- self.os = 262144
-
- self.interesting_sizes = [
- # Exactly stripe_count objects will exist
- self.os * self.sc,
- # Fewer than stripe_count objects will exist
- self.os * self.sc / 2,
- self.os * (self.sc - 1) + self.os / 2,
- self.os * (self.sc - 1) + self.os / 2 - 1,
- self.os * (self.sc + 1) + self.os / 2,
- self.os * (self.sc + 1) + self.os / 2 + 1,
- # More than stripe_count objects will exist
- self.os * self.sc + self.os * self.sc / 2
- ]
-
- def write(self):
- # Create a dir with a striped layout set on it
- self._mount.run_shell(["mkdir", "stripey"])
-
- self._mount.run_shell([
- "setfattr", "-n", "ceph.dir.layout", "-v",
- "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format(
- ss=self.ss, os=self.os, sc=self.sc,
- pool=self._filesystem.get_data_pool_name()
- ),
- "./stripey"])
-
- # Write files, then flush metadata so that its layout gets written into an xattr
- for i, n_bytes in enumerate(self.interesting_sizes):
- self._mount.write_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
- # This is really just validating the validator
- self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
- self._filesystem.mds_asok(["flush", "journal"])
-
- # Write another file in the same way, but this time don't flush the metadata,
- # so that it won't have the layout xattr
- self._mount.write_test_pattern("stripey/unflushed_file", 1024 * 512)
- self._mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512)
-
- self._initial_state = {
- "unflushed_ino": self._mount.path_to_ino("stripey/unflushed_file")
- }
-
- def flush(self):
- # Pass because we already selectively flushed during write
- pass
-
- def validate(self):
-        # The flushed files should have been recovered into their original
-        # locations with the correct layout: read back correct data
- for i, n_bytes in enumerate(self.interesting_sizes):
- try:
- self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
- except CommandFailedError as e:
- self._errors.append(
- ValidationError("File {0} (size {1}): {2}".format(i, n_bytes, e), traceback.format_exc(3))
- )
-
- # The unflushed file should have been recovered into lost+found without
- # the correct layout: read back junk
- ino_name = "%x" % self._initial_state["unflushed_ino"]
- self.assert_equal(self._mount.ls("lost+found"), [ino_name])
- try:
- self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512)
- except CommandFailedError:
- pass
- else:
- self._errors.append(
- ValidationError("Unexpectedly valid data in unflushed striped file", "")
- )
-
- return self._errors
-
-
-class ManyFilesWorkload(Workload):
- def __init__(self, filesystem, mount, file_count):
- super(ManyFilesWorkload, self).__init__(filesystem, mount)
- self.file_count = file_count
-
- def write(self):
- self._mount.run_shell(["mkdir", "subdir"])
- for n in range(0, self.file_count):
- self._mount.write_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
-
- def validate(self):
- for n in range(0, self.file_count):
- try:
- self._mount.validate_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
- except CommandFailedError as e:
- self._errors.append(
- ValidationError("File {0}: {1}".format(n, e), traceback.format_exc(3))
- )
-
- return self._errors
-
-
-class MovedDir(Workload):
- def write(self):
-        # Create a nested dir that we will then move. Two files end up with two
-        # different backtraces referring to the moved dir, claiming two different
-        # locations for it. We will see that only one backtrace wins and the dir
-        # ends up with a single linkage.
- self._mount.run_shell(["mkdir", "-p", "grandmother/parent"])
- self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1)
- self._filesystem.mds_asok(["flush", "journal"])
- self._mount.run_shell(["mkdir", "grandfather"])
- self._mount.run_shell(["mv", "grandmother/parent", "grandfather"])
- self._mount.write_n_mb("grandfather/parent/new_pos_file", 2)
- self._filesystem.mds_asok(["flush", "journal"])
-
- self._initial_state = (
- self._mount.stat("grandfather/parent/orig_pos_file"),
- self._mount.stat("grandfather/parent/new_pos_file")
- )
-
- def validate(self):
- root_files = self._mount.ls()
- self.assert_equal(len(root_files), 1)
- self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True)
- winner = root_files[0]
- st_opf = self._mount.stat("{0}/parent/orig_pos_file".format(winner))
- st_npf = self._mount.stat("{0}/parent/new_pos_file".format(winner))
-
- self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size'])
- self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size'])
-
-
-class MissingZerothObject(Workload):
- def write(self):
- self._mount.run_shell(["mkdir", "subdir"])
- self._mount.write_n_mb("subdir/sixmegs", 6)
- self._initial_state = self._mount.stat("subdir/sixmegs")
-
- def damage(self):
- super(MissingZerothObject, self).damage()
- zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino'])
- self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name())
-
- def validate(self):
- st = self._mount.stat("lost+found/{0:x}".format(self._initial_state['st_ino']))
- self.assert_equal(st['st_size'], self._initial_state['st_size'])
-
-
-class NonDefaultLayout(Workload):
- """
- Check that the reconstruction copes with files that have a different
- object size in their layout
- """
- def write(self):
- self._mount.run_shell(["touch", "datafile"])
- self._mount.run_shell(["setfattr", "-n", "ceph.file.layout.object_size", "-v", "8388608", "./datafile"])
- self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"])
- self._initial_state = self._mount.stat("datafile")
-
- def validate(self):
- p = self._mount.run_shell(["getfattr", "--only-values", "-n", "ceph.file.layout.object_size", "./datafile"])
-
- # Check we got the layout reconstructed properly
- object_size = int(p.stdout.getvalue().strip())
- self.assert_equal(object_size, 8388608)
-
- # Check we got the file size reconstructed properly
- st = self._mount.stat("datafile")
- self.assert_equal(st['st_size'], self._initial_state['st_size'])
-
-
-class TestDataScan(CephFSTestCase):
- MDSS_REQUIRED = 2
-
- def is_marked_damaged(self, rank):
- mds_map = self.fs.get_mds_map()
- return rank in mds_map['damaged']
-
- def _rebuild_metadata(self, workload, workers=1):
- """
-        That when all objects in the metadata pool are removed, we can rebuild the
-        metadata pool based on the contents of the data pool, and a client can see
-        and read our files.
- """
-
- # First, inject some files
- workload.write()
-
- # Unmount the client and flush the journal: the tool should also cope with
- # situations where there is dirty metadata, but we'll test that separately
- self.mount_a.umount_wait()
- workload.flush()
-
- # Stop the MDS
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- # After recovery, we need the MDS to not be strict about stats (in production these options
- # are off by default, but in QA we need to explicitly disable them)
- self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
- self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
-
- # Apply any data damage the workload wants
- workload.damage()
-
- # Reset the MDS map in case multiple ranks were in play: recovery procedure
- # only understands how to rebuild metadata under rank 0
- self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
- '--yes-i-really-mean-it')
-
- # Attempt to start an MDS, see that it goes into damaged state
- self.fs.mds_restart()
-
- def get_state(mds_id):
- info = self.mds_cluster.get_mds_info(mds_id)
- return info['state'] if info is not None else None
-
- self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
- for mds_id in self.fs.mds_ids:
- self.wait_until_equal(
- lambda: get_state(mds_id),
- "up:standby",
- timeout=60)
-
- # Run the recovery procedure
- self.fs.table_tool(["0", "reset", "session"])
- self.fs.table_tool(["0", "reset", "snap"])
- self.fs.table_tool(["0", "reset", "inode"])
-        # Disabled: a normal (non-forced) journal reset should fail when no
-        # objects are present; we use --force below instead.
-        if False:
-            with self.assertRaises(CommandFailedError):
-                self.fs.journal_tool(["journal", "reset"])
- self.fs.journal_tool(["journal", "reset", "--force"])
- self.fs.data_scan(["init"])
- self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
- self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
-
- # Mark the MDS repaired
- self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
-
- # Start the MDS
- self.fs.mds_restart()
- self.fs.wait_for_daemons()
- log.info(str(self.mds_cluster.status()))
-
- # Mount a client
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- # See that the files are present and correct
- errors = workload.validate()
- if errors:
- log.error("Validation errors found: {0}".format(len(errors)))
- for e in errors:
- log.error(e.exception)
- log.error(e.backtrace)
- raise AssertionError("Validation failed, first error: {0}\n{1}".format(
- errors[0].exception, errors[0].backtrace
- ))
-
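-    # For reference, a hedged sketch of the CLI invocations that the recovery
-    # steps in _rebuild_metadata presumably correspond to (the test wrappers
-    # shell out to the cephfs-* tools; "<data pool>" is a placeholder, not a
-    # literal argument):
-    RECOVERY_CLI_SKETCH = [
-        "cephfs-table-tool 0 reset session",
-        "cephfs-table-tool 0 reset snap",
-        "cephfs-table-tool 0 reset inode",
-        "cephfs-journal-tool journal reset --force",
-        "cephfs-data-scan init",
-        "cephfs-data-scan scan_extents <data pool>",
-        "cephfs-data-scan scan_inodes <data pool>",
-        "ceph mds repaired 0",
-    ]
-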
- def test_rebuild_simple(self):
- self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a))
-
- def test_rebuild_moved_file(self):
- self._rebuild_metadata(MovedFile(self.fs, self.mount_a))
-
- def test_rebuild_backtraceless(self):
- self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a))
-
- def test_rebuild_moved_dir(self):
- self._rebuild_metadata(MovedDir(self.fs, self.mount_a))
-
- def test_rebuild_missing_zeroth(self):
- self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a))
-
- def test_rebuild_nondefault_layout(self):
- self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a))
-
- def test_stashed_layout(self):
- self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))
-
- def _dirfrag_keys(self, object_id):
- keys_str = self.fs.rados(["listomapkeys", object_id])
- if keys_str:
- return keys_str.split("\n")
- else:
- return []
-
- def test_fragmented_injection(self):
- """
- That when injecting a dentry into a fragmented directory, we put it in the right fragment.
- """
-
- self.fs.mon_manager.raw_cluster_cmd("mds", "set", "allow_dirfrags", "true",
- "--yes-i-really-mean-it")
-
- file_count = 100
- file_names = ["%s" % n for n in range(0, file_count)]
-
- # Create a directory of `file_count` files, each named after its
- # decimal number and containing the string of its decimal number
- self.mount_a.run_python(dedent("""
- import os
- path = os.path.join("{path}", "subdir")
- os.mkdir(path)
- for n in range(0, {file_count}):
- open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
- """.format(
- path=self.mount_a.mountpoint,
- file_count=file_count
- )))
-
- dir_ino = self.mount_a.path_to_ino("subdir")
-
- # Only one MDS should be active!
- self.assertEqual(len(self.fs.get_active_names()), 1)
-
- # Ensure that one directory is fragmented
- mds_id = self.fs.get_active_names()[0]
- self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id)
-
- # Flush journal and stop MDS
- self.mount_a.umount_wait()
- self.fs.mds_asok(["flush", "journal"], mds_id)
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- # Pick a dentry and wipe out its key
- # Because I did a 1 bit split, I know one frag will be named <inode>.01000000
- frag_obj_id = "{0:x}.01000000".format(dir_ino)
- keys = self._dirfrag_keys(frag_obj_id)
- victim_key = keys[7] # arbitrary choice
- log.info("victim_key={0}".format(victim_key))
- victim_dentry = victim_key.split("_head")[0]
- self.fs.rados(["rmomapkey", frag_obj_id, victim_key])
-
- # Start filesystem back up, observe that the file appears to be gone in an `ls`
- self.fs.mds_restart()
- self.fs.wait_for_daemons()
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
- self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry]))))
-
- # Stop the filesystem
- self.mount_a.umount_wait()
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- # Run data-scan, observe that it inserts our dentry back into the correct fragment
- # by checking the omap now has the dentry's key again
- self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
- self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()])
- self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id))
-
- # Start the filesystem and check that the dentry we deleted is now once again visible
- # and points to the correct file data.
- self.fs.mds_restart()
- self.fs.wait_for_daemons()
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
- self.assertEqual(out, victim_dentry)
-
- # Finally, close the loop by checking our injected dentry survives a merge
- mds_id = self.fs.get_active_names()[0]
- self.mount_a.ls("subdir") # Do an ls to ensure both frags are in cache so the merge will work
- self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id)
- self.fs.mds_asok(["flush", "journal"], mds_id)
- frag_obj_id = "{0:x}.00000000".format(dir_ino)
- keys = self._dirfrag_keys(frag_obj_id)
- self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names]))
-
- @for_teuthology
- def test_parallel_execution(self):
- self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7)
-
- def test_pg_files(self):
- """
- That the pg files command tells us which files are associated with
- a particular PG
- """
- file_count = 20
- self.mount_a.run_shell(["mkdir", "mydir"])
- self.mount_a.create_n_files("mydir/myfile", file_count)
-
- # Some files elsewhere in the system that we will ignore
- # to check that the tool is filtering properly
- self.mount_a.run_shell(["mkdir", "otherdir"])
- self.mount_a.create_n_files("otherdir/otherfile", file_count)
-
- pgs_to_files = defaultdict(list)
- # Rough (slow) reimplementation of the logic
- for i in range(0, file_count):
- file_path = "mydir/myfile_{0}".format(i)
- ino = self.mount_a.path_to_ino(file_path)
- obj = "{0:x}.{1:08x}".format(ino, 0)
- pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd(
- "osd", "map", self.fs.get_data_pool_name(), obj,
- "--format=json-pretty"
- ))['pgid']
- pgs_to_files[pgid].append(file_path)
- log.info("{0}: {1}".format(file_path, pgid))
-
- pg_count = self.fs.get_pgs_per_fs_pool()
- for pg_n in range(0, pg_count):
- pg_str = "{0}.{1}".format(self.fs.get_data_pool_id(), pg_n)
- out = self.fs.data_scan(["pg_files", "mydir", pg_str])
- lines = [l for l in out.split("\n") if l]
- log.info("{0}: {1}".format(pg_str, lines))
- self.assertSetEqual(set(lines), set(pgs_to_files[pg_str]))
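-
-    def _example_data_object_names(self, ino, size_bytes, object_size=4194304):
-        # A hedged sketch, not used by the tests here: a file's data is stored
-        # in RADOS objects named "<ino hex>.<object index in 8 hex digits>",
-        # which is how test_pg_files derives the 0th object above. The 4 MiB
-        # default object_size is an assumption; real files use the object size
-        # from their layout.
-        count = max(1, (size_bytes + object_size - 1) // object_size)
-        return ["{0:x}.{1:08x}".format(ino, n) for n in range(count)]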
-
- def test_scan_links(self):
- """
- The scan_links command fixes linkage errors
- """
- self.mount_a.run_shell(["mkdir", "testdir1"])
- self.mount_a.run_shell(["mkdir", "testdir2"])
- dir1_ino = self.mount_a.path_to_ino("testdir1")
- dir2_ino = self.mount_a.path_to_ino("testdir2")
- dirfrag1_oid = "{0:x}.00000000".format(dir1_ino)
- dirfrag2_oid = "{0:x}.00000000".format(dir2_ino)
-
- self.mount_a.run_shell(["touch", "testdir1/file1"])
- self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"])
- self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"])
-
- mds_id = self.fs.get_active_names()[0]
- self.fs.mds_asok(["flush", "journal"], mds_id)
-
- dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid)
-
- # introduce duplicated primary link
- file1_key = "file1_head"
- self.assertIn(file1_key, dirfrag1_keys)
- file1_omap_data = self.fs.rados(["getomapval", dirfrag1_oid, file1_key, '-'])
- self.fs.rados(["setomapval", dirfrag2_oid, file1_key], stdin_data=file1_omap_data)
- self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
-
- # remove a remote link, make inode link count incorrect
- link1_key = 'link1_head'
- self.assertIn(link1_key, dirfrag1_keys)
- self.fs.rados(["rmomapkey", dirfrag1_oid, link1_key])
-
- # increase good primary link's version
- self.mount_a.run_shell(["touch", "testdir1/file1"])
- self.mount_a.umount_wait()
-
- self.fs.mds_asok(["flush", "journal"], mds_id)
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- # repair linkage errors
- self.fs.data_scan(["scan_links"])
-
- # primary link in testdir2 was deleted?
- self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
-
- self.fs.mds_restart()
- self.fs.wait_for_daemons()
-
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- # link count was adjusted?
- file1_nlink = self.mount_a.path_to_nlink("testdir1/file1")
- self.assertEqual(file1_nlink, 2)
+++ /dev/null
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-import random
-import os
-
-class TestDumpTree(CephFSTestCase):
- def get_paths_to_ino(self):
- inos = {}
- p = self.mount_a.run_shell(["find", "./"])
- paths = p.stdout.getvalue().strip().split()
- for path in paths:
- inos[path] = self.mount_a.path_to_ino(path, False)
-
- return inos
-
- def populate(self):
- self.mount_a.run_shell(["git", "clone",
- "https://github.com/ceph/ceph-qa-suite"])
-
- def test_basic(self):
- self.mount_a.run_shell(["mkdir", "parent"])
- self.mount_a.run_shell(["mkdir", "parent/child"])
- self.mount_a.run_shell(["touch", "parent/child/file"])
- self.mount_a.run_shell(["mkdir", "parent/child/grandchild"])
- self.mount_a.run_shell(["touch", "parent/child/grandchild/file"])
-
- inos = self.get_paths_to_ino()
- tree = self.fs.mds_asok(["dump", "tree", "/parent/child", "1"])
-
- target_inos = [inos["./parent/child"], inos["./parent/child/file"],
- inos["./parent/child/grandchild"]]
-
- for ino in tree:
- del target_inos[target_inos.index(ino['ino'])] # don't catch!
-
- assert(len(target_inos) == 0)
-
- def test_random(self):
- random.seed(0)
-
- self.populate()
- inos = self.get_paths_to_ino()
- target = random.choice(inos.keys())
-
- if target != "./":
- target = os.path.dirname(target)
-
- subtree = [path for path in inos.keys() if path.startswith(target)]
- target_inos = [inos[path] for path in subtree]
- tree = self.fs.mds_asok(["dump", "tree", target[1:]])
-
- for ino in tree:
- del target_inos[target_inos.index(ino['ino'])] # don't catch!
-
- assert(len(target_inos) == 0)
-
- target_depth = target.count('/')
- maxdepth = max([path.count('/') for path in subtree]) - target_depth
- depth = random.randint(0, maxdepth)
- target_inos = [inos[path] for path in subtree \
- if path.count('/') <= depth + target_depth]
- tree = self.fs.mds_asok(["dump", "tree", target[1:], str(depth)])
-
- for ino in tree:
- del target_inos[target_inos.index(ino['ino'])] # don't catch!
-
- assert(len(target_inos) == 0)
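-
-
-# A hedged sketch of the shape these tests assume for the "dump tree" admin
-# socket output: a JSON list of inode records, each carrying at least an
-# "ino" field. The values and the extra "path" field below are illustrative
-# only.
-EXAMPLE_DUMP_TREE_OUTPUT = [
-    {"ino": 1099511627776, "path": "/parent/child"},
-    {"ino": 1099511627777, "path": "/parent/child/file"},
-]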
+++ /dev/null
-import json
-import logging
-from unittest import case, SkipTest
-
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-from teuthology.exceptions import CommandFailedError
-from tasks.ceph_manager import CephManager
-from teuthology import misc as teuthology
-from tasks.cephfs.fuse_mount import FuseMount
-
-log = logging.getLogger(__name__)
-
-
-class TestFailover(CephFSTestCase):
- CLIENTS_REQUIRED = 1
- MDSS_REQUIRED = 2
-
- def test_simple(self):
- """
- That when the active MDS is killed, a standby MDS is promoted into
- its rank after the grace period.
-
- This is just a simple unit test, the harder cases are covered
- in thrashing tests.
- """
-
- # Need all my standbys up as well as the active daemons
- self.wait_for_daemon_start()
-
- (original_active, ) = self.fs.get_active_names()
- original_standbys = self.mds_cluster.get_standby_daemons()
-
- # Kill the rank 0 daemon's physical process
- self.fs.mds_stop(original_active)
-
- grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
- # Wait until the monitor promotes his replacement
- def promoted():
- active = self.fs.get_active_names()
- return active and active[0] in original_standbys
-
- log.info("Waiting for promotion of one of the original standbys {0}".format(
- original_standbys))
- self.wait_until_true(
- promoted,
- timeout=grace*2)
-
- # Start the original rank 0 daemon up again, see that he becomes a standby
- self.fs.mds_restart(original_active)
- self.wait_until_true(
- lambda: original_active in self.mds_cluster.get_standby_daemons(),
- timeout=60 # Approximately long enough for MDS to start and mon to notice
- )
-
- def test_client_abort(self):
- """
- That a client will respect fuse_require_active_mds and error out
- when the cluster appears to be unavailable.
- """
-
- if not isinstance(self.mount_a, FuseMount):
- raise SkipTest("Requires FUSE client to inject client metadata")
-
- require_active = self.fs.get_config("fuse_require_active_mds", service_type="mon").lower() == "true"
- if not require_active:
- raise case.SkipTest("fuse_require_active_mds is not set")
-
- grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
- # Check it's not laggy to begin with
- (original_active, ) = self.fs.get_active_names()
- self.assertNotIn("laggy_since", self.fs.mon_manager.get_mds_status(original_active))
-
- self.mounts[0].umount_wait()
-
- # Control: that we can mount and unmount usually, while the cluster is healthy
- self.mounts[0].mount()
- self.mounts[0].wait_until_mounted()
- self.mounts[0].umount_wait()
-
- # Stop the daemon processes
- self.fs.mds_stop()
-
- # Wait for everyone to go laggy
- def laggy():
- mdsmap = self.fs.get_mds_map()
- for info in mdsmap['info'].values():
- if "laggy_since" not in info:
- return False
-
- return True
-
- self.wait_until_true(laggy, grace * 2)
- with self.assertRaises(CommandFailedError):
- self.mounts[0].mount()
-
-
-class TestStandbyReplay(CephFSTestCase):
- MDSS_REQUIRED = 4
- REQUIRE_FILESYSTEM = False
-
- def set_standby_for(self, leader, follower, replay):
- self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
- if replay:
- self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")
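-
-    # A hedged illustration (not read by the tests) of the ceph.conf entries
-    # that set_standby_for("a", "b", replay=True) writes via self.set_conf;
-    # the daemon names are placeholders.
-    EXAMPLE_STANDBY_CONF = """
-    [mds.b]
-    mds_standby_for_name = a
-    mds_standby_replay = true
-    """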
-
- def get_info_by_name(self, mds_name):
- status = self.mds_cluster.status()
- info = status.get_mds(mds_name)
- if info is None:
- log.warn(str(status))
- raise RuntimeError("MDS '{0}' not found".format(mds_name))
- else:
- return info
-
- def test_standby_replay_unused(self):
- # Pick out exactly 3 daemons to be run during test
- use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
- mds_a, mds_b, mds_c = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- # B and C should both follow A, but only one will
- # really get into standby replay state.
- self.set_standby_for(mds_a, mds_b, True)
- self.set_standby_for(mds_a, mds_c, True)
-
- # Create FS and start A
- fs_a = self.mds_cluster.newfs("alpha")
- self.mds_cluster.mds_restart(mds_a)
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_a])
-
- # Start B, he should go into standby replay
- self.mds_cluster.mds_restart(mds_b)
- self.wait_for_daemon_start([mds_b])
- info_b = self.get_info_by_name(mds_b)
- self.assertEqual(info_b['state'], "up:standby-replay")
- self.assertEqual(info_b['standby_for_name'], mds_a)
- self.assertEqual(info_b['rank'], 0)
-
- # Start C, he should go into standby (*not* replay)
- self.mds_cluster.mds_restart(mds_c)
- self.wait_for_daemon_start([mds_c])
- info_c = self.get_info_by_name(mds_c)
- self.assertEqual(info_c['state'], "up:standby")
- self.assertEqual(info_c['standby_for_name'], mds_a)
- self.assertEqual(info_c['rank'], -1)
-
- # Kill B, C should go into standby replay
- self.mds_cluster.mds_stop(mds_b)
- self.mds_cluster.mds_fail(mds_b)
- self.wait_until_equal(
- lambda: self.get_info_by_name(mds_c)['state'],
- "up:standby-replay",
- 60)
- info_c = self.get_info_by_name(mds_c)
- self.assertEqual(info_c['state'], "up:standby-replay")
- self.assertEqual(info_c['standby_for_name'], mds_a)
- self.assertEqual(info_c['rank'], 0)
-
- def test_standby_failure(self):
- """
- That the failure of a standby-replay daemon happens cleanly
- and doesn't interrupt anything else.
- """
- # Pick out exactly 2 daemons to be run during test
- use_daemons = sorted(self.mds_cluster.mds_ids[0:2])
- mds_a, mds_b = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
-        # Configure a pair of MDSs that are standby for each other
- self.set_standby_for(mds_a, mds_b, True)
- self.set_standby_for(mds_b, mds_a, False)
-
- # Create FS alpha and get mds_a to come up as active
- fs_a = self.mds_cluster.newfs("alpha")
- self.mds_cluster.mds_restart(mds_a)
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_a])
-
- # Start the standbys
- self.mds_cluster.mds_restart(mds_b)
- self.wait_for_daemon_start([mds_b])
-
- # See the standby come up as the correct rank
- info_b = self.get_info_by_name(mds_b)
- self.assertEqual(info_b['state'], "up:standby-replay")
- self.assertEqual(info_b['standby_for_name'], mds_a)
- self.assertEqual(info_b['rank'], 0)
-
- # Kill the standby
- self.mds_cluster.mds_stop(mds_b)
- self.mds_cluster.mds_fail(mds_b)
-
- # See that the standby is gone and the active remains
- self.assertEqual(fs_a.get_active_names(), [mds_a])
- mds_map = fs_a.get_mds_map()
- self.assertEqual(len(mds_map['info']), 1)
- self.assertEqual(mds_map['failed'], [])
- self.assertEqual(mds_map['damaged'], [])
- self.assertEqual(mds_map['stopped'], [])
-
- def test_rank_stopped(self):
- """
- That when a rank is STOPPED, standby replays for
- that rank get torn down
- """
-        # Pick out exactly 4 daemons to be run during test
- use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
- mds_a, mds_b, mds_a_s, mds_b_s = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- # a and b both get a standby
- self.set_standby_for(mds_a, mds_a_s, True)
- self.set_standby_for(mds_b, mds_b_s, True)
-
- # Create FS alpha and get mds_a to come up as active
- fs_a = self.mds_cluster.newfs("alpha")
- fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name,
- 'allow_multimds', "true",
- "--yes-i-really-mean-it")
- fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name, 'max_mds', "2")
-
- self.mds_cluster.mds_restart(mds_a)
- self.wait_until_equal(lambda: fs_a.get_active_names(), [mds_a], 30)
- self.mds_cluster.mds_restart(mds_b)
- fs_a.wait_for_daemons()
- self.assertEqual(sorted(fs_a.get_active_names()), [mds_a, mds_b])
-
- # Start the standbys
- self.mds_cluster.mds_restart(mds_b_s)
- self.wait_for_daemon_start([mds_b_s])
- self.mds_cluster.mds_restart(mds_a_s)
- self.wait_for_daemon_start([mds_a_s])
- info_b_s = self.get_info_by_name(mds_b_s)
- self.assertEqual(info_b_s['state'], "up:standby-replay")
- info_a_s = self.get_info_by_name(mds_a_s)
- self.assertEqual(info_a_s['state'], "up:standby-replay")
-
- # Shrink the cluster
- fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name, 'max_mds', "1")
- fs_a.mon_manager.raw_cluster_cmd("mds", "stop", "{0}:1".format(fs_a.name))
- self.wait_until_equal(
- lambda: fs_a.get_active_names(), [mds_a],
- 60
- )
-
- # Both 'b' and 'b_s' should go back to being standbys
- self.wait_until_equal(
- lambda: self.mds_cluster.get_standby_daemons(), {mds_b, mds_b_s},
- 60
- )
-
-
-class TestMultiFilesystems(CephFSTestCase):
- CLIENTS_REQUIRED = 2
- MDSS_REQUIRED = 4
-
- # We'll create our own filesystems and start our own daemons
- REQUIRE_FILESYSTEM = False
-
- def setUp(self):
- super(TestMultiFilesystems, self).setUp()
- self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
- "enable_multiple", "true",
- "--yes-i-really-mean-it")
-
- def _setup_two(self):
- fs_a = self.mds_cluster.newfs("alpha")
- fs_b = self.mds_cluster.newfs("bravo")
-
- self.mds_cluster.mds_restart()
-
- # Wait for both filesystems to go healthy
- fs_a.wait_for_daemons()
- fs_b.wait_for_daemons()
-
- # Reconfigure client auth caps
- for mount in self.mounts:
- self.mds_cluster.mon_manager.raw_cluster_cmd_result(
- 'auth', 'caps', "client.{0}".format(mount.client_id),
- 'mds', 'allow',
- 'mon', 'allow r',
- 'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
- fs_a.get_data_pool_name(), fs_b.get_data_pool_name()))
-
- return fs_a, fs_b
-
- def test_clients(self):
- fs_a, fs_b = self._setup_two()
-
- # Mount a client on fs_a
- self.mount_a.mount(mount_fs_name=fs_a.name)
- self.mount_a.write_n_mb("pad.bin", 1)
- self.mount_a.write_n_mb("test.bin", 2)
- a_created_ino = self.mount_a.path_to_ino("test.bin")
- self.mount_a.create_files()
-
- # Mount a client on fs_b
- self.mount_b.mount(mount_fs_name=fs_b.name)
- self.mount_b.write_n_mb("test.bin", 1)
- b_created_ino = self.mount_b.path_to_ino("test.bin")
- self.mount_b.create_files()
-
- # Check that a non-default filesystem mount survives an MDS
- # failover (i.e. that map subscription is continuous, not
- # just the first time), reproduces #16022
- old_fs_b_mds = fs_b.get_active_names()[0]
- self.mds_cluster.mds_stop(old_fs_b_mds)
- self.mds_cluster.mds_fail(old_fs_b_mds)
- fs_b.wait_for_daemons()
- background = self.mount_b.write_background()
- # Raise exception if the write doesn't finish (i.e. if client
- # has not kept up with MDS failure)
- try:
- self.wait_until_true(lambda: background.finished, timeout=30)
- except RuntimeError:
- # The mount is stuck, we'll have to force it to fail cleanly
- background.stdin.close()
- self.mount_b.umount_wait(force=True)
- raise
-
- self.mount_a.umount_wait()
- self.mount_b.umount_wait()
-
- # See that the client's files went into the correct pool
- self.assertTrue(fs_a.data_objects_present(a_created_ino, 1024 * 1024))
- self.assertTrue(fs_b.data_objects_present(b_created_ino, 1024 * 1024))
-
- def test_standby(self):
- fs_a, fs_b = self._setup_two()
-
-        # Find the active daemon of each filesystem; the remaining two
-        # MDS daemons should be the standbys
- a_daemons = fs_a.get_active_names()
- b_daemons = fs_b.get_active_names()
- self.assertEqual(len(a_daemons), 1)
- self.assertEqual(len(b_daemons), 1)
- original_a = a_daemons[0]
- original_b = b_daemons[0]
- expect_standby_daemons = set(self.mds_cluster.mds_ids) - (set(a_daemons) | set(b_daemons))
-
- # Need all my standbys up as well as the active daemons
- self.wait_for_daemon_start()
- self.assertEqual(expect_standby_daemons, self.mds_cluster.get_standby_daemons())
-
- # Kill fs_a's active MDS, see a standby take over
- self.mds_cluster.mds_stop(original_a)
- self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_a)
- self.wait_until_equal(lambda: len(fs_a.get_active_names()), 1, 30,
- reject_fn=lambda v: v > 1)
- # Assert that it's a *different* daemon that has now appeared in the map for fs_a
- self.assertNotEqual(fs_a.get_active_names()[0], original_a)
-
- # Kill fs_b's active MDS, see a standby take over
- self.mds_cluster.mds_stop(original_b)
- self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_b)
- self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
- reject_fn=lambda v: v > 1)
-        # Assert that it's a *different* daemon that has now appeared in the map for fs_b
- self.assertNotEqual(fs_b.get_active_names()[0], original_b)
-
- # Both of the original active daemons should be gone, and all standbys used up
- self.assertEqual(self.mds_cluster.get_standby_daemons(), set())
-
- # Restart the ones I killed, see them reappear as standbys
- self.mds_cluster.mds_restart(original_a)
- self.mds_cluster.mds_restart(original_b)
- self.wait_until_true(
- lambda: {original_a, original_b} == self.mds_cluster.get_standby_daemons(),
- timeout=30
- )
-
- def test_grow_shrink(self):
- # Usual setup...
- fs_a, fs_b = self._setup_two()
- fs_a.mon_manager.raw_cluster_cmd("fs", "set", fs_a.name,
- "allow_multimds", "true",
- "--yes-i-really-mean-it")
-
- fs_b.mon_manager.raw_cluster_cmd("fs", "set", fs_b.name,
- "allow_multimds", "true",
- "--yes-i-really-mean-it")
-
- # Increase max_mds on fs_b, see a standby take up the role
- fs_b.mon_manager.raw_cluster_cmd('fs', 'set', fs_b.name, 'max_mds', "2")
- self.wait_until_equal(lambda: len(fs_b.get_active_names()), 2, 30,
- reject_fn=lambda v: v > 2 or v < 1)
-
- # Increase max_mds on fs_a, see a standby take up the role
- fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name, 'max_mds', "2")
- self.wait_until_equal(lambda: len(fs_a.get_active_names()), 2, 30,
- reject_fn=lambda v: v > 2 or v < 1)
-
- # Shrink fs_b back to 1, see a daemon go back to standby
- fs_b.mon_manager.raw_cluster_cmd('fs', 'set', fs_b.name, 'max_mds', "1")
- fs_b.mon_manager.raw_cluster_cmd('mds', 'deactivate', "{0}:1".format(fs_b.name))
- self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
- reject_fn=lambda v: v > 2 or v < 1)
-
- # Grow fs_a up to 3, see the former fs_b daemon join it.
- fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name, 'max_mds', "3")
- self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60,
- reject_fn=lambda v: v > 3 or v < 2)
-
- def test_standby_for_name(self):
- # Pick out exactly 4 daemons to be run during test
- use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
- mds_a, mds_b, mds_c, mds_d = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- def set_standby_for(leader, follower, replay):
- self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
- if replay:
- self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")
-
- # Configure two pairs of MDSs that are standby for each other
- set_standby_for(mds_a, mds_b, True)
- set_standby_for(mds_b, mds_a, False)
- set_standby_for(mds_c, mds_d, True)
- set_standby_for(mds_d, mds_c, False)
-
- # Create FS alpha and get mds_a to come up as active
- fs_a = self.mds_cluster.newfs("alpha")
- self.mds_cluster.mds_restart(mds_a)
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_a])
-
- # Create FS bravo and get mds_c to come up as active
- fs_b = self.mds_cluster.newfs("bravo")
- self.mds_cluster.mds_restart(mds_c)
- fs_b.wait_for_daemons()
- self.assertEqual(fs_b.get_active_names(), [mds_c])
-
- # Start the standbys
- self.mds_cluster.mds_restart(mds_b)
- self.mds_cluster.mds_restart(mds_d)
- self.wait_for_daemon_start([mds_b, mds_d])
-
- def get_info_by_name(fs, mds_name):
- mds_map = fs.get_mds_map()
- for gid_str, info in mds_map['info'].items():
- if info['name'] == mds_name:
- return info
-
- log.warn(json.dumps(mds_map, indent=2))
- raise RuntimeError("MDS '{0}' not found in filesystem MDSMap".format(mds_name))
-
- # See both standbys come up as standby replay for the correct ranks
- # mds_b should be in filesystem alpha following mds_a
- info_b = get_info_by_name(fs_a, mds_b)
- self.assertEqual(info_b['state'], "up:standby-replay")
- self.assertEqual(info_b['standby_for_name'], mds_a)
- self.assertEqual(info_b['rank'], 0)
-        # mds_d should be in filesystem bravo following mds_c
- info_d = get_info_by_name(fs_b, mds_d)
- self.assertEqual(info_d['state'], "up:standby-replay")
- self.assertEqual(info_d['standby_for_name'], mds_c)
- self.assertEqual(info_d['rank'], 0)
-
- # Kill both active daemons
- self.mds_cluster.mds_stop(mds_a)
- self.mds_cluster.mds_fail(mds_a)
- self.mds_cluster.mds_stop(mds_c)
- self.mds_cluster.mds_fail(mds_c)
-
- # Wait for standbys to take over
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_b])
- fs_b.wait_for_daemons()
- self.assertEqual(fs_b.get_active_names(), [mds_d])
-
- # Start the original active daemons up again
- self.mds_cluster.mds_restart(mds_a)
- self.mds_cluster.mds_restart(mds_c)
- self.wait_for_daemon_start([mds_a, mds_c])
-
- self.assertEqual(set(self.mds_cluster.get_standby_daemons()),
- {mds_a, mds_c})
-
- def test_standby_for_rank(self):
- use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
- mds_a, mds_b, mds_c, mds_d = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- def set_standby_for(leader_rank, leader_fs, follower_id):
- self.set_conf("mds.{0}".format(follower_id),
- "mds_standby_for_rank", leader_rank)
-
- fscid = leader_fs.get_namespace_id()
- self.set_conf("mds.{0}".format(follower_id),
- "mds_standby_for_fscid", fscid)
-
- fs_a = self.mds_cluster.newfs("alpha")
- fs_b = self.mds_cluster.newfs("bravo")
- set_standby_for(0, fs_a, mds_a)
- set_standby_for(0, fs_a, mds_b)
- set_standby_for(0, fs_b, mds_c)
- set_standby_for(0, fs_b, mds_d)
-
- self.mds_cluster.mds_restart(mds_a)
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_a])
-
- self.mds_cluster.mds_restart(mds_c)
- fs_b.wait_for_daemons()
- self.assertEqual(fs_b.get_active_names(), [mds_c])
-
- self.mds_cluster.mds_restart(mds_b)
- self.mds_cluster.mds_restart(mds_d)
- self.wait_for_daemon_start([mds_b, mds_d])
-
- self.mds_cluster.mds_stop(mds_a)
- self.mds_cluster.mds_fail(mds_a)
- self.mds_cluster.mds_stop(mds_c)
- self.mds_cluster.mds_fail(mds_c)
-
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_b])
- fs_b.wait_for_daemons()
- self.assertEqual(fs_b.get_active_names(), [mds_d])
-
- def test_standby_for_fscid(self):
- """
- That I can set a standby FSCID with no rank, and the result is
- that daemons join any rank for that filesystem.
- """
- use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
- mds_a, mds_b, mds_c, mds_d = use_daemons
-
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- def set_standby_for(leader_fs, follower_id):
- fscid = leader_fs.get_namespace_id()
- self.set_conf("mds.{0}".format(follower_id),
- "mds_standby_for_fscid", fscid)
-
- # Create two filesystems which should have two ranks each
- fs_a = self.mds_cluster.newfs("alpha")
- fs_a.mon_manager.raw_cluster_cmd("fs", "set", fs_a.name,
- "allow_multimds", "true",
- "--yes-i-really-mean-it")
-
- fs_b = self.mds_cluster.newfs("bravo")
- fs_b.mon_manager.raw_cluster_cmd("fs", "set", fs_b.name,
- "allow_multimds", "true",
- "--yes-i-really-mean-it")
-
- fs_a.mon_manager.raw_cluster_cmd('fs', 'set', fs_a.name,
- 'max_mds', "2")
- fs_b.mon_manager.raw_cluster_cmd('fs', 'set', fs_b.name,
- 'max_mds', "2")
-
- # Set all the daemons to have a FSCID assignment but no other
- # standby preferences.
- set_standby_for(fs_a, mds_a)
- set_standby_for(fs_a, mds_b)
- set_standby_for(fs_b, mds_c)
- set_standby_for(fs_b, mds_d)
-
- # Now when we start all daemons at once, they should fall into
- # ranks in the right filesystem
- self.mds_cluster.mds_restart(mds_a)
- self.mds_cluster.mds_restart(mds_b)
- self.mds_cluster.mds_restart(mds_c)
- self.mds_cluster.mds_restart(mds_d)
- self.wait_for_daemon_start([mds_a, mds_b, mds_c, mds_d])
- fs_a.wait_for_daemons()
- fs_b.wait_for_daemons()
- self.assertEqual(set(fs_a.get_active_names()), {mds_a, mds_b})
- self.assertEqual(set(fs_b.get_active_names()), {mds_c, mds_d})
-
- def test_standby_for_invalid_fscid(self):
-        # Set an invalid standby_for_fscid on one MDS while the others use
-        # standby_for_rank; stopping the active MDS service should not
-        # end up crashing the mons
-
- # Get configured mons in the cluster
- first_mon = teuthology.get_first_mon(self.ctx, self.configs_set)
- (mon,) = self.ctx.cluster.only(first_mon).remotes.iterkeys()
- manager = CephManager(
- mon,
- ctx=self.ctx,
- logger=log.getChild('ceph_manager'),
- )
- configured_mons = manager.get_mon_quorum()
-
- use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
- mds_a, mds_b, mds_c = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- def set_standby_for_rank(leader_rank, follower_id):
- self.set_conf("mds.{0}".format(follower_id),
- "mds_standby_for_rank", leader_rank)
-
- # Create one fs
- fs_a = self.mds_cluster.newfs("cephfs")
-
-        # Give the first two daemons a rank assignment but no other
-        # standby preferences.
- set_standby_for_rank(0, mds_a)
- set_standby_for_rank(0, mds_b)
-
- # Set third daemon to have invalid fscid assignment and no other
- # standby preferences
- invalid_fscid = 123
- self.set_conf("mds.{0}".format(mds_c), "mds_standby_for_fscid", invalid_fscid)
-
-        # Restart all the daemons so the standby preferences are applied
- self.mds_cluster.mds_restart(mds_a)
- self.mds_cluster.mds_restart(mds_b)
- self.mds_cluster.mds_restart(mds_c)
- self.wait_for_daemon_start([mds_a, mds_b, mds_c])
-
-        # Stop the active MDS daemon of the fs
-        if fs_a.get_active_names() == [mds_a]:
- self.mds_cluster.mds_stop(mds_a)
- self.mds_cluster.mds_fail(mds_a)
- fs_a.wait_for_daemons()
- else:
- self.mds_cluster.mds_stop(mds_b)
- self.mds_cluster.mds_fail(mds_b)
- fs_a.wait_for_daemons()
-
-        # Get active mons from the cluster
-        active_mons = manager.get_mon_quorum()
-
-        # Check that the active quorum matches the originally configured mons
-        self.assertEqual(active_mons, configured_mons,
-                         "Not all mons are in quorum; invalid standby fscid test failed!")
+++ /dev/null
-
-from textwrap import dedent
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
-
-
-class TestFlush(CephFSTestCase):
- def test_flush(self):
- self.mount_a.run_shell(["mkdir", "mydir"])
- self.mount_a.run_shell(["touch", "mydir/alpha"])
- dir_ino = self.mount_a.path_to_ino("mydir")
- file_ino = self.mount_a.path_to_ino("mydir/alpha")
-
- # Unmount the client so that it isn't still holding caps
- self.mount_a.umount_wait()
-
- # Before flush, the dirfrag object does not exist
- with self.assertRaises(ObjectNotFound):
- self.fs.list_dirfrag(dir_ino)
-
- # Before flush, the file's backtrace has not been written
- with self.assertRaises(ObjectNotFound):
- self.fs.read_backtrace(file_ino)
-
- # Before flush, there are no dentries in the root
- self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
-
- # Execute flush
- flush_data = self.fs.mds_asok(["flush", "journal"])
- self.assertEqual(flush_data['return_code'], 0)
-
- # After flush, the dirfrag object has been created
- dir_list = self.fs.list_dirfrag(dir_ino)
- self.assertEqual(dir_list, ["alpha_head"])
-
- # And the 'mydir' dentry is in the root
- self.assertEqual(self.fs.list_dirfrag(ROOT_INO), ['mydir_head'])
-
- # ...and the data object has its backtrace
- backtrace = self.fs.read_backtrace(file_ino)
- self.assertEqual(['alpha', 'mydir'], [a['dname'] for a in backtrace['ancestors']])
- self.assertEqual([dir_ino, 1], [a['dirino'] for a in backtrace['ancestors']])
- self.assertEqual(file_ino, backtrace['ino'])
-
- # ...and the journal is truncated to just a single subtreemap from the
- # newly created segment
- summary_output = self.fs.journal_tool(["event", "get", "summary"])
- try:
- self.assertEqual(summary_output,
- dedent(
- """
- Events by type:
- SUBTREEMAP: 1
- Errors: 0
- """
- ).strip())
- except AssertionError:
- # In some states, flushing the journal will leave you
- # an extra event from locks a client held. This is
- # correct behaviour: the MDS is flushing the journal,
- # it's just that new events are getting added too.
- # In this case, we should nevertheless see a fully
- # empty journal after a second flush.
- self.assertEqual(summary_output,
- dedent(
- """
- Events by type:
- SUBTREEMAP: 1
- UPDATE: 1
- Errors: 0
- """
- ).strip())
- flush_data = self.fs.mds_asok(["flush", "journal"])
- self.assertEqual(flush_data['return_code'], 0)
- self.assertEqual(self.fs.journal_tool(["event", "get", "summary"]),
- dedent(
- """
- Events by type:
- SUBTREEMAP: 1
- Errors: 0
- """
- ).strip())
-
- # Now for deletion!
- # We will count the RADOS deletions and MDS file purges, to verify that
- # the expected behaviour is happening as a result of the purge
- initial_dels = self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete']
- initial_purges = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_purged']
-
- # Use a client to delete a file
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- self.mount_a.run_shell(["rm", "-rf", "mydir"])
-
- # Flush the journal so that the directory inode can be purged
- flush_data = self.fs.mds_asok(["flush", "journal"])
- self.assertEqual(flush_data['return_code'], 0)
-
-        # We expect to see the file and its directory get purged
- self.wait_until_true(
- lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_purged'] - initial_purges >= 2,
- 60)
-
- # We expect two deletions, one of the dirfrag and one of the backtrace
- self.wait_until_true(
- lambda: self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete'] - initial_dels >= 2,
- 60) # timeout is fairly long to allow for tick+rados latencies
-
- with self.assertRaises(ObjectNotFound):
- self.fs.list_dirfrag(dir_ino)
- with self.assertRaises(ObjectNotFound):
- self.fs.read_backtrace(file_ino)
- self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
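-
-
-# A hedged sketch (matching the assertions above) of the decoded backtrace for
-# "mydir/alpha": the ancestors list runs from the file's own dentry up to the
-# root directory (ino 1). The inode numbers here are illustrative placeholders.
-EXAMPLE_BACKTRACE = {
-    "ino": 0x10000000001,
-    "ancestors": [
-        {"dname": "alpha", "dirino": 0x10000000000},
-        {"dname": "mydir", "dirino": 1},
-    ],
-}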
+++ /dev/null
-
-"""
-Test that the forward scrub functionality can traverse metadata and apply
-requested tags, on well formed metadata.
-
-This is *not* the real testing for forward scrub, which will need to test
-how the functionality responds to damaged metadata.
-
-"""
-import json
-
-import logging
-from collections import namedtuple
-from textwrap import dedent
-
-from teuthology.orchestra.run import CommandFailedError
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-
-import struct
-
-log = logging.getLogger(__name__)
-
-
-ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
-
-
-class TestForwardScrub(CephFSTestCase):
- MDSS_REQUIRED = 1
-
- def _read_str_xattr(self, pool, obj, attr):
- """
- Read a ceph-encoded string from a rados xattr
- """
- output = self.fs.rados(["getxattr", obj, attr], pool=pool)
- strlen = struct.unpack('i', output[0:4])[0]
- return output[4:(4 + strlen)]
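-
-    def _encode_str_xattr(self, value):
-        """
-        Hedged counterpart to _read_str_xattr, shown for illustration only and
-        not used by the tests: ceph stores the string as a 32-bit little-endian
-        length prefix followed by the raw bytes, which is what the unpack above
-        consumes.
-        """
-        return struct.pack('<i', len(value)) + value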
-
- def _get_paths_to_ino(self):
- inos = {}
- p = self.mount_a.run_shell(["find", "./"])
- paths = p.stdout.getvalue().strip().split()
- for path in paths:
- inos[path] = self.mount_a.path_to_ino(path)
-
- return inos
-
- def test_apply_tag(self):
- self.mount_a.run_shell(["mkdir", "parentdir"])
- self.mount_a.run_shell(["mkdir", "parentdir/childdir"])
- self.mount_a.run_shell(["touch", "rfile"])
- self.mount_a.run_shell(["touch", "parentdir/pfile"])
- self.mount_a.run_shell(["touch", "parentdir/childdir/cfile"])
-
- # Build a structure mapping path to inode, as we will later want
- # to check object by object and objects are named after ino number
- inos = self._get_paths_to_ino()
-
- # Flush metadata: this is a friendly test of forward scrub so we're skipping
- # the part where it's meant to cope with dirty metadata
- self.mount_a.umount_wait()
- self.fs.mds_asok(["flush", "journal"])
-
- tag = "mytag"
-
- # Execute tagging forward scrub
- self.fs.mds_asok(["tag", "path", "/parentdir", tag])
- # Wait for completion
- import time
- time.sleep(10)
- # FIXME watching clog isn't a nice mechanism for this, once we have a ScrubMap we'll
- # watch that instead
-
- # Check that dirs were tagged
- for dirpath in ["./parentdir", "./parentdir/childdir"]:
- self.assertTagged(inos[dirpath], tag, self.fs.get_metadata_pool_name())
-
- # Check that files were tagged
- for filepath in ["./parentdir/pfile", "./parentdir/childdir/cfile"]:
- self.assertTagged(inos[filepath], tag, self.fs.get_data_pool_name())
-
- # This guy wasn't in the tag path, shouldn't have been tagged
- self.assertUntagged(inos["./rfile"])
-
- def assertUntagged(self, ino):
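- # A file's data objects are named "<inode number in hex>.<object index>";
- # the scrub tag is written as an xattr on the first object.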
- file_obj_name = "{0:x}.00000000".format(ino)
- with self.assertRaises(CommandFailedError):
- self._read_str_xattr(
- self.fs.get_data_pool_name(),
- file_obj_name,
- "scrub_tag"
- )
-
- def assertTagged(self, ino, tag, pool):
- file_obj_name = "{0:x}.00000000".format(ino)
- wrote = self._read_str_xattr(
- pool,
- file_obj_name,
- "scrub_tag"
- )
- self.assertEqual(wrote, tag)
-
- def _validate_linkage(self, expected):
- inos = self._get_paths_to_ino()
- try:
- self.assertDictEqual(inos, expected)
- except AssertionError:
- log.error("Expected: {0}".format(json.dumps(expected, indent=2)))
- log.error("Actual: {0}".format(json.dumps(inos, indent=2)))
- raise
-
- def test_orphan_scan(self):
- # Create some files whose metadata we will flush
- self.mount_a.run_python(dedent("""
- import os
- mount_point = "{mount_point}"
- parent = os.path.join(mount_point, "parent")
- os.mkdir(parent)
- flushed = os.path.join(parent, "flushed")
- os.mkdir(flushed)
- for f in ["alpha", "bravo", "charlie"]:
- open(os.path.join(flushed, f), 'w').write(f)
- """.format(mount_point=self.mount_a.mountpoint)))
-
- inos = self._get_paths_to_ino()
-
- # Flush journal
- # Umount before flush to avoid cap releases putting
- # things we don't want in the journal later.
- self.mount_a.umount_wait()
- self.fs.mds_asok(["flush", "journal"])
-
- # Create a new inode that's just in the log, i.e. would
- # look orphaned to a backward scan if the backward scan were not
- # respecting the scrub_tag xattr.
- self.mount_a.mount()
- self.mount_a.run_shell(["mkdir", "parent/unflushed"])
- self.mount_a.run_shell(["dd", "if=/dev/urandom",
- "of=./parent/unflushed/jfile",
- "bs=1M", "count=8"])
- inos["./parent/unflushed"] = self.mount_a.path_to_ino("./parent/unflushed")
- inos["./parent/unflushed/jfile"] = self.mount_a.path_to_ino("./parent/unflushed/jfile")
- self.mount_a.umount_wait()
-
- # Orphan an inode by deleting its dentry
- # Our victim will be.... bravo.
- self.mount_a.umount_wait()
- self.fs.mds_stop()
- self.fs.mds_fail()
- self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
- self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
- frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
- self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"])
-
- self.fs.mds_restart()
- self.fs.wait_for_daemons()
-
- # See that the orphaned file is indeed missing from a client's POV
- self.mount_a.mount()
- damaged_state = self._get_paths_to_ino()
- self.assertNotIn("./parent/flushed/bravo", damaged_state)
- self.mount_a.umount_wait()
-
- # Run a tagging forward scrub
- tag = "mytag123"
- self.fs.mds_asok(["tag", "path", "/parent", tag])
-
- # See that the orphan was not tagged
- self.assertUntagged(inos['./parent/flushed/bravo'])
-
- # See that the flushed-metadata-and-still-present files are tagged
- self.assertTagged(inos['./parent/flushed/alpha'], tag, self.fs.get_data_pool_name())
- self.assertTagged(inos['./parent/flushed/charlie'], tag, self.fs.get_data_pool_name())
-
- # See that journalled-but-not-flushed file *was* tagged
- self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
-
- # Run cephfs-data-scan targeting only orphans
- self.fs.mds_stop()
- self.fs.mds_fail()
- self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
- self.fs.data_scan([
- "scan_inodes",
- "--filter-tag", tag,
- self.fs.get_data_pool_name()
- ])
-
- # After in-place injection stats should be kosher again
- self.fs.set_ceph_conf('mds', 'mds verify scatter', True)
- self.fs.set_ceph_conf('mds', 'mds debug scatterstat', True)
-
- # And we should have all the same linkage we started with,
- # and no lost+found, and no extra inodes!
- self.fs.mds_restart()
- self.fs.wait_for_daemons()
- self.mount_a.mount()
- self._validate_linkage(inos)
-
- def _stash_inotable(self):
- # Get all active ranks
- ranks = self.fs.get_all_mds_rank()
-
- inotable_dict = {}
- for rank in ranks:
- inotable_oid = "mds{rank:d}_".format(rank=rank) + "inotable"
- print "Trying to fetch inotable object: " + inotable_oid
-
- #self.fs.get_metadata_object("InoTable", "mds0_inotable")
- inotable_raw = self.fs.get_metadata_object_raw(inotable_oid)
- inotable_dict[inotable_oid] = inotable_raw
- return inotable_dict
-
- def test_inotable_sync(self):
- self.mount_a.write_n_mb("file1_sixmegs", 6)
-
- # Flush journal
- self.mount_a.umount_wait()
- self.fs.mds_asok(["flush", "journal"])
-
- inotable_copy = self._stash_inotable()
-
- self.mount_a.mount()
-
- self.mount_a.write_n_mb("file2_sixmegs", 6)
- self.mount_a.write_n_mb("file3_sixmegs", 6)
-
- inos = self._get_paths_to_ino()
-
- # Flush journal
- self.mount_a.umount_wait()
- self.fs.mds_asok(["flush", "journal"])
-
- self.mount_a.umount_wait()
-
- with self.assert_cluster_log("inode table repaired", invert_match=True):
- self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
-
- self.mds_cluster.mds_stop()
- self.mds_cluster.mds_fail()
-
- # Truncate the journal (to ensure the inotable on disk
- # is all that will be in the InoTable in memory)
-
- self.fs.journal_tool(["event", "splice",
- "--inode={0}".format(inos["./file2_sixmegs"]), "summary"])
-
- self.fs.journal_tool(["event", "splice",
- "--inode={0}".format(inos["./file3_sixmegs"]), "summary"])
-
- # Revert to old inotable.
- for key, value in inotable_copy.iteritems():
- self.fs.put_metadata_object_raw(key, value)
-
- self.mds_cluster.mds_restart()
- self.fs.wait_for_daemons()
-
- with self.assert_cluster_log("inode table repaired"):
- self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
-
- self.mds_cluster.mds_stop()
- table_text = self.fs.table_tool(["0", "show", "inode"])
- table = json.loads(table_text)
- self.assertGreater(
- table['0']['data']['inotable']['free'][0]['start'],
- inos['./file3_sixmegs'])
-
- def test_backtrace_repair(self):
- """
- That the MDS can repair an inode's backtrace in the data pool
- if it is found to be damaged.
- """
- # Create a file for subsequent checks
- self.mount_a.run_shell(["mkdir", "parent_a"])
- self.mount_a.run_shell(["touch", "parent_a/alpha"])
- file_ino = self.mount_a.path_to_ino("parent_a/alpha")
-
- # That backtrace and layout are written after initial flush
- self.fs.mds_asok(["flush", "journal"])
- backtrace = self.fs.read_backtrace(file_ino)
- self.assertEqual(['alpha', 'parent_a'],
- [a['dname'] for a in backtrace['ancestors']])
-
- # Go corrupt the backtrace
- self.fs._write_data_xattr(file_ino, "parent",
- "oh i'm sorry did i overwrite your xattr?")
-
- with self.assert_cluster_log("bad backtrace on inode"):
- self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
- self.fs.mds_asok(["flush", "journal"])
- backtrace = self.fs.read_backtrace(file_ino)
- self.assertEqual(['alpha', 'parent_a'],
- [a['dname'] for a in backtrace['ancestors']])
+++ /dev/null
-
-
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-from teuthology.orchestra import run
-
-import logging
-log = logging.getLogger(__name__)
-
-
-class TestFragmentation(CephFSTestCase):
- CLIENTS_REQUIRED = 1
- MDSS_REQUIRED = 1
-
- def get_splits(self):
- return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_split']
-
- def get_merges(self):
- return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_merge']
-
- def get_dir_ino(self, path):
- dir_cache = self.fs.read_cache(path, 0)
- dir_ino = None
- dir_inono = self.mount_a.path_to_ino(path.strip("/"))
- for ino in dir_cache:
- if ino['ino'] == dir_inono:
- dir_ino = ino
- break
- self.assertIsNotNone(dir_ino)
- return dir_ino
-
- def _configure(self, **kwargs):
- """
- Apply kwargs as MDS configuration settings, enable dirfrags
- and restart the MDSs.
- """
- kwargs['mds_bal_frag'] = "true"
-
- for k, v in kwargs.items():
- self.ceph_cluster.set_ceph_conf("mds", k, v.__str__())
-
- self.fs.mon_manager.raw_cluster_cmd("fs", "set", self.fs.name,
- "allow_dirfrags", "true",
- "--yes-i-really-mean-it")
-
- self.mds_cluster.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- def test_oversize(self):
- """
- That a directory is split when it becomes too large.
- """
-
- split_size = 20
- merge_size = 5
-
- self._configure(
- mds_bal_split_size=split_size,
- mds_bal_merge_size=merge_size,
- mds_bal_split_bits=1
- )
-
- self.assertEqual(self.get_splits(), 0)
-
- self.mount_a.create_n_files("splitdir/file", split_size + 1)
-
- self.wait_until_true(
- lambda: self.get_splits() == 1,
- timeout=30
- )
-
- frags = self.get_dir_ino("/splitdir")['dirfrags']
- self.assertEqual(len(frags), 2)
- self.assertEqual(frags[0]['dirfrag'], "10000000000.0*")
- self.assertEqual(frags[1]['dirfrag'], "10000000000.1*")
- self.assertEqual(
- sum([len(f['dentries']) for f in frags]),
- split_size + 1
- )
-
- self.assertEqual(self.get_merges(), 0)
-
- self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")])
-
- self.wait_until_true(
- lambda: self.get_merges() == 1,
- timeout=30
- )
-
- self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 1)
-
- def test_rapid_creation(self):
- """
- That the fast-splitting limit of 1.5x the normal limit is
- applied when creating dentries quickly.
- """
-
- split_size = 100
- merge_size = 1
-
- self._configure(
- mds_bal_split_size=split_size,
- mds_bal_merge_size=merge_size,
- mds_bal_split_bits=3,
- mds_bal_fragment_size_max=(split_size * 1.5 + 2)
- )
-
- # We test this only at a single split level. If a client was sending
- # IO so fast that it hit a second split before the first split
- # was complete, it could violate mds_bal_fragment_size_max -- there
- # is a window where the child dirfrags of a split are unfrozen
- # (so they can grow), but still have STATE_FRAGMENTING (so they
- # can't be split).
-
- # By writing 4x the split size when the split bits are set
- # to 3 (i.e. 8-way), I am reasonably sure to see precisely
- # one split. The test is to check whether that split
- # happens soon enough that the client doesn't exceed
- # 2x the split_size (the "immediate" split mode should
- # kick in at 1.5x the split size).
-
- self.assertEqual(self.get_splits(), 0)
- self.mount_a.create_n_files("splitdir/file", split_size * 4)
- self.wait_until_equal(
- self.get_splits,
- 1,
- reject_fn=lambda s: s > 1,
- timeout=30
- )
-
- def test_deep_split(self):
- """
- That when the directory grows many times larger than split size,
- the fragments get split again.
- """
-
- split_size = 100
- merge_size = 1 # i.e. don't merge a frag unless it's empty
- split_bits = 1
-
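- # Each split fans a dirfrag out into 2**split_bits children.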
- branch_factor = 2**split_bits
-
- # Arbitrary: how many levels shall we try fragmenting before
- # ending the test?
- max_depth = 5
-
- self._configure(
- mds_bal_split_size=split_size,
- mds_bal_merge_size=merge_size,
- mds_bal_split_bits=split_bits
- )
-
- # Each iteration we will create another level of fragments. The
- # placement of dentries into fragments is by hashes (i.e. pseudo
- # random), so we rely on statistics to get the behaviour that
- # by writing about 1.5x as many dentries as the split_size times
- # the number of frags, we will get them all to exceed their
- # split size and trigger a split.
- depth = 0
- files_written = 0
- splits_expected = 0
- while depth < max_depth:
- log.info("Writing files for depth {0}".format(depth))
- target_files = branch_factor**depth * int(split_size * 1.5)
- create_files = target_files - files_written
-
- self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
- "{0} Writing {1} files (depth={2})".format(
- self.__class__.__name__, create_files, depth
- ))
- self.mount_a.create_n_files("splitdir/file_{0}".format(depth),
- create_files)
- self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
- "{0} Done".format(self.__class__.__name__))
-
- files_written += create_files
- log.info("Now have {0} files".format(files_written))
-
- splits_expected += branch_factor**depth
- log.info("Waiting to see {0} splits".format(splits_expected))
- try:
- self.wait_until_equal(
- self.get_splits,
- splits_expected,
- timeout=30,
- reject_fn=lambda x: x > splits_expected
- )
-
- frags = self.get_dir_ino("/splitdir")['dirfrags']
- self.assertEqual(len(frags), branch_factor**(depth+1))
- self.assertEqual(
- sum([len(f['dentries']) for f in frags]),
- target_files
- )
- except:
- # On failures, log what fragmentation we actually ended
- # up with. This block is just for logging, at the end
- # we raise the exception again.
- frags = self.get_dir_ino("/splitdir")['dirfrags']
- log.info("depth={0} splits_expected={1} files_written={2}".format(
- depth, splits_expected, files_written
- ))
- log.info("Dirfrags:")
- for f in frags:
- log.info("{0}: {1}".format(
- f['dirfrag'], len(f['dentries'])
- ))
- raise
-
- depth += 1
-
- # Remember the inode number because we will be checking for
- # objects later.
- dir_inode_no = self.mount_a.path_to_ino("splitdir")
-
- self.mount_a.run_shell(["rm", "-rf", "splitdir/"])
- self.mount_a.umount_wait()
-
- self.fs.mds_asok(['flush', 'journal'])
-
- # Wait for all strays to purge
- self.wait_until_equal(
- lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache']
- )['mds_cache']['num_strays'],
- 0,
- timeout=1200
- )
- # Check that the metadata pool objects for all the myriad
- # child fragments are gone
- metadata_objs = self.fs.rados(["ls"])
- frag_objs = []
- for o in metadata_objs:
- if o.startswith("{0:x}.".format(dir_inode_no)):
- frag_objs.append(o)
- self.assertListEqual(frag_objs, [])
+++ /dev/null
-
-
-import json
-import logging
-import os
-from textwrap import dedent
-import time
-from teuthology.orchestra.run import CommandFailedError
-from tasks.cephfs.fuse_mount import FuseMount
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-
-
-log = logging.getLogger(__name__)
-
-
-class FullnessTestCase(CephFSTestCase):
- CLIENTS_REQUIRED = 2
-
- # Subclasses define whether they're filling whole cluster or just data pool
- data_only = False
-
- # Subclasses define how many bytes should be written to achieve fullness
- pool_capacity = None
- fill_mb = None
-
- # Subclasses define what fullness means to them
- def is_full(self):
- raise NotImplementedError()
-
- def setUp(self):
- CephFSTestCase.setUp(self)
-
- if not isinstance(self.mount_a, FuseMount):
- self.skipTest("FUSE needed: ENOSPC handling in kclient is tracker #17204")
-
- # These tests just use a single active MDS throughout, so remember its ID
- # for use in mds_asok calls
- self.active_mds_id = self.fs.get_active_names()[0]
-
- # Capture the initial OSD map epoch for later use
- self.initial_osd_epoch = json.loads(
- self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
- )['epoch']
-
- # Check the initial barrier epoch on the MDS: this should be
- # set to the latest map at MDS startup. We do this check in
- # setUp to get in there before subclasses might touch things
- # in their own setUp functions.
- self.assertGreaterEqual(self.fs.mds_asok(["status"], mds_id=self.active_mds_id)['osdmap_epoch_barrier'],
- self.initial_osd_epoch)
-
- def test_barrier(self):
- """
- That when an OSD epoch barrier is set on an MDS, subsequently
- issued capabilities cause clients to update their OSD map to that
- epoch.
- """
-
- # Sync up clients with initial MDS OSD map barrier
- self.mount_a.open_no_data("foo")
- self.mount_b.open_no_data("bar")
-
- # Grab mounts' initial OSD epochs: later we will check that
- # it hasn't advanced beyond this point.
- mount_a_initial_epoch = self.mount_a.get_osd_epoch()[0]
- mount_b_initial_epoch = self.mount_b.get_osd_epoch()[0]
-
- # Freshly mounted at start of test, should be up to date with OSD map
- self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)
- self.assertGreaterEqual(mount_b_initial_epoch, self.initial_osd_epoch)
-
- # Set and unset a flag to cause OSD epoch to increment
- self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
- self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")
-
- out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
- new_epoch = json.loads(out)['epoch']
- self.assertNotEqual(self.initial_osd_epoch, new_epoch)
-
- # Do a metadata operation on clients, witness that they end up with
- # the old OSD map from startup time (nothing has prompted client
- # to update its map)
- self.mount_a.open_no_data("alpha")
- self.mount_b.open_no_data("bravo1")
-
- # Sleep long enough that if the OSD map was propagating it would
- # have done so (this is arbitrary because we are 'waiting' for something
- # to *not* happen).
- time.sleep(30)
-
- mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
- self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
- mount_b_epoch, mount_b_barrier = self.mount_b.get_osd_epoch()
- self.assertEqual(mount_b_epoch, mount_b_initial_epoch)
-
- # Set a barrier on the MDS
- self.fs.mds_asok(["osdmap", "barrier", new_epoch.__str__()], mds_id=self.active_mds_id)
-
- # Do an operation on client B, witness that it ends up with
- # the latest OSD map from the barrier. This shouldn't generate any
- # cap revokes to A because B was already the last one to touch
- # a file in root.
- self.mount_b.run_shell(["touch", "bravo2"])
- self.mount_b.open_no_data("bravo2")
-
- # Some time passes here because the metadata part of the operation
- # completes immediately, while the resulting OSD map update happens
- # asynchronously (it's an Objecter::_maybe_request_map) as a result
- # of seeing the new epoch barrier.
- self.wait_until_equal(
- lambda: self.mount_b.get_osd_epoch(),
- (new_epoch, new_epoch),
- 30,
- lambda x: x[0] > new_epoch or x[1] > new_epoch)
-
- # ...and none of this should have affected the oblivious mount a,
- # because it wasn't doing any data or metadata IO
- mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
- self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
-
- def _data_pool_name(self):
- data_pool_names = self.fs.get_data_pool_names()
- if len(data_pool_names) > 1:
- raise RuntimeError("This test can't handle multiple data pools")
- else:
- return data_pool_names[0]
-
- def _test_full(self, easy_case):
- """
- - That a client trying to write data to a file is prevented
- from doing so with an ENOSPC result
- - That they are also prevented from creating new files by the MDS.
- - That they may delete another file to get the system healthy again
-
- :param easy_case: if true, delete a successfully written file to
- free up space. else, delete the file that experienced
- the failed write.
- """
-
- osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd'))
-
- log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))
-
- # Fill up the cluster. This dd may or may not fail, as it depends on
- # how soon the cluster recognises its own fullness
- self.mount_a.write_n_mb("large_file_a", self.fill_mb / 2)
- try:
- self.mount_a.write_n_mb("large_file_b", self.fill_mb / 2)
- except CommandFailedError:
- log.info("Writing file B failed (full status happened already)")
- assert self.is_full()
- else:
- log.info("Writing file B succeeded (full status will happen soon)")
- self.wait_until_true(lambda: self.is_full(),
- timeout=osd_mon_report_interval_max * 5)
-
- # Attempting to write more data should give me ENOSPC
- with self.assertRaises(CommandFailedError) as ar:
- self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb / 2)
- self.assertEqual(ar.exception.exitstatus, 1) # dd returns 1 on "No space"
-
- # Wait for the MDS to see the latest OSD map so that it will reliably
- # be applying the policy of rejecting non-deletion metadata operations
- # while in the full state.
- osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
- self.wait_until_true(
- lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch,
- timeout=10)
-
- if not self.data_only:
- with self.assertRaises(CommandFailedError):
- self.mount_a.write_n_mb("small_file_1", 0)
-
- # Clear out some space
- if easy_case:
- self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
- self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
- else:
- # In the hard case it is the file that filled the system.
- # Before the new #7317 (ENOSPC, epoch barrier) changes, this
- # would fail because the last objects written would be
- # stuck in the client cache as objecter operations.
- self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
- self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
-
- # Here we are waiting for two things to happen:
- # * The MDS to purge the stray folder and execute object deletions
- # * The OSDs to inform the mon that they are no longer full
- self.wait_until_true(lambda: not self.is_full(),
- timeout=osd_mon_report_interval_max * 5)
-
- # Wait for the MDS to see the latest OSD map so that it will reliably
- # be applying the free space policy
- osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
- self.wait_until_true(
- lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch,
- timeout=10)
-
- # Now I should be able to write again
- self.mount_a.write_n_mb("large_file", 50, seek=0)
-
- # Ensure that the MDS keeps its OSD epoch barrier across a restart
-
- def test_full_different_file(self):
- self._test_full(True)
-
- def test_full_same_file(self):
- self._test_full(False)
-
- def _remote_write_test(self, template):
- """
- Run some remote python in a way that's useful for
- testing free space behaviour (see test_* methods using this)
- """
- file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")
-
- # Enough to trip the full flag
- osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd'))
- mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))
-
- # Sufficient data to cause RADOS cluster to go 'full'
- log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))
-
- # Long enough for RADOS cluster to notice it is full and set flag on mons
- # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
- # factor of 1.5 for I/O + network latency in committing OSD map and distributing it
- # to the OSDs)
- full_wait = (osd_mon_report_interval_max + mon_tick_interval) * 1.5
-
- # Configs for this test should bring this setting down in order to
- # run reasonably quickly
- if osd_mon_report_interval_max > 10:
- log.warn("This test may run rather slowly unless you decrease"
- "osd_mon_report_interval_max (5 is a good setting)!")
-
- self.mount_a.run_python(template.format(
- fill_mb=self.fill_mb,
- file_path=file_path,
- full_wait=full_wait
- ))
-
- def test_full_fclose(self):
- # A remote script which opens a file handle, fills up the filesystem, and then
- # checks that ENOSPC errors on buffered writes are surfaced as errors from close()
- remote_script = dedent("""
- import time
- import datetime
- import subprocess
- import os
-
- # Write some buffered data through before going full, all should be well
- print "writing some data through which we expect to succeed"
- bytes = 0
- f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
- bytes += os.write(f, 'a' * 4096)
- os.fsync(f)
- print "fsync'ed data successfully, will now attempt to fill fs"
-
- # Okay, now we're going to fill up the filesystem, and then keep
- # writing until we see an error from fsync. As long as we're doing
- # buffered IO, the error should always only appear from fsync and not
- # from write
- full = False
-
- for n in range(0, {fill_mb}):
- bytes += os.write(f, 'x' * 1024 * 1024)
- print "wrote bytes via buffered write, may repeat"
- print "done writing bytes"
-
- # OK, now we should sneak in under the full condition
- # due to the time it takes the OSDs to report to the
- # mons, and get a successful fsync on our full-making data
- os.fsync(f)
- print "successfully fsync'ed prior to getting full state reported"
-
- # Now wait for the full flag to get set so that our
- # next flush IO will fail
- time.sleep(30)
-
- # A buffered IO, should succeed
- print "starting buffered write we expect to succeed"
- os.write(f, 'x' * 4096)
- print "wrote, now waiting 30s and then doing a close we expect to fail"
-
- # Wait long enough for a background flush that should fail
- time.sleep(30)
-
- # ...and check that the failed background flush is reflected in fclose
- try:
- os.close(f)
- except OSError:
- print "close() returned an error as expected"
- else:
- raise RuntimeError("close() failed to raise error")
-
- os.unlink("{file_path}")
- """)
- self._remote_write_test(remote_script)
-
- def test_full_fsync(self):
- """
- That when the full flag is encountered during asynchronous
- flushes, an fwrite() succeeds but a subsequent fsync()/fclose()
- returns the ENOSPC error.
- """
-
- # A remote script which opens a file handle, fills up the filesystem, and then
- # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
- remote_script = dedent("""
- import time
- import datetime
- import subprocess
- import os
-
- # Write some buffered data through before going full, all should be well
- print "writing some data through which we expect to succeed"
- bytes = 0
- f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
- bytes += os.write(f, 'a' * 4096)
- os.fsync(f)
- print "fsync'ed data successfully, will now attempt to fill fs"
-
- # Okay, now we're going to fill up the filesystem, and then keep
- # writing until we see an error from fsync. As long as we're doing
- # buffered IO, the error should always only appear from fsync and not
- # from write
- full = False
-
- for n in range(0, {fill_mb} + 1):
- try:
- bytes += os.write(f, 'x' * 1024 * 1024)
- print "wrote bytes via buffered write, moving on to fsync"
- except OSError as e:
- print "Unexpected error %s from write() instead of fsync()" % e
- raise
-
- try:
- os.fsync(f)
- print "fsync'ed successfully"
- except OSError as e:
- print "Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0))
- full = True
- break
- else:
- print "Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0))
-
- if n > {fill_mb} * 0.8:
- # Be cautious in the last region where we expect to hit
- # the full condition, so that we don't overshoot too dramatically
- print "sleeping a bit as we've exceeded 80% of our expected full ratio"
- time.sleep({full_wait})
-
- if not full:
- raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)
-
- # The error sticks to the inode until we dispose of it
- try:
- os.close(f)
- except OSError:
- print "Saw error from close() as expected"
- else:
- raise RuntimeError("Did not see expected error from close()")
-
- os.unlink("{file_path}")
- """)
-
- self._remote_write_test(remote_script)
-
-
-class TestQuotaFull(FullnessTestCase):
- """
- Test per-pool fullness, which indicates quota limits exceeded
- """
- pool_capacity = 1024 * 1024 * 32 # arbitrary low-ish limit
- fill_mb = pool_capacity / (1024 * 1024)
-
- # We are only testing quota handling on the data pool, not the metadata
- # pool.
- data_only = True
-
- def setUp(self):
- super(TestQuotaFull, self).setUp()
-
- pool_name = self.fs.get_data_pool_name()
- self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
- "max_bytes", "{0}".format(self.pool_capacity))
-
- def is_full(self):
- return self.fs.is_pool_full(self.fs.get_data_pool_name())
-
-
-class TestClusterFull(FullnessTestCase):
- """
- Test cluster-wide fullness, which indicates that an OSD has become too full
- """
- pool_capacity = None
- REQUIRE_MEMSTORE = True
-
- def setUp(self):
- super(TestClusterFull, self).setUp()
-
- if self.pool_capacity is None:
- # This is a hack to overcome weird fluctuations in the reported
- # `max_avail` attribute of pools that sometimes occurs in between
- # tests (reason as yet unclear, but this dodges the issue)
- TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
- mon_osd_full_ratio = float(self.fs.get_config("mon_osd_full_ratio"))
- TestClusterFull.fill_mb = int(1.05 * mon_osd_full_ratio * (self.pool_capacity / (1024.0 * 1024.0)))
-
- def is_full(self):
- return self.fs.is_full()
-
-# Hide the parent class so that unittest.loader doesn't try to run it.
-del globals()['FullnessTestCase']
+++ /dev/null
-
-from StringIO import StringIO
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-from tasks.workunit import task as workunit
-
-JOURNAL_FORMAT_LEGACY = 0
-JOURNAL_FORMAT_RESILIENT = 1
-
-
-class TestJournalMigration(CephFSTestCase):
- CLIENTS_REQUIRED = 1
-
- def test_journal_migration(self):
- old_journal_version = JOURNAL_FORMAT_LEGACY
- new_journal_version = JOURNAL_FORMAT_RESILIENT
-
- self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
-
- # Create a filesystem using the older journal format.
- self.mount_a.umount_wait()
- self.fs.mds_stop()
- self.fs.recreate()
- self.fs.mds_restart()
- self.fs.wait_for_daemons()
-
- # Do some client work so that the log is populated with something.
- with self.mount_a.mounted():
- self.mount_a.create_files()
- self.mount_a.check_files() # sanity, this should always pass
-
- # Run a more substantial workunit so that the length of the log to be
- # converted will span at least a few segments
- workunit(self.ctx, {
- 'clients': {
- "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
- },
- "timeout": "3h"
- })
-
- # Modify the ceph.conf to ask the MDS to use the new journal format.
- self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)
-
- # Restart the MDS and wait for all daemons to come back up into a valid state.
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- # Check that files created in the initial client workload are still visible
- # in a client mount.
- with self.mount_a.mounted():
- self.mount_a.check_files()
-
- # Verify that the journal really has been rewritten.
- journal_version = self.fs.get_journal_version()
- if journal_version != new_journal_version:
- raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
- new_journal_version, journal_version
- ))
-
- # Verify that cephfs-journal-tool can now read the rewritten journal
- inspect_out = self.fs.journal_tool(["journal", "inspect"])
- if not inspect_out.endswith(": OK"):
- raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
- inspect_out
- ))
-
- self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"])
- p = self.fs.tool_remote.run(
- args=[
- "python",
- "-c",
- "import json; print len(json.load(open('/tmp/journal.json')))"
- ],
- stdout=StringIO())
- event_count = int(p.stdout.getvalue().strip())
- if event_count < 1000:
- # Approximate value of "lots", expected from having run fsstress
- raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))
-
- # Do some client work so that the log is populated with something.
- with self.mount_a.mounted():
- workunit(self.ctx, {
- 'clients': {
- "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
- },
- "timeout": "3h"
- })
+++ /dev/null
-
-"""
-Test our tools for recovering the content of damaged journals
-"""
-
-import json
-import logging
-from textwrap import dedent
-import time
-
-from teuthology.exceptions import CommandFailedError, ConnectionLostError
-from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
-from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
-from tasks.workunit import task as workunit
-
-log = logging.getLogger(__name__)
-
-
-class TestJournalRepair(CephFSTestCase):
- MDSS_REQUIRED = 2
-
- def test_inject_to_empty(self):
- """
- That when some dentries are in the journal but nothing is in
- the backing store, we correctly populate the backing store
- from the journalled dentries.
- """
-
- # Inject metadata operations
- self.mount_a.run_shell(["touch", "rootfile"])
- self.mount_a.run_shell(["mkdir", "subdir"])
- self.mount_a.run_shell(["touch", "subdir/subdirfile"])
- # There are several different paths for handling hardlinks, depending
- # on whether an existing dentry (being overwritten) is also a hardlink
- self.mount_a.run_shell(["mkdir", "linkdir"])
-
- # Test inode -> remote transition for a dentry
- self.mount_a.run_shell(["touch", "linkdir/link0"])
- self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
- self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])
-
- # Test nothing -> remote transition
- self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])
-
- # Test remote -> inode transition
- self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
- self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
- self.mount_a.run_shell(["touch", "linkdir/link2"])
-
- # Test remote -> diff remote transition
- self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
- self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
- self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])
-
- # Test an empty directory
- self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
- self.mount_a.run_shell(["sync"])
-
- # Before we unmount, make a note of the inode numbers, later we will
- # check that they match what we recover from the journal
- rootfile_ino = self.mount_a.path_to_ino("rootfile")
- subdir_ino = self.mount_a.path_to_ino("subdir")
- linkdir_ino = self.mount_a.path_to_ino("linkdir")
- subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
- subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")
-
- self.mount_a.umount_wait()
-
- # Stop the MDS
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- # Now, the journal should contain the operations, but the backing
- # store shouldn't
- with self.assertRaises(ObjectNotFound):
- self.fs.list_dirfrag(subdir_ino)
- self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
-
- # Execute the dentry recovery, this should populate the backing store
- self.fs.journal_tool(['event', 'recover_dentries', 'list'])
-
- # Dentries in ROOT_INO are present
- self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
- self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
- self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
- sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))
-
- # Now check the MDS can read what we wrote: truncate the journal
- # and start the mds.
- self.fs.journal_tool(['journal', 'reset'])
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- # List files
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- # First ls -R to populate MDCache, such that hardlinks will
- # resolve properly (recover_dentries does not create backtraces,
- # so ordinarily hardlinks to inodes that happen not to have backtraces
- # will be invisible in readdir).
- # FIXME: hook in forward scrub here to regenerate backtraces
- proc = self.mount_a.run_shell(['ls', '-R'])
- self.mount_a.umount_wait() # remount to clear client cache before our second ls
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- proc = self.mount_a.run_shell(['ls', '-R'])
- self.assertEqual(proc.stdout.getvalue().strip(),
- dedent("""
- .:
- linkdir
- rootfile
- subdir
-
- ./linkdir:
- link0
- link1
- link2
- link3
-
- ./subdir:
- subdirfile
- subsubdir
-
- ./subdir/subsubdir:
- """).strip())
-
- # Check the correct inos were preserved by path
- self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
- self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
- self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
- self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))
-
- # Check that the hard link handling came out correctly
- self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
- self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
- self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
- self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)
-
- # Create a new file, ensure it is not issued the same ino as one of the
- # recovered ones
- self.mount_a.run_shell(["touch", "afterwards"])
- new_ino = self.mount_a.path_to_ino("afterwards")
- self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])
-
- # Check that we can do metadata ops in the recovered directory
- self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])
-
- @for_teuthology # 308s
- def test_reset(self):
- """
- That after forcibly modifying the backing store, we can get back into
- a good state by resetting the MDSMap.
-
- The scenario is that we have two active MDSs, and we lose the journals. Once
- we have completely lost confidence in the integrity of the metadata, we want to
- return the system to a single-MDS state to go into a scrub to recover what we
- can.
- """
-
- # Set max_mds to 2
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "allow_multimds",
- "true", "--yes-i-really-mean-it")
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
-
- # See that we have two active MDSs
- self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
- reject_fn=lambda v: v > 2 or v < 1)
- active_mds_names = self.fs.get_active_names()
-
- # Switch off any unneeded MDS daemons
- for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
- self.mds_cluster.mds_stop(unneeded_mds)
- self.mds_cluster.mds_fail(unneeded_mds)
-
- # Do a bunch of I/O such that at least some will hit the second MDS: create
- # lots of directories so that the balancer should find it easy to make a decision
- # to allocate some of them to the second mds.
- spammers = []
- for n in range(0, 16):
- dir_name = "spam_{0}".format(n)
- spammers.append(self.mount_a.spam_dir_background(dir_name))
-
- def subtrees_assigned():
- got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])
- rank_1_count = len([s for s in got_subtrees if s['auth_first'] == 1])
-
- # Greater than 1, because there is typically 1 for ~mds1, and once it
- # has been assigned something in addition to that it means it has been
- # assigned a "real" subtree.
- return rank_1_count > 1
-
- # We are waiting for the MDS to respond to hot directories, which
- # is not guaranteed to happen at a particular time, so a lengthy timeout here.
- self.wait_until_true(subtrees_assigned, 600)
-
- # Flush the journals so that we have some backing store data
- # belonging to one MDS, and some to the other MDS.
- for mds_name in active_mds_names:
- self.fs.mds_asok(["flush", "journal"], mds_name)
-
- # Stop (hard) the second MDS daemon
- self.fs.mds_stop(active_mds_names[1])
-
- # Wipe out the tables for MDS rank 1 so that it is broken and can't start
- # (this is the simulated failure that we will demonstrate that the disaster
- # recovery tools can get us back from)
- self.fs.erase_metadata_objects(prefix="mds1_")
-
- # Try to access files from the client
- blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)
-
- # Check that this "ls -R" blocked rather than completing: indicates
- # it got stuck trying to access subtrees which were on the now-dead MDS.
- log.info("Sleeping to check ls is blocked...")
- time.sleep(60)
- self.assertFalse(blocked_ls.finished)
-
- # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
- # is not coming back. Kill it.
- log.info("Killing mount, it's blocked on the MDS we killed")
- self.mount_a.kill()
- self.mount_a.kill_cleanup()
- try:
- # Now that the mount is dead, the ls -R should error out.
- blocked_ls.wait()
- except (CommandFailedError, ConnectionLostError):
- # The ConnectionLostError case is for kernel client, where
- # killing the mount also means killing the node.
- pass
-
- log.info("Terminating spammer processes...")
- for spammer_proc in spammers:
- spammer_proc.stdin.close()
- try:
- spammer_proc.wait()
- except (CommandFailedError, ConnectionLostError):
- # The ConnectionLostError case is for kernel client, where
- # killing the mount also means killing the node.
- pass
-
- # See that the second MDS will crash when it starts and tries to
- # acquire rank 1
- damaged_id = active_mds_names[1]
- self.fs.mds_restart(damaged_id)
-
- # The daemon taking the damaged rank should begin starting up, then
- # restart back into standby after asking the mon to mark the rank
- # damaged.
- def is_marked_damaged():
- mds_map = self.fs.get_mds_map()
- return 1 in mds_map['damaged']
-
- self.wait_until_true(is_marked_damaged, 60)
-
- def get_state():
- info = self.mds_cluster.get_mds_info(damaged_id)
- return info['state'] if info is not None else None
-
- self.wait_until_equal(
- get_state,
- "up:standby",
- timeout=60)
-
- self.fs.mds_stop(damaged_id)
- self.fs.mds_fail(damaged_id)
-
- # Now give up and go through a disaster recovery procedure
- self.fs.mds_stop(active_mds_names[0])
- self.fs.mds_fail(active_mds_names[0])
- # Invoke recover_dentries quietly, because otherwise log spews millions of lines
- self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
- self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
- self.fs.table_tool(["0", "reset", "session"])
- self.fs.journal_tool(["journal", "reset"], rank=0)
- self.fs.erase_mds_objects(1)
- self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
- '--yes-i-really-mean-it')
-
- # Bring an MDS back online, mount a client, and see that we can walk the full
- # filesystem tree again
- self.fs.mds_fail_restart(active_mds_names[0])
- self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
- reject_fn=lambda v: len(v) > 1)
- self.mount_a.mount()
- self.mount_a.run_shell(["ls", "-R"], wait=True)
-
- def test_table_tool(self):
- active_mdss = self.fs.get_active_names()
- self.assertEqual(len(active_mdss), 1)
- mds_name = active_mdss[0]
-
- self.mount_a.run_shell(["touch", "foo"])
- self.fs.mds_asok(["flush", "journal"], mds_name)
-
- log.info(self.fs.table_tool(["all", "show", "inode"]))
- log.info(self.fs.table_tool(["all", "show", "snap"]))
- log.info(self.fs.table_tool(["all", "show", "session"]))
-
- # Inode table should always be the same because initial state
- # and choice of inode are deterministic.
- # Should see one inode consumed
- self.assertEqual(
- json.loads(self.fs.table_tool(["all", "show", "inode"])),
- {"0": {
- "data": {
- "version": 2,
- "inotable": {
- "projected_free": [
- {"start": 1099511628777,
- "len": 1099511626775}],
- "free": [
- {"start": 1099511628777,
- "len": 1099511626775}]}},
- "result": 0}}
-
- )
-
- # Should see one session
- session_data = json.loads(self.fs.table_tool(
- ["all", "show", "session"]))
- self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
- self.assertEqual(session_data["0"]["result"], 0)
-
- # Should see no snaps
- self.assertEqual(
- json.loads(self.fs.table_tool(["all", "show", "snap"])),
- {"version": 0,
- "snapserver": {"last_snap": 1,
- "pending_noop": [],
- "snaps": [],
- "need_to_purge": {},
- "pending_update": [],
- "pending_destroy": []},
- "result": 0}
- )
-
- # Reset everything
- for table in ["session", "inode", "snap"]:
- self.fs.table_tool(["all", "reset", table])
-
- log.info(self.fs.table_tool(["all", "show", "inode"]))
- log.info(self.fs.table_tool(["all", "show", "snap"]))
- log.info(self.fs.table_tool(["all", "show", "session"]))
-
- # Should see 0 sessions
- session_data = json.loads(self.fs.table_tool(
- ["all", "show", "session"]))
- self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
- self.assertEqual(session_data["0"]["result"], 0)
-
- # Should see entire inode range now marked free
- self.assertEqual(
- json.loads(self.fs.table_tool(["all", "show", "inode"])),
- {"0": {"data": {"version": 1,
- "inotable": {"projected_free": [
- {"start": 1099511627776,
- "len": 1099511627776}],
- "free": [
- {"start": 1099511627776,
- "len": 1099511627776}]}},
- "result": 0}}
- )
-
- # Should see no snaps
- self.assertEqual(
- json.loads(self.fs.table_tool(["all", "show", "snap"])),
- {"version": 1,
- "snapserver": {"last_snap": 1,
- "pending_noop": [],
- "snaps": [],
- "need_to_purge": {},
- "pending_update": [],
- "pending_destroy": []},
- "result": 0}
- )
-
- def test_table_tool_take_inos(self):
- initial_range_start = 1099511627776
- initial_range_len = 1099511627776
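- # 1099511627776 == 2^40 (0x10000000000): the start and length of the
- # initially-free inode range, as shown by the table dumps above.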
- # Initially a completely clear range
- self.assertEqual(
- json.loads(self.fs.table_tool(["all", "show", "inode"])),
- {"0": {"data": {"version": 0,
- "inotable": {"projected_free": [
- {"start": initial_range_start,
- "len": initial_range_len}],
- "free": [
- {"start": initial_range_start,
- "len": initial_range_len}]}},
- "result": 0}}
- )
-
- # Remove some
- self.assertEqual(
- json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
- {"0": {"data": {"version": 1,
- "inotable": {"projected_free": [
- {"start": initial_range_start + 101,
- "len": initial_range_len - 101}],
- "free": [
- {"start": initial_range_start + 101,
- "len": initial_range_len - 101}]}},
- "result": 0}}
- )
-
- @for_teuthology # Hack: "for_teuthology" because .sh doesn't work outside teuth
- def test_journal_smoke(self):
- workunit(self.ctx, {
- 'clients': {
- "client.{0}".format(self.mount_a.client_id): [
- "fs/misc/trivial_sync.sh"],
- },
- "timeout": "1h"
- })
-
- for mount in self.mounts:
- mount.umount_wait()
-
- self.fs.mds_stop()
- self.fs.mds_fail()
-
- # journal tool smoke
- workunit(self.ctx, {
- 'clients': {
- "client.{0}".format(self.mount_a.client_id): [
- "suites/cephfs_journal_tool_smoke.sh"],
- },
- "timeout": "1h"
- })
-
- self.fs.mds_restart()
- self.fs.wait_for_daemons()
-
- self.mount_a.mount()
-
- # trivial sync on mount a
- workunit(self.ctx, {
- 'clients': {
- "client.{0}".format(self.mount_a.client_id): [
- "fs/misc/trivial_sync.sh"],
- },
- "timeout": "1h"
- })
-
+++ /dev/null
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-import json
-import logging
-
-log = logging.getLogger(__name__)
-failure = "using old balancer; mantle failed for balancer="
-success = "mantle balancer version changed: "
-
-class TestMantle(CephFSTestCase):
- def start_mantle(self):
- self.wait_for_health_clear(timeout=30)
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "allow_multimds",
- "true", "--yes-i-really-mean-it")
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
- self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
- reject_fn=lambda v: v > 2 or v < 1)
-
- for m in self.fs.get_active_names():
- self.fs.mds_asok(['config', 'set', 'debug_objecter', '20'], mds_id=m)
- self.fs.mds_asok(['config', 'set', 'debug_ms', '0'], mds_id=m)
- self.fs.mds_asok(['config', 'set', 'debug_mds', '0'], mds_id=m)
- self.fs.mds_asok(['config', 'set', 'debug_mds_balancer', '5'], mds_id=m)
-
- def push_balancer(self, obj, lua_code, expect):
- self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', obj)
- self.fs.rados(["put", obj, "-"], stdin_data=lua_code)
- with self.assert_cluster_log(failure + obj + " " + expect):
- log.info("run a " + obj + " balancer that expects=" + expect)
-
- def test_version_empty(self):
- self.start_mantle()
- expect = " : (2) No such file or directory"
-
- ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer')
- assert(ret == 22) # EINVAL
-
- self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ")
- with self.assert_cluster_log(failure + " " + expect): pass
-
- def test_version_not_in_rados(self):
- self.start_mantle()
- expect = failure + "ghost.lua : (2) No such file or directory"
- self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua")
- with self.assert_cluster_log(expect): pass
-
- def test_balancer_invalid(self):
- self.start_mantle()
- expect = ": (22) Invalid argument"
-
- lua_code = "this is invalid lua code!"
- self.push_balancer("invalid.lua", lua_code, expect)
-
- lua_code = "BAL_LOG()"
- self.push_balancer("invalid_log.lua", lua_code, expect)
-
- lua_code = "BAL_LOG(0)"
- self.push_balancer("invalid_log_again.lua", lua_code, expect)
-
- def test_balancer_valid(self):
- self.start_mantle()
- lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}"
- self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
- self.fs.rados(["put", "valid.lua", "-"], stdin_data=lua_code)
- with self.assert_cluster_log(success + "valid.lua"):
- log.info("run a valid.lua balancer")
-
- def test_return_invalid(self):
- self.start_mantle()
- expect = ": (22) Invalid argument"
-
- lua_code = "return \"hello\""
- self.push_balancer("string.lua", lua_code, expect)
-
- lua_code = "return 3"
- self.push_balancer("number.lua", lua_code, expect)
-
- lua_code = "return {}"
- self.push_balancer("dict_empty.lua", lua_code, expect)
-
- lua_code = "return {\"this\", \"is\", \"a\", \"test\"}"
- self.push_balancer("dict_of_strings.lua", lua_code, expect)
-
- lua_code = "return {3, \"test\"}"
- self.push_balancer("dict_of_mixed.lua", lua_code, expect)
-
- lua_code = "return {3}"
- self.push_balancer("not_enough_numbers.lua", lua_code, expect)
-
- lua_code = "return {3, 4, 5, 6, 7, 8, 9}"
- self.push_balancer("too_many_numbers.lua", lua_code, expect)
-
- def test_dead_osd(self):
- self.start_mantle()
- expect = " : (110) Connection timed out"
-
- # kill the OSDs so that the balancer pull from RADOS times out
- osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
- for i in range(0, len(osd_map['osds'])):
- self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', str(i))
- self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', str(i))
-
- # trigger a pull from RADOS
- self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
-
- # make the timeout a little longer since dead OSDs spam ceph -w
- with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30):
- log.info("run a balancer that should timeout")
-
- # cleanup
- for i in range(0, len(osd_map['osds'])):
- self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', str(i))
+++ /dev/null
-
-from unittest import SkipTest
-from tasks.cephfs.fuse_mount import FuseMount
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-
-class TestMisc(CephFSTestCase):
- CLIENTS_REQUIRED = 2
- def test_getattr_caps(self):
- """
- Check that the MDS recognizes the 'mask' parameter of the open request.
- The parameter allows the client to request caps when opening a file.
- """
-
- if not isinstance(self.mount_a, FuseMount):
- raise SkipTest("Require FUSE client")
-
- # Enable debug. The client will request CEPH_CAP_XATTR_SHARED
- # on lookup/open
- self.mount_b.umount_wait()
- self.set_conf('client', 'client debug getattr caps', 'true')
- self.mount_b.mount()
- self.mount_b.wait_until_mounted()
-
- # create a file and hold it open. MDS will issue CEPH_CAP_EXCL_*
- # to mount_a
- p = self.mount_a.open_background("testfile")
- self.mount_b.wait_for_visible("testfile")
-
- # this triggers a lookup request and an open request. The debug
- # code will check whether the lookup/open reply contains xattrs
- self.mount_b.run_shell(["cat", "testfile"])
-
- self.mount_a.kill_background(p)
+++ /dev/null
-from textwrap import dedent
-from teuthology.exceptions import CommandFailedError
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-import os
-
-
-class TestPoolPerm(CephFSTestCase):
- def test_pool_perm(self):
- self.mount_a.run_shell(["touch", "test_file"])
-
- file_path = os.path.join(self.mount_a.mountpoint, "test_file")
-
- remote_script = dedent("""
- import os
- import errno
-
- fd = os.open("{path}", os.O_RDWR)
- try:
- if {check_read}:
- ret = os.read(fd, 1024)
- else:
- os.write(fd, 'content')
- except OSError as e:
- if e.errno != errno.EPERM:
- raise
- else:
- raise RuntimeError("client does not check permission of data pool")
- """)
-
- client_name = "client.{0}".format(self.mount_a.client_id)
-
- # set data pool read only
- self.fs.mon_manager.raw_cluster_cmd_result(
- 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
- 'allow r pool={0}'.format(self.fs.get_data_pool_name()))
-
- self.mount_a.umount_wait()
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- # write should fail
- self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(False)))
-
- # set data pool write only
- self.fs.mon_manager.raw_cluster_cmd_result(
- 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
- 'allow w pool={0}'.format(self.fs.get_data_pool_name()))
-
- self.mount_a.umount_wait()
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- # read should fail
- self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(True)))
-
- def test_forbidden_modification(self):
- """
- That a client who does not have the capability for setting
- layout pools is prevented from doing so.
- """
-
- # Set up
- client_name = "client.{0}".format(self.mount_a.client_id)
- new_pool_name = "data_new"
- self.fs.add_data_pool(new_pool_name)
-
- self.mount_a.run_shell(["touch", "layoutfile"])
- self.mount_a.run_shell(["mkdir", "layoutdir"])
-
- # Set MDS 'rw' perms: missing 'p' means no setting pool layouts
- self.fs.mon_manager.raw_cluster_cmd_result(
- 'auth', 'caps', client_name, 'mds', 'allow rw', 'mon', 'allow r',
- 'osd',
- 'allow rw pool={0},allow rw pool={1}'.format(
- self.fs.get_data_pool_names()[0],
- self.fs.get_data_pool_names()[1],
- ))
-
- self.mount_a.umount_wait()
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- with self.assertRaises(CommandFailedError):
- self.mount_a.run_shell(["setfattr",
- "-n", "ceph.file.layout.pool",
- "-v", new_pool_name, "layoutfile"])
- with self.assertRaises(CommandFailedError):
- self.mount_a.run_shell(["setfattr",
- "-n", "ceph.dir.layout.pool",
- "-v", new_pool_name, "layoutdir"])
- self.mount_a.umount_wait()
-
- # Set MDS 'rwp' perms: should now be able to set layouts
- self.fs.mon_manager.raw_cluster_cmd_result(
- 'auth', 'caps', client_name, 'mds', 'allow rwp', 'mon', 'allow r',
- 'osd',
- 'allow rw pool={0},allow rw pool={1}'.format(
- self.fs.get_data_pool_names()[0],
- self.fs.get_data_pool_names()[1],
- ))
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- self.mount_a.run_shell(["setfattr",
- "-n", "ceph.file.layout.pool",
- "-v", new_pool_name, "layoutfile"])
- self.mount_a.run_shell(["setfattr",
- "-n", "ceph.dir.layout.pool",
- "-v", new_pool_name, "layoutdir"])
- self.mount_a.umount_wait()
-
- def tearDown(self):
- self.fs.mon_manager.raw_cluster_cmd_result(
- 'auth', 'caps', "client.{0}".format(self.mount_a.client_id),
- 'mds', 'allow', 'mon', 'allow r', 'osd',
- 'allow rw pool={0}'.format(self.fs.get_data_pool_names()[0]))
- super(TestPoolPerm, self).tearDown()
-
+++ /dev/null
-import logging
-from tasks.cephfs.fuse_mount import FuseMount
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-
-log = logging.getLogger(__name__)
-
-
-class TestReadahead(CephFSTestCase):
- def test_flush(self):
- if not isinstance(self.mount_a, FuseMount):
- self.skipTest("FUSE needed for measuring op counts")
-
- # Create 32MB file
- self.mount_a.run_shell(["dd", "if=/dev/urandom", "of=foo", "bs=1M", "count=32"])
-
- # Unmount and remount the client to flush cache
- self.mount_a.umount_wait()
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- initial_op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r']
- self.mount_a.run_shell(["dd", "if=foo", "of=/dev/null", "bs=128k", "count=32"])
- op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r']
- assert op_r >= initial_op_r
- op_r -= initial_op_r
- log.info("read operations: {0}".format(op_r))
-
- # with exponentially increasing readahead, we should see fewer than 10 operations
- # but this test simply checks if the client is doing a remote read for each local read
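- # (the dd above issues 32 local reads of 128k; 32 or more remote reads would mean readahead did nothing)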
- if op_r >= 32:
- raise RuntimeError("readahead not working")
+++ /dev/null
-"""
-MDS admin socket scrubbing-related tests.
-"""
-import json
-import logging
-import errno
-import time
-from teuthology.exceptions import CommandFailedError
-import os
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-
-log = logging.getLogger(__name__)
-
-
-class TestScrubChecks(CephFSTestCase):
- """
- Run flush and scrub commands on the specified files in the filesystem. This
- task will run through a sequence of operations, but it is not comprehensive
- on its own -- it doesn't manipulate the mds cache state to test on both
- in- and out-of-memory parts of the hierarchy. So it's designed to be run
- multiple times within a single test run, so that the test can manipulate
- memory state.
-
- Usage:
- mds_scrub_checks:
- mds_rank: 0
- path: path/to/test/dir
- client: 0
- run_seq: [0-9]+
-
- Increment the run_seq on subsequent invocations within a single test run;
- it uses that value to generate unique folder and file names.
- """
-
- MDSS_REQUIRED = 1
- CLIENTS_REQUIRED = 1
-
- def test_scrub_checks(self):
- self._checks(0)
- self._checks(1)
-
- def _checks(self, run_seq):
- mds_rank = 0
- test_dir = "scrub_test_path"
-
- abs_test_path = "/{0}".format(test_dir)
-
- log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
- client_path = os.path.join(self.mount_a.mountpoint, test_dir)
- log.info("client_path: {0}".format(client_path))
-
- log.info("Cloning repo into place")
- repo_path = self.clone_repo(self.mount_a, client_path)
-
- log.info("Initiating mds_scrub_checks on mds.{id_}, "
- "test_path {path}, run_seq {seq}".format(
- id_=mds_rank, path=abs_test_path, seq=run_seq))
-
- success_validator = lambda j, r: self.json_validator(j, r, "return_code", 0)
-
- nep = "{test_path}/i/dont/exist".format(test_path=abs_test_path)
- self.asok_command(mds_rank, "flush_path {nep}".format(nep=nep),
- lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
- self.asok_command(mds_rank, "scrub_path {nep}".format(nep=nep),
- lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
-
- test_repo_path = "{test_path}/ceph-qa-suite".format(test_path=abs_test_path)
- dirpath = "{repo_path}/suites".format(repo_path=test_repo_path)
-
- if run_seq == 0:
- log.info("First run: flushing {dirpath}".format(dirpath=dirpath))
- command = "flush_path {dirpath}".format(dirpath=dirpath)
- self.asok_command(mds_rank, command, success_validator)
- command = "scrub_path {dirpath}".format(dirpath=dirpath)
- self.asok_command(mds_rank, command, success_validator)
-
- filepath = "{repo_path}/suites/fs/verify/validater/valgrind.yaml".format(
- repo_path=test_repo_path)
- if run_seq == 0:
- log.info("First run: flushing {filepath}".format(filepath=filepath))
- command = "flush_path {filepath}".format(filepath=filepath)
- self.asok_command(mds_rank, command, success_validator)
- command = "scrub_path {filepath}".format(filepath=filepath)
- self.asok_command(mds_rank, command, success_validator)
-
- filepath = "{repo_path}/suites/fs/basic/clusters/fixed-3-cephfs.yaml". \
- format(repo_path=test_repo_path)
- command = "scrub_path {filepath}".format(filepath=filepath)
- self.asok_command(mds_rank, command,
- lambda j, r: self.json_validator(j, r, "performed_validation",
- False))
-
- if run_seq == 0:
- log.info("First run: flushing base dir /")
- command = "flush_path /"
- self.asok_command(mds_rank, command, success_validator)
- command = "scrub_path /"
- self.asok_command(mds_rank, command, success_validator)
-
- new_dir = "{repo_path}/new_dir_{i}".format(repo_path=repo_path, i=run_seq)
- test_new_dir = "{repo_path}/new_dir_{i}".format(repo_path=test_repo_path,
- i=run_seq)
- self.mount_a.run_shell(["mkdir", new_dir])
- command = "flush_path {dir}".format(dir=test_new_dir)
- self.asok_command(mds_rank, command, success_validator)
-
- new_file = "{repo_path}/new_file_{i}".format(repo_path=repo_path,
- i=run_seq)
- test_new_file = "{repo_path}/new_file_{i}".format(repo_path=test_repo_path,
- i=run_seq)
- self.mount_a.write_n_mb(new_file, 1)
-
- command = "flush_path {file}".format(file=test_new_file)
- self.asok_command(mds_rank, command, success_validator)
-
- # check that scrub fails on errors
- ino = self.mount_a.path_to_ino(new_file)
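- # CephFS data objects are named <ino in hex>.<object index>, so the file's first object is <ino>.00000000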
- rados_obj_name = "{ino:x}.00000000".format(ino=ino)
- command = "scrub_path {file}".format(file=test_new_file)
-
- # Missing parent xattr -> ENODATA
- self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name())
- self.asok_command(mds_rank, command,
- lambda j, r: self.json_validator(j, r, "return_code", -errno.ENODATA))
-
- # Missing object -> ENOENT
- self.fs.rados(["rm", rados_obj_name], pool=self.fs.get_data_pool_name())
- self.asok_command(mds_rank, command,
- lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
-
- command = "flush_path /"
- self.asok_command(mds_rank, command, success_validator)
-
- def test_scrub_repair(self):
- mds_rank = 0
- test_dir = "scrub_repair_path"
-
- self.mount_a.run_shell(["sudo", "mkdir", test_dir])
- self.mount_a.run_shell(["sudo", "touch", "{0}/file".format(test_dir)])
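- # The directory's dirfrag is an object in the metadata pool named <dir ino in hex>.00000000;
- # each dentry is an omap key on it (e.g. "file_head" for the file created above)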
- dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino(test_dir))
-
- self.mount_a.umount_wait()
-
- # flush journal entries to dirfrag objects, and expire journal
- self.fs.mds_asok(['flush', 'journal'])
- self.fs.mds_stop()
-
- # remove the dentry from dirfrag, cause incorrect fragstat/rstat
- self.fs.rados(["rmomapkey", dir_objname, "file_head"],
- pool=self.fs.get_metadata_pool_name())
-
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- # fragstat indicates the directory is not empty, rmdir should fail
- with self.assertRaises(CommandFailedError) as ar:
- self.mount_a.run_shell(["sudo", "rmdir", test_dir])
- self.assertEqual(ar.exception.exitstatus, 1)
-
- self.asok_command(mds_rank, "scrub_path /{0} repair".format(test_dir),
- lambda j, r: self.json_validator(j, r, "return_code", 0))
-
- # wait a few seconds for the background repair to run
- time.sleep(10)
-
- # fragstat should be fixed
- self.mount_a.run_shell(["sudo", "rmdir", test_dir])
-
- @staticmethod
- def json_validator(json_out, rc, element, expected_value):
- if rc != 0:
- return False, "asok command returned error {rc}".format(rc=rc)
- element_value = json_out.get(element)
- if element_value != expected_value:
- return False, "unexpectedly got {jv} instead of {ev}!".format(
- jv=element_value, ev=expected_value)
- return True, "Succeeded"
-
- def asok_command(self, mds_rank, command, validator):
- log.info("Running command '{command}'".format(command=command))
-
- command_list = command.split()
-
- # we just assume there's an active mds for every rank
- mds_id = self.fs.get_active_names()[mds_rank]
- proc = self.fs.mon_manager.admin_socket('mds', mds_id,
- command_list, check_status=False)
- rout = proc.exitstatus
- sout = proc.stdout.getvalue()
-
- if sout.strip():
- jout = json.loads(sout)
- else:
- jout = None
-
- log.info("command '{command}' got response code "
- "'{rout}' and stdout '{sout}'".format(
- command=command, rout=rout, sout=sout))
-
- success, errstring = validator(jout, rout)
-
- if not success:
- raise AsokCommandFailedError(command, rout, jout, errstring)
-
- return jout
-
- def clone_repo(self, client_mount, path):
- repo = "ceph-qa-suite"
- repo_path = os.path.join(path, repo)
- client_mount.run_shell(["mkdir", "-p", path])
-
- try:
- client_mount.stat(repo_path)
- except CommandFailedError:
- client_mount.run_shell([
- "git", "clone", '--branch', 'giant',
- "http://github.com/ceph/{repo}".format(repo=repo),
- "{path}/{repo}".format(path=path, repo=repo)
- ])
-
- return repo_path
-
-
-class AsokCommandFailedError(Exception):
- """
- Exception thrown when we get an unexpected response
- on an admin socket command
- """
-
- def __init__(self, command, rc, json_out, errstring):
- self.command = command
- self.rc = rc
- self.json = json_out
- self.errstring = errstring
-
- def __str__(self):
- return "Admin socket: {command} failed with rc={rc}, " \
- "json output={json}, because '{es}'".format(
- command=self.command, rc=self.rc,
- json=self.json, es=self.errstring)
+++ /dev/null
-from StringIO import StringIO
-import json
-import logging
-from unittest import SkipTest
-
-from tasks.cephfs.fuse_mount import FuseMount
-from teuthology.exceptions import CommandFailedError
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-
-log = logging.getLogger(__name__)
-
-
-class TestSessionMap(CephFSTestCase):
- CLIENTS_REQUIRED = 2
- MDSS_REQUIRED = 2
-
- def test_tell_session_drop(self):
- """
- That when a `tell` command is sent using the python CLI,
- its MDS session is gone after it terminates
- """
- self.mount_a.umount_wait()
- self.mount_b.umount_wait()
-
- mds_id = self.fs.get_lone_mds_id()
- self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "session", "ls")
-
- ls_data = self.fs.mds_asok(['session', 'ls'])
- self.assertEqual(len(ls_data), 0)
-
- def _get_thread_count(self, mds_id):
- remote = self.fs.mds_daemons[mds_id].remote
-
- ps_txt = remote.run(
- args=["ps", "-ww", "axo", "nlwp,cmd"],
- stdout=StringIO()
- ).stdout.getvalue().strip()
- lines = ps_txt.split("\n")[1:]
-
- for line in lines:
- if "ceph-mds" in line and "daemon-helper" not in line:
- if line.find("-i {0}".format(mds_id)) != -1:
- log.info("Found ps line for daemon: {0}".format(line))
- return int(line.split()[0])
-
- raise RuntimeError("No process found in ps output for MDS {0}: {1}".format(
- mds_id, ps_txt
- ))
-
- def test_tell_conn_close(self):
- """
- That when a `tell` command is sent using the python CLI,
- the thread count goes back to where it started (i.e. we aren't
- leaving connections open)
- """
- self.mount_a.umount_wait()
- self.mount_b.umount_wait()
-
- mds_id = self.fs.get_lone_mds_id()
-
- initial_thread_count = self._get_thread_count(mds_id)
- self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "session", "ls")
- final_thread_count = self._get_thread_count(mds_id)
-
- self.assertEqual(initial_thread_count, final_thread_count)
-
- def test_mount_conn_close(self):
- """
- That when a client unmounts, the thread count on the MDS goes back
- to what it was before the client mounted
- """
- self.mount_a.umount_wait()
- self.mount_b.umount_wait()
-
- mds_id = self.fs.get_lone_mds_id()
-
- initial_thread_count = self._get_thread_count(mds_id)
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
- self.assertGreater(self._get_thread_count(mds_id), initial_thread_count)
- self.mount_a.umount_wait()
- final_thread_count = self._get_thread_count(mds_id)
-
- self.assertEqual(initial_thread_count, final_thread_count)
-
- def test_version_splitting(self):
- """
- That when many sessions are updated, they are correctly
- split into multiple versions to obey mds_sessionmap_keys_per_op
- """
-
- # Start umounted
- self.mount_a.umount_wait()
- self.mount_b.umount_wait()
-
- # Configure MDS to write one OMAP key at once
- self.set_conf('mds', 'mds_sessionmap_keys_per_op', 1)
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- # I would like two MDSs, so that I can do an export dir later
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "allow_multimds",
- "true", "--yes-i-really-mean-it")
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
- self.fs.wait_for_daemons()
-
- active_mds_names = self.fs.get_active_names()
- rank_0_id = active_mds_names[0]
- rank_1_id = active_mds_names[1]
- log.info("Ranks 0 and 1 are {0} and {1}".format(
- rank_0_id, rank_1_id))
-
- # Bring the clients back
- self.mount_a.mount()
- self.mount_b.mount()
- self.mount_a.create_files() # Kick the client into opening sessions
- self.mount_b.create_files()
-
- # See that they've got sessions
- self.assert_session_count(2, mds_id=rank_0_id)
-
- # See that we persist their sessions
- self.fs.mds_asok(["flush", "journal"], rank_0_id)
- table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
- log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
- self.assertEqual(table_json['0']['result'], 0)
- self.assertEqual(len(table_json['0']['data']['Sessions']), 2)
-
- # Now, induce a "force_open_sessions" event by exporting a dir
- self.mount_a.run_shell(["mkdir", "bravo"])
- self.mount_a.run_shell(["touch", "bravo/file"])
- self.mount_b.run_shell(["ls", "-l", "bravo/file"])
-
- def get_omap_wrs():
- return self.fs.mds_asok(['perf', 'dump', 'objecter'], rank_1_id)['objecter']['omap_wr']
-
- # Flush so that there are no dirty sessions on rank 1
- self.fs.mds_asok(["flush", "journal"], rank_1_id)
-
- # Export so that we get a force_open to rank 1 for the two sessions from rank 0
- initial_omap_wrs = get_omap_wrs()
- self.fs.mds_asok(['export', 'dir', '/bravo', '1'], rank_0_id)
-
- # This is the critical (if rather subtle) check: that in the process of doing an export dir,
- # we hit force_open_sessions, and as a result we end up writing out the sessionmap. There
- # will be two sessions dirtied here, and because we have set keys_per_op to 1, we should see
- # a single session get written out (the first of the two, triggered by the second getting marked
- # dirty).
- # The number of writes is two per session, because the header (sessionmap version) update and
- # the KV write both count.
- self.wait_until_true(
- lambda: get_omap_wrs() - initial_omap_wrs == 2,
- timeout=10 # Long enough for an export to get acked
- )
-
- # Now end our sessions and check the backing sessionmap is updated correctly
- self.mount_a.umount_wait()
- self.mount_b.umount_wait()
-
- # In-memory sessionmap check
- self.assert_session_count(0, mds_id=rank_0_id)
-
- # On-disk sessionmap check
- self.fs.mds_asok(["flush", "journal"], rank_0_id)
- table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
- log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
- self.assertEqual(table_json['0']['result'], 0)
- self.assertEqual(len(table_json['0']['data']['Sessions']), 0)
-
- def _sudo_write_file(self, remote, path, data):
- """
- Write data to a remote file as super user
-
- :param remote: Remote site.
- :param path: Path on the remote being written to.
- :param data: Data to be written.
-
- The file is written with sudo, so root-owned destination paths can be updated.
- """
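- # Stream the data over stdin to a python one-liner running under sudo
- # (a plain shell redirection would not be elevated)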
- remote.run(
- args=[
- 'sudo',
- 'python',
- '-c',
- 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
- path,
- ],
- stdin=data,
- )
-
- def _configure_auth(self, mount, id_name, mds_caps, osd_caps=None, mon_caps=None):
- """
- Set up auth credentials for a client mount, and write out the keyring
- for the client to use.
- """
-
- if osd_caps is None:
- osd_caps = "allow rw"
-
- if mon_caps is None:
- mon_caps = "allow r"
-
- out = self.fs.mon_manager.raw_cluster_cmd(
- "auth", "get-or-create", "client.{name}".format(name=id_name),
- "mds", mds_caps,
- "osd", osd_caps,
- "mon", mon_caps
- )
- mount.client_id = id_name
- self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), out)
- self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
-
- def test_session_reject(self):
- if not isinstance(self.mount_a, FuseMount):
- raise SkipTest("Requires FUSE client to inject client metadata")
-
- self.mount_a.run_shell(["mkdir", "foo"])
- self.mount_a.run_shell(["mkdir", "foo/bar"])
- self.mount_a.umount_wait()
-
- # Mount B will be my rejected client
- self.mount_b.umount_wait()
-
- # Configure a client that is limited to /foo/bar
- self._configure_auth(self.mount_b, "badguy", "allow rw path=/foo/bar")
- # Check that it can mount that dir and do IO
- self.mount_b.mount(mount_path="/foo/bar")
- self.mount_b.wait_until_mounted()
- self.mount_b.create_destroy()
- self.mount_b.umount_wait()
-
- # Configure the client to claim that its mount point metadata is /baz
- self.set_conf("client.badguy", "client_metadata", "root=/baz")
- # Try to mount the client, see that it fails
- with self.assert_cluster_log("client session with invalid root '/baz' denied"):
- with self.assertRaises(CommandFailedError):
- self.mount_b.mount(mount_path="/foo/bar")
+++ /dev/null
-import json
-import time
-import logging
-from textwrap import dedent
-import gevent
-from teuthology.orchestra.run import CommandFailedError
-from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
-
-log = logging.getLogger(__name__)
-
-
-class TestStrays(CephFSTestCase):
- MDSS_REQUIRED = 2
-
- OPS_THROTTLE = 1
- FILES_THROTTLE = 2
-
- # Range of different file sizes used in throttle test's workload
- throttle_workload_size_range = 16
-
- @for_teuthology
- def test_ops_throttle(self):
- self._test_throttling(self.OPS_THROTTLE)
-
- @for_teuthology
- def test_files_throttle(self):
- self._test_throttling(self.FILES_THROTTLE)
-
- def test_dir_deletion(self):
- """
- That when deleting a bunch of dentries and the containing
- directory, everything gets purged.
- Catches cases where the client might e.g. fail to trim
- the unlinked dir from its cache.
- """
- file_count = 1000
- create_script = dedent("""
- import os
-
- mount_path = "{mount_path}"
- subdir = "delete_me"
- size = {size}
- file_count = {file_count}
- os.mkdir(os.path.join(mount_path, subdir))
- for i in xrange(0, file_count):
- filename = "{{0}}_{{1}}.bin".format(i, size)
- f = open(os.path.join(mount_path, subdir, filename), 'w')
- f.write(size * 'x')
- f.close()
- """.format(
- mount_path=self.mount_a.mountpoint,
- size=1024,
- file_count=file_count
- ))
-
- self.mount_a.run_python(create_script)
- self.mount_a.run_shell(["rm", "-rf", "delete_me"])
- self.fs.mds_asok(["flush", "journal"])
- strays = self.get_mdc_stat("strays_created")
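- # every unlinked file becomes a stray, plus one more for the 'delete_me' directory itself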
- self.assertEqual(strays, file_count + 1)
- self.wait_until_equal(
- lambda: self.get_mdc_stat("strays_purged"),
- strays,
- timeout=600
- )
-
- def _test_throttling(self, throttle_type):
- """
- That the mds_max_purge_ops setting is respected
- """
-
- def set_throttles(files, ops):
- """
- Helper for updating ops/files limits, and calculating effective
- ops_per_pg setting to give the same ops limit.
- """
- self.set_conf('mds', 'mds_max_purge_files', "%d" % files)
- self.set_conf('mds', 'mds_max_purge_ops', "%d" % ops)
-
- pgs = self.fs.mon_manager.get_pool_property(
- self.fs.get_data_pool_name(),
- "pg_num"
- )
- ops_per_pg = float(ops) / pgs
- self.set_conf('mds', 'mds_max_purge_ops_per_pg', "%s" % ops_per_pg)
-
- # Test conditions depend on what we're going to be exercising.
- # * Lift the threshold on whatever throttle we are *not* testing, so
- # that the throttle of interest is the one that will be the bottleneck
- # * Create either many small files (test file count throttling) or fewer
- # large files (test op throttling)
- if throttle_type == self.OPS_THROTTLE:
- set_throttles(files=100000000, ops=16)
- size_unit = 1024 * 1024 # big files, generate lots of ops
- file_multiplier = 100
- elif throttle_type == self.FILES_THROTTLE:
- # The default value of file limit is pretty permissive, so to avoid
- # the test running too fast, create lots of files and set the limit
- # pretty low.
- set_throttles(ops=100000000, files=6)
- size_unit = 1024 # small, numerous files
- file_multiplier = 200
- else:
- raise NotImplementedError(throttle_type)
-
- # Pick up config changes
- self.fs.mds_fail_restart()
- self.fs.wait_for_daemons()
-
- create_script = dedent("""
- import os
-
- mount_path = "{mount_path}"
- subdir = "delete_me"
- size_unit = {size_unit}
- file_multiplier = {file_multiplier}
- os.mkdir(os.path.join(mount_path, subdir))
- for i in xrange(0, file_multiplier):
- for size in xrange(0, {size_range}*size_unit, size_unit):
- filename = "{{0}}_{{1}}.bin".format(i, size / size_unit)
- f = open(os.path.join(mount_path, subdir, filename), 'w')
- f.write(size * 'x')
- f.close()
- """.format(
- mount_path=self.mount_a.mountpoint,
- size_unit=size_unit,
- file_multiplier=file_multiplier,
- size_range=self.throttle_workload_size_range
- ))
-
- self.mount_a.run_python(create_script)
-
- # We will run the deletion in the background, to reduce the risk of it completing before
- # we have started monitoring the stray statistics.
- def background():
- self.mount_a.run_shell(["rm", "-rf", "delete_me"])
- self.fs.mds_asok(["flush", "journal"])
-
- background_thread = gevent.spawn(background)
-
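- # The workload created file_multiplier * size_range files, plus one extra inode for
- # the 'delete_me' directory itself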
- total_inodes = file_multiplier * self.throttle_workload_size_range + 1
- mds_max_purge_ops = int(self.fs.get_config("mds_max_purge_ops", 'mds'))
- mds_max_purge_files = int(self.fs.get_config("mds_max_purge_files", 'mds'))
-
- # During this phase we look for the concurrent ops to exceed half
- # the limit (a heuristic) and not exceed the limit (a correctness
- # condition).
- purge_timeout = 600
- elapsed = 0
- files_high_water = 0
- ops_high_water = 0
- while True:
- mdc_stats = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']
- if elapsed >= purge_timeout:
- raise RuntimeError("Timeout waiting for {0} inodes to purge, stats:{1}".format(total_inodes, mdc_stats))
-
- num_strays = mdc_stats['num_strays']
- num_strays_purging = mdc_stats['num_strays_purging']
- num_purge_ops = mdc_stats['num_purge_ops']
-
- files_high_water = max(files_high_water, num_strays_purging)
- ops_high_water = max(ops_high_water, num_purge_ops)
-
- total_strays_created = mdc_stats['strays_created']
- total_strays_purged = mdc_stats['strays_purged']
-
- if total_strays_purged == total_inodes:
- log.info("Complete purge in {0} seconds".format(elapsed))
- break
- elif total_strays_purged > total_inodes:
- raise RuntimeError("Saw more strays than expected, mdc stats: {0}".format(mdc_stats))
- else:
- if throttle_type == self.OPS_THROTTLE:
- if num_purge_ops > mds_max_purge_ops:
- raise RuntimeError("num_purge_ops violates threshold {0}/{1}".format(
- num_purge_ops, mds_max_purge_ops
- ))
- elif throttle_type == self.FILES_THROTTLE:
- if num_strays_purging > mds_max_purge_files:
- raise RuntimeError("num_strays_purging violates threshold {0}/{1}".format(
- num_strays_purging, mds_max_purge_files
- ))
- else:
- raise NotImplementedError(throttle_type)
-
- log.info("Waiting for purge to complete {0}/{1}, {2}/{3}".format(
- num_strays_purging, num_strays,
- total_strays_purged, total_strays_created
- ))
- time.sleep(1)
- elapsed += 1
-
- background_thread.join()
-
- # Check that we got up to a respectable rate during the purge. This is totally
- # racy, but should be safeish unless the cluster is pathologically slow, or
- # insanely fast such that the deletions all pass before we have polled the
- # statistics.
- if throttle_type == self.OPS_THROTTLE:
- if ops_high_water < mds_max_purge_ops / 2:
- raise RuntimeError("Ops in flight high water is unexpectedly low ({0} / {1})".format(
- ops_high_water, mds_max_purge_ops
- ))
- elif throttle_type == self.FILES_THROTTLE:
- if files_high_water < mds_max_purge_files / 2:
- raise RuntimeError("Files in flight high water is unexpectedly low ({0} / {1})".format(
- files_high_water, mds_max_purge_files
- ))
-
- # Sanity check all MDC stray stats
- mdc_stats = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']
- self.assertEqual(mdc_stats['num_strays'], 0)
- self.assertEqual(mdc_stats['num_strays_purging'], 0)
- self.assertEqual(mdc_stats['num_strays_delayed'], 0)
- self.assertEqual(mdc_stats['num_purge_ops'], 0)
- self.assertEqual(mdc_stats['strays_created'], total_inodes)
- self.assertEqual(mdc_stats['strays_purged'], total_inodes)
-
- def get_mdc_stat(self, name, mds_id=None):
- return self.fs.mds_asok(['perf', 'dump', "mds_cache", name],
- mds_id=mds_id)['mds_cache'][name]
-
- def test_open_inode(self):
- """
- That the case of a dentry unlinked while a client holds an
- inode open is handled correctly.
-
- The inode should be moved into a stray dentry, while the original
- dentry and directory should be purged.
-
- The inode's data should be purged when the client eventually closes
- it.
- """
- mount_a_client_id = self.mount_a.get_global_id()
-
- # Write some bytes to a file
- size_mb = 8
- self.mount_a.write_n_mb("open_file", size_mb)
- open_file_ino = self.mount_a.path_to_ino("open_file")
-
- # Hold the file open
- p = self.mount_a.open_background("open_file")
-
- self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
-
- # Unlink the dentry
- self.mount_a.run_shell(["rm", "-f", "open_file"])
-
- # Wait to see the stray count increment
- self.wait_until_equal(
- lambda: self.get_mdc_stat("num_strays"),
- expect_val=1, timeout=60, reject_fn=lambda x: x > 1)
-
- # See that while the stray count has incremented, the purge count
- # has not
- self.assertEqual(self.get_mdc_stat("strays_created"), 1)
- self.assertEqual(self.get_mdc_stat("strays_purged"), 0)
-
- # See that the client still holds 2 caps
- self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
-
- # See that the data objects remain in the data pool
- self.assertTrue(self.fs.data_objects_present(open_file_ino, size_mb * 1024 * 1024))
-
- # Now close the file
- self.mount_a.kill_background(p)
-
- # Wait to see the client cap count decrement
- self.wait_until_equal(
- lambda: self.get_session(mount_a_client_id)['num_caps'],
- expect_val=1, timeout=60, reject_fn=lambda x: x > 2 or x < 1
- )
- # Wait to see the purge counter increment, stray count go to zero
- self.wait_until_equal(
- lambda: self.get_mdc_stat("strays_purged"),
- expect_val=1, timeout=60, reject_fn=lambda x: x > 1
- )
- self.wait_until_equal(
- lambda: self.get_mdc_stat("num_strays"),
- expect_val=0, timeout=6, reject_fn=lambda x: x > 1
- )
-
- # See that the data objects no longer exist
- self.assertTrue(self.fs.data_objects_absent(open_file_ino, size_mb * 1024 * 1024))
-
- self.await_data_pool_empty()
-
- def test_hardlink_reintegration(self):
- """
- That removal of primary dentry of hardlinked inode results
- in reintegration of inode into the previously-remote dentry,
- rather than lingering as a stray indefinitely.
- """
- # Write some bytes to file_a
- size_mb = 8
- self.mount_a.write_n_mb("file_a", size_mb)
- ino = self.mount_a.path_to_ino("file_a")
-
- # Create a hardlink named file_b
- self.mount_a.run_shell(["ln", "file_a", "file_b"])
- self.assertEqual(self.mount_a.path_to_ino("file_b"), ino)
-
- # Flush journal
- self.fs.mds_asok(['flush', 'journal'])
-
- # See that backtrace for the file points to the file_a path
- pre_unlink_bt = self.fs.read_backtrace(ino)
- self.assertEqual(pre_unlink_bt['ancestors'][0]['dname'], "file_a")
-
- # Unlink file_a
- self.mount_a.run_shell(["rm", "-f", "file_a"])
-
- # See that a stray was created
- self.assertEqual(self.get_mdc_stat("num_strays"), 1)
- self.assertEqual(self.get_mdc_stat("strays_created"), 1)
-
- # Wait, see that data objects are still present (i.e. that the
- # stray did not advance to purging given time)
- time.sleep(30)
- self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024))
- self.assertEqual(self.get_mdc_stat("strays_purged"), 0)
-
- # See that before reintegration, the inode's backtrace points to a stray dir
- self.fs.mds_asok(['flush', 'journal'])
- self.assertTrue(self.get_backtrace_path(ino).startswith("stray"))
-
- # Do a metadata operation on the remaining link (mv is heavy handed, but
- # others like touch may be satisfied from caps without poking MDS)
- self.mount_a.run_shell(["mv", "file_b", "file_c"])
-
- # See the reintegration counter increment
- # This should happen as a result of the eval_remote call on
- # responding to a client request.
- self.wait_until_equal(
- lambda: self.get_mdc_stat("strays_reintegrated"),
- expect_val=1, timeout=60, reject_fn=lambda x: x > 1
- )
-
- # Flush the journal
- self.fs.mds_asok(['flush', 'journal'])
-
- # See that the backtrace for the file points to the remaining link's path
- post_reint_bt = self.fs.read_backtrace(ino)
- self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c")
-
- # See that the number of strays in existence is zero
- self.assertEqual(self.get_mdc_stat("num_strays"), 0)
-
- # Now really delete it
- self.mount_a.run_shell(["rm", "-f", "file_c"])
- self.wait_until_equal(
- lambda: self.get_mdc_stat("strays_purged"),
- expect_val=1, timeout=60, reject_fn=lambda x: x > 1
- )
- self.assert_purge_idle()
- self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024))
-
- # We caused the inode to go stray twice
- self.assertEqual(self.get_mdc_stat("strays_created"), 2)
- # One time we reintegrated it
- self.assertEqual(self.get_mdc_stat("strays_reintegrated"), 1)
- # Then the second time we purged it
- self.assertEqual(self.get_mdc_stat("strays_purged"), 1)
-
- def test_mv_hardlink_cleanup(self):
- """
- That when doing a rename from A to B, and B has hardlinks,
- then we make a stray for B which is then reintegrated
- into one of its hardlinks.
- """
- # Create file_a, file_b, and a hardlink to file_b
- size_mb = 8
- self.mount_a.write_n_mb("file_a", size_mb)
- file_a_ino = self.mount_a.path_to_ino("file_a")
-
- self.mount_a.write_n_mb("file_b", size_mb)
- file_b_ino = self.mount_a.path_to_ino("file_b")
-
- self.mount_a.run_shell(["ln", "file_b", "linkto_b"])
- self.assertEqual(self.mount_a.path_to_ino("linkto_b"), file_b_ino)
-
- # mv file_a file_b
- self.mount_a.run_shell(["mv", "file_a", "file_b"])
-
- self.fs.mds_asok(['flush', 'journal'])
-
- # Initially, linkto_b will still be a remote inode pointing to a newly created
- # stray from when file_b was unlinked due to the 'mv'. No data objects should
- # have been deleted, as both files still have linkage.
- self.assertEqual(self.get_mdc_stat("num_strays"), 1)
- self.assertEqual(self.get_mdc_stat("strays_created"), 1)
- self.assertTrue(self.get_backtrace_path(file_b_ino).startswith("stray"))
- self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
- self.assertTrue(self.fs.data_objects_present(file_b_ino, size_mb * 1024 * 1024))
-
- # Trigger reintegration and wait for it to happen
- self.assertEqual(self.get_mdc_stat("strays_reintegrated"), 0)
- self.mount_a.run_shell(["mv", "linkto_b", "file_c"])
- self.wait_until_equal(
- lambda: self.get_mdc_stat("strays_reintegrated"),
- expect_val=1, timeout=60, reject_fn=lambda x: x > 1
- )
-
- self.fs.mds_asok(['flush', 'journal'])
-
- post_reint_bt = self.fs.read_backtrace(file_b_ino)
- self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c")
- self.assertEqual(self.get_mdc_stat("num_strays"), 0)
-
- def test_migration_on_shutdown(self):
- """
- That when an MDS rank is shut down, any not-yet-purging strays
- are migrated to another MDS's stray dir.
- """
-
- # Set up two MDSs
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "allow_multimds",
- "true", "--yes-i-really-mean-it")
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
-
- # See that we have two active MDSs
- self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
- reject_fn=lambda v: v > 2 or v < 1)
-
- active_mds_names = self.fs.get_active_names()
- rank_0_id = active_mds_names[0]
- rank_1_id = active_mds_names[1]
- log.info("Ranks 0 and 1 are {0} and {1}".format(
- rank_0_id, rank_1_id))
-
- # Get rid of other MDS daemons so that it's easier to know which
- # daemons to expect in which ranks after restarts
- for unneeded_mds in set(self.mds_cluster.mds_ids) - {rank_0_id, rank_1_id}:
- self.mds_cluster.mds_stop(unneeded_mds)
- self.mds_cluster.mds_fail(unneeded_mds)
-
- # Set the purge file throttle to 0 on MDS rank 1
- self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
- self.fs.mds_fail_restart(rank_1_id)
- self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
- reject_fn=lambda v: v > 2 or v < 1)
-
- # Create a file
- # Export dir on an empty dir doesn't work, so we create the file before
- # calling export dir in order to kick a dirfrag into existence
- size_mb = 8
- self.mount_a.run_shell(["mkdir", "ALPHA"])
- self.mount_a.write_n_mb("ALPHA/alpha_file", size_mb)
- ino = self.mount_a.path_to_ino("ALPHA/alpha_file")
-
- result = self.fs.mds_asok(["export", "dir", "/ALPHA", "1"], rank_0_id)
- self.assertEqual(result["return_code"], 0)
-
- # Poll the MDS cache dump to watch for the export completing
- migrated = False
- migrate_timeout = 60
- migrate_elapsed = 0
- while not migrated:
- data = self.fs.mds_asok(["dump", "cache"], rank_1_id)
- for inode_data in data:
- if inode_data['ino'] == ino:
- log.debug("Found ino in cache: {0}".format(json.dumps(inode_data, indent=2)))
- if inode_data['is_auth'] is True:
- migrated = True
- break
-
- if not migrated:
- if migrate_elapsed > migrate_timeout:
- raise RuntimeError("Migration hasn't happened after {0}s!".format(migrate_elapsed))
- else:
- migrate_elapsed += 1
- time.sleep(1)
-
- # Delete the file on rank 1
- self.mount_a.run_shell(["rm", "-f", "ALPHA/alpha_file"])
-
- # See the stray counter increment, but the purge counter doesn't
- # See that the file objects are still on disk
- self.wait_until_equal(
- lambda: self.get_mdc_stat("num_strays", rank_1_id),
- expect_val=1, timeout=60, reject_fn=lambda x: x > 1)
- self.assertEqual(self.get_mdc_stat("strays_created", rank_1_id), 1)
- time.sleep(60) # give it time; it should not get purged because rank 1's purge throttle is 0
- self.assertEqual(self.get_mdc_stat("strays_purged", rank_1_id), 0)
- self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024))
-
- # Shut down rank 1
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "1")
- self.fs.mon_manager.raw_cluster_cmd_result('mds', 'deactivate', "1")
-
- # Wait until the MDS map reaches a state with a single active MDS
- def is_stopped():
- mds_map = self.fs.get_mds_map()
- return 1 not in [i['rank'] for i in mds_map['info'].values()]
-
- self.wait_until_true(is_stopped, timeout=120)
-
- # See that the stray counter on rank 0 has incremented
- self.assertEqual(self.get_mdc_stat("strays_created", rank_0_id), 1)
-
- # Wait until the purge counter on rank 0 increments
- self.wait_until_equal(
- lambda: self.get_mdc_stat("strays_purged", rank_0_id),
- 1, timeout=60, reject_fn=lambda x: x > 1)
-
- # See that the file objects no longer exist
- self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024))
-
- self.await_data_pool_empty()
-
- def assert_backtrace(self, ino, expected_path):
- """
- Assert that the backtrace in the data pool for an inode matches
- an expected /foo/bar path.
- """
- expected_elements = expected_path.strip("/").split("/")
- bt = self.fs.read_backtrace(ino)
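- # backtrace ancestors are listed leaf-first, so reverse them to get a root-to-leaf path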
- actual_elements = list(reversed([dn['dname'] for dn in bt['ancestors']]))
- self.assertListEqual(expected_elements, actual_elements)
-
- def get_backtrace_path(self, ino):
- bt = self.fs.read_backtrace(ino)
- elements = reversed([dn['dname'] for dn in bt['ancestors']])
- return "/".join(elements)
-
- def assert_purge_idle(self):
- """
- Assert that the MDS perf counters indicate no strays exist and
- no ongoing purge activity. Sanity check for when PurgeQueue should
- be idle.
- """
- stats = self.fs.mds_asok(['perf', 'dump', "mds_cache"])['mds_cache']
- self.assertEqual(stats["num_strays"], 0)
- self.assertEqual(stats["num_strays_purging"], 0)
- self.assertEqual(stats["num_strays_delayed"], 0)
- self.assertEqual(stats["num_purge_ops"], 0)
-
- def test_mv_cleanup(self):
- """
- That when doing a rename from A to B, and B has no hardlinks,
- then we make a stray for B and purge it.
- """
- # Create file_a and file_b, write some to both
- size_mb = 8
- self.mount_a.write_n_mb("file_a", size_mb)
- file_a_ino = self.mount_a.path_to_ino("file_a")
- self.mount_a.write_n_mb("file_b", size_mb)
- file_b_ino = self.mount_a.path_to_ino("file_b")
-
- self.fs.mds_asok(['flush', 'journal'])
- self.assert_backtrace(file_a_ino, "file_a")
- self.assert_backtrace(file_b_ino, "file_b")
-
- # mv file_a file_b
- self.mount_a.run_shell(['mv', 'file_a', 'file_b'])
-
- # See that stray counter increments
- self.assertEqual(self.get_mdc_stat("strays_created"), 1)
- # Wait for purge counter to increment
- self.wait_until_equal(
- lambda: self.get_mdc_stat("strays_purged"),
- expect_val=1, timeout=60, reject_fn=lambda x: x > 1
- )
- self.assert_purge_idle()
-
- # file_b should have been purged
- self.assertTrue(self.fs.data_objects_absent(file_b_ino, size_mb * 1024 * 1024))
-
- # Backtrace should have updated from file_a to file_b
- self.fs.mds_asok(['flush', 'journal'])
- self.assert_backtrace(file_a_ino, "file_b")
-
- # file_a's data should still exist
- self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
-
- def _pool_df(self, pool_name):
- """
- Return a dict like
- {
- "kb_used": 0,
- "bytes_used": 0,
- "max_avail": 19630292406,
- "objects": 0
- }
-
- :param pool_name: Which pool (must exist)
- """
- out = self.fs.mon_manager.raw_cluster_cmd("df", "--format=json-pretty")
- for p in json.loads(out)['pools']:
- if p['name'] == pool_name:
- return p['stats']
-
- raise RuntimeError("Pool '{0}' not found".format(pool_name))
-
- def await_data_pool_empty(self):
- self.wait_until_true(
- lambda: self._pool_df(
- self.fs.get_data_pool_name()
- )['objects'] == 0,
- timeout=60)
-
- def test_snapshot_remove(self):
- """
- That removal of a snapshot that references a now-unlinked file results
- in purging of the stray for the file.
- """
- # Enable snapshots
- self.fs.mon_manager.raw_cluster_cmd("mds", "set", "allow_new_snaps", "true",
- "--yes-i-really-mean-it")
-
- # Create a dir with a file in it
- size_mb = 8
- self.mount_a.run_shell(["mkdir", "snapdir"])
- self.mount_a.run_shell(["mkdir", "snapdir/subdir"])
- self.mount_a.write_test_pattern("snapdir/subdir/file_a", size_mb * 1024 * 1024)
- file_a_ino = self.mount_a.path_to_ino("snapdir/subdir/file_a")
-
- # Snapshot the dir
- self.mount_a.run_shell(["mkdir", "snapdir/.snap/snap1"])
-
- # Cause the head revision to deviate from the snapshot
- self.mount_a.write_n_mb("snapdir/subdir/file_a", size_mb)
-
- # Flush the journal so that backtraces, dirfrag objects will actually be written
- self.fs.mds_asok(["flush", "journal"])
-
- # Unlink the file
- self.mount_a.run_shell(["rm", "-f", "snapdir/subdir/file_a"])
- self.mount_a.run_shell(["rmdir", "snapdir/subdir"])
-
- # Unmount the client so that when we come back to check that the data is still
- # in the file, we aren't just reading back the page cache.
- self.mount_a.umount_wait()
-
- self.assertEqual(self.get_mdc_stat("strays_created"), 2)
-
- # FIXME: at this stage we see a purge and the stray count drops to
- # zero, but there's actually still a stray, so at the very
- # least the StrayManager stats code is slightly off
-
- self.mount_a.mount()
-
- # See that the data from the snapshotted revision of the file is still present
- # and correct
- self.mount_a.validate_test_pattern("snapdir/.snap/snap1/subdir/file_a", size_mb * 1024 * 1024)
-
- # Remove the snapshot
- self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"])
- self.mount_a.umount_wait()
-
- # Purging file_a doesn't happen until after we've flushed the journal, because
- # it is referenced by the snapshotted subdir, and the snapshot isn't really
- # gone until the journal references to it are gone
- self.fs.mds_asok(["flush", "journal"])
-
- # See that a purge happens now
- self.wait_until_equal(
- lambda: self.get_mdc_stat("strays_purged"),
- expect_val=2, timeout=60, reject_fn=lambda x: x > 2
- )
-
- self.assertTrue(self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024))
- self.await_data_pool_empty()
-
- def test_fancy_layout(self):
- """
- purge stray file with fancy layout
- """
-
- file_name = "fancy_layout_file"
- self.mount_a.run_shell(["touch", file_name])
-
- file_layout = "stripe_unit=1048576 stripe_count=4 object_size=8388608"
- self.mount_a.run_shell(["setfattr", "-n", "ceph.file.layout", "-v", file_layout, file_name])
-
- # 35MB requires 7 objects
- size_mb = 35
- self.mount_a.write_n_mb(file_name, size_mb)
-
- self.mount_a.run_shell(["rm", "-f", file_name])
- self.fs.mds_asok(["flush", "journal"])
-
- # can't use self.fs.data_objects_absent here, it does not support fancy layout
- self.await_data_pool_empty()
-
- def test_dirfrag_limit(self):
- """
- That the directory fragment size cannot exceed mds_bal_fragment_size_max (using a limit of 50 in all configurations).
-
- That fragmentation (forced) will allow more entries to be created.
-
- That unlinking fails when the stray directory fragment becomes too large and that unlinking may continue once those strays are purged.
- """
-
- self.fs.mon_manager.raw_cluster_cmd("mds", "set", "allow_dirfrags", "true", "--yes-i-really-mean-it")
-
- LOW_LIMIT = 50
- for mds in self.fs.get_daemon_names():
- self.fs.mds_asok(["config", "set", "mds_bal_fragment_size_max", str(LOW_LIMIT)], mds)
-
- try:
- self.mount_a.run_python(dedent("""
- import os
- path = os.path.join("{path}", "subdir")
- os.mkdir(path)
- for n in range(0, {file_count}):
- open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
- """.format(
- path=self.mount_a.mountpoint,
- file_count=LOW_LIMIT+1
- )))
- except CommandFailedError:
- pass # ENOSPC
- else:
- raise RuntimeError("fragment size exceeded")
-
- # Now test that we can go beyond the limit if we fragment the directory
-
- self.mount_a.run_python(dedent("""
- import os
- path = os.path.join("{path}", "subdir2")
- os.mkdir(path)
- for n in range(0, {file_count}):
- open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
- dfd = os.open(path, os.O_DIRECTORY)
- os.fsync(dfd)
- """.format(
- path=self.mount_a.mountpoint,
- file_count=LOW_LIMIT
- )))
-
- # Ensure that subdir2 is fragmented
- mds_id = self.fs.get_active_names()[0]
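- # splitting frag 0/0 by one bit leaves the directory with two fragments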
- self.fs.mds_asok(["dirfrag", "split", "/subdir2", "0/0", "1"], mds_id)
-
- # remount+flush (release client caps)
- self.mount_a.umount_wait()
- self.fs.mds_asok(["flush", "journal"], mds_id)
- self.mount_a.mount()
- self.mount_a.wait_until_mounted()
-
- # Create 50% more files than the current fragment limit
- self.mount_a.run_python(dedent("""
- import os
- path = os.path.join("{path}", "subdir2")
- for n in range({file_count}, ({file_count}*3)//2):
- open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
- """.format(
- path=self.mount_a.mountpoint,
- file_count=LOW_LIMIT
- )))
-
- # Now test the stray directory size is limited and recovers
- strays_before = self.get_mdc_stat("strays_created")
- try:
- self.mount_a.run_python(dedent("""
- import os
- path = os.path.join("{path}", "subdir3")
- os.mkdir(path)
- for n in range({file_count}):
- fpath = os.path.join(path, "%s" % n)
- f = open(fpath, 'w')
- f.write("%s" % n)
- f.close()
- os.unlink(fpath)
- """.format(
- path=self.mount_a.mountpoint,
- file_count=LOW_LIMIT*10 # 10 stray directories, should collide before this count
- )))
- except CommandFailedError:
- pass # ENOSPC
- else:
- raise RuntimeError("fragment size exceeded")
-
- strays_after = self.get_mdc_stat("strays_created")
- self.assertGreaterEqual(strays_after-strays_before, LOW_LIMIT)
-
- self.wait_until_equal(
- lambda: self.get_mdc_stat("strays_purged"),
- strays_after,
- timeout=600
- )
-
- self.mount_a.run_python(dedent("""
- import os
- path = os.path.join("{path}", "subdir4")
- os.mkdir(path)
- for n in range({file_count}):
- fpath = os.path.join(path, "%s" % n)
- f = open(fpath, 'w')
- f.write("%s" % n)
- f.close()
- os.unlink(fpath)
- """.format(
- path=self.mount_a.mountpoint,
- file_count=LOW_LIMIT
- )))
+++ /dev/null
-import json
-import logging
-import time
-import os
-from textwrap import dedent
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-from tasks.cephfs.fuse_mount import FuseMount
-from teuthology.exceptions import CommandFailedError
-
-log = logging.getLogger(__name__)
-
-
-class TestVolumeClient(CephFSTestCase):
- #
- # TODO: Test that VolumeClient can recover from partial auth updates.
- #
-
- # One for looking at the global filesystem, one for being
- # the VolumeClient, two for mounting the created shares
- CLIENTS_REQUIRED = 4
-
- def _volume_client_python(self, client, script, vol_prefix=None, ns_prefix=None):
- # Can't dedent this *and* the script we pass in, because they might have different
- # levels of indentation to begin with, so leave this string zero-indented
- if vol_prefix:
- vol_prefix = "\"" + vol_prefix + "\""
- if ns_prefix:
- ns_prefix = "\"" + ns_prefix + "\""
- return client.run_python("""
-from ceph_volume_client import CephFSVolumeClient, VolumePath
-import logging
-log = logging.getLogger("ceph_volume_client")
-log.addHandler(logging.StreamHandler())
-log.setLevel(logging.DEBUG)
-vc = CephFSVolumeClient("manila", "{conf_path}", "ceph", {vol_prefix}, {ns_prefix})
-vc.connect()
-{payload}
-vc.disconnect()
- """.format(payload=script, conf_path=client.config_path, vol_prefix=vol_prefix, ns_prefix=ns_prefix))
-
- def _sudo_write_file(self, remote, path, data):
- """
- Write data to a remote file as super user
-
- :param remote: Remote site.
- :param path: Path on the remote being written to.
- :param data: Data to be written.
-
- The file is written with sudo, so root-owned destination paths can be updated.
- """
- remote.run(
- args=[
- 'sudo',
- 'python',
- '-c',
- 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
- path,
- ],
- stdin=data,
- )
-
- def _configure_vc_auth(self, mount, id_name):
- """
- Set up auth credentials for the VolumeClient user
- """
- out = self.fs.mon_manager.raw_cluster_cmd(
- "auth", "get-or-create", "client.{name}".format(name=id_name),
- "mds", "allow *",
- "osd", "allow rw",
- "mon", "allow *"
- )
- mount.client_id = id_name
- self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), out)
- self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
-
- def _configure_guest_auth(self, volumeclient_mount, guest_mount,
- guest_entity, mount_path,
- namespace_prefix=None, readonly=False,
- tenant_id=None):
- """
- Set up auth credentials for the guest client to mount a volume.
-
- :param volumeclient_mount: mount used as the handle for driving
- volumeclient.
- :param guest_mount: mount used by the guest client.
- :param guest_entity: auth ID used by the guest client.
- :param mount_path: path of the volume.
- :param namespace_prefix: name prefix of the RADOS namespace, which
- is used for the volume's layout.
- :param readonly: defaults to False. If set to 'True' only read-only
- mount access is granted to the guest.
- :param tenant_id: (OpenStack) tenant ID of the guest client.
- """
-
- head, volume_id = os.path.split(mount_path)
- head, group_id = os.path.split(head)
- head, volume_prefix = os.path.split(head)
- volume_prefix = "/" + volume_prefix
-
- # Authorize the guest client's auth ID to mount the volume.
- key = self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- auth_result = vc.authorize(vp, "{guest_entity}", readonly={readonly},
- tenant_id="{tenant_id}")
- print auth_result['auth_key']
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- guest_entity=guest_entity,
- readonly=readonly,
- tenant_id=tenant_id)), volume_prefix, namespace_prefix
- )
-
- # CephFSVolumeClient's authorize() does not return the secret
- # key to a caller who isn't multi-tenant aware. Explicitly
- # query the key for such a client.
- if not tenant_id:
- key = self.fs.mon_manager.raw_cluster_cmd(
- "auth", "get-key", "client.{name}".format(name=guest_entity),
- )
-
- # The guest auth ID should exist.
- existing_ids = [a['entity'] for a in self.auth_list()]
- self.assertIn("client.{0}".format(guest_entity), existing_ids)
-
- # Create keyring file for the guest client.
- keyring_txt = dedent("""
- [client.{guest_entity}]
- key = {key}
-
- """.format(
- guest_entity=guest_entity,
- key=key
- ))
- guest_mount.client_id = guest_entity
- self._sudo_write_file(guest_mount.client_remote,
- guest_mount.get_keyring_path(),
- keyring_txt)
-
- # Add a guest client section to the ceph config file.
- self.set_conf("client.{0}".format(guest_entity), "client quota", "True")
- self.set_conf("client.{0}".format(guest_entity), "debug client", "20")
- self.set_conf("client.{0}".format(guest_entity), "debug objecter", "20")
- self.set_conf("client.{0}".format(guest_entity),
- "keyring", guest_mount.get_keyring_path())
-
- def test_default_prefix(self):
- group_id = "grpid"
- volume_id = "volid"
- DEFAULT_VOL_PREFIX = "volumes"
- DEFAULT_NS_PREFIX = "fsvolumens_"
-
- self.mount_b.umount_wait()
- self._configure_vc_auth(self.mount_b, "manila")
-
- # create a volume with default prefix
- self._volume_client_python(self.mount_b, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.create_volume(vp, 10, data_isolated=True)
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- )))
-
- # The dir should be created
- self.mount_a.stat(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id))
-
- # namespace should be set
- ns_in_attr = self.mount_a.getfattr(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id), "ceph.dir.layout.pool_namespace")
- namespace = "{0}{1}".format(DEFAULT_NS_PREFIX, volume_id)
- self.assertEqual(namespace, ns_in_attr)
-
-
- def test_lifecycle(self):
- """
- General smoke test for create, extend, destroy
- """
-
- # I'm going to use mount_c later as a guest for mounting the created
- # shares
- self.mounts[2].umount_wait()
-
- # I'm going to leave mount_b unmounted and just use it as a handle for
- # driving volumeclient. It's a little hacky but we don't have a more
- # general concept for librados/libcephfs clients as opposed to full
- # blown mounting clients.
- self.mount_b.umount_wait()
- self._configure_vc_auth(self.mount_b, "manila")
-
- guest_entity = "guest"
- group_id = "grpid"
- volume_id = "volid"
-
- volume_prefix = "/myprefix"
- namespace_prefix = "mynsprefix_"
-
- # Create a 100MB volume
- volume_size = 100
- mount_path = self._volume_client_python(self.mount_b, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- create_result = vc.create_volume(vp, 1024*1024*{volume_size})
- print create_result['mount_path']
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- volume_size=volume_size
- )), volume_prefix, namespace_prefix)
-
- # The dir should be created
- self.mount_a.stat(os.path.join("myprefix", group_id, volume_id))
-
- # Authorize and configure credentials for the guest to mount the
- # volume.
- self._configure_guest_auth(self.mount_b, self.mounts[2], guest_entity,
- mount_path, namespace_prefix)
- self.mounts[2].mount(mount_path=mount_path)
-
- # The kernel client doesn't have the quota-based df behaviour,
- # or quotas at all, so only exercise the client behaviour when
- # running fuse.
- if isinstance(self.mounts[2], FuseMount):
- # df should see volume size, same as the quota set on volume's dir
- self.assertEqual(self.mounts[2].df()['total'],
- volume_size * 1024 * 1024)
- self.assertEqual(
- self.mount_a.getfattr(
- os.path.join(volume_prefix.strip("/"), group_id, volume_id),
- "ceph.quota.max_bytes"),
- "%s" % (volume_size * 1024 * 1024))
-
- # df granularity is 4MB block so have to write at least that much
- data_bin_mb = 4
- self.mounts[2].write_n_mb("data.bin", data_bin_mb)
-
- # Write something outside volume to check this space usage is
- # not reported in the volume's DF.
- other_bin_mb = 6
- self.mount_a.write_n_mb("other.bin", other_bin_mb)
-
- # global: df should see all the writes (data + other). This is a >
- # rather than a == because the global space used includes all pools
- self.assertGreater(self.mount_a.df()['used'],
- (data_bin_mb + other_bin_mb) * 1024 * 1024)
-
- # Hack: do a metadata IO to kick rstats
- self.mounts[2].run_shell(["touch", "foo"])
-
- # volume: df should see the data_bin_mb consumed from quota, same
- # as the rbytes for the volume's dir
- self.wait_until_equal(
- lambda: self.mounts[2].df()['used'],
- data_bin_mb * 1024 * 1024, timeout=60)
- self.wait_until_equal(
- lambda: self.mount_a.getfattr(
- os.path.join(volume_prefix.strip("/"), group_id, volume_id),
- "ceph.dir.rbytes"),
- "%s" % (data_bin_mb * 1024 * 1024), timeout=60)
-
- # sync so that file data is persisted to RADOS
- self.mounts[2].run_shell(["sync"])
-
- # Our data should stay in the volume's RADOS namespace
- pool_name = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool")
- namespace = "{0}{1}".format(namespace_prefix, volume_id)
- ns_in_attr = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool_namespace")
- self.assertEqual(namespace, ns_in_attr)
-
- objects_in_ns = set(self.fs.rados(["ls"], pool=pool_name, namespace=namespace).split("\n"))
- self.assertNotEqual(objects_in_ns, set())
-
- # De-authorize the guest
- self._volume_client_python(self.mount_b, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.deauthorize(vp, "{guest_entity}")
- vc.evict("{guest_entity}")
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- guest_entity=guest_entity
- )), volume_prefix, namespace_prefix)
-
- # Once deauthorized, the client should be unable to do any more metadata ops.
- # The way that the client currently behaves here is to block (it acts as if
- # it has lost the network, because there is nothing to tell it that its messages
- # are being dropped because its identity is gone).
- background = self.mounts[2].write_n_mb("rogue.bin", 1, wait=False)
- time.sleep(10) # Approximate check for 'stuck' as 'still running after 10s'
- self.assertFalse(background.finished)
-
- # After deauthorisation, the client ID should be gone (this was the only
- # volume it was authorised for)
- self.assertNotIn("client.{0}".format(guest_entity), [e['entity'] for e in self.auth_list()])
-
- # Clean up the dead mount (ceph-fuse's behaviour here is a bit undefined)
- self.mounts[2].kill()
- self.mounts[2].kill_cleanup()
- try:
- background.wait()
- except CommandFailedError:
- # We killed the mount out from under it
- pass
-
- self._volume_client_python(self.mount_b, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.delete_volume(vp)
- vc.purge_volume(vp)
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- )), volume_prefix, namespace_prefix)
-
- def test_idempotency(self):
- """
- That the volumeclient interface works when calling everything twice
- """
- self.mount_b.umount_wait()
- self._configure_vc_auth(self.mount_b, "manila")
-
- guest_entity = "guest"
- group_id = "grpid"
- volume_id = "volid"
- self._volume_client_python(self.mount_b, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.create_volume(vp, 10)
- vc.create_volume(vp, 10)
- vc.authorize(vp, "{guest_entity}")
- vc.authorize(vp, "{guest_entity}")
- vc.deauthorize(vp, "{guest_entity}")
- vc.deauthorize(vp, "{guest_entity}")
- vc.delete_volume(vp)
- vc.delete_volume(vp)
- vc.purge_volume(vp)
- vc.purge_volume(vp)
-
- vc.create_volume(vp, 10, data_isolated=True)
- vc.create_volume(vp, 10, data_isolated=True)
- vc.authorize(vp, "{guest_entity}")
- vc.authorize(vp, "{guest_entity}")
- vc.deauthorize(vp, "{guest_entity}")
- vc.deauthorize(vp, "{guest_entity}")
- vc.evict("{guest_entity}")
- vc.evict("{guest_entity}")
- vc.delete_volume(vp, data_isolated=True)
- vc.delete_volume(vp, data_isolated=True)
- vc.purge_volume(vp, data_isolated=True)
- vc.purge_volume(vp, data_isolated=True)
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- guest_entity=guest_entity
- )))
-
- def test_data_isolated(self):
- """
- That data isolated shares get their own pool
- :return:
- """
-
- # Because the teuthology config template sets mon_pg_warn_max_per_osd to
- # 10000 (i.e. it just tries to ignore health warnings), reset it to something
- # sane before using volume_client, to avoid creating pools with absurdly large
- # numbers of PGs.
- self.set_conf("global", "mon pg warn max per osd", "300")
- for mon_daemon_state in self.ctx.daemons.iter_daemons_of_role('mon'):
- mon_daemon_state.restart()
-
- self.mount_b.umount_wait()
- self._configure_vc_auth(self.mount_b, "manila")
-
- # Calculate how many PGs we'll expect the new volume pool to have
- osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
- max_per_osd = int(self.fs.get_config('mon_pg_warn_max_per_osd'))
- osd_count = len(osd_map['osds'])
- max_overall = osd_count * max_per_osd
-
- existing_pg_count = 0
- for p in osd_map['pools']:
- existing_pg_count += p['pg_num']
-
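- # Mirror the volume client's heuristic: the new pool is expected to get roughly
- # 10% of the remaining PG headroom (max allowed overall minus PGs already allocated)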
- expected_pg_num = (max_overall - existing_pg_count) / 10
- log.info("max_per_osd {0}".format(max_per_osd))
- log.info("osd_count {0}".format(osd_count))
- log.info("max_overall {0}".format(max_overall))
- log.info("existing_pg_count {0}".format(existing_pg_count))
- log.info("expected_pg_num {0}".format(expected_pg_num))
-
- pools_a = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
-
- group_id = "grpid"
- volume_id = "volid"
- self._volume_client_python(self.mount_b, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.create_volume(vp, 10, data_isolated=True)
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- )))
-
- pools_b = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
-
- # Should have created one new pool
- new_pools = set(p['pool_name'] for p in pools_b) - set([p['pool_name'] for p in pools_a])
- self.assertEqual(len(new_pools), 1)
-
- # It should have followed the heuristic for PG count
- # (this is an overly strict test condition, so we may want to remove
- # it at some point as/when the logic gets fancier)
- created_pg_num = self.fs.mon_manager.get_pool_property(list(new_pools)[0], "pg_num")
- self.assertEqual(expected_pg_num, created_pg_num)
-
- def test_15303(self):
- """
- Reproducer for #15303 "Client holds incorrect complete flag on dir
- after losing caps" (http://tracker.ceph.com/issues/15303)
- """
- for m in self.mounts:
- m.umount_wait()
-
- # Create a dir on mount A
- self.mount_a.mount()
- self.mount_a.run_shell(["mkdir", "parent1"])
- self.mount_a.run_shell(["mkdir", "parent2"])
- self.mount_a.run_shell(["mkdir", "parent1/mydir"])
-
- # Put some files in it from mount B
- self.mount_b.mount()
- self.mount_b.run_shell(["touch", "parent1/mydir/afile"])
- self.mount_b.umount_wait()
-
- # List the dir's contents on mount A
- self.assertListEqual(self.mount_a.ls("parent1/mydir"),
- ["afile"])
-
- def test_evict_client(self):
- """
- That a volume client can be evicted based on its auth ID and the volume
- path it has mounted.
- """
-
- if not isinstance(self.mount_a, FuseMount):
- self.skipTest("Requires FUSE client to inject client metadata")
-
- # mounts[1] is used as the handle for driving VolumeClient. mounts[2]
- # and mounts[3] are used as guests to mount the volumes/shares.
-
- for i in range(1, 4):
- self.mounts[i].umount_wait()
-
- volumeclient_mount = self.mounts[1]
- self._configure_vc_auth(volumeclient_mount, "manila")
- guest_mounts = (self.mounts[2], self.mounts[3])
-
- guest_entity = "guest"
- group_id = "grpid"
- mount_paths = []
- volume_ids = []
-
- # Create two volumes. Authorize 'guest' auth ID to mount the two
- # volumes. Mount the two volumes. Write data to the volumes.
- for i in range(2):
- # Create volume.
- volume_ids.append("volid_{0}".format(str(i)))
- mount_paths.append(
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- create_result = vc.create_volume(vp, 10 * 1024 * 1024)
- print create_result['mount_path']
- """.format(
- group_id=group_id,
- volume_id=volume_ids[i]
- ))))
-
- # Authorize 'guest' auth ID to mount the volume.
- self._configure_guest_auth(volumeclient_mount, guest_mounts[i],
- guest_entity, mount_paths[i])
-
- # Mount the volume.
- guest_mounts[i].mountpoint_dir_name = 'mnt.{id}.{suffix}'.format(
- id=guest_entity, suffix=str(i))
- guest_mounts[i].mount(mount_path=mount_paths[i])
- guest_mounts[i].write_n_mb("data.bin", 1)
-
-
- # Evict the guest client, guest_mounts[0], which is using auth ID 'guest'
- # and has mounted one volume.
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.deauthorize(vp, "{guest_entity}")
- vc.evict("{guest_entity}", volume_path=vp)
- """.format(
- group_id=group_id,
- volume_id=volume_ids[0],
- guest_entity=guest_entity
- )))
-
- # The evicted guest client, guest_mounts[0], should not be able to do any
- # more metadata ops. It behaves as if it has lost its network
- # connection.
- background = guest_mounts[0].write_n_mb("rogue.bin", 1, wait=False)
- # Approximate check for 'stuck' as 'still running after 10s'.
- time.sleep(10)
- self.assertFalse(background.finished)
-
- # Guest client, guest_mounts[1], using the same auth ID 'guest', but
- # has mounted the other volume, should be able to use its volume
- # unaffected.
- guest_mounts[1].write_n_mb("data.bin.1", 1)
-
- # Cleanup.
- for i in range(2):
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.deauthorize(vp, "{guest_entity}")
- vc.delete_volume(vp)
- vc.purge_volume(vp)
- """.format(
- group_id=group_id,
- volume_id=volume_ids[i],
- guest_entity=guest_entity
- )))
-
- # We must hard-umount the one that we evicted
- guest_mounts[0].umount_wait(force=True)
-
- def test_purge(self):
- """
- Reproducer for #15266, exception trying to purge volumes that
- contain non-ascii filenames.
-
- Additionally test any other purge corner cases here.
- """
- # I'm going to leave mount_b unmounted and just use it as a handle for
- # driving volumeclient. It's a little hacky but we don't have a more
- # general concept for librados/libcephfs clients as opposed to full
- # blown mounting clients.
- self.mount_b.umount_wait()
- self._configure_vc_auth(self.mount_b, "manila")
-
- group_id = "grpid"
- # Use a unicode volume ID (like Manila), to reproduce #15266
- volume_id = u"volid"
-
- # Create
- mount_path = self._volume_client_python(self.mount_b, dedent("""
- vp = VolumePath("{group_id}", u"{volume_id}")
- create_result = vc.create_volume(vp, 10)
- print create_result['mount_path']
- """.format(
- group_id=group_id,
- volume_id=volume_id
- )))
-
- # Strip leading "/"
- mount_path = mount_path[1:]
-
- # A file with non-ascii characters
- self.mount_a.run_shell(["touch", os.path.join(mount_path, u"b\u00F6b")])
-
- # A file with no permissions to do anything
- self.mount_a.run_shell(["touch", os.path.join(mount_path, "noperms")])
- self.mount_a.run_shell(["chmod", "0000", os.path.join(mount_path, "noperms")])
-
- self._volume_client_python(self.mount_b, dedent("""
- vp = VolumePath("{group_id}", u"{volume_id}")
- vc.delete_volume(vp)
- vc.purge_volume(vp)
- """.format(
- group_id=group_id,
- volume_id=volume_id
- )))
-
- # Check it's really gone
- self.assertEqual(self.mount_a.ls("volumes/_deleting"), [])
- self.assertEqual(self.mount_a.ls("volumes/"), ["_deleting", group_id])
-
- def test_readonly_authorization(self):
- """
- That guest clients can be restricted to read-only mounts of volumes.
- """
-
- volumeclient_mount = self.mounts[1]
- guest_mount = self.mounts[2]
- volumeclient_mount.umount_wait()
- guest_mount.umount_wait()
-
- # Configure volumeclient_mount as the handle for driving volumeclient.
- self._configure_vc_auth(volumeclient_mount, "manila")
-
- guest_entity = "guest"
- group_id = "grpid"
- volume_id = "volid"
-
- # Create a volume.
- mount_path = self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- create_result = vc.create_volume(vp, 1024*1024*10)
- print create_result['mount_path']
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- )))
-
- # Authorize and configure credentials for the guest to mount the
- # volume with read-write access.
- self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity,
- mount_path, readonly=False)
-
- # Mount the volume, and write to it.
- guest_mount.mount(mount_path=mount_path)
- guest_mount.write_n_mb("data.bin", 1)
-
- # Change the guest auth ID's authorization to read-only mount access.
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.deauthorize(vp, "{guest_entity}")
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- guest_entity=guest_entity
- )))
- self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity,
- mount_path, readonly=True)
-
- # The effect of the change in access level to read-only is not
- # immediate. The guest sees the change only after a remount of
- # the volume.
- guest_mount.umount_wait()
- guest_mount.mount(mount_path=mount_path)
-
- # Read existing content of the volume.
- self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"])
- # Cannot write into read-only volume.
- with self.assertRaises(CommandFailedError):
- guest_mount.write_n_mb("rogue.bin", 1)
-
- def test_get_authorized_ids(self):
- """
- That for a volume, the authorized IDs and their access levels
- can be obtained using CephFSVolumeClient's get_authorized_ids().
- """
- volumeclient_mount = self.mounts[1]
- volumeclient_mount.umount_wait()
-
- # Configure volumeclient_mount as the handle for driving volumeclient.
- self._configure_vc_auth(volumeclient_mount, "manila")
-
- group_id = "grpid"
- volume_id = "volid"
- guest_entity_1 = "guest1"
- guest_entity_2 = "guest2"
-
- log.info("print group ID: {0}".format(group_id))
-
- # Create a volume.
- auths = self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.create_volume(vp, 1024*1024*10)
- auths = vc.get_authorized_ids(vp)
- print auths
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- )))
- # Check the list of authorized IDs for the volume.
- expected_result = None
- self.assertEqual(str(expected_result), auths)
-
- # Allow two auth IDs access to the volume.
- auths = self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.authorize(vp, "{guest_entity_1}", readonly=False)
- vc.authorize(vp, "{guest_entity_2}", readonly=True)
- auths = vc.get_authorized_ids(vp)
- print auths
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- guest_entity_1=guest_entity_1,
- guest_entity_2=guest_entity_2,
- )))
- # Check the list of authorized IDs and their access levels.
- expected_result = [(u'guest1', u'rw'), (u'guest2', u'r')]
- self.assertItemsEqual(str(expected_result), auths)
-
- # Disallow both the auth IDs' access to the volume.
- auths = self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.deauthorize(vp, "{guest_entity_1}")
- vc.deauthorize(vp, "{guest_entity_2}")
- auths = vc.get_authorized_ids(vp)
- print auths
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- guest_entity_1=guest_entity_1,
- guest_entity_2=guest_entity_2,
- )))
- # Check the list of authorized IDs for the volume.
- expected_result = None
- self.assertItemsEqual(str(expected_result), auths)
-
- def test_multitenant_volumes(self):
- """
- That volume access can be restricted to a tenant.
-
- That metadata used to enforce tenant isolation of
- volumes is stored as a two-way mapping between auth
- IDs and volumes that they're authorized to access.
- """
- volumeclient_mount = self.mounts[1]
- volumeclient_mount.umount_wait()
-
- # Configure volumeclient_mount as the handle for driving volumeclient.
- self._configure_vc_auth(volumeclient_mount, "manila")
-
- group_id = "groupid"
- volume_id = "volumeid"
-
- # Guest clients belonging to different tenants, but using the same
- # auth ID.
- auth_id = "guest"
- guestclient_1 = {
- "auth_id": auth_id,
- "tenant_id": "tenant1",
- }
- guestclient_2 = {
- "auth_id": auth_id,
- "tenant_id": "tenant2",
- }
-
- # Create a volume.
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.create_volume(vp, 1024*1024*10)
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- )))
-
- # Check that volume metadata file is created on volume creation.
- vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id)
- self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
-
- # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
- # 'tenant1', with 'rw' access to the volume.
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- auth_id=guestclient_1["auth_id"],
- tenant_id=guestclient_1["tenant_id"]
- )))
-
- # Check that auth metadata file for auth ID 'guest', is
- # created on authorizing 'guest' access to the volume.
- auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
- self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
-
- # Verify that the auth metadata file stores the tenant ID that the
- # auth ID belongs to, the auth ID's authorized access levels
- # for different volumes, versioning details, etc.
- expected_auth_metadata = {
- u"version": 1,
- u"compat_version": 1,
- u"dirty": False,
- u"tenant_id": u"tenant1",
- u"volumes": {
- u"groupid/volumeid": {
- u"dirty": False,
- u"access_level": u"rw",
- }
- }
- }
-
- auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- auth_metadata = vc._auth_metadata_get("{auth_id}")
- print auth_metadata
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- auth_id=guestclient_1["auth_id"],
- )))
-
- self.assertItemsEqual(str(expected_auth_metadata), auth_metadata)
-
- # Verify that the volume metadata file stores info about auth IDs
- # and their access levels to the volume, versioning details, etc.
- expected_vol_metadata = {
- u"version": 1,
- u"compat_version": 1,
- u"auths": {
- u"guest": {
- u"dirty": False,
- u"access_level": u"rw"
- }
- }
- }
-
- vol_metadata = self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- volume_metadata = vc._volume_metadata_get(vp)
- print volume_metadata
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- )))
- self.assertItemsEqual(str(expected_vol_metadata), vol_metadata)
-
- # Cannot authorize 'guestclient_2' to access the volume.
- # It uses auth ID 'guest', which has already been used by
- # 'guestclient_1', belonging to another tenant, to access
- # the volume.
- with self.assertRaises(CommandFailedError):
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- auth_id=guestclient_2["auth_id"],
- tenant_id=guestclient_2["tenant_id"]
- )))
-
- # Check that auth metadata file is cleaned up on removing
- # auth ID's only access to a volume.
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.deauthorize(vp, "{guest_entity}")
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- guest_entity=guestclient_1["auth_id"]
- )))
-
- self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
-
- # Check that volume metadata file is cleaned up on volume deletion.
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.delete_volume(vp)
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- )))
- self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
-
- def test_recover_metadata(self):
- """
- That volume client can recover from partial auth updates using
- metadata files, which store auth info and its update status info.
- """
- volumeclient_mount = self.mounts[1]
- volumeclient_mount.umount_wait()
-
- # Configure volumeclient_mount as the handle for driving volumeclient.
- self._configure_vc_auth(volumeclient_mount, "manila")
-
- group_id = "groupid"
- volume_id = "volumeid"
-
- guestclient = {
- "auth_id": "guest",
- "tenant_id": "tenant",
- }
-
- # Create a volume.
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.create_volume(vp, 1024*1024*10)
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- )))
-
- # Authorize 'guestclient' access to the volume.
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- auth_id=guestclient["auth_id"],
- tenant_id=guestclient["tenant_id"]
- )))
-
- # Check that auth metadata file for auth ID 'guest' is created.
- auth_metadata_filename = "${0}.meta".format(guestclient["auth_id"])
- self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
-
- # Induce partial auth update state by modifying the auth metadata file,
- # and then run recovery procedure.
- self._volume_client_python(volumeclient_mount, dedent("""
- vp = VolumePath("{group_id}", "{volume_id}")
- auth_metadata = vc._auth_metadata_get("{auth_id}")
- auth_metadata['dirty'] = True
- vc._auth_metadata_set("{auth_id}", auth_metadata)
- vc.recover()
- """.format(
- group_id=group_id,
- volume_id=volume_id,
- auth_id=guestclient["auth_id"],
- )))
+++ /dev/null
-import contextlib
-import logging
-import os
-import unittest
-from unittest import suite, loader, case
-from teuthology.task import interactive
-from teuthology import misc
-from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster
-from tasks.mgr.mgr_test_case import MgrCluster
-
-log = logging.getLogger(__name__)
-
-
-class DecoratingLoader(loader.TestLoader):
- """
- A specialization of TestLoader that tags some extra attributes
- onto test classes as they are loaded.
- """
- def __init__(self, params):
- self._params = params
- super(DecoratingLoader, self).__init__()
-
- def _apply_params(self, obj):
- for k, v in self._params.items():
- setattr(obj, k, v)
-
- def loadTestsFromTestCase(self, testCaseClass):
- self._apply_params(testCaseClass)
- return super(DecoratingLoader, self).loadTestsFromTestCase(testCaseClass)
-
- def loadTestsFromName(self, name, module=None):
- result = super(DecoratingLoader, self).loadTestsFromName(name, module)
-
- # Special case for when we were called with the name of a method: we get
- # a suite containing a single TestCase
- tests_in_result = list(result)
- if len(tests_in_result) == 1 and isinstance(tests_in_result[0], case.TestCase):
- self._apply_params(tests_in_result[0])
-
- return result
-
-
-class LogStream(object):
- def __init__(self):
- self.buffer = ""
-
- def write(self, data):
- self.buffer += data
- if "\n" in self.buffer:
- lines = self.buffer.split("\n")
- for line in lines[:-1]:
- log.info(line)
- self.buffer = lines[-1]
-
- def flush(self):
- pass
-
-
-class InteractiveFailureResult(unittest.TextTestResult):
- """
- Specialization that implements interactive-on-error style
- behavior.
- """
- ctx = None
-
- def addFailure(self, test, err):
- log.error(self._exc_info_to_string(err, test))
- log.error("Failure in test '{0}', going interactive".format(
- self.getDescription(test)
- ))
- interactive.task(ctx=self.ctx, config=None)
-
- def addError(self, test, err):
- log.error(self._exc_info_to_string(err, test))
- log.error("Error in test '{0}', going interactive".format(
- self.getDescription(test)
- ))
- interactive.task(ctx=self.ctx, config=None)
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run the CephFS test cases.
-
- Run everything in tasks/cephfs/test_*.py:
-
- ::
-
- tasks:
- - install:
- - ceph:
- - ceph-fuse:
- - cephfs_test_runner:
-
- `modules` argument allows running only some specific modules:
-
- ::
-
- tasks:
- ...
- - cephfs_test_runner:
- modules:
- - tasks.cephfs.test_sessionmap
- - tasks.cephfs.test_auto_repair
-
- By default, any cases that can't be run on the current cluster configuration
- will generate a failure. When the optional `fail_on_skip` argument is set
- to false, any tests that can't be run on the current configuration will
- simply be skipped:
-
- ::
-
- tasks:
- ...
- - cephfs_test_runner:
- fail_on_skip: false
-
- """
-
- ceph_cluster = CephCluster(ctx)
-
- if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))):
- mds_cluster = MDSCluster(ctx)
- fs = Filesystem(ctx)
- else:
- mds_cluster = None
- fs = None
-
- if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))):
- mgr_cluster = MgrCluster(ctx)
- else:
- mgr_cluster = None
-
- # Mount objects, sorted by ID
- if hasattr(ctx, 'mounts'):
- mounts = [v for k, v in sorted(ctx.mounts.items(), lambda a, b: cmp(a[0], b[0]))]
- else:
- # The test configuration has a filesystem but no fuse/kclient mounts
- mounts = []
-
- decorating_loader = DecoratingLoader({
- "ctx": ctx,
- "mounts": mounts,
- "fs": fs,
- "ceph_cluster": ceph_cluster,
- "mds_cluster": mds_cluster,
- "mgr_cluster": mgr_cluster,
- })
-
- fail_on_skip = config.get('fail_on_skip', True)
-
- # Put useful things onto ctx for interactive debugging
- ctx.fs = fs
- ctx.mds_cluster = mds_cluster
- ctx.mgr_cluster = mgr_cluster
-
- # Depending on config, either load specific modules, or scan for modules
- if config and 'modules' in config and config['modules']:
- module_suites = []
- for mod_name in config['modules']:
- # Test names like cephfs.test_auto_repair
- module_suites.append(decorating_loader.loadTestsFromName(mod_name))
- overall_suite = suite.TestSuite(module_suites)
- else:
- # Default, run all tests
- overall_suite = decorating_loader.discover(
- os.path.join(
- os.path.dirname(os.path.abspath(__file__)),
- "cephfs/"
- )
- )
-
- if ctx.config.get("interactive-on-error", False):
- InteractiveFailureResult.ctx = ctx
- result_class = InteractiveFailureResult
- else:
- result_class = unittest.TextTestResult
-
- class LoggingResult(result_class):
- def startTest(self, test):
- log.info("Starting test: {0}".format(self.getDescription(test)))
- return super(LoggingResult, self).startTest(test)
-
- def addSkip(self, test, reason):
- if fail_on_skip:
- # Don't just call addFailure because that requires a traceback
- self.failures.append((test, reason))
- else:
- super(LoggingResult, self).addSkip(test, reason)
-
- # Execute!
- result = unittest.TextTestRunner(
- stream=LogStream(),
- resultclass=LoggingResult,
- verbosity=2,
- failfast=True).run(overall_suite)
-
- if not result.wasSuccessful():
- result.printErrors() # duplicate output at end for convenience
-
- bad_tests = []
- for test, error in result.errors:
- bad_tests.append(str(test))
- for test, failure in result.failures:
- bad_tests.append(str(test))
-
- raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests)))
-
- yield
+++ /dev/null
-"""
-Mount cifs clients. Unmount when finished.
-"""
-import contextlib
-import logging
-import os
-
-from teuthology import misc as teuthology
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Mount/unmount a cifs client.
-
- The config is optional and defaults to mounting on all clients. If
- a config is given, it is expected to be a list of clients to do
- this operation on.
-
- Example that starts smbd and mounts cifs on all nodes::
-
- tasks:
- - ceph:
- - samba:
- - cifs-mount:
- - interactive:
-
- Example that splits smbd and cifs::
-
- tasks:
- - ceph:
- - samba: [samba.0]
- - cifs-mount: [client.0]
- - ceph-fuse: [client.1]
- - interactive:
-
- Example that specifies the share name::
-
- tasks:
- - ceph:
- - ceph-fuse:
- - samba:
- samba.0:
- cephfuse: "{testdir}/mnt.0"
- - cifs-mount:
- client.0:
- share: cephfuse
-
- :param ctx: Context
- :param config: Configuration
- """
- log.info('Mounting cifs clients...')
-
- if config is None:
- config = dict(('client.{id}'.format(id=id_), None)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client'))
- elif isinstance(config, list):
- config = dict((name, None) for name in config)
-
- clients = list(teuthology.get_clients(ctx=ctx, roles=config.keys()))
-
- from .samba import get_sambas
- samba_roles = ['samba.{id_}'.format(id_=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba')]
- sambas = list(get_sambas(ctx=ctx, roles=samba_roles))
- (ip, _) = sambas[0][1].ssh.get_transport().getpeername()
- log.info('samba ip: {ip}'.format(ip=ip))
-
- for id_, remote in clients:
- mnt = os.path.join(teuthology.get_testdir(ctx), 'mnt.{id}'.format(id=id_))
- log.info('Mounting cifs client.{id} at {remote} {mnt}...'.format(
- id=id_, remote=remote,mnt=mnt))
-
- remote.run(
- args=[
- 'mkdir',
- '--',
- mnt,
- ],
- )
-
- rolestr = 'client.{id_}'.format(id_=id_)
- unc = "ceph"
- log.info("config: {c}".format(c=config))
- if config[rolestr] is not None and 'share' in config[rolestr]:
- unc = config[rolestr]['share']
-
- remote.run(
- args=[
- 'sudo',
- 'mount',
- '-t',
- 'cifs',
- '//{sambaip}/{unc}'.format(sambaip=ip, unc=unc),
- '-o',
- 'username=ubuntu,password=ubuntu',
- mnt,
- ],
- )
-
- remote.run(
- args=[
- 'sudo',
- 'chown',
- 'ubuntu:ubuntu',
- '{m}/'.format(m=mnt),
- ],
- )
-
- try:
- yield
- finally:
- log.info('Unmounting cifs clients...')
- for id_, remote in clients:
- remote.run(
- args=[
- 'sudo',
- 'umount',
- mnt,
- ],
- )
- for id_, remote in clients:
- while True:
- try:
- remote.run(
- args=[
- 'rmdir', '--', mnt,
- run.Raw('2>&1'),
- run.Raw('|'),
- 'grep', 'Device or resource busy',
- ],
- )
- import time
- time.sleep(1)
- except Exception:
- break
+++ /dev/null
-"""
-Cram tests
-"""
-import logging
-import os
-
-from teuthology import misc as teuthology
-from teuthology.parallel import parallel
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Run all cram tests from the specified urls on the specified
- clients. Each client runs tests in parallel.
-
- Limitations:
- Tests must have a .t suffix. Tests with duplicate names will
- overwrite each other, so only the last one will run.
-
- For example::
-
- tasks:
- - ceph:
- - cram:
- clients:
- client.0:
- - http://ceph.com/qa/test.t
- - http://ceph.com/qa/test2.t
- client.1: [http://ceph.com/qa/test.t]
- branch: foo
-
- You can also run a list of cram tests on all clients::
-
- tasks:
- - ceph:
- - cram:
- clients:
- all: [http://ceph.com/qa/test.t]
-
- :param ctx: Context
- :param config: Configuration
- """
- assert isinstance(config, dict)
- assert 'clients' in config and isinstance(config['clients'], dict), \
- 'configuration must contain a dictionary of clients'
-
- clients = teuthology.replace_all_with_clients(ctx.cluster,
- config['clients'])
- testdir = teuthology.get_testdir(ctx)
-
- overrides = ctx.config.get('overrides', {})
- teuthology.deep_merge(config, overrides.get('workunit', {}))
-
- refspec = config.get('branch')
- if refspec is None:
- refspec = config.get('tag')
- if refspec is None:
- refspec = config.get('sha1')
- if refspec is None:
- refspec = 'HEAD'
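- # Resolution order (as implemented above): 'branch', then 'tag', then
- # 'sha1', falling back to 'HEAD' when none of them is set.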
-
- try:
- for client, tests in clients.iteritems():
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
- client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client)
- remote.run(
- args=[
- 'mkdir', '--', client_dir,
- run.Raw('&&'),
- 'virtualenv', '{tdir}/virtualenv'.format(tdir=testdir),
- run.Raw('&&'),
- '{tdir}/virtualenv/bin/pip'.format(tdir=testdir),
- 'install', 'cram==0.6',
- ],
- )
- for test in tests:
- log.info('fetching test %s for %s', test, client)
- assert test.endswith('.t'), 'tests must end in .t'
- remote.run(
- args=[
- 'wget', '-nc', '-nv', '-P', client_dir, '--', test.format(branch=refspec),
- ],
- )
-
- with parallel() as p:
- for role in clients.iterkeys():
- p.spawn(_run_tests, ctx, role)
- finally:
- for client, tests in clients.iteritems():
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
- client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client)
- test_files = set([test.rsplit('/', 1)[1] for test in tests])
-
- # remove test files unless they failed
- for test_file in test_files:
- abs_file = os.path.join(client_dir, test_file)
- remote.run(
- args=[
- 'test', '-f', abs_file + '.err',
- run.Raw('||'),
- 'rm', '-f', '--', abs_file,
- ],
- )
-
- # ignore failure since more than one client may
- # be run on a host, and the client dir should be
- # non-empty if the test failed
- remote.run(
- args=[
- 'rm', '-rf', '--',
- '{tdir}/virtualenv'.format(tdir=testdir),
- run.Raw(';'),
- 'rmdir', '--ignore-fail-on-non-empty', client_dir,
- ],
- )
-
-def _run_tests(ctx, role):
- """
- For each role, check to make sure it's a client, then run the cram on that client
-
- :param ctx: Context
- :param role: Roles
- """
- assert isinstance(role, basestring)
- PREFIX = 'client.'
- assert role.startswith(PREFIX)
- id_ = role[len(PREFIX):]
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
- ceph_ref = ctx.summary.get('ceph-sha1', 'master')
-
- testdir = teuthology.get_testdir(ctx)
- log.info('Running tests for %s...', role)
- remote.run(
- args=[
- run.Raw('CEPH_REF={ref}'.format(ref=ceph_ref)),
- run.Raw('CEPH_ID="{id}"'.format(id=id_)),
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- '{tdir}/virtualenv/bin/cram'.format(tdir=testdir),
- '-v', '--',
- run.Raw('{tdir}/archive/cram.{role}/*.t'.format(tdir=testdir, role=role)),
- ],
- logger=log.getChild(role),
- )
+++ /dev/null
-"""
-Rados model-based integration tests
-"""
-import contextlib
-import logging
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- For each combination of namespace and name_length, create
- <num_objects> objects with name length <name_length>
- on entry. On exit, verify that the objects still exist, can
- be deleted, and then don't exist.
-
- Usage::
-
- create_verify_lfn_objects.py:
- pool: <pool_name> default: 'data'
- prefix: <prefix> default: ''
- namespace: [<namespace>] default: ['']
- num_objects: [<num_objects>] default: 10
- name_length: [<name_length>] default: [400]
- """
- pool = config.get('pool', 'data')
- num_objects = config.get('num_objects', 10)
- name_length = config.get('name_length', [400])
- namespace = config.get('namespace', [None])
- prefix = config.get('prefix', '')
- manager = ctx.managers['ceph']
-
- objects = []
- for l in name_length:
- for ns in namespace:
- def object_name(i):
- nslength = 0
- if ns:
- nslength = len(ns)
- numstr = str(i)
- fillerlen = l - nslength - len(prefix) - len(numstr)
- assert fillerlen >= 0
- return prefix + ('a'*fillerlen) + numstr
- objects += [(ns, object_name(i)) for i in range(num_objects)]
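- # Worked example with hypothetical values: with prefix='', ns=None and
- # l=400, object 7 gets fillerlen = 400 - 0 - 0 - 1 = 399, so its name is
- # 399 'a' characters followed by '7' -- exactly 400 bytes long.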
-
- for ns, name in objects:
- err = manager.do_put(
- pool,
- name,
- '/etc/resolv.conf',
- namespace=ns)
- log.info("err is " + str(err))
- assert err == 0
-
- try:
- yield
- finally:
- log.info('ceph_verify_lfn_objects verifying...')
- for ns, name in objects:
- err = manager.do_get(
- pool,
- name,
- namespace=ns)
- log.info("err is " + str(err))
- assert err == 0
-
- log.info('ceph_verify_lfn_objects deleting...')
- for ns, name in objects:
- err = manager.do_rm(
- pool,
- name,
- namespace=ns)
- log.info("err is " + str(err))
- assert err == 0
-
- log.info('ceph_verify_lfn_objects verifying absent...')
- for ns, name in objects:
- err = manager.do_get(
- pool,
- name,
- namespace=ns)
- log.info("err is " + str(err))
- assert err != 0
+++ /dev/null
-#!/usr/bin/env python
-import contextlib
-import logging
-from cStringIO import StringIO
-import textwrap
-from configparser import ConfigParser
-import time
-
-from teuthology.orchestra import run
-from teuthology import misc
-from teuthology.contextutil import nested
-
-log = logging.getLogger(__name__)
-
-DEVSTACK_GIT_REPO = 'https://github.com/openstack-dev/devstack.git'
-DS_STABLE_BRANCHES = ("havana", "grizzly")
-
-is_devstack_node = lambda role: role.startswith('devstack')
-is_osd_node = lambda role: role.startswith('osd')
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- if config is None:
- config = {}
- if not isinstance(config, dict):
- raise TypeError("config must be a dict")
- with nested(lambda: install(ctx=ctx, config=config),
- lambda: smoke(ctx=ctx, config=config),
- ):
- yield
-
-
-@contextlib.contextmanager
-def install(ctx, config):
- """
- Install OpenStack DevStack and configure it to use a Ceph cluster for
- Glance and Cinder.
-
- Requires one node with a role 'devstack'
-
- Since devstack runs rampant on the system it's used on, typically you will
- want to reprovision that machine after using devstack on it.
-
- Also, the default 2GB of RAM that is given to vps nodes is insufficient. I
- recommend 4GB. Downburst can be instructed to give 4GB to a vps node by
- adding this to the yaml:
-
- downburst:
- ram: 4G
-
- This was created using documentation found here:
- https://github.com/openstack-dev/devstack/blob/master/README.md
- http://ceph.com/docs/master/rbd/rbd-openstack/
- """
- if config is None:
- config = {}
- if not isinstance(config, dict):
- raise TypeError("config must be a dict")
-
- devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0]
- an_osd_node = ctx.cluster.only(is_osd_node).remotes.keys()[0]
-
- devstack_branch = config.get("branch", "master")
- install_devstack(devstack_node, devstack_branch)
- try:
- configure_devstack_and_ceph(ctx, config, devstack_node, an_osd_node)
- yield
- finally:
- pass
-
-
-def install_devstack(devstack_node, branch="master"):
- log.info("Cloning DevStack repo...")
-
- args = ['git', 'clone', DEVSTACK_GIT_REPO]
- devstack_node.run(args=args)
-
- if branch != "master":
- if branch in DS_STABLE_BRANCHES and not branch.startswith("stable"):
- branch = "stable/" + branch
- log.info("Checking out {branch} branch...".format(branch=branch))
- cmd = "cd devstack && git checkout " + branch
- devstack_node.run(args=cmd)
-
- log.info("Installing DevStack...")
- args = ['cd', 'devstack', run.Raw('&&'), './stack.sh']
- devstack_node.run(args=args)
-
-
-def configure_devstack_and_ceph(ctx, config, devstack_node, ceph_node):
- pool_size = config.get('pool_size', '128')
- create_pools(ceph_node, pool_size)
- distribute_ceph_conf(devstack_node, ceph_node)
- # This is where we would install python-ceph and ceph-common but it appears
- # the ceph task does that for us.
- generate_ceph_keys(ceph_node)
- distribute_ceph_keys(devstack_node, ceph_node)
- secret_uuid = set_libvirt_secret(devstack_node, ceph_node)
- update_devstack_config_files(devstack_node, secret_uuid)
- set_apache_servername(devstack_node)
- # Rebooting is the most-often-used method of restarting devstack services
- misc.reboot(devstack_node)
- start_devstack(devstack_node)
- restart_apache(devstack_node)
-
-
-def create_pools(ceph_node, pool_size):
- log.info("Creating pools on Ceph cluster...")
-
- for pool_name in ['volumes', 'images', 'backups']:
- args = ['sudo', 'ceph', 'osd', 'pool', 'create', pool_name, pool_size]
- ceph_node.run(args=args)
-
-
-def distribute_ceph_conf(devstack_node, ceph_node):
- log.info("Copying ceph.conf to DevStack node...")
-
- ceph_conf_path = '/etc/ceph/ceph.conf'
- ceph_conf = misc.get_file(ceph_node, ceph_conf_path, sudo=True)
- misc.sudo_write_file(devstack_node, ceph_conf_path, ceph_conf)
-
-
-def generate_ceph_keys(ceph_node):
- log.info("Generating Ceph keys...")
-
- ceph_auth_cmds = [
- ['sudo', 'ceph', 'auth', 'get-or-create', 'client.cinder', 'mon',
- 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rx pool=images'], # noqa
- ['sudo', 'ceph', 'auth', 'get-or-create', 'client.glance', 'mon',
- 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=images'], # noqa
- ['sudo', 'ceph', 'auth', 'get-or-create', 'client.cinder-backup', 'mon',
- 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=backups'], # noqa
- ]
- for cmd in ceph_auth_cmds:
- ceph_node.run(args=cmd)
-
-
-def distribute_ceph_keys(devstack_node, ceph_node):
- log.info("Copying Ceph keys to DevStack node...")
-
- def copy_key(from_remote, key_name, to_remote, dest_path, owner):
- key_stringio = StringIO()
- from_remote.run(
- args=['sudo', 'ceph', 'auth', 'get-or-create', key_name],
- stdout=key_stringio)
- key_stringio.seek(0)
- misc.sudo_write_file(to_remote, dest_path,
- key_stringio, owner=owner)
- keys = [
- dict(name='client.glance',
- path='/etc/ceph/ceph.client.glance.keyring',
- # devstack appears to just want root:root
- #owner='glance:glance',
- ),
- dict(name='client.cinder',
- path='/etc/ceph/ceph.client.cinder.keyring',
- # devstack appears to just want root:root
- #owner='cinder:cinder',
- ),
- dict(name='client.cinder-backup',
- path='/etc/ceph/ceph.client.cinder-backup.keyring',
- # devstack appears to just want root:root
- #owner='cinder:cinder',
- ),
- ]
- for key_dict in keys:
- copy_key(ceph_node, key_dict['name'], devstack_node,
- key_dict['path'], key_dict.get('owner'))
-
-
-def set_libvirt_secret(devstack_node, ceph_node):
- log.info("Setting libvirt secret...")
-
- cinder_key_stringio = StringIO()
- ceph_node.run(args=['sudo', 'ceph', 'auth', 'get-key', 'client.cinder'],
- stdout=cinder_key_stringio)
- cinder_key = cinder_key_stringio.getvalue().strip()
-
- uuid_stringio = StringIO()
- devstack_node.run(args=['uuidgen'], stdout=uuid_stringio)
- uuid = uuid_stringio.getvalue().strip()
-
- secret_path = '/tmp/secret.xml'
- secret_template = textwrap.dedent("""
- <secret ephemeral='no' private='no'>
- <uuid>{uuid}</uuid>
- <usage type='ceph'>
- <name>client.cinder secret</name>
- </usage>
- </secret>""")
- misc.sudo_write_file(devstack_node, secret_path,
- secret_template.format(uuid=uuid))
- devstack_node.run(args=['sudo', 'virsh', 'secret-define', '--file',
- secret_path])
- devstack_node.run(args=['sudo', 'virsh', 'secret-set-value', '--secret',
- uuid, '--base64', cinder_key])
- return uuid
-
-
-def update_devstack_config_files(devstack_node, secret_uuid):
- log.info("Updating DevStack config files to use Ceph...")
-
- def backup_config(node, file_name, backup_ext='.orig.teuth'):
- node.run(args=['cp', '-f', file_name, file_name + backup_ext])
-
- def update_config(config_name, config_stream, update_dict,
- section='DEFAULT'):
- parser = ConfigParser()
- parser.read_file(config_stream)
- for (key, value) in update_dict.items():
- parser.set(section, key, value)
- out_stream = StringIO()
- parser.write(out_stream)
- out_stream.seek(0)
- return out_stream
-
- updates = [
- dict(name='/etc/glance/glance-api.conf', options=dict(
- default_store='rbd',
- rbd_store_user='glance',
- rbd_store_pool='images',
- show_image_direct_url='True',)),
- dict(name='/etc/cinder/cinder.conf', options=dict(
- volume_driver='cinder.volume.drivers.rbd.RBDDriver',
- rbd_pool='volumes',
- rbd_ceph_conf='/etc/ceph/ceph.conf',
- rbd_flatten_volume_from_snapshot='false',
- rbd_max_clone_depth='5',
- glance_api_version='2',
- rbd_user='cinder',
- rbd_secret_uuid=secret_uuid,
- backup_driver='cinder.backup.drivers.ceph',
- backup_ceph_conf='/etc/ceph/ceph.conf',
- backup_ceph_user='cinder-backup',
- backup_ceph_chunk_size='134217728',
- backup_ceph_pool='backups',
- backup_ceph_stripe_unit='0',
- backup_ceph_stripe_count='0',
- restore_discard_excess_bytes='true',
- )),
- dict(name='/etc/nova/nova.conf', options=dict(
- libvirt_images_type='rbd',
- libvirt_images_rbd_pool='volumes',
- libvirt_images_rbd_ceph_conf='/etc/ceph/ceph.conf',
- rbd_user='cinder',
- rbd_secret_uuid=secret_uuid,
- libvirt_inject_password='false',
- libvirt_inject_key='false',
- libvirt_inject_partition='-2',
- )),
- ]
-
- for update in updates:
- file_name = update['name']
- options = update['options']
- config_str = misc.get_file(devstack_node, file_name, sudo=True)
- config_stream = StringIO(config_str)
- backup_config(devstack_node, file_name)
- new_config_stream = update_config(file_name, config_stream, options)
- misc.sudo_write_file(devstack_node, file_name, new_config_stream)
-
-
-def set_apache_servername(node):
- # Apache complains: "Could not reliably determine the server's fully
- # qualified domain name, using 127.0.0.1 for ServerName"
- # So, let's make sure it knows its name.
- log.info("Setting Apache ServerName...")
-
- hostname = node.hostname
- config_file = '/etc/apache2/conf.d/servername'
- misc.sudo_write_file(node, config_file,
- "ServerName {name}".format(name=hostname))
-
-
-def start_devstack(devstack_node):
- log.info("Patching devstack start script...")
- # This causes screen to start headless - otherwise rejoin-stack.sh fails
- # because there is no terminal attached.
- cmd = "cd devstack && sed -ie 's/screen -c/screen -dm -c/' rejoin-stack.sh"
- devstack_node.run(args=cmd)
-
- log.info("Starting devstack...")
- cmd = "cd devstack && ./rejoin-stack.sh"
- devstack_node.run(args=cmd)
-
- # This was added because I was getting timeouts on Cinder requests - which
- # were trying to access Keystone on port 5000. A more robust way to handle
- # this would be to introduce a wait-loop on devstack_node that checks to
- # see if a service is listening on port 5000.
- log.info("Waiting 30s for devstack to start...")
- time.sleep(30)
-
-
-def restart_apache(node):
- node.run(args=['sudo', '/etc/init.d/apache2', 'restart'], wait=True)
-
-
-@contextlib.contextmanager
-def exercise(ctx, config):
- log.info("Running devstack exercises...")
-
- if config is None:
- config = {}
- if not isinstance(config, dict):
- raise TypeError("config must be a dict")
-
- devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0]
-
- # TODO: save the log *and* preserve failures
- #devstack_archive_dir = create_devstack_archive(ctx, devstack_node)
-
- try:
- #cmd = "cd devstack && ./exercise.sh 2>&1 | tee {dir}/exercise.log".format( # noqa
- # dir=devstack_archive_dir)
- cmd = "cd devstack && ./exercise.sh"
- devstack_node.run(args=cmd, wait=True)
- yield
- finally:
- pass
-
-
-def create_devstack_archive(ctx, devstack_node):
- test_dir = misc.get_testdir(ctx)
- devstack_archive_dir = "{test_dir}/archive/devstack".format(
- test_dir=test_dir)
- devstack_node.run(args="mkdir -p " + devstack_archive_dir)
- return devstack_archive_dir
-
-
-@contextlib.contextmanager
-def smoke(ctx, config):
- log.info("Running a basic smoketest...")
-
- devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0]
- an_osd_node = ctx.cluster.only(is_osd_node).remotes.keys()[0]
-
- try:
- create_volume(devstack_node, an_osd_node, 'smoke0', 1)
- yield
- finally:
- pass
-
-
-def create_volume(devstack_node, ceph_node, vol_name, size):
- """
- :param size: The size of the volume, in GB
- """
- size = str(size)
- log.info("Creating a {size}GB volume named {name}...".format(
- name=vol_name,
- size=size))
- args = ['source', 'devstack/openrc', run.Raw('&&'), 'cinder', 'create',
- '--display-name', vol_name, size]
- out_stream = StringIO()
- devstack_node.run(args=args, stdout=out_stream, wait=True)
- vol_info = parse_os_table(out_stream.getvalue())
- log.debug("Volume info: %s", str(vol_info))
-
- out_stream = StringIO()
- try:
- ceph_node.run(args="rbd --id cinder ls -l volumes", stdout=out_stream,
- wait=True)
- except run.CommandFailedError:
- log.debug("Original rbd call failed; retrying without '--id cinder'")
- ceph_node.run(args="rbd ls -l volumes", stdout=out_stream,
- wait=True)
-
- assert vol_info['id'] in out_stream.getvalue(), \
- "Volume not found on Ceph cluster"
- assert vol_info['size'] == size, \
- "Volume size on Ceph cluster is different than specified"
- return vol_info['id']
-
-
-def parse_os_table(table_str):
- out_dict = dict()
- for line in table_str.split('\n'):
- if line.startswith('|'):
- items = line.split()
- out_dict[items[1]] = items[3]
- return out_dict
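-# Example with assumed CLI output: a table row such as
-# "| id | 3f8e1c2a |" splits into ['|', 'id', '|', '3f8e1c2a', '|'],
-# so parse_os_table returns {'id': '3f8e1c2a', ...}.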
+++ /dev/null
-"""
-Raise exceptions on osd coredumps or test err directories
-"""
-import contextlib
-import logging
-import time
-from teuthology.orchestra import run
-
-import ceph_manager
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Die if {testdir}/err exists or if an OSD dumps core
- """
- if config is None:
- config = {}
-
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
- log.info('num_osds is %s' % num_osds)
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- while len(manager.get_osd_status()['up']) < num_osds:
- time.sleep(10)
-
- testdir = teuthology.get_testdir(ctx)
-
- while True:
- for i in range(num_osds):
- (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.iterkeys()
- p = osd_remote.run(
- args = [ 'test', '-e', '{tdir}/err'.format(tdir=testdir) ],
- wait=True,
- check_status=False,
- )
- exit_status = p.exitstatus
-
- if exit_status == 0:
- log.info("osd %d has an error" % i)
- raise Exception("osd %d error" % i)
-
- log_path = '/var/log/ceph/osd.%d.log' % (i)
-
- p = osd_remote.run(
- args = [
- 'tail', '-1', log_path,
- run.Raw('|'),
- 'grep', '-q', 'end dump'
- ],
- wait=True,
- check_status=False,
- )
- exit_status = p.exitstatus
-
- if exit_status == 0:
- log.info("osd %d dumped core" % i)
- raise Exception("osd %d dumped core" % i)
-
- time.sleep(5)
+++ /dev/null
-"""
-Special case divergence test
-"""
-import logging
-import time
-
-from teuthology import misc as teuthology
-from util.rados import rados
-
-
-log = logging.getLogger(__name__)
-
-
-def task(ctx, config):
- """
- Test handling of divergent entries with prior_version
- prior to log_tail
-
- overrides:
- ceph:
- conf:
- osd:
- debug osd: 5
-
- Requires 3 osds on a single test node.
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'divergent_priors task only accepts a dict for configuration'
-
- manager = ctx.managers['ceph']
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('osd', 'set', 'noout')
- manager.raw_cluster_cmd('osd', 'set', 'noin')
- manager.raw_cluster_cmd('osd', 'set', 'nodown')
- manager.wait_for_clean()
-
- # something that is always there
- dummyfile = '/etc/fstab'
- dummyfile2 = '/etc/resolv.conf'
-
- # create 1 pg pool
- log.info('creating foo')
- manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
-
- osds = [0, 1, 2]
- for i in osds:
- manager.set_config(i, osd_min_pg_log_entries=10)
- manager.set_config(i, osd_max_pg_log_entries=10)
- manager.set_config(i, osd_pg_log_trim_min=5)
-
- # determine primary
- divergent = manager.get_pg_primary('foo', 0)
- log.info("primary and soon to be divergent is %d", divergent)
- non_divergent = list(osds)
- non_divergent.remove(divergent)
-
- log.info('writing initial objects')
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- # write 100 objects
- for i in range(100):
- rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
-
- manager.wait_for_clean()
-
- # blackhole non_divergent
- log.info("blackholing osds %s", str(non_divergent))
- for i in non_divergent:
- manager.set_config(i, objectstore_blackhole=1)
-
- DIVERGENT_WRITE = 5
- DIVERGENT_REMOVE = 5
- # Write some soon to be divergent
- log.info('writing divergent objects')
- for i in range(DIVERGENT_WRITE):
- rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
- dummyfile2], wait=False)
- # Remove some soon to be divergent
- log.info('remove divergent objects')
- for i in range(DIVERGENT_REMOVE):
- rados(ctx, mon, ['-p', 'foo', 'rm',
- 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
- time.sleep(10)
- mon.run(
- args=['killall', '-9', 'rados'],
- wait=True,
- check_status=False)
-
- # kill all the osds but leave divergent in
- log.info('killing all the osds')
- for i in osds:
- manager.kill_osd(i)
- for i in osds:
- manager.mark_down_osd(i)
- for i in non_divergent:
- manager.mark_out_osd(i)
-
- # bring up non-divergent
- log.info("bringing up non_divergent %s", str(non_divergent))
- for i in non_divergent:
- manager.revive_osd(i)
- for i in non_divergent:
- manager.mark_in_osd(i)
-
- # write 1 non-divergent object (ensure that old divergent one is divergent)
- objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
- log.info('writing non-divergent object ' + objname)
- rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
-
- manager.wait_for_recovery()
-
- # ensure no recovery of up osds first
- log.info('delay recovery')
- for i in non_divergent:
- manager.wait_run_admin_socket(
- 'osd', i, ['set_recovery_delay', '100000'])
-
- # bring in our divergent friend
- log.info("revive divergent %d", divergent)
- manager.raw_cluster_cmd('osd', 'set', 'noup')
- manager.revive_osd(divergent)
-
- log.info('delay recovery divergent')
- manager.wait_run_admin_socket(
- 'osd', divergent, ['set_recovery_delay', '100000'])
-
- manager.raw_cluster_cmd('osd', 'unset', 'noup')
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
-
- log.info('wait for peering')
- rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
-
- # At this point the divergent_priors should have been detected
-
- log.info("killing divergent %d", divergent)
- manager.kill_osd(divergent)
- log.info("reviving divergent %d", divergent)
- manager.revive_osd(divergent)
-
- time.sleep(20)
-
- log.info('allowing recovery')
- # Set osd_recovery_delay_start back to 0 and kick the queue
- for i in osds:
- manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
- 'kick_recovery_wq', ' 0')
-
- log.info('reading divergent objects')
- for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
- exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
- '/tmp/existing'])
- assert exit_status == 0
-
- log.info("success")
+++ /dev/null
-"""
-Special case divergence test with ceph-objectstore-tool export/remove/import
-"""
-import logging
-import time
-from cStringIO import StringIO
-
-from teuthology import misc as teuthology
-from util.rados import rados
-import os
-
-
-log = logging.getLogger(__name__)
-
-
-def task(ctx, config):
- """
- Test handling of divergent entries with prior_version
- prior to log_tail and a ceph-objectstore-tool export/import
-
- overrides:
- ceph:
- conf:
- osd:
- debug osd: 5
-
- Requires 3 osds on a single test node.
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'divergent_priors task only accepts a dict for configuration'
-
- manager = ctx.managers['ceph']
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('osd', 'set', 'noout')
- manager.raw_cluster_cmd('osd', 'set', 'noin')
- manager.raw_cluster_cmd('osd', 'set', 'nodown')
- manager.wait_for_clean()
-
- # something that is always there
- dummyfile = '/etc/fstab'
- dummyfile2 = '/etc/resolv.conf'
- testdir = teuthology.get_testdir(ctx)
-
- # create 1 pg pool
- log.info('creating foo')
- manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
-
- osds = [0, 1, 2]
- for i in osds:
- manager.set_config(i, osd_min_pg_log_entries=10)
- manager.set_config(i, osd_max_pg_log_entries=10)
- manager.set_config(i, osd_pg_log_trim_min=5)
-
- # determine primary
- divergent = manager.get_pg_primary('foo', 0)
- log.info("primary and soon to be divergent is %d", divergent)
- non_divergent = list(osds)
- non_divergent.remove(divergent)
-
- log.info('writing initial objects')
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- # write 100 objects
- for i in range(100):
- rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
-
- manager.wait_for_clean()
-
- # blackhole non_divergent
- log.info("blackholing osds %s", str(non_divergent))
- for i in non_divergent:
- manager.set_config(i, objectstore_blackhole=1)
-
- DIVERGENT_WRITE = 5
- DIVERGENT_REMOVE = 5
- # Write some soon to be divergent
- log.info('writing divergent objects')
- for i in range(DIVERGENT_WRITE):
- rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
- dummyfile2], wait=False)
- # Remove some soon to be divergent
- log.info('remove divergent objects')
- for i in range(DIVERGENT_REMOVE):
- rados(ctx, mon, ['-p', 'foo', 'rm',
- 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
- time.sleep(10)
- mon.run(
- args=['killall', '-9', 'rados'],
- wait=True,
- check_status=False)
-
- # kill all the osds but leave divergent in
- log.info('killing all the osds')
- for i in osds:
- manager.kill_osd(i)
- for i in osds:
- manager.mark_down_osd(i)
- for i in non_divergent:
- manager.mark_out_osd(i)
-
- # bring up non-divergent
- log.info("bringing up non_divergent %s", str(non_divergent))
- for i in non_divergent:
- manager.revive_osd(i)
- for i in non_divergent:
- manager.mark_in_osd(i)
-
- # write 1 non-divergent object (ensure that old divergent one is divergent)
- objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
- log.info('writing non-divergent object ' + objname)
- rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
-
- manager.wait_for_recovery()
-
- # ensure no recovery of up osds first
- log.info('delay recovery')
- for i in non_divergent:
- manager.wait_run_admin_socket(
- 'osd', i, ['set_recovery_delay', '100000'])
-
- # bring in our divergent friend
- log.info("revive divergent %d", divergent)
- manager.raw_cluster_cmd('osd', 'set', 'noup')
- manager.revive_osd(divergent)
-
- log.info('delay recovery divergent')
- manager.wait_run_admin_socket(
- 'osd', divergent, ['set_recovery_delay', '100000'])
-
- manager.raw_cluster_cmd('osd', 'unset', 'noup')
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
-
- log.info('wait for peering')
- rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
-
- # At this point the divergent_priors should have been detected
-
- log.info("killing divergent %d", divergent)
- manager.kill_osd(divergent)
-
- # Export a pg
- (exp_remote,) = ctx.\
- cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
- FSPATH = manager.get_filepath()
- JPATH = os.path.join(FSPATH, "journal")
- prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
- "--data-path {fpath} --journal-path {jpath} "
- "--log-file="
- "/var/log/ceph/objectstore_tool.$$.log ".
- format(fpath=FSPATH, jpath=JPATH))
- pid = os.getpid()
- expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
- cmd = ((prefix + "--op export --pgid 1.0 --file {file}").
- format(id=divergent, file=expfile))
- proc = exp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- assert proc.exitstatus == 0
-
- cmd = ((prefix + "--op remove --pgid 1.0").
- format(id=divergent, file=expfile))
- proc = exp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- assert proc.exitstatus == 0
-
- cmd = ((prefix + "--op import --file {file}").
- format(id=divergent, file=expfile))
- proc = exp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- assert proc.exitstatus == 0
-
- log.info("reviving divergent %d", divergent)
- manager.revive_osd(divergent)
- manager.wait_run_admin_socket('osd', divergent, ['dump_ops_in_flight'])
- time.sleep(20)
-
- log.info('allowing recovery')
- # Set osd_recovery_delay_start back to 0 and kick the queue
- for i in osds:
- manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
- 'kick_recovery_wq', ' 0')
-
- log.info('reading divergent objects')
- for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
- exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
- '/tmp/existing'])
- assert exit_status == 0
-
- cmd = 'rm {file}'.format(file=expfile)
- exp_remote.run(args=cmd, wait=True)
- log.info("success")
+++ /dev/null
-"""
-Dump_stuck command
-"""
-import logging
-import re
-import time
-
-import ceph_manager
-from teuthology import misc as teuthology
-
-
-log = logging.getLogger(__name__)
-
-def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
- """
- Do checks. Make sure get_stuck_pgs returns the right amount of information,
- then extract health information from raw_cluster_cmd and compare the results
- with the values passed in. This passes if all asserts pass.
-
- :param manager: Ceph manager
- :param num_inactive: number of inactive PGs that are stuck
- :param num_unclean: number of unclean PGs that are stuck
- :param num_stale: number of stale PGs that are stuck
- :param timeout: timeout value for get_stuck_pgs calls
- """
- inactive = manager.get_stuck_pgs('inactive', timeout)
- unclean = manager.get_stuck_pgs('unclean', timeout)
- stale = manager.get_stuck_pgs('stale', timeout)
- log.info('hi mom')
- log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
- len(inactive), num_inactive,
- len(unclean), num_unclean,
- len(stale), num_stale)
- assert len(inactive) == num_inactive
- assert len(unclean) == num_unclean
- assert len(stale) == num_stale
-
- # check health output as well
- health = manager.raw_cluster_cmd('health')
- log.debug('ceph health is: %s', health)
- if num_inactive > 0:
- m = re.search('(\d+) pgs stuck inactive', health)
- assert int(m.group(1)) == num_inactive
- if num_unclean > 0:
-        m = re.search(r'(\d+) pgs stuck unclean', health)
- assert int(m.group(1)) == num_unclean
- if num_stale > 0:
-        m = re.search(r'(\d+) pgs stuck stale', health)
- assert int(m.group(1)) == num_stale
-
-def task(ctx, config):
- """
- Test the dump_stuck command.
-
- :param ctx: Context
- :param config: Configuration
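-
-    Example (a minimal sketch; this task takes no configuration and the
-    assertions below require exactly two OSDs)::
-
-        tasks:
-        - ceph:
-        - dump_stuck: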
- """
- assert config is None, \
- 'dump_stuck requires no configuration'
- assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
- 'dump_stuck requires exactly 2 osds'
-
- timeout = 60
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_for_clean(timeout)
-
- manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
-# '--mon-osd-report-timeout 90',
- '--mon-pg-stuck-threshold 10')
-
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=0,
- num_stale=0,
- )
- num_pgs = manager.get_num_pgs()
-
- manager.mark_out_osd(0)
- time.sleep(timeout)
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_for_recovery(timeout)
-
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=num_pgs,
- num_stale=0,
- )
-
- manager.mark_in_osd(0)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_for_clean(timeout)
-
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=0,
- num_stale=0,
- )
-
- log.info('stopping first osd')
- manager.kill_osd(0)
- manager.mark_down_osd(0)
-
- log.info('waiting for all to be unclean')
- starttime = time.time()
- done = False
- while not done:
- try:
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=num_pgs,
- num_stale=0,
- )
- done = True
- except AssertionError:
- # wait up to 15 minutes to become stale
- if time.time() - starttime > 900:
- raise
-
-
- log.info('stopping second osd')
- manager.kill_osd(1)
- manager.mark_down_osd(1)
-
- log.info('waiting for all to be stale')
- starttime = time.time()
- done = False
- while not done:
- try:
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=num_pgs,
- num_stale=num_pgs,
- )
- done = True
- except AssertionError:
- # wait up to 15 minutes to become stale
- if time.time() - starttime > 900:
- raise
-
- log.info('reviving')
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
- manager.revive_osd(id_)
- manager.mark_in_osd(id_)
- while True:
- try:
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- break
- except Exception:
- log.exception('osds must not be started yet, waiting...')
- time.sleep(1)
- manager.wait_for_clean(timeout)
-
- check_stuck(
- manager,
- num_inactive=0,
- num_unclean=0,
- num_stale=0,
- )
+++ /dev/null
-"""
-Lost_unfound
-"""
-from teuthology.orchestra import run
-import logging
-import ceph_manager
-from teuthology import misc as teuthology
-from util.rados import rados
-import time
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Test handling of lost objects on an ec pool.
-
-    A pretty rigid cluster is brought up and tested by this task.
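-
-    Example (a sketch; the task key is assumed from this module, and the
-    profile below simply mirrors the defaults used in the code)::
-
-        tasks:
-        - ceph:
-        - ec_lost_unfound:
-            parallel_bench: true
-            erasure_code_profile:
-              k: 2
-              m: 2
-              ruleset-failure-domain: osd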
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'lost_unfound task only accepts a dict for configuration'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- manager.wait_for_clean()
-
- profile = config.get('erasure_code_profile', {
- 'k': '2',
- 'm': '2',
- 'ruleset-failure-domain': 'osd'
- })
- profile_name = profile.get('name', 'lost_unfound')
- manager.create_erasure_code_profile(profile_name, profile)
- pool = manager.create_pool_with_unique_name(
- erasure_code_profile_name=profile_name,
- min_size=2)
-
- # something that is always there, readable and never empty
- dummyfile = '/etc/group'
-
- # kludge to make sure they get a map
- rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_for_recovery()
-
- # create old objects
- for f in range(1, 10):
- rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f])
-
- # delay recovery, and make the pg log very long (to prevent backfill)
- manager.raw_cluster_cmd(
- 'tell', 'osd.1',
- 'injectargs',
- '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
- )
-
- manager.kill_osd(0)
- manager.mark_down_osd(0)
- manager.kill_osd(3)
- manager.mark_down_osd(3)
-
- for f in range(1, 10):
- rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
-
- # take out osd.1 and a necessary shard of those objects.
- manager.kill_osd(1)
- manager.mark_down_osd(1)
- manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
- manager.revive_osd(0)
- manager.wait_till_osd_is_up(0)
- manager.revive_osd(3)
- manager.wait_till_osd_is_up(3)
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
- manager.wait_till_active()
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
-
- # verify that there are unfound objects
- unfound = manager.get_num_unfound_objects()
- log.info("there are %d unfound objects" % unfound)
- assert unfound
-
- testdir = teuthology.get_testdir(ctx)
- procs = []
- if config.get('parallel_bench', True):
- procs.append(mon.run(
- args=[
- "/bin/sh", "-c",
- " ".join(['adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage',
- 'rados',
- '--no-log-to-stderr',
- '--name', 'client.admin',
- '-b', str(4<<10),
- '-p' , pool,
- '-t', '20',
- 'bench', '240', 'write',
- ]).format(tdir=testdir),
- ],
- logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
- stdin=run.PIPE,
- wait=False
- ))
- time.sleep(10)
-
- # mark stuff lost
- pgs = manager.get_pg_stats()
- for pg in pgs:
- if pg['stat_sum']['num_objects_unfound'] > 0:
- # verify that i can list them direct from the osd
- log.info('listing missing/lost in %s state %s', pg['pgid'],
-                     pg['state'])
- m = manager.list_pg_missing(pg['pgid'])
- log.info('%s' % m)
- assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
-
- log.info("reverting unfound in %s", pg['pgid'])
- manager.raw_cluster_cmd('pg', pg['pgid'],
- 'mark_unfound_lost', 'delete')
- else:
- log.info("no unfound in %s", pg['pgid'])
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
- manager.wait_for_recovery()
-
- if not config.get('parallel_bench', True):
- time.sleep(20)
-
- # verify result
- for f in range(1, 10):
- err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-'])
- assert err
- err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-'])
- assert err
- err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-'])
- assert err
-
- # see if osd.1 can cope
- manager.revive_osd(1)
- manager.wait_till_osd_is_up(1)
- manager.wait_for_clean()
- run.wait(procs)
+++ /dev/null
-"""
-Filestore/filejournal handler
-"""
-import logging
-from teuthology.orchestra import run
-import random
-
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Test filestore/filejournal handling of non-idempotent events.
-
-    Currently this is a kludge; we require that the ceph task precedes us just
- so that we get the tarball installed to run the test binary.
-
- :param ctx: Context
- :param config: Configuration
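-
-    Example (a sketch; the task key is assumed from this module, and the
-    client list is optional -- it defaults to all clients)::
-
-        tasks:
-        - ceph:
-        - filestore_idempotent: [client.0]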
- """
- assert config is None or isinstance(config, list) \
- or isinstance(config, dict), \
- "task only supports a list or dictionary for configuration"
- all_clients = ['client.{id}'.format(id=id_)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- if config is None:
- config = all_clients
- if isinstance(config, list):
- config = dict.fromkeys(config)
- clients = config.keys()
-
- # just use the first client...
-    client = clients[0]
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
-
- testdir = teuthology.get_testdir(ctx)
-
- dir = '%s/ceph.data/test.%s' % (testdir, client)
-
- seed = str(int(random.uniform(1,100)))
-
- try:
- log.info('creating a working dir')
- remote.run(args=['mkdir', dir])
- remote.run(
- args=[
- 'cd', dir,
- run.Raw('&&'),
- 'wget','-q', '-Orun_seed_to.sh',
- 'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to.sh;hb=HEAD',
- run.Raw('&&'),
- 'wget','-q', '-Orun_seed_to_range.sh',
- 'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to_range.sh;hb=HEAD',
- run.Raw('&&'),
- 'chmod', '+x', 'run_seed_to.sh', 'run_seed_to_range.sh',
- ]);
-
- log.info('running a series of tests')
- proc = remote.run(
- args=[
- 'cd', dir,
- run.Raw('&&'),
- './run_seed_to_range.sh', seed, '50', '300',
- ],
- wait=False,
- check_status=False)
- result = proc.wait()
-
- if result != 0:
- remote.run(
- args=[
- 'cp', '-a', dir, '{tdir}/archive/idempotent_failure'.format(tdir=testdir),
- ])
- raise Exception("./run_seed_to_range.sh errored out")
-
- finally:
- remote.run(args=[
- 'rm', '-rf', '--', dir
- ])
-
+++ /dev/null
-"""
-Mount/unmount a ``kernel`` client.
-"""
-import contextlib
-import logging
-
-from teuthology.misc import deep_merge
-from teuthology import misc
-from cephfs.kernel_mount import KernelMount
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Mount/unmount a ``kernel`` client.
-
- The config is optional and defaults to mounting on all clients. If
- a config is given, it is expected to be a list of clients to do
- this operation on. This lets you e.g. set up one client with
- ``ceph-fuse`` and another with ``kclient``.
-
- Example that mounts all clients::
-
- tasks:
- - ceph:
- - kclient:
- - interactive:
-
-    Example that uses both ``kclient`` and ``ceph-fuse``::
-
- tasks:
- - ceph:
- - ceph-fuse: [client.0]
- - kclient: [client.1]
- - interactive:
-
-
-    Pass a dictionary instead of lists to specify per-client config::
-
- tasks:
-        - kclient:
- client.0:
- debug: true
-
- :param ctx: Context
- :param config: Configuration
- """
- log.info('Mounting kernel clients...')
- assert config is None or isinstance(config, list) or isinstance(config, dict), \
- "task kclient got invalid config"
-
- if config is None:
- config = ['client.{id}'.format(id=id_)
- for id_ in misc.all_roles_of_type(ctx.cluster, 'client')]
-
- if isinstance(config, list):
- client_roles = config
- config = dict([r, dict()] for r in client_roles)
- elif isinstance(config, dict):
- client_roles = filter(lambda x: 'client.' in x, config.keys())
- else:
- raise ValueError("Invalid config object: {0} ({1})".format(config, config.__class__))
-
- # config has been converted to a dict by this point
- overrides = ctx.config.get('overrides', {})
- deep_merge(config, overrides.get('kclient', {}))
-
- clients = list(misc.get_clients(ctx=ctx, roles=client_roles))
-
- test_dir = misc.get_testdir(ctx)
-
- # Assemble mon addresses
- remotes_and_roles = ctx.cluster.remotes.items()
- roles = [roles for (remote_, roles) in remotes_and_roles]
- ips = [remote_.ssh.get_transport().getpeername()[0]
- for (remote_, _) in remotes_and_roles]
- mons = misc.get_mons(roles, ips).values()
-
- mounts = {}
- for id_, remote in clients:
- client_config = config.get("client.%s" % id_)
- if client_config is None:
- client_config = {}
-
- if config.get("disabled", False) or not client_config.get('mounted', True):
- continue
-
- kernel_mount = KernelMount(
- mons,
- test_dir,
- id_,
- remote,
- ctx.teuthology_config.get('ipmi_user', None),
- ctx.teuthology_config.get('ipmi_password', None),
- ctx.teuthology_config.get('ipmi_domain', None)
- )
-
- mounts[id_] = kernel_mount
-
- if client_config.get('debug', False):
- remote.run(args=["sudo", "bash", "-c", "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control"])
- remote.run(args=["sudo", "bash", "-c", "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control"])
-
- kernel_mount.mount()
-
- ctx.mounts = mounts
- try:
- yield mounts
- finally:
- log.info('Unmounting kernel clients...')
- for mount in mounts.values():
- if mount.is_mounted():
- mount.umount()
+++ /dev/null
-"""
-locktests
-"""
-import logging
-
-from teuthology.orchestra import run
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Run locktests, from the xfstests suite, on the given
- clients. Whether the clients are ceph-fuse or kernel does not
- matter, and the two clients can refer to the same mount.
-
- The config is a list of two clients to run the locktest on. The
- first client will be the host.
-
- For example:
- tasks:
- - ceph:
- - ceph-fuse: [client.0, client.1]
- - locktest:
- [client.0, client.1]
-
- This task does not yield; there would be little point.
-
- :param ctx: Context
- :param config: Configuration
- """
-
- assert isinstance(config, list)
- log.info('fetching and building locktests...')
- (host,) = ctx.cluster.only(config[0]).remotes
- (client,) = ctx.cluster.only(config[1]).remotes
- ( _, _, host_id) = config[0].partition('.')
- ( _, _, client_id) = config[1].partition('.')
- testdir = teuthology.get_testdir(ctx)
- hostmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=host_id)
- clientmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=client_id)
-
- try:
- for client_name in config:
- log.info('building on {client_}'.format(client_=client_name))
- ctx.cluster.only(client_name).run(
- args=[
- # explicitly does not support multiple autotest tasks
- # in a single run; the result archival would conflict
- 'mkdir', '{tdir}/archive/locktest'.format(tdir=testdir),
- run.Raw('&&'),
- 'mkdir', '{tdir}/locktest'.format(tdir=testdir),
- run.Raw('&&'),
- 'wget',
- '-nv',
- 'https://raw.github.com/gregsfortytwo/xfstests-ceph/master/src/locktest.c',
- '-O', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
- run.Raw('&&'),
- 'g++', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
- '-o', '{tdir}/locktest/locktest'.format(tdir=testdir)
- ],
- logger=log.getChild('locktest_client.{id}'.format(id=client_name)),
- )
-
- log.info('built locktest on each client')
-
- host.run(args=['sudo', 'touch',
- '{mnt}/locktestfile'.format(mnt=hostmnt),
- run.Raw('&&'),
- 'sudo', 'chown', 'ubuntu.ubuntu',
- '{mnt}/locktestfile'.format(mnt=hostmnt)
- ]
- )
-
- log.info('starting on host')
- hostproc = host.run(
- args=[
- '{tdir}/locktest/locktest'.format(tdir=testdir),
- '-p', '6788',
- '-d',
- '{mnt}/locktestfile'.format(mnt=hostmnt),
- ],
- wait=False,
- logger=log.getChild('locktest.host'),
- )
- log.info('starting on client')
- (_,_,hostaddr) = host.name.partition('@')
- clientproc = client.run(
- args=[
- '{tdir}/locktest/locktest'.format(tdir=testdir),
- '-p', '6788',
- '-d',
- '-h', hostaddr,
- '{mnt}/locktestfile'.format(mnt=clientmnt),
- ],
- logger=log.getChild('locktest.client'),
- wait=False
- )
-
- hostresult = hostproc.wait()
- clientresult = clientproc.wait()
- if (hostresult != 0) or (clientresult != 0):
- raise Exception("Did not pass locking test!")
- log.info('finished locktest executable with results {r} and {s}'. \
- format(r=hostresult, s=clientresult))
-
- finally:
- log.info('cleaning up host dir')
- host.run(
- args=[
- 'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir),
- run.Raw('&&'),
- 'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
- run.Raw('&&'),
- 'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir),
- run.Raw('&&'),
-                'rmdir', '{tdir}/locktest'.format(tdir=testdir)
- ],
- logger=log.getChild('.{id}'.format(id=config[0])),
- )
- log.info('cleaning up client dir')
- client.run(
- args=[
- 'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir),
- run.Raw('&&'),
- 'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
- run.Raw('&&'),
- 'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir),
- run.Raw('&&'),
- 'rmdir', '{tdir}/locktest'.format(tdir=testdir)
- ],
- logger=log.getChild('.{id}'.format(\
- id=config[1])),
- )
+++ /dev/null
-/var/log/ceph/*{daemon_type}*.log {{
- rotate 100
- size {max_size}
- compress
- sharedscripts
- postrotate
- killall {daemon_type} -1 || true
- endscript
- missingok
- notifempty
- su root root
-}}
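-
-# Example rendering (illustrative): with daemon_type=ceph-osd and
-# max_size=10G, the stanza above rotates /var/log/ceph/*ceph-osd*.log
-# once a log grows past 10G, keeping up to 100 compressed rotations.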
-
+++ /dev/null
-"""
-Lost_unfound
-"""
-import logging
-import time
-import ceph_manager
-from teuthology import misc as teuthology
-from teuthology.orchestra import run
-from util.rados import rados
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Test handling of lost objects.
-
-    A pretty rigid cluster is brought up and tested by this task.
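-
-    Example (a sketch; parallel_bench is the only option this task reads)::
-
-        tasks:
-        - ceph:
-        - lost_unfound:
-            parallel_bench: true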
- """
- POOL = 'unfound_pool'
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'lost_unfound task only accepts a dict for configuration'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
-
- manager.wait_for_clean()
-
- manager.create_pool(POOL)
-
- # something that is always there
- dummyfile = '/etc/fstab'
-
- # take an osd out until the very end
- manager.kill_osd(2)
- manager.mark_down_osd(2)
- manager.mark_out_osd(2)
-
- # kludge to make sure they get a map
- rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_for_recovery()
-
- # create old objects
- for f in range(1, 10):
- rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])
-
- # delay recovery, and make the pg log very long (to prevent backfill)
- manager.raw_cluster_cmd(
- 'tell', 'osd.1',
- 'injectargs',
- '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
- )
-
- manager.kill_osd(0)
- manager.mark_down_osd(0)
-
- for f in range(1, 10):
- rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
-
- # bring osd.0 back up, let it peer, but don't replicate the new
- # objects...
- log.info(ctx.daemons.get_daemon('osd', 0).command_args)
- ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
- '--osd-recovery-delay-start', '1000'
- ])
- manager.revive_osd(0)
- manager.mark_in_osd(0)
- manager.wait_till_osd_is_up(0)
-
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.wait_till_active()
-
- # take out osd.1 and the only copy of those objects.
- manager.kill_osd(1)
- manager.mark_down_osd(1)
- manager.mark_out_osd(1)
- manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
-
-    # bring up osd.2 so that things would otherwise, in theory, recover fully
- manager.revive_osd(2)
- manager.mark_in_osd(2)
- manager.wait_till_osd_is_up(2)
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_till_active()
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-
- # verify that there are unfound objects
- unfound = manager.get_num_unfound_objects()
- log.info("there are %d unfound objects" % unfound)
- assert unfound
-
- testdir = teuthology.get_testdir(ctx)
- procs = []
- if config.get('parallel_bench', True):
- procs.append(mon.run(
- args=[
- "/bin/sh", "-c",
- " ".join(['adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage',
- 'rados',
- '--no-log-to-stderr',
- '--name', 'client.admin',
- '-b', str(4<<10),
- '-p' , POOL,
- '-t', '20',
- 'bench', '240', 'write',
- ]).format(tdir=testdir),
- ],
- logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
- stdin=run.PIPE,
- wait=False
- ))
- time.sleep(10)
-
- # mark stuff lost
- pgs = manager.get_pg_stats()
- for pg in pgs:
- if pg['stat_sum']['num_objects_unfound'] > 0:
- primary = 'osd.%d' % pg['acting'][0]
-
- # verify that i can list them direct from the osd
- log.info('listing missing/lost in %s state %s', pg['pgid'],
-                     pg['state'])
- m = manager.list_pg_missing(pg['pgid'])
- #log.info('%s' % m)
- assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
- num_unfound=0
- for o in m['objects']:
- if len(o['locations']) == 0:
- num_unfound += 1
- assert m['num_unfound'] == num_unfound
-
- log.info("reverting unfound in %s on %s", pg['pgid'], primary)
- manager.raw_cluster_cmd('pg', pg['pgid'],
- 'mark_unfound_lost', 'revert')
- else:
- log.info("no unfound in %s", pg['pgid'])
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_recovery()
-
- # verify result
- for f in range(1, 10):
- err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
- assert err
- err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
- assert err
- err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
- assert not err
-
- # see if osd.1 can cope
- manager.revive_osd(1)
- manager.mark_in_osd(1)
- manager.wait_till_osd_is_up(1)
- manager.wait_for_clean()
- run.wait(procs)
+++ /dev/null
-"""
-Force pg creation on all osds
-"""
-from teuthology import misc as teuthology
-from teuthology.orchestra import run
-import logging
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Create the specified number of pools and write 16 objects to them (thereby forcing
- the PG creation on each OSD). This task creates pools from all the clients,
- in parallel. It is easy to add other daemon types which have the appropriate
- permissions, but I don't think anything else does.
- The config is just the number of pools to create. I recommend setting
- "mon create pg interval" to a very low value in your ceph config to speed
- this up.
-
- You probably want to do this to look at memory consumption, and
- maybe to test how performance changes with the number of PGs. For example:
-
- tasks:
- - ceph:
- config:
- mon:
- mon create pg interval: 1
- - manypools: 3000
- - radosbench:
- clients: [client.0]
- time: 360
- """
-
- log.info('creating {n} pools'.format(n=config))
-
- poolnum = int(config)
- creator_remotes = []
- client_roles = teuthology.all_roles_of_type(ctx.cluster, 'client')
- log.info('got client_roles={client_roles_}'.format(client_roles_=client_roles))
- for role in client_roles:
- log.info('role={role_}'.format(role_=role))
- (creator_remote, ) = ctx.cluster.only('client.{id}'.format(id=role)).remotes.iterkeys()
- creator_remotes.append((creator_remote, 'client.{id}'.format(id=role)))
-
- remaining_pools = poolnum
- poolprocs=dict()
-    while remaining_pools > 0:
- log.info('{n} pools remaining to create'.format(n=remaining_pools))
- for remote, role_ in creator_remotes:
- poolnum = remaining_pools
- remaining_pools -= 1
- if remaining_pools < 0:
- continue
- log.info('creating pool{num} on {role}'.format(num=poolnum, role=role_))
- proc = remote.run(
- args=[
- 'rados',
- '--name', role_,
- 'mkpool', 'pool{num}'.format(num=poolnum), '-1',
- run.Raw('&&'),
- 'rados',
- '--name', role_,
- '--pool', 'pool{num}'.format(num=poolnum),
- 'bench', '0', 'write', '-t', '16', '--block-size', '1'
- ],
- wait = False
- )
- log.info('waiting for pool and object creates')
- poolprocs[remote] = proc
-
- run.wait(poolprocs.itervalues())
-
-    log.info('created all {n} pools and wrote 16 objects to each'.format(n=config))
+++ /dev/null
-
-import logging
-import contextlib
-import time
-import ceph_manager
-from teuthology import misc
-from teuthology.orchestra.run import CommandFailedError, Raw
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Go through filesystem creation with a synthetic failure in an MDS
- in its 'up:creating' state, to exercise the retry behaviour.
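-
-    Example (a sketch; the task key is assumed from this module, which
-    requires exactly one MDS and takes no options)::
-
-        tasks:
-        - ceph:
-        - mds_creation_failure:
-        - ceph-fuse: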
- """
- # Grab handles to the teuthology objects of interest
- mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
- if len(mdslist) != 1:
- # Require exactly one MDS, the code path for creation failure when
- # a standby is available is different
- raise RuntimeError("This task requires exactly one MDS")
-
- mds_id = mdslist[0]
- (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.iterkeys()
- manager = ceph_manager.CephManager(
- mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
- )
-
- # Stop MDS
- manager.raw_cluster_cmd('mds', 'set', "max_mds", "0")
- mds = ctx.daemons.get_daemon('mds', mds_id)
- mds.stop()
- manager.raw_cluster_cmd('mds', 'fail', mds_id)
-
- # Reset the filesystem so that next start will go into CREATING
- manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
- manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
-
- # Start the MDS with mds_kill_create_at set, it will crash during creation
- mds.restart_with_args(["--mds_kill_create_at=1"])
- try:
- mds.wait_for_exit()
- except CommandFailedError as e:
- if e.exitstatus == 1:
- log.info("MDS creation killed as expected")
- else:
- log.error("Unexpected status code %s" % e.exitstatus)
- raise
-
- # Since I have intentionally caused a crash, I will clean up the resulting core
- # file to avoid task.internal.coredump seeing it as a failure.
- log.info("Removing core file from synthetic MDS failure")
- mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])
-
- # It should have left the MDS map state still in CREATING
- status = manager.get_mds_status(mds_id)
- assert status['state'] == 'up:creating'
-
- # Start the MDS again without the kill flag set, it should proceed with creation successfully
- mds.restart()
-
- # Wait for state ACTIVE
- t = 0
- create_timeout = 120
- while True:
- status = manager.get_mds_status(mds_id)
- if status['state'] == 'up:active':
- log.info("MDS creation completed successfully")
- break
- elif status['state'] == 'up:creating':
- log.info("MDS still in creating state")
- if t > create_timeout:
- log.error("Creating did not complete within %ss" % create_timeout)
- raise RuntimeError("Creating did not complete within %ss" % create_timeout)
- t += 1
- time.sleep(1)
- else:
- log.error("Unexpected MDS state: %s" % status['state'])
- assert(status['state'] in ['up:active', 'up:creating'])
-
- # The system should be back up in a happy healthy state, go ahead and run any further tasks
- # inside this context.
- yield
+++ /dev/null
-"""
-Thrash mds by simulating failures
-"""
-import logging
-import contextlib
-import ceph_manager
-import random
-import time
-
-from gevent.greenlet import Greenlet
-from gevent.event import Event
-from teuthology import misc as teuthology
-
-from tasks.cephfs.filesystem import MDSCluster, Filesystem
-
-log = logging.getLogger(__name__)
-
-
-class MDSThrasher(Greenlet):
- """
- MDSThrasher::
-
- The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).
-
-    The config is optional. Many of the config parameters are a maximum value
-    to use when selecting a random value from a range. To always use the maximum
-    value, set randomize to false. The config is a dict containing some or all of:
-
- max_thrash: [default: 1] the maximum number of active MDSs per FS that will be thrashed at
- any given time.
-
- max_thrash_delay: [default: 30] maximum number of seconds to delay before
- thrashing again.
-
- max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
- the replay state before thrashing.
-
- max_revive_delay: [default: 10] maximum number of seconds to delay before
- bringing back a thrashed MDS.
-
-    randomize: [default: true] enables randomization of the delays; when false, the maximum values are always used
-
- seed: [no default] seed the random number generator
-
- thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
- during replay. Value should be between 0.0 and 1.0.
-
- thrash_max_mds: [default: 0.25] likelihood that the max_mds of the mds
- cluster will be modified to a value [1, current) or (current, starting
- max_mds]. When reduced, randomly selected MDSs other than rank 0 will be
- deactivated to reach the new max_mds. Value should be between 0.0 and 1.0.
-
- thrash_weights: allows specific MDSs to be thrashed more/less frequently.
- This option overrides anything specified by max_thrash. This option is a
- dict containing mds.x: weight pairs. For example, [mds.a: 0.7, mds.b:
- 0.3, mds.c: 0.0]. Each weight is a value from 0.0 to 1.0. Any MDSs not
- specified will be automatically given a weight of 0.0 (not thrashed).
-      For a given MDS, by default the thrasher delays for up to
-      max_thrash_delay, thrashes, waits for the MDS to recover, and iterates.
- If a non-zero weight is specified for an MDS, for each iteration the
- thrasher chooses whether to thrash during that iteration based on a
- random value [0-1] not exceeding the weight of that MDS.
-
- Examples::
-
-
- The following example sets the likelihood that mds.a will be thrashed
- to 80%, mds.b to 20%, and other MDSs will not be thrashed. It also sets the
- likelihood that an MDS will be thrashed in replay to 40%.
- Thrash weights do not have to sum to 1.
-
- tasks:
- - ceph:
- - mds_thrash:
- thrash_weights:
- - mds.a: 0.8
- - mds.b: 0.2
- thrash_in_replay: 0.4
- - ceph-fuse:
- - workunit:
- clients:
- all: [suites/fsx.sh]
-
- The following example disables randomization, and uses the max delay values:
-
- tasks:
- - ceph:
- - mds_thrash:
- max_thrash_delay: 10
- max_revive_delay: 1
- max_replay_thrash_delay: 4
-
- """
-
- def __init__(self, ctx, manager, config, logger, fs, max_mds):
- super(MDSThrasher, self).__init__()
-
- self.ctx = ctx
- self.manager = manager
- assert self.manager.is_clean()
- self.config = config
- self.logger = logger
- self.fs = fs
- self.max_mds = max_mds
-
- self.stopping = Event()
-
- self.randomize = bool(self.config.get('randomize', True))
- self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.25))
- self.max_thrash = int(self.config.get('max_thrash', 1))
- self.max_thrash_delay = float(self.config.get('thrash_delay', 120.0))
-        self.thrash_in_replay = float(self.config.get('thrash_in_replay', 0.0))
- assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
- v=self.thrash_in_replay)
- self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))
- self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))
-
- def _run(self):
- try:
- self.do_thrash()
- except:
- # Log exceptions here so we get the full backtrace (it's lost
- # by the time someone does a .get() on this greenlet)
- self.logger.exception("Exception in do_thrash:")
- raise
-
- def log(self, x):
- """Write data to logger assigned to this MDThrasher"""
- self.logger.info(x)
-
- def stop(self):
- self.stopping.set()
-
- def kill_mds(self, mds):
- if self.config.get('powercycle'):
- (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
- remotes.iterkeys())
- self.log('kill_mds on mds.{m} doing powercycle of {s}'.
- format(m=mds, s=remote.name))
- self._assert_ipmi(remote)
- remote.console.power_off()
- else:
- self.ctx.daemons.get_daemon('mds', mds).stop()
-
- @staticmethod
- def _assert_ipmi(remote):
- assert remote.console.has_ipmi_credentials, (
- "powercycling requested but RemoteConsole is not "
- "initialized. Check ipmi config.")
-
- def revive_mds(self, mds, standby_for_rank=None):
- """
-        Revive mds -- do an ipmi powercycle (if indicated by the config)
-        and then restart (using --hot-standby if specified).
- """
- if self.config.get('powercycle'):
- (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
- remotes.iterkeys())
- self.log('revive_mds on mds.{m} doing powercycle of {s}'.
- format(m=mds, s=remote.name))
- self._assert_ipmi(remote)
- remote.console.power_on()
- self.manager.make_admin_daemon_dir(self.ctx, remote)
- args = []
- if standby_for_rank:
- args.extend(['--hot-standby', standby_for_rank])
- self.ctx.daemons.get_daemon('mds', mds).restart(*args)
-
- def wait_for_stable(self, rank = None, gid = None):
- self.log('waiting for mds cluster to stabilize...')
- status = self.fs.status()
- itercount = 0
- while True:
- max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
- if rank is not None:
- try:
- info = status.get_rank(self.fs.id, rank)
- if info['gid'] != gid:
- self.log('mds.{name} has gained rank={rank}, replacing gid={gid}'.format(name = info['name'], rank = rank, gid = gid))
- return status, info['name']
- except:
- pass # no rank present
- else:
- ranks = filter(lambda info: "up:active" == info['state'] and "laggy_since" not in info, list(status.get_ranks(self.fs.id)))
- count = len(ranks)
- if count >= max_mds:
- self.log('mds cluster has {count} alive and active, now stable!'.format(count = count))
- return status, None
- itercount = itercount + 1
- if itercount > 10:
- self.log('mds map: {status}'.format(status=self.fs.status()))
- time.sleep(2)
- status = self.fs.status()
-
- def do_thrash(self):
- """
- Perform the random thrashing action
- """
-
- self.log('starting mds_do_thrash for fs {fs}'.format(fs = self.fs.name))
- stats = {
- "max_mds": 0,
- "deactivate": 0,
- "kill": 0,
- }
-
- while not self.stopping.is_set():
- delay = self.max_thrash_delay
- if self.randomize:
-                delay = random.uniform(0.0, self.max_thrash_delay)
-
- if delay > 0.0:
- self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
- self.stopping.wait(delay)
- if self.stopping.is_set():
- continue
-
- status = self.fs.status()
-
-            if random.random() <= self.thrash_max_mds:
- max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
- options = range(1, max_mds)+range(max_mds+1, self.max_mds+1)
- if len(options) > 0:
- sample = random.sample(options, 1)
- new_max_mds = sample[0]
- self.log('thrashing max_mds: %d -> %d' % (max_mds, new_max_mds))
- self.fs.set_max_mds(new_max_mds)
- stats['max_mds'] += 1
-
- # Now randomly deactivate mds if we shrank
- for rank in random.sample(range(1, max_mds), max(0, max_mds-new_max_mds)):
- self.fs.deactivate(rank)
- stats['deactivate'] += 1
-
- status = self.wait_for_stable()[0]
-
- count = 0
- for info in status.get_ranks(self.fs.id):
- name = info['name']
- label = 'mds.' + name
- rank = info['rank']
- gid = info['gid']
-
- # if thrash_weights isn't specified and we've reached max_thrash,
- # we're done
- count = count + 1
- if 'thrash_weights' not in self.config and count > self.max_thrash:
- break
-
- weight = 1.0
- if 'thrash_weights' in self.config:
-                    weight = float(self.config['thrash_weights'].get(label, 0.0))
-                skip = random.random()
- if weight <= skip:
- self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip, weight=weight))
- continue
-
- self.log('kill {label} (rank={rank})'.format(label=label, rank=rank))
- self.kill_mds(name)
- stats['kill'] += 1
-
- # wait for mon to report killed mds as crashed
- last_laggy_since = None
- itercount = 0
- while True:
- status = self.fs.status()
- info = status.get_mds(name)
- if not info:
- break
- if 'laggy_since' in info:
- last_laggy_since = info['laggy_since']
- break
- if any([(f == name) for f in status.get_fsmap(self.fs.id)['mdsmap']['failed']]):
- break
- self.log(
- 'waiting till mds map indicates {label} is laggy/crashed, in failed state, or {label} is removed from mdsmap'.format(
- label=label))
- itercount = itercount + 1
- if itercount > 10:
- self.log('mds map: {status}'.format(status=status))
- time.sleep(2)
-
- if last_laggy_since:
- self.log(
- '{label} reported laggy/crashed since: {since}'.format(label=label, since=last_laggy_since))
- else:
- self.log('{label} down, removed from mdsmap'.format(label=label, since=last_laggy_since))
-
- # wait for a standby mds to takeover and become active
- status, takeover_mds = self.wait_for_stable(rank, gid)
- self.log('New active mds is mds.{_id}'.format(_id=takeover_mds))
-
- # wait for a while before restarting old active to become new
- # standby
- delay = self.max_revive_delay
- if self.randomize:
-                    delay = random.uniform(0.0, self.max_revive_delay)
-
- self.log('waiting for {delay} secs before reviving {label}'.format(
- delay=delay, label=label))
- time.sleep(delay)
-
- self.log('reviving {label}'.format(label=label))
- self.revive_mds(name)
-
- while True:
- status = self.fs.status()
- info = status.get_mds(name)
- if info and info['state'] in ('up:standby', 'up:standby-replay'):
- self.log('{label} reported in {state} state'.format(label=label, state=info['state']))
- break
- self.log(
- 'waiting till mds map indicates {label} is in standby or standby-replay'.format(label=label))
- time.sleep(2)
-
- for stat in stats:
- self.log("stat['{key}'] = {value}".format(key = stat, value = stats[stat]))
-
- # don't do replay thrashing right now
-# for info in status.get_replays(self.fs.id):
-# # this might race with replay -> active transition...
-# if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay:
-# delay = self.max_replay_thrash_delay
-# if self.randomize:
-# delay = random.randrange(0.0, self.max_replay_thrash_delay)
-# time.sleep(delay)
-# self.log('kill replaying mds.{id}'.format(id=self.to_kill))
-# self.kill_mds(self.to_kill)
-#
-# delay = self.max_revive_delay
-# if self.randomize:
-# delay = random.randrange(0.0, self.max_revive_delay)
-#
-# self.log('waiting for {delay} secs before reviving mds.{id}'.format(
-# delay=delay, id=self.to_kill))
-# time.sleep(delay)
-#
-# self.log('revive mds.{id}'.format(id=self.to_kill))
-# self.revive_mds(self.to_kill)
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Stress test the mds by thrashing while another task/workunit
- is running.
-
- Please refer to MDSThrasher class for further information on the
- available options.
- """
-
- mds_cluster = MDSCluster(ctx)
-
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'mds_thrash task only accepts a dict for configuration'
- mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
- assert len(mdslist) > 1, \
- 'mds_thrash task requires at least 2 metadata servers'
-
- # choose random seed
- if 'seed' in config:
- seed = int(config['seed'])
- else:
- seed = int(time.time())
- log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
- random.seed(seed)
-
- (first,) = ctx.cluster.only('mds.{_id}'.format(_id=mdslist[0])).remotes.iterkeys()
- manager = ceph_manager.CephManager(
- first, ctx=ctx, logger=log.getChild('ceph_manager'),
- )
-
- # make sure everyone is in active, standby, or standby-replay
- log.info('Wait for all MDSs to reach steady state...')
- status = mds_cluster.status()
- while True:
- steady = True
- for info in status.get_all():
- state = info['state']
- if state not in ('up:active', 'up:standby', 'up:standby-replay'):
- steady = False
- break
- if steady:
- break
- time.sleep(2)
- status = mds_cluster.status()
- log.info('Ready to start thrashing')
-
- manager.wait_for_clean()
- thrashers = {}
- for fs in status.get_filesystems():
- name = fs['mdsmap']['fs_name']
- log.info('Running thrasher against FS {f}'.format(f = name))
- thrasher = MDSThrasher(
- ctx, manager, config,
- log.getChild('fs.[{f}]'.format(f = name)),
- Filesystem(ctx, fs['id']), fs['mdsmap']['max_mds']
- )
- thrasher.start()
- thrashers[name] = thrasher
-
- try:
- log.debug('Yielding')
- yield
- finally:
- log.info('joining mds_thrashers')
- for name in thrashers:
- log.info('join thrasher mds_thrasher.fs.[{f}]'.format(f=name))
- thrashers[name].stop()
- thrashers[name].get() # Raise any exception from _run()
- thrashers[name].join()
- log.info('done joining')
+++ /dev/null
-instance-id: test
-local-hostname: test
+++ /dev/null
-
-from unittest import case
-import json
-
-from teuthology import misc
-from tasks.ceph_test_case import CephTestCase
-
-# TODO move definition of CephCluster
-from tasks.cephfs.filesystem import CephCluster
-
-
-class MgrCluster(CephCluster):
- def __init__(self, ctx):
- super(MgrCluster, self).__init__(ctx)
- self.mgr_ids = list(misc.all_roles_of_type(ctx.cluster, 'mgr'))
-
- if len(self.mgr_ids) == 0:
- raise RuntimeError(
- "This task requires at least one manager daemon")
-
- self.mgr_daemons = dict(
- [(mgr_id, self._ctx.daemons.get_daemon('mgr', mgr_id)) for mgr_id
- in self.mgr_ids])
-
- def mgr_stop(self, mgr_id):
- self.mgr_daemons[mgr_id].stop()
-
- def mgr_fail(self, mgr_id):
- self.mon_manager.raw_cluster_cmd("mgr", "fail", mgr_id)
-
- def mgr_restart(self, mgr_id):
- self.mgr_daemons[mgr_id].restart()
-
- def get_mgr_map(self):
- status = json.loads(
- self.mon_manager.raw_cluster_cmd("status", "--format=json-pretty"))
-
- return status["mgrmap"]
-
- def get_active_id(self):
- return self.get_mgr_map()["active_name"]
-
- def get_standby_ids(self):
- return [s['name'] for s in self.get_mgr_map()["standbys"]]
-
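-# Illustrative usage (a sketch; ctx is supplied by the teuthology runner):
-#
-#   mgr_cluster = MgrCluster(ctx)
-#   active = mgr_cluster.get_active_id()
-#   mgr_cluster.mgr_fail(active)               # ask the mons to fail the active mgr
-#   standbys = mgr_cluster.get_standby_ids()   # one of these should take over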
-
-class MgrTestCase(CephTestCase):
- REQUIRE_MGRS = 1
-
- def setUp(self):
- super(MgrTestCase, self).setUp()
-
- # The test runner should have populated this
- assert self.mgr_cluster is not None
-
- if len(self.mgr_cluster.mgr_ids) < self.REQUIRE_MGRS:
- raise case.SkipTest("Only have {0} manager daemons, "
- "{1} are required".format(
- len(self.mgr_cluster.mgr_ids), self.REQUIRE_MGRS))
-
- # Restart all the daemons
- for daemon in self.mgr_cluster.mgr_daemons.values():
- daemon.stop()
-
- for mgr_id in self.mgr_cluster.mgr_ids:
- self.mgr_cluster.mgr_fail(mgr_id)
-
- for daemon in self.mgr_cluster.mgr_daemons.values():
- daemon.restart()
-
- # Wait for an active to come up
- self.wait_until_true(lambda: self.mgr_cluster.get_active_id() != "",
- timeout=20)
-
- expect_standbys = set(self.mgr_cluster.mgr_ids) \
- - {self.mgr_cluster.get_active_id()}
- self.wait_until_true(
- lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
- timeout=20)
+++ /dev/null
-
-import logging
-
-from tasks.mgr.mgr_test_case import MgrTestCase
-
-
-log = logging.getLogger(__name__)
-
-
-class TestFailover(MgrTestCase):
- REQUIRE_MGRS = 2
-
- def test_timeout(self):
- """
- That when an active mgr stops responding, a standby is promoted
- after mon_mgr_beacon_grace.
- """
-
- # Query which mgr is active
- original_active = self.mgr_cluster.get_active_id()
- original_standbys = self.mgr_cluster.get_standby_ids()
-
- # Stop that daemon
- self.mgr_cluster.mgr_stop(original_active)
-
- # Assert that the other mgr becomes active
- self.wait_until_true(
- lambda: self.mgr_cluster.get_active_id() in original_standbys,
- timeout=60
- )
-
- self.mgr_cluster.mgr_restart(original_active)
- self.wait_until_true(
- lambda: original_active in self.mgr_cluster.get_standby_ids(),
- timeout=10
- )
-
- def test_explicit_fail(self):
- """
- That when a user explicitly fails a daemon, a standby immediately
- replaces it.
- :return:
- """
- # Query which mgr is active
- original_active = self.mgr_cluster.get_active_id()
- original_standbys = self.mgr_cluster.get_standby_ids()
-
- self.mgr_cluster.mgr_fail(original_active)
-
- # A standby should take over
- self.wait_until_true(
- lambda: self.mgr_cluster.get_active_id() in original_standbys,
- timeout=60
- )
-
- # The one we failed should come back as a standby (he isn't
- # really dead)
- self.wait_until_true(
- lambda: original_active in self.mgr_cluster.get_standby_ids(),
- timeout=10
- )
-
- def test_standby_timeout(self):
- """
- That when a standby daemon stops sending beacons, it is
- removed from the list of standbys
- :return:
- """
- original_active = self.mgr_cluster.get_active_id()
- original_standbys = self.mgr_cluster.get_standby_ids()
-
- victim = original_standbys[0]
- self.mgr_cluster.mgr_stop(victim)
-
- expect_standbys = set(original_standbys) - {victim}
-
- self.wait_until_true(
- lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
- timeout=60
- )
- self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+++ /dev/null
-# mod_fastcgi config goes here
-
-# Set fastcgi environment variables.
-# Note that this is separate from Unix environment variables!
-SetEnv RGW_LOG_LEVEL 20
-SetEnv RGW_SHOULD_LOG yes
-SetEnv RGW_PRINT_CONTINUE {print_continue}
-
-<IfModule !fastcgi_module>
- LoadModule fastcgi_module {mod_path}/mod_fastcgi.so
-</IfModule>
-
-FastCgiIPCDir {testdir}/apache/tmp.{client}/fastcgi_sock
-FastCgiExternalServer {testdir}/apache/htdocs.{client}/rgw.fcgi -socket rgw_sock -idle-timeout {idle_timeout}
-RewriteEngine On
-
-RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /rgw.fcgi?page=$1&params=$2&%{{QUERY_STRING}} [E=HTTP_AUTHORIZATION:%{{HTTP:Authorization}},L]
+++ /dev/null
-# mod_proxy_fcgi config, using TCP
-
-<IfModule !proxy_module>
- LoadModule proxy_module {mod_path}/mod_proxy.so
-</IfModule>
-<IfModule !proxy_fcgi_module>
- LoadModule proxy_fcgi_module {mod_path}/mod_proxy_fcgi.so
-</IfModule>
-
-RewriteEngine On
-
-RewriteRule .* - [E=HTTP_AUTHORIZATION:%{{HTTP:Authorization}},L]
-
-SetEnv proxy-nokeepalive 1
-
-ProxyPass / fcgi://0.0.0.0:9000/
+++ /dev/null
-# mod_proxy_fcgi config, using UDS
-
-<IfModule !proxy_module>
- LoadModule proxy_module {mod_path}/mod_proxy.so
-</IfModule>
-<IfModule !proxy_fcgi_module>
- LoadModule proxy_fcgi_module {mod_path}/mod_proxy_fcgi.so
-</IfModule>
-
-RewriteEngine On
-
-RewriteRule .* - [E=HTTP_AUTHORIZATION:%{{HTTP:Authorization}},L]
-
-ProxyPass / unix://{testdir}/apache/tmp.{client}/fastcgi_sock/rgw_sock|fcgi://localhost:9000/ disablereuse=On
+++ /dev/null
-"""
-Handle clock skews in monitors.
-"""
-import logging
-import contextlib
-import ceph_manager
-import time
-import gevent
-from StringIO import StringIO
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-class ClockSkewCheck:
- """
- Periodically check if there are any clock skews among the monitors in the
- quorum. By default, assume no skews are supposed to exist; that can be
- changed using the 'expect-skew' option. If 'fail-on-skew' is set to false,
- then we will always succeed and only report skews if any are found.
-
- This class does not spawn a thread. It assumes that, if that is indeed
- wanted, it should be done by a third party (for instance, the task using
- this class). We intend it as such in order to reuse this class if need be.
-
- This task accepts the following options:
-
-    interval     number of seconds to wait between checks. (default: 30.0)
- max-skew maximum skew, in seconds, that is considered tolerable before
- issuing a warning. (default: 0.05)
- expect-skew 'true' or 'false', to indicate whether to expect a skew during
- the run or not. If 'true', the test will fail if no skew is
- found, and succeed if a skew is indeed found; if 'false', it's
- the other way around. (default: false)
- never-fail Don't fail the run if a skew is detected and we weren't
- expecting it, or if no skew is detected and we were expecting
- it. (default: False)
-
- at-least-once Runs at least once, even if we are told to stop.
- (default: True)
- at-least-once-timeout If we were told to stop but we are attempting to
- run at least once, timeout after this many seconds.
- (default: 600)
-
- Example:
- Expect a skew higher than 0.05 seconds, but only report it without
- failing the teuthology run.
-
- - mon_clock_skew_check:
- interval: 30
- max-skew: 0.05
-        expect-skew: true
- never-fail: true
- """
-
- def __init__(self, ctx, manager, config, logger):
- self.ctx = ctx
- self.manager = manager
-
- self.stopping = False
- self.logger = logger
- self.config = config
-
- if self.config is None:
- self.config = dict()
-
- self.check_interval = float(self.config.get('interval', 30.0))
-
- first_mon = teuthology.get_first_mon(ctx, config)
- remote = ctx.cluster.only(first_mon).remotes.keys()[0]
- proc = remote.run(
- args=[
- 'sudo',
- 'ceph-mon',
- '-i', first_mon[4:],
- '--show-config-value', 'mon_clock_drift_allowed'
- ], stdout=StringIO(), wait=True
- )
- self.max_skew = self.config.get('max-skew', float(proc.stdout.getvalue()))
-
- self.expect_skew = self.config.get('expect-skew', False)
- self.never_fail = self.config.get('never-fail', False)
- self.at_least_once = self.config.get('at-least-once', True)
- self.at_least_once_timeout = self.config.get('at-least-once-timeout', 600.0)
-
- def info(self, x):
- """
- locally define logger for info messages
- """
- self.logger.info(x)
-
- def warn(self, x):
- """
- locally define logger for warnings
- """
- self.logger.warn(x)
-
- def debug(self, x):
- """
- locally define logger for debug messages
- """
-        self.logger.debug(x)
-
- def finish(self):
- """
- Break out of the do_check loop.
- """
- self.stopping = True
-
- def sleep_interval(self):
- """
- If a sleep interval is set, sleep for that amount of time.
- """
- if self.check_interval > 0.0:
- self.debug('sleeping for {s} seconds'.format(
- s=self.check_interval))
- time.sleep(self.check_interval)
-
- def print_skews(self, skews):
- """
- Display skew values.
- """
- total = len(skews)
- if total > 0:
- self.info('---------- found {n} skews ----------'.format(n=total))
- for mon_id, values in skews.iteritems():
- self.info('mon.{id}: {v}'.format(id=mon_id, v=values))
- self.info('-------------------------------------')
- else:
- self.info('---------- no skews were found ----------')
-
- def do_check(self):
- """
- Clock skew checker. Loops until finish() is called.
- """
- self.info('start checking for clock skews')
- skews = dict()
- ran_once = False
-
- started_on = None
-
- while not self.stopping or (self.at_least_once and not ran_once):
-
- if self.at_least_once and not ran_once and self.stopping:
- if started_on is None:
- self.info('kicking-off timeout (if any)')
- started_on = time.time()
- elif self.at_least_once_timeout > 0.0:
- assert time.time() - started_on < self.at_least_once_timeout, \
- 'failed to obtain a timecheck before timeout expired'
-
- quorum_size = len(teuthology.get_mon_names(self.ctx))
- self.manager.wait_for_mon_quorum_size(quorum_size)
-
- health = self.manager.get_mon_health(True)
- timechecks = health['timechecks']
-
- clean_check = False
-
- if timechecks['round_status'] == 'finished':
- assert (timechecks['round'] % 2) == 0, \
- 'timecheck marked as finished but round ' \
- 'disagrees (r {r})'.format(
- r=timechecks['round'])
- clean_check = True
- else:
- assert timechecks['round_status'] == 'on-going', \
- 'timecheck status expected \'on-going\' ' \
- 'but found \'{s}\' instead'.format(
- s=timechecks['round_status'])
- if 'mons' in timechecks.keys() and len(timechecks['mons']) > 1:
- self.info('round still on-going, but there are available reports')
- else:
- self.info('no timechecks available just yet')
- self.sleep_interval()
- continue
-
- assert len(timechecks['mons']) > 1, \
- 'there are not enough reported timechecks; ' \
- 'expected > 1 found {n}'.format(n=len(timechecks['mons']))
-
- for check in timechecks['mons']:
- mon_skew = float(check['skew'])
- mon_health = check['health']
- mon_id = check['name']
- if abs(mon_skew) > self.max_skew:
- assert mon_health == 'HEALTH_WARN', \
- 'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format(
- id=mon_id,health=mon_health,s=abs(mon_skew),ms=self.max_skew)
-
- log_str = 'mon.{id} with skew {s} > max {ms}'.format(
- id=mon_id,s=abs(mon_skew),ms=self.max_skew)
-
-                # add to skew list
- details = check['details']
- skews[mon_id] = {'skew': mon_skew, 'details': details}
-
- if self.expect_skew:
- self.info('expected skew: {str}'.format(str=log_str))
- else:
- self.warn('unexpected skew: {str}'.format(str=log_str))
-
- if clean_check or (self.expect_skew and len(skews) > 0):
- ran_once = True
- self.print_skews(skews)
- self.sleep_interval()
-
- total = len(skews)
- self.print_skews(skews)
-
- error_str = ''
- found_error = False
-
- if self.expect_skew:
- if total == 0:
- error_str = 'We were expecting a skew, but none was found!'
- found_error = True
- else:
- if total > 0:
- error_str = 'We were not expecting a skew, but we did find it!'
- found_error = True
-
- if found_error:
- self.info(error_str)
- if not self.never_fail:
- assert False, error_str
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
-    Use the ClockSkewCheck class to check for clock skews on the monitors.
- This task will spawn a thread running ClockSkewCheck's do_check().
-
- All the configuration will be directly handled by ClockSkewCheck,
- so please refer to the class documentation for further information.
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'mon_clock_skew_check task only accepts a dict for configuration'
- log.info('Beginning mon_clock_skew_check...')
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- skew_check = ClockSkewCheck(ctx,
- manager, config,
- logger=log.getChild('mon_clock_skew_check'))
- skew_check_thread = gevent.spawn(skew_check.do_check)
- try:
- yield
- finally:
- log.info('joining mon_clock_skew_check')
- skew_check.finish()
- skew_check_thread.get()
-
-
+++ /dev/null
-"""
-Monitor recovery
-"""
-import logging
-import ceph_manager
-from teuthology import misc as teuthology
-
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Test monitor recovery.
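-
-    Example (a sketch; the task takes no options, and at least three monitors
-    are needed so a quorum survives the single-monitor restarts below)::
-
-        roles:
-        - [mon.a, mon.b, mon.c, osd.0, osd.1, client.0]
-        tasks:
-        - ceph:
-        - mon_recovery: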
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'task only accepts a dict for configuration'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)]
- log.info("mon ids = %s" % mons)
-
- manager.wait_for_mon_quorum_size(len(mons))
-
- log.info('verifying all monitors are in the quorum')
- for m in mons:
- s = manager.get_mon_status(m)
- assert s['state'] == 'leader' or s['state'] == 'peon'
- assert len(s['quorum']) == len(mons)
-
- log.info('restarting each monitor in turn')
- for m in mons:
- # stop a monitor
- manager.kill_mon(m)
- manager.wait_for_mon_quorum_size(len(mons) - 1)
-
- # restart
- manager.revive_mon(m)
- manager.wait_for_mon_quorum_size(len(mons))
-
- # in forward and reverse order,
-    rmons = list(mons)
-    rmons.reverse()
-    for mons in mons, rmons:
- log.info('stopping all monitors')
- for m in mons:
- manager.kill_mon(m)
-
- log.info('forming a minimal quorum for %s, then adding monitors' % mons)
- qnum = (len(mons) / 2) + 1
- num = 0
- for m in mons:
- manager.revive_mon(m)
- num += 1
- if num >= qnum:
- manager.wait_for_mon_quorum_size(num)
-
- # on both leader and non-leader ranks...
- for rank in [0, 1]:
- # take one out
- log.info('removing mon %s' % mons[rank])
- manager.kill_mon(mons[rank])
- manager.wait_for_mon_quorum_size(len(mons) - 1)
-
- log.info('causing some monitor log activity')
- m = 30
- for n in range(1, m):
- manager.raw_cluster_cmd('log', '%d of %d' % (n, m))
-
- log.info('adding mon %s back in' % mons[rank])
- manager.revive_mon(mons[rank])
- manager.wait_for_mon_quorum_size(len(mons))
+++ /dev/null
-from cStringIO import StringIO
-
-import contextlib
-import logging
-import random
-
-from teuthology import misc as teuthology
-from teuthology.orchestra import run
-
-from ceph_manager import CephManager, write_conf
-
-
-log = logging.getLogger(__name__)
-
-
-def _get_mons(ctx):
- return [name[len('mon.'):] for name in teuthology.get_mon_names(ctx)]
-
-
- # teuthology prepares the monitor IPs (and ports) in get_mons(), so we can
- # enumerate all monitor ports ([6789..]) and find the next available one.
-def _get_next_port(ctx, ip, cluster):
- # assuming we have only one cluster here.
- used = []
- for name in teuthology.get_mon_names(ctx, cluster):
- addr = ctx.ceph[cluster].conf[name]['mon addr']
- mon_ip, mon_port = addr.split(':')
- if mon_ip != ip:
- continue
- used.append(int(mon_port))
- port = 6789
- used.sort()
- for p in used:
- if p != port:
- break
- port += 1
- return port
-
-
-def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
- # co-locate a new monitor on remote where an existing monitor is hosted
- cluster = manager.cluster
- remote.run(args=['sudo', 'mkdir', '-p', data_path])
- keyring_path = '/etc/ceph/{cluster}.keyring'.format(
- cluster=manager.cluster)
- testdir = teuthology.get_testdir(ctx)
- monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
- cluster=cluster)
- manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
- if manager.controller != remote:
- monmap = teuthology.get_file(manager.controller, monmap_path)
- teuthology.write_file(remote, monmap_path, StringIO(monmap))
- remote.run(
- args=[
- 'sudo',
- 'ceph-mon',
- '--cluster', cluster,
- '--mkfs',
- '-i', mon,
- '--monmap', monmap_path,
- '--keyring', keyring_path])
- if manager.controller != remote:
- teuthology.delete_file(remote, monmap_path)
- # raw_cluster_cmd() is performed using sudo, so sudo here also.
- teuthology.delete_file(manager.controller, monmap_path, sudo=True)
- # update ceph.conf so that the ceph CLI is able to connect to the cluster
- if conf_path:
- ip = remote.ip_address
- port = _get_next_port(ctx, ip, cluster)
- mon_addr = '{ip}:{port}'.format(ip=ip, port=port)
- ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr}
- write_conf(ctx, conf_path, cluster)
-
-
-def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
- cluster = manager.cluster
- del ctx.ceph[cluster].conf[name]
- write_conf(ctx, conf_path, cluster)
- remote.run(args=['sudo', 'rm', '-rf', data_path])
-
-
-@contextlib.contextmanager
-def _prepare_mon(ctx, manager, remote, mon):
- cluster = manager.cluster
- data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
- cluster=cluster, id=mon)
- conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
- name = 'mon.{0}'.format(mon)
- _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
- yield
- _teardown_mon(ctx, manager, remote, name,
- data_path, conf_path)
-
-
-# run_daemon() in ceph.py starts a herd of daemons of the same type, but
-# _run_daemon() starts only one instance.
-@contextlib.contextmanager
-def _run_daemon(ctx, remote, cluster, type_, id_):
- testdir = teuthology.get_testdir(ctx)
- coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
- daemon_signal = 'kill'
- run_cmd = [
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'daemon-helper',
- daemon_signal,
- ]
- run_cmd_tail = [
- 'ceph-%s' % (type_),
- '-f',
- '--cluster', cluster,
- '-i', id_]
- run_cmd.extend(run_cmd_tail)
- ctx.daemons.add_daemon(remote, type_, id_,
- cluster=cluster,
- args=run_cmd,
- logger=log.getChild(type_),
- stdin=run.PIPE,
- wait=False)
- daemon = ctx.daemons.get_daemon(type_, id_, cluster)
- yield daemon
- daemon.stop()
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- replace a monitor with a newly added one, and then revert this change
-
- How it works::
- 1. add a mon with specified id (mon.victim_prime)
- 2. wait for quorum
- 3. remove a monitor with specified id (mon.victim), mon.victim will commit
- suicide
- 4. wait for quorum
- 5. <yield>
- 6. add mon.victim back, and start it
- 7. wait for quorum
- 8. remove mon.victim_prime
-
- Options::
- victim the id of the mon to be removed (pick a random mon by default)
- replacer the id of the new mon (use "${victim}_prime" if not specified)
- """
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager'))
-
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- "task ceph only supports a dictionary for configuration"
- overrides = ctx.config.get('overrides', {})
- teuthology.deep_merge(config, overrides.get('mon_seesaw', {}))
- victim = config.get('victim', random.choice(_get_mons(ctx)))
- replacer = config.get('replacer', '{0}_prime'.format(victim))
- remote = manager.find_remote('mon', victim)
- quorum = manager.get_mon_quorum()
- cluster = manager.cluster
- log.info('replacing {victim} with {replacer}'.format(victim=victim,
- replacer=replacer))
- with _prepare_mon(ctx, manager, remote, replacer):
- with _run_daemon(ctx, remote, cluster, 'mon', replacer):
- # replacer will join the quorum automatically
- manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
- # if we don't remove the victim from the monmap, there is a chance
- # that we leave the new joiner with a monmap of 2 mons; it will not
- # be able to reach the other one and will keep probing forever.
- log.info('removing {mon}'.format(mon=victim))
- manager.raw_cluster_cmd('mon', 'remove', victim)
- manager.wait_for_mon_quorum_size(len(quorum), 10)
- # the victim will commit suicide after being removed from
- # monmap, let's wait until it stops.
- ctx.daemons.get_daemon('mon', victim, cluster).wait(10)
- try:
- # perform other tasks
- yield
- finally:
- # bring the victim back online
- # nuke the monstore of victim, otherwise it will refuse to boot
- # with the following message:
- #
- # not in monmap and have been in a quorum before; must have
- # been removed
- log.info('re-adding {mon}'.format(mon=victim))
- data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
- cluster=cluster, id=victim)
- remote.run(args=['sudo', 'rm', '-rf', data_path])
- name = 'mon.{0}'.format(victim)
- _setup_mon(ctx, manager, remote, victim, name, data_path, None)
- log.info('reviving {mon}'.format(mon=victim))
- manager.revive_mon(victim)
- manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
- manager.raw_cluster_cmd('mon', 'remove', replacer)
- manager.wait_for_mon_quorum_size(len(quorum), 10)
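Putting the documented options together, a hedged example fragment could be
(the victim/replacer values are illustrative; as the deep_merge above shows,
the same keys can also be supplied via overrides.mon_seesaw)::

    tasks:
    - install:
    - ceph:
    - mon_seesaw:
        victim: a
        replacer: a_prime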
+++ /dev/null
-"""
-Monitor thrash
-"""
-import logging
-import contextlib
-import ceph_manager
-import random
-import time
-import gevent
-import json
-import math
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-def _get_mons(ctx):
- """
- Get monitor names from the context value.
- """
- mons = [f[len('mon.'):] for f in teuthology.get_mon_names(ctx)]
- return mons
-
-class MonitorThrasher:
- """
- How it works::
-
- - pick a monitor
- - kill it
- - wait for quorum to be formed
- - sleep for 'revive_delay' seconds
- - revive monitor
- - wait for quorum to be formed
- - sleep for 'thrash_delay' seconds
-
- Options::
-
- seed Seed to use on the RNG to reproduce a previous
- behaviour (default: None; i.e., not set)
- revive_delay Number of seconds to wait before reviving
- the monitor (default: 10)
- thrash_delay Number of seconds to wait in-between
- test iterations (default: 0)
- store_thrash Thrash the monitor store before killing the
- monitor being thrashed (default: False)
- store_thrash_probability Probability of thrashing a monitor's store
- (default: 50)
- thrash_many Thrash multiple monitors instead of just one. If
- 'maintain_quorum' is set to False, then we will
- thrash up to as many monitors as there are
- available. (default: False)
- maintain_quorum Always maintain quorum, being careful about how
- many monitors we kill during the thrashing. If we
- happen to have only one or two monitors configured
- and this option is set to True, then we won't run
- this task, as we cannot guarantee maintenance of
- quorum. Setting it to False allows the task to run
- with as few as a single monitor.
- (default: True)
- freeze_mon_probability: how often to freeze the mon instead of killing it,
- in % (default: 10)
- freeze_mon_duration: how many seconds to freeze the mon (default: 15)
- scrub Scrub after each iteration (default: True)
-
- Note: if 'store_thrash' is set to True, then 'maintain_quorum' must also
- be set to True.
-
- For example::
-
- tasks:
- - ceph:
- - mon_thrash:
- revive_delay: 20
- thrash_delay: 1
- store_thrash: true
- store_thrash_probability: 40
- seed: 31337
- maintain_quorum: true
- thrash_many: true
- - ceph-fuse:
- - workunit:
- clients:
- all:
- - mon/workloadgen.sh
- """
- def __init__(self, ctx, manager, config, logger):
- self.ctx = ctx
- self.manager = manager
- self.manager.wait_for_clean()
-
- self.stopping = False
- self.logger = logger
- self.config = config
-
- if self.config is None:
- self.config = dict()
-
- """ Test reproducibility """
- self.random_seed = self.config.get('seed', None)
-
- if self.random_seed is None:
- self.random_seed = int(time.time())
-
- self.rng = random.Random()
- self.rng.seed(int(self.random_seed))
-
- """ Monitor thrashing """
- self.revive_delay = float(self.config.get('revive_delay', 10.0))
- self.thrash_delay = float(self.config.get('thrash_delay', 0.0))
-
- self.thrash_many = self.config.get('thrash_many', False)
- self.maintain_quorum = self.config.get('maintain_quorum', True)
-
- self.scrub = self.config.get('scrub', True)
-
- self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
- self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
-
- assert self.max_killable() > 0, \
- 'Unable to kill at least one monitor with the current config.'
-
- """ Store thrashing """
- self.store_thrash = self.config.get('store_thrash', False)
- self.store_thrash_probability = int(
- self.config.get('store_thrash_probability', 50))
- if self.store_thrash:
- assert self.store_thrash_probability > 0, \
- 'store_thrash is set, probability must be > 0'
- assert self.maintain_quorum, \
- 'store_thrash = true must imply maintain_quorum = true'
-
- self.thread = gevent.spawn(self.do_thrash)
-
- def log(self, x):
- """
- locally log info messages
- """
- self.logger.info(x)
-
- def do_join(self):
- """
- Break out of this process's thrashing loop.
- """
- self.stopping = True
- self.thread.get()
-
- def should_thrash_store(self):
- """
- If allowed, indicate that we should thrash a certain percentage of
- the time as determined by the store_thrash_probability value.
- """
- if not self.store_thrash:
- return False
- return self.rng.randrange(0, 101) < self.store_thrash_probability
-
- def thrash_store(self, mon):
- """
- Thrash the monitor specified.
- :param mon: monitor to thrash
- """
- addr = self.ctx.ceph['ceph'].conf['mon.%s' % mon]['mon addr']
- self.log('thrashing mon.{id}@{addr} store'.format(id=mon, addr=addr))
- out = self.manager.raw_cluster_cmd('-m', addr, 'sync', 'force')
- j = json.loads(out)
- assert j['ret'] == 0, \
- 'error forcing store sync on mon.{id}:\n{ret}'.format(
- id=mon,ret=out)
-
- def should_freeze_mon(self):
- """
- Indicate that we should freeze a certain percentage of the time
- as determined by the freeze_mon_probability value.
- """
- return self.rng.randrange(0, 101) < self.freeze_mon_probability
-
- def freeze_mon(self, mon):
- """
- Send STOP signal to freeze the monitor.
- """
- log.info('Sending STOP to mon %s', mon)
- self.manager.signal_mon(mon, 19) # STOP
-
- def unfreeze_mon(self, mon):
- """
- Send CONT signal to unfreeze the monitor.
- """
- log.info('Sending CONT to mon %s', mon)
- self.manager.signal_mon(mon, 18) # CONT
-
- def kill_mon(self, mon):
- """
- Kill the monitor specified
- """
- self.log('killing mon.{id}'.format(id=mon))
- self.manager.kill_mon(mon)
-
- def revive_mon(self, mon):
- """
- Revive the monitor specified
- """
- self.log('reviving mon.{id}'.format(id=mon))
- self.manager.revive_mon(mon)
-
- def max_killable(self):
- """
- Return the maximum number of monitors we can kill.
- """
- m = len(_get_mons(self.ctx))
- if self.maintain_quorum:
- return max(math.ceil(m/2.0)-1, 0)
- else:
- return m
-
- def do_thrash(self):
- """
- Continuously loop and thrash the monitors.
- """
- self.log('start thrashing')
- self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
- 'thrash many: {tm}, maintain quorum: {mq} '\
- 'store thrash: {st}, probability: {stp} '\
- 'freeze mon: prob {fp} duration {fd}'.format(
- s=self.random_seed,r=self.revive_delay,t=self.thrash_delay,
- tm=self.thrash_many, mq=self.maintain_quorum,
- st=self.store_thrash,stp=self.store_thrash_probability,
- fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
- ))
-
- while not self.stopping:
- mons = _get_mons(self.ctx)
- self.manager.wait_for_mon_quorum_size(len(mons))
- self.log('making sure all monitors are in the quorum')
- for m in mons:
- s = self.manager.get_mon_status(m)
- assert s['state'] == 'leader' or s['state'] == 'peon'
- assert len(s['quorum']) == len(mons)
-
- kill_up_to = self.rng.randrange(1, self.max_killable()+1)
- mons_to_kill = self.rng.sample(mons, kill_up_to)
- self.log('monitors to thrash: {m}'.format(m=mons_to_kill))
-
- mons_to_freeze = []
- for mon in mons:
- if mon in mons_to_kill:
- continue
- if self.should_freeze_mon():
- mons_to_freeze.append(mon)
- self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))
-
- for mon in mons_to_kill:
- self.log('thrashing mon.{m}'.format(m=mon))
-
- """ we only thrash stores if we are maintaining quorum """
- if self.should_thrash_store() and self.maintain_quorum:
- self.thrash_store(mon)
-
- self.kill_mon(mon)
-
- if mons_to_freeze:
- for mon in mons_to_freeze:
- self.freeze_mon(mon)
- self.log('waiting for {delay} secs to unfreeze mons'.format(
- delay=self.freeze_mon_duration))
- time.sleep(self.freeze_mon_duration)
- for mon in mons_to_freeze:
- self.unfreeze_mon(mon)
-
- if self.maintain_quorum:
- self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill))
- for m in mons:
- if m in mons_to_kill:
- continue
- s = self.manager.get_mon_status(m)
- assert s['state'] == 'leader' or s['state'] == 'peon'
- assert len(s['quorum']) == len(mons)-len(mons_to_kill)
-
- self.log('waiting for {delay} secs before reviving monitors'.format(
- delay=self.revive_delay))
- time.sleep(self.revive_delay)
-
- for mon in mons_to_kill:
- self.revive_mon(mon)
- # do more freezes
- if mons_to_freeze:
- for mon in mons_to_freeze:
- self.freeze_mon(mon)
- self.log('waiting for {delay} secs to unfreeze mons'.format(
- delay=self.freeze_mon_duration))
- time.sleep(self.freeze_mon_duration)
- for mon in mons_to_freeze:
- self.unfreeze_mon(mon)
-
- self.manager.wait_for_mon_quorum_size(len(mons))
- for m in mons:
- s = self.manager.get_mon_status(m)
- assert s['state'] == 'leader' or s['state'] == 'peon'
- assert len(s['quorum']) == len(mons)
-
- if self.scrub:
- self.log('triggering scrub')
- try:
- self.manager.raw_cluster_cmd('scrub')
- except Exception:
- log.exception("Saw exception while triggering scrub")
-
- if self.thrash_delay > 0.0:
- self.log('waiting for {delay} secs before continuing thrashing'.format(
- delay=self.thrash_delay))
- time.sleep(self.thrash_delay)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Stress test the monitors by thrashing them while another task/workunit
- is running.
-
- Please refer to MonitorThrasher class for further information on the
- available options.
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'mon_thrash task only accepts a dict for configuration'
- assert len(_get_mons(ctx)) > 2, \
- 'mon_thrash task requires at least 3 monitors'
- log.info('Beginning mon_thrash...')
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
- thrash_proc = MonitorThrasher(ctx,
- manager, config,
- logger=log.getChild('mon_thrasher'))
- try:
- log.debug('Yielding')
- yield
- finally:
- log.info('joining mon_thrasher')
- thrash_proc.do_join()
- mons = _get_mons(ctx)
- manager.wait_for_mon_quorum_size(len(mons))
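As a quick, standalone sanity check of the max_killable() arithmetic used by
the thrasher above (a sketch, not part of the task itself)::

    import math

    def max_killable(num_mons, maintain_quorum=True):
        # mirrors MonitorThrasher.max_killable()
        if maintain_quorum:
            return max(math.ceil(num_mons / 2.0) - 1, 0)
        return num_mons

    # with quorum maintained: 3 mons -> 1 killable, 5 -> 2, 7 -> 3
    assert [max_killable(m) for m in (3, 5, 7)] == [1, 2, 3]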
+++ /dev/null
-"""
-Multibench testing
-"""
-import contextlib
-import logging
-import radosbench
-import time
-import copy
-import gevent
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run multibench
-
- The config should be as follows:
-
- multibench:
- time: <seconds to run total>
- segments: <number of concurrent benches>
- radosbench: <config for radosbench>
-
- example:
-
- tasks:
- - ceph:
- - multibench:
- clients: [client.0]
- time: 360
- - interactive:
- """
- log.info('Beginning multibench...')
- assert isinstance(config, dict), \
- "please list clients to run on"
-
- def run_one(num):
- """Run test spawn from gevent"""
- start = time.time()
- if not config.get('radosbench'):
- benchcontext = {}
- else:
- benchcontext = copy.copy(config.get('radosbench'))
- iterations = 0
- while time.time() - start < int(config.get('time', 600)):
- log.info("Starting iteration %s of segment %s"%(iterations, num))
- benchcontext['pool'] = str(num) + "-" + str(iterations)
- with radosbench.task(ctx, benchcontext):
- pass  # radosbench.task blocks until the bench finishes on exit
- iterations += 1
- log.info("Starting %s threads"%(str(config.get('segments', 3)),))
- segments = [
- gevent.spawn(run_one, i)
- for i in range(0, int(config.get('segments', 3)))]
-
- try:
- yield
- finally:
- [i.get() for i in segments]
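A slightly fuller sketch of the config described in the docstring above (the
radosbench sub-dict is copied and handed to the radosbench task as-is, so its
keys are whatever that task accepts; the values shown are illustrative
assumptions)::

    tasks:
    - ceph:
    - multibench:
        time: 360
        segments: 3
        radosbench:
          clients: [client.0]
    - interactive: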
+++ /dev/null
-"""
-Test Object locations going down
-"""
-import logging
-import ceph_manager
-import time
-from teuthology import misc as teuthology
-from util.rados import rados
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Test handling of object location going down
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'object_source_down task only accepts a dict for configuration'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
- manager.wait_for_clean()
-
- # something that is always there
- dummyfile = '/etc/fstab'
-
- # take 0, 1 out
- manager.mark_out_osd(0)
- manager.mark_out_osd(1)
- manager.wait_for_clean()
-
- # delay recovery, and make the pg log very long (to prevent backfill)
- manager.raw_cluster_cmd(
- 'tell', 'osd.0',
- 'injectargs',
- '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
- )
- # delay recovery, and make the pg log very long (to prevent backfill)
- manager.raw_cluster_cmd(
- 'tell', 'osd.1',
- 'injectargs',
- '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
- )
- # delay recovery, and make the pg log very long (to prevent backfill)
- manager.raw_cluster_cmd(
- 'tell', 'osd.2',
- 'injectargs',
- '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
- )
- # delay recovery, and make the pg log very long (to prevent backfill)
- manager.raw_cluster_cmd(
- 'tell', 'osd.3',
- 'injectargs',
- '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
- )
-
- # kludge to make sure they get a map
- rados(ctx, mon, ['-p', 'data', 'put', 'dummy', dummyfile])
-
- # create old objects
- for f in range(1, 10):
- rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile])
-
- manager.mark_out_osd(3)
- manager.wait_till_active()
-
- manager.mark_in_osd(0)
- manager.wait_till_active()
-
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
-
- manager.mark_out_osd(2)
- manager.wait_till_active()
-
- # bring up 1
- manager.mark_in_osd(1)
- manager.wait_till_active()
-
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- log.info("Getting unfound objects")
- unfound = manager.get_num_unfound_objects()
- assert not unfound
-
- manager.kill_osd(2)
- manager.mark_down_osd(2)
- manager.kill_osd(3)
- manager.mark_down_osd(3)
-
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- log.info("Getting unfound objects")
- unfound = manager.get_num_unfound_objects()
- assert unfound
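A minimal fragment for this scenario might be the following sketch (the task
name object_source_down and the roles layout are assumptions; the test drives
osd.0 through osd.3 and writes to the data pool, so it needs at least four
OSDs)::

    roles:
    - [mon.a, osd.0, osd.1, osd.2, osd.3, client.0]
    tasks:
    - install:
    - ceph:
    - object_source_down: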
+++ /dev/null
-"""
-Run omapbench executable within teuthology
-"""
-import contextlib
-import logging
-
-from teuthology.orchestra import run
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run omapbench
-
- The config should be as follows::
-
- omapbench:
- clients: [client list]
- threads: <threads at once>
- objects: <number of objects to write>
- entries: <number of entries per object map>
- keysize: <number of characters per object map key>
- valsize: <number of characters per object map val>
- increment: <interval to show in histogram (in ms)>
- omaptype: <how the omaps should be generated>
-
- example::
-
- tasks:
- - ceph:
- - omapbench:
- clients: [client.0]
- threads: 30
- objects: 1000
- entries: 10
- keysize: 10
- valsize: 100
- increment: 100
- omaptype: uniform
- - interactive:
- """
- log.info('Beginning omapbench...')
- assert isinstance(config, dict), \
- "please list clients to run on"
- omapbench = {}
- testdir = teuthology.get_testdir(ctx)
- print(str(config.get('increment',-1)))
- for role in config.get('clients', ['client.0']):
- assert isinstance(role, basestring)
- PREFIX = 'client.'
- assert role.startswith(PREFIX)
- id_ = role[len(PREFIX):]
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
- proc = remote.run(
- args=[
- "/bin/sh", "-c",
- " ".join(['adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage',
- 'omapbench',
- '--name', role[len(PREFIX):],
- '-t', str(config.get('threads', 30)),
- '-o', str(config.get('objects', 1000)),
- '--entries', str(config.get('entries',10)),
- '--keysize', str(config.get('keysize',10)),
- '--valsize', str(config.get('valsize',1000)),
- '--inc', str(config.get('increment',10)),
- '--omaptype', str(config.get('omaptype','uniform'))
- ]).format(tdir=testdir),
- ],
- logger=log.getChild('omapbench.{id}'.format(id=id_)),
- stdin=run.PIPE,
- wait=False
- )
- omapbench[id_] = proc
-
- try:
- yield
- finally:
- log.info('joining omapbench')
- run.wait(omapbench.itervalues())
+++ /dev/null
-"""
-Osd backfill test
-"""
-import logging
-import ceph_manager
-import time
-from teuthology import misc as teuthology
-
-
-log = logging.getLogger(__name__)
-
-
-def rados_start(ctx, remote, cmd):
- """
- Run a remote rados command (currently used to only write data)
- """
- log.info("rados %s" % ' '.join(cmd))
- testdir = teuthology.get_testdir(ctx)
- pre = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rados',
- ]
- pre.extend(cmd)
- proc = remote.run(
- args=pre,
- wait=False,
- )
- return proc
-
-def task(ctx, config):
- """
- Test backfill
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'osd_backfill task only accepts a dict for configuration'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
- log.info('num_osds is %s' % num_osds)
- assert num_osds == 3
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_clean()
-
- # write some data
- p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096',
- '--no-cleanup'])
- err = p.wait()
- log.info('err is %d' % err)
-
- # mark osd.0 out to trigger a rebalance/backfill
- manager.mark_out_osd(0)
-
- # also mark it down so it won't be included in pg_temps
- manager.kill_osd(0)
- manager.mark_down_osd(0)
-
- # wait for everything to peer and be happy...
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_recovery()
-
- # write some new data
- p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096',
- '--no-cleanup'])
-
- time.sleep(15)
-
- # blackhole + restart osd.1
- # this triggers a divergent backfill target
- manager.blackhole_kill_osd(1)
- time.sleep(2)
- manager.revive_osd(1)
-
- # wait for our writes to complete + succeed
- err = p.wait()
- log.info('err is %d' % err)
-
- # cluster must recover
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_recovery()
-
- # re-add osd.0
- manager.revive_osd(0)
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_clean()
-
-
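A minimal fragment for the backfill test might look like this sketch (the
task name osd_backfill and the roles layout are assumptions; the assert above
requires exactly three OSDs, and the bench writes go to the rbd pool)::

    roles:
    - [mon.a, osd.0, osd.1, osd.2, client.0]
    tasks:
    - install:
    - ceph:
    - osd_backfill: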
+++ /dev/null
-"""
-Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
-"""
-from cStringIO import StringIO
-import logging
-import time
-
-from teuthology.orchestra import run
-from util.rados import rados
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
- configuration settings
-
- In order for the test to pass, log-whitelist must be used as follows
-
- tasks:
- - chef:
- - install:
- - ceph:
- log-whitelist: ['OSD near full', 'OSD full dropping all updates']
- - osd_failsafe_enospc:
-
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'osd_failsafe_enospc task only accepts a dict for configuration'
-
- # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
- sleep_time = 50
-
- # something that is always there
- dummyfile = '/etc/fstab'
- dummyfile2 = '/etc/resolv.conf'
-
- manager = ctx.managers['ceph']
-
- # create 1 pg pool with 1 rep which can only be on osd.0
- osds = manager.get_osd_dump()
- for osd in osds:
- if osd['osd'] != 0:
- manager.mark_out_osd(osd['osd'])
-
- log.info('creating pool foo')
- manager.create_pool("foo")
- manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')
-
- # State NONE -> NEAR
- log.info('1. Verify warning messages when exceeding nearfull_ratio')
-
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- proc = mon.run(
- args=[
- 'sudo',
- 'daemon-helper',
- 'kill',
- 'ceph', '-w'
- ],
- stdin=run.PIPE,
- stdout=StringIO(),
- wait=False,
- )
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')
-
- time.sleep(sleep_time)
- proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
- proc.wait()
-
- lines = proc.stdout.getvalue().split('\n')
-
- count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
- assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
- count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
- assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
-
- # State NEAR -> FULL
- log.info('2. Verify error messages when exceeding full_ratio')
-
- proc = mon.run(
- args=[
- 'sudo',
- 'daemon-helper',
- 'kill',
- 'ceph', '-w'
- ],
- stdin=run.PIPE,
- stdout=StringIO(),
- wait=False,
- )
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
-
- time.sleep(sleep_time)
- proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
- proc.wait()
-
- lines = proc.stdout.getvalue().split('\n')
-
- count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
- assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
-
- log.info('3. Verify write failure when exceeding full_ratio')
-
- # Write data should fail
- ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
- assert ret != 0, 'Expected write failure but it succeeded with exit status 0'
-
- # Put back default
- manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
- time.sleep(10)
-
- # State FULL -> NEAR
- log.info('4. Verify write success when NOT exceeding full_ratio')
-
- # Write should succeed
- ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
- assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret
-
- log.info('5. Verify warning messages again when exceeding nearfull_ratio')
-
- proc = mon.run(
- args=[
- 'sudo',
- 'daemon-helper',
- 'kill',
- 'ceph', '-w'
- ],
- stdin=run.PIPE,
- stdout=StringIO(),
- wait=False,
- )
-
- time.sleep(sleep_time)
- proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
- proc.wait()
-
- lines = proc.stdout.getvalue().split('\n')
-
- count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
- assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
- count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
- assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
- time.sleep(10)
-
- # State NONE -> FULL
- log.info('6. Verify error messages again when exceeding full_ratio')
-
- proc = mon.run(
- args=[
- 'sudo',
- 'daemon-helper',
- 'kill',
- 'ceph', '-w'
- ],
- stdin=run.PIPE,
- stdout=StringIO(),
- wait=False,
- )
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
-
- time.sleep(sleep_time)
- proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
- proc.wait()
-
- lines = proc.stdout.getvalue().split('\n')
-
- count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
- assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
- count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
- assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
-
- # State FULL -> NONE
- log.info('7. Verify no messages settings back to default')
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
- time.sleep(10)
-
- proc = mon.run(
- args=[
- 'sudo',
- 'daemon-helper',
- 'kill',
- 'ceph', '-w'
- ],
- stdin=run.PIPE,
- stdout=StringIO(),
- wait=False,
- )
-
- time.sleep(sleep_time)
- proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
- proc.wait()
-
- lines = proc.stdout.getvalue().split('\n')
-
- count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
- assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
- count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
- assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
-
- log.info('Test Passed')
-
- # Bring all OSDs back in
- manager.remove_pool("foo")
- for osd in osds:
- if osd['osd'] != 0:
- manager.mark_in_osd(osd['osd'])
+++ /dev/null
-"""
-osd recovery
-"""
-import logging
-import ceph_manager
-import time
-from teuthology import misc as teuthology
-
-
-log = logging.getLogger(__name__)
-
-
-def rados_start(testdir, remote, cmd):
- """
- Run a remote rados command (currently used to only write data)
- """
- log.info("rados %s" % ' '.join(cmd))
- pre = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rados',
- ]
- pre.extend(cmd)
- proc = remote.run(
- args=pre,
- wait=False,
- )
- return proc
-
-def task(ctx, config):
- """
- Test (non-backfill) recovery
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'task only accepts a dict for configuration'
- testdir = teuthology.get_testdir(ctx)
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
- log.info('num_osds is %s' % num_osds)
- assert num_osds == 3
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_clean()
-
- # test some osdmap flags
- manager.raw_cluster_cmd('osd', 'set', 'noin')
- manager.raw_cluster_cmd('osd', 'set', 'noout')
- manager.raw_cluster_cmd('osd', 'set', 'noup')
- manager.raw_cluster_cmd('osd', 'set', 'nodown')
- manager.raw_cluster_cmd('osd', 'unset', 'noin')
- manager.raw_cluster_cmd('osd', 'unset', 'noout')
- manager.raw_cluster_cmd('osd', 'unset', 'noup')
- manager.raw_cluster_cmd('osd', 'unset', 'nodown')
-
- # write some new data
- p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
- '--no-cleanup'])
-
- time.sleep(15)
-
- # trigger a divergent target:
- # blackhole + restart osd.1 (shorter log)
- manager.blackhole_kill_osd(1)
- # kill osd.2 (longer log... we'll make it divergent below)
- manager.kill_osd(2)
- time.sleep(2)
- manager.revive_osd(1)
-
- # wait for our writes to complete + succeed
- err = p.wait()
- log.info('err is %d' % err)
-
- # cluster must repeer
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_for_active_or_down()
-
- # write some more (make sure osd.2 really is divergent)
- p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
- p.wait()
-
- # revive divergent osd
- manager.revive_osd(2)
-
- while len(manager.get_osd_status()['up']) < 3:
- log.info('waiting a bit...')
- time.sleep(2)
- log.info('3 are up!')
-
- # cluster must recover
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_clean()
-
-
-def test_incomplete_pgs(ctx, config):
- """
- Test handling of incomplete pgs. Requires 4 osds.
- """
- testdir = teuthology.get_testdir(ctx)
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'task only accepts a dict for configuration'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
- log.info('num_osds is %s' % num_osds)
- assert num_osds == 4
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- while len(manager.get_osd_status()['up']) < 4:
- time.sleep(10)
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
- manager.wait_for_clean()
-
- log.info('Testing incomplete pgs...')
-
- for i in range(4):
- manager.set_config(
- i,
- osd_recovery_delay_start=1000)
-
- # move data off of osd.0, osd.1
- manager.raw_cluster_cmd('osd', 'out', '0', '1')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
- manager.wait_for_clean()
-
- # lots of objects in rbd (no pg log, will backfill)
- p = rados_start(testdir, mon,
- ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
- '--no-cleanup'])
- p.wait()
-
- # few objects in rbd pool (with pg log, normal recovery)
- for f in range(1, 20):
- p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
- 'foo.%d' % f, '/etc/passwd'])
- p.wait()
-
- # move it back
- manager.raw_cluster_cmd('osd', 'in', '0', '1')
- manager.raw_cluster_cmd('osd', 'out', '2', '3')
- time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
- time.sleep(10)
- manager.wait_for_active()
-
- assert not manager.is_clean()
- assert not manager.is_recovered()
-
- # kill 2 + 3
- log.info('stopping 2,3')
- manager.kill_osd(2)
- manager.kill_osd(3)
- log.info('...')
- manager.raw_cluster_cmd('osd', 'down', '2', '3')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_for_active_or_down()
-
- assert manager.get_num_down() > 0
-
- # revive 2 + 3
- manager.revive_osd(2)
- manager.revive_osd(3)
- while len(manager.get_osd_status()['up']) < 4:
- log.info('waiting a bit...')
- time.sleep(2)
- log.info('all are up!')
-
- for i in range(4):
- manager.kick_recovery_wq(i)
-
- # cluster must recover
- manager.wait_for_clean()
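For reference, a sketch of invoking the main entry point above (the task name
osd_recovery is an assumption; the assert requires exactly three OSDs, while
test_incomplete_pgs would instead need a four-OSD layout)::

    roles:
    - [mon.a, osd.0, osd.1, osd.2, client.0]
    tasks:
    - install:
    - ceph:
    - osd_recovery: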
+++ /dev/null
-"""
-Peer test (Single test, not much configurable here)
-"""
-import logging
-import json
-import time
-
-import ceph_manager
-from teuthology import misc as teuthology
-from util.rados import rados
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Test peering.
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'peer task only accepts a dict for configuration'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_clean()
-
- for i in range(3):
- manager.set_config(
- i,
- osd_recovery_delay_start=120)
-
- # take one osd down
- manager.kill_osd(2)
- manager.mark_down_osd(2)
-
- # kludge to make sure they get a map
- rados(ctx, mon, ['-p', 'data', 'get', 'dummy', '-'])
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_for_recovery()
-
- # kill another and revive 2, so that some pgs can't peer.
- manager.kill_osd(1)
- manager.mark_down_osd(1)
- manager.revive_osd(2)
- manager.wait_till_osd_is_up(2)
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-
- manager.wait_for_active_or_down()
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-
- # look for down pgs
- num_down_pgs = 0
- pgs = manager.get_pg_stats()
- for pg in pgs:
- out = manager.raw_cluster_cmd('pg', pg['pgid'], 'query')
- log.debug("out string %s",out)
- j = json.loads(out)
- log.info("pg is %s, query json is %s", pg, j)
-
- if pg['state'].count('down'):
- num_down_pgs += 1
- # verify that it is blocked on osd.1
- rs = j['recovery_state']
- assert len(rs) > 0
- assert rs[0]['name'] == 'Started/Primary/Peering/GetInfo'
- assert rs[1]['name'] == 'Started/Primary/Peering'
- assert rs[1]['blocked']
- assert rs[1]['down_osds_we_would_probe'] == [1]
- assert len(rs[1]['peering_blocked_by']) == 1
- assert rs[1]['peering_blocked_by'][0]['osd'] == 1
-
- assert num_down_pgs > 0
-
- # bring it all back
- manager.revive_osd(1)
- manager.wait_till_osd_is_up(1)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_clean()
+++ /dev/null
-"""
-Remotely run peering tests.
-"""
-import logging
-import time
-
-log = logging.getLogger(__name__)
-
-from args import argify
-
-POOLNAME = "POOLNAME"
-ARGS = [
- ('num_pgs', 'number of pgs to create', 256, int),
- ('max_time', 'seconds to complete peering', 0, int),
- ('runs', 'trials to run', 10, int),
- ('num_objects', 'objects to create', 256 * 1024, int),
- ('object_size', 'size in bytes for objects', 64, int),
- ('creation_time_limit', 'time limit for pool population', 60*60, int),
- ('create_threads', 'concurrent writes for create', 256, int)
- ]
-
-def setup(ctx, config):
- """
- Setup peering test on remotes.
- """
- manager = ctx.managers['ceph']
- manager.clear_pools()
- manager.create_pool(POOLNAME, config.num_pgs)
- log.info("populating pool")
- manager.rados_write_objects(
- POOLNAME,
- config.num_objects,
- config.object_size,
- config.creation_time_limit,
- config.create_threads)
- log.info("done populating pool")
-
-def do_run(ctx, config):
- """
- Perform the test.
- """
- start = time.time()
- # mark in osd
- manager = ctx.managers['ceph']
- manager.mark_in_osd(0)
- log.info("writing out objects")
- manager.rados_write_objects(
- POOLNAME,
- config.num_pgs, # write 1 object per pg or so
- 1,
- config.creation_time_limit,
- config.num_pgs, # lots of concurrency
- cleanup = True)
- peering_end = time.time()
-
- log.info("peering done, waiting on recovery")
- manager.wait_for_clean()
-
- log.info("recovery done")
- recovery_end = time.time()
- if config.max_time:
- assert(peering_end - start < config.max_time)
- manager.mark_out_osd(0)
- manager.wait_for_clean()
- return {
- 'time_to_active': peering_end - start,
- 'time_to_clean': recovery_end - start
- }
-
-@argify("peering_speed_test", ARGS)
-def task(ctx, config):
- """
- Peering speed test
- """
- setup(ctx, config)
- manager = ctx.managers['ceph']
- manager.mark_out_osd(0)
- manager.wait_for_clean()
- ret = []
- for i in range(config.runs):
- log.info("Run {i}".format(i = i))
- ret.append(do_run(ctx, config))
-
- manager.mark_in_osd(0)
- ctx.summary['recovery_times'] = {
- 'runs': ret
- }
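Since @argify registers this as peering_speed_test with the defaults listed
in ARGS, a fragment restating a few of them might look like the sketch below
(assuming the yaml keys map one-to-one onto the ARGS names)::

    tasks:
    - ceph:
    - peering_speed_test:
        num_pgs: 256
        max_time: 0
        runs: 10
        num_objects: 262144
        object_size: 64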
+++ /dev/null
-"""
-Populate rbd pools
-"""
-import contextlib
-import logging
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Populate <num_pools> pools (named with prefix <pool_prefix>) with
- <num_images> rbd images, each with <num_snaps> snapshots
-
- The config could be as follows::
-
- populate_rbd_pool:
- client: <client>
- pool_prefix: foo
- num_pools: 5
- num_images: 10
- num_snaps: 3
- image_size: 10737418240
- """
- if config is None:
- config = {}
- client = config.get("client", "client.0")
- pool_prefix = config.get("pool_prefix", "foo")
- num_pools = config.get("num_pools", 2)
- num_images = config.get("num_images", 20)
- num_snaps = config.get("num_snaps", 4)
- image_size = config.get("image_size", 100)
- write_size = config.get("write_size", 1024*1024)
- write_threads = config.get("write_threads", 10)
- write_total_per_snap = config.get("write_total_per_snap", 1024*1024*30)
-
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
-
- for poolid in range(num_pools):
- poolname = "%s-%s" % (pool_prefix, str(poolid))
- log.info("Creating pool %s" % (poolname,))
- ctx.managers['ceph'].create_pool(poolname)
- for imageid in range(num_images):
- imagename = "rbd-%s" % (str(imageid),)
- log.info("Creating imagename %s" % (imagename,))
- remote.run(
- args = [
- "rbd",
- "create",
- imagename,
- "--image-format", "1",
- "--size", str(image_size),
- "--pool", str(poolname)])
- def bench_run():
- remote.run(
- args = [
- "rbd",
- "bench-write",
- imagename,
- "--pool", poolname,
- "--io-size", str(write_size),
- "--io-threads", str(write_threads),
- "--io-total", str(write_total_per_snap),
- "--io-pattern", "rand"])
- log.info("imagename %s first bench" % (imagename,))
- bench_run()
- for snapid in range(num_snaps):
- snapname = "snap-%s" % (str(snapid),)
- log.info("imagename %s creating snap %s" % (imagename, snapname))
- remote.run(
- args = [
- "rbd", "snap", "create",
- "--pool", poolname,
- "--snap", snapname,
- imagename
- ])
- bench_run()
-
- try:
- yield
- finally:
- log.info('done')
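Wrapped into a job, the config shown in the docstring above would sit under
tasks, for example (a sketch; per the code, the defaults are client.0,
pool_prefix foo, 2 pools, 20 images and 4 snaps per image)::

    tasks:
    - install:
    - ceph:
    - populate_rbd_pool:
        client: client.0
        num_pools: 2
        num_images: 10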
+++ /dev/null
-"""
-Qemu task
-"""
-from cStringIO import StringIO
-
-import contextlib
-import logging
-import os
-
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from tasks import rbd
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-DEFAULT_NUM_RBD = 1
-DEFAULT_IMAGE_URL = 'http://ceph.com/qa/ubuntu-12.04.qcow2'
-DEFAULT_MEM = 4096 # in megabytes
-
-def create_images(ctx, config, managers):
- for client, client_config in config.iteritems():
- num_rbd = client_config.get('num_rbd', 1)
- clone = client_config.get('clone', False)
- assert num_rbd > 0, 'at least one rbd device must be used'
- for i in xrange(num_rbd):
- create_config = {
- client: {
- 'image_name': '{client}.{num}'.format(client=client, num=i),
- 'image_format': 2 if clone else 1,
- }
- }
- managers.append(
- lambda create_config=create_config:
- rbd.create_image(ctx=ctx, config=create_config)
- )
-
-def create_clones(ctx, config, managers):
- for client, client_config in config.iteritems():
- num_rbd = client_config.get('num_rbd', 1)
- clone = client_config.get('clone', False)
- if clone:
- for i in xrange(num_rbd):
- create_config = {
- client: {
- 'image_name':
- '{client}.{num}-clone'.format(client=client, num=i),
- 'parent_name':
- '{client}.{num}'.format(client=client, num=i),
- }
- }
- managers.append(
- lambda create_config=create_config:
- rbd.clone_image(ctx=ctx, config=create_config)
- )
-
-@contextlib.contextmanager
-def create_dirs(ctx, config):
- """
- Handle directory creation and cleanup
- """
- testdir = teuthology.get_testdir(ctx)
- for client, client_config in config.iteritems():
- assert 'test' in client_config, 'You must specify a test to run'
- (remote,) = ctx.cluster.only(client).remotes.keys()
- remote.run(
- args=[
- 'install', '-d', '-m0755', '--',
- '{tdir}/qemu'.format(tdir=testdir),
- '{tdir}/archive/qemu'.format(tdir=testdir),
- ]
- )
- try:
- yield
- finally:
- for client, client_config in config.iteritems():
- assert 'test' in client_config, 'You must specify a test to run'
- (remote,) = ctx.cluster.only(client).remotes.keys()
- remote.run(
- args=[
- 'rmdir', '{tdir}/qemu'.format(tdir=testdir), run.Raw('||'), 'true',
- ]
- )
-
-@contextlib.contextmanager
-def generate_iso(ctx, config):
- """Execute system commands to generate iso"""
- log.info('generating iso...')
- testdir = teuthology.get_testdir(ctx)
- for client, client_config in config.iteritems():
- assert 'test' in client_config, 'You must specify a test to run'
- (remote,) = ctx.cluster.only(client).remotes.keys()
- src_dir = os.path.dirname(__file__)
- userdata_path = os.path.join(testdir, 'qemu', 'userdata.' + client)
- metadata_path = os.path.join(testdir, 'qemu', 'metadata.' + client)
-
- with file(os.path.join(src_dir, 'userdata_setup.yaml'), 'rb') as f:
- test_setup = ''.join(f.readlines())
- # configure the commands to set up the nfs mount
- mnt_dir = "/export/{client}".format(client=client)
- test_setup = test_setup.format(
- mnt_dir=mnt_dir
- )
-
- with file(os.path.join(src_dir, 'userdata_teardown.yaml'), 'rb') as f:
- test_teardown = ''.join(f.readlines())
-
- user_data = test_setup
- if client_config.get('type', 'filesystem') == 'filesystem':
- for i in xrange(0, client_config.get('num_rbd', DEFAULT_NUM_RBD)):
- dev_letter = chr(ord('b') + i)
- user_data += """
-- |
- #!/bin/bash
- mkdir /mnt/test_{dev_letter}
- mkfs -t xfs /dev/vd{dev_letter}
- mount -t xfs /dev/vd{dev_letter} /mnt/test_{dev_letter}
-""".format(dev_letter=dev_letter)
-
- # this may change later to pass the directories as args to the
- # script or something. xfstests needs that.
- user_data += """
-- |
- #!/bin/bash
- test -d /mnt/test_b && cd /mnt/test_b
- /mnt/cdrom/test.sh > /mnt/log/test.log 2>&1 && touch /mnt/log/success
-""" + test_teardown
-
- teuthology.write_file(remote, userdata_path, StringIO(user_data))
-
- with file(os.path.join(src_dir, 'metadata.yaml'), 'rb') as f:
- teuthology.write_file(remote, metadata_path, f)
-
- test_file = '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client)
- remote.run(
- args=[
- 'wget', '-nv', '-O', test_file,
- client_config['test'],
- run.Raw('&&'),
- 'chmod', '755', test_file,
- ],
- )
- remote.run(
- args=[
- 'genisoimage', '-quiet', '-input-charset', 'utf-8',
- '-volid', 'cidata', '-joliet', '-rock',
- '-o', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
- '-graft-points',
- 'user-data={userdata}'.format(userdata=userdata_path),
- 'meta-data={metadata}'.format(metadata=metadata_path),
- 'test.sh={file}'.format(file=test_file),
- ],
- )
- try:
- yield
- finally:
- for client in config.iterkeys():
- (remote,) = ctx.cluster.only(client).remotes.keys()
- remote.run(
- args=[
- 'rm', '-f',
- '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
- os.path.join(testdir, 'qemu', 'userdata.' + client),
- os.path.join(testdir, 'qemu', 'metadata.' + client),
- '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client),
- ],
- )
-
-@contextlib.contextmanager
-def download_image(ctx, config):
- """Downland base image, remove image file when done"""
- log.info('downloading base image')
- testdir = teuthology.get_testdir(ctx)
- for client, client_config in config.iteritems():
- (remote,) = ctx.cluster.only(client).remotes.keys()
- base_file = '{tdir}/qemu/base.{client}.qcow2'.format(tdir=testdir, client=client)
- remote.run(
- args=[
- 'wget', '-nv', '-O', base_file, DEFAULT_IMAGE_URL,
- ]
- )
- try:
- yield
- finally:
- log.debug('cleaning up base image files')
- for client in config.iterkeys():
- base_file = '{tdir}/qemu/base.{client}.qcow2'.format(
- tdir=testdir,
- client=client,
- )
- (remote,) = ctx.cluster.only(client).remotes.keys()
- remote.run(
- args=[
- 'rm', '-f', base_file,
- ],
- )
-
-
-def _setup_nfs_mount(remote, client, mount_dir):
- """
- Sets up an nfs mount on the remote that the guest can use to
- store logs. This nfs mount is also used to touch a file
- at the end of the test to indicate if the test was successful
- or not.
- """
- export_dir = "/export/{client}".format(client=client)
- log.info("Creating the nfs export directory...")
- remote.run(args=[
- 'sudo', 'mkdir', '-p', export_dir,
- ])
- log.info("Mounting the test directory...")
- remote.run(args=[
- 'sudo', 'mount', '--bind', mount_dir, export_dir,
- ])
- log.info("Adding mount to /etc/exports...")
- export = "{dir} *(rw,no_root_squash,no_subtree_check,insecure)".format(
- dir=export_dir
- )
- remote.run(args=[
- 'sudo', 'sed', '-i', '/^\/export\//d', "/etc/exports",
- ])
- remote.run(args=[
- 'echo', export, run.Raw("|"),
- 'sudo', 'tee', '-a', "/etc/exports",
- ])
- log.info("Restarting NFS...")
- if remote.os.package_type == "deb":
- remote.run(args=['sudo', 'service', 'nfs-kernel-server', 'restart'])
- else:
- remote.run(args=['sudo', 'systemctl', 'restart', 'nfs'])
-
-
-def _teardown_nfs_mount(remote, client):
- """
- Tears down the nfs mount on the remote used for logging and reporting the
- status of the tests being ran in the guest.
- """
- log.info("Tearing down the nfs mount for {remote}".format(remote=remote))
- export_dir = "/export/{client}".format(client=client)
- log.info("Stopping NFS...")
- if remote.os.package_type == "deb":
- remote.run(args=[
- 'sudo', 'service', 'nfs-kernel-server', 'stop'
- ])
- else:
- remote.run(args=[
- 'sudo', 'systemctl', 'stop', 'nfs'
- ])
- log.info("Unmounting exported directory...")
- remote.run(args=[
- 'sudo', 'umount', export_dir
- ])
- log.info("Deleting exported directory...")
- remote.run(args=[
- 'sudo', 'rm', '-r', '/export'
- ])
- log.info("Deleting export from /etc/exports...")
- remote.run(args=[
- 'sudo', 'sed', '-i', '$ d', '/etc/exports'
- ])
- log.info("Starting NFS...")
- if remote.os.package_type == "deb":
- remote.run(args=[
- 'sudo', 'service', 'nfs-kernel-server', 'start'
- ])
- else:
- remote.run(args=[
- 'sudo', 'systemctl', 'start', 'nfs'
- ])
-
-
-@contextlib.contextmanager
-def run_qemu(ctx, config):
- """Setup kvm environment and start qemu"""
- procs = []
- testdir = teuthology.get_testdir(ctx)
- for client, client_config in config.iteritems():
- (remote,) = ctx.cluster.only(client).remotes.keys()
- log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, client=client)
- remote.run(
- args=[
- 'mkdir', log_dir, run.Raw('&&'),
- 'sudo', 'modprobe', 'kvm',
- ]
- )
-
- # make an nfs mount to use for logging and to
- # allow the test to tell teuthology the test's outcome
- _setup_nfs_mount(remote, client, log_dir)
-
- base_file = '{tdir}/qemu/base.{client}.qcow2'.format(
- tdir=testdir,
- client=client
- )
- # Hack to make sure /dev/kvm permissions are set correctly
- # See http://tracker.ceph.com/issues/17977 and
- # https://bugzilla.redhat.com/show_bug.cgi?id=1333159
- remote.run(args='sudo udevadm control --reload')
- remote.run(args='sudo udevadm trigger /dev/kvm')
- remote.run(args='ls -l /dev/kvm')
-
- qemu_cmd = 'qemu-system-x86_64'
- if remote.os.package_type == "rpm":
- qemu_cmd = "/usr/libexec/qemu-kvm"
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'daemon-helper',
- 'term',
- qemu_cmd, '-enable-kvm', '-nographic',
- '-m', str(client_config.get('memory', DEFAULT_MEM)),
- # base OS device
- '-drive',
- 'file={base},format=qcow2,if=virtio'.format(base=base_file),
- # cd holding metadata for cloud-init
- '-cdrom', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
- ]
-
- cachemode = 'none'
- ceph_config = ctx.ceph['ceph'].conf.get('global', {})
- ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
- ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
- if ceph_config.get('rbd cache'):
- if ceph_config.get('rbd cache max dirty', 1) > 0:
- cachemode = 'writeback'
- else:
- cachemode = 'writethrough'
-
- clone = client_config.get('clone', False)
- for i in xrange(client_config.get('num_rbd', DEFAULT_NUM_RBD)):
- suffix = '-clone' if clone else ''
- args.extend([
- '-drive',
- 'file=rbd:rbd/{img}:id={id},format=raw,if=virtio,cache={cachemode}'.format(
- img='{client}.{num}{suffix}'.format(client=client, num=i,
- suffix=suffix),
- id=client[len('client.'):],
- cachemode=cachemode,
- ),
- ])
-
- log.info('starting qemu...')
- procs.append(
- remote.run(
- args=args,
- logger=log.getChild(client),
- stdin=run.PIPE,
- wait=False,
- )
- )
-
- try:
- yield
- finally:
- log.info('waiting for qemu tests to finish...')
- run.wait(procs)
-
- log.debug('checking that qemu tests succeeded...')
- for client in config.iterkeys():
- (remote,) = ctx.cluster.only(client).remotes.keys()
- # teardown nfs mount
- _teardown_nfs_mount(remote, client)
- # check for test status
- remote.run(
- args=[
- 'test', '-f',
- '{tdir}/archive/qemu/{client}/success'.format(
- tdir=testdir,
- client=client
- ),
- ],
- )
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run a test inside of QEMU on top of rbd. Only one test
- is supported per client.
-
- For example, you can specify which clients to run on::
-
- tasks:
- - ceph:
- - qemu:
- client.0:
- test: http://ceph.com/qa/test.sh
- client.1:
- test: http://ceph.com/qa/test2.sh
-
- Or use the same settings on all clients:
-
- tasks:
- - ceph:
- - qemu:
- all:
- test: http://ceph.com/qa/test.sh
-
- For tests that don't need a filesystem, set type to block::
-
- tasks:
- - ceph:
- - qemu:
- client.0:
- test: http://ceph.com/qa/test.sh
- type: block
-
- The test should be configured to run on /dev/vdb and later
- devices.
-
- If you want to run a test that uses more than one rbd image,
- specify how many images to use::
-
- tasks:
- - ceph:
- - qemu:
- client.0:
- test: http://ceph.com/qa/test.sh
- type: block
- num_rbd: 2
-
- You can set the amount of memory the VM has (default is 4096 MB)::
-
- tasks:
- - ceph:
- - qemu:
- client.0:
- test: http://ceph.com/qa/test.sh
- memory: 512 # megabytes
-
- If you want to run a test against a cloned rbd image, set clone to true::
-
- tasks:
- - ceph:
- - qemu:
- client.0:
- test: http://ceph.com/qa/test.sh
- clone: true
- """
- assert isinstance(config, dict), \
- "task qemu only supports a dictionary for configuration"
-
- config = teuthology.replace_all_with_clients(ctx.cluster, config)
-
- managers = []
- create_images(ctx=ctx, config=config, managers=managers)
- managers.extend([
- lambda: create_dirs(ctx=ctx, config=config),
- lambda: generate_iso(ctx=ctx, config=config),
- lambda: download_image(ctx=ctx, config=config),
- ])
- create_clones(ctx=ctx, config=config, managers=managers)
- managers.append(
- lambda: run_qemu(ctx=ctx, config=config),
- )
-
- with contextutil.nested(*managers):
- yield
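-
-
-# Illustrative sketch only, not part of the original task: the drive cache
-# mode chosen in run_qemu() above follows the client's rbd cache settings.
-# 'ceph_config' here is assumed to be the merged global/client conf dict.
-def _example_pick_cachemode(ceph_config):
-    if ceph_config.get('rbd cache'):
-        if ceph_config.get('rbd cache max dirty', 1) > 0:
-            return 'writeback'    # cache enabled and dirty data allowed
-        return 'writethrough'     # cache enabled but no dirty data allowed
-    return 'none'                 # rbd cache disabled
-
-# e.g. _example_pick_cachemode({'rbd cache': True}) == 'writeback'
-# and  _example_pick_cachemode({}) == 'none'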
+++ /dev/null
-"""
-Rados model-based integration tests
-"""
-import contextlib
-import logging
-import gevent
-from teuthology import misc as teuthology
-
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run RadosModel-based integration tests.
-
- The config should be as follows::
-
- rados:
- clients: [client list]
- ops: <number of ops>
- objects: <number of objects to use>
- max_in_flight: <max number of operations in flight>
- object_size: <size of objects in bytes>
- min_stride_size: <minimum write stride size in bytes>
- max_stride_size: <maximum write stride size in bytes>
- op_weights: <dictionary mapping operation type to integer weight>
- runs: <number of times to run> - the pool is remade between runs
- ec_pool: use an ec pool
- erasure_code_profile: profile to use with the erasure coded pool
- pool_snaps: use pool snapshots instead of selfmanaged snapshots
-        write_fadvise_dontneed: write with LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
-                                hinting that the data will not be accessed in
-                                the near future, so the OSD backend need not
-                                keep it in its cache.
-
- For example::
-
- tasks:
- - ceph:
- - rados:
- clients: [client.0]
- ops: 1000
- max_seconds: 0 # 0 for no limit
- objects: 25
- max_in_flight: 16
- object_size: 4000000
- min_stride_size: 1024
- max_stride_size: 4096
- op_weights:
- read: 20
- write: 10
- delete: 2
- snap_create: 3
- rollback: 2
- snap_remove: 0
- ec_pool: create an ec pool, defaults to False
-        erasure_code_use_hacky_overwrites: use the experimental
-                                           whitebox-testing overwrites mode
- erasure_code_profile:
- name: teuthologyprofile
- k: 2
- m: 1
- ruleset-failure-domain: osd
- pool_snaps: true
- write_fadvise_dontneed: true
- runs: 10
- - interactive:
-
- Optionally, you can provide the pool name to run against:
-
- tasks:
- - ceph:
- - exec:
- client.0:
- - ceph osd pool create foo
- - rados:
- clients: [client.0]
- pools: [foo]
- ...
-
- Alternatively, you can provide a pool prefix:
-
- tasks:
- - ceph:
- - exec:
- client.0:
- - ceph osd pool create foo.client.0
- - rados:
- clients: [client.0]
- pool_prefix: foo
- ...
-
-    The tests are run asynchronously; they are not complete when the task
- returns. For instance:
-
- - rados:
- clients: [client.0]
- pools: [ecbase]
- ops: 4000
- objects: 500
- op_weights:
- read: 100
- write: 100
- delete: 50
- copy_from: 50
- - print: "**** done rados ec-cache-agent (part 2)"
-
-    will run the print task immediately after the rados task begins, but
- not after it completes. To make the rados task a blocking / sequential
- task, use:
-
- - sequential:
- - rados:
- clients: [client.0]
- pools: [ecbase]
- ops: 4000
- objects: 500
- op_weights:
- read: 100
- write: 100
- delete: 50
- copy_from: 50
- - print: "**** done rados ec-cache-agent (part 2)"
-
- """
- log.info('Beginning rados...')
- assert isinstance(config, dict), \
- "please list clients to run on"
-
- object_size = int(config.get('object_size', 4000000))
- op_weights = config.get('op_weights', {})
- testdir = teuthology.get_testdir(ctx)
- args = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'ceph_test_rados']
- if config.get('ec_pool', False):
- args.extend(['--no-omap'])
- if config.get('erasure_code_use_hacky_overwrites', False):
- args.extend(['--no-sparse'])
- else:
- args.extend(['--ec-pool'])
- if config.get('write_fadvise_dontneed', False):
- args.extend(['--write-fadvise-dontneed'])
- if config.get('pool_snaps', False):
- args.extend(['--pool-snaps'])
- args.extend([
- '--max-ops', str(config.get('ops', 10000)),
- '--objects', str(config.get('objects', 500)),
- '--max-in-flight', str(config.get('max_in_flight', 16)),
- '--size', str(object_size),
- '--min-stride-size', str(config.get('min_stride_size', object_size / 10)),
- '--max-stride-size', str(config.get('max_stride_size', object_size / 5)),
- '--max-seconds', str(config.get('max_seconds', 0))
- ])
-
- weights = {}
- weights['read'] = 100
- weights['write'] = 100
- weights['delete'] = 10
-    # Mirrors the op_types in test/osd/TestRados.cc
- for field in [
- # read handled above
- # write handled above
- # delete handled above
- "snap_create",
- "snap_remove",
- "rollback",
- "setattr",
- "rmattr",
- "watch",
- "copy_from",
- "hit_set_list",
- "is_dirty",
- "undirty",
- "cache_flush",
- "cache_try_flush",
- "cache_evict",
- "append",
- "write",
- "read",
- "delete"
- ]:
- if field in op_weights:
- weights[field] = op_weights[field]
-
- if config.get('write_append_excl', True):
- if 'write' in weights:
- weights['write'] = weights['write'] / 2
- weights['write_excl'] = weights['write']
-
- if 'append' in weights:
- weights['append'] = weights['append'] / 2
- weights['append_excl'] = weights['append']
-
- for op, weight in weights.iteritems():
- args.extend([
- '--op', op, str(weight)
- ])
-
-
- def thread():
- """Thread spawned by gevent"""
- clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- log.info('clients are %s' % clients)
- manager = ctx.managers['ceph']
- if config.get('ec_pool', False):
- profile = config.get('erasure_code_profile', {})
- profile_name = profile.get('name', 'teuthologyprofile')
- manager.create_erasure_code_profile(profile_name, profile)
- else:
- profile_name = None
- for i in range(int(config.get('runs', '1'))):
- log.info("starting run %s out of %s", str(i), config.get('runs', '1'))
- tests = {}
- existing_pools = config.get('pools', [])
- created_pools = []
- for role in config.get('clients', clients):
- assert isinstance(role, basestring)
- PREFIX = 'client.'
- assert role.startswith(PREFIX)
- id_ = role[len(PREFIX):]
-
- pool = config.get('pool', None)
- if not pool and existing_pools:
- pool = existing_pools.pop()
- else:
- pool = manager.create_pool_with_unique_name(
- erasure_code_profile_name=profile_name,
- erasure_code_use_hacky_overwrites=
- config.get('erasure_code_use_hacky_overwrites', False)
- )
- created_pools.append(pool)
- if config.get('fast_read', False):
- manager.raw_cluster_cmd(
- 'osd', 'pool', 'set', pool, 'fast_read', 'true')
-
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
- proc = remote.run(
- args=["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args +
- ["--pool", pool],
- logger=log.getChild("rados.{id}".format(id=id_)),
- stdin=run.PIPE,
- wait=False
- )
- tests[id_] = proc
- run.wait(tests.itervalues())
-
- for pool in created_pools:
- manager.remove_pool(pool)
-
- running = gevent.spawn(thread)
-
- try:
- yield
- finally:
- log.info('joining rados')
- running.get()
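-
-
-# Illustrative sketch only, not part of the original task: how an op_weights
-# mapping from the yaml turns into the '--op <name> <weight>' arguments built
-# above (the real code also restricts op names to a fixed list and applies
-# the write_append_excl halving shown here).
-def _example_build_op_args(op_weights, write_append_excl=True):
-    weights = {'read': 100, 'write': 100, 'delete': 10}
-    weights.update(op_weights)
-    if write_append_excl:
-        for op in ('write', 'append'):
-            if op in weights:
-                weights[op] = weights[op] / 2
-                weights[op + '_excl'] = weights[op]
-    out = []
-    for op, weight in weights.iteritems():
-        out.extend(['--op', op, str(weight)])
-    return out
-
-# e.g. _example_build_op_args({'write': 10}) includes '--op write 5' and
-# '--op write_excl 5' alongside the default read and delete weights.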
+++ /dev/null
-"""
-Rados benchmarking
-"""
-import contextlib
-import logging
-
-from teuthology.orchestra import run
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run radosbench
-
- The config should be as follows:
-
- radosbench:
- clients: [client list]
- time: <seconds to run>
- pool: <pool to use>
- size: write size to use
- unique_pool: use a unique pool, defaults to False
- ec_pool: create an ec pool, defaults to False
- create_pool: create pool, defaults to False
- erasure_code_profile:
- name: teuthologyprofile
- k: 2
- m: 1
- ruleset-failure-domain: osd
- cleanup: false (defaults to true)
- example:
-
- tasks:
- - ceph:
- - radosbench:
- clients: [client.0]
- time: 360
- - interactive:
- """
- log.info('Beginning radosbench...')
- assert isinstance(config, dict), \
- "please list clients to run on"
- radosbench = {}
-
- testdir = teuthology.get_testdir(ctx)
- manager = ctx.managers['ceph']
-
- create_pool = config.get('create_pool', True)
- for role in config.get('clients', ['client.0']):
- assert isinstance(role, basestring)
- PREFIX = 'client.'
- assert role.startswith(PREFIX)
- id_ = role[len(PREFIX):]
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
-
- if config.get('ec_pool', False):
- profile = config.get('erasure_code_profile', {})
- profile_name = profile.get('name', 'teuthologyprofile')
- manager.create_erasure_code_profile(profile_name, profile)
- else:
- profile_name = None
-
- cleanup = []
- if not config.get('cleanup', True):
- cleanup = ['--no-cleanup']
-
- pool = config.get('pool', 'data')
- if create_pool:
- if pool != 'data':
- manager.create_pool(pool, erasure_code_profile_name=profile_name)
- else:
- pool = manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name)
-
- proc = remote.run(
- args=[
- "/bin/sh", "-c",
- " ".join(['adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage',
- 'rados',
- '--no-log-to-stderr',
- '--name', role,
- '-b', str(config.get('size', 4<<20)),
- '-p' , pool,
- 'bench', str(config.get('time', 360)), 'write',
- ] + cleanup).format(tdir=testdir),
- ],
- logger=log.getChild('radosbench.{id}'.format(id=id_)),
- stdin=run.PIPE,
- wait=False
- )
- radosbench[id_] = proc
-
- try:
- yield
- finally:
- timeout = config.get('time', 360) * 5 + 180
- log.info('joining radosbench (timing out after %ss)', timeout)
- run.wait(radosbench.itervalues(), timeout=timeout)
-
-    if pool != 'data' and create_pool:
- manager.remove_pool(pool)
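-
-
-# Illustrative sketch only, not part of the original task: for a hypothetical
-# config of {'size': 4 << 20, 'time': 360, 'cleanup': False} on client.0
-# against the default 'data' pool, the shell command assembled above is:
-_example_radosbench_cmd = ' '.join([
-    'adjust-ulimits', 'ceph-coverage', '{tdir}/archive/coverage',
-    'rados', '--no-log-to-stderr', '--name', 'client.0',
-    '-b', str(4 << 20), '-p', 'data',
-    'bench', '360', 'write', '--no-cleanup',
-])
-# The '{tdir}' placeholder is filled from teuthology.get_testdir(ctx) at run
-# time, exactly as in the .format(tdir=testdir) call above.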
+++ /dev/null
-"""
-Rados benchmarking sweep
-"""
-import contextlib
-import logging
-import re
-
-from cStringIO import StringIO
-from itertools import product
-
-from teuthology.orchestra import run
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Execute a radosbench parameter sweep
-
- Puts radosbench in a loop, taking values from the given config at each
- iteration. If given, the min and max values below create a range, e.g.
-    min_num_replicas=1 and max_num_replicas=3 implies executing with 1-3 replicas.
-
- Parameters:
-
- clients: [client list]
- time: seconds to run (default=120)
- sizes: [list of object sizes] (default=[4M])
- mode: <write|read|seq> (default=write)
- repetitions: execute the same configuration multiple times (default=1)
- min_num_replicas: minimum number of replicas to use (default = 3)
- max_num_replicas: maximum number of replicas to use (default = 3)
- min_num_osds: the minimum number of OSDs in a pool (default=all)
- max_num_osds: the maximum number of OSDs in a pool (default=all)
- file: name of CSV-formatted output file (default='radosbench.csv')
- columns: columns to include (default=all)
- - rep: execution number (takes values from 'repetitions')
- - num_osd: number of osds for pool
- - num_replica: number of replicas
- - avg_throughput: throughput
- - avg_latency: latency
- - stdev_throughput:
- - stdev_latency:
-
- Example:
-      - radosbenchsweep:
- columns: [rep, num_osd, num_replica, avg_throughput, stdev_throughput]
- """
- log.info('Beginning radosbenchsweep...')
- assert isinstance(config, dict), 'expecting dictionary for configuration'
-
- # get and validate config values
- # {
-
- # only one client supported for now
- if len(config.get('clients', [])) != 1:
- raise Exception("Only one client can be specified")
-
- # only write mode
- if config.get('mode', 'write') != 'write':
- raise Exception("Only 'write' mode supported for now.")
-
- # OSDs
- total_osds_in_cluster = teuthology.num_instances_of_type(ctx.cluster, 'osd')
- min_num_osds = config.get('min_num_osds', total_osds_in_cluster)
- max_num_osds = config.get('max_num_osds', total_osds_in_cluster)
-
- if max_num_osds > total_osds_in_cluster:
- raise Exception('max_num_osds cannot be greater than total in cluster')
- if min_num_osds < 1:
- raise Exception('min_num_osds cannot be less than 1')
- if min_num_osds > max_num_osds:
-        raise Exception('min_num_osds cannot be greater than max_num_osds')
- osds = range(0, (total_osds_in_cluster + 1))
-
- # replicas
- min_num_replicas = config.get('min_num_replicas', 3)
- max_num_replicas = config.get('max_num_replicas', 3)
-
- if min_num_replicas < 1:
- raise Exception('min_num_replicas cannot be less than 1')
- if min_num_replicas > max_num_replicas:
-        raise Exception('min_num_replicas cannot be greater than max_num_replicas')
- if max_num_replicas > max_num_osds:
- raise Exception('max_num_replicas cannot be greater than max_num_osds')
- replicas = range(min_num_replicas, (max_num_replicas + 1))
-
- # object size
- sizes = config.get('size', [4 << 20])
-
- # repetitions
- reps = range(config.get('repetitions', 1))
-
- # file
- fname = config.get('file', 'radosbench.csv')
- f = open('{}/{}'.format(ctx.archive, fname), 'w')
- f.write(get_csv_header(config) + '\n')
- # }
-
- # set default pools size=1 to avoid 'unhealthy' issues
- ctx.manager.set_pool_property('data', 'size', 1)
- ctx.manager.set_pool_property('metadata', 'size', 1)
- ctx.manager.set_pool_property('rbd', 'size', 1)
-
- current_osds_out = 0
-
- # sweep through all parameters
- for osds_out, size, replica, rep in product(osds, sizes, replicas, reps):
-
- osds_in = total_osds_in_cluster - osds_out
-
- if osds_in == 0:
- # we're done
- break
-
- if current_osds_out != osds_out:
- # take an osd out
- ctx.manager.raw_cluster_cmd(
- 'osd', 'reweight', str(osds_out-1), '0.0')
- wait_until_healthy(ctx, config)
- current_osds_out = osds_out
-
- if osds_in not in range(min_num_osds, (max_num_osds + 1)):
- # no need to execute with a number of osds that wasn't requested
- continue
-
- if osds_in < replica:
- # cannot execute with more replicas than available osds
- continue
-
- run_radosbench(ctx, config, f, osds_in, size, replica, rep)
-
- f.close()
-
- yield
-
-
-def get_csv_header(conf):
- all_columns = [
- 'rep', 'num_osd', 'num_replica', 'avg_throughput',
- 'avg_latency', 'stdev_throughput', 'stdev_latency'
- ]
- given_columns = conf.get('columns', None)
- if given_columns and len(given_columns) != 0:
- for column in given_columns:
- if column not in all_columns:
- raise Exception('Unknown column ' + column)
- return ','.join(conf['columns'])
- else:
- conf['columns'] = all_columns
- return ','.join(all_columns)
-
-
-def run_radosbench(ctx, config, f, num_osds, size, replica, rep):
- pool = ctx.manager.create_pool_with_unique_name()
-
- ctx.manager.set_pool_property(pool, 'size', replica)
-
- wait_until_healthy(ctx, config)
-
- log.info('Executing with parameters: ')
- log.info(' num_osd =' + str(num_osds))
- log.info(' size =' + str(size))
- log.info(' num_replicas =' + str(replica))
- log.info(' repetition =' + str(rep))
-
- for role in config.get('clients', ['client.0']):
- assert isinstance(role, basestring)
- PREFIX = 'client.'
- assert role.startswith(PREFIX)
- id_ = role[len(PREFIX):]
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
-
- proc = remote.run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{}/archive/coverage'.format(teuthology.get_testdir(ctx)),
- 'rados',
- '--no-log-to-stderr',
- '--name', role,
- '-b', str(size),
- '-p', pool,
- 'bench', str(config.get('time', 120)), 'write',
- ],
- logger=log.getChild('radosbench.{id}'.format(id=id_)),
- stdin=run.PIPE,
- stdout=StringIO(),
- wait=False
- )
-
- # parse output to get summary and format it as CSV
- proc.wait()
- out = proc.stdout.getvalue()
- all_values = {
- 'stdev_throughput': re.sub(r'Stddev Bandwidth: ', '', re.search(
- r'Stddev Bandwidth:.*', out).group(0)),
- 'stdev_latency': re.sub(r'Stddev Latency: ', '', re.search(
- r'Stddev Latency:.*', out).group(0)),
- 'avg_throughput': re.sub(r'Bandwidth \(MB/sec\): ', '', re.search(
- r'Bandwidth \(MB/sec\):.*', out).group(0)),
- 'avg_latency': re.sub(r'Average Latency: ', '', re.search(
- r'Average Latency:.*', out).group(0)),
- 'rep': str(rep),
- 'num_osd': str(num_osds),
- 'num_replica': str(replica)
- }
- values_to_write = []
- for column in config['columns']:
- values_to_write.extend([all_values[column]])
- f.write(','.join(values_to_write) + '\n')
-
- ctx.manager.remove_pool(pool)
-
-
-def wait_until_healthy(ctx, config):
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- teuthology.wait_until_healthy(ctx, mon_remote)
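-
-
-# Illustrative sketch only, not part of the original task: the sweep in task()
-# walks the cartesian product of (osds_out, size, replica, rep). For a
-# hypothetical 3-OSD cluster left at the defaults (one 4M object size,
-# replicas fixed at 3, a single repetition) the iterations are:
-#
-#   (0, 4194304, 3, 0)  -> runs with all 3 osds in
-#   (1, 4194304, 3, 0)  -> skipped: only 2 osds in, fewer than 3 replicas
-#   (2, 4194304, 3, 0)  -> skipped: only 1 osd in
-#   (3, 4194304, 3, 0)  -> loop breaks, no osds left in
-#
-# e.g. list(product(range(4), [4 << 20], [3], range(1))) yields exactly those
-# four tuples in that order ('product' is already imported above).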
+++ /dev/null
-"""
-Rgw admin testing against a running instance
-"""
-# The test cases in this file have been annotated for inventory.
-# To extract the inventory (in csv format) use the command:
-#
-# grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
-#
-
-import copy
-import json
-import logging
-import time
-import datetime
-
-from cStringIO import StringIO
-
-import boto.exception
-import boto.s3.connection
-import boto.s3.acl
-
-import httplib2
-
-import util.rgw as rgw_utils
-
-from teuthology import misc as teuthology
-from util.rgw import rgwadmin, get_user_summary, get_user_successful_ops
-
-log = logging.getLogger(__name__)
-
-def create_presigned_url(conn, method, bucket_name, key_name, expiration):
- return conn.generate_url(expires_in=expiration,
- method=method,
- bucket=bucket_name,
- key=key_name,
- query_auth=True,
- )
-
-def send_raw_http_request(conn, method, bucket_name, key_name, follow_redirects = False):
- url = create_presigned_url(conn, method, bucket_name, key_name, 3600)
- print url
- h = httplib2.Http()
- h.follow_redirects = follow_redirects
- return h.request(url, method)
-
-
-def get_acl(key):
- """
- Helper function to get the xml acl from a key, ensuring that the xml
- version tag is removed from the acl response
- """
- raw_acl = key.get_xml_acl()
-
- def remove_version(string):
- return string.split(
- '<?xml version="1.0" encoding="UTF-8"?>'
- )[-1]
-
- def remove_newlines(string):
- return string.strip('\n')
-
- return remove_version(
- remove_newlines(raw_acl)
- )
-
-
-def task(ctx, config):
- """
- Test radosgw-admin functionality against a running rgw instance.
- """
- global log
- assert config is None or isinstance(config, list) \
- or isinstance(config, dict), \
- "task s3tests only supports a list or dictionary for configuration"
- all_clients = ['client.{id}'.format(id=id_)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- if config is None:
- config = all_clients
- if isinstance(config, list):
- config = dict.fromkeys(config)
- clients = config.keys()
-
- multi_region_run = rgw_utils.multi_region_enabled(ctx)
-
-    client = clients[0]  # default choice, multi-region code may overwrite this
- if multi_region_run:
- client = rgw_utils.get_master_client(ctx, clients)
-
- # once the client is chosen, pull the host name and assigned port out of
- # the role_endpoints that were assigned by the rgw task
- (remote_host, remote_port) = ctx.rgw.role_endpoints[client]
-
- realm = ctx.rgw.realm
- log.debug('radosgw-admin: realm %r', realm)
-
- ##
- user1='foo'
- user2='fud'
- subuser1='foo:foo1'
- subuser2='foo:foo2'
- display_name1='Foo'
- display_name2='Fud'
- email='foo@foo.com'
- email2='bar@bar.com'
- access_key='9te6NH5mcdcq0Tc5i8i1'
- secret_key='Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu'
- access_key2='p5YnriCv1nAtykxBrupQ'
- secret_key2='Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh'
- swift_secret1='gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL'
- swift_secret2='ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy'
-
- bucket_name='myfoo'
- bucket_name2='mybar'
-
- # connect to rgw
- connection = boto.s3.connection.S3Connection(
- aws_access_key_id=access_key,
- aws_secret_access_key=secret_key,
- is_secure=False,
- port=remote_port,
- host=remote_host,
- calling_format=boto.s3.connection.OrdinaryCallingFormat(),
- )
- connection2 = boto.s3.connection.S3Connection(
- aws_access_key_id=access_key2,
- aws_secret_access_key=secret_key2,
- is_secure=False,
- port=remote_port,
- host=remote_host,
- calling_format=boto.s3.connection.OrdinaryCallingFormat(),
- )
-
- # legend (test cases can be easily grep-ed out)
- # TESTCASE 'testname','object','method','operation','assertion'
- # TESTCASE 'info-nosuch','user','info','non-existent user','fails'
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
- assert err
-
- # TESTCASE 'create-ok','user','create','w/all valid info','succeeds'
- (err, out) = rgwadmin(ctx, client, [
- 'user', 'create',
- '--uid', user1,
- '--display-name', display_name1,
- '--email', email,
- '--access-key', access_key,
- '--secret', secret_key,
- '--max-buckets', '4'
- ],
- check_status=True)
-
- # TESTCASE 'duplicate email','user','create','existing user email','fails'
- (err, out) = rgwadmin(ctx, client, [
- 'user', 'create',
- '--uid', user2,
- '--display-name', display_name2,
- '--email', email,
- ])
- assert err
-
- # TESTCASE 'info-existing','user','info','existing user','returns correct info'
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
- assert out['user_id'] == user1
- assert out['email'] == email
- assert out['display_name'] == display_name1
- assert len(out['keys']) == 1
- assert out['keys'][0]['access_key'] == access_key
- assert out['keys'][0]['secret_key'] == secret_key
- assert not out['suspended']
-
- # this whole block should only be run if regions have been configured
- if multi_region_run:
- rgw_utils.radosgw_agent_sync_all(ctx)
- # post-sync, validate that user1 exists on the sync destination host
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- dest_client = c_config['dest']
- (err, out) = rgwadmin(ctx, dest_client, ['metadata', 'list', 'user'])
- (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user1], check_status=True)
- assert out['user_id'] == user1
- assert out['email'] == email
- assert out['display_name'] == display_name1
- assert len(out['keys']) == 1
- assert out['keys'][0]['access_key'] == access_key
- assert out['keys'][0]['secret_key'] == secret_key
- assert not out['suspended']
-
- # compare the metadata between different regions, make sure it matches
- log.debug('compare the metadata between different regions, make sure it matches')
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
- (err1, out1) = rgwadmin(ctx, source_client,
- ['metadata', 'get', 'user:{uid}'.format(uid=user1)], check_status=True)
- (err2, out2) = rgwadmin(ctx, dest_client,
- ['metadata', 'get', 'user:{uid}'.format(uid=user1)], check_status=True)
- assert out1 == out2
-
- # suspend a user on the master, then check the status on the destination
- log.debug('suspend a user on the master, then check the status on the destination')
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
- (err, out) = rgwadmin(ctx, source_client, ['user', 'suspend', '--uid', user1])
- rgw_utils.radosgw_agent_sync_all(ctx)
- (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user1], check_status=True)
- assert out['suspended']
-
- # delete a user on the master, then check that it's gone on the destination
- log.debug('delete a user on the master, then check that it\'s gone on the destination')
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
- (err, out) = rgwadmin(ctx, source_client, ['user', 'rm', '--uid', user1], check_status=True)
- rgw_utils.radosgw_agent_sync_all(ctx)
- (err, out) = rgwadmin(ctx, source_client, ['user', 'info', '--uid', user1])
- assert out is None
- (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user1])
- assert out is None
-
- # then recreate it so later tests pass
- (err, out) = rgwadmin(ctx, client, [
- 'user', 'create',
- '--uid', user1,
- '--display-name', display_name1,
- '--email', email,
- '--access-key', access_key,
- '--secret', secret_key,
- '--max-buckets', '4'
- ],
- check_status=True)
-
- # now do the multi-region bucket tests
- log.debug('now do the multi-region bucket tests')
-
- # Create a second user for the following tests
- log.debug('Create a second user for the following tests')
- (err, out) = rgwadmin(ctx, client, [
- 'user', 'create',
- '--uid', user2,
- '--display-name', display_name2,
- '--email', email2,
- '--access-key', access_key2,
- '--secret', secret_key2,
- '--max-buckets', '4'
- ],
- check_status=True)
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user2], check_status=True)
- assert out is not None
-
- # create a bucket and do a sync
- log.debug('create a bucket and do a sync')
- bucket = connection.create_bucket(bucket_name2)
- rgw_utils.radosgw_agent_sync_all(ctx)
-
- # compare the metadata for the bucket between different regions, make sure it matches
- log.debug('compare the metadata for the bucket between different regions, make sure it matches')
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
- (err1, out1) = rgwadmin(ctx, source_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
- (err2, out2) = rgwadmin(ctx, dest_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
- log.debug('metadata 1 %r', out1)
- log.debug('metadata 2 %r', out2)
- assert out1 == out2
-
- # get the bucket.instance info and compare that
- src_bucket_id = out1['data']['bucket']['bucket_id']
- dest_bucket_id = out2['data']['bucket']['bucket_id']
- (err1, out1) = rgwadmin(ctx, source_client, ['metadata', 'get',
- 'bucket.instance:{bucket_name}:{bucket_instance}'.format(
- bucket_name=bucket_name2,bucket_instance=src_bucket_id)],
- check_status=True)
- (err2, out2) = rgwadmin(ctx, dest_client, ['metadata', 'get',
- 'bucket.instance:{bucket_name}:{bucket_instance}'.format(
- bucket_name=bucket_name2,bucket_instance=dest_bucket_id)],
- check_status=True)
- del out1['data']['bucket_info']['bucket']['pool']
- del out1['data']['bucket_info']['bucket']['index_pool']
- del out1['data']['bucket_info']['bucket']['data_extra_pool']
- del out2['data']['bucket_info']['bucket']['pool']
- del out2['data']['bucket_info']['bucket']['index_pool']
- del out2['data']['bucket_info']['bucket']['data_extra_pool']
- assert out1 == out2
-
- same_region = 0
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
-
- source_region = rgw_utils.region_for_client(ctx, source_client)
- dest_region = rgw_utils.region_for_client(ctx, dest_client)
-
- # 301 is only returned for requests to something in a different region
- if source_region == dest_region:
- log.debug('301 is only returned for requests to something in a different region')
- same_region += 1
- continue
-
- # Attempt to create a new connection with user1 to the destination RGW
- log.debug('Attempt to create a new connection with user1 to the destination RGW')
- # and use that to attempt a delete (that should fail)
-
- (dest_remote_host, dest_remote_port) = ctx.rgw.role_endpoints[dest_client]
- connection_dest = boto.s3.connection.S3Connection(
- aws_access_key_id=access_key,
- aws_secret_access_key=secret_key,
- is_secure=False,
- port=dest_remote_port,
- host=dest_remote_host,
- calling_format=boto.s3.connection.OrdinaryCallingFormat(),
- )
-
- # this should fail
- r, content = send_raw_http_request(connection_dest, 'DELETE', bucket_name2, '', follow_redirects = False)
- assert r.status == 301
-
- # now delete the bucket on the source RGW and do another sync
- log.debug('now delete the bucket on the source RGW and do another sync')
- bucket.delete()
- rgw_utils.radosgw_agent_sync_all(ctx)
-
- if same_region == len(ctx.radosgw_agent.config):
- bucket.delete()
- rgw_utils.radosgw_agent_sync_all(ctx)
-
- # make sure that the bucket no longer exists in either region
- log.debug('make sure that the bucket no longer exists in either region')
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
- (err1, out1) = rgwadmin(ctx, source_client, ['metadata', 'get',
- 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)])
- (err2, out2) = rgwadmin(ctx, dest_client, ['metadata', 'get',
- 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)])
- # Both of the previous calls should have errors due to requesting
- # metadata for non-existent buckets
- assert err1
- assert err2
-
- # create a bucket and then sync it
- log.debug('create a bucket and then sync it')
- bucket = connection.create_bucket(bucket_name2)
- rgw_utils.radosgw_agent_sync_all(ctx)
-
- # compare the metadata for the bucket between different regions, make sure it matches
- log.debug('compare the metadata for the bucket between different regions, make sure it matches')
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
- (err1, out1) = rgwadmin(ctx, source_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
- (err2, out2) = rgwadmin(ctx, dest_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
- assert out1 == out2
-
- # Now delete the bucket and recreate it with a different user
- log.debug('Now delete the bucket and recreate it with a different user')
- # within the same window of time and then sync.
- bucket.delete()
- bucket = connection2.create_bucket(bucket_name2)
- rgw_utils.radosgw_agent_sync_all(ctx)
-
- # compare the metadata for the bucket between different regions, make sure it matches
- log.debug('compare the metadata for the bucket between different regions, make sure it matches')
- # user2 should own the bucket in both regions
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
- (err1, out1) = rgwadmin(ctx, source_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
- (err2, out2) = rgwadmin(ctx, dest_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
- assert out1 == out2
- assert out1['data']['owner'] == user2
- assert out1['data']['owner'] != user1
-
- # now we're going to use this bucket to test meta-data update propagation
- log.debug('now we\'re going to use this bucket to test meta-data update propagation')
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
-
- # get the metadata so we can tweak it
- log.debug('get the metadata so we can tweak it')
- (err, orig_data) = rgwadmin(ctx, source_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
-
- # manually edit mtime for this bucket to be 300 seconds in the past
- log.debug('manually edit mtime for this bucket to be 300 seconds in the past')
- new_data = copy.deepcopy(orig_data)
- mtime = datetime.datetime.strptime(orig_data['mtime'], "%Y-%m-%d %H:%M:%S.%fZ") - datetime.timedelta(300)
- new_data['mtime'] = unicode(mtime.strftime("%Y-%m-%d %H:%M:%S.%fZ"))
- log.debug("new mtime ", mtime)
- assert new_data != orig_data
- (err, out) = rgwadmin(ctx, source_client,
- ['metadata', 'put', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- stdin=StringIO(json.dumps(new_data)),
- check_status=True)
-
- # get the metadata and make sure that the 'put' worked
- log.debug('get the metadata and make sure that the \'put\' worked')
- (err, out) = rgwadmin(ctx, source_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
- assert out == new_data
-
- # sync to propagate the new metadata
- log.debug('sync to propagate the new metadata')
- rgw_utils.radosgw_agent_sync_all(ctx)
-
- # get the metadata from the dest and compare it to what we just set
- log.debug('get the metadata from the dest and compare it to what we just set')
- # and what the source region has.
- (err1, out1) = rgwadmin(ctx, source_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
- (err2, out2) = rgwadmin(ctx, dest_client,
- ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)],
- check_status=True)
- # yeah for the transitive property
- assert out1 == out2
- assert out1 == new_data
-
- # now we delete the bucket
- log.debug('now we delete the bucket')
- bucket.delete()
-
- log.debug('sync to propagate the deleted bucket')
- rgw_utils.radosgw_agent_sync_all(ctx)
-
- # Delete user2 as later tests do not expect it to exist.
- # Verify that it is gone on both regions
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- source_client = c_config['src']
- dest_client = c_config['dest']
- (err, out) = rgwadmin(ctx, source_client,
- ['user', 'rm', '--uid', user2], check_status=True)
- rgw_utils.radosgw_agent_sync_all(ctx)
- # The two 'user info' calls should fail and not return any data
- # since we just deleted this user.
- (err, out) = rgwadmin(ctx, source_client, ['user', 'info', '--uid', user2])
- assert out is None
- (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user2])
- assert out is None
-
- # Test data sync
-
- # First create a bucket for data sync test purpose
- bucket = connection.create_bucket(bucket_name + 'data')
-
- # Create a tiny file and check if in sync
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- if c_config.get('metadata-only'):
- continue
-
- for full in (True, False):
- source_client = c_config['src']
- dest_client = c_config['dest']
- k = boto.s3.key.Key(bucket)
- k.key = 'tiny_file'
- k.set_contents_from_string("123456789")
- safety_window = rgw_utils.radosgw_data_log_window(ctx, source_client)
- time.sleep(safety_window)
- rgw_utils.radosgw_agent_sync_all(ctx, data=True, full=full)
- (dest_host, dest_port) = ctx.rgw.role_endpoints[dest_client]
- dest_connection = boto.s3.connection.S3Connection(
- aws_access_key_id=access_key,
- aws_secret_access_key=secret_key,
- is_secure=False,
- port=dest_port,
- host=dest_host,
- calling_format=boto.s3.connection.OrdinaryCallingFormat(),
- )
- dest_k = dest_connection.get_bucket(bucket_name + 'data').get_key('tiny_file')
- assert k.get_contents_as_string() == dest_k.get_contents_as_string()
-
- # check that deleting it removes it from the dest zone
- k.delete()
- time.sleep(safety_window)
- # full sync doesn't handle deleted objects yet
- rgw_utils.radosgw_agent_sync_all(ctx, data=True, full=False)
-
- dest_bucket = dest_connection.get_bucket(bucket_name + 'data')
- dest_k = dest_bucket.get_key('tiny_file')
- assert dest_k == None, 'object not deleted from destination zone'
-
- # finally we delete the bucket
- bucket.delete()
-
- bucket = connection.create_bucket(bucket_name + 'data2')
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- if c_config.get('metadata-only'):
- continue
-
- for full in (True, False):
- source_client = c_config['src']
- dest_client = c_config['dest']
- (dest_host, dest_port) = ctx.rgw.role_endpoints[dest_client]
- dest_connection = boto.s3.connection.S3Connection(
- aws_access_key_id=access_key,
- aws_secret_access_key=secret_key,
- is_secure=False,
- port=dest_port,
- host=dest_host,
- calling_format=boto.s3.connection.OrdinaryCallingFormat(),
- )
- for i in range(20):
- k = boto.s3.key.Key(bucket)
- k.key = 'tiny_file_' + str(i)
- k.set_contents_from_string(str(i) * 100)
-
- safety_window = rgw_utils.radosgw_data_log_window(ctx, source_client)
- time.sleep(safety_window)
- rgw_utils.radosgw_agent_sync_all(ctx, data=True, full=full)
-
- for i in range(20):
- dest_k = dest_connection.get_bucket(bucket_name + 'data2').get_key('tiny_file_' + str(i))
- assert (str(i) * 100) == dest_k.get_contents_as_string()
- k = boto.s3.key.Key(bucket)
- k.key = 'tiny_file_' + str(i)
- k.delete()
-
- # check that deleting removes the objects from the dest zone
- time.sleep(safety_window)
- # full sync doesn't delete deleted objects yet
- rgw_utils.radosgw_agent_sync_all(ctx, data=True, full=False)
-
- for i in range(20):
- dest_bucket = dest_connection.get_bucket(bucket_name + 'data2')
- dest_k = dest_bucket.get_key('tiny_file_' + str(i))
- assert dest_k == None, 'object %d not deleted from destination zone' % i
- bucket.delete()
-
- # end of 'if multi_region_run:'
-
- # TESTCASE 'suspend-ok','user','suspend','active user','succeeds'
- (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
- check_status=True)
-
- # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory'
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
- assert out['suspended']
-
- # TESTCASE 're-enable','user','enable','suspended user','succeeds'
- (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1], check_status=True)
-
- # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended'
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
- assert not out['suspended']
-
- # TESTCASE 'add-keys','key','create','w/valid info','succeeds'
- (err, out) = rgwadmin(ctx, client, [
- 'key', 'create', '--uid', user1,
- '--access-key', access_key2, '--secret', secret_key2,
- ], check_status=True)
-
- # TESTCASE 'info-new-key','user','info','after key addition','returns all keys'
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1],
- check_status=True)
- assert len(out['keys']) == 2
- assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2
- assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2
-
- # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed'
- (err, out) = rgwadmin(ctx, client, [
- 'key', 'rm', '--uid', user1,
- '--access-key', access_key2,
- ], check_status=True)
- assert len(out['keys']) == 1
- assert out['keys'][0]['access_key'] == access_key
- assert out['keys'][0]['secret_key'] == secret_key
-
- # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
- subuser_access = 'full'
- subuser_perm = 'full-control'
-
- (err, out) = rgwadmin(ctx, client, [
- 'subuser', 'create', '--subuser', subuser1,
- '--access', subuser_access
- ], check_status=True)
-
- # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
- (err, out) = rgwadmin(ctx, client, [
- 'subuser', 'modify', '--subuser', subuser1,
- '--secret', swift_secret1,
- '--key-type', 'swift',
- ], check_status=True)
-
- # TESTCASE 'subuser-perm-mask', 'subuser', 'info', 'test subuser perm mask durability', 'succeeds'
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
-
- assert out['subusers'][0]['permissions'] == subuser_perm
-
- # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys'
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
- assert len(out['swift_keys']) == 1
- assert out['swift_keys'][0]['user'] == subuser1
- assert out['swift_keys'][0]['secret_key'] == swift_secret1
-
- # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds'
- (err, out) = rgwadmin(ctx, client, [
- 'subuser', 'create', '--subuser', subuser2,
- '--secret', swift_secret2,
- '--key-type', 'swift',
- ], check_status=True)
-
- # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys'
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
- assert len(out['swift_keys']) == 2
- assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2
- assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2
-
- # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed'
- (err, out) = rgwadmin(ctx, client, [
- 'key', 'rm', '--subuser', subuser1,
- '--key-type', 'swift',
- ], check_status=True)
- assert len(out['swift_keys']) == 1
-
- # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed'
- (err, out) = rgwadmin(ctx, client, [
- 'subuser', 'rm', '--subuser', subuser1,
- ], check_status=True)
- assert len(out['subusers']) == 1
-
-    # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subuser and key are removed'
- (err, out) = rgwadmin(ctx, client, [
- 'subuser', 'rm', '--subuser', subuser2,
- '--key-type', 'swift', '--purge-keys',
- ], check_status=True)
- assert len(out['swift_keys']) == 0
- assert len(out['subusers']) == 0
-
- # TESTCASE 'bucket-stats','bucket','stats','no session/buckets','succeeds, empty list'
- (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1],
- check_status=True)
- assert len(out) == 0
-
- if multi_region_run:
- rgw_utils.radosgw_agent_sync_all(ctx)
-
- # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list'
- (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
- assert len(out) == 0
-
- # create a first bucket
- bucket = connection.create_bucket(bucket_name)
-
- # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
- (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
- assert len(out) == 1
- assert out[0] == bucket_name
-
- # TESTCASE 'bucket-list-all','bucket','list','all buckets','succeeds, expected list'
- (err, out) = rgwadmin(ctx, client, ['bucket', 'list'], check_status=True)
- assert len(out) >= 1
-    assert bucket_name in out
-
-    # TESTCASE 'max-bucket-limit','bucket','create','4 buckets','5th bucket fails due to max buckets == 4'
- bucket2 = connection.create_bucket(bucket_name + '2')
- bucket3 = connection.create_bucket(bucket_name + '3')
- bucket4 = connection.create_bucket(bucket_name + '4')
- # the 5th should fail.
- failed = False
- try:
- connection.create_bucket(bucket_name + '5')
- except Exception:
- failed = True
- assert failed
-
- # delete the buckets
- bucket2.delete()
- bucket3.delete()
- bucket4.delete()
-
- # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
- (err, out) = rgwadmin(ctx, client, [
- 'bucket', 'stats', '--bucket', bucket_name], check_status=True)
- assert out['owner'] == user1
- bucket_id = out['id']
-
- # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID'
- (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1], check_status=True)
- assert len(out) == 1
- assert out[0]['id'] == bucket_id # does it return the same ID twice in a row?
-
- # use some space
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('one')
-
- # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
- (err, out) = rgwadmin(ctx, client, [
- 'bucket', 'stats', '--bucket', bucket_name], check_status=True)
- assert out['id'] == bucket_id
- assert out['usage']['rgw.main']['num_objects'] == 1
- assert out['usage']['rgw.main']['size_kb'] > 0
-
- # reclaim it
- key.delete()
-
- # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error'
- (err, out) = rgwadmin(ctx, client,
- ['bucket', 'unlink', '--uid', user1, '--bucket', bucket_name],
- check_status=True)
-
- # create a second user to link the bucket to
- (err, out) = rgwadmin(ctx, client, [
- 'user', 'create',
- '--uid', user2,
- '--display-name', display_name2,
- '--access-key', access_key2,
- '--secret', secret_key2,
- '--max-buckets', '1',
- ],
- check_status=True)
-
- # try creating an object with the first user before the bucket is relinked
- denied = False
- key = boto.s3.key.Key(bucket)
-
- try:
- key.set_contents_from_string('two')
- except boto.exception.S3ResponseError:
- denied = True
-
- assert not denied
-
- # delete the object
- key.delete()
-
- # link the bucket to another user
- (err, out) = rgwadmin(ctx, client, ['metadata', 'get', 'bucket:{n}'.format(n=bucket_name)],
- check_status=True)
-
- bucket_data = out['data']
- assert bucket_data['bucket']['name'] == bucket_name
-
- bucket_id = bucket_data['bucket']['bucket_id']
-
- # link the bucket to another user
- (err, out) = rgwadmin(ctx, client, ['bucket', 'link', '--uid', user2, '--bucket', bucket_name, '--bucket-id', bucket_id],
- check_status=True)
-
- # try to remove user, should fail (has a linked bucket)
- (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2])
- assert err
-
- # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'succeeds, bucket unlinked'
- (err, out) = rgwadmin(ctx, client, ['bucket', 'unlink', '--uid', user2, '--bucket', bucket_name],
- check_status=True)
-
- # relink the bucket to the first user and delete the second user
- (err, out) = rgwadmin(ctx, client,
- ['bucket', 'link', '--uid', user1, '--bucket', bucket_name, '--bucket-id', bucket_id],
- check_status=True)
-
- (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2],
- check_status=True)
-
- # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed'
-
- # upload an object
- object_name = 'four'
- key = boto.s3.key.Key(bucket, object_name)
- key.set_contents_from_string(object_name)
-
- # now delete it
- (err, out) = rgwadmin(ctx, client,
- ['object', 'rm', '--bucket', bucket_name, '--object', object_name],
- check_status=True)
-
-    # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists no objects'
- (err, out) = rgwadmin(ctx, client, [
- 'bucket', 'stats', '--bucket', bucket_name],
- check_status=True)
- assert out['id'] == bucket_id
- assert out['usage']['rgw.main']['num_objects'] == 0
-
- # list log objects
-    # TESTCASE 'log-list','log','list','after activity','succeeds, lists one or more objects'
- (err, out) = rgwadmin(ctx, client, ['log', 'list'], check_status=True)
- assert len(out) > 0
-
- for obj in out:
- # TESTCASE 'log-show','log','show','after activity','returns expected info'
- if obj[:4] == 'meta' or obj[:4] == 'data' or obj[:18] == 'obj_delete_at_hint':
- continue
-
- (err, rgwlog) = rgwadmin(ctx, client, ['log', 'show', '--object', obj],
- check_status=True)
- assert len(rgwlog) > 0
-
- # exempt bucket_name2 from checking as it was only used for multi-region tests
- assert rgwlog['bucket'].find(bucket_name) == 0 or rgwlog['bucket'].find(bucket_name2) == 0
- assert rgwlog['bucket'] != bucket_name or rgwlog['bucket_id'] == bucket_id
- assert rgwlog['bucket_owner'] == user1 or rgwlog['bucket'] == bucket_name + '5' or rgwlog['bucket'] == bucket_name2
- for entry in rgwlog['log_entries']:
-            log.debug('checking log entry: %s', entry)
- assert entry['bucket'] == rgwlog['bucket']
- possible_buckets = [bucket_name + '5', bucket_name2]
- user = entry['user']
- assert user == user1 or user.endswith('system-user') or \
- rgwlog['bucket'] in possible_buckets
-
- # TESTCASE 'log-rm','log','rm','delete log objects','succeeds'
- (err, out) = rgwadmin(ctx, client, ['log', 'rm', '--object', obj],
- check_status=True)
-
- # TODO: show log by bucket+date
-
- # need to wait for all usage data to get flushed, should take up to 30 seconds
- timestamp = time.time()
- while time.time() - timestamp <= (20 * 60): # wait up to 20 minutes
- (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj']) # last operation we did is delete obj, wait for it to flush
- if get_user_successful_ops(out, user1) > 0:
- break
- time.sleep(1)
-
- assert time.time() - timestamp <= (20 * 60)
-
- # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
- (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
- assert len(out['entries']) > 0
- assert len(out['summary']) > 0
-
- user_summary = get_user_summary(out, user1)
-
- total = user_summary['total']
- assert total['successful_ops'] > 0
-
- # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
- (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
- check_status=True)
- assert len(out['entries']) > 0
- assert len(out['summary']) > 0
- user_summary = out['summary'][0]
- for entry in user_summary['categories']:
- assert entry['successful_ops'] > 0
- assert user_summary['user'] == user1
-
- # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
- test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
- for cat in test_categories:
- (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat],
- check_status=True)
- assert len(out['summary']) > 0
- user_summary = out['summary'][0]
- assert user_summary['user'] == user1
- assert len(user_summary['categories']) == 1
- entry = user_summary['categories'][0]
- assert entry['category'] == cat
- assert entry['successful_ops'] > 0
-
-    # the usage flush interval is 30 seconds, wait that much and then some
- # to make sure everything has been flushed
- time.sleep(35)
-
- # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
- (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1],
- check_status=True)
- (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
- check_status=True)
- assert len(out['entries']) == 0
- assert len(out['summary']) == 0
-
- # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
- (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
- check_status=True)
-
- # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
- try:
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('five')
- except boto.exception.S3ResponseError as e:
- assert e.status == 403
-
- # TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
- (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1],
- check_status=True)
-
- # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('six')
-
- # TESTCASE 'gc-list', 'gc', 'list', 'get list of objects ready for garbage collection'
-
- # create an object large enough to be split into multiple parts
- test_string = 'foo'*10000000
-
- big_key = boto.s3.key.Key(bucket)
- big_key.set_contents_from_string(test_string)
-
- # now delete the head
- big_key.delete()
-
- # wait a bit to give the garbage collector time to cycle
- time.sleep(15)
-
- (err, out) = rgwadmin(ctx, client, ['gc', 'list'])
-
- assert len(out) > 0
-
- # TESTCASE 'gc-process', 'gc', 'process', 'manually collect garbage'
- (err, out) = rgwadmin(ctx, client, ['gc', 'process'], check_status=True)
-
- #confirm
- (err, out) = rgwadmin(ctx, client, ['gc', 'list'])
-
- assert len(out) == 0
-
- # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets'
- (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
- assert err
-
- # delete should fail because ``key`` still exists
- try:
- bucket.delete()
- except boto.exception.S3ResponseError as e:
- assert e.status == 409
-
- key.delete()
- bucket.delete()
-
- # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
- bucket = connection.create_bucket(bucket_name)
-
- # create an object
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('seven')
-
- # should be private already but guarantee it
- key.set_acl('private')
-
- (err, out) = rgwadmin(ctx, client,
- ['policy', '--bucket', bucket.name, '--object', key.key],
- check_status=True, format='xml')
-
- acl = get_acl(key)
-
- assert acl == out.strip('\n')
-
- # add another grantee by making the object public read
- key.set_acl('public-read')
-
- (err, out) = rgwadmin(ctx, client,
- ['policy', '--bucket', bucket.name, '--object', key.key],
- check_status=True, format='xml')
-
- acl = get_acl(key)
-
- assert acl == out.strip('\n')
-
- # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
- bucket = connection.create_bucket(bucket_name)
- key_name = ['eight', 'nine', 'ten', 'eleven']
- for i in range(4):
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string(key_name[i])
-
- (err, out) = rgwadmin(ctx, client,
- ['bucket', 'rm', '--bucket', bucket_name, '--purge-objects'],
- check_status=True)
-
- # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds'
- caps='user=read'
- (err, out) = rgwadmin(ctx, client, ['caps', 'add', '--uid', user1, '--caps', caps])
-
- assert out['caps'][0]['perm'] == 'read'
-
- # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds'
- (err, out) = rgwadmin(ctx, client, ['caps', 'rm', '--uid', user1, '--caps', caps])
-
- assert not out['caps']
-
- # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
- bucket = connection.create_bucket(bucket_name)
- key = boto.s3.key.Key(bucket)
-
- (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
- assert err
-
- # TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds'
- bucket = connection.create_bucket(bucket_name)
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('twelve')
-
- (err, out) = rgwadmin(ctx, client,
- ['user', 'rm', '--uid', user1, '--purge-data' ],
- check_status=True)
-
- # TESTCASE 'rm-user3','user','rm','deleted user','fails'
- (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
- assert err
-
- # TESTCASE 'zone-info', 'zone', 'get', 'get zone info', 'succeeds, has default placement rule'
- #
-
- if realm is None:
- (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default'])
- else:
- (err, out) = rgwadmin(ctx, client, ['zone', 'get'])
- orig_placement_pools = len(out['placement_pools'])
-
- # removed this test, it is not correct to assume that zone has default placement, it really
- # depends on how we set it up before
- #
- # assert len(out) > 0
- # assert len(out['placement_pools']) == 1
-
- # default_rule = out['placement_pools'][0]
- # assert default_rule['key'] == 'default-placement'
-
- rule={'key': 'new-placement', 'val': {'data_pool': '.rgw.buckets.2', 'index_pool': '.rgw.buckets.index.2'}}
-
- out['placement_pools'].append(rule)
-
- (err, out) = rgwadmin(ctx, client, ['zone', 'set'],
- stdin=StringIO(json.dumps(out)),
- check_status=True)
-
- if realm is None:
- (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default'])
- else:
- (err, out) = rgwadmin(ctx, client, ['zone', 'get'])
- assert len(out) > 0
- assert len(out['placement_pools']) == orig_placement_pools + 1
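-
-
-# Illustrative sketch only, not part of the original tests: the usage-flush
-# wait above is a bounded poll. Factored out, it amounts to:
-def _example_wait_for(poll, timeout=20 * 60, interval=1):
-    """Call poll() until it returns True or 'timeout' seconds elapse."""
-    start = time.time()
-    while time.time() - start <= timeout:
-        if poll():
-            return True
-        time.sleep(interval)
-    return False
-
-# e.g. the delete_obj usage check could be expressed as
-#   _example_wait_for(lambda: get_user_successful_ops(
-#       rgwadmin(ctx, client, ['usage', 'show',
-#                              '--categories', 'delete_obj'])[1], user1) > 0)
-# where ctx, client and user1 are the names used inside task() above.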
+++ /dev/null
-"""
-Run a series of rgw admin commands through the rest interface.
-
-The test cases in this file have been annotated for inventory.
-To extract the inventory (in csv format) use the command:
-
- grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
-
-"""
-from cStringIO import StringIO
-import logging
-import json
-
-import boto.exception
-import boto.s3.connection
-import boto.s3.acl
-
-import requests
-import time
-
-from boto.connection import AWSAuthConnection
-from teuthology import misc as teuthology
-from util.rgw import get_user_summary, get_user_successful_ops
-
-log = logging.getLogger(__name__)
-
-def rgwadmin(ctx, client, cmd):
- """
- Perform rgw admin command
-
- :param client: client
- :param cmd: command to execute.
- :return: command exit status, json result.
- """
- log.info('radosgw-admin: %s' % cmd)
- testdir = teuthology.get_testdir(ctx)
- pre = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'radosgw-admin',
- '--log-to-stderr',
- '--format', 'json',
- ]
- pre.extend(cmd)
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
- proc = remote.run(
- args=pre,
- check_status=False,
- stdout=StringIO(),
- stderr=StringIO(),
- )
- r = proc.exitstatus
- out = proc.stdout.getvalue()
- j = None
- if not r and out != '':
- try:
- j = json.loads(out)
- log.info(' json result: %s' % j)
- except ValueError:
- j = out
- log.info(' raw result: %s' % j)
- return (r, j)
-
-
-def rgwadmin_rest(connection, cmd, params=None, headers=None, raw=False):
- """
-    Perform a REST admin command.
- """
- log.info('radosgw-admin-rest: %s %s' % (cmd, params))
- put_cmds = ['create', 'link', 'add']
- post_cmds = ['unlink', 'modify']
- delete_cmds = ['trim', 'rm', 'process']
- get_cmds = ['check', 'info', 'show', 'list']
-
- bucket_sub_resources = ['object', 'policy', 'index']
- user_sub_resources = ['subuser', 'key', 'caps']
- zone_sub_resources = ['pool', 'log', 'garbage']
-
- def get_cmd_method_and_handler(cmd):
- """
- Get the rest command and handler from information in cmd and
- from the imported requests object.
- """
- if cmd[1] in put_cmds:
- return 'PUT', requests.put
- elif cmd[1] in delete_cmds:
- return 'DELETE', requests.delete
- elif cmd[1] in post_cmds:
- return 'POST', requests.post
- elif cmd[1] in get_cmds:
- return 'GET', requests.get
-
- def get_resource(cmd):
- """
- Get the name of the resource from information in cmd.
- """
- if cmd[0] == 'bucket' or cmd[0] in bucket_sub_resources:
- if cmd[0] == 'bucket':
- return 'bucket', ''
- else:
- return 'bucket', cmd[0]
- elif cmd[0] == 'user' or cmd[0] in user_sub_resources:
- if cmd[0] == 'user':
- return 'user', ''
- else:
- return 'user', cmd[0]
- elif cmd[0] == 'usage':
- return 'usage', ''
- elif cmd[0] == 'zone' or cmd[0] in zone_sub_resources:
- if cmd[0] == 'zone':
- return 'zone', ''
- else:
- return 'zone', cmd[0]
-
- def build_admin_request(conn, method, resource = '', headers=None, data='',
- query_args=None, params=None):
- """
- Build an administrative request adapted from the build_request()
- method of boto.connection
- """
-
- path = conn.calling_format.build_path_base('admin', resource)
- auth_path = conn.calling_format.build_auth_path('admin', resource)
- host = conn.calling_format.build_host(conn.server_name(), 'admin')
- if query_args:
- path += '?' + query_args
- boto.log.debug('path=%s' % path)
- auth_path += '?' + query_args
- boto.log.debug('auth_path=%s' % auth_path)
- return AWSAuthConnection.build_base_http_request(conn, method, path,
- auth_path, params, headers, data, host)
-
- method, handler = get_cmd_method_and_handler(cmd)
- resource, query_args = get_resource(cmd)
- request = build_admin_request(connection, method, resource,
- query_args=query_args, headers=headers)
-
- url = '{protocol}://{host}{path}'.format(protocol=request.protocol,
- host=request.host, path=request.path)
-
- request.authorize(connection=connection)
- result = handler(url, params=params, headers=request.headers)
-
- if raw:
- log.info(' text result: %s' % result.text)
- return result.status_code, result.text
- else:
- log.info(' json result: %s' % result.json())
- return result.status_code, result.json()
-
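-# For example, rgwadmin_rest(admin_conn, ['user', 'info'], {'uid': 'foo'})
-# resolves to "GET /admin/user" with uid=foo passed as a query parameter,
-# while ['key', 'rm'] resolves to "DELETE /admin/user?key", since 'key' is
-# handled as a user sub-resource above.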
-
-def task(ctx, config):
- """
- Test radosgw-admin functionality through the RESTful interface
- """
- assert config is None or isinstance(config, list) \
- or isinstance(config, dict), \
- "task s3tests only supports a list or dictionary for configuration"
- all_clients = ['client.{id}'.format(id=id_)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- if config is None:
- config = all_clients
- if isinstance(config, list):
- config = dict.fromkeys(config)
- clients = config.keys()
-
- # just use the first client...
- client = clients[0]
-
- ##
- admin_user = 'ada'
- admin_display_name = 'Ms. Admin User'
- admin_access_key = 'MH1WC2XQ1S8UISFDZC8W'
- admin_secret_key = 'dQyrTPA0s248YeN5bBv4ukvKU0kh54LWWywkrpoG'
- admin_caps = 'users=read, write; usage=read, write; buckets=read, write; zone=read, write'
-
- user1 = 'foo'
- user2 = 'fud'
- subuser1 = 'foo:foo1'
- subuser2 = 'foo:foo2'
- display_name1 = 'Foo'
- display_name2 = 'Fud'
- email = 'foo@foo.com'
- access_key = '9te6NH5mcdcq0Tc5i8i1'
- secret_key = 'Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu'
- access_key2 = 'p5YnriCv1nAtykxBrupQ'
- secret_key2 = 'Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh'
- swift_secret1 = 'gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL'
- swift_secret2 = 'ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy'
-
- bucket_name = 'myfoo'
-
- # legend (test cases can be easily grep-ed out)
- # TESTCASE 'testname','object','method','operation','assertion'
- # TESTCASE 'create-admin-user','user','create','administrative user','succeeds'
- (err, out) = rgwadmin(ctx, client, [
- 'user', 'create',
- '--uid', admin_user,
- '--display-name', admin_display_name,
- '--access-key', admin_access_key,
- '--secret', admin_secret_key,
- '--max-buckets', '0',
- '--caps', admin_caps
- ])
- logging.error(out)
- logging.error(err)
- assert not err
-
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
- remote_host = remote.name.split('@')[1]
- admin_conn = boto.s3.connection.S3Connection(
- aws_access_key_id=admin_access_key,
- aws_secret_access_key=admin_secret_key,
- is_secure=False,
- port=7280,
- host=remote_host,
- calling_format=boto.s3.connection.OrdinaryCallingFormat(),
- )
-
- # TESTCASE 'info-nosuch','user','info','non-existent user','fails'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {"uid": user1})
- assert ret == 404
-
- # TESTCASE 'create-ok','user','create','w/all valid info','succeeds'
- (ret, out) = rgwadmin_rest(admin_conn,
- ['user', 'create'],
- {'uid' : user1,
- 'display-name' : display_name1,
- 'email' : email,
- 'access-key' : access_key,
- 'secret-key' : secret_key,
- 'max-buckets' : '4'
- })
-
- assert ret == 200
-
- # TESTCASE 'info-existing','user','info','existing user','returns correct info'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
-
- assert out['user_id'] == user1
- assert out['email'] == email
- assert out['display_name'] == display_name1
- assert len(out['keys']) == 1
- assert out['keys'][0]['access_key'] == access_key
- assert out['keys'][0]['secret_key'] == secret_key
- assert not out['suspended']
-
- # TESTCASE 'suspend-ok','user','suspend','active user','succeeds'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True})
- assert ret == 200
-
- # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
- assert ret == 200
- assert out['suspended']
-
- # TESTCASE 're-enable','user','enable','suspended user','succeeds'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'})
- assert ret == 200
-
- # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
- assert ret == 200
- assert not out['suspended']
-
- # TESTCASE 'add-keys','key','create','w/valid info','succeeds'
- (ret, out) = rgwadmin_rest(admin_conn,
- ['key', 'create'],
- {'uid' : user1,
- 'access-key' : access_key2,
- 'secret-key' : secret_key2
- })
-
-
- assert ret == 200
-
- # TESTCASE 'info-new-key','user','info','after key addition','returns all keys'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
- assert ret == 200
- assert len(out['keys']) == 2
- assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2
- assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2
-
- # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed'
- (ret, out) = rgwadmin_rest(admin_conn,
- ['key', 'rm'],
- {'uid' : user1,
- 'access-key' : access_key2
- })
-
- assert ret == 200
-
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
-
- assert len(out['keys']) == 1
- assert out['keys'][0]['access_key'] == access_key
- assert out['keys'][0]['secret_key'] == secret_key
-
- # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
- (ret, out) = rgwadmin_rest(admin_conn,
- ['subuser', 'create'],
- {'subuser' : subuser1,
- 'secret-key' : swift_secret1,
- 'key-type' : 'swift'
- })
-
- assert ret == 200
-
- # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
- assert ret == 200
- assert len(out['swift_keys']) == 1
- assert out['swift_keys'][0]['user'] == subuser1
- assert out['swift_keys'][0]['secret_key'] == swift_secret1
-
- # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds'
- (ret, out) = rgwadmin_rest(admin_conn,
- ['subuser', 'create'],
- {'subuser' : subuser2,
- 'secret-key' : swift_secret2,
- 'key-type' : 'swift'
- })
-
- assert ret == 200
-
- # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
- assert ret == 200
- assert len(out['swift_keys']) == 2
- assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2
- assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2
-
- # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed'
- (ret, out) = rgwadmin_rest(admin_conn,
- ['key', 'rm'],
- {'subuser' : subuser1,
- 'key-type' :'swift'
- })
-
- assert ret == 200
-
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
- assert len(out['swift_keys']) == 1
-
- # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed'
- (ret, out) = rgwadmin_rest(admin_conn,
- ['subuser', 'rm'],
- {'subuser' : subuser1
- })
-
- assert ret == 200
-
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
- assert len(out['subusers']) == 1
-
- # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subuser and key are removed'
- (ret, out) = rgwadmin_rest(admin_conn,
- ['subuser', 'rm'],
- {'subuser' : subuser2,
- 'key-type' : 'swift',
- 'purge-keys' : True
- })
-
- assert ret == 200
-
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
- assert len(out['swift_keys']) == 0
- assert len(out['subusers']) == 0
-
- # TESTCASE 'bucket-stats','bucket','info','no session/buckets','succeeds, empty list'
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1})
- assert ret == 200
- assert len(out) == 0
-
- # connect to rgw
- connection = boto.s3.connection.S3Connection(
- aws_access_key_id=access_key,
- aws_secret_access_key=secret_key,
- is_secure=False,
- port=7280,
- host=remote_host,
- calling_format=boto.s3.connection.OrdinaryCallingFormat(),
- )
-
- # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list'
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True})
- assert ret == 200
- assert len(out) == 0
-
- # create a first bucket
- bucket = connection.create_bucket(bucket_name)
-
- # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1})
- assert ret == 200
- assert len(out) == 1
- assert out[0] == bucket_name
-
- # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
- (ret, out) = rgwadmin_rest(admin_conn,
- ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
-
- assert ret == 200
- assert out['owner'] == user1
- bucket_id = out['id']
-
- # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID'
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True})
- assert ret == 200
- assert len(out) == 1
- assert out[0]['id'] == bucket_id # does it return the same ID twice in a row?
-
- # use some space
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('one')
-
- # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
- assert ret == 200
- assert out['id'] == bucket_id
- assert out['usage']['rgw.main']['num_objects'] == 1
- assert out['usage']['rgw.main']['size_kb'] > 0
-
- # reclaim it
- key.delete()
-
- # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'succeeds, bucket unlinked'
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'unlink'], {'uid' : user1, 'bucket' : bucket_name})
-
- assert ret == 200
-
- # create a second user to link the bucket to
- (ret, out) = rgwadmin_rest(admin_conn,
- ['user', 'create'],
- {'uid' : user2,
- 'display-name' : display_name2,
- 'access-key' : access_key2,
- 'secret-key' : secret_key2,
- 'max-buckets' : '1',
- })
-
- assert ret == 200
-
- # try creating an object with the first user before the bucket is relinked
- denied = False
- key = boto.s3.key.Key(bucket)
-
- try:
- key.set_contents_from_string('two')
- except boto.exception.S3ResponseError:
- denied = True
-
- assert not denied
-
- # delete the object
- key.delete()
-
- # link the bucket to another user
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user2, 'bucket' : bucket_name})
-
- assert ret == 200
-
- # try creating an object with the first user which should cause an error
- key = boto.s3.key.Key(bucket)
-
- try:
- key.set_contents_from_string('three')
- except boto.exception.S3ResponseError:
- denied = True
-
- assert denied
-
- # relink the bucket to the first user and delete the second user
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user1, 'bucket' : bucket_name})
- assert ret == 200
-
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user2})
- assert ret == 200
-
- # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed'
-
- # upload an object
- object_name = 'four'
- key = boto.s3.key.Key(bucket, object_name)
- key.set_contents_from_string(object_name)
-
- # now delete it
- (ret, out) = rgwadmin_rest(admin_conn, ['object', 'rm'], {'bucket' : bucket_name, 'object' : object_name})
- assert ret == 200
-
- # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists no objects'
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
- assert ret == 200
- assert out['id'] == bucket_id
- assert out['usage']['rgw.main']['num_objects'] == 0
-
- # create a bucket for deletion stats
- useless_bucket = connection.create_bucket('useless_bucket')
- useless_key = useless_bucket.new_key('useless_key')
- useless_key.set_contents_from_string('useless string')
-
- # delete it
- useless_key.delete()
- useless_bucket.delete()
-
- # wait for the statistics to flush
- time.sleep(60)
-
- # need to wait for all usage data to get flushed; poll for up to 20 minutes
- timestamp = time.time()
- while time.time() - timestamp <= (20 * 60): # wait up to 20 minutes
- (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'categories' : 'delete_obj'}) # last operation we did is delete obj, wait for it to flush
-
- if get_user_successful_ops(out, user1) > 0:
- break
- time.sleep(1)
-
- assert time.time() - timestamp <= (20 * 60)
-
- # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
- (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'])
- assert ret == 200
- assert len(out['entries']) > 0
- assert len(out['summary']) > 0
- user_summary = get_user_summary(out, user1)
- total = user_summary['total']
- assert total['successful_ops'] > 0
-
- # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
- (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1})
- assert ret == 200
- assert len(out['entries']) > 0
- assert len(out['summary']) > 0
- user_summary = out['summary'][0]
- for entry in user_summary['categories']:
- assert entry['successful_ops'] > 0
- assert user_summary['user'] == user1
-
- # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
- test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
- for cat in test_categories:
- (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1, 'categories' : cat})
- assert ret == 200
- assert len(out['summary']) > 0
- user_summary = out['summary'][0]
- assert user_summary['user'] == user1
- assert len(user_summary['categories']) == 1
- entry = user_summary['categories'][0]
- assert entry['category'] == cat
- assert entry['successful_ops'] > 0
-
- # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
- (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'trim'], {'uid' : user1})
- assert ret == 200
- (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1})
- assert ret == 200
- assert len(out['entries']) == 0
- assert len(out['summary']) == 0
-
- # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True})
- assert ret == 200
-
- # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
- try:
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('five')
- except boto.exception.S3ResponseError as e:
- assert e.status == 403
-
- # TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'})
- assert ret == 200
-
- # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('six')
-
- # TESTCASE 'garbage-list', 'garbage', 'list', 'get list of objects ready for garbage collection'
-
- # create an object large enough to be split into multiple parts
- test_string = 'foo'*10000000
-
- big_key = boto.s3.key.Key(bucket)
- big_key.set_contents_from_string(test_string)
-
- # now delete the head
- big_key.delete()
-
- # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1})
- assert ret == 409
-
- # delete should fail because ``key`` still exists
- try:
- bucket.delete()
- except boto.exception.S3ResponseError as e:
- assert e.status == 409
-
- key.delete()
- bucket.delete()
-
- # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
- bucket = connection.create_bucket(bucket_name)
-
- # create an object
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('seven')
-
- # should be private already but guarantee it
- key.set_acl('private')
-
- (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key})
- assert ret == 200
-
- acl = key.get_xml_acl()
- assert acl == out.strip('\n')
-
- # add another grantee by making the object public read
- key.set_acl('public-read')
-
- (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key})
- assert ret == 200
-
- acl = key.get_xml_acl()
- assert acl == out.strip('\n')
-
- # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
- bucket = connection.create_bucket(bucket_name)
- key_name = ['eight', 'nine', 'ten', 'eleven']
- for i in range(4):
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string(key_name[i])
-
- (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'rm'], {'bucket' : bucket_name, 'purge-objects' : True})
- assert ret == 200
-
- # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds'
- caps = 'usage=read'
- (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'add'], {'uid' : user1, 'user-caps' : caps})
- assert ret == 200
- assert out[0]['perm'] == 'read'
-
- # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds'
- (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'rm'], {'uid' : user1, 'user-caps' : caps})
- assert ret == 200
- assert not out
-
- # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
- bucket = connection.create_bucket(bucket_name)
- key = boto.s3.key.Key(bucket)
-
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1})
- assert ret == 409
-
- # TESTCASE 'rm-user2', 'user', 'rm', user with data', 'succeeds'
- bucket = connection.create_bucket(bucket_name)
- key = boto.s3.key.Key(bucket)
- key.set_contents_from_string('twelve')
-
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1, 'purge-data' : True})
- assert ret == 200
-
- # TESTCASE 'rm-user3','user','info','deleted user','fails'
- (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
- assert ret == 404
-
+++ /dev/null
-"""
-Run rados gateway agent in test mode
-"""
-import contextlib
-import logging
-import argparse
-
-from teuthology.orchestra import run
-from teuthology import misc as teuthology
-import util.rgw as rgw_utils
-
-log = logging.getLogger(__name__)
-
-def run_radosgw_agent(ctx, config):
- """
- Run a single radosgw-agent. See task() for config format.
- """
- return_list = list()
- for (client, cconf) in config.items():
- # don't process entries that are not clients
- if not client.startswith('client.'):
- log.debug('key {data} does not start with \'client.\', moving on'.format(
- data=client))
- continue
-
- src_client = cconf['src']
- dest_client = cconf['dest']
-
- src_zone = rgw_utils.zone_for_client(ctx, src_client)
- dest_zone = rgw_utils.zone_for_client(ctx, dest_client)
-
- log.info("source is %s", src_zone)
- log.info("dest is %s", dest_zone)
-
- testdir = teuthology.get_testdir(ctx)
- (remote,) = ctx.cluster.only(client).remotes.keys()
- # figure out which branch to pull from
- branch = cconf.get('force-branch', None)
- if not branch:
- branch = cconf.get('branch', 'master')
- sha1 = cconf.get('sha1')
- remote.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'git', 'clone',
- '-b', branch,
-# 'https://github.com/ceph/radosgw-agent.git',
- 'git://git.ceph.com/radosgw-agent.git',
- 'radosgw-agent.{client}'.format(client=client),
- ]
- )
- if sha1 is not None:
- remote.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- run.Raw('&&'),
- 'git', 'reset', '--hard', sha1,
- ]
- )
- remote.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'radosgw-agent.{client}'.format(client=client),
- run.Raw('&&'),
- './bootstrap',
- ]
- )
-
- src_host, src_port = rgw_utils.get_zone_host_and_port(ctx, src_client,
- src_zone)
- dest_host, dest_port = rgw_utils.get_zone_host_and_port(ctx, dest_client,
- dest_zone)
- src_access, src_secret = rgw_utils.get_zone_system_keys(ctx, src_client,
- src_zone)
- dest_access, dest_secret = rgw_utils.get_zone_system_keys(ctx, dest_client,
- dest_zone)
- sync_scope = cconf.get('sync-scope', None)
- port = cconf.get('port', 8000)
- daemon_name = '{host}.{port}.syncdaemon'.format(host=remote.name, port=port)
- in_args=[
- 'daemon-helper',
- 'kill',
- '{tdir}/radosgw-agent.{client}/radosgw-agent'.format(tdir=testdir,
- client=client),
- '-v',
- '--src-access-key', src_access,
- '--src-secret-key', src_secret,
- '--source', "http://{addr}:{port}".format(addr=src_host, port=src_port),
- '--dest-access-key', dest_access,
- '--dest-secret-key', dest_secret,
- '--max-entries', str(cconf.get('max-entries', 1000)),
- '--log-file', '{tdir}/archive/rgw_sync_agent.{client}.log'.format(
- tdir=testdir,
- client=client),
- '--object-sync-timeout', '30',
- ]
-
- if cconf.get('metadata-only', False):
- in_args.append('--metadata-only')
-
- # the test server and full/incremental flags are mutually exclusive
- if sync_scope is None:
- in_args.append('--test-server-host')
- in_args.append('0.0.0.0')
- in_args.append('--test-server-port')
- in_args.append(str(port))
- log.debug('Starting a sync test server on {client}'.format(client=client))
- # Stash the radosgw-agent server / port # for use by subsequent tasks
- ctx.radosgw_agent.endpoint = (client, str(port))
- else:
- in_args.append('--sync-scope')
- in_args.append(sync_scope)
- log.debug('Starting a {scope} sync on {client}'.format(scope=sync_scope,client=client))
-
- # positional arg for destination must come last
- in_args.append("http://{addr}:{port}".format(addr=dest_host,
- port=dest_port))
-
- return_list.append((client, remote.run(
- args=in_args,
- wait=False,
- stdin=run.PIPE,
- logger=log.getChild(daemon_name),
- )))
- return return_list
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run radosgw-agents in test mode.
-
- Configuration is clients to run the agents on, with settings for
- source client, destination client, and port to listen on. Binds
- to 0.0.0.0. Port defaults to 8000. This must be run on clients
- that have the correct zone root pools and rgw zone set in
- ceph.conf, or the task cannot read the region information from the
- cluster.
-
- By default, this task will start an HTTP server that will trigger full
- or incremental syncs based on requests made to it.
- Alternatively, a single full sync can be triggered by
- specifying 'sync-scope: full' or a loop of incremental syncs can be triggered
- by specifying 'sync-scope: incremental' (the loop will sleep
- '--incremental-sync-delay' seconds between each sync, default is 30 seconds).
-
- By default, both data and metadata are synced. To only sync
- metadata, for example because you want to sync between regions,
- set metadata-only: true.
-
- An example::
-
- tasks:
- - ceph:
- conf:
- client.0:
- rgw zone = foo
- rgw zone root pool = .root.pool
- client.1:
- rgw zone = bar
- rgw zone root pool = .root.pool2
- - rgw: # region configuration omitted for brevity
- - radosgw-agent:
- client.0:
- branch: wip-next-feature-branch
- src: client.0
- dest: client.1
- sync-scope: full
- metadata-only: true
- # port: 8000 (default)
- client.1:
- src: client.1
- dest: client.0
- port: 8001
- """
- assert isinstance(config, dict), 'rgw_sync_agent requires a dictionary config'
- log.debug("config is %s", config)
-
- overrides = ctx.config.get('overrides', {})
- # merge each client section, but only if it exists in config since there isn't
- # a sensible default action for this task
- for client in config.iterkeys():
- if config[client]:
- log.debug('config[{client}]: {data}'.format(client=client, data=config[client]))
- teuthology.deep_merge(config[client], overrides.get('radosgw-agent', {}))
-
- ctx.radosgw_agent = argparse.Namespace()
- ctx.radosgw_agent.config = config
-
- procs = run_radosgw_agent(ctx, config)
-
- ctx.radosgw_agent.procs = procs
-
- try:
- yield
- finally:
- testdir = teuthology.get_testdir(ctx)
- try:
- for client, proc in procs:
- log.info("shutting down sync agent on %s", client)
- proc.stdin.close()
- proc.wait()
- finally:
- for client, proc in procs:
- ctx.cluster.only(client).run(
- args=[
- 'rm', '-rf',
- '{tdir}/radosgw-agent.{client}'.format(tdir=testdir,
- client=client)
- ]
- )
+++ /dev/null
-"""
-Rbd testing task
-"""
-import contextlib
-import logging
-import os
-
-from cStringIO import StringIO
-from teuthology.orchestra import run
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology.parallel import parallel
-from teuthology.task.common_fs_utils import generic_mkfs
-from teuthology.task.common_fs_utils import generic_mount
-from teuthology.task.common_fs_utils import default_image_name
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def create_image(ctx, config):
- """
- Create an rbd image.
-
- For example::
-
- tasks:
- - ceph:
- - rbd.create_image:
- client.0:
- image_name: testimage
- image_size: 100
- image_format: 1
- client.1:
-
- Image size is expressed as a number of megabytes; default value
- is 10240.
-
- Image format value must be either 1 or 2; default value is 1.
-
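- As a rough sketch, the client.0 entry above results in a command like
- ``rbd -p rbd create --size 100 testimage`` being run on that client
- (the --image-format option is only added for non-default formats).
-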
- """
- assert isinstance(config, dict) or isinstance(config, list), \
- "task create_image only supports a list or dictionary for configuration"
-
- if isinstance(config, dict):
- images = config.items()
- else:
- images = [(role, None) for role in config]
-
- testdir = teuthology.get_testdir(ctx)
- for role, properties in images:
- if properties is None:
- properties = {}
- name = properties.get('image_name', default_image_name(role))
- size = properties.get('image_size', 10240)
- fmt = properties.get('image_format', 1)
- (remote,) = ctx.cluster.only(role).remotes.keys()
- log.info('Creating image {name} with size {size}'.format(name=name,
- size=size))
- args = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rbd',
- '-p', 'rbd',
- 'create',
- '--size', str(size),
- name,
- ]
- # omit format option if using the default (format 1)
- # since old versions of rbd don't support it
- if int(fmt) != 1:
- args += ['--image-format', str(fmt)]
- remote.run(args=args)
- try:
- yield
- finally:
- log.info('Deleting rbd images...')
- for role, properties in images:
- if properties is None:
- properties = {}
- name = properties.get('image_name', default_image_name(role))
- (remote,) = ctx.cluster.only(role).remotes.keys()
- remote.run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rbd',
- '-p', 'rbd',
- 'rm',
- name,
- ],
- )
-
-@contextlib.contextmanager
-def clone_image(ctx, config):
- """
- Clone a parent image.
-
- For example::
-
- tasks:
- - ceph:
- - rbd.clone_image:
- client.0:
- parent_name: testimage
- image_name: cloneimage
- """
- assert isinstance(config, dict) or isinstance(config, list), \
- "task clone_image only supports a list or dictionary for configuration"
-
- if isinstance(config, dict):
- images = config.items()
- else:
- images = [(role, None) for role in config]
-
- testdir = teuthology.get_testdir(ctx)
- for role, properties in images:
- if properties is None:
- properties = {}
-
- name = properties.get('image_name', default_image_name(role))
- parent_name = properties.get('parent_name')
- assert parent_name is not None, \
- "parent_name is required"
- parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name)
-
- (remote,) = ctx.cluster.only(role).remotes.keys()
- log.info('Clone image {parent} to {child}'.format(parent=parent_name,
- child=name))
- for cmd in [('snap', 'create', parent_spec),
- ('snap', 'protect', parent_spec),
- ('clone', parent_spec, name)]:
- args = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rbd', '-p', 'rbd'
- ]
- args.extend(cmd)
- remote.run(args=args)
-
- try:
- yield
- finally:
- log.info('Deleting rbd clones...')
- for role, properties in images:
- if properties is None:
- properties = {}
- name = properties.get('image_name', default_image_name(role))
- parent_name = properties.get('parent_name')
- parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name)
-
- (remote,) = ctx.cluster.only(role).remotes.keys()
-
- for cmd in [('rm', name),
- ('snap', 'unprotect', parent_spec),
- ('snap', 'rm', parent_spec)]:
- args = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rbd', '-p', 'rbd'
- ]
- args.extend(cmd)
- remote.run(args=args)
-
-@contextlib.contextmanager
-def modprobe(ctx, config):
- """
- Load the rbd kernel module.
-
- For example::
-
- tasks:
- - ceph:
- - rbd.create_image: [client.0]
- - rbd.modprobe: [client.0]
- """
- log.info('Loading rbd kernel module...')
- for role in config:
- (remote,) = ctx.cluster.only(role).remotes.keys()
- remote.run(
- args=[
- 'sudo',
- 'modprobe',
- 'rbd',
- ],
- )
- try:
- yield
- finally:
- log.info('Unloading rbd kernel module...')
- for role in config:
- (remote,) = ctx.cluster.only(role).remotes.keys()
- remote.run(
- args=[
- 'sudo',
- 'modprobe',
- '-r',
- 'rbd',
- # force errors to be ignored; necessary if more
- # than one device was created, which may mean
- # the module isn't quite ready to go the first
- # time through.
- run.Raw('||'),
- 'true',
- ],
- )
-
-@contextlib.contextmanager
-def dev_create(ctx, config):
- """
- Map block devices to rbd images.
-
- For example::
-
- tasks:
- - ceph:
- - rbd.create_image: [client.0]
- - rbd.modprobe: [client.0]
- - rbd.dev_create:
- client.0: testimage.client.0
- """
- assert isinstance(config, dict) or isinstance(config, list), \
- "task dev_create only supports a list or dictionary for configuration"
-
- if isinstance(config, dict):
- role_images = config.items()
- else:
- role_images = [(role, None) for role in config]
-
- log.info('Creating rbd block devices...')
-
- testdir = teuthology.get_testdir(ctx)
-
- for role, image in role_images:
- if image is None:
- image = default_image_name(role)
- (remote,) = ctx.cluster.only(role).remotes.keys()
-
- remote.run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rbd',
- '--user', role.rsplit('.')[-1],
- '-p', 'rbd',
- 'map',
- image,
- run.Raw('&&'),
- # wait for the symlink to be created by udev
- 'while', 'test', '!', '-e', '/dev/rbd/rbd/{image}'.format(image=image), run.Raw(';'), 'do',
- 'sleep', '1', run.Raw(';'),
- 'done',
- ],
- )
- try:
- yield
- finally:
- log.info('Unmapping rbd devices...')
- for role, image in role_images:
- if image is None:
- image = default_image_name(role)
- (remote,) = ctx.cluster.only(role).remotes.keys()
- remote.run(
- args=[
- 'LD_LIBRARY_PATH={tdir}/binary/usr/local/lib'.format(tdir=testdir),
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rbd',
- '-p', 'rbd',
- 'unmap',
- '/dev/rbd/rbd/{imgname}'.format(imgname=image),
- run.Raw('&&'),
- # wait for the symlink to be deleted by udev
- 'while', 'test', '-e', '/dev/rbd/rbd/{image}'.format(image=image),
- run.Raw(';'),
- 'do',
- 'sleep', '1', run.Raw(';'),
- 'done',
- ],
- )
-
-
-def rbd_devname_rtn(ctx, image):
- return '/dev/rbd/rbd/{image}'.format(image=image)
-
-def canonical_path(ctx, role, path):
- """
- Determine the canonical path for a given path on the host
- representing the given role. A canonical path contains no
- . or .. components, and includes no symbolic links.
- """
- version_fp = StringIO()
- ctx.cluster.only(role).run(
- args=[ 'readlink', '-f', path ],
- stdout=version_fp,
- )
- canonical_path = version_fp.getvalue().rstrip('\n')
- version_fp.close()
- return canonical_path
-
-@contextlib.contextmanager
-def run_xfstests(ctx, config):
- """
- Run xfstests over specified devices.
-
- Warning: both the test and scratch devices specified will be
- overwritten. Normally xfstests modifies (but does not destroy)
- the test device, but for now the run script used here re-makes
- both filesystems.
-
- Note: Only one instance of xfstests can run on a single host at
- a time, although this is not enforced.
-
- This task in its current form needs some improvement. For
- example, it assumes all roles provided in the config are
- clients, and that the config provided is a list of key/value
- pairs. For now please use the xfstests() interface, below.
-
- For example::
-
- tasks:
- - ceph:
- - rbd.run_xfstests:
- client.0:
- count: 2
- test_dev: 'test_dev'
- scratch_dev: 'scratch_dev'
- fs_type: 'xfs'
- tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
- randomize: true
- """
- with parallel() as p:
- for role, properties in config.items():
- p.spawn(run_xfstests_one_client, ctx, role, properties)
- yield
-
-def run_xfstests_one_client(ctx, role, properties):
- """
- Spawned routine to handle xfs tests for a single client
- """
- testdir = teuthology.get_testdir(ctx)
- try:
- count = properties.get('count')
- test_dev = properties.get('test_dev')
- assert test_dev is not None, \
- "task run_xfstests requires test_dev to be defined"
- test_dev = canonical_path(ctx, role, test_dev)
-
- scratch_dev = properties.get('scratch_dev')
- assert scratch_dev is not None, \
- "task run_xfstests requires scratch_dev to be defined"
- scratch_dev = canonical_path(ctx, role, scratch_dev)
-
- fs_type = properties.get('fs_type')
- tests = properties.get('tests')
- randomize = properties.get('randomize')
-
-
- (remote,) = ctx.cluster.only(role).remotes.keys()
-
- # Fetch the test script
- test_root = teuthology.get_testdir(ctx)
- test_script = 'run_xfstests_krbd.sh'
- test_path = os.path.join(test_root, test_script)
-
- xfstests_url = properties.get('xfstests_url')
- assert xfstests_url is not None, \
- "task run_xfstests requires xfstests_url to be defined"
-
- xfstests_krbd_url = xfstests_url + '/' + test_script
-
- log.info('Fetching {script} for {role} from {url}'.format(
- script=test_script,
- role=role,
- url=xfstests_krbd_url))
-
- args = [ 'wget', '-O', test_path, '--', xfstests_krbd_url ]
- remote.run(args=args)
-
- log.info('Running xfstests on {role}:'.format(role=role))
- log.info(' iteration count: {count}:'.format(count=count))
- log.info(' test device: {dev}'.format(dev=test_dev))
- log.info(' scratch device: {dev}'.format(dev=scratch_dev))
- log.info(' using fs_type: {fs_type}'.format(fs_type=fs_type))
- log.info(' tests to run: {tests}'.format(tests=tests))
- log.info(' randomize: {randomize}'.format(randomize=randomize))
-
- # Note that the device paths are interpreted using
- # readlink -f <path> in order to get their canonical
- # pathname (so it matches what the kernel remembers).
- args = [
- '/usr/bin/sudo',
- 'TESTDIR={tdir}'.format(tdir=testdir),
- 'URL_BASE={url}'.format(url=xfstests_url),
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- '/bin/bash',
- test_path,
- '-c', str(count),
- '-f', fs_type,
- '-t', test_dev,
- '-s', scratch_dev,
- ]
- if randomize:
- args.append('-r')
- if tests:
- args.extend(['--', tests])
- remote.run(args=args, logger=log.getChild(role))
- finally:
- log.info('Removing {script} on {role}'.format(script=test_script,
- role=role))
- remote.run(args=['rm', '-f', test_path])
-
-@contextlib.contextmanager
-def xfstests(ctx, config):
- """
- Run xfstests over rbd devices. This interface sets up all
- required configuration automatically if not otherwise specified.
- Note that only one instance of xfstests can run on a single host
- at a time. By default, the set of tests specified is run once.
- If a (non-zero) count value is supplied, the complete set of
- tests will be run that number of times.
-
- For example::
-
- tasks:
- - ceph:
- # Image sizes are in MB
- - rbd.xfstests:
- client.0:
- count: 3
- test_image: 'test_image'
- test_size: 250
- test_format: 2
- scratch_image: 'scratch_image'
- scratch_size: 250
- scratch_format: 1
- fs_type: 'xfs'
- tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
- randomize: true
- xfstests_branch: master
- xfstests_url: 'https://raw.github.com/ceph/ceph/master/qa'
- """
- if config is None:
- config = { 'all': None }
- assert isinstance(config, dict) or isinstance(config, list), \
- "task xfstests only supports a list or dictionary for configuration"
- if isinstance(config, dict):
- config = teuthology.replace_all_with_clients(ctx.cluster, config)
- runs = config.items()
- else:
- runs = [(role, None) for role in config]
-
- running_xfstests = {}
- for role, properties in runs:
- assert role.startswith('client.'), \
- "task xfstests can only run on client nodes"
- for host, roles_for_host in ctx.cluster.remotes.items():
- if role in roles_for_host:
- assert host not in running_xfstests, \
- "task xfstests allows only one instance at a time per host"
- running_xfstests[host] = True
-
- images_config = {}
- scratch_config = {}
- modprobe_config = {}
- image_map_config = {}
- scratch_map_config = {}
- xfstests_config = {}
- for role, properties in runs:
- if properties is None:
- properties = {}
-
- test_image = properties.get('test_image', 'test_image.{role}'.format(role=role))
- test_size = properties.get('test_size', 10000) # 10G
- test_fmt = properties.get('test_format', 1)
- scratch_image = properties.get('scratch_image', 'scratch_image.{role}'.format(role=role))
- scratch_size = properties.get('scratch_size', 10000) # 10G
- scratch_fmt = properties.get('scratch_format', 1)
-
- images_config[role] = dict(
- image_name=test_image,
- image_size=test_size,
- image_format=test_fmt,
- )
-
- scratch_config[role] = dict(
- image_name=scratch_image,
- image_size=scratch_size,
- image_format=scratch_fmt,
- )
-
- xfstests_branch = properties.get('xfstests_branch', 'master')
- xfstests_url = properties.get('xfstests_url', 'https://raw.github.com/ceph/ceph/{branch}/qa'.format(branch=xfstests_branch))
-
- xfstests_config[role] = dict(
- count=properties.get('count', 1),
- test_dev='/dev/rbd/rbd/{image}'.format(image=test_image),
- scratch_dev='/dev/rbd/rbd/{image}'.format(image=scratch_image),
- fs_type=properties.get('fs_type', 'xfs'),
- randomize=properties.get('randomize', False),
- tests=properties.get('tests'),
- xfstests_url=xfstests_url,
- )
-
- log.info('Setting up xfstests using RBD images:')
- log.info(' test ({size} MB): {image}'.format(size=test_size,
- image=test_image))
- log.info(' scratch ({size} MB): {image}'.format(size=scratch_size,
- image=scratch_image))
- modprobe_config[role] = None
- image_map_config[role] = test_image
- scratch_map_config[role] = scratch_image
-
- with contextutil.nested(
- lambda: create_image(ctx=ctx, config=images_config),
- lambda: create_image(ctx=ctx, config=scratch_config),
- lambda: modprobe(ctx=ctx, config=modprobe_config),
- lambda: dev_create(ctx=ctx, config=image_map_config),
- lambda: dev_create(ctx=ctx, config=scratch_map_config),
- lambda: run_xfstests(ctx=ctx, config=xfstests_config),
- ):
- yield
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Create and mount an rbd image.
-
- For example, you can specify which clients to run on::
-
- tasks:
- - ceph:
- - rbd: [client.0, client.1]
-
- There are a few image options::
-
- tasks:
- - ceph:
- - rbd:
- client.0: # uses defaults
- client.1:
- image_name: foo
- image_size: 2048
- image_format: 2
- fs_type: xfs
-
- To use default options on all clients::
-
- tasks:
- - ceph:
- - rbd:
- all:
-
- To create 20GiB images and format them with xfs on all clients::
-
- tasks:
- - ceph:
- - rbd:
- all:
- image_size: 20480
- fs_type: xfs
- """
- if config is None:
- config = { 'all': None }
- norm_config = config
- if isinstance(config, dict):
- norm_config = teuthology.replace_all_with_clients(ctx.cluster, config)
- if isinstance(norm_config, dict):
- role_images = {}
- for role, properties in norm_config.iteritems():
- if properties is None:
- properties = {}
- role_images[role] = properties.get('image_name')
- else:
- role_images = norm_config
-
- log.debug('rbd config is: %s', norm_config)
-
- with contextutil.nested(
- lambda: create_image(ctx=ctx, config=norm_config),
- lambda: modprobe(ctx=ctx, config=norm_config),
- lambda: dev_create(ctx=ctx, config=role_images),
- lambda: generic_mkfs(ctx=ctx, config=norm_config,
- devname_rtn=rbd_devname_rtn),
- lambda: generic_mount(ctx=ctx, config=role_images,
- devname_rtn=rbd_devname_rtn),
- ):
- yield
+++ /dev/null
-"""
- Long-running fio tests on rbd-mapped devices for the formats/features provided in the config.
- Many fio parameters can be configured, so this task can be used along with thrash/power-cut tests
- to exercise IO on the full disk for all formats/features.
- - This test should not be run on a VM due to heavy use of resources.
-
-"""
-import contextlib
-import json
-import logging
-import StringIO
-import re
-
-from teuthology.parallel import parallel
-from teuthology import misc as teuthology
-from tempfile import NamedTemporaryFile
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- client.0:
- fio-io-size: 100g or 80% or 100m
- fio-version: 2.2.9
- formats: [2]
- features: [[layering],[striping],[layering,exclusive-lock,object-map]]
- test-clone-io: 1 #remove this option to not run create rbd clone and not run io on clone
- io-engine: "sync or rbd or any io-engine"
- rw: randrw
- client.1:
- fio-io-size: 100g
- fio-version: 2.2.9
- rw: read
- image-size: 20480
-
-or
- all:
- fio-io-size: 400g
- rw: randrw
- formats: [2]
- features: [[layering],[striping]]
- io-engine: libaio
-
- Create rbd images and devices and exercise IO for the formats/features provided in the config file.
- Config can be per-client, or one config can be used for all clients; fio jobs are run in parallel for the clients provided.
-
- """
- if config.get('all'):
- client_config = config['all']
- clients = ctx.cluster.only(teuthology.is_type('client'))
- rbd_test_dir = teuthology.get_testdir(ctx) + "/rbd_fio_test"
- for remote,role in clients.remotes.iteritems():
- if 'client_config' in locals():
- with parallel() as p:
- p.spawn(run_fio, remote, client_config, rbd_test_dir)
- else:
- for client_config in config:
- if client_config in role:
- with parallel() as p:
- p.spawn(run_fio, remote, config[client_config], rbd_test_dir)
-
- yield
-
-
-def run_fio(remote, config, rbd_test_dir):
- """
- Create a fio config file with options based on the above config,
- fetch fio from github, build the binary, and run it against
- the generated fio config file.
- """
- fio_config=NamedTemporaryFile(prefix='fio_rbd_', dir='/tmp/', delete=False)
- fio_config.write('[global]\n')
- # default the io engine up front so later checks (ioengine == 'rbd') never hit an undefined name
- ioengine = config.get('io-engine', 'sync')
- fio_config.write('ioengine={ioe}\n'.format(ioe=ioengine))
- if config.get('bs'):
- bs=config['bs']
- fio_config.write('bs={bs}\n'.format(bs=bs))
- else:
- fio_config.write('bs=4k\n')
- fio_config.write('iodepth=2\n')
- if config.get('fio-io-size'):
- size=config['fio-io-size']
- fio_config.write('size={size}\n'.format(size=size))
- else:
- fio_config.write('size=100m\n')
-
- fio_config.write('time_based\n')
- if config.get('runtime'):
- runtime=config['runtime']
- fio_config.write('runtime={runtime}\n'.format(runtime=runtime))
- else:
- fio_config.write('runtime=1800\n')
- fio_config.write('allow_file_create=0\n')
- image_size=10240
- if config.get('image_size'):
- image_size=config['image_size']
-
- formats=[1,2]
- features=[['layering'],['striping'],['exclusive-lock','object-map']]
- fio_version='2.7'
- if config.get('formats'):
- formats=config['formats']
- if config.get('features'):
- features=config['features']
- if config.get('fio-version'):
- fio_version=config['fio-version']
-
- fio_config.write('norandommap\n')
- if ioengine == 'rbd':
- fio_config.write('invalidate=0\n')
- #handle package required for librbd engine
- sn=remote.shortname
- system_type= teuthology.get_system_type(remote)
- if system_type == 'rpm' and ioengine == 'rbd':
- log.info("Installing librbd1 devel package on {sn}".format(sn=sn))
- remote.run(args=['sudo', 'yum' , 'install', 'librbd1-devel', '-y'])
- elif ioengine == 'rbd':
- log.info("Installing librbd devel package on {sn}".format(sn=sn))
- remote.run(args=['sudo', 'apt-get', '-y',
- '--force-yes',
- 'install', 'librbd-dev'])
- if ioengine == 'rbd':
- fio_config.write('clientname=admin\n')
- fio_config.write('pool=rbd\n')
- for frmt in formats:
- for feature in features:
- log.info("Creating rbd images on {sn}".format(sn=sn))
- feature_name = '-'.join(feature)
- rbd_name = 'i{i}f{f}{sn}'.format(i=frmt,f=feature_name,sn=sn)
- rbd_snap_name = 'i{i}f{f}{sn}@i{i}f{f}{sn}Snap'.format(i=frmt,f=feature_name,sn=sn)
- rbd_clone_name = 'i{i}f{f}{sn}Clone'.format(i=frmt,f=feature_name,sn=sn)
- create_args=['rbd', 'create',
- '--size', '{size}'.format(size=image_size),
- '--image', rbd_name,
- '--image-format', '{f}'.format(f=frmt)]
- map(lambda x: create_args.extend(['--image-feature', x]), feature)
- remote.run(args=create_args)
- remote.run(args=['rbd', 'info', rbd_name])
- if ioengine != 'rbd':
- out=StringIO.StringIO()
- remote.run(args=['sudo', 'rbd', 'map', rbd_name ],stdout=out)
- dev=re.search(r'(/dev/rbd\d+)',out.getvalue())
- rbd_dev=dev.group(1)
- if config.get('test-clone-io'):
- log.info("Testing clones using fio")
- remote.run(args=['rbd', 'snap', 'create', rbd_snap_name])
- remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name])
- remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name])
- remote.run(args=['sudo', 'rbd', 'map', rbd_clone_name], stdout=out)
- dev=re.search(r'(/dev/rbd\d+)',out.getvalue())
- rbd_clone_dev=dev.group(1)
- fio_config.write('[{rbd_dev}]\n'.format(rbd_dev=rbd_dev))
- # default rw so the clone job section below can reuse it even when it is absent from the config
- rw = config.get('rw', 'randrw')
- fio_config.write('rw={rw}\n'.format(rw=rw))
- fio_config.write('filename={rbd_dev}\n'.format(rbd_dev=rbd_dev))
- if config.get('test-clone-io'):
- fio_config.write('[{rbd_clone_dev}]\n'.format(rbd_clone_dev=rbd_clone_dev))
- fio_config.write('rw={rw}\n'.format(rw=rw))
- fio_config.write('filename={rbd_clone_dev}\n'.format(rbd_clone_dev=rbd_clone_dev))
- else:
- if config.get('test-clone-io'):
- log.info("Testing clones using fio")
- remote.run(args=['rbd', 'snap', 'create', rbd_snap_name])
- remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name])
- remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name])
- fio_config.write('[{img_name}]\n'.format(img_name=rbd_name))
- rw = config.get('rw', 'randrw')
- fio_config.write('rw={rw}\n'.format(rw=rw))
- fio_config.write('rbdname={img_name}\n'.format(img_name=rbd_name))
- if config.get('test-clone-io'):
- fio_config.write('[{clone_img_name}]\n'.format(clone_img_name=rbd_clone_name))
- fio_config.write('rw={rw}\n'.format(rw=rw))
- fio_config.write('rbdname={clone_img_name}\n'.format(clone_img_name=rbd_clone_name))
-
-
- fio_config.close()
- remote.put_file(fio_config.name,fio_config.name)
- try:
- log.info("Running rbd feature - fio test on {sn}".format(sn=sn))
- fio = "https://github.com/axboe/fio/archive/fio-" + fio_version + ".tar.gz"
- remote.run(args=['mkdir', run.Raw(rbd_test_dir),])
- remote.run(args=['cd' , run.Raw(rbd_test_dir),
- run.Raw(';'), 'wget' , fio , run.Raw(';'), run.Raw('tar -xvf fio*tar.gz'), run.Raw(';'),
- run.Raw('cd fio-fio*'), run.Raw(';'), './configure', run.Raw(';'), 'make'])
- remote.run(args=['ceph', '-s'])
- remote.run(args=['sudo', run.Raw('{tdir}/fio-fio-{v}/fio {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))])
- remote.run(args=['ceph', '-s'])
- finally:
- out=StringIO.StringIO()
- remote.run(args=['rbd','showmapped', '--format=json'], stdout=out)
- mapped_images = json.loads(out.getvalue())
- if mapped_images:
- log.info("Unmapping rbd images on {sn}".format(sn=sn))
- for image in mapped_images.itervalues():
- remote.run(args=['sudo', 'rbd', 'unmap', str(image['device'])])
- log.info("Cleaning up fio install")
- remote.run(args=['rm','-rf', run.Raw(rbd_test_dir)])
- if system_type == 'rpm' and ioengine == 'rbd':
- log.info("Uninstall librbd1 devel package on {sn}".format(sn=sn))
- remote.run(args=['sudo', 'yum' , 'remove', 'librbd1-devel', '-y'])
- elif ioengine == 'rbd':
- log.info("Uninstall librbd devel package on {sn}".format(sn=sn))
- remote.run(args=['sudo', 'apt-get', '-y', 'remove', 'librbd-dev'])
+++ /dev/null
-"""
-Run fsx on an rbd image
-"""
-import contextlib
-import logging
-
-from teuthology.parallel import parallel
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run fsx on an rbd image.
-
- Currently this requires running as client.admin
- to create a pool.
-
- Specify which clients to run on as a list::
-
- tasks:
- ceph:
- rbd_fsx:
- clients: [client.0, client.1]
-
- You can optionally change some properties of fsx:
-
- tasks:
- ceph:
- rbd_fsx:
- clients: <list of clients>
- seed: <random seed number, or 0 to use the time>
- ops: <number of operations to do>
- size: <maximum image size in bytes>
- valgrind: [--tool=<valgrind tool>]
- """
- log.info('starting rbd_fsx...')
- with parallel() as p:
- for role in config['clients']:
- p.spawn(_run_one_client, ctx, config, role)
- yield
-
-def _run_one_client(ctx, config, role):
- """Spawned task that runs the client"""
- krbd = config.get('krbd', False)
- nbd = config.get('nbd', False)
- testdir = teuthology.get_testdir(ctx)
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
-
- args = []
- if krbd or nbd:
- args.append('sudo') # rbd(-nbd) map/unmap need privileges
- args.extend([
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir)
- ])
-
- overrides = ctx.config.get('overrides', {})
- teuthology.deep_merge(config, overrides.get('rbd_fsx', {}))
-
- if config.get('valgrind'):
- args = teuthology.get_valgrind_args(
- testdir,
- 'fsx_{id}'.format(id=role),
- args,
- config.get('valgrind')
- )
-
- args.extend([
- 'ceph_test_librbd_fsx',
- '-d', # debug output for all operations
- '-W', '-R', # mmap doesn't work with rbd
- '-p', str(config.get('progress_interval', 100)), # show progress
- '-P', '{tdir}/archive'.format(tdir=testdir),
- '-r', str(config.get('readbdy',1)),
- '-w', str(config.get('writebdy',1)),
- '-t', str(config.get('truncbdy',1)),
- '-h', str(config.get('holebdy',1)),
- '-l', str(config.get('size', 250000000)),
- '-S', str(config.get('seed', 0)),
- '-N', str(config.get('ops', 1000)),
- ])
- if krbd:
- args.append('-K') # -K enables krbd mode
- if nbd:
- args.append('-M') # -M enables nbd mode
- if config.get('direct_io', False):
- args.append('-Z') # -Z use direct IO
- if not config.get('randomized_striping', True):
- args.append('-U') # -U disables randomized striping
- if not config.get('punch_holes', True):
- args.append('-H') # -H disables discard ops
- if config.get('journal_replay', False):
- args.append('-j') # -j replay all IO events from journal
- args.extend([
- 'pool_{pool}'.format(pool=role),
- 'image_{image}'.format(image=role),
- ])
-
- remote.run(args=args)
+++ /dev/null
-"""
-Task for running rbd mirroring daemons and configuring mirroring
-"""
-
-import logging
-
-from teuthology.orchestra import run
-from teuthology import misc
-from teuthology.exceptions import ConfigError
-from teuthology.task import Task
-from util import get_remote_for_role
-
-log = logging.getLogger(__name__)
-
-
-class RBDMirror(Task):
- """
- Run an rbd-mirror daemon to sync rbd images between clusters.
-
- This requires two clients (one from each cluster) on the same host
- to connect with. The pool configuration should be adjusted by later
- test scripts to include the remote client and cluster name. This task
- just needs to know how to connect to the local cluster.
-
- For example:
-
- roles:
- - [primary.mon.a, primary.osd.0, primary.osd.1, primary.osd.2]
- - [secondary.mon.a, secondary.osd.0, secondary.osd.1, secondary.osd.2]
- - [primary.client.mirror, secondary.client.mirror]
- tasks:
- - ceph:
- cluster: primary
- - ceph:
- cluster: secondary
- - rbd-mirror:
- client: primary.client.mirror
-
- To mirror back to the primary cluster as well, add another
- rbd_mirror instance:
-
- - rbd-mirror:
- client: secondary.client.mirror
-
- Possible options for this task are:
-
- client: role - ceph client to connect as
- valgrind: [--tool=<valgrind tool>] - none by default
- coverage: bool - whether this run may be collecting coverage data
- """
- def __init__(self, ctx, config):
- super(RBDMirror, self).__init__(ctx, config)
- self.log = log
-
- def setup(self):
- super(RBDMirror, self).setup()
- try:
- self.client = self.config['client']
- except KeyError:
- raise ConfigError('rbd-mirror requires a client to connect with')
-
- self.cluster_name, type_, self.client_id = misc.split_role(self.client)
-
- if type_ != 'client':
- msg = 'client role ({0}) must be a client'.format(self.client)
- raise ConfigError(msg)
-
- self.remote = get_remote_for_role(self.ctx, self.client)
-
- def begin(self):
- super(RBDMirror, self).begin()
- testdir = misc.get_testdir(self.ctx)
- daemon_signal = 'kill'
- if 'coverage' in self.config or 'valgrind' in self.config:
- daemon_signal = 'term'
-
- args = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'daemon-helper',
- daemon_signal,
- ]
-
- if 'valgrind' in self.config:
- args = misc.get_valgrind_args(
- testdir,
- 'rbd-mirror-{id}'.format(id=self.client),
- args,
- self.config.get('valgrind')
- )
-
- args.extend([
- 'rbd-mirror',
- '--cluster',
- self.cluster_name,
- '--id',
- self.client_id,
- ])
-
- self.ctx.daemons.add_daemon(
- self.remote, 'rbd-mirror', self.client,
- cluster=self.cluster_name,
- args=args,
- logger=self.log.getChild(self.client),
- stdin=run.PIPE,
- wait=False,
- )
-
- def end(self):
- mirror_daemon = self.ctx.daemons.get_daemon('rbd-mirror',
- self.client,
- self.cluster_name)
- mirror_daemon.stop()
- super(RBDMirror, self).end()
-
-task = RBDMirror
+++ /dev/null
-"""
-Test if we can rebuild the monitor leveldb from the OSDs when all the
-monitor leveldbs are corrupted
-"""
-
-import logging
-import os.path
-import shutil
-import tempfile
-
-import ceph_manager
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-
-def push_directory(path, remote, remote_dir):
- """
- Push a local directory to a directory on the remote, roughly:
-
- local_temp_path=`mktemp`
- tar czf $local_temp_path $path
- ssh remote mkdir -p remote_dir
- remote_temp_path=`mktemp`
- scp $local_temp_path $remote_temp_path
- rm $local_temp_path
- tar xzf $remote_temp_path -C $remote_dir
- ssh remote rm $remote_temp_path
- """
- fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
- prefix='rebuild_mondb-')
- os.close(fd)
- cmd = ' '.join(['tar', 'cz',
- '-f', local_temp_path,
- '-C', path,
- '--', '.'])
- teuthology.sh(cmd)
- _, fname = os.path.split(local_temp_path)
- fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
- prefix='rebuild_mondb-')
- os.close(fd)
- remote.put_file(local_temp_path, remote_temp_path)
- os.remove(local_temp_path)
- remote.run(args=['sudo',
- 'tar', 'xz',
- '-C', remote_dir,
- '-f', remote_temp_path])
- remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])
-
-
-def task(ctx, config):
- """
- Test monitor recovery from OSD
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'task only accepts a dict for configuration'
-
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'))
-
- mons = ctx.cluster.only(teuthology.is_type('mon'))
- assert mons
- # note down the first cluster_name and mon_id
- # we will recover it later on
- cluster_name = None
- mon_id = None
- for remote, roles in mons.remotes.iteritems():
- is_mon = teuthology.is_type('mon')
- for role in roles:
- if not is_mon(role):
- continue
- cluster, _, m = teuthology.split_role(role)
- if cluster_name is None:
- cluster_name = cluster
- mon_id = m
- assert cluster_name == cluster
- log.info('killing {cluster}:mon.{mon}'.format(
- cluster=cluster,
- mon=m))
- manager.kill_mon(m)
- mon_data = os.path.join('/var/lib/ceph/mon/',
- '{0}-{1}'.format(cluster_name, m))
- if m == mon_id:
-                # we only need to recreate the store.db for the first mon;
-                # this is easier than running mkfs on it and then replacing
-                # its store.db with the recovered one
- store_dir = os.path.join(mon_data, 'store.db')
- remote.run(args=['sudo', 'rm', '-r', store_dir])
- else:
- remote.run(args=['sudo', 'rm', '-r', mon_data])
-
- local_mstore = tempfile.mkdtemp()
-
- # collect the maps from all OSDs
- osds = ctx.cluster.only(teuthology.is_type('osd'))
- assert osds
- for osd, roles in osds.remotes.iteritems():
- is_osd = teuthology.is_type('osd')
- for role in roles:
- if not is_osd(role):
- continue
- cluster, _, osd_id = teuthology.split_role(role)
- assert cluster_name == cluster
- log.info('collecting maps from {cluster}:osd.{osd}'.format(
- cluster=cluster,
- osd=osd_id))
- # push leveldb to OSD
- osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
- osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])
-
- push_directory(local_mstore, osd, osd_mstore)
- log.info('rm -rf {0}'.format(local_mstore))
- shutil.rmtree(local_mstore)
- # update leveldb with OSD data
- options = '--op update-mon-db --mon-store-path {0}'
- log.info('cot {0}'.format(osd_mstore))
- manager.objectstore_tool(pool=None,
- options=options.format(osd_mstore),
- args='',
- osd=osd_id,
- do_revive=False)
- # pull the updated mon db
- log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
- local_mstore = tempfile.mkdtemp()
- teuthology.pull_directory(osd, osd_mstore, local_mstore)
- log.info('rm -rf osd:{0}'.format(osd_mstore))
- osd.run(args=['sudo', 'rm', '-fr', osd_mstore])
-
-    # recover the first_mon with the re-built mon db: push the recovered
-    # leveldb from the local node to the mon's data directory
- mon_store_dir = os.path.join('/var/lib/ceph/mon',
- '{0}-{1}'.format(cluster_name, mon_id))
- push_directory(local_mstore, mon, mon_store_dir)
- mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
- shutil.rmtree(local_mstore)
- default_keyring = '/etc/ceph/{cluster}.keyring'.format(
- cluster=cluster_name)
- keyring_path = config.get('keyring_path', default_keyring)
- # fill up the caps in the keyring file
- mon.run(args=['sudo',
- 'ceph-authtool', keyring_path,
- '-n', 'mon.',
- '--cap', 'mon', 'allow *'])
- mon.run(args=['sudo',
- 'ceph-authtool', keyring_path,
- '-n', 'client.admin',
- '--cap', 'mon', 'allow *',
- '--cap', 'osd', 'allow *',
- '--cap', 'mds', 'allow *'])
- mon.run(args=['sudo', '-u', 'ceph',
- 'ceph-monstore-tool', mon_store_dir,
- 'rebuild', '--', '--keyring',
- keyring_path])
-
- # revive monitors
- # the initial monmap is in the ceph.conf, so we are good.
- n_mons = 0
- for remote, roles in mons.remotes.iteritems():
- is_mon = teuthology.is_type('mon')
- for role in roles:
- if not is_mon(role):
- continue
- cluster, _, m = teuthology.split_role(role)
- assert cluster_name == cluster
- if mon_id != m:
- log.info('running mkfs on {cluster}:mon.{mon}'.format(
- cluster=cluster,
- mon=m))
- remote.run(
- args=[
- 'sudo',
- 'ceph-mon',
- '--cluster', cluster,
- '--mkfs',
- '-i', m,
- '--keyring', keyring_path])
- manager.revive_mon(m)
- n_mons += 1
-
- manager.wait_for_mon_quorum_size(n_mons, timeout=30)
- for osd, roles in osds.remotes.iteritems():
- is_osd = teuthology.is_type('osd')
- for role in roles:
- if not is_osd(role):
- continue
- _, _, osd_id = teuthology.split_role(role)
- log.info('reviving osd.{0}'.format(osd_id))
- manager.revive_osd(osd_id)
+++ /dev/null
-"""
-Recovery system benchmarking
-"""
-from cStringIO import StringIO
-
-import contextlib
-import gevent
-import json
-import logging
-import random
-import time
-
-import ceph_manager
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Benchmark the recovery system.
-
- Generates objects with smalliobench, runs it normally to get a
- baseline performance measurement, then marks an OSD out and reruns
- to measure performance during recovery.
-
- The config should be as follows:
-
- recovery_bench:
- duration: <seconds for each measurement run>
- num_objects: <number of objects>
- io_size: <io size in bytes>
-
- example:
-
- tasks:
- - ceph:
- - recovery_bench:
- duration: 60
- num_objects: 500
- io_size: 4096
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'recovery_bench task only accepts a dict for configuration'
-
- log.info('Beginning recovery bench...')
-
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
- while len(manager.get_osd_status()['up']) < num_osds:
- time.sleep(10)
-
- bench_proc = RecoveryBencher(
- manager,
- config,
- )
- try:
- yield
- finally:
- log.info('joining recovery bencher')
- bench_proc.do_join()
-
-class RecoveryBencher:
- """
- RecoveryBencher
- """
- def __init__(self, manager, config):
- self.ceph_manager = manager
- self.ceph_manager.wait_for_clean()
-
- osd_status = self.ceph_manager.get_osd_status()
- self.osds = osd_status['up']
-
- self.config = config
- if self.config is None:
- self.config = dict()
-
- else:
- def tmp(x):
- """
- Local wrapper to print value.
- """
- print x
- self.log = tmp
-
- log.info("spawning thread")
-
- self.thread = gevent.spawn(self.do_bench)
-
- def do_join(self):
- """
- Join the recovery bencher. This is called after the main
- task exits.
- """
- self.thread.get()
-
- def do_bench(self):
- """
- Do the benchmarking.
- """
- duration = self.config.get("duration", 60)
- num_objects = self.config.get("num_objects", 500)
- io_size = self.config.get("io_size", 4096)
-
- osd = str(random.choice(self.osds))
- (osd_remote,) = self.ceph_manager.ctx.cluster.only('osd.%s' % osd).remotes.iterkeys()
-
- testdir = teuthology.get_testdir(self.ceph_manager.ctx)
-
- # create the objects
- osd_remote.run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
-                'smalliobench',
- '--use-prefix', 'recovery_bench',
- '--init-only', '1',
- '--num-objects', str(num_objects),
- '--io-size', str(io_size),
- ],
- wait=True,
- )
-
- # baseline bench
- log.info('non-recovery (baseline)')
- p = osd_remote.run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'smalliobench',
- '--use-prefix', 'recovery_bench',
- '--do-not-init', '1',
- '--duration', str(duration),
- '--io-size', str(io_size),
- ],
- stdout=StringIO(),
- stderr=StringIO(),
- wait=True,
- )
- self.process_samples(p.stderr.getvalue())
-
- self.ceph_manager.raw_cluster_cmd('osd', 'out', osd)
- time.sleep(5)
-
- # recovery bench
- log.info('recovery active')
- p = osd_remote.run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'smalliobench',
- '--use-prefix', 'recovery_bench',
- '--do-not-init', '1',
- '--duration', str(duration),
- '--io-size', str(io_size),
- ],
- stdout=StringIO(),
- stderr=StringIO(),
- wait=True,
- )
- self.process_samples(p.stderr.getvalue())
-
- self.ceph_manager.raw_cluster_cmd('osd', 'in', osd)
-
- def process_samples(self, input):
- """
- Extract samples from the input and process the results
-
- :param input: input lines in JSON format
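-
-        Each line is expected to look roughly like the following
-        (illustrative values):
-
-            {"type": "write", "latency": 0.0123}
-
-        Lines that fail to parse as JSON are ignored.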
- """
- lat = {}
- for line in input.split('\n'):
- try:
- sample = json.loads(line)
- samples = lat.setdefault(sample['type'], [])
- samples.append(float(sample['latency']))
- except Exception:
- pass
-
- for type in lat:
- samples = lat[type]
- samples.sort()
-
- num = len(samples)
-
- # median
- if num & 1 == 1: # odd number of samples
- median = samples[num / 2]
- else:
- median = (samples[num / 2] + samples[num / 2 - 1]) / 2
-
- # 99%
- ninety_nine = samples[int(num * 0.99)]
-
- log.info("%s: median %f, 99%% %f" % (type, median, ninety_nine))
+++ /dev/null
-"""
-Special regression test for tracker #11184
-
-Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid))
-
-This is accomplished by moving a pg that wasn't part of the split and still
-includes divergent priors.
-"""
-import logging
-import time
-from cStringIO import StringIO
-
-from teuthology import misc as teuthology
-from util.rados import rados
-import os
-
-
-log = logging.getLogger(__name__)
-
-
-def task(ctx, config):
- """
-    Test handling of divergent entries during export / import,
-    as a regression test for tracker #11184
-
- overrides:
- ceph:
- conf:
- osd:
- debug osd: 5
-
- Requires 3 osds on a single test node.
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'divergent_priors task only accepts a dict for configuration'
-
- manager = ctx.managers['ceph']
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.raw_cluster_cmd('osd', 'set', 'noout')
- manager.raw_cluster_cmd('osd', 'set', 'noin')
- manager.raw_cluster_cmd('osd', 'set', 'nodown')
- manager.wait_for_clean()
-
- # something that is always there
- dummyfile = '/etc/fstab'
- dummyfile2 = '/etc/resolv.conf'
- testdir = teuthology.get_testdir(ctx)
-
- # create 1 pg pool
- log.info('creating foo')
- manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
-
- osds = [0, 1, 2]
- for i in osds:
- manager.set_config(i, osd_min_pg_log_entries=10)
- manager.set_config(i, osd_max_pg_log_entries=10)
- manager.set_config(i, osd_pg_log_trim_min=5)
-
- # determine primary
- divergent = manager.get_pg_primary('foo', 0)
- log.info("primary and soon to be divergent is %d", divergent)
- non_divergent = list(osds)
- non_divergent.remove(divergent)
-
- log.info('writing initial objects')
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- # write 100 objects
- for i in range(100):
- rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
-
- manager.wait_for_clean()
-
- # blackhole non_divergent
- log.info("blackholing osds %s", str(non_divergent))
- for i in non_divergent:
- manager.set_config(i, objectstore_blackhole=1)
-
- DIVERGENT_WRITE = 5
- DIVERGENT_REMOVE = 5
- # Write some soon to be divergent
- log.info('writing divergent objects')
- for i in range(DIVERGENT_WRITE):
- rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
- dummyfile2], wait=False)
- # Remove some soon to be divergent
- log.info('remove divergent objects')
- for i in range(DIVERGENT_REMOVE):
- rados(ctx, mon, ['-p', 'foo', 'rm',
- 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
- time.sleep(10)
- mon.run(
- args=['killall', '-9', 'rados'],
- wait=True,
- check_status=False)
-
- # kill all the osds but leave divergent in
- log.info('killing all the osds')
- for i in osds:
- manager.kill_osd(i)
- for i in osds:
- manager.mark_down_osd(i)
- for i in non_divergent:
- manager.mark_out_osd(i)
-
- # bring up non-divergent
- log.info("bringing up non_divergent %s", str(non_divergent))
- for i in non_divergent:
- manager.revive_osd(i)
- for i in non_divergent:
- manager.mark_in_osd(i)
-
- # write 1 non-divergent object (ensure that old divergent one is divergent)
- objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
- log.info('writing non-divergent object ' + objname)
- rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
-
- manager.wait_for_recovery()
-
- # ensure no recovery of up osds first
- log.info('delay recovery')
- for i in non_divergent:
- manager.wait_run_admin_socket(
- 'osd', i, ['set_recovery_delay', '100000'])
-
- # bring in our divergent friend
- log.info("revive divergent %d", divergent)
- manager.raw_cluster_cmd('osd', 'set', 'noup')
- manager.revive_osd(divergent)
-
- log.info('delay recovery divergent')
- manager.wait_run_admin_socket(
- 'osd', divergent, ['set_recovery_delay', '100000'])
-
- manager.raw_cluster_cmd('osd', 'unset', 'noup')
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
-
- log.info('wait for peering')
- rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
-
- # At this point the divergent_priors should have been detected
-
- log.info("killing divergent %d", divergent)
- manager.kill_osd(divergent)
-
- # Split pgs for pool foo
- manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2')
- time.sleep(5)
-
- # Export a pg
- (exp_remote,) = ctx.\
- cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
- FSPATH = manager.get_filepath()
- JPATH = os.path.join(FSPATH, "journal")
- prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
- "--data-path {fpath} --journal-path {jpath} "
- "--log-file="
- "/var/log/ceph/objectstore_tool.$$.log ".
- format(fpath=FSPATH, jpath=JPATH))
- pid = os.getpid()
- expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
- cmd = ((prefix + "--op export --pgid 1.0 --file {file}").
- format(id=divergent, file=expfile))
- proc = exp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- assert proc.exitstatus == 0
-
- # Remove the same pg that was exported
- cmd = ((prefix + "--op remove --pgid 1.0").
- format(id=divergent, file=expfile))
- proc = exp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- assert proc.exitstatus == 0
-
- # Kill one of non-divergent OSDs
- log.info('killing osd.%d' % non_divergent[1])
- manager.kill_osd(non_divergent[1])
- manager.mark_down_osd(non_divergent[1])
- # manager.mark_out_osd(non_divergent[1])
-
- cmd = ((prefix + "--op import --file {file}").
- format(id=non_divergent[1], file=expfile))
- proc = exp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- assert proc.exitstatus == 0
-
- # bring in our divergent friend and other node
- log.info("revive divergent %d", divergent)
- manager.revive_osd(divergent)
- manager.mark_in_osd(divergent)
- log.info("revive %d", non_divergent[1])
- manager.revive_osd(non_divergent[1])
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
-
- log.info('delay recovery divergent')
- manager.set_config(divergent, osd_recovery_delay_start=100000)
- log.info('mark divergent in')
- manager.mark_in_osd(divergent)
-
- log.info('wait for peering')
- rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
-
- log.info("killing divergent %d", divergent)
- manager.kill_osd(divergent)
- log.info("reviving divergent %d", divergent)
- manager.revive_osd(divergent)
- time.sleep(3)
-
- log.info('allowing recovery')
- # Set osd_recovery_delay_start back to 0 and kick the queue
- for i in osds:
- manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
- 'kick_recovery_wq', ' 0')
-
- log.info('reading divergent objects')
- for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
- exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
- '/tmp/existing'])
-        assert exit_status == 0
-
- (remote,) = ctx.\
- cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
- cmd = 'rm {file}'.format(file=expfile)
- remote.run(args=cmd, wait=True)
- log.info("success")
+++ /dev/null
-"""
-Lost_unfound
-"""
-import logging
-from teuthology.orchestra import run
-import ceph_manager
-import time
-from teuthology import misc as teuthology
-from util.rados import rados
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
- Test handling of lost objects.
-
-    A fairly rigid cluster is brought up and tested by this task
- """
- POOL = 'unfounddel_pool'
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'lost_unfound task only accepts a dict for configuration'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_clean()
-
- manager.create_pool(POOL)
-
- # something that is always there
- dummyfile = '/etc/fstab'
-
- # take an osd out until the very end
- manager.kill_osd(2)
- manager.mark_down_osd(2)
- manager.mark_out_osd(2)
-
- # kludge to make sure they get a map
- rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.wait_for_recovery()
-
- # create old objects
- for f in range(1, 10):
- rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])
-
- # delay recovery, and make the pg log very long (to prevent backfill)
- manager.raw_cluster_cmd(
- 'tell', 'osd.1',
- 'injectargs',
- '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
- )
-
- manager.kill_osd(0)
- manager.mark_down_osd(0)
-
- for f in range(1, 10):
- rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
- rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
-
- # bring osd.0 back up, let it peer, but don't replicate the new
- # objects...
- log.info('osd.0 command_args is %s' % 'foo')
- log.info(ctx.daemons.get_daemon('osd', 0).command_args)
- ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
- '--osd-recovery-delay-start', '1000'
- ])
- manager.revive_osd(0)
- manager.mark_in_osd(0)
- manager.wait_till_osd_is_up(0)
-
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.wait_till_active()
-
- # take out osd.1 and the only copy of those objects.
- manager.kill_osd(1)
- manager.mark_down_osd(1)
- manager.mark_out_osd(1)
- manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
-
-    # bring up osd.2 so that, in theory, things could otherwise recover fully
- manager.revive_osd(2)
- manager.mark_in_osd(2)
- manager.wait_till_osd_is_up(2)
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_till_active()
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-
- # verify that there are unfound objects
- unfound = manager.get_num_unfound_objects()
- log.info("there are %d unfound objects" % unfound)
- assert unfound
-
- testdir = teuthology.get_testdir(ctx)
- procs = []
- if config.get('parallel_bench', True):
- procs.append(mon.run(
- args=[
- "/bin/sh", "-c",
- " ".join(['adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage',
- 'rados',
- '--no-log-to-stderr',
- '--name', 'client.admin',
- '-b', str(4<<10),
- '-p' , POOL,
- '-t', '20',
- 'bench', '240', 'write',
- ]).format(tdir=testdir),
- ],
- logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
- stdin=run.PIPE,
- wait=False
- ))
- time.sleep(10)
-
- # mark stuff lost
- pgs = manager.get_pg_stats()
- for pg in pgs:
- if pg['stat_sum']['num_objects_unfound'] > 0:
- primary = 'osd.%d' % pg['acting'][0]
-
-            # verify that we can list them directly from the osd
- log.info('listing missing/lost in %s state %s', pg['pgid'],
- pg['state']);
- m = manager.list_pg_missing(pg['pgid'])
- #log.info('%s' % m)
- assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
- num_unfound=0
- for o in m['objects']:
- if len(o['locations']) == 0:
- num_unfound += 1
- assert m['num_unfound'] == num_unfound
-
- log.info("reverting unfound in %s on %s", pg['pgid'], primary)
- manager.raw_cluster_cmd('pg', pg['pgid'],
- 'mark_unfound_lost', 'delete')
- else:
- log.info("no unfound in %s", pg['pgid'])
-
- manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
- manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
- manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
- manager.wait_for_recovery()
-
- # verify result
- for f in range(1, 10):
- err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
- assert err
- err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
- assert err
- err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
- assert err
-
- # see if osd.1 can cope
- manager.revive_osd(1)
- manager.mark_in_osd(1)
- manager.wait_till_osd_is_up(1)
- manager.wait_for_clean()
- run.wait(procs)
-
+++ /dev/null
-"""
-Test pool repairing after objects are damaged.
-"""
-import logging
-import time
-
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-
-def choose_primary(manager, pool, num):
- """
- Return primary to test on.
- """
- log.info("Choosing primary")
- return manager.get_pg_primary(pool, num)
-
-
-def choose_replica(manager, pool, num):
- """
- Return replica to test on.
- """
- log.info("Choosing replica")
- return manager.get_pg_replica(pool, num)
-
-
-def trunc(manager, osd, pool, obj):
- """
- truncate an object
- """
- log.info("truncating object")
- return manager.osd_admin_socket(
- osd,
- ['truncobj', pool, obj, '1'])
-
-
-def dataerr(manager, osd, pool, obj):
- """
- cause an error in the data
- """
- log.info("injecting data err on object")
- return manager.osd_admin_socket(
- osd,
- ['injectdataerr', pool, obj])
-
-
-def mdataerr(manager, osd, pool, obj):
- """
- cause an error in the mdata
- """
- log.info("injecting mdata err on object")
- return manager.osd_admin_socket(
- osd,
- ['injectmdataerr', pool, obj])
-
-
-def omaperr(manager, osd, pool, obj):
- """
- Cause an omap error.
- """
- log.info("injecting omap err on object")
- return manager.osd_admin_socket(osd, ['setomapval', pool, obj,
- 'badkey', 'badval'])
-
-
-def repair_test_1(manager, corrupter, chooser, scrub_type):
- """
- Creates an object in the pool, corrupts it,
- scrubs it, and verifies that the pool is inconsistent. It then repairs
- the pool, rescrubs it, and verifies that the pool is consistent
-
- :param corrupter: error generating function (truncate, data-error, or
- meta-data error, for example).
- :param chooser: osd type chooser (primary or replica)
- :param scrub_type: regular scrub or deep-scrub
- """
- pool = "repair_pool_1"
- manager.wait_for_clean()
- with manager.pool(pool, 1):
-
- log.info("starting repair test type 1")
- victim_osd = chooser(manager, pool, 0)
-
- # create object
- log.info("doing put")
- manager.do_put(pool, 'repair_test_obj', '/etc/hosts')
-
- # corrupt object
- log.info("corrupting object")
- corrupter(manager, victim_osd, pool, 'repair_test_obj')
-
- # verify inconsistent
- log.info("scrubbing")
- manager.do_pg_scrub(pool, 0, scrub_type)
-
- assert manager.pg_inconsistent(pool, 0)
-
- # repair
- log.info("repairing")
- manager.do_pg_scrub(pool, 0, "repair")
-
- log.info("re-scrubbing")
- manager.do_pg_scrub(pool, 0, scrub_type)
-
- # verify consistent
- assert not manager.pg_inconsistent(pool, 0)
- log.info("done")
-
-
-def repair_test_2(ctx, manager, config, chooser):
- """
- First creates a set of objects and
- sets the omap value. It then corrupts an object, does both a scrub
- and a deep-scrub, and then corrupts more objects. After that, it
- repairs the pool and makes sure that the pool is consistent some
- time after a deep-scrub.
-
- :param chooser: primary or replica selection routine.
- """
- pool = "repair_pool_2"
- manager.wait_for_clean()
- with manager.pool(pool, 1):
- log.info("starting repair test type 2")
- victim_osd = chooser(manager, pool, 0)
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- # create object
- log.info("doing put and setomapval")
- manager.do_put(pool, 'file1', '/etc/hosts')
- manager.do_rados(mon, ['-p', pool, 'setomapval', 'file1',
- 'key', 'val'])
- manager.do_put(pool, 'file2', '/etc/hosts')
- manager.do_put(pool, 'file3', '/etc/hosts')
- manager.do_put(pool, 'file4', '/etc/hosts')
- manager.do_put(pool, 'file5', '/etc/hosts')
- manager.do_rados(mon, ['-p', pool, 'setomapval', 'file5',
- 'key', 'val'])
- manager.do_put(pool, 'file6', '/etc/hosts')
-
- # corrupt object
- log.info("corrupting object")
- omaperr(manager, victim_osd, pool, 'file1')
-
- # verify inconsistent
- log.info("scrubbing")
- manager.do_pg_scrub(pool, 0, 'deep-scrub')
-
- assert manager.pg_inconsistent(pool, 0)
-
- # Regression test for bug #4778, should still
- # be inconsistent after scrub
- manager.do_pg_scrub(pool, 0, 'scrub')
-
- assert manager.pg_inconsistent(pool, 0)
-
- # Additional corruptions including 2 types for file1
- log.info("corrupting more objects")
- dataerr(manager, victim_osd, pool, 'file1')
- mdataerr(manager, victim_osd, pool, 'file2')
- trunc(manager, victim_osd, pool, 'file3')
- omaperr(manager, victim_osd, pool, 'file6')
-
- # see still inconsistent
- log.info("scrubbing")
- manager.do_pg_scrub(pool, 0, 'deep-scrub')
-
- assert manager.pg_inconsistent(pool, 0)
-
- # repair
- log.info("repairing")
- manager.do_pg_scrub(pool, 0, "repair")
-
- # Let repair clear inconsistent flag
- time.sleep(10)
-
- # verify consistent
- assert not manager.pg_inconsistent(pool, 0)
-
- # In the future repair might determine state of
- # inconsistency itself, verify with a deep-scrub
- log.info("scrubbing")
- manager.do_pg_scrub(pool, 0, 'deep-scrub')
-
- # verify consistent
- assert not manager.pg_inconsistent(pool, 0)
-
- log.info("done")
-
-
-def hinfoerr(manager, victim, pool, obj):
- """
- cause an error in the hinfo_key
- """
- log.info("remove the hinfo_key")
- manager.objectstore_tool(pool,
- options='',
- args='rm-attr hinfo_key',
- object_name=obj,
- osd=victim)
-
-
-def repair_test_erasure_code(manager, corrupter, victim, scrub_type):
- """
- Creates an object in the pool, corrupts it,
- scrubs it, and verifies that the pool is inconsistent. It then repairs
- the pool, rescrubs it, and verifies that the pool is consistent
-
- :param corrupter: error generating function.
-    :param victim: the osd to corrupt ('primary' or 'replica')
- :param scrub_type: regular scrub or deep-scrub
- """
- pool = "repair_pool_3"
- manager.wait_for_clean()
- with manager.pool(pool_name=pool, pg_num=1,
- erasure_code_profile_name='default'):
-
- log.info("starting repair test for erasure code")
-
- # create object
- log.info("doing put")
- manager.do_put(pool, 'repair_test_obj', '/etc/hosts')
-
- # corrupt object
- log.info("corrupting object")
- corrupter(manager, victim, pool, 'repair_test_obj')
-
- # verify inconsistent
- log.info("scrubbing")
- manager.do_pg_scrub(pool, 0, scrub_type)
-
- assert manager.pg_inconsistent(pool, 0)
-
- # repair
- log.info("repairing")
- manager.do_pg_scrub(pool, 0, "repair")
-
- log.info("re-scrubbing")
- manager.do_pg_scrub(pool, 0, scrub_type)
-
- # verify consistent
- assert not manager.pg_inconsistent(pool, 0)
- log.info("done")
-
-
-def task(ctx, config):
- """
- Test [deep] repair in several situations:
- Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]
-
- The config should be as follows:
-
- Must include the log-whitelist below
- Must enable filestore_debug_inject_read_err config
-
- example:
-
- tasks:
- - chef:
- - install:
- - ceph:
- log-whitelist:
- - 'candidate had a stat error'
- - 'candidate had a read error'
- - 'deep-scrub 0 missing, 1 inconsistent objects'
- - 'deep-scrub 0 missing, 4 inconsistent objects'
- - 'deep-scrub [0-9]+ errors'
- - '!= omap_digest'
- - '!= data_digest'
- - 'repair 0 missing, 1 inconsistent objects'
- - 'repair 0 missing, 4 inconsistent objects'
- - 'repair [0-9]+ errors, [0-9]+ fixed'
- - 'scrub 0 missing, 1 inconsistent objects'
- - 'scrub [0-9]+ errors'
- - 'size 1 != size'
- - 'attr name mismatch'
- conf:
- osd:
- filestore debug inject read err: true
- - repair_test:
-
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'repair_test task only accepts a dict for config'
-
- manager = ctx.managers['ceph']
- manager.wait_for_all_up()
-
- manager.raw_cluster_cmd('osd', 'set', 'noscrub')
- manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')
-
- repair_test_1(manager, mdataerr, choose_primary, "scrub")
- repair_test_1(manager, mdataerr, choose_replica, "scrub")
- repair_test_1(manager, dataerr, choose_primary, "deep-scrub")
- repair_test_1(manager, dataerr, choose_replica, "deep-scrub")
- repair_test_1(manager, trunc, choose_primary, "scrub")
- repair_test_1(manager, trunc, choose_replica, "scrub")
- repair_test_2(ctx, manager, config, choose_primary)
- repair_test_2(ctx, manager, config, choose_replica)
-
- repair_test_erasure_code(manager, hinfoerr, 'primary', "deep-scrub")
+++ /dev/null
-"""
-Resolve stuck peering
-"""
-import logging
-import time
-
-from teuthology import misc as teuthology
-from util.rados import rados
-
-log = logging.getLogger(__name__)
-
-def task(ctx, config):
- """
-    Test handling and resolution of stuck peering
-
- requires 3 osds on a single test node
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'Resolve stuck peering only accepts a dict for config'
-
- manager = ctx.managers['ceph']
-
- while len(manager.get_osd_status()['up']) < 3:
- time.sleep(10)
-
-
- manager.wait_for_clean()
-
- dummyfile = '/etc/fstab'
- dummyfile1 = '/etc/resolv.conf'
-
- #create 1 PG pool
- pool='foo'
- log.info('creating pool foo')
- manager.raw_cluster_cmd('osd', 'pool', 'create', '%s' % pool, '1')
-
- #set min_size of the pool to 1
- #so that we can continue with I/O
- #when 2 osds are down
- manager.set_pool_property(pool, "min_size", 1)
-
- osds = [0, 1, 2]
-
- primary = manager.get_pg_primary('foo', 0)
- log.info("primary osd is %d", primary)
-
- others = list(osds)
- others.remove(primary)
-
- log.info('writing initial objects')
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- #create few objects
- for i in range(100):
- rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
-
- manager.wait_for_clean()
-
- #kill other osds except primary
- log.info('killing other osds except primary')
- for i in others:
- manager.kill_osd(i)
- for i in others:
- manager.mark_down_osd(i)
-
-
- for i in range(100):
- rados(ctx, mon, ['-p', 'foo', 'put', 'new_%d' % i, dummyfile1])
-
- #kill primary osd
- manager.kill_osd(primary)
- manager.mark_down_osd(primary)
-
- #revive other 2 osds
- for i in others:
- manager.revive_osd(i)
-
- #make sure that pg is down
- #Assuming pg number for single pg pool will start from 0
- pgnum=0
- pgstr = manager.get_pgid(pool, pgnum)
- stats = manager.get_single_pg_stats(pgstr)
- print stats['state']
-
- timeout=60
- start=time.time()
-
- while 'down' not in stats['state']:
- assert time.time() - start < timeout, \
- 'failed to reach down state before timeout expired'
- stats = manager.get_single_pg_stats(pgstr)
-
- #mark primary as lost
- manager.raw_cluster_cmd('osd', 'lost', '%d' % primary,\
- '--yes-i-really-mean-it')
-
-
- #expect the pg status to be active+undersized+degraded
- #pg should recover and become active+clean within timeout
- stats = manager.get_single_pg_stats(pgstr)
- print stats['state']
-
- timeout=10
- start=time.time()
-
- while manager.get_num_down():
- assert time.time() - start < timeout, \
- 'failed to recover before timeout expired'
+++ /dev/null
-"""
-Rest Api
-"""
-import logging
-import contextlib
-import time
-
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology.orchestra import run
-from teuthology.orchestra.daemon import DaemonGroup
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def run_rest_api_daemon(ctx, api_clients):
- """
- Wrapper starts the rest api daemons
- """
- if not hasattr(ctx, 'daemons'):
- ctx.daemons = DaemonGroup()
- remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
- for rems, roles in remotes.iteritems():
- for whole_id_ in roles:
- if whole_id_ in api_clients:
- id_ = whole_id_[len('clients'):]
- run_cmd = [
- 'sudo',
- 'daemon-helper',
- 'kill',
- 'ceph-rest-api',
- '-n',
- 'client.rest{id}'.format(id=id_), ]
- cl_rest_id = 'client.rest{id}'.format(id=id_)
- ctx.daemons.add_daemon(rems, 'restapi',
- cl_rest_id,
- args=run_cmd,
- logger=log.getChild(cl_rest_id),
- stdin=run.PIPE,
- wait=False,
- )
- for i in range(1, 12):
- log.info('testing for ceph-rest-api try {0}'.format(i))
- run_cmd = [
- 'wget',
- '-O',
- '/dev/null',
- '-q',
- 'http://localhost:5000/api/v0.1/status'
- ]
- proc = rems.run(
- args=run_cmd,
- check_status=False
- )
- if proc.exitstatus == 0:
- break
- time.sleep(5)
- if proc.exitstatus != 0:
- raise RuntimeError('Cannot contact ceph-rest-api')
- try:
- yield
-
- finally:
- """
- TO DO: destroy daemons started -- modify iter_daemons_of_role
- """
- teuthology.stop_daemons_of_type(ctx, 'restapi')
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Start up rest-api.
-
-    To start on all clients::
-
- tasks:
- - ceph:
- - rest-api:
-
- To only run on certain clients::
-
- tasks:
- - ceph:
- - rest-api: [client.0, client.3]
-
- or
-
- tasks:
- - ceph:
- - rest-api:
- client.0:
- client.3:
-
- The general flow of things here is:
- 1. Find clients on which rest-api is supposed to run (api_clients)
- 2. Generate keyring values
- 3. Start up ceph-rest-api daemons
- On cleanup:
- 4. Stop the daemons
- 5. Delete keyring value files.
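-
-    As an illustration, for a client.0 role this appends something like the
-    following to /etc/ceph/ceph.conf (values generated by the task):
-
-        [client.rest0]
-        restapi keyring = /etc/ceph/ceph.client.rest0.keyring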
- """
- api_clients = []
- remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
- log.info(remotes)
-    if config is None:
- api_clients = ['client.{id}'.format(id=id_)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- else:
- api_clients = config
- log.info(api_clients)
- testdir = teuthology.get_testdir(ctx)
- coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
- for rems, roles in remotes.iteritems():
- for whole_id_ in roles:
- if whole_id_ in api_clients:
- id_ = whole_id_[len('client.'):]
- keyring = '/etc/ceph/ceph.client.rest{id}.keyring'.format(
- id=id_)
- rems.run(
- args=[
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- coverage_dir,
- 'ceph-authtool',
- '--create-keyring',
- '--gen-key',
- '--name=client.rest{id}'.format(id=id_),
- '--set-uid=0',
- '--cap', 'mon', 'allow *',
- '--cap', 'osd', 'allow *',
- '--cap', 'mds', 'allow',
- keyring,
- run.Raw('&&'),
- 'sudo',
- 'chmod',
- '0644',
- keyring,
- ],
- )
- rems.run(
- args=[
- 'sudo',
- 'sh',
- '-c',
- run.Raw("'"),
- "echo",
- '[client.rest{id}]'.format(id=id_),
- run.Raw('>>'),
- "/etc/ceph/ceph.conf",
- run.Raw("'")
- ]
- )
- rems.run(
- args=[
- 'sudo',
- 'sh',
- '-c',
- run.Raw("'"),
- 'echo',
- 'restapi',
- 'keyring',
- '=',
- '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_),
- run.Raw('>>'),
- '/etc/ceph/ceph.conf',
- run.Raw("'"),
- ]
- )
- rems.run(
- args=[
- 'sudo',
- 'ceph',
- 'auth',
- 'import',
- '-i',
- '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_),
- ]
- )
- with contextutil.nested(
- lambda: run_rest_api_daemon(ctx=ctx, api_clients=api_clients),):
- yield
-
+++ /dev/null
-"""
-Daemon restart
-"""
-import logging
-import pipes
-
-from teuthology import misc as teuthology
-from teuthology.orchestra import run as tor
-
-from teuthology.orchestra import run
-log = logging.getLogger(__name__)
-
-def restart_daemon(ctx, config, role, id_, *args):
- """
- Handle restart (including the execution of the command parameters passed)
- """
- log.info('Restarting {r}.{i} daemon...'.format(r=role, i=id_))
- daemon = ctx.daemons.get_daemon(role, id_)
- log.debug('Waiting for exit of {r}.{i} daemon...'.format(r=role, i=id_))
- try:
- daemon.wait_for_exit()
- except tor.CommandFailedError as e:
- log.debug('Command Failed: {e}'.format(e=e))
- if len(args) > 0:
- confargs = ['--{k}={v}'.format(k=k, v=v) for k,v in zip(args[0::2], args[1::2])]
- log.debug('Doing restart of {r}.{i} daemon with args: {a}...'.format(r=role, i=id_, a=confargs))
- daemon.restart_with_args(confargs)
- else:
- log.debug('Doing restart of {r}.{i} daemon...'.format(r=role, i=id_))
- daemon.restart()
-
-def get_tests(ctx, config, role, remote, testdir):
- """Download restart tests"""
- srcdir = '{tdir}/restart.{role}'.format(tdir=testdir, role=role)
-
- refspec = config.get('branch')
- if refspec is None:
- refspec = config.get('sha1')
- if refspec is None:
- refspec = config.get('tag')
- if refspec is None:
- refspec = 'HEAD'
- log.info('Pulling restart qa/workunits from ref %s', refspec)
-
- remote.run(
- logger=log.getChild(role),
- args=[
- 'mkdir', '--', srcdir,
- run.Raw('&&'),
- 'git',
- 'archive',
- '--remote=git://git.ceph.com/ceph.git',
- '%s:qa/workunits' % refspec,
- run.Raw('|'),
- 'tar',
- '-C', srcdir,
- '-x',
- '-f-',
- run.Raw('&&'),
- 'cd', '--', srcdir,
- run.Raw('&&'),
- 'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
- run.Raw('&&'),
- 'find', '-executable', '-type', 'f', '-printf', r'%P\0'.format(srcdir=srcdir),
- run.Raw('>{tdir}/restarts.list'.format(tdir=testdir)),
- ],
- )
- restarts = sorted(teuthology.get_file(
- remote,
- '{tdir}/restarts.list'.format(tdir=testdir)).split('\0'))
- return (srcdir, restarts)
-
-def task(ctx, config):
- """
- Execute commands and allow daemon restart with config options.
- Each process executed can output to stdout restart commands of the form:
- restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>
- This will restart the daemon <role>.<id> with the specified config values once
- by modifying the conf file with those values, and then replacing the old conf file
- once the daemon is restarted.
-    This task does not kill a running daemon; it assumes the daemon will abort on an
- assert specified in the config.
-
- tasks:
- - install:
- - ceph:
- - restart:
- exec:
- client.0:
- - test_backtraces.py
-
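-    As an illustration (the config key below is hypothetical), a script
-    could emit:
-
-        restart osd 0 osd_op_thread_timeout 60
-        done
-
-    which restarts osd.0 with --osd_op_thread_timeout=60; the task then
-    writes 'restarted' back to the script's stdin and waits for the next
-    command.
-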
- """
-    assert isinstance(config, dict), "restart task got invalid config"
-
- testdir = teuthology.get_testdir(ctx)
-
- try:
- assert 'exec' in config, "config requires exec key with <role>: <command> entries"
- for role, task in config['exec'].iteritems():
- log.info('restart for role {r}'.format(r=role))
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
- srcdir, restarts = get_tests(ctx, config, role, remote, testdir)
- log.info('Running command on role %s host %s', role, remote.name)
- spec = '{spec}'.format(spec=task[0])
- log.info('Restarts list: %s', restarts)
- log.info('Spec is %s', spec)
- to_run = [w for w in restarts if w == task or w.find(spec) != -1]
- log.info('To run: %s', to_run)
- for c in to_run:
- log.info('Running restart script %s...', c)
- args = [
- run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
- ]
- env = config.get('env')
- if env is not None:
- for var, val in env.iteritems():
- quoted_val = pipes.quote(val)
- env_arg = '{var}={val}'.format(var=var, val=quoted_val)
- args.append(run.Raw(env_arg))
- args.extend([
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- '{srcdir}/{c}'.format(
- srcdir=srcdir,
- c=c,
- ),
- ])
- proc = remote.run(
- args=args,
- stdout=tor.PIPE,
- stdin=tor.PIPE,
- stderr=log,
- wait=False,
- )
- log.info('waiting for a command from script')
- while True:
- l = proc.stdout.readline()
- if not l or l == '':
- break
- log.debug('script command: {c}'.format(c=l))
- ll = l.strip()
- cmd = ll.split(' ')
- if cmd[0] == "done":
- break
-                    assert cmd[0] == 'restart', "script sent invalid command request to restart task"
- # cmd should be: restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>
- # or to clear, just: restart <role> <id>
- restart_daemon(ctx, config, cmd[1], cmd[2], *cmd[3:])
- proc.stdin.writelines(['restarted\n'])
- proc.stdin.flush()
- try:
- proc.wait()
- except tor.CommandFailedError:
- raise Exception('restart task got non-zero exit status from script: {s}'.format(s=c))
- finally:
- log.info('Finishing %s on %s...', task, role)
- remote.run(
- logger=log.getChild(role),
- args=[
- 'rm', '-rf', '--', '{tdir}/restarts.list'.format(tdir=testdir), srcdir,
- ],
- )
+++ /dev/null
-"""
-rgw routines
-"""
-import argparse
-import contextlib
-import json
-import logging
-import os
-import errno
-import util.rgw as rgw_utils
-
-from requests.packages.urllib3 import PoolManager
-from requests.packages.urllib3.util import Retry
-
-from cStringIO import StringIO
-
-from teuthology.orchestra import run
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology.orchestra.run import CommandFailedError
-from util.rgw import rgwadmin
-from util.rados import (rados, create_ec_pool,
- create_replicated_pool,
- create_cache_pool)
-
-log = logging.getLogger(__name__)
-
-def get_config_master_client(ctx, config, regions):
-
- role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
- for client, c_config in config.iteritems()])
- log.debug('roles_zones = %r', role_zones)
- region_info = dict([
- (region_name, extract_region_info(region_name, r_config))
- for region_name, r_config in regions.iteritems()])
-
- # read master zonegroup and master_zone
- for zonegroup, zg_info in region_info.iteritems():
- if zg_info['is_master']:
- master_zonegroup = zonegroup
- master_zone = zg_info['master_zone']
- break
-
- for client in config.iterkeys():
- (zonegroup, zone, zone_info) = role_zones[client]
- if zonegroup == master_zonegroup and zone == master_zone:
- return client
-
- return None
-
-@contextlib.contextmanager
-def create_apache_dirs(ctx, config, on_client = None, except_client = None):
- """
- Remotely create apache directories. Delete when finished.
- """
- log.info('Creating apache directories...')
- log.debug('client is %r', on_client)
- testdir = teuthology.get_testdir(ctx)
- clients_to_create_as = [on_client]
- if on_client is None:
- clients_to_create_as = config.keys()
- for client in clients_to_create_as:
- if client == except_client:
- continue
- ctx.cluster.only(client).run(
- args=[
- 'mkdir',
- '-p',
- '{tdir}/apache/htdocs.{client}'.format(tdir=testdir,
- client=client),
- '{tdir}/apache/tmp.{client}/fastcgi_sock'.format(
- tdir=testdir,
- client=client),
- run.Raw('&&'),
- 'mkdir',
- '{tdir}/archive/apache.{client}'.format(tdir=testdir,
- client=client),
- ],
- )
- try:
- yield
- finally:
- log.info('Cleaning up apache directories...')
- for client in clients_to_create_as:
- ctx.cluster.only(client).run(
- args=[
- 'rm',
- '-rf',
- '{tdir}/apache/tmp.{client}'.format(tdir=testdir,
- client=client),
- run.Raw('&&'),
- 'rmdir',
- '{tdir}/apache/htdocs.{client}'.format(tdir=testdir,
- client=client),
- ],
- )
- for client in clients_to_create_as:
- ctx.cluster.only(client).run(
- args=[
- 'rmdir',
- '{tdir}/apache'.format(tdir=testdir),
- ],
- check_status=False, # only need to remove once per host
- )
-
-
-def _use_uds_with_fcgi(remote):
- """
- Returns true if this node supports the usage of
- unix domain sockets with mod_proxy_fcgi.
-
- FIXME: returns False always for now until we know for
- sure what distros will support UDS. RHEL 7.0 is the only one
- currently I know of, but we can't install that version of apache
- yet in the labs.
- """
- return False
-
-
-@contextlib.contextmanager
-def ship_apache_configs(ctx, config, role_endpoints, on_client = None,
- except_client = None):
- """
-    Ship apache config and rgw.fcgi to all clients. Clean up on termination
- """
- assert isinstance(config, dict)
- assert isinstance(role_endpoints, dict)
- testdir = teuthology.get_testdir(ctx)
- log.info('Shipping apache config and rgw.fcgi...')
- src = os.path.join(os.path.dirname(__file__), 'apache.conf.template')
- clients_to_create_as = [on_client]
- if on_client is None:
- clients_to_create_as = config.keys()
- for client in clients_to_create_as:
- if client == except_client:
- continue
- (remote,) = ctx.cluster.only(client).remotes.keys()
- system_type = teuthology.get_system_type(remote)
- conf = config.get(client)
- if not conf:
- conf = {}
- idle_timeout = conf.get('idle_timeout', ctx.rgw.default_idle_timeout)
- if system_type == 'deb':
- mod_path = '/usr/lib/apache2/modules'
- print_continue = 'on'
- user = 'www-data'
- group = 'www-data'
- apache24_modconfig = '''
- IncludeOptional /etc/apache2/mods-available/mpm_event.conf
- IncludeOptional /etc/apache2/mods-available/mpm_event.load
-'''
- else:
- mod_path = '/usr/lib64/httpd/modules'
- print_continue = 'off'
- user = 'apache'
- group = 'apache'
- apache24_modconfig = \
- 'IncludeOptional /etc/httpd/conf.modules.d/00-mpm.conf'
- host, port = role_endpoints[client]
-
- # decide if we want to use mod_fastcgi or mod_proxy_fcgi
- template_dir = os.path.dirname(__file__)
- fcgi_config = os.path.join(template_dir,
- 'mod_proxy_fcgi.tcp.conf.template')
- if ctx.rgw.use_fastcgi:
- log.info("Apache is configured to use mod_fastcgi")
- fcgi_config = os.path.join(template_dir,
- 'mod_fastcgi.conf.template')
- elif _use_uds_with_fcgi(remote):
- log.info("Apache is configured to use mod_proxy_fcgi with UDS")
- fcgi_config = os.path.join(template_dir,
- 'mod_proxy_fcgi.uds.conf.template')
- else:
- log.info("Apache is configured to use mod_proxy_fcgi with TCP")
-
- with file(fcgi_config, 'rb') as f:
- fcgi_config = f.read()
- with file(src, 'rb') as f:
- conf = f.read() + fcgi_config
- conf = conf.format(
- testdir=testdir,
- mod_path=mod_path,
- print_continue=print_continue,
- host=host,
- port=port,
- client=client,
- idle_timeout=idle_timeout,
- user=user,
- group=group,
- apache24_modconfig=apache24_modconfig,
- )
- teuthology.write_file(
- remote=remote,
- path='{tdir}/apache/apache.{client}.conf'.format(
- tdir=testdir,
- client=client),
- data=conf,
- )
- rgw_options = []
- if ctx.rgw.use_fastcgi or _use_uds_with_fcgi(remote):
- rgw_options = [
- '--rgw-socket-path',
- '{tdir}/apache/tmp.{client}/fastcgi_sock/rgw_sock'.format(
- tdir=testdir,
- client=client
- ),
- '--rgw-frontends',
- 'fastcgi',
- ]
- else:
- rgw_options = [
- '--rgw-socket-path', '""',
- '--rgw-print-continue', 'false',
- '--rgw-frontends',
- 'fastcgi socket_port=9000 socket_host=0.0.0.0',
- ]
-
- teuthology.write_file(
- remote=remote,
- path='{tdir}/apache/htdocs.{client}/rgw.fcgi'.format(
- tdir=testdir,
- client=client),
- data="""#!/bin/sh
-ulimit -c unlimited
-exec radosgw -f -n {client} -k /etc/ceph/ceph.{client}.keyring {rgw_options}
-
-""".format(tdir=testdir, client=client, rgw_options=" ".join(rgw_options))
- )
- remote.run(
- args=[
- 'chmod',
- 'a=rx',
- '{tdir}/apache/htdocs.{client}/rgw.fcgi'.format(tdir=testdir,
- client=client),
- ],
- )
- try:
- yield
- finally:
- log.info('Removing apache config...')
- for client in clients_to_create_as:
- ctx.cluster.only(client).run(
- args=[
- 'rm',
- '-f',
- '{tdir}/apache/apache.{client}.conf'.format(tdir=testdir,
- client=client),
- run.Raw('&&'),
- 'rm',
- '-f',
- '{tdir}/apache/htdocs.{client}/rgw.fcgi'.format(
- tdir=testdir,
- client=client),
- ],
- )
-
-
-@contextlib.contextmanager
-def start_rgw(ctx, config, on_client = None, except_client = None):
- """
- Start rgw on remote sites.
- """
- log.info('Starting rgw...')
- log.debug('client %r', on_client)
- clients_to_run = [on_client]
- if on_client is None:
- clients_to_run = config.keys()
- testdir = teuthology.get_testdir(ctx)
- for client in clients_to_run:
- if client == except_client:
- continue
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
- zone = rgw_utils.zone_for_client(ctx, client)
- log.debug('zone %s', zone)
- client_config = config.get(client)
- if client_config is None:
- client_config = {}
- log.info("rgw %s config is %s", client, client_config)
- id_ = client.split('.', 1)[1]
- log.info('client {client} is id {id}'.format(client=client, id=id_))
- cmd_prefix = [
- 'sudo',
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'daemon-helper',
- 'term',
- ]
-
- rgw_cmd = ['radosgw']
-
- if ctx.rgw.frontend == 'apache':
- if ctx.rgw.use_fastcgi or _use_uds_with_fcgi(remote):
- rgw_cmd.extend([
- '--rgw-socket-path',
- '{tdir}/apache/tmp.{client}/fastcgi_sock/rgw_sock'.format(
- tdir=testdir,
- client=client,
- ),
- '--rgw-frontends',
- 'fastcgi',
- ])
- else:
- # for mod_proxy_fcgi, using tcp
- rgw_cmd.extend([
- '--rgw-socket-path', '',
- '--rgw-print-continue', 'false',
- '--rgw-frontends',
- 'fastcgi socket_port=9000 socket_host=0.0.0.0',
- ])
-
- elif ctx.rgw.frontend == 'civetweb':
- host, port = ctx.rgw.role_endpoints[client]
- rgw_cmd.extend([
- '--rgw-frontends',
- 'civetweb port={port}'.format(port=port),
- ])
-
- if zone is not None:
- rgw_cmd.extend(['--rgw-zone', zone])
-
- rgw_cmd.extend([
- '-n', client,
- '-k', '/etc/ceph/ceph.{client}.keyring'.format(client=client),
- '--log-file',
- '/var/log/ceph/rgw.{client}.log'.format(client=client),
- '--rgw_ops_log_socket_path',
- '{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir,
- client=client),
- '--foreground',
- run.Raw('|'),
- 'sudo',
- 'tee',
- '/var/log/ceph/rgw.{client}.stdout'.format(tdir=testdir,
- client=client),
- run.Raw('2>&1'),
- ])
-
- if client_config.get('valgrind'):
- cmd_prefix = teuthology.get_valgrind_args(
- testdir,
- client,
- cmd_prefix,
- client_config.get('valgrind')
- )
-
- run_cmd = list(cmd_prefix)
- run_cmd.extend(rgw_cmd)
-
- ctx.daemons.add_daemon(
- remote, 'rgw', client,
- args=run_cmd,
- logger=log.getChild(client),
- stdin=run.PIPE,
- wait=False,
- )
-
- # XXX: add_daemon() doesn't let us wait until radosgw finishes startup
- # use a connection pool with retry/backoff to poll each gateway until it starts listening
- http = PoolManager(retries=Retry(connect=8, backoff_factor=1))
- for client in clients_to_run:
- if client == except_client:
- continue
- host, port = ctx.rgw.role_endpoints[client]
- endpoint = 'http://{host}:{port}/'.format(host=host, port=port)
- log.info('Polling {client} until it starts accepting connections on {endpoint}'.format(client=client, endpoint=endpoint))
- http.request('GET', endpoint)
-
- try:
- yield
- finally:
- teuthology.stop_daemons_of_type(ctx, 'rgw')
- for client in config.iterkeys():
- ctx.cluster.only(client).run(
- args=[
- 'rm',
- '-f',
- '{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir,
- client=client),
- ],
- )
-
-
-@contextlib.contextmanager
-def start_apache(ctx, config, on_client = None, except_client = None):
- """
- Start apache on remote sites.
- """
- log.info('Starting apache...')
- testdir = teuthology.get_testdir(ctx)
- apaches = {}
- clients_to_run = [on_client]
- if on_client is None:
- clients_to_run = config.keys()
- for client in clients_to_run:
- if client == except_client:
- continue
- (remote,) = ctx.cluster.only(client).remotes.keys()
- system_type = teuthology.get_system_type(remote)
- if system_type == 'deb':
- apache_name = 'apache2'
- else:
- try:
- remote.run(
- args=[
- 'stat',
- '/usr/sbin/httpd.worker',
- ],
- )
- apache_name = '/usr/sbin/httpd.worker'
- except CommandFailedError:
- apache_name = '/usr/sbin/httpd'
-
- proc = remote.run(
- args=[
- 'adjust-ulimits',
- 'daemon-helper',
- 'kill',
- apache_name,
- '-X',
- '-f',
- '{tdir}/apache/apache.{client}.conf'.format(tdir=testdir,
- client=client),
- ],
- logger=log.getChild(client),
- stdin=run.PIPE,
- wait=False,
- )
- apaches[client] = proc
-
- try:
- yield
- finally:
- log.info('Stopping apache...')
- for client, proc in apaches.iteritems():
- proc.stdin.close()
-
- run.wait(apaches.itervalues())
-
-
-def extract_user_info(client_config):
- """
- Extract user info from the client config specified. Returns a dict
- that includes system key information.
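-
-    An illustrative (hypothetical) client config fragment:
-
-        system user:
-          name: foo-system
-          access key: XXXXXXXX
-          secret key: YYYYYYYY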
- """
-    # if there is no system user, or no name for that user, return None
- if ('system user' not in client_config or
- 'name' not in client_config['system user']):
- return None
-
- user_info = dict()
- user_info['system_key'] = dict(
- user=client_config['system user']['name'],
- access_key=client_config['system user']['access key'],
- secret_key=client_config['system user']['secret key'],
- )
- return user_info
-
-
-def extract_zone_info(ctx, client, client_config):
- """
- Get zone information.
-    :param client: client role name
- :param client_config: dictionary of client configuration information
- :returns: zone extracted from client and client_config information
- """
- ceph_config = ctx.ceph['ceph'].conf.get('global', {})
- ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
- ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
- for key in ['rgw zone', 'rgw region', 'rgw zone root pool']:
- assert key in ceph_config, \
- 'ceph conf must contain {key} for {client}'.format(key=key,
- client=client)
- region = ceph_config['rgw region']
- zone = ceph_config['rgw zone']
- zone_info = dict()
- for key in ['rgw control pool', 'rgw gc pool', 'rgw log pool',
- 'rgw intent log pool', 'rgw usage log pool',
- 'rgw user keys pool', 'rgw user email pool',
- 'rgw user swift pool', 'rgw user uid pool',
- 'rgw domain root']:
- new_key = key.split(' ', 1)[1]
- new_key = new_key.replace(' ', '_')
-
- if key in ceph_config:
- value = ceph_config[key]
- log.debug('{key} specified in ceph_config ({val})'.format(
- key=key, val=value))
- zone_info[new_key] = value
- else:
- zone_info[new_key] = '.' + region + '.' + zone + '.' + new_key
-
- index_pool = '.' + region + '.' + zone + '.' + 'index_pool'
- data_pool = '.' + region + '.' + zone + '.' + 'data_pool'
- data_extra_pool = '.' + region + '.' + zone + '.' + 'data_extra_pool'
- compression_type = ceph_config.get('rgw compression type', '')
-
- zone_info['placement_pools'] = [{'key': 'default_placement',
- 'val': {'index_pool': index_pool,
- 'data_pool': data_pool,
- 'data_extra_pool': data_extra_pool,
- 'compression': compression_type}
- }]
-
- # these keys are meant for the zones argument in the region info. We
- # insert them into zone_info with a different format and then remove them
- # in the fill_in_endpoints() method
- for key in ['rgw log meta', 'rgw log data']:
- if key in ceph_config:
- zone_info[key] = ceph_config[key]
-
- return region, zone, zone_info
-
-
-def extract_region_info(region, region_info):
- """
- Extract region information from the region_info parameter, using get
- to set default values.
-
- :param region: name of the region
- :param region_info: region information (in dictionary form).
- :returns: dictionary of region information set from region_info, using
- default values for missing fields.
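-
-    An illustrative region_info fragment (hypothetical values):
-
-        zones: [r0z0, r0z1]
-        api name: r0
-        is master: true
-        master zone: r0z0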
- """
- assert isinstance(region_info['zones'], list) and region_info['zones'], \
- 'zones must be a non-empty list'
- return dict(
- name=region,
- api_name=region_info.get('api name', region),
- is_master=region_info.get('is master', False),
- log_meta=region_info.get('log meta', False),
- log_data=region_info.get('log data', False),
- master_zone=region_info.get('master zone', region_info['zones'][0]),
- placement_targets=region_info.get('placement targets',
- [{'name': 'default_placement',
- 'tags': []}]),
- default_placement=region_info.get('default placement',
- 'default_placement'),
- )
-
-
-def assign_ports(ctx, config):
- """
-    Assign port numbers starting with port 7280.
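-
-    Returns a mapping like the following (hostnames are hypothetical):
-
-        {'client.0': ('host-a', 7280), 'client.1': ('host-b', 7281)}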
- """
- port = 7280
- role_endpoints = {}
- for remote, roles_for_host in ctx.cluster.remotes.iteritems():
- for role in roles_for_host:
- if role in config:
- role_endpoints[role] = (remote.name.split('@')[1], port)
- port += 1
-
- return role_endpoints
-
-
-def fill_in_endpoints(region_info, role_zones, role_endpoints):
- """
- Iterate through the list of role_endpoints, filling in zone information
-
- :param region_info: region data
- :param role_zones: region and zone information.
- :param role_endpoints: endpoints being used
- """
- for role, (host, port) in role_endpoints.iteritems():
- region, zone, zone_info, _ = role_zones[role]
- host, port = role_endpoints[role]
- endpoint = 'http://{host}:{port}/'.format(host=host, port=port)
- # check if the region specified under client actually exists
- # in region_info (it should, if properly configured).
- # If not, throw a reasonable error
- if region not in region_info:
- raise Exception(
- 'Region: {region} was specified but no corresponding'
- ' entry was found under \'regions\''.format(region=region))
-
- region_conf = region_info[region]
- region_conf.setdefault('endpoints', [])
- region_conf['endpoints'].append(endpoint)
-
- # this is the payload for the 'zones' field in the region field
- zone_payload = dict()
- zone_payload['endpoints'] = [endpoint]
- zone_payload['name'] = zone
-
- # Pull the log meta and log data settings out of zone_info, if they
- # exist, then pop them as they don't actually belong in the zone info
- for key in ['rgw log meta', 'rgw log data']:
- new_key = key.split(' ', 1)[1]
- new_key = new_key.replace(' ', '_')
-
- if key in zone_info:
- value = zone_info.pop(key)
- else:
- value = 'false'
-
- zone_payload[new_key] = value
-
- region_conf.setdefault('zones', [])
- region_conf['zones'].append(zone_payload)
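- # Sketch of the outcome (values hypothetical): each region entry gains the
- # endpoints of its zones plus a zone payload, roughly
- #   region_info['foo']['endpoints'] == ['http://host1.example.com:7280/']
- #   region_info['foo']['zones'] == [{'endpoints': ['http://host1.example.com:7280/'],
- #                                    'name': 'foo-1',
- #                                    'log_meta': 'true', 'log_data': 'true'}]
- # where log_meta/log_data fall back to the string 'false' when unset in ceph.conf.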
-
-
-@contextlib.contextmanager
-def configure_users_for_client(ctx, config, client, everywhere=False):
- """
- Create users by remotely running rgwadmin commands using extracted
- user information.
- """
- log.info('Configuring users...')
- log.info('for client %s', client)
- log.info('everywhere %s', everywhere)
-
- # For data sync the master zones and regions must have the
- # system users of the secondary zones. To keep this simple,
- # just create the system users on every client if regions are
- # configured.
- clients_to_create_as = [client]
- if everywhere:
- clients_to_create_as = config.keys()
-
- # extract the user info and append it to the payload tuple for the given
- # client
- for client, c_config in config.iteritems():
- if not c_config:
- continue
- user_info = extract_user_info(c_config)
- if not user_info:
- continue
-
- for client_name in clients_to_create_as:
- log.debug('Creating user {user} on {client}'.format(
- user=user_info['system_key']['user'], client=client_name))
- rgwadmin(ctx, client_name,
- cmd=[
- 'user', 'create',
- '--uid', user_info['system_key']['user'],
- '--access-key', user_info['system_key']['access_key'],
- '--secret', user_info['system_key']['secret_key'],
- '--display-name', user_info['system_key']['user'],
- '--system',
- ],
- check_status=True,
- )
- yield
-
-@contextlib.contextmanager
-def configure_users(ctx, config, everywhere=False):
- """
- Create users by remotely running rgwadmin commands using extracted
- user information.
- """
- log.info('Configuring users...')
-
- # extract the user info and append it to the payload tuple for the given
- # client
- for client, c_config in config.iteritems():
- if not c_config:
- continue
- user_info = extract_user_info(c_config)
- if not user_info:
- continue
-
- # For data sync the master zones and regions must have the
- # system users of the secondary zones. To keep this simple,
- # just create the system users on every client if regions are
- # configured.
- clients_to_create_as = [client]
- if everywhere:
- clients_to_create_as = config.keys()
- for client_name in clients_to_create_as:
- log.debug('Creating user {user} on {client}'.format(
- user=user_info['system_key']['user'], client=client_name))
- rgwadmin(ctx, client_name,
- cmd=[
- 'user', 'create',
- '--uid', user_info['system_key']['user'],
- '--access-key', user_info['system_key']['access_key'],
- '--secret', user_info['system_key']['secret_key'],
- '--display-name', user_info['system_key']['user'],
- '--system',
- ],
- check_status=True,
- )
-
- yield
-
-@contextlib.contextmanager
-def create_nonregion_pools(ctx, config, regions):
- """Create replicated or erasure coded data pools for rgw."""
- if regions:
- yield
- return
-
- log.info('creating data pools')
- for client in config.keys():
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
- data_pool = '.rgw.buckets'
- if ctx.rgw.ec_data_pool:
- create_ec_pool(remote, data_pool, client, 64,
- ctx.rgw.erasure_code_profile)
- else:
- create_replicated_pool(remote, data_pool, 64)
- if ctx.rgw.cache_pools:
- create_cache_pool(remote, data_pool, data_pool + '.cache', 64,
- 64*1024*1024)
- yield
-
-@contextlib.contextmanager
-def configure_multisite_regions_and_zones(ctx, config, regions, role_endpoints, realm, master_client):
- """
- Configure multisite regions and zones from rados and rgw.
- """
- if not regions:
- log.debug(
- 'In rgw.configure_multisite_regions_and_zones() and regions is None. '
- 'Bailing')
- yield
- return
-
- if not realm:
- log.debug(
- 'In rgw.configure_multisite_regions_and_zones() and realm is None. '
- 'Bailing')
- yield
- return
-
- log.info('Configuring multisite regions and zones...')
-
- log.debug('config is %r', config)
- log.debug('regions are %r', regions)
- log.debug('role_endpoints = %r', role_endpoints)
- log.debug('realm is %r', realm)
- # extract the zone info
- role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
- for client, c_config in config.iteritems()])
- log.debug('role_zones = %r', role_zones)
-
- # extract the user info and append it to the payload tuple for the given
- # client
- for client, c_config in config.iteritems():
- if not c_config:
- user_info = None
- else:
- user_info = extract_user_info(c_config)
-
- (region, zone, zone_info) = role_zones[client]
- role_zones[client] = (region, zone, zone_info, user_info)
-
- region_info = dict([
- (region_name, extract_region_info(region_name, r_config))
- for region_name, r_config in regions.iteritems()])
-
- fill_in_endpoints(region_info, role_zones, role_endpoints)
-
- # clear out the old defaults
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- # read master zonegroup and master_zone
- for zonegroup, zg_info in region_info.iteritems():
- if zg_info['is_master']:
- master_zonegroup = zonegroup
- master_zone = zg_info['master_zone']
- break
-
- log.debug('master zonegroup = %r', master_zonegroup)
- log.debug('master zone = %r', master_zone)
- log.debug('master client = %r', master_client)
-
- rgwadmin(ctx, master_client,
- cmd=['realm', 'create', '--rgw-realm', realm, '--default'],
- check_status=True)
-
- for region, info in region_info.iteritems():
- region_json = json.dumps(info)
- log.debug('region info is: %s', region_json)
- rgwadmin(ctx, master_client,
- cmd=['zonegroup', 'set'],
- stdin=StringIO(region_json),
- check_status=True)
-
- rgwadmin(ctx, master_client,
- cmd=['zonegroup', 'default', '--rgw-zonegroup', master_zonegroup],
- check_status=True)
-
- for role, (zonegroup, zone, zone_info, user_info) in role_zones.iteritems():
- (remote,) = ctx.cluster.only(role).remotes.keys()
- for pool_info in zone_info['placement_pools']:
- remote.run(args=['sudo', 'ceph', 'osd', 'pool', 'create',
- pool_info['val']['index_pool'], '64', '64'])
- if ctx.rgw.ec_data_pool:
- create_ec_pool(remote, pool_info['val']['data_pool'],
- zone, 64, ctx.rgw.erasure_code_profile)
- else:
- create_replicated_pool(remote, pool_info['val']['data_pool'], 64)
-
- (zonegroup, zone, zone_info, user_info) = role_zones[master_client]
- zone_json = json.dumps(dict(zone_info.items() + user_info.items()))
- log.debug("zone info is: %r", zone_json)
- rgwadmin(ctx, master_client,
- cmd=['zone', 'set', '--rgw-zonegroup', zonegroup,
- '--rgw-zone', zone],
- stdin=StringIO(zone_json),
- check_status=True)
-
- rgwadmin(ctx, master_client,
- cmd=['-n', master_client, 'zone', 'default', zone],
- check_status=True)
-
- rgwadmin(ctx, master_client,
- cmd=['-n', master_client, 'period', 'update', '--commit'],
- check_status=True)
-
- yield
-
-def configure_compression_in_default_zone(ctx, config):
- ceph_config = ctx.ceph['ceph'].conf.get('global', {})
- ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
- for client, c_config in config.iteritems():
- ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
- key = 'rgw compression type'
- if key not in ceph_config:
- log.debug('No compression setting to enable')
- break
- compression = ceph_config[key]
- log.debug('Configuring compression type = %s', compression)
-
- # XXX: the 'default' zone and zonegroup aren't created until we run RGWRados::init_complete().
- # issue a 'radosgw-admin user list' command to trigger this
- rgwadmin(ctx, client, cmd=['user', 'list'], check_status=True)
-
- rgwadmin(ctx, client,
- cmd=['zone', 'placement', 'modify', '--rgw-zone', 'default',
- '--placement-id', 'default-placement', '--compression', compression],
- check_status=True)
- break # only the first client
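- # A hedged example of what drives this path: a ceph.conf fragment such as
- #   [client]
- #       rgw compression type = zlib
- # (zlib is just one plausible codec) would make the code above rewrite the
- # default-placement target of the 'default' zone with '--compression zlib'.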
-
-@contextlib.contextmanager
-def configure_regions_and_zones(ctx, config, regions, role_endpoints, realm):
- """
- Configure regions and zones from rados and rgw.
- """
- if not regions:
- log.debug(
- 'In rgw.configure_regions_and_zones() and regions is None. '
- 'Bailing')
- configure_compression_in_default_zone(ctx, config)
- yield
- return
-
- if not realm:
- log.debug(
- 'In rgw.configure_regions_and_zones() and realm is None. '
- 'Bailing')
- configure_compression_in_default_zone(ctx, config)
- yield
- return
-
- log.info('Configuring regions and zones...')
-
- log.debug('config is %r', config)
- log.debug('regions are %r', regions)
- log.debug('role_endpoints = %r', role_endpoints)
- log.debug('realm is %r', realm)
- # extract the zone info
- role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
- for client, c_config in config.iteritems()])
- log.debug('role_zones = %r', role_zones)
-
- # extract the user info and append it to the payload tuple for the given
- # client
- for client, c_config in config.iteritems():
- if not c_config:
- user_info = None
- else:
- user_info = extract_user_info(c_config)
-
- (region, zone, zone_info) = role_zones[client]
- role_zones[client] = (region, zone, zone_info, user_info)
-
- region_info = dict([
- (region_name, extract_region_info(region_name, r_config))
- for region_name, r_config in regions.iteritems()])
-
- fill_in_endpoints(region_info, role_zones, role_endpoints)
-
- # clear out the old defaults
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
- # removing these objects from .rgw.root and the per-zone root pools
- # may or may not matter
- rados(ctx, mon,
- cmd=['-p', '.rgw.root', 'rm', 'region_info.default'])
- rados(ctx, mon,
- cmd=['-p', '.rgw.root', 'rm', 'zone_info.default'])
-
- # read master zonegroup and master_zone
- for zonegroup, zg_info in region_info.iteritems():
- if zg_info['is_master']:
- master_zonegroup = zonegroup
- master_zone = zg_info['master_zone']
- break
-
- for client in config.iterkeys():
- (zonegroup, zone, zone_info, user_info) = role_zones[client]
- if zonegroup == master_zonegroup and zone == master_zone:
- master_client = client
- break
-
- log.debug('master zonegroup = %r', master_zonegroup)
- log.debug('master zone = %r', master_zone)
- log.debug('master client = %r', master_client)
- log.debug('config %r ', config)
-
- (ret, out) = rgwadmin(ctx, master_client,
- cmd=['realm', 'create', '--rgw-realm', realm, '--default'])
- log.debug('realm create ret %r exists %r', -ret, errno.EEXIST)
- assert ret == 0 or ret == -errno.EEXIST
- if ret == -errno.EEXIST:
- log.debug('realm %r exists', realm)
-
- for client in config.iterkeys():
- for role, (zonegroup, zone, zone_info, user_info) in role_zones.iteritems():
- rados(ctx, mon,
- cmd=['-p', zone_info['domain_root'],
- 'rm', 'region_info.default'])
- rados(ctx, mon,
- cmd=['-p', zone_info['domain_root'],
- 'rm', 'zone_info.default'])
-
- (remote,) = ctx.cluster.only(role).remotes.keys()
- for pool_info in zone_info['placement_pools']:
- remote.run(args=['sudo', 'ceph', 'osd', 'pool', 'create',
- pool_info['val']['index_pool'], '64', '64'])
- if ctx.rgw.ec_data_pool:
- create_ec_pool(remote, pool_info['val']['data_pool'],
- zone, 64, ctx.rgw.erasure_code_profile)
- else:
- create_replicated_pool(
- remote, pool_info['val']['data_pool'],
- 64)
- zone_json = json.dumps(dict(zone_info.items() + user_info.items()))
- log.debug('zone info is: %r', zone_json)
- rgwadmin(ctx, client,
- cmd=['zone', 'set', '--rgw-zonegroup', zonegroup,
- '--rgw-zone', zone],
- stdin=StringIO(zone_json),
- check_status=True)
-
- for region, info in region_info.iteritems():
- region_json = json.dumps(info)
- log.debug('region info is: %s', region_json)
- rgwadmin(ctx, client,
- cmd=['zonegroup', 'set'],
- stdin=StringIO(region_json),
- check_status=True)
- if info['is_master']:
- rgwadmin(ctx, client,
- cmd=['zonegroup', 'default', '--rgw-zonegroup', master_zonegroup],
- check_status=True)
-
- (zonegroup, zone, zone_info, user_info) = role_zones[client]
- rgwadmin(ctx, client,
- cmd=['zone', 'default', zone],
- check_status=True)
-
- rgwadmin(ctx, master_client,
- cmd=['-n', master_client, 'period', 'update', '--commit'],
- check_status=True)
-
- yield
-
-@contextlib.contextmanager
-def pull_configuration(ctx, config, regions, role_endpoints, realm, master_client):
- """
- Pull the realm configuration from the master client and commit the period on the other clients.
- """
- if not regions:
- log.debug(
- 'In rgw.pull_configuration() and regions is None. '
- 'Bailing')
- yield
- return
-
- if not realm:
- log.debug(
- 'In rgw.pull_configuration() and realm is None. '
- 'Bailing')
- yield
- return
-
- log.info('Pulling configuration...')
-
- log.debug('config is %r', config)
- log.debug('regions are %r', regions)
- log.debug('role_endpoints = %r', role_endpoints)
- log.debug('realm is %r', realm)
- log.debug('master client = %r', master_client)
-
- # extract the zone info
- role_zones = dict([(client, extract_zone_info(ctx, client, c_config))
- for client, c_config in config.iteritems()])
- log.debug('role_zones = %r', role_zones)
-
- # extract the user info and append it to the payload tuple for the given
- # client
- for client, c_config in config.iteritems():
- if not c_config:
- user_info = None
- else:
- user_info = extract_user_info(c_config)
-
- (region, zone, zone_info) = role_zones[client]
- role_zones[client] = (region, zone, zone_info, user_info)
-
- region_info = dict([
- (region_name, extract_region_info(region_name, r_config))
- for region_name, r_config in regions.iteritems()])
-
- fill_in_endpoints(region_info, role_zones, role_endpoints)
-
- for client in config.iterkeys():
- if client != master_client:
- host, port = role_endpoints[master_client]
- endpoint = 'http://{host}:{port}/'.format(host=host, port=port)
- log.debug("endpoint: %s", endpoint)
- rgwadmin(ctx, client,
- cmd=['-n', client, 'realm', 'pull', '--rgw-realm', realm, '--default', '--url',
- endpoint, '--access_key',
- user_info['system_key']['access_key'], '--secret',
- user_info['system_key']['secret_key']],
- check_status=True)
-
- (zonegroup, zone, zone_info, zone_user_info) = role_zones[client]
- zone_json = json.dumps(dict(zone_info.items() + zone_user_info.items()))
- log.debug("zone info is: %r"), zone_json
- rgwadmin(ctx, client,
- cmd=['zone', 'set', '--rgw-zonegroup', zonegroup,
- '--rgw-zone', zone],
- stdin=StringIO(zone_json),
- check_status=True)
-
- rgwadmin(ctx, client,
- cmd=['period', 'update', '--commit', '--url',
- endpoint, '--access_key',
- user_info['system_key']['access_key'], '--secret',
- user_info['system_key']['secret_key']],
- check_status=True)
-
- yield
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Either configure apache to run a rados gateway, or use the built-in
- civetweb server.
- Only one should be run per machine, since it uses a hard-coded port for
- now.
-
- For example, to run rgw on all clients::
-
- tasks:
- - ceph:
- - rgw:
-
- To only run on certain clients::
-
- tasks:
- - ceph:
- - rgw: [client.0, client.3]
-
- or
-
- tasks:
- - ceph:
- - rgw:
- client.0:
- client.3:
-
- You can adjust the idle timeout for fastcgi (default is 30 seconds)::
-
- tasks:
- - ceph:
- - rgw:
- client.0:
- idle_timeout: 90
-
- To run radosgw through valgrind::
-
- tasks:
- - ceph:
- - rgw:
- client.0:
- valgrind: [--tool=memcheck]
- client.3:
- valgrind: [--tool=memcheck]
-
- To use civetweb instead of apache::
-
- tasks:
- - ceph:
- - rgw:
- - client.0
- overrides:
- rgw:
- frontend: civetweb
-
- Note that without a modified fastcgi module (e.g. with the default
- one on CentOS), you must set 'rgw print continue = false' in ceph.conf::
-
- tasks:
- - ceph:
- conf:
- global:
- rgw print continue: false
- - rgw: [client.0]
-
- To use mod_proxy_fcgi instead of mod_fastcgi::
-
- overrides:
- rgw:
- use_fcgi: true
-
- To run rgws for multiple regions or zones, describe the regions
- and their zones in a regions section. The endpoints will be
- generated by this task. Each client must have a region, zone,
- and pools assigned in ceph.conf::
-
- tasks:
- - install:
- - ceph:
- conf:
- client.0:
- rgw region: foo
- rgw zone: foo-1
- rgw region root pool: .rgw.rroot.foo
- rgw zone root pool: .rgw.zroot.foo
- rgw log meta: true
- rgw log data: true
- client.1:
- rgw region: bar
- rgw zone: bar-master
- rgw region root pool: .rgw.rroot.bar
- rgw zone root pool: .rgw.zroot.bar
- rgw log meta: true
- rgw log data: true
- client.2:
- rgw region: bar
- rgw zone: bar-secondary
- rgw region root pool: .rgw.rroot.bar
- rgw zone root pool: .rgw.zroot.bar-secondary
- - rgw:
- default_idle_timeout: 30
- ec-data-pool: true
- erasure_code_profile:
- k: 2
- m: 1
- ruleset-failure-domain: osd
- realm: foo
- regions:
- foo:
- api name: api_name # default: region name
- is master: true # default: false
- master zone: foo-1 # default: first zone
- zones: [foo-1]
- log meta: true
- log data: true
- placement targets: [target1, target2] # default: []
- default placement: target2 # default: ''
- bar:
- api name: bar-api
- zones: [bar-master, bar-secondary]
- client.0:
- system user:
- name: foo-system
- access key: X2IYPSTY1072DDY1SJMC
- secret key: YIMHICpPvT+MhLTbSsiBJ1jQF15IFvJA8tgwJEcm
- client.1:
- system user:
- name: bar1
- access key: Y2IYPSTY1072DDY1SJMC
- secret key: XIMHICpPvT+MhLTbSsiBJ1jQF15IFvJA8tgwJEcm
- client.2:
- system user:
- name: bar2
- access key: Z2IYPSTY1072DDY1SJMC
- secret key: ZIMHICpPvT+MhLTbSsiBJ1jQF15IFvJA8tgwJEcm
- """
- if config is None:
- config = dict(('client.{id}'.format(id=id_), None)
- for id_ in teuthology.all_roles_of_type(
- ctx.cluster, 'client'))
- elif isinstance(config, list):
- config = dict((name, None) for name in config)
-
- overrides = ctx.config.get('overrides', {})
- teuthology.deep_merge(config, overrides.get('rgw', {}))
-
- regions = {}
- if 'regions' in config:
- # separate region info so only clients are keys in config
- regions = config['regions']
- del config['regions']
-
- role_endpoints = assign_ports(ctx, config)
- ctx.rgw = argparse.Namespace()
- ctx.rgw.role_endpoints = role_endpoints
- # stash the region info for later, since it was deleted from the config
- # structure
- ctx.rgw.regions = regions
-
- realm = None
- if 'realm' in config:
- # separate realm info so only clients are keys in config
- realm = config['realm']
- del config['realm']
- ctx.rgw.realm = realm
-
- ctx.rgw.ec_data_pool = False
- if 'ec-data-pool' in config:
- ctx.rgw.ec_data_pool = bool(config['ec-data-pool'])
- del config['ec-data-pool']
- ctx.rgw.erasure_code_profile = {}
- if 'erasure_code_profile' in config:
- ctx.rgw.erasure_code_profile = config['erasure_code_profile']
- del config['erasure_code_profile']
- ctx.rgw.default_idle_timeout = 30
- if 'default_idle_timeout' in config:
- ctx.rgw.default_idle_timeout = int(config['default_idle_timeout'])
- del config['default_idle_timeout']
- ctx.rgw.cache_pools = False
- if 'cache-pools' in config:
- ctx.rgw.cache_pools = bool(config['cache-pools'])
- del config['cache-pools']
-
- ctx.rgw.frontend = 'civetweb'
- if 'frontend' in config:
- ctx.rgw.frontend = config['frontend']
- del config['frontend']
-
- ctx.rgw.use_fastcgi = True
- if "use_fcgi" in config:
- ctx.rgw.use_fastcgi = False
- log.info("Using mod_proxy_fcgi instead of mod_fastcgi...")
- del config['use_fcgi']
-
- subtasks = [
- lambda: create_nonregion_pools(
- ctx=ctx, config=config, regions=regions),
- ]
-
- multisite = len(regions) > 1
-
- if not multisite:
- for zonegroup, zonegroup_info in regions.iteritems():
- log.debug("zonegroup_info =%r", zonegroup_info)
- if len(zonegroup_info['zones']) > 1:
- multisite = True
- break
-
- log.debug('multisite %s', multisite)
- multi_cluster = multisite and len(ctx.config['roles']) > 1
- log.debug('multi_cluster %s', multi_cluster)
- master_client = None
-
- if multi_cluster:
- log.debug('multi cluster run')
-
- master_client = get_config_master_client(ctx=ctx,
- config=config,
- regions=regions)
- log.debug('master_client %r', master_client)
- subtasks.extend([
- lambda: configure_multisite_regions_and_zones(
- ctx=ctx,
- config=config,
- regions=regions,
- role_endpoints=role_endpoints,
- realm=realm,
- master_client = master_client,
- )
- ])
-
- subtasks.extend([
- lambda: configure_users_for_client(
- ctx=ctx,
- config=config,
- client=master_client,
- everywhere=False,
- ),
- ])
-
- if ctx.rgw.frontend == 'apache':
- subtasks.insert(0,
- lambda: create_apache_dirs(ctx=ctx, config=config,
- on_client=master_client))
- subtasks.extend([
- lambda: ship_apache_configs(ctx=ctx, config=config,
- role_endpoints=role_endpoints, on_client=master_client),
- lambda: start_rgw(ctx=ctx, config=config, on_client=master_client),
- lambda: start_apache(ctx=ctx, config=config, on_client=master_client),
- ])
- elif ctx.rgw.frontend == 'civetweb':
- subtasks.extend([
- lambda: start_rgw(ctx=ctx, config=config, on_client=master_client),
- ])
- else:
- raise ValueError("frontend must be 'apache' or 'civetweb'")
-
- subtasks.extend([
- lambda: pull_configuration(ctx=ctx,
- config=config,
- regions=regions,
- role_endpoints=role_endpoints,
- realm=realm,
- master_client=master_client
- ),
- ])
-
- subtasks.extend([
- lambda: configure_users_for_client(
- ctx=ctx,
- config=config,
- client=master_client,
- everywhere=True
- ),
- ])
-
- if ctx.rgw.frontend == 'apache':
- subtasks.insert(0,
- lambda: create_apache_dirs(ctx=ctx, config=config,
- on_client=None,
- except_client = master_client))
- subtasks.extend([
- lambda: ship_apache_configs(ctx=ctx, config=config,
- role_endpoints=role_endpoints,
- on_client=None,
- except_client = master_client,
- ),
- lambda: start_rgw(ctx=ctx,
- config=config,
- on_client=None,
- except_client = master_client),
- lambda: start_apache(ctx=ctx,
- config = config,
- on_client=None,
- except_client = master_client,
- ),
- ])
- elif ctx.rgw.frontend == 'civetweb':
- subtasks.extend([
- lambda: start_rgw(ctx=ctx,
- config=config,
- on_client=None,
- except_client = master_client),
- ])
- else:
- raise ValueError("frontend must be 'apache' or 'civetweb'")
-
- else:
- log.debug('single cluster run')
- subtasks.extend([
- lambda: configure_regions_and_zones(
- ctx=ctx,
- config=config,
- regions=regions,
- role_endpoints=role_endpoints,
- realm=realm,
- ),
- lambda: configure_users(
- ctx=ctx,
- config=config,
- everywhere=True,
- ),
- ])
- if ctx.rgw.frontend == 'apache':
- subtasks.insert(0, lambda: create_apache_dirs(ctx=ctx, config=config))
- subtasks.extend([
- lambda: ship_apache_configs(ctx=ctx, config=config,
- role_endpoints=role_endpoints),
- lambda: start_rgw(ctx=ctx,
- config=config),
- lambda: start_apache(ctx=ctx, config=config),
- ])
- elif ctx.rgw.frontend == 'civetweb':
- subtasks.extend([
- lambda: start_rgw(ctx=ctx,
- config=config),
- ])
- else:
- raise ValueError("frontend must be 'apache' or 'civetweb'")
-
- log.info("Using %s as radosgw frontend", ctx.rgw.frontend)
- with contextutil.nested(*subtasks):
- yield
+++ /dev/null
-"""
-rgw s3tests logging wrappers
-"""
-from cStringIO import StringIO
-from configobj import ConfigObj
-import contextlib
-import logging
-import s3tests
-
-from teuthology import misc as teuthology
-from teuthology import contextutil
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def download(ctx, config):
- """
- Run s3tests download function
- """
- with s3tests.download(ctx, config):
- yield
-
-def _config_user(s3tests_conf, section, user):
- """
- Run s3tests user config function
- """
- return s3tests._config_user(s3tests_conf, section, user)
-
-@contextlib.contextmanager
-def create_users(ctx, config):
- """
- Run s3tests user create function
- """
- with s3tests.create_users(ctx, config):
- yield
-
-@contextlib.contextmanager
-def configure(ctx, config):
- """
- Run s3tests user configure function
- """
- with s3tests.configure(ctx, config):
- yield
-
-@contextlib.contextmanager
-def run_tests(ctx, config):
- """
- Run the s3tests suite, then verify that the rgw opslog socket returns data
- """
- assert isinstance(config, dict)
- testdir = teuthology.get_testdir(ctx)
- for client, client_config in config.iteritems():
- client_config['extra_args'] = [
- 's3tests.functional.test_s3:test_bucket_list_return_data',
- ]
-# args = [
-# 'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
-# '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir),
-# '-w',
-# '{tdir}/s3-tests'.format(tdir=testdir),
-# '-v',
-# 's3tests.functional.test_s3:test_bucket_list_return_data',
-# ]
-# if client_config is not None and 'extra_args' in client_config:
-# args.extend(client_config['extra_args'])
-#
-# ctx.cluster.only(client).run(
-# args=args,
-# )
-
- s3tests.run_tests(ctx, config)
-
- netcat_out = StringIO()
-
- for client, client_config in config.iteritems():
- ctx.cluster.only(client).run(
- args = [
- 'netcat',
- '-w', '5',
- '-U', '{tdir}/rgw.opslog.sock'.format(tdir=testdir),
- ],
- stdout = netcat_out,
- )
-
- out = netcat_out.getvalue()
-
- assert len(out) > 100
-
- log.info('Received %s', out)
-
- yield
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run a subset of the s3-tests suite against rgw and verify that the opslog socket returns data
-
- Must restrict testing to a particular client::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3tests: [client.0]
-
- To pass extra arguments to nose (e.g. to run a certain test)::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3tests:
- client.0:
- extra_args: ['test_s3:test_object_acl_grand_public_read']
- client.1:
- extra_args: ['--exclude', 'test_100_continue']
- """
- assert config is None or isinstance(config, list) \
- or isinstance(config, dict), \
- "task s3tests only supports a list or dictionary for configuration"
- all_clients = ['client.{id}'.format(id=id_)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- if config is None:
- config = all_clients
- if isinstance(config, list):
- config = dict.fromkeys(config)
- clients = config.keys()
-
- overrides = ctx.config.get('overrides', {})
- # merge each client section, not the top level.
- for (client, cconf) in config.iteritems():
- teuthology.deep_merge(cconf, overrides.get('rgw-logsocket', {}))
-
- log.debug('config is %s', config)
-
- s3tests_conf = {}
- for client in clients:
- s3tests_conf[client] = ConfigObj(
- indent_type='',
- infile={
- 'DEFAULT':
- {
- 'port' : 7280,
- 'is_secure' : 'no',
- },
- 'fixtures' : {},
- 's3 main' : {},
- 's3 alt' : {},
- }
- )
-
- with contextutil.nested(
- lambda: download(ctx=ctx, config=config),
- lambda: create_users(ctx=ctx, config=dict(
- clients=clients,
- s3tests_conf=s3tests_conf,
- )),
- lambda: configure(ctx=ctx, config=dict(
- clients=config,
- s3tests_conf=s3tests_conf,
- )),
- lambda: run_tests(ctx=ctx, config=config),
- ):
- yield
+++ /dev/null
-"""
- Run rgw s3 readwrite tests
-"""
-from cStringIO import StringIO
-import base64
-import contextlib
-import logging
-import os
-import random
-import string
-import yaml
-
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology.config import config as teuth_config
-from teuthology.orchestra import run
-from teuthology.orchestra.connection import split_user
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def download(ctx, config):
- """
- Download the s3 tests from the git builder.
- Remove downloaded s3 file upon exit.
-
- The context passed in should be identical to the context
- passed in to the main task.
- """
- assert isinstance(config, dict)
- log.info('Downloading s3-tests...')
- testdir = teuthology.get_testdir(ctx)
- for (client, cconf) in config.items():
- branch = cconf.get('force-branch', None)
- if not branch:
- branch = cconf.get('branch', 'master')
- sha1 = cconf.get('sha1')
- ctx.cluster.only(client).run(
- args=[
- 'git', 'clone',
- '-b', branch,
- teuth_config.ceph_git_base_url + 's3-tests.git',
- '{tdir}/s3-tests'.format(tdir=testdir),
- ],
- )
- if sha1 is not None:
- ctx.cluster.only(client).run(
- args=[
- 'cd', '{tdir}/s3-tests'.format(tdir=testdir),
- run.Raw('&&'),
- 'git', 'reset', '--hard', sha1,
- ],
- )
- try:
- yield
- finally:
- log.info('Removing s3-tests...')
- testdir = teuthology.get_testdir(ctx)
- for client in config:
- ctx.cluster.only(client).run(
- args=[
- 'rm',
- '-rf',
- '{tdir}/s3-tests'.format(tdir=testdir),
- ],
- )
-
-
-def _config_user(s3tests_conf, section, user):
- """
- Configure users for this section by stashing away keys, ids, and
- email addresses.
- """
- s3tests_conf[section].setdefault('user_id', user)
- s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user))
- s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user))
- s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20)))
- s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40)))
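- # Illustration only (generated values vary per run): _config_user(conf, 's3',
- # 'foo.client.0') leaves any pre-set fields alone and otherwise fills in
- #   user_id      = 'foo.client.0'
- #   email        = 'foo.client.0+test@test.test'
- #   display_name = 'Mr. foo.client.0'
- # plus a random 20-character uppercase access_key and a base64 secret_key.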
-
-@contextlib.contextmanager
-def create_users(ctx, config):
- """
- Create a default s3 user.
- """
- assert isinstance(config, dict)
- log.info('Creating rgw users...')
- testdir = teuthology.get_testdir(ctx)
- users = {'s3': 'foo'}
- cached_client_user_names = dict()
- for client in config['clients']:
- cached_client_user_names[client] = dict()
- s3tests_conf = config['s3tests_conf'][client]
- s3tests_conf.setdefault('readwrite', {})
- s3tests_conf['readwrite'].setdefault('bucket', 'rwtest-' + client + '-{random}-')
- s3tests_conf['readwrite'].setdefault('readers', 10)
- s3tests_conf['readwrite'].setdefault('writers', 3)
- s3tests_conf['readwrite'].setdefault('duration', 300)
- s3tests_conf['readwrite'].setdefault('files', {})
- rwconf = s3tests_conf['readwrite']
- rwconf['files'].setdefault('num', 10)
- rwconf['files'].setdefault('size', 2000)
- rwconf['files'].setdefault('stddev', 500)
- for section, user in users.iteritems():
- _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
- log.debug('creating user {user} on {client}'.format(user=s3tests_conf[section]['user_id'],
- client=client))
-
- # stash the 'delete_user' flag along with user name for easier cleanup
- delete_this_user = True
- if 'delete_user' in s3tests_conf['s3']:
- delete_this_user = s3tests_conf['s3']['delete_user']
- log.debug('delete_user set to {flag} for {client}'.format(flag=delete_this_user, client=client))
- cached_client_user_names[client][section+user] = (s3tests_conf[section]['user_id'], delete_this_user)
-
- # skip actual user creation if the create_user flag is set to false for this client
- if 'create_user' in s3tests_conf['s3'] and s3tests_conf['s3']['create_user'] == False:
- log.debug('create_user set to False, skipping user creation for {client}'.format(client=client))
- continue
- else:
- ctx.cluster.only(client).run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'radosgw-admin',
- '-n', client,
- 'user', 'create',
- '--uid', s3tests_conf[section]['user_id'],
- '--display-name', s3tests_conf[section]['display_name'],
- '--access-key', s3tests_conf[section]['access_key'],
- '--secret', s3tests_conf[section]['secret_key'],
- '--email', s3tests_conf[section]['email'],
- ],
- )
- try:
- yield
- finally:
- for client in config['clients']:
- for section, user in users.iteritems():
- #uid = '{user}.{client}'.format(user=user, client=client)
- real_uid, delete_this_user = cached_client_user_names[client][section+user]
- if delete_this_user:
- ctx.cluster.only(client).run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'radosgw-admin',
- '-n', client,
- 'user', 'rm',
- '--uid', real_uid,
- '--purge-data',
- ],
- )
- else:
- log.debug('skipping delete for user {uid} on {client}'.format(uid=real_uid, client=client))
-
-@contextlib.contextmanager
-def configure(ctx, config):
- """
- Configure the s3-tests. This includes the running of the
- bootstrap code and the updating of local conf files.
- """
- assert isinstance(config, dict)
- log.info('Configuring s3-readwrite-tests...')
- for client, properties in config['clients'].iteritems():
- s3tests_conf = config['s3tests_conf'][client]
- if properties is not None and 'rgw_server' in properties:
- host = None
- for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
- log.info('roles: ' + str(roles))
- log.info('target: ' + str(target))
- if properties['rgw_server'] in roles:
- _, host = split_user(target)
- assert host is not None, "Invalid client specified as the rgw_server"
- s3tests_conf['s3']['host'] = host
- else:
- s3tests_conf['s3']['host'] = 'localhost'
-
- def_conf = s3tests_conf['DEFAULT']
- s3tests_conf['s3'].setdefault('port', def_conf['port'])
- s3tests_conf['s3'].setdefault('is_secure', def_conf['is_secure'])
-
- (remote,) = ctx.cluster.only(client).remotes.keys()
- remote.run(
- args=[
- 'cd',
- '{tdir}/s3-tests'.format(tdir=teuthology.get_testdir(ctx)),
- run.Raw('&&'),
- './bootstrap',
- ],
- )
- conf_fp = StringIO()
- conf = dict(
- s3=s3tests_conf['s3'],
- readwrite=s3tests_conf['readwrite'],
- )
- yaml.safe_dump(conf, conf_fp, default_flow_style=False)
- teuthology.write_file(
- remote=remote,
- path='{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=teuthology.get_testdir(ctx), client=client),
- data=conf_fp.getvalue(),
- )
- yield
-
-
-@contextlib.contextmanager
-def run_tests(ctx, config):
- """
- Run the s3readwrite tests after everything is set up.
-
- :param ctx: Context passed to task
- :param config: specific configuration information
- """
- assert isinstance(config, dict)
- testdir = teuthology.get_testdir(ctx)
- for client, client_config in config.iteritems():
- (remote,) = ctx.cluster.only(client).remotes.keys()
- conf = teuthology.get_file(remote, '{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=testdir, client=client))
- args = [
- '{tdir}/s3-tests/virtualenv/bin/s3tests-test-readwrite'.format(tdir=testdir),
- ]
- if client_config is not None and 'extra_args' in client_config:
- args.extend(client_config['extra_args'])
-
- ctx.cluster.only(client).run(
- args=args,
- stdin=conf,
- )
- yield
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run the s3tests-test-readwrite suite against rgw.
-
- To run all tests on all clients::
-
- tasks:
- - ceph:
- - rgw:
- - s3readwrite:
-
- To restrict testing to particular clients::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3readwrite: [client.0]
-
- To run against a server on client.1::
-
- tasks:
- - ceph:
- - rgw: [client.1]
- - s3readwrite:
- client.0:
- rgw_server: client.1
-
- To pass extra test arguments::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3readwrite:
- client.0:
- readwrite:
- bucket: mybucket
- readers: 10
- writers: 3
- duration: 600
- files:
- num: 10
- size: 2000
- stddev: 500
- client.1:
- ...
-
- To override s3 configuration::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3readwrite:
- client.0:
- s3:
- user_id: myuserid
- display_name: myname
- email: my@email
- access_key: myaccesskey
- secret_key: mysecretkey
-
- """
- assert config is None or isinstance(config, list) \
- or isinstance(config, dict), \
- "task s3tests only supports a list or dictionary for configuration"
- all_clients = ['client.{id}'.format(id=id_)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- if config is None:
- config = all_clients
- if isinstance(config, list):
- config = dict.fromkeys(config)
- clients = config.keys()
-
- overrides = ctx.config.get('overrides', {})
- # merge each client section, not the top level.
- for client in config.iterkeys():
- if not config[client]:
- config[client] = {}
- teuthology.deep_merge(config[client], overrides.get('s3readwrite', {}))
-
- log.debug('in s3readwrite, config is %s', config)
-
- s3tests_conf = {}
- for client in clients:
- if config[client] is None:
- config[client] = {}
- config[client].setdefault('s3', {})
- config[client].setdefault('readwrite', {})
-
- s3tests_conf[client] = ({
- 'DEFAULT':
- {
- 'port' : 7280,
- 'is_secure' : False,
- },
- 'readwrite' : config[client]['readwrite'],
- 's3' : config[client]['s3'],
- })
-
- with contextutil.nested(
- lambda: download(ctx=ctx, config=config),
- lambda: create_users(ctx=ctx, config=dict(
- clients=clients,
- s3tests_conf=s3tests_conf,
- )),
- lambda: configure(ctx=ctx, config=dict(
- clients=config,
- s3tests_conf=s3tests_conf,
- )),
- lambda: run_tests(ctx=ctx, config=config),
- ):
- pass
- yield
+++ /dev/null
-"""
-Run rgw roundtrip message tests
-"""
-from cStringIO import StringIO
-import base64
-import contextlib
-import logging
-import os
-import random
-import string
-import yaml
-
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology.config import config as teuth_config
-from teuthology.orchestra import run
-from teuthology.orchestra.connection import split_user
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def download(ctx, config):
- """
- Download the s3 tests from the git builder.
- Remove downloaded s3 file upon exit.
-
- The context passed in should be identical to the context
- passed in to the main task.
- """
- assert isinstance(config, dict)
- log.info('Downloading s3-tests...')
- testdir = teuthology.get_testdir(ctx)
- for (client, cconf) in config.iteritems():
- branch = cconf.get('force-branch', None)
- if not branch:
- branch = cconf.get('branch', 'master')
- ctx.cluster.only(client).run(
- args=[
- 'git', 'clone',
- '-b', branch,
- teuth_config.ceph_git_base_url + 's3-tests.git',
- '{tdir}/s3-tests'.format(tdir=testdir),
- ],
- )
- try:
- yield
- finally:
- log.info('Removing s3-tests...')
- for client in config:
- ctx.cluster.only(client).run(
- args=[
- 'rm',
- '-rf',
- '{tdir}/s3-tests'.format(tdir=testdir),
- ],
- )
-
-def _config_user(s3tests_conf, section, user):
- """
- Configure users for this section by stashing away keys, ids, and
- email addresses.
- """
- s3tests_conf[section].setdefault('user_id', user)
- s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user))
- s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user))
- s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20)))
- s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40)))
-
-@contextlib.contextmanager
-def create_users(ctx, config):
- """
- Create a default s3 user.
- """
- assert isinstance(config, dict)
- log.info('Creating rgw users...')
- testdir = teuthology.get_testdir(ctx)
- users = {'s3': 'foo'}
- for client in config['clients']:
- s3tests_conf = config['s3tests_conf'][client]
- s3tests_conf.setdefault('roundtrip', {})
- s3tests_conf['roundtrip'].setdefault('bucket', 'rttest-' + client + '-{random}-')
- s3tests_conf['roundtrip'].setdefault('readers', 10)
- s3tests_conf['roundtrip'].setdefault('writers', 3)
- s3tests_conf['roundtrip'].setdefault('duration', 300)
- s3tests_conf['roundtrip'].setdefault('files', {})
- rtconf = s3tests_conf['roundtrip']
- rtconf['files'].setdefault('num', 10)
- rtconf['files'].setdefault('size', 2000)
- rtconf['files'].setdefault('stddev', 500)
- for section, user in [('s3', 'foo')]:
- _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
- ctx.cluster.only(client).run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'radosgw-admin',
- '-n', client,
- 'user', 'create',
- '--uid', s3tests_conf[section]['user_id'],
- '--display-name', s3tests_conf[section]['display_name'],
- '--access-key', s3tests_conf[section]['access_key'],
- '--secret', s3tests_conf[section]['secret_key'],
- '--email', s3tests_conf[section]['email'],
- ],
- )
- try:
- yield
- finally:
- for client in config['clients']:
- for user in users.itervalues():
- uid = '{user}.{client}'.format(user=user, client=client)
- ctx.cluster.only(client).run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'radosgw-admin',
- '-n', client,
- 'user', 'rm',
- '--uid', uid,
- '--purge-data',
- ],
- )
-
-@contextlib.contextmanager
-def configure(ctx, config):
- """
- Configure the s3-tests. This includes the running of the
- bootstrap code and the updating of local conf files.
- """
- assert isinstance(config, dict)
- log.info('Configuring s3-roundtrip-tests...')
- testdir = teuthology.get_testdir(ctx)
- for client, properties in config['clients'].iteritems():
- s3tests_conf = config['s3tests_conf'][client]
- if properties is not None and 'rgw_server' in properties:
- host = None
- for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
- log.info('roles: ' + str(roles))
- log.info('target: ' + str(target))
- if properties['rgw_server'] in roles:
- _, host = split_user(target)
- assert host is not None, "Invalid client specified as the rgw_server"
- s3tests_conf['s3']['host'] = host
- else:
- s3tests_conf['s3']['host'] = 'localhost'
-
- def_conf = s3tests_conf['DEFAULT']
- s3tests_conf['s3'].setdefault('port', def_conf['port'])
- s3tests_conf['s3'].setdefault('is_secure', def_conf['is_secure'])
-
- (remote,) = ctx.cluster.only(client).remotes.keys()
- remote.run(
- args=[
- 'cd',
- '{tdir}/s3-tests'.format(tdir=testdir),
- run.Raw('&&'),
- './bootstrap',
- ],
- )
- conf_fp = StringIO()
- conf = dict(
- s3=s3tests_conf['s3'],
- roundtrip=s3tests_conf['roundtrip'],
- )
- yaml.safe_dump(conf, conf_fp, default_flow_style=False)
- teuthology.write_file(
- remote=remote,
- path='{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client),
- data=conf_fp.getvalue(),
- )
- yield
-
-
-@contextlib.contextmanager
-def run_tests(ctx, config):
- """
- Run the s3 roundtrip after everything is set up.
-
- :param ctx: Context passed to task
- :param config: specific configuration information
- """
- assert isinstance(config, dict)
- testdir = teuthology.get_testdir(ctx)
- for client, client_config in config.iteritems():
- (remote,) = ctx.cluster.only(client).remotes.keys()
- conf = teuthology.get_file(remote, '{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client))
- args = [
- '{tdir}/s3-tests/virtualenv/bin/s3tests-test-roundtrip'.format(tdir=testdir),
- ]
- if client_config is not None and 'extra_args' in client_config:
- args.extend(client_config['extra_args'])
-
- ctx.cluster.only(client).run(
- args=args,
- stdin=conf,
- )
- yield
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run the s3tests-test-roundtrip suite against rgw.
-
- To run all tests on all clients::
-
- tasks:
- - ceph:
- - rgw:
- - s3roundtrip:
-
- To restrict testing to particular clients::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3roundtrip: [client.0]
-
- To run against a server on client.1::
-
- tasks:
- - ceph:
- - rgw: [client.1]
- - s3roundtrip:
- client.0:
- rgw_server: client.1
-
- To pass extra test arguments::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3roundtrip:
- client.0:
- roundtrip:
- bucket: mybucket
- readers: 10
- writers: 3
- duration: 600
- files:
- num: 10
- size: 2000
- stddev: 500
- client.1:
- ...
-
- To override s3 configuration::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3roundtrip:
- client.0:
- s3:
- user_id: myuserid
- display_name: myname
- email: my@email
- access_key: myaccesskey
- secret_key: mysecretkey
-
- """
- assert config is None or isinstance(config, list) \
- or isinstance(config, dict), \
- "task s3tests only supports a list or dictionary for configuration"
- all_clients = ['client.{id}'.format(id=id_)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- if config is None:
- config = all_clients
- if isinstance(config, list):
- config = dict.fromkeys(config)
- clients = config.keys()
-
- s3tests_conf = {}
- for client in clients:
- if config[client] is None:
- config[client] = {}
- config[client].setdefault('s3', {})
- config[client].setdefault('roundtrip', {})
-
- s3tests_conf[client] = ({
- 'DEFAULT':
- {
- 'port' : 7280,
- 'is_secure' : False,
- },
- 'roundtrip' : config[client]['roundtrip'],
- 's3' : config[client]['s3'],
- })
-
- with contextutil.nested(
- lambda: download(ctx=ctx, config=config),
- lambda: create_users(ctx=ctx, config=dict(
- clients=clients,
- s3tests_conf=s3tests_conf,
- )),
- lambda: configure(ctx=ctx, config=dict(
- clients=config,
- s3tests_conf=s3tests_conf,
- )),
- lambda: run_tests(ctx=ctx, config=config),
- ):
- pass
- yield
+++ /dev/null
-"""
-Run a set of s3 tests on rgw.
-"""
-from cStringIO import StringIO
-from configobj import ConfigObj
-import base64
-import contextlib
-import logging
-import os
-import random
-import string
-
-import util.rgw as rgw_utils
-
-from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology.config import config as teuth_config
-from teuthology.orchestra import run
-from teuthology.orchestra.connection import split_user
-
-log = logging.getLogger(__name__)
-
-def extract_sync_client_data(ctx, client_name):
- """
- Extract synchronized client rgw zone and rgw region information.
-
- :param ctx: Context passed to the s3tests task
- :param name: Name of client that we are synching with
- """
- return_region_name = None
- return_dict = None
- client = ctx.ceph['ceph'].conf.get(client_name, None)
- if client:
- current_client_zone = client.get('rgw zone', None)
- if current_client_zone:
- (endpoint_host, endpoint_port) = ctx.rgw.role_endpoints.get(client_name, (None, None))
- # pull out the radosgw_agent stuff
- regions = ctx.rgw.regions
- for region in regions:
- log.debug('region is {region}'.format(region=region))
- region_data = ctx.rgw.regions[region]
- log.debug('region data is {region}'.format(region=region_data))
- zones = region_data['zones']
- for zone in zones:
- if current_client_zone in zone:
- return_region_name = region
- return_dict = dict()
- return_dict['api_name'] = region_data['api name']
- return_dict['is_master'] = region_data['is master']
- return_dict['port'] = endpoint_port
- return_dict['host'] = endpoint_host
-
- # The s3tests expect the sync_agent_[addr|port] to be
- # set on the non-master node for some reason
- if not region_data['is master']:
- (rgwagent_host, rgwagent_port) = ctx.radosgw_agent.endpoint
- (return_dict['sync_agent_addr'], _) = ctx.rgw.role_endpoints[rgwagent_host]
- return_dict['sync_agent_port'] = rgwagent_port
-
- else: #if client_zone:
- log.debug('No zone info for {host}'.format(host=client_name))
- else: # if client
- log.debug('No ceph conf for {host}'.format(host=client_name))
-
- return return_region_name, return_dict
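- # Shape of the return value, roughly (all values are run-dependent): either
- # (None, None) when the client has no conf or zone, or something like
- #   ('foo', {'api_name': 'foo-api', 'is_master': True,
- #            'host': 'host1.example.com', 'port': 7280})
- # with sync_agent_addr/sync_agent_port added only on the non-master side.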
-
-def update_conf_with_region_info(ctx, config, s3tests_conf):
- """
- Scan for a client (passed in s3tests_conf) that is part of a
- radosgw-agent sync relationship. Update information in the local
- conf file if such a client is found.
- """
- for key in s3tests_conf.keys():
- # we'll assume that there's only one sync relationship (source / destination) with client.X
- # as the key for now
-
- # Iterate through all of the radosgw_agent (rgwa) configs and see if a
- # given client is involved in a relationship.
- # If a given client isn't, skip it
- this_client_in_rgwa_config = False
- for rgwa in ctx.radosgw_agent.config.keys():
- rgwa_data = ctx.radosgw_agent.config[rgwa]
-
- if key in rgwa_data['src'] or key in rgwa_data['dest']:
- this_client_in_rgwa_config = True
- log.debug('{client} is in a radosgw-agent sync relationship'.format(client=key))
- radosgw_sync_data = ctx.radosgw_agent.config[key]
- break
- if not this_client_in_rgwa_config:
- log.debug('{client} is NOT in a radosgw-agent sync relationship'.format(client=key))
- continue
-
- source_client = radosgw_sync_data['src']
- dest_client = radosgw_sync_data['dest']
-
- # Extract the pertinent info for the source side
- source_region_name, source_region_dict = extract_sync_client_data(ctx, source_client)
- log.debug('\t{key} source_region {source_region} source_dict {source_dict}'.format
- (key=key,source_region=source_region_name,source_dict=source_region_dict))
-
- # The source *should* be the master region, but test anyway and then set it as the default region
- if source_region_dict['is_master']:
- log.debug('Setting {region} as default_region'.format(region=source_region_name))
- s3tests_conf[key]['fixtures'].setdefault('default_region', source_region_name)
-
- # Extract the pertinent info for the destination side
- dest_region_name, dest_region_dict = extract_sync_client_data(ctx, dest_client)
- log.debug('\t{key} dest_region {dest_region} dest_dict {dest_dict}'.format
- (key=key,dest_region=dest_region_name,dest_dict=dest_region_dict))
-
- # now add these regions to the s3tests_conf object
- s3tests_conf[key]['region {region_name}'.format(region_name=source_region_name)] = source_region_dict
- s3tests_conf[key]['region {region_name}'.format(region_name=dest_region_name)] = dest_region_dict
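- # Net effect, sketched with hypothetical names: for a client involved in a
- # sync relationship this adds sections such as
- #   s3tests_conf['client.0']['region foo'] = {...source side...}
- #   s3tests_conf['client.0']['region bar'] = {...destination side...}
- # and points fixtures['default_region'] at the master region's name.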
-
-@contextlib.contextmanager
-def download(ctx, config):
- """
- Download the s3 tests from the git builder.
- Remove downloaded s3 file upon exit.
-
- The context passed in should be identical to the context
- passed in to the main task.
- """
- assert isinstance(config, dict)
- log.info('Downloading s3-tests...')
- testdir = teuthology.get_testdir(ctx)
- s3_branches = [ 'giant', 'firefly', 'firefly-original', 'hammer' ]
- for (client, cconf) in config.items():
- branch = cconf.get('force-branch', None)
- if not branch:
- ceph_branch = ctx.config.get('branch')
- suite_branch = ctx.config.get('suite_branch', ceph_branch)
- if suite_branch in s3_branches:
- branch = cconf.get('branch', suite_branch)
- else:
- branch = cconf.get('branch', 'ceph-' + suite_branch)
- if not branch:
- raise ValueError(
- "Could not determine what branch to use for s3tests!")
- else:
- log.info("Using branch '%s' for s3tests", branch)
- sha1 = cconf.get('sha1')
- ctx.cluster.only(client).run(
- args=[
- 'git', 'clone',
- '-b', branch,
- teuth_config.ceph_git_base_url + 's3-tests.git',
- '{tdir}/s3-tests'.format(tdir=testdir),
- ],
- )
- if sha1 is not None:
- ctx.cluster.only(client).run(
- args=[
- 'cd', '{tdir}/s3-tests'.format(tdir=testdir),
- run.Raw('&&'),
- 'git', 'reset', '--hard', sha1,
- ],
- )
- try:
- yield
- finally:
- log.info('Removing s3-tests...')
- testdir = teuthology.get_testdir(ctx)
- for client in config:
- ctx.cluster.only(client).run(
- args=[
- 'rm',
- '-rf',
- '{tdir}/s3-tests'.format(tdir=testdir),
- ],
- )
-
-
-def _config_user(s3tests_conf, section, user):
- """
- Configure users for this section by stashing away keys, ids, and
- email addresses.
- """
- s3tests_conf[section].setdefault('user_id', user)
- s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user))
- s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user))
- s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20)))
- s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40)))
-
-
-@contextlib.contextmanager
-def create_users(ctx, config):
- """
- Create a main and an alternate s3 user.
- """
- assert isinstance(config, dict)
- log.info('Creating rgw users...')
- testdir = teuthology.get_testdir(ctx)
- users = {'s3 main': 'foo', 's3 alt': 'bar'}
- for client in config['clients']:
- s3tests_conf = config['s3tests_conf'][client]
- s3tests_conf.setdefault('fixtures', {})
- s3tests_conf['fixtures'].setdefault('bucket prefix', 'test-' + client + '-{random}-')
- for section, user in users.iteritems():
- _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
- log.debug('Creating user {user} on {host}'.format(user=s3tests_conf[section]['user_id'], host=client))
- ctx.cluster.only(client).run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'radosgw-admin',
- '-n', client,
- 'user', 'create',
- '--uid', s3tests_conf[section]['user_id'],
- '--display-name', s3tests_conf[section]['display_name'],
- '--access-key', s3tests_conf[section]['access_key'],
- '--secret', s3tests_conf[section]['secret_key'],
- '--email', s3tests_conf[section]['email'],
- ],
- )
- try:
- yield
- finally:
- for client in config['clients']:
- for user in users.itervalues():
- uid = '{user}.{client}'.format(user=user, client=client)
- ctx.cluster.only(client).run(
- args=[
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'radosgw-admin',
- '-n', client,
- 'user', 'rm',
- '--uid', uid,
- '--purge-data',
- ],
- )
-
-
-@contextlib.contextmanager
-def configure(ctx, config):
- """
- Configure the s3-tests. This includes the running of the
- bootstrap code and the updating of local conf files.
- """
- assert isinstance(config, dict)
- log.info('Configuring s3-tests...')
- testdir = teuthology.get_testdir(ctx)
- for client, properties in config['clients'].iteritems():
- s3tests_conf = config['s3tests_conf'][client]
- if properties is not None and 'rgw_server' in properties:
- host = None
- for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
- log.info('roles: ' + str(roles))
- log.info('target: ' + str(target))
- if properties['rgw_server'] in roles:
- _, host = split_user(target)
- assert host is not None, "Invalid client specified as the rgw_server"
- s3tests_conf['DEFAULT']['host'] = host
- else:
- s3tests_conf['DEFAULT']['host'] = 'localhost'
-
- if properties is not None and 'slow_backend' in properties:
- s3tests_conf['fixtures']['slow backend'] = properties['slow_backend']
-
- (remote,) = ctx.cluster.only(client).remotes.keys()
- remote.run(
- args=[
- 'cd',
- '{tdir}/s3-tests'.format(tdir=testdir),
- run.Raw('&&'),
- './bootstrap',
- ],
- )
- conf_fp = StringIO()
- s3tests_conf.write(conf_fp)
- teuthology.write_file(
- remote=remote,
- path='{tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
- data=conf_fp.getvalue(),
- )
-
- log.info('Configuring boto...')
- boto_src = os.path.join(os.path.dirname(__file__), 'boto.cfg.template')
- for client, properties in config['clients'].iteritems():
- with file(boto_src, 'rb') as f:
- (remote,) = ctx.cluster.only(client).remotes.keys()
- conf = f.read().format(
- idle_timeout=config.get('idle_timeout', 30)
- )
- teuthology.write_file(
- remote=remote,
- path='{tdir}/boto.cfg'.format(tdir=testdir),
- data=conf,
- )
-
- try:
- yield
-
- finally:
- log.info('Cleaning up boto...')
- for client, properties in config['clients'].iteritems():
- (remote,) = ctx.cluster.only(client).remotes.keys()
- remote.run(
- args=[
- 'rm',
- '{tdir}/boto.cfg'.format(tdir=testdir),
- ],
- )
-
-@contextlib.contextmanager
-def sync_users(ctx, config):
- """
- Run a full metadata sync if this is a multi-region test.
- """
- assert isinstance(config, dict)
- # do a full sync if this is a multi-region test
- if rgw_utils.multi_region_enabled(ctx):
- log.debug('Doing a full sync')
- rgw_utils.radosgw_agent_sync_all(ctx)
- else:
- log.debug('Not a multi-region config; skipping the metadata sync')
-
- yield
-
-@contextlib.contextmanager
-def run_tests(ctx, config):
- """
- Run the s3tests after everything is set up.
-
- :param ctx: Context passed to task
- :param config: specific configuration information
- """
- assert isinstance(config, dict)
- testdir = teuthology.get_testdir(ctx)
- attrs = ["!fails_on_rgw"]
- if not ctx.rgw.use_fastcgi:
- attrs.append("!fails_on_mod_proxy_fcgi")
- for client, client_config in config.iteritems():
- args = [
- 'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
- 'BOTO_CONFIG={tdir}/boto.cfg'.format(tdir=testdir),
- '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir),
- '-w',
- '{tdir}/s3-tests'.format(tdir=testdir),
- '-v',
- '-a', ','.join(attrs),
- ]
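- # Roughly what ends up being run on each client (paths abbreviated, attrs
- # depend on the frontend), for illustration:
- #   S3TEST_CONF=.../s3-tests.client.0.conf BOTO_CONFIG=.../boto.cfg \
- #       .../virtualenv/bin/nosetests -w .../s3-tests -v \
- #       -a '!fails_on_rgw,!fails_on_mod_proxy_fcgi'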
- if client_config is not None and 'extra_args' in client_config:
- args.extend(client_config['extra_args'])
-
- ctx.cluster.only(client).run(
- args=args,
- label="s3 tests against rgw"
- )
- yield
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run the s3-tests suite against rgw.
-
- To run all tests on all clients::
-
- tasks:
- - ceph:
- - rgw:
- - s3tests:
-
- To restrict testing to particular clients::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3tests: [client.0]
-
- To run against a server on client.1 and increase the boto timeout to 10m::
-
- tasks:
- - ceph:
- - rgw: [client.1]
- - s3tests:
- client.0:
- rgw_server: client.1
- idle_timeout: 600
-
- To pass extra arguments to nose (e.g. to run a certain test)::
-
- tasks:
- - ceph:
- - rgw: [client.0]
- - s3tests:
- client.0:
- extra_args: ['test_s3:test_object_acl_grand_public_read']
- client.1:
- extra_args: ['--exclude', 'test_100_continue']
- """
- assert config is None or isinstance(config, list) \
- or isinstance(config, dict), \
- "task s3tests only supports a list or dictionary for configuration"
- all_clients = ['client.{id}'.format(id=id_)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- if config is None:
- config = all_clients
- if isinstance(config, list):
- config = dict.fromkeys(config)
- clients = config.keys()
-
- overrides = ctx.config.get('overrides', {})
- # merge each client section, not the top level.
- for client in config.iterkeys():
- if not config[client]:
- config[client] = {}
- teuthology.deep_merge(config[client], overrides.get('s3tests', {}))
-
- log.debug('s3tests config is %s', config)
-
- s3tests_conf = {}
- for client in clients:
- s3tests_conf[client] = ConfigObj(
- indent_type='',
- infile={
- 'DEFAULT':
- {
- 'port' : 7280,
- 'is_secure' : 'no',
- },
- 'fixtures' : {},
- 's3 main' : {},
- 's3 alt' : {},
- }
- )
-
- # Only attempt to add in the region info if there's a radosgw_agent configured
- if hasattr(ctx, 'radosgw_agent'):
- update_conf_with_region_info(ctx, config, s3tests_conf)
-
- with contextutil.nested(
- lambda: download(ctx=ctx, config=config),
- lambda: create_users(ctx=ctx, config=dict(
- clients=clients,
- s3tests_conf=s3tests_conf,
- )),
- lambda: sync_users(ctx=ctx, config=config),
- lambda: configure(ctx=ctx, config=dict(
- clients=config,
- s3tests_conf=s3tests_conf,
- )),
- lambda: run_tests(ctx=ctx, config=config),
- ):
- pass
- yield
+++ /dev/null
-"""
-Samba
-"""
-import contextlib
-import logging
-import sys
-import time
-
-from teuthology import misc as teuthology
-from teuthology.orchestra import run
-from teuthology.orchestra.daemon import DaemonGroup
-
-log = logging.getLogger(__name__)
-
-
-def get_sambas(ctx, roles):
- """
- Scan for roles that are samba. Yield the id of the samba role
- (samba.0, samba.1...) and the associated remote.
-
- :param ctx: Context
- :param roles: roles for this test (extracted from yaml files)
- """
- for role in roles:
- assert isinstance(role, basestring)
- PREFIX = 'samba.'
- assert role.startswith(PREFIX)
- id_ = role[len(PREFIX):]
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
- yield (id_, remote)
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Setup samba smbd with ceph vfs module. This task assumes the samba
- package has already been installed via the install task.
-
- The config is optional and defaults to starting samba on all nodes.
- If a config is given, it is expected to be a list of
- samba nodes to start smbd servers on.
-
- Example that starts smbd on all samba nodes::
-
- tasks:
- - install:
- - install:
- project: samba
- extra_packages: ['samba']
- - ceph:
- - samba:
- - interactive:
-
- Example that starts smbd on just one of the samba nodes and cifs on the other::
-
- tasks:
- - samba: [samba.0]
- - cifs: [samba.1]
-
- An optional backend can be specified, and requires a path which smbd will
- use as the backend storage location::
-
- roles:
- - [osd.0, osd.1, osd.2, mon.0, mon.1, mon.2, mds.a]
- - [client.0, samba.0]
-
- tasks:
- - ceph:
- - ceph-fuse: [client.0]
- - samba:
- samba.0:
- cephfuse: "{testdir}/mnt.0"
-
- This mounts ceph to {testdir}/mnt.0 using fuse, and starts smbd with
- a UNC of //localhost/cephfuse. Access through that UNC will be on
- the ceph fuse mount point.
-
- If no arguments are specified in the samba
- role, the default behavior is to enable the ceph UNC //localhost/ceph
- and use the ceph vfs module as the smbd backend.
-
- :param ctx: Context
- :param config: Configuration
- """
- log.info("Setting up smbd with ceph vfs...")
- assert config is None or isinstance(config, list) or isinstance(config, dict), \
- "task samba got invalid config"
-
- if config is None:
- config = dict(('samba.{id}'.format(id=id_), None)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba'))
- elif isinstance(config, list):
- config = dict((name, None) for name in config)
-
- samba_servers = list(get_sambas(ctx=ctx, roles=config.keys()))
-
- testdir = teuthology.get_testdir(ctx)
-
- if not hasattr(ctx, 'daemons'):
- ctx.daemons = DaemonGroup()
-
- for id_, remote in samba_servers:
-
- rolestr = "samba.{id_}".format(id_=id_)
-
- confextras = """vfs objects = ceph
- ceph:config_file = /etc/ceph/ceph.conf"""
-
- unc = "ceph"
- backend = "/"
-
- if config[rolestr] is not None:
- # verify that there's just one parameter in role
- if len(config[rolestr]) != 1:
- log.error("samba config for role samba.{id_} must have only one parameter".format(id_=id_))
- raise Exception('invalid config')
- confextras = ""
- (unc, backendstr) = config[rolestr].items()[0]
- backend = backendstr.format(testdir=testdir)
-
- # on first samba role, set ownership and permissions of ceph root
- # so that samba tests succeed
- if config[rolestr] is None and id_ == samba_servers[0][0]:
- remote.run(
- args=[
- 'mkdir', '-p', '/tmp/cmnt', run.Raw('&&'),
- 'sudo', 'ceph-fuse', '/tmp/cmnt', run.Raw('&&'),
- 'sudo', 'chown', 'ubuntu:ubuntu', '/tmp/cmnt/', run.Raw('&&'),
- 'sudo', 'chmod', '1777', '/tmp/cmnt/', run.Raw('&&'),
- 'sudo', 'umount', '/tmp/cmnt/', run.Raw('&&'),
- 'rm', '-rf', '/tmp/cmnt',
- ],
- )
- else:
- remote.run(
- args=[
- 'sudo', 'chown', 'ubuntu:ubuntu', backend, run.Raw('&&'),
- 'sudo', 'chmod', '1777', backend,
- ],
- )
-
- teuthology.sudo_write_file(remote, "/usr/local/samba/etc/smb.conf", """
-[global]
- workgroup = WORKGROUP
- netbios name = DOMAIN
-
-[{unc}]
- path = {backend}
- {extras}
- writeable = yes
- valid users = ubuntu
-""".format(extras=confextras, unc=unc, backend=backend))
-
- # create ubuntu user
- remote.run(
- args=[
- 'sudo', '/usr/local/samba/bin/smbpasswd', '-e', 'ubuntu',
- run.Raw('||'),
- 'printf', run.Raw('"ubuntu\nubuntu\n"'),
- run.Raw('|'),
- 'sudo', '/usr/local/samba/bin/smbpasswd', '-s', '-a', 'ubuntu'
- ])
-
- smbd_cmd = [
- 'sudo',
- 'daemon-helper',
- 'term',
- 'nostdin',
- '/usr/local/samba/sbin/smbd',
- '-F',
- ]
- ctx.daemons.add_daemon(remote, 'smbd', id_,
- args=smbd_cmd,
- logger=log.getChild("smbd.{id_}".format(id_=id_)),
- stdin=run.PIPE,
- wait=False,
- )
-
- # give smbd time to initialize; there is probably a better way to detect readiness
- seconds_to_sleep = 100
- log.info('Sleeping for %s seconds...' % seconds_to_sleep)
- time.sleep(seconds_to_sleep)
- log.info('Sleeping stopped...')
-
- try:
- yield
- finally:
- log.info('Stopping smbd processes...')
- exc_info = (None, None, None)
- for d in ctx.daemons.iter_daemons_of_role('smbd'):
- try:
- d.stop()
- except (run.CommandFailedError,
- run.CommandCrashedError,
- run.ConnectionLostError):
- exc_info = sys.exc_info()
- log.exception('Saw exception from %s.%s', d.role, d.id_)
- if exc_info != (None, None, None):
- raise exc_info[0], exc_info[1], exc_info[2]
-
- for id_, remote in samba_servers:
- remote.run(
- args=[
- 'sudo',
- 'rm', '-rf',
- '/usr/local/samba/etc/smb.conf',
- '/usr/local/samba/private/*',
- '/usr/local/samba/var/run/',
- '/usr/local/samba/var/locks',
- '/usr/local/samba/var/lock',
- ],
- )
- # make sure daemons are gone
- try:
- remote.run(
- args=[
- 'while',
- 'sudo', 'killall', '-9', 'smbd',
- run.Raw(';'),
- 'do', 'sleep', '1',
- run.Raw(';'),
- 'done',
- ],
- )
-
- remote.run(
- args=[
- 'sudo',
- 'lsof',
- backend,
- ],
- check_status=False
- )
- remote.run(
- args=[
- 'sudo',
- 'fuser',
- '-M',
- backend,
- ],
- check_status=False
- )
- except Exception:
- log.exception("Saw exception")
- pass
+++ /dev/null
-"""
-Scrub osds
-"""
-import contextlib
-import gevent
-import logging
-import random
-import time
-
-import ceph_manager
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run scrub periodically. Randomly chooses an OSD to scrub.
-
- The config should be as follows:
-
- scrub:
- frequency: <seconds between scrubs>
- deep: <bool for deepness>
-
- example:
-
- tasks:
- - ceph:
- - scrub:
- frequency: 30
- deep: 0
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'scrub task only accepts a dict for configuration'
-
- log.info('Beginning scrub...')
-
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
- while len(manager.get_osd_status()['up']) < num_osds:
- time.sleep(10)
-
- scrub_proc = Scrubber(
- manager,
- config,
- )
- try:
- yield
- finally:
- log.info('joining scrub')
- scrub_proc.do_join()
-
-class Scrubber:
- """
- Scrubbing is started during initialization: a background thread performs the scrubs
- """
- def __init__(self, manager, config):
- """
- Wait for the cluster to become clean, then spawn the scrubbing thread.
- """
- self.ceph_manager = manager
- self.ceph_manager.wait_for_clean()
-
- osd_status = self.ceph_manager.get_osd_status()
- self.osds = osd_status['up']
-
- self.config = config
- if self.config is None:
- self.config = dict()
-
- else:
- def tmp(x):
- """Local display"""
- print x
- self.log = tmp
-
- self.stopping = False
-
- log.info("spawning thread")
-
- self.thread = gevent.spawn(self.do_scrub)
-
- def do_join(self):
- """Scrubbing thread finished"""
- self.stopping = True
- self.thread.get()
-
- def do_scrub(self):
- """Perform the scrub operation"""
- frequency = self.config.get("frequency", 30)
- deep = self.config.get("deep", 0)
-
- log.info("stopping %s" % self.stopping)
-
- while not self.stopping:
- osd = str(random.choice(self.osds))
-
- if deep:
- cmd = 'deep-scrub'
- else:
- cmd = 'scrub'
-
- log.info('%sbing %s' % (cmd, osd))
- self.ceph_manager.raw_cluster_cmd('osd', cmd, osd)
-
- time.sleep(frequency)
+++ /dev/null
-"""Scrub testing"""
-from cStringIO import StringIO
-
-import contextlib
-import json
-import logging
-import os
-import time
-import tempfile
-
-import ceph_manager
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-
-def wait_for_victim_pg(manager):
- """Return a PG with some data and its acting set"""
- # wait for some PG to have data that we can mess with
- victim = None
- while victim is None:
- stats = manager.get_pg_stats()
- for pg in stats:
- size = pg['stat_sum']['num_bytes']
- if size > 0:
- victim = pg['pgid']
- acting = pg['acting']
- return victim, acting
- time.sleep(3)
-
-
-def find_victim_object(ctx, pg, osd):
- """Return a file to be fuzzed"""
- (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
- data_path = os.path.join(
- '/var/lib/ceph/osd',
- 'ceph-{id}'.format(id=osd),
- 'fuse',
- '{pg}_head'.format(pg=pg),
- 'all',
- )
-
- # fuzz time
- with contextlib.closing(StringIO()) as ls_fp:
- osd_remote.run(
- args=['sudo', 'ls', data_path],
- stdout=ls_fp,
- )
- ls_out = ls_fp.getvalue()
-
- # find an object file we can mess with (and not the pg info object)
- osdfilename = next(line for line in ls_out.split('\n')
- if not line.endswith('::::head#'))
- assert osdfilename is not None
-
- # Get actual object name from osd stored filename
- objname = osdfilename.split(':')[4]
- return osd_remote, os.path.join(data_path, osdfilename), objname
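- # a hypothetical illustration (the exact on-disk naming is an assumption,
- # not taken from this code): for a listing entry such as
- #   #2:b7b2cbaf:::myobject:head#
- # the '::::head#' filter above skips only the pg info object (whose name
- # field is empty), and osdfilename.split(':')[4] yields 'myobject', which
- # is what is returned as objname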
-
-
-def corrupt_file(osd_remote, path):
- # put a single \0 at the beginning of the file
- osd_remote.run(
- args=['sudo', 'dd',
- 'if=/dev/zero',
- 'of=%s/data' % path,
- 'bs=1', 'count=1', 'conv=notrunc']
- )
-
-
-def get_pgnum(pgid):
- pos = pgid.find('.')
- assert pos != -1
- return pgid[pos+1:]
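- # for example, a pgid string of the form '<pool id>.<pg num>', say '2.1a'
- # (hypothetical value), gives get_pgnum('2.1a') == '1a'; do_pg_scrub()
- # below takes the pool name and this pg number as separate arguments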
-
-
-def deep_scrub(manager, victim, pool):
- # scrub, verify inconsistent
- pgnum = get_pgnum(victim)
- manager.do_pg_scrub(pool, pgnum, 'deep-scrub')
-
- stats = manager.get_single_pg_stats(victim)
- inconsistent = stats['state'].find('+inconsistent') != -1
- assert inconsistent
-
-
-def repair(manager, victim, pool):
- # repair, verify no longer inconsistent
- pgnum = get_pgnum(victim)
- manager.do_pg_scrub(pool, pgnum, 'repair')
-
- stats = manager.get_single_pg_stats(victim)
- inconsistent = stats['state'].find('+inconsistent') != -1
- assert not inconsistent
-
-
-def test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, pool):
- corrupt_file(osd_remote, obj_path)
- deep_scrub(manager, pg, pool)
- repair(manager, pg, pool)
-
-
-def test_repair_bad_omap(ctx, manager, pg, osd, objname):
- # Test deep-scrub with various omap modifications
- # Modify omap on specific osd
- log.info('fuzzing omap of %s' % objname)
- manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
- manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
- 'badkey', 'badval'])
- manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])
-
- deep_scrub(manager, pg, 'rbd')
- # please note, the repair here is erroneous: it rewrites the correct omap
- # digest and data digest on the replicas with the corresponding digests
- # from the primary osd which is hosting the victim object, see
- # find_victim_object().
- # so we need to either put this test at the end of this task or
- # undo the mess-up manually before the "repair()" that just ensures
- # the cleanup is sane; otherwise the succeeding tests will fail if they
- # try to set "badkey" in the hope of getting an "inconsistent" pg with a
- # deep-scrub.
- manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'hdr'])
- manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'badkey'])
- manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
- 'key', 'val'])
- repair(manager, pg, 'rbd')
-
-
-class MessUp:
- def __init__(self, manager, osd_remote, pool, osd_id,
- obj_name, obj_path, omap_key, omap_val):
- self.manager = manager
- self.osd = osd_remote
- self.pool = pool
- self.osd_id = osd_id
- self.obj = obj_name
- self.path = obj_path
- self.omap_key = omap_key
- self.omap_val = omap_val
-
- @contextlib.contextmanager
- def _test_with_file(self, messup_cmd, *checks):
- temp = tempfile.mktemp()
- backup_cmd = ['sudo', 'cp', os.path.join(self.path, 'data'), temp]
- self.osd.run(args=backup_cmd)
- self.osd.run(args=messup_cmd.split())
- yield checks
- create_cmd = ['sudo', 'mkdir', self.path]
- self.osd.run(args=create_cmd, check_status=False)
- restore_cmd = ['sudo', 'cp', temp, os.path.join(self.path, 'data')]
- self.osd.run(args=restore_cmd)
-
- def remove(self):
- cmd = 'sudo rmdir {path}'.format(path=self.path)
- return self._test_with_file(cmd, 'missing')
-
- def append(self):
- cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
- 'conv=notrunc oflag=append'.format(path=self.path)
- return self._test_with_file(cmd,
- 'data_digest_mismatch',
- 'size_mismatch')
-
- def truncate(self):
- cmd = 'sudo dd if=/dev/null of={path}/data'.format(path=self.path)
- return self._test_with_file(cmd,
- 'data_digest_mismatch',
- 'size_mismatch')
-
- def change_obj(self):
- cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
- 'conv=notrunc'.format(path=self.path)
- return self._test_with_file(cmd,
- 'data_digest_mismatch')
-
- @contextlib.contextmanager
- def rm_omap(self):
- cmd = ['rmomapkey', self.pool, self.obj, self.omap_key]
- self.manager.osd_admin_socket(self.osd_id, cmd)
- yield ('omap_digest_mismatch',)
- cmd = ['setomapval', self.pool, self.obj,
- self.omap_key, self.omap_val]
- self.manager.osd_admin_socket(self.osd_id, cmd)
-
- @contextlib.contextmanager
- def add_omap(self):
- cmd = ['setomapval', self.pool, self.obj, 'badkey', 'badval']
- self.manager.osd_admin_socket(self.osd_id, cmd)
- yield ('omap_digest_mismatch',)
- cmd = ['rmomapkey', self.pool, self.obj, 'badkey']
- self.manager.osd_admin_socket(self.osd_id, cmd)
-
- @contextlib.contextmanager
- def change_omap(self):
- cmd = ['setomapval', self.pool, self.obj, self.omap_key, 'badval']
- self.manager.osd_admin_socket(self.osd_id, cmd)
- yield ('omap_digest_mismatch',)
- cmd = ['setomapval', self.pool, self.obj, self.omap_key, self.omap_val]
- self.manager.osd_admin_socket(self.osd_id, cmd)
-
-
-class InconsistentObjChecker:
- """Check the returned inconsistents/inconsistent info"""
-
- def __init__(self, osd, acting, obj_name):
- self.osd = osd
- self.acting = acting
- self.obj = obj_name
- assert self.osd in self.acting
-
- def basic_checks(self, inc):
- assert inc['object']['name'] == self.obj
- assert inc['object']['snap'] == "head"
- assert len(inc['shards']) == len(self.acting), \
- "the number of returned shard does not match with the acting set"
-
- def run(self, check, inc):
- func = getattr(self, check)
- func(inc)
-
- def _check_errors(self, inc, err_name):
- bad_found = False
- good_found = False
- for shard in inc['shards']:
- log.info('shard = %r' % shard)
- log.info('err = %s' % err_name)
- assert 'osd' in shard
- osd = shard['osd']
- err = err_name in shard['errors']
- if osd == self.osd:
- assert bad_found is False, \
- "multiple entries found for the given OSD"
- assert err is True, \
- "Didn't find '{err}' in errors".format(err=err_name)
- bad_found = True
- else:
- assert osd in self.acting, "shard not in acting set"
- assert err is False, \
- "Expected '{err}' in errors".format(err=err_name)
- good_found = True
- assert bad_found is True, \
- "Shard for osd.{osd} not found".format(osd=self.osd)
- assert good_found is True, \
- "No other acting shards found"
-
- def _check_attrs(self, inc, attr_name):
- bad_attr = None
- good_attr = None
- for shard in inc['shards']:
- log.info('shard = %r' % shard)
- log.info('attr = %s' % attr_name)
- assert 'osd' in shard
- osd = shard['osd']
- attr = shard.get(attr_name, False)
- if osd == self.osd:
- assert bad_attr is None, \
- "multiple entries found for the given OSD"
- bad_attr = attr
- else:
- assert osd in self.acting, "shard not in acting set"
- assert good_attr is None or good_attr == attr, \
- "multiple good attrs found"
- good_attr = attr
- assert bad_attr is not None, \
- "bad {attr} not found".format(attr=attr_name)
- assert good_attr is not None, \
- "good {attr} not found".format(attr=attr_name)
- assert good_attr != bad_attr, \
- "bad attr is identical to the good ones: " \
- "{0} == {1}".format(good_attr, bad_attr)
-
- def data_digest_mismatch(self, inc):
- assert 'data_digest_mismatch' in inc['errors']
- self._check_attrs(inc, 'data_digest')
-
- def missing(self, inc):
- assert 'missing' in inc['union_shard_errors']
- self._check_errors(inc, 'missing')
-
- def size_mismatch(self, inc):
- assert 'size_mismatch' in inc['errors']
- self._check_attrs(inc, 'size')
-
- def omap_digest_mismatch(self, inc):
- assert 'omap_digest_mismatch' in inc['errors']
- self._check_attrs(inc, 'omap_digest')
-
-
-def test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd_id,
- obj_name, obj_path):
- mon = manager.controller
- pool = 'rbd'
- omap_key = 'key'
- omap_val = 'val'
- manager.do_rados(mon, ['-p', pool, 'setomapval', obj_name,
- omap_key, omap_val])
- # Update missing digests, requires "osd deep scrub update digest min age: 0"
- pgnum = get_pgnum(pg)
- manager.do_pg_scrub(pool, pgnum, 'deep-scrub')
-
- messup = MessUp(manager, osd_remote, pool, osd_id, obj_name, obj_path,
- omap_key, omap_val)
- for test in [messup.rm_omap, messup.add_omap, messup.change_omap,
- messup.append, messup.truncate, messup.change_obj,
- messup.remove]:
- with test() as checks:
- deep_scrub(manager, pg, pool)
- cmd = 'rados list-inconsistent-pg {pool} ' \
- '--format=json'.format(pool=pool)
- with contextlib.closing(StringIO()) as out:
- mon.run(args=cmd.split(), stdout=out)
- pgs = json.loads(out.getvalue())
- assert pgs == [pg]
-
- cmd = 'rados list-inconsistent-obj {pg} ' \
- '--format=json'.format(pg=pg)
- with contextlib.closing(StringIO()) as out:
- mon.run(args=cmd.split(), stdout=out)
- objs = json.loads(out.getvalue())
- assert len(objs['inconsistents']) == 1
-
- checker = InconsistentObjChecker(osd_id, acting, obj_name)
- inc_obj = objs['inconsistents'][0]
- log.info('inc = %r', inc_obj)
- checker.basic_checks(inc_obj)
- for check in checks:
- checker.run(check, inc_obj)
-
-
-def task(ctx, config):
- """
- Test [deep] scrub
-
- tasks:
- - chef:
- - install:
- - ceph:
- log-whitelist:
- - '!= data_digest'
- - '!= omap_digest'
- - '!= size'
- - deep-scrub 0 missing, 1 inconsistent objects
- - deep-scrub [0-9]+ errors
- - repair 0 missing, 1 inconsistent objects
- - repair [0-9]+ errors, [0-9]+ fixed
- - shard [0-9]+ missing
- - deep-scrub 1 missing, 1 inconsistent objects
- - does not match object info size
- - attr name mistmatch
- - deep-scrub 1 missing, 0 inconsistent objects
- - failed to pick suitable auth object
- conf:
- osd:
- osd deep scrub update digest min age: 0
- - scrub_test:
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'scrub_test task only accepts a dict for configuration'
- first_mon = teuthology.get_first_mon(ctx, config)
- (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
- num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
- log.info('num_osds is %s' % num_osds)
-
- manager = ceph_manager.CephManager(
- mon,
- ctx=ctx,
- logger=log.getChild('ceph_manager'),
- )
-
- while len(manager.get_osd_status()['up']) < num_osds:
- time.sleep(10)
-
- for i in range(num_osds):
- manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
- '--', '--osd-objectstore-fuse')
- for i in range(num_osds):
- manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
- manager.wait_for_clean()
-
- # write some data
- p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
- 'write', '-b', '4096'])
- log.info('err is %d' % p.exitstatus)
-
- # wait for some PG to have data that we can mess with
- pg, acting = wait_for_victim_pg(manager)
- osd = acting[0]
-
- osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
- manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
- log.info('err is %d' % p.exitstatus)
- manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
- log.info('err is %d' % p.exitstatus)
-
- # Update missing digests, requires "osd deep scrub update digest min age: 0"
- pgnum = get_pgnum(pg)
- manager.do_pg_scrub('rbd', pgnum, 'deep-scrub')
-
- log.info('messing with PG %s on osd %d' % (pg, osd))
- test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd')
- test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
- test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
- obj_name, obj_path)
- log.info('test successful!')
-
- # shut down fuse mount
- for i in range(num_osds):
- manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
- '--', '--no-osd-objectstore-fuse')
- time.sleep(5)
- log.info('done')
+++ /dev/null
-"""
-Systemd test
-"""
-import contextlib
-import logging
-import re
-import time
-
-from cStringIO import StringIO
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- tasks:
- - ceph-deploy:
- - systemd:
-
- Test that Ceph systemd services can start, stop and restart, and
- check for any failed services and report back errors
- """
- for remote, roles in ctx.cluster.remotes.iteritems():
- remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
- 'grep', 'ceph'])
- r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
- 'grep', 'ceph'], stdout=StringIO(),
- check_status=False)
- log.info(r.stdout.getvalue())
- if 'failed' in r.stdout.getvalue():
- log.info("Ceph services in failed state")
-
- # test overall service stop and start using ceph.target
- # ceph.target tests are meant for ceph systemd tests
- # and not actual process testing using 'ps'
- log.info("Stopping all Ceph services")
- remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
- r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
- stdout=StringIO(), check_status=False)
- log.info(r.stdout.getvalue())
- log.info("Checking process status")
- r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
- 'grep', 'ceph'], stdout=StringIO())
- if r.stdout.getvalue().find('Active: inactive'):
- log.info("Sucessfully stopped all ceph services")
- else:
- log.info("Failed to stop ceph services")
-
- log.info("Starting all Ceph services")
- remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
- r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
- stdout=StringIO())
- log.info(r.stdout.getvalue())
- if 'Active: active' in r.stdout.getvalue():
- log.info("Successfully started all Ceph services")
- else:
- log.info("Failed to start Ceph services")
- r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
- 'grep', 'ceph'], stdout=StringIO())
- log.info(r.stdout.getvalue())
- time.sleep(4)
-
- # test individual services start stop
- name = remote.shortname
- mon_name = 'ceph-mon@' + name + '.service'
- mds_name = 'ceph-mds@' + name + '.service'
- mgr_name = 'ceph-mgr@' + name + '.service'
- mon_role_name = 'mon.' + name
- mds_role_name = 'mds.' + name
- mgr_role_name = 'mgr.' + name
- m_osd = re.search('--id (\d+) --setuser ceph', r.stdout.getvalue())
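- # a ps listing for an osd typically contains something like
- # '--id 2 --setuser ceph' (hypothetical id); group(1) is that id and is
- # used below to address the matching ceph-osd@<id>.service unit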
- if m_osd:
- osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
- remote.run(args=['sudo', 'systemctl', 'status',
- osd_service])
- remote.run(args=['sudo', 'systemctl', 'stop',
- osd_service])
- time.sleep(4) # immediate check will result in deactivating state
- r = remote.run(args=['sudo', 'systemctl', 'status', osd_service],
- stdout=StringIO(), check_status=False)
- log.info(r.stdout.getvalue())
- if 'Active: inactive' in r.stdout.getvalue():
- log.info("Successfully stopped single osd ceph service")
- else:
- log.info("Failed to stop ceph osd services")
- remote.run(args=['sudo', 'systemctl', 'start',
- osd_service])
- time.sleep(4)
- if mon_role_name in roles:
- remote.run(args=['sudo', 'systemctl', 'status', mon_name])
- remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
- time.sleep(4) # immediate check will result in deactivating state
- r = remote.run(args=['sudo', 'systemctl', 'status', mon_name],
- stdout=StringIO(), check_status=False)
- if 'Active: inactive' in r.stdout.getvalue():
- log.info("Successfully stopped single mon ceph service")
- else:
- log.info("Failed to stop ceph mon service")
- remote.run(args=['sudo', 'systemctl', 'start', mon_name])
- time.sleep(4)
- if mgr_role_name in roles:
- remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
- remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
- time.sleep(4) # immediate check will result in deactivating state
- r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name],
- stdout=StringIO(), check_status=False)
- if 'Active: inactive' in r.stdout.getvalue():
- log.info("Successfully stopped single ceph mgr service")
- else:
- log.info("Failed to stop ceph mgr service")
- remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
- time.sleep(4)
- if mds_role_name in roles:
- remote.run(args=['sudo', 'systemctl', 'status', mds_name])
- remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
- time.sleep(4) # immediate check will result in deactivating state
- r = remote.run(args=['sudo', 'systemctl', 'status', mds_name],
- stdout=StringIO(), check_status=False)
- if 'Active: inactive' in r.stdout.getvalue():
- log.info("Successfully stopped single ceph mds service")
- else:
- log.info("Failed to stop ceph mds service")
- remote.run(args=['sudo', 'systemctl', 'start', mds_name])
- time.sleep(4)
- yield
+++ /dev/null
-# py.test -v -s tests/test_buildpackages.py
-
-from mock import patch, Mock
-
-from .. import buildpackages
-from teuthology import packaging
-
-def test_get_tag_branch_sha1():
- gitbuilder = packaging.GitbuilderProject(
- 'ceph',
- {
- 'os_type': 'centos',
- 'os_version': '7.0',
- })
- (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
- assert tag == None
- assert branch == None
- assert sha1 is not None
-
- gitbuilder = packaging.GitbuilderProject(
- 'ceph',
- {
- 'os_type': 'centos',
- 'os_version': '7.0',
- 'sha1': 'asha1',
- })
- (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
- assert tag == None
- assert branch == None
- assert sha1 == 'asha1'
-
- remote = Mock
- remote.arch = 'x86_64'
- remote.os = Mock
- remote.os.name = 'ubuntu'
- remote.os.version = '14.04'
- remote.os.codename = 'trusty'
- remote.system_type = 'deb'
- ctx = Mock
- ctx.cluster = Mock
- ctx.cluster.remotes = {remote: ['client.0']}
-
- expected_tag = 'v0.94.1'
- expected_sha1 = 'expectedsha1'
- def check_output(cmd, shell):
- assert shell == True
- return expected_sha1 + " refs/tags/" + expected_tag
- with patch.multiple(
- buildpackages,
- check_output=check_output,
- ):
- gitbuilder = packaging.GitbuilderProject(
- 'ceph',
- {
- 'os_type': 'centos',
- 'os_version': '7.0',
- 'sha1': 'asha1',
- 'all': {
- 'tag': tag,
- },
- },
- ctx = ctx,
- remote = remote)
- (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
- assert tag == expected_tag
- assert branch == None
- assert sha1 == expected_sha1
-
- expected_branch = 'hammer'
- expected_sha1 = 'otherexpectedsha1'
- def check_output(cmd, shell):
- assert shell == True
- return expected_sha1 + " refs/heads/" + expected_branch
- with patch.multiple(
- buildpackages,
- check_output=check_output,
- ):
- gitbuilder = packaging.GitbuilderProject(
- 'ceph',
- {
- 'os_type': 'centos',
- 'os_version': '7.0',
- 'sha1': 'asha1',
- 'all': {
- 'branch': branch,
- },
- },
- ctx = ctx,
- remote = remote)
- (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
- assert tag == None
- assert branch == expected_branch
- assert sha1 == expected_sha1
-
-def test_lookup_configs():
- expected_system_type = 'deb'
- def make_remote():
- remote = Mock()
- remote.arch = 'x86_64'
- remote.os = Mock()
- remote.os.name = 'ubuntu'
- remote.os.version = '14.04'
- remote.os.codename = 'trusty'
- remote.system_type = expected_system_type
- return remote
- ctx = Mock()
- class cluster:
- remote1 = make_remote()
- remote2 = make_remote()
- remotes = {
- remote1: ['client.0'],
- remote2: ['mon.a','osd.0'],
- }
- def only(self, role):
- result = Mock()
- if role in ('client.0',):
- result.remotes = { cluster.remote1: None }
- elif role in ('osd.0', 'mon.a'):
- result.remotes = { cluster.remote2: None }
- else:
- result.remotes = None
- return result
- ctx.cluster = cluster()
- ctx.config = {
- 'roles': [ ['client.0'], ['mon.a','osd.0'] ],
- }
-
- # nothing -> nothing
- assert buildpackages.lookup_configs(ctx, {}) == []
- assert buildpackages.lookup_configs(ctx, {1:[1,2,3]}) == []
- assert buildpackages.lookup_configs(ctx, [[1,2,3]]) == []
- assert buildpackages.lookup_configs(ctx, None) == []
-
- #
- # the overrides applies to install and to install.upgrade
- # that have no tag, branch or sha1
- #
- config = {
- 'overrides': {
- 'install': {
- 'ceph': {
- 'sha1': 'overridesha1',
- 'tag': 'overridetag',
- 'branch': 'overridebranch',
- },
- },
- },
- 'tasks': [
- {
- 'install': {
- 'sha1': 'installsha1',
- },
- },
- {
- 'install.upgrade': {
- 'osd.0': {
- },
- 'client.0': {
- 'sha1': 'client0sha1',
- },
- },
- }
- ],
- }
- ctx.config = config
- expected_configs = [{'branch': 'overridebranch', 'sha1': 'overridesha1', 'tag': 'overridetag'},
- {'project': 'ceph', 'branch': 'overridebranch', 'sha1': 'overridesha1', 'tag': 'overridetag'},
- {'project': 'ceph', 'sha1': 'client0sha1'}]
-
- assert buildpackages.lookup_configs(ctx, config) == expected_configs
+++ /dev/null
-from textwrap import dedent
-
-from .. import devstack
-
-
-class TestDevstack(object):
- def test_parse_os_table(self):
- table_str = dedent("""
- +---------------------+--------------------------------------+
- | Property | Value |
- +---------------------+--------------------------------------+
- | attachments | [] |
- | availability_zone | nova |
- | bootable | false |
- | created_at | 2014-02-21T17:14:47.548361 |
- | display_description | None |
- | display_name | NAME |
- | id | ffdbd1bb-60dc-4d95-acfe-88774c09ad3e |
- | metadata | {} |
- | size | 1 |
- | snapshot_id | None |
- | source_volid | None |
- | status | creating |
- | volume_type | None |
- +---------------------+--------------------------------------+
- """).strip()
- expected = {
- 'Property': 'Value',
- 'attachments': '[]',
- 'availability_zone': 'nova',
- 'bootable': 'false',
- 'created_at': '2014-02-21T17:14:47.548361',
- 'display_description': 'None',
- 'display_name': 'NAME',
- 'id': 'ffdbd1bb-60dc-4d95-acfe-88774c09ad3e',
- 'metadata': '{}',
- 'size': '1',
- 'snapshot_id': 'None',
- 'source_volid': 'None',
- 'status': 'creating',
- 'volume_type': 'None'}
-
- vol_info = devstack.parse_os_table(table_str)
- assert vol_info == expected
-
-
-
-
+++ /dev/null
-from mock import Mock
-
-from .. import radosgw_admin
-
-acl_with_version = """<?xml version="1.0" encoding="UTF-8"?><AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy>
-""" # noqa
-
-
-acl_without_version = """<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy>
-""" # noqa
-
-
-class TestGetAcl(object):
-
- def setup(self):
- self.key = Mock()
-
- def test_removes_xml_version(self):
- self.key.get_xml_acl = Mock(return_value=acl_with_version)
- result = radosgw_admin.get_acl(self.key)
- assert result.startswith('<AccessControlPolicy')
-
- def test_xml_version_is_already_removed(self):
- self.key.get_xml_acl = Mock(return_value=acl_without_version)
- result = radosgw_admin.get_acl(self.key)
- assert result.startswith('<AccessControlPolicy')
-
- def test_newline_gets_trimmed(self):
- self.key.get_xml_acl = Mock(return_value=acl_without_version)
- result = radosgw_admin.get_acl(self.key)
- assert result.endswith('\n') is False
+++ /dev/null
-import logging
-from teuthology import misc
-from teuthology.task import Task
-
-log = logging.getLogger(__name__)
-
-
-class TeuthologyIntegration(Task):
-
- def begin(self):
- misc.sh("""
- set -x
- pip install tox
- tox
- # tox -e py27-integration
- tox -e openstack-integration
- """)
-
-task = TeuthologyIntegration
+++ /dev/null
-"""
-Task to handle tgt
-
-Assumptions made:
- The ceph-extras tgt package may need to get installed.
- The open-iscsi package needs to get installed.
-"""
-import logging
-import contextlib
-
-from teuthology import misc as teuthology
-from teuthology import contextutil
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def start_tgt_remotes(ctx, start_tgtd):
- """
- This subtask starts up a tgtd on the clients specified
- """
- remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
- tgtd_list = []
- for rem, roles in remotes.iteritems():
- for _id in roles:
- if _id in start_tgtd:
- if rem not in tgtd_list:
- tgtd_list.append(rem)
- size = ctx.config.get('image_size', 10240)
- rem.run(
- args=[
- 'rbd',
- 'create',
- 'iscsi-image',
- '--size',
- str(size),
- ])
- rem.run(
- args=[
- 'sudo',
- 'tgtadm',
- '--lld',
- 'iscsi',
- '--mode',
- 'target',
- '--op',
- 'new',
- '--tid',
- '1',
- '--targetname',
- 'rbd',
- ])
- rem.run(
- args=[
- 'sudo',
- 'tgtadm',
- '--lld',
- 'iscsi',
- '--mode',
- 'logicalunit',
- '--op',
- 'new',
- '--tid',
- '1',
- '--lun',
- '1',
- '--backing-store',
- 'iscsi-image',
- '--bstype',
- 'rbd',
- ])
- rem.run(
- args=[
- 'sudo',
- 'tgtadm',
- '--lld',
- 'iscsi',
- '--op',
- 'bind',
- '--mode',
- 'target',
- '--tid',
- '1',
- '-I',
- 'ALL',
- ])
- try:
- yield
-
- finally:
- for rem in tgtd_list:
- rem.run(
- args=[
- 'sudo',
- 'tgtadm',
- '--lld',
- 'iscsi',
- '--mode',
- 'target',
- '--op',
- 'delete',
- '--force',
- '--tid',
- '1',
- ])
- rem.run(
- args=[
- 'rbd',
- 'snap',
- 'purge',
- 'iscsi-image',
- ])
- rem.run(
- args=[
- 'sudo',
- 'rbd',
- 'rm',
- 'iscsi-image',
- ])
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Start up tgt.
-
- To start tgt on all clients::
-
- tasks:
- - ceph:
- - tgt:
-
- To start on certain clients::
-
- tasks:
- - ceph:
- - tgt: [client.0, client.3]
-
- or
-
- tasks:
- - ceph:
- - tgt:
- client.0:
- client.3:
-
- An image size can also be specified::
-
- tasks:
- - ceph:
- - tgt:
- image_size: 20480
-
- The general flow of things here is:
- 1. Find clients on which tgt is supposed to run (start_tgtd)
- 2. Remotely start up tgt daemon
- On cleanup:
- 3. Stop tgt daemon
-
- The iscsi administration is handled by the iscsi task.
- """
- if config:
- config = {key : val for key, val in config.items()
- if key.startswith('client')}
- # config at this point should only contain keys starting with 'client'
- start_tgtd = []
- remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
- log.info(remotes)
- if not config:
- start_tgtd = ['client.{id}'.format(id=id_)
- for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
- else:
- start_tgtd = config
- log.info(start_tgtd)
- with contextutil.nested(
- lambda: start_tgt_remotes(ctx=ctx, start_tgtd=start_tgtd),):
- yield
+++ /dev/null
-"""
-Thrash -- Simulate random pool snapshot creation and removal.
-"""
-import contextlib
-import logging
-import gevent
-import time
-import random
-
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- "Thrash" snap creation and removal on the listed pools
-
- Example:
-
- thrash_pool_snaps:
- pools: [.rgw.buckets, .rgw.buckets.index]
- max_snaps: 10
- min_snaps: 5
- period: 10
- """
- stopping = False
- def do_thrash():
- pools = config.get('pools', [])
- max_snaps = config.get('max_snaps', 10)
- min_snaps = config.get('min_snaps', 5)
- period = config.get('period', 30)
- snaps = []
- manager = ctx.managers['ceph']
- def remove_snap():
- assert len(snaps) > 0
- snap = random.choice(snaps)
- log.info("Removing snap %s" % (snap,))
- for pool in pools:
- manager.remove_pool_snap(pool, str(snap))
- snaps.remove(snap)
- def add_snap(snap):
- log.info("Adding snap %s" % (snap,))
- for pool in pools:
- manager.add_pool_snap(pool, str(snap))
- snaps.append(snap)
- index = 0
- while not stopping:
- index += 1
- time.sleep(period)
- if len(snaps) <= min_snaps:
- add_snap(index)
- elif len(snaps) >= max_snaps:
- remove_snap()
- else:
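- # between the thresholds, add or remove with even odds; remove_snap
- # takes no arguments so it is passed directly, while add_snap needs
- # the current index, hence the lambda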
- random.choice([lambda: add_snap(index), remove_snap])()
- log.info("Stopping")
- thread = gevent.spawn(do_thrash)
- yield
- stopping = True
- thread.join()
-
+++ /dev/null
-"""
-Thrash -- Simulate random osd failures.
-"""
-import contextlib
-import logging
-import ceph_manager
-from teuthology import misc as teuthology
-
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- "Thrash" the OSDs by randomly marking them out/down (and then back
- in) until the task is ended. This loops, and every op_delay
- seconds it randomly chooses to add or remove an OSD (even odds)
- unless there are fewer than min_out OSDs out of the cluster, or
- more than min_in OSDs in the cluster.
-
- All commands are run on mon0 and it stops when __exit__ is called.
-
- The config is optional, and is a dict containing some or all of:
-
- cluster: (default 'ceph') the name of the cluster to thrash
-
- min_in: (default 3) the minimum number of OSDs to keep in the
- cluster
-
- min_out: (default 0) the minimum number of OSDs to keep out of the
- cluster
-
- op_delay: (5) the length of time to sleep between changing an
- OSD's status
-
- min_dead: (0) minimum number of osds to leave down/dead.
-
- max_dead: (0) maximum number of osds to leave down/dead before waiting
- for clean. This should probably be num_replicas - 1.
-
- clean_interval: (60) the approximate length of time to loop before
- waiting until the cluster goes clean. (In reality this is used
- to probabilistically choose when to wait, and the method used
- makes it closer to -- but not identical to -- the half-life.)
-
- scrub_interval: (-1) the approximate length of time to loop before
- waiting until a scrub is performed while cleaning. (In reality
- this is used to probabilistically choose when to wait, and it
- only applies to the cases where cleaning is being performed).
- -1 is used to indicate that no scrubbing will be done.
-
- chance_down: (0.4) the probability that the thrasher will mark an
- OSD down rather than marking it out. (The thrasher will not
- consider that OSD out of the cluster, since presently an OSD
- wrongly marked down will mark itself back up again.) This value
- can be either an integer (eg, 75) or a float probability (eg
- 0.75).
-
- chance_test_min_size: (0) chance to run test_pool_min_size,
- which:
- - kills all but one osd
- - waits
- - kills that osd
- - revives all other osds
- - verifies that the osds fully recover
-
- timeout: (360) the number of seconds to wait for the cluster
- to become clean after each cluster change. If this doesn't
- happen within the timeout, an exception will be raised.
-
- revive_timeout: (150) number of seconds to wait for an osd asok to
- appear after attempting to revive the osd
-
- thrash_primary_affinity: (true) randomly adjust primary-affinity
-
- chance_pgnum_grow: (0) chance to increase a pool's size
- chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
- pool_grow_by: (10) amount to increase pgnum by
- max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd
-
- pause_short: (3) duration of short pause
- pause_long: (80) duration of long pause
- pause_check_after: (50) assert osd down after this long
- chance_inject_pause_short: (1) chance of injecting short stall
- chance_inject_pause_long: (0) chance of injecting long stall
-
- clean_wait: (0) duration to wait before resuming thrashing once clean
-
- sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a
- random live osd
-
- powercycle: (false) whether to power cycle the node instead
- of just the osd process. Note that this assumes that a single
- osd is the only important process on the node.
-
- bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash.
- the delay lets the BlockDevice "accept" more aio operations but blocks
- any flush, and then eventually crashes (losing some or all ios). If 0,
- no bdev failure injection is enabled.
-
- bdev_inject_crash_probability: (.5) probability of doing a bdev failure
- injection crash vs a normal OSD kill.
-
- chance_test_backfill_full: (0) chance to simulate full disks stopping
- backfill
-
- chance_test_map_discontinuity: (0) chance to test map discontinuity
- map_discontinuity_sleep_time: (40) time to wait for map trims
-
- ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down
- chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%)
-
- optrack_toggle_delay: (2.0) duration to delay between toggling op tracker
- enablement to all osds
-
- dump_ops_enable: (true) continuously dump ops on all live osds
-
- noscrub_toggle_delay: (2.0) duration to delay between toggling noscrub
-
- disable_objectstore_tool_tests: (false) disable ceph_objectstore_tool based
- tests
-
- example:
-
- tasks:
- - ceph:
- - thrashosds:
- cluster: ceph
- chance_down: 10
- op_delay: 3
- min_in: 1
- timeout: 600
- - interactive:
- """
- if config is None:
- config = {}
- assert isinstance(config, dict), \
- 'thrashosds task only accepts a dict for configuration'
- # add default value for sighup_delay
- config['sighup_delay'] = config.get('sighup_delay', 0.1)
- # add default value for optrack_toggle_delay
- config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0)
- # add default value for dump_ops_enable
- config['dump_ops_enable'] = config.get('dump_ops_enable', "true")
- # add default value for noscrub_toggle_delay
- config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0)
-
- log.info("config is {config}".format(config=str(config)))
-
- overrides = ctx.config.get('overrides', {})
- log.info("overrides is {overrides}".format(overrides=str(overrides)))
- teuthology.deep_merge(config, overrides.get('thrashosds', {}))
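- # e.g. (hypothetical values): a task config of {'chance_down': 10} merged
- # with an override of {'thrashosds': {'timeout': 1200}} leaves both keys
- # present, so the Thrasher below sees chance_down=10 and timeout=1200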
- cluster = config.get('cluster', 'ceph')
-
- log.info("config is {config}".format(config=str(config)))
-
- if 'powercycle' in config:
-
- # sync everyone first to avoid collateral damage to / etc.
- log.info('Doing preliminary sync to avoid collateral damage...')
- ctx.cluster.run(args=['sync'])
-
- if 'ipmi_user' in ctx.teuthology_config:
- for remote in ctx.cluster.remotes.keys():
- log.debug('checking console status of %s' % remote.shortname)
- if not remote.console.check_status():
- log.warn('Failed to get console status for %s',
- remote.shortname)
-
- # check that all osd remotes have a valid console
- osds = ctx.cluster.only(teuthology.is_type('osd', cluster))
- for remote in osds.remotes.keys():
- if not remote.console.has_ipmi_credentials:
- raise Exception(
- 'IPMI console required for powercycling, '
- 'but not available on osd role: {r}'.format(
- r=remote.name))
-
- cluster_manager = ctx.managers[cluster]
- for f in ['powercycle', 'bdev_inject_crash']:
- if config.get(f):
- cluster_manager.config[f] = config.get(f)
-
- log.info('Beginning thrashosds...')
- thrash_proc = ceph_manager.Thrasher(
- cluster_manager,
- config,
- logger=log.getChild('thrasher')
- )
- try:
- yield
- finally:
- log.info('joining thrashosds')
- thrash_proc.do_join()
- cluster_manager.wait_for_recovery(config.get('timeout', 360))
+++ /dev/null
-#cloud-config-archive
-
-- type: text/cloud-config
- content: |
- output:
- all: '| tee -a /var/log/cloud-init-output.log'
-
-# allow passwordless access for debugging
-- |
- #!/bin/bash
- exec passwd -d ubuntu
-
-- |
- #!/bin/bash
-
- # mount a NFS share for storing logs
- apt-get update
- apt-get -y install nfs-common
- mkdir /mnt/log
- # 10.0.2.2 is the host
- mount -v -t nfs -o proto=tcp 10.0.2.2:{mnt_dir} /mnt/log
-
- # mount the iso image that has the test script
- mkdir /mnt/cdrom
- mount -t auto /dev/cdrom /mnt/cdrom
+++ /dev/null
-- |
- #!/bin/bash
- cp /var/log/cloud-init-output.log /mnt/log
-
-- |
- #!/bin/bash
- umount /mnt/log
-
-- |
- #!/bin/bash
- shutdown -h -P now
+++ /dev/null
-from teuthology import misc
-
-def get_remote(ctx, cluster, service_type, service_id):
- """
- Get the Remote for the host where a particular role runs.
-
- :param cluster: name of the cluster the service is part of
- :param service_type: e.g. 'mds', 'osd', 'client'
- :param service_id: The third part of a role, e.g. '0' for
- the role 'ceph.client.0'
- :return: a Remote instance for the host where the
- requested role is placed
- """
- def _is_instance(role):
- role_tuple = misc.split_role(role)
- return role_tuple == (cluster, service_type, str(service_id))
- try:
- (remote,) = ctx.cluster.only(_is_instance).remotes.keys()
- except ValueError:
- raise KeyError("Service {0}.{1}.{2} not found".format(cluster,
- service_type,
- service_id))
- return remote
-
-def get_remote_for_role(ctx, role):
- return get_remote(ctx, *misc.split_role(role))
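-
-# a brief usage sketch (hypothetical role string): misc.split_role() turns
-# 'ceph.osd.0' into ('ceph', 'osd', '0'), so
-#
-#   get_remote_for_role(ctx, 'ceph.osd.0')
-#
-# is equivalent to get_remote(ctx, 'ceph', 'osd', '0') and returns the Remote
-# for the host carrying that role, or raises KeyError if no host does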
+++ /dev/null
-import logging
-
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-def rados(ctx, remote, cmd, wait=True, check_status=False):
- testdir = teuthology.get_testdir(ctx)
- log.info("rados %s" % ' '.join(cmd))
- pre = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'rados',
- ]
- pre.extend(cmd)
- proc = remote.run(
- args=pre,
- check_status=check_status,
- wait=wait,
- )
- if wait:
- return proc.exitstatus
- else:
- return proc
-
-def create_ec_pool(remote, name, profile_name, pgnum, profile={}):
- remote.run(args=['sudo', 'ceph'] +
- cmd_erasure_code_profile(profile_name, profile))
- remote.run(args=[
- 'sudo', 'ceph', 'osd', 'pool', 'create', name,
- str(pgnum), str(pgnum), 'erasure', profile_name,
- ])
-
-def create_replicated_pool(remote, name, pgnum):
- remote.run(args=[
- 'sudo', 'ceph', 'osd', 'pool', 'create', name, str(pgnum), str(pgnum),
- ])
-
-def create_cache_pool(remote, base_name, cache_name, pgnum, size):
- remote.run(args=[
- 'sudo', 'ceph', 'osd', 'pool', 'create', cache_name, str(pgnum)
- ])
- remote.run(args=[
- 'sudo', 'ceph', 'osd', 'tier', 'add-cache', base_name, cache_name,
- str(size),
- ])
-
-def cmd_erasure_code_profile(profile_name, profile):
- """
- Return the shell command to run to create the erasure code profile
- described by the profile parameter.
-
- :param profile_name: a string matching [A-Za-z0-9-_.]+
- :param profile: a map whose semantic depends on the erasure code plugin
- :returns: a shell command as an array suitable for Remote.run
-
- If profile is {}, it is replaced with
-
- { 'k': '2', 'm': '1', 'ruleset-failure-domain': 'osd'}
-
- for backward compatibility. In previous versions of teuthology,
- these values were hardcoded as function arguments and some yaml
- files were designed with these implicit values. The teuthology
- code should not know anything about the erasure code profile
- content or semantic. The valid values and parameters are outside
- its scope.
- """
-
- if profile == {}:
- profile = {
- 'k': '2',
- 'm': '1',
- 'ruleset-failure-domain': 'osd'
- }
- return [
- 'osd', 'erasure-code-profile', 'set',
- profile_name
- ] + [ str(key) + '=' + str(value) for key, value in profile.iteritems() ]
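-
-# a minimal sketch of the resulting command (hypothetical profile name):
-#
-#   cmd_erasure_code_profile('myprofile', {})
-#
-# falls back to the default profile above and evaluates to
-#
-#   ['osd', 'erasure-code-profile', 'set', 'myprofile',
-#    'k=2', 'm=1', 'ruleset-failure-domain=osd']
-#
-# (key order depends on dict iteration order); create_ec_pool() prefixes it
-# with ['sudo', 'ceph'] before running it on the remote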
+++ /dev/null
-from cStringIO import StringIO
-import logging
-import json
-import requests
-from requests.packages.urllib3.util import Retry
-from urlparse import urlparse
-
-from teuthology.orchestra.connection import split_user
-from teuthology import misc as teuthology
-
-log = logging.getLogger(__name__)
-
-# simple test to indicate if multi-region testing should occur
-def multi_region_enabled(ctx):
- # this is populated by the radosgw-agent task, seems reasonable to
- # use that as an indicator that we're testing multi-region sync
- return 'radosgw_agent' in ctx
-
-def rgwadmin(ctx, client, cmd, stdin=StringIO(), check_status=False,
- format='json'):
- log.info('rgwadmin: {client} : {cmd}'.format(client=client,cmd=cmd))
- testdir = teuthology.get_testdir(ctx)
- pre = [
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir),
- 'radosgw-admin',
- '--log-to-stderr',
- '--format', format,
- '-n', client,
- ]
- pre.extend(cmd)
- log.info('rgwadmin: cmd=%s' % pre)
- (remote,) = ctx.cluster.only(client).remotes.iterkeys()
- proc = remote.run(
- args=pre,
- check_status=check_status,
- stdout=StringIO(),
- stderr=StringIO(),
- stdin=stdin,
- )
- r = proc.exitstatus
- out = proc.stdout.getvalue()
- j = None
- if not r and out != '':
- try:
- j = json.loads(out)
- log.info(' json result: %s' % j)
- except ValueError:
- j = out
- log.info(' raw result: %s' % j)
- return (r, j)
-
-def get_user_summary(out, user):
- """Extract the summary for a given user"""
- user_summary = None
- for summary in out['summary']:
- if summary.get('user') == user:
- user_summary = summary
-
- if not user_summary:
- raise AssertionError('No summary info found for user: %s' % user)
-
- return user_summary
-
-def get_user_successful_ops(out, user):
- summary = out['summary']
- if len(summary) == 0:
- return 0
- return get_user_summary(out, user)['total']['successful_ops']
-
-def get_zone_host_and_port(ctx, client, zone):
- _, region_map = rgwadmin(ctx, client, check_status=True,
- cmd=['-n', client, 'region-map', 'get'])
- regions = region_map['zonegroups']
- for region in regions:
- for zone_info in region['val']['zones']:
- if zone_info['name'] == zone:
- endpoint = urlparse(zone_info['endpoints'][0])
- host, port = endpoint.hostname, endpoint.port
- if port is None:
- port = 80
- return host, port
- assert False, 'no endpoint for zone {zone} found'.format(zone=zone)
-
-def get_master_zone(ctx, client):
- _, region_map = rgwadmin(ctx, client, check_status=True,
- cmd=['-n', client, 'region-map', 'get'])
- regions = region_map['zonegroups']
- for region in regions:
- is_master = (region['val']['is_master'] == "true")
- log.info('region={r} is_master={ism}'.format(r=region, ism=is_master))
- if not is_master:
- continue
- master_zone = region['val']['master_zone']
- log.info('master_zone=%s' % master_zone)
- for zone_info in region['val']['zones']:
- if zone_info['name'] == master_zone:
- return master_zone
- log.info('couldn\'t find master zone')
- return None
-
-def get_master_client(ctx, clients):
- master_zone = get_master_zone(ctx, clients[0]) # can use any client for this as long as system configured correctly
- if not master_zone:
- return None
-
- for client in clients:
- zone = zone_for_client(ctx, client)
- if zone == master_zone:
- return client
-
- return None
-
-def get_zone_system_keys(ctx, client, zone):
- _, zone_info = rgwadmin(ctx, client, check_status=True,
- cmd=['-n', client,
- 'zone', 'get', '--rgw-zone', zone])
- system_key = zone_info['system_key']
- return system_key['access_key'], system_key['secret_key']
-
-def zone_for_client(ctx, client):
- ceph_config = ctx.ceph['ceph'].conf.get('global', {})
- ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
- ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
- return ceph_config.get('rgw zone')
-
-def region_for_client(ctx, client):
- ceph_config = ctx.ceph['ceph'].conf.get('global', {})
- ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
- ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
- return ceph_config.get('rgw region')
-
-def radosgw_data_log_window(ctx, client):
- ceph_config = ctx.ceph['ceph'].conf.get('global', {})
- ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
- ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
- return ceph_config.get('rgw data log window', 30)
-
-def radosgw_agent_sync_data(ctx, agent_host, agent_port, full=False):
- log.info('sync agent {h}:{p}'.format(h=agent_host, p=agent_port))
- # use retry with backoff to tolerate slow startup of radosgw-agent
- s = requests.Session()
- s.mount('http://{addr}:{port}/'.format(addr = agent_host, port = agent_port),
- requests.adapters.HTTPAdapter(max_retries=Retry(total=5, backoff_factor=1)))
- method = "full" if full else "incremental"
- return s.post('http://{addr}:{port}/data/{method}'.format(addr = agent_host, port = agent_port, method = method))
-
-def radosgw_agent_sync_metadata(ctx, agent_host, agent_port, full=False):
- log.info('sync agent {h}:{p}'.format(h=agent_host, p=agent_port))
- # use retry with backoff to tolerate slow startup of radosgw-agent
- s = requests.Session()
- s.mount('http://{addr}:{port}/'.format(addr = agent_host, port = agent_port),
- requests.adapters.HTTPAdapter(max_retries=Retry(total=5, backoff_factor=1)))
- method = "full" if full else "incremental"
- return s.post('http://{addr}:{port}/metadata/{method}'.format(addr = agent_host, port = agent_port, method = method))
-
-def radosgw_agent_sync_all(ctx, full=False, data=False):
- if ctx.radosgw_agent.procs:
- for agent_client, c_config in ctx.radosgw_agent.config.iteritems():
- zone_for_client(ctx, agent_client)
- sync_host, sync_port = get_sync_agent(ctx, agent_client)
- log.debug('doing a sync via {host1}'.format(host1=sync_host))
- radosgw_agent_sync_metadata(ctx, sync_host, sync_port, full)
- if (data):
- radosgw_agent_sync_data(ctx, sync_host, sync_port, full)
-
-def host_for_role(ctx, role):
- for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
- if role in roles:
- _, host = split_user(target)
- return host
-
-def get_sync_agent(ctx, source):
- for task in ctx.config['tasks']:
- if 'radosgw-agent' not in task:
- continue
- for client, conf in task['radosgw-agent'].iteritems():
- if conf['src'] == source:
- return host_for_role(ctx, source), conf.get('port', 8000)
- return None, None
+++ /dev/null
-#
-# The MIT License
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# Permission is hereby granted, free of charge, to any person
-# obtaining a copy of this software and associated documentation
-# files (the "Software"), to deal in the Software without
-# restriction, including without limitation the rights to use,
-# copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following
-# conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-# OTHER DEALINGS IN THE SOFTWARE.
-#
-from .. import rados
-
-class TestRados(object):
-
- def test_cmd_erasure_code_profile(self):
- name = 'NAME'
- cmd = rados.cmd_erasure_code_profile(name, {})
- assert 'k=2' in cmd
- assert name in cmd
- cmd = rados.cmd_erasure_code_profile(name, { 'k': '88' })
- assert 'k=88' in cmd
- assert name in cmd
+++ /dev/null
-"""
-vstart_runner: override Filesystem and Mount interfaces to run a CephFSTestCase against a vstart
-ceph instance instead of a packaged/installed cluster. Use this to turn around test cases
-quickly during development.
-
-Usage (assuming teuthology, ceph, ceph-qa-suite checked out in ~/git):
-
- # Activate the teuthology virtualenv
- source ~/git/teuthology/virtualenv/bin/activate
- # Go into your ceph build directory
- cd ~/git/ceph/build
- # Start a vstart cluster
- MDS=2 MON=1 OSD=3 ../src/vstart.sh -n
- # Invoke a test using this script, with PYTHONPATH set appropriately
- python ~/git/ceph-qa-suite/tasks/vstart_runner.py
-
- # Alternatively, if you use different paths, specify them as follows:
- LD_LIBRARY_PATH=`pwd`/lib PYTHONPATH=~/git/teuthology:~/git/ceph-qa-suite:`pwd`/../src/pybind:`pwd`/lib/cython_modules/lib.2 python ~/git/ceph-qa-suite/tasks/vstart_runner.py
-
- # If you wish to drop to a python shell on failures, use --interactive:
- python ~/git/ceph-qa-suite/tasks/vstart_runner.py --interactive
-
- # If you wish to run a named test case, pass it as an argument:
- python ~/git/ceph-qa-suite/tasks/vstart_runner.py tasks.cephfs.test_data_scan
-
-"""
-
-from StringIO import StringIO
-from collections import defaultdict
-import getpass
-import signal
-import tempfile
-import threading
-import datetime
-import shutil
-import re
-import os
-import time
-import json
-import sys
-import errno
-from unittest import suite
-import unittest
-import platform
-from teuthology.orchestra.run import Raw, quote
-from teuthology.orchestra.daemon import DaemonGroup
-from teuthology.config import config as teuth_config
-
-import logging
-
-log = logging.getLogger(__name__)
-
-handler = logging.FileHandler("./vstart_runner.log")
-formatter = logging.Formatter(
- fmt=u'%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s',
- datefmt='%Y-%m-%dT%H:%M:%S')
-handler.setFormatter(formatter)
-log.addHandler(handler)
-log.setLevel(logging.INFO)
-
-
-def respawn_in_path(lib_path, python_paths):
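- # Make sure the dynamic linker can find the ceph libraries: if the library path variable
- # (LD_LIBRARY_PATH, or DYLD_LIBRARY_PATH on macOS) needs updating, set it and re-exec this
- # script; once the environment is already correct, just prepend the extra python paths.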
- execv_cmd = ['python']
- if platform.system() == "Darwin":
- lib_path_var = "DYLD_LIBRARY_PATH"
- else:
- lib_path_var = "LD_LIBRARY_PATH"
-
- py_binary = os.environ.get("PYTHON", "python")
-
- if lib_path_var in os.environ:
- if lib_path not in os.environ[lib_path_var]:
- os.environ[lib_path_var] += ':' + lib_path
- os.execvp(py_binary, execv_cmd + sys.argv)
- else:
- os.environ[lib_path_var] = lib_path
- os.execvp(py_binary, execv_cmd + sys.argv)
-
- for p in python_paths:
- sys.path.insert(0, p)
-
-
-# Let's use some sensible defaults
-if os.path.exists("./CMakeCache.txt") and os.path.exists("./bin"):
-
- # A list of candidate paths for each package we need
- guesses = [
- ["~/git/teuthology", "~/scm/teuthology", "~/teuthology"],
- ["~/git/ceph-qa-suite", "~/scm/ceph-qa-suite", "~/ceph-qa-suite"],
- ["lib/cython_modules/lib.2"],
- ["../src/pybind"],
- ]
-
- python_paths = []
- for package_guesses in guesses:
- for g in package_guesses:
- g_exp = os.path.abspath(os.path.expanduser(g))
- if os.path.exists(g_exp):
- python_paths.append(g_exp)
-
- ld_path = os.path.join(os.getcwd(), "lib/")
- print "Using guessed paths {0} {1}".format(ld_path, python_paths)
- respawn_in_path(ld_path, python_paths)
-
-
-try:
- from teuthology.exceptions import CommandFailedError
- from tasks.ceph_manager import CephManager
- from tasks.cephfs.fuse_mount import FuseMount
- from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster
- from mgr.mgr_test_case import MgrCluster
- from teuthology.contextutil import MaxWhileTries
- from teuthology.task import interactive
-except ImportError:
- sys.stderr.write("***\nError importing packages, have you activated your teuthology virtualenv "
- "and set PYTHONPATH to point to teuthology and ceph-qa-suite?\n***\n\n")
- raise
-
-# Must import after teuthology because of gevent monkey patching
-import subprocess
-
-if os.path.exists("./CMakeCache.txt"):
- # Running in build dir of a cmake build
- BIN_PREFIX = "./bin/"
-else:
- # Running in src/ of an autotools build
- BIN_PREFIX = "./"
-
-
-class LocalRemoteProcess(object):
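- # Local stand-in for the proc object returned by teuthology's remote.run(): wraps a
- # subprocess.Popen and exposes wait()/finished/kill() plus captured stdout/stderr.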
- def __init__(self, args, subproc, check_status, stdout, stderr):
- self.args = args
- self.subproc = subproc
- if stdout is None:
- self.stdout = StringIO()
- else:
- self.stdout = stdout
-
- if stderr is None:
- self.stderr = StringIO()
- else:
- self.stderr = stderr
-
- self.check_status = check_status
- self.exitstatus = self.returncode = None
-
- def wait(self):
- if self.finished:
- # Avoid calling communicate() on a dead process because it'll
- # give you stick about std* already being closed
- if self.exitstatus != 0:
- raise CommandFailedError(self.args, self.exitstatus)
- else:
- return
-
- out, err = self.subproc.communicate()
- self.stdout.write(out)
- self.stderr.write(err)
-
- self.exitstatus = self.returncode = self.subproc.returncode
-
- if self.exitstatus != 0:
- sys.stderr.write(out)
- sys.stderr.write(err)
-
- if self.check_status and self.exitstatus != 0:
- raise CommandFailedError(self.args, self.exitstatus)
-
- @property
- def finished(self):
- if self.exitstatus is not None:
- return True
-
- if self.subproc.poll() is not None:
- out, err = self.subproc.communicate()
- self.stdout.write(out)
- self.stderr.write(err)
- self.exitstatus = self.returncode = self.subproc.returncode
- return True
- else:
- return False
-
- def kill(self):
- log.info("kill ")
- if self.subproc.pid and not self.finished:
- log.info("kill: killing pid {0} ({1})".format(
- self.subproc.pid, self.args))
- safe_kill(self.subproc.pid)
- else:
- log.info("kill: already terminated ({0})".format(self.args))
-
- @property
- def stdin(self):
- class FakeStdIn(object):
- def __init__(self, mount_daemon):
- self.mount_daemon = mount_daemon
-
- def close(self):
- self.mount_daemon.kill()
-
- return FakeStdIn(self)
-
-
-class LocalRemote(object):
- """
- Amusingly named class to present the teuthology RemoteProcess interface when we are really
- running things locally for vstart
-
- Run this inside your src/ dir!
- """
-
- def __init__(self):
- self.name = "local"
- self.hostname = "localhost"
- self.user = getpass.getuser()
-
- def get_file(self, path, sudo, dest_dir):
- tmpfile = tempfile.NamedTemporaryFile(delete=False).name
- shutil.copy(path, tmpfile)
- return tmpfile
-
- def put_file(self, src, dst, sudo=False):
- shutil.copy(src, dst)
-
- def run(self, args, check_status=True, wait=True,
- stdout=None, stderr=None, cwd=None, stdin=None,
- logger=None, label=None):
- log.info("run args={0}".format(args))
-
- # We don't need no stinkin' sudo
- args = [a for a in args if a != "sudo"]
-
- # We have to use shell=True if any run.Raw was present, e.g. &&
- shell = any([a for a in args if isinstance(a, Raw)])
-
- if shell:
- filtered = []
- i = 0
- while i < len(args):
- if args[i] == 'adjust-ulimits':
- i += 1
- elif args[i] == 'ceph-coverage':
- i += 2
- elif args[i] == 'timeout':
- i += 2
- else:
- filtered.append(args[i])
- i += 1
-
- args = quote(filtered)
- log.info("Running {0}".format(args))
-
- subproc = subprocess.Popen(args,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- stdin=subprocess.PIPE,
- cwd=cwd,
- shell=True)
- else:
- log.info("Running {0}".format(args))
-
- for arg in args:
- if not isinstance(arg, basestring):
- raise RuntimeError("Oops, can't handle arg {0} type {1}".format(
- arg, arg.__class__
- ))
-
- subproc = subprocess.Popen(args,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- stdin=subprocess.PIPE,
- cwd=cwd)
-
- if stdin:
- if not isinstance(stdin, basestring):
- raise RuntimeError("Can't handle non-string stdins on a vstart cluster")
-
- # Hack: writing to stdin is not deadlock-safe, but it "always" works
- # as long as the input buffer is "small"
- subproc.stdin.write(stdin)
-
- proc = LocalRemoteProcess(
- args, subproc, check_status,
- stdout, stderr
- )
-
- if wait:
- proc.wait()
-
- return proc
-
-
-class LocalDaemon(object):
- def __init__(self, daemon_type, daemon_id):
- self.daemon_type = daemon_type
- self.daemon_id = daemon_id
- self.controller = LocalRemote()
- self.proc = None
-
- @property
- def remote(self):
- return LocalRemote()
-
- def running(self):
- return self._get_pid() is not None
-
- def _get_pid(self):
- """
- Return PID as an integer or None if not found
- """
- ps_txt = self.controller.run(
- args=["ps", "-xwwu"+str(os.getuid())]
- ).stdout.getvalue().strip()
- lines = ps_txt.split("\n")[1:]
-
- for line in lines:
- if line.find("ceph-{0} -i {1}".format(self.daemon_type, self.daemon_id)) != -1:
- log.info("Found ps line for daemon: {0}".format(line))
- return int(line.split()[1])
- log.info("No match for {0} {1}: {2}".format(
- self.daemon_type, self.daemon_id, ps_txt
- ))
- return None
-
- def wait(self, timeout):
- waited = 0
- while self._get_pid() is not None:
- if waited > timeout:
- raise MaxWhileTries("Timed out waiting for daemon {0}.{1}".format(self.daemon_type, self.daemon_id))
- time.sleep(1)
- waited += 1
-
- def stop(self, timeout=300):
- if not self.running():
- log.error('tried to stop a non-running daemon')
- return
-
- pid = self._get_pid()
- log.info("Killing PID {0} for {1}.{2}".format(pid, self.daemon_type, self.daemon_id))
- os.kill(pid, signal.SIGKILL)
- self.wait(timeout=timeout)
-
- def restart(self):
- if self._get_pid() is not None:
- self.stop()
-
- self.proc = self.controller.run([os.path.join(BIN_PREFIX, "./ceph-{0}".format(self.daemon_type)), "-i", self.daemon_id])
-
-
-def safe_kill(pid):
- """
- os.kill annoyingly raises exception if process already dead. Ignore it.
- """
- try:
- return os.kill(pid, signal.SIGKILL)
- except OSError as e:
- if e.errno == errno.ESRCH:
- # Raced with process termination
- pass
- else:
- raise
-
-
-class LocalFuseMount(FuseMount):
- def __init__(self, test_dir, client_id):
- super(LocalFuseMount, self).__init__(None, test_dir, client_id, LocalRemote())
-
- @property
- def config_path(self):
- return "./ceph.conf"
-
- def get_keyring_path(self):
- # This is going to end up in a config file, so use an absolute path
- # to avoid assumptions about daemons' pwd
- return os.path.abspath("./client.{0}.keyring".format(self.client_id))
-
- def run_shell(self, args, wait=True):
- # FIXME maybe should add a pwd arg to teuthology.orchestra so that
- # the "cd foo && bar" shenanigans isn't needed to begin with and
- # then we wouldn't have to special case this
- return self.client_remote.run(
- args, wait=wait, cwd=self.mountpoint
- )
-
- @property
- def _prefix(self):
- return BIN_PREFIX
-
- def _asok_path(self):
- # In teuthology, the asok is named after the PID of the ceph-fuse process, because it's
- # run in the foreground. When running it daemonized, however, the asok is named after
- # the PID of the launching process, not the long-running ceph-fuse process. Therefore
- # we need to give an exact path here, as the logic for checking /proc/ for which
- # asok is alive does not work.
- path = "./out/client.{0}.{1}.asok".format(self.client_id, self.fuse_daemon.subproc.pid)
- log.info("I think my launching pid was {0}".format(self.fuse_daemon.subproc.pid))
- return path
-
- def umount(self):
- if self.is_mounted():
- super(LocalFuseMount, self).umount()
-
- def mount(self, mount_path=None, mount_fs_name=None):
- self.client_remote.run(
- args=[
- 'mkdir',
- '--',
- self.mountpoint,
- ],
- )
-
- def list_connections():
- self.client_remote.run(
- args=["mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
- check_status=False
- )
- p = self.client_remote.run(
- args=["ls", "/sys/fs/fuse/connections"],
- check_status=False
- )
- if p.exitstatus != 0:
- log.warn("ls conns failed with {0}, assuming none".format(p.exitstatus))
- return []
-
- ls_str = p.stdout.getvalue().strip()
- if ls_str:
- return [int(n) for n in ls_str.split("\n")]
- else:
- return []
-
- # Before starting ceph-fuse process, note the contents of
- # /sys/fs/fuse/connections
- pre_mount_conns = list_connections()
- log.info("Pre-mount connections: {0}".format(pre_mount_conns))
-
- prefix = [os.path.join(BIN_PREFIX, "ceph-fuse")]
- if os.getuid() != 0:
- prefix += ["--client-die-on-failed-remount=false"]
-
- if mount_path is not None:
- prefix += ["--client_mountpoint={0}".format(mount_path)]
-
- if mount_fs_name is not None:
- prefix += ["--client_mds_namespace={0}".format(mount_fs_name)]
-
- self.fuse_daemon = self.client_remote.run(args=
- prefix + [
- "-f",
- "--name",
- "client.{0}".format(self.client_id),
- self.mountpoint
- ], wait=False)
-
- log.info("Mounting client.{0} with pid {1}".format(self.client_id, self.fuse_daemon.subproc.pid))
-
- # Wait for the connection reference to appear in /sys
- waited = 0
- post_mount_conns = list_connections()
- while len(post_mount_conns) <= len(pre_mount_conns):
- if self.fuse_daemon.finished:
- # Did mount fail? Raise the CommandFailedError instead of
- # hitting the "failed to populate /sys/" timeout
- self.fuse_daemon.wait()
- time.sleep(1)
- waited += 1
- if waited > 30:
- raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format(
- waited
- ))
- post_mount_conns = list_connections()
-
- log.info("Post-mount connections: {0}".format(post_mount_conns))
-
- # Record our fuse connection number so that we can use it when
- # forcing an unmount
- new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
- if len(new_conns) == 0:
- raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
- elif len(new_conns) > 1:
- raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
- else:
- self._fuse_conn = new_conns[0]
-
- def _run_python(self, pyscript):
- """
- Override this to remove the daemon-helper prefix that is used otherwise
- to make the process killable.
- """
- return self.client_remote.run(args=[
- 'python', '-c', pyscript
- ], wait=False)
-
-
-class LocalCephManager(CephManager):
- def __init__(self):
- # Deliberately skip parent init, only inheriting from it to get
- # util methods like osd_dump that sit on top of raw_cluster_cmd
- self.controller = LocalRemote()
-
- # Only a minority of CephManager functions actually take this lock, for when
- # certain teuthology tests want to run tasks in parallel
- self.lock = threading.RLock()
-
- def find_remote(self, daemon_type, daemon_id):
- """
- daemon_type like 'mds', 'osd'
- daemon_id like 'a', '0'
- """
- return LocalRemote()
-
- def run_ceph_w(self):
- proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph"), "-w"], wait=False, stdout=StringIO())
- return proc
-
- def raw_cluster_cmd(self, *args):
- """
- args like ["osd", "dump"}
- return stdout string
- """
- proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args))
- return proc.stdout.getvalue()
-
- def raw_cluster_cmd_result(self, *args):
- """
- like raw_cluster_cmd but don't check status, just return rc
- """
- proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args), check_status=False)
- return proc.exitstatus
-
- def admin_socket(self, daemon_type, daemon_id, command, check_status=True):
- return self.controller.run(
- args=[os.path.join(BIN_PREFIX, "ceph"), "daemon", "{0}.{1}".format(daemon_type, daemon_id)] + command, check_status=check_status
- )
-
- # FIXME: copypasta
- def get_mds_status(self, mds):
- """
- Run cluster commands for the mds in order to get mds information
- """
- out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
- j = json.loads(' '.join(out.splitlines()[1:]))
- # collate; for dup ids, larger gid wins.
- for info in j['info'].itervalues():
- if info['name'] == mds:
- return info
- return None
-
- # FIXME: copypasta
- def get_mds_status_by_rank(self, rank):
- """
- Run cluster commands for the mds in order to get mds information
- check rank.
- """
- j = self.get_mds_status_all()
- # collate; for dup ids, larger gid wins.
- for info in j['info'].itervalues():
- if info['rank'] == rank:
- return info
- return None
-
- def get_mds_status_all(self):
- """
- Run cluster command to extract all the mds status.
- """
- out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
- j = json.loads(' '.join(out.splitlines()[1:]))
- return j
-
-
-class LocalCephCluster(CephCluster):
- def __init__(self, ctx):
- # Deliberately skip calling parent constructor
- self._ctx = ctx
- self.mon_manager = LocalCephManager()
- self._conf = defaultdict(dict)
-
- def get_config(self, key, service_type=None):
- if service_type is None:
- service_type = 'mon'
-
- # FIXME hardcoded vstart service IDs
- service_id = {
- 'mon': 'a',
- 'mds': 'a',
- 'osd': '0'
- }[service_type]
-
- return self.json_asok(['config', 'get', key], service_type, service_id)[key]
-
- def _write_conf(self):
- # In teuthology, we have the honour of writing the entire ceph.conf, but
- # in vstart land it has mostly already been written and we need to carefully
- # append to it.
- conf_path = "./ceph.conf"
- banner = "\n#LOCAL_TEST\n"
- existing_str = open(conf_path).read()
-
- if banner in existing_str:
- existing_str = existing_str[0:existing_str.find(banner)]
-
- existing_str += banner
-
- for subsys, kvs in self._conf.items():
- existing_str += "\n[{0}]\n".format(subsys)
- for key, val in kvs.items():
- # Comment out existing instance if it exists
- log.info("Searching for existing instance {0}/{1}".format(
- key, subsys
- ))
- existing_section = re.search("^\[{0}\]$([\n]|[^\[])+".format(
- subsys
- ), existing_str, re.MULTILINE)
-
- if existing_section:
- section_str = existing_str[existing_section.start():existing_section.end()]
- existing_val = re.search("^\s*[^#]({0}) =".format(key), section_str, re.MULTILINE)
- if existing_val:
- start = existing_section.start() + existing_val.start(1)
- log.info("Found string to replace at {0}".format(
- start
- ))
- existing_str = existing_str[0:start] + "#" + existing_str[start:]
-
- existing_str += "{0} = {1}\n".format(key, val)
-
- open(conf_path, "w").write(existing_str)
-
- def set_ceph_conf(self, subsys, key, value):
- self._conf[subsys][key] = value
- self._write_conf()
-
- def clear_ceph_conf(self, subsys, key):
- del self._conf[subsys][key]
- self._write_conf()
-
-
-class LocalMDSCluster(LocalCephCluster, MDSCluster):
- def __init__(self, ctx):
- super(LocalMDSCluster, self).__init__(ctx)
-
- self.mds_ids = ctx.daemons.daemons['mds'].keys()
- if not self.mds_ids:
- raise RuntimeError("No MDSs found in ceph.conf!")
-
- self.mds_daemons = dict([(id_, LocalDaemon("mds", id_)) for id_ in self.mds_ids])
-
- def clear_firewall(self):
- # FIXME: unimplemented
- pass
-
- def newfs(self, name):
- return LocalFilesystem(self._ctx, create=name)
-
-
-class LocalMgrCluster(LocalCephCluster, MgrCluster):
- def __init__(self, ctx):
- super(LocalMgrCluster, self).__init__(ctx)
-
- self.mgr_ids = ctx.daemons.daemons['mgr'].keys()
- if not self.mgr_ids:
- raise RuntimeError("No manager daemonss found in ceph.conf!")
-
- self.mgr_daemons = dict([(id_, LocalDaemon("mgr", id_)) for id_ in self.mgr_ids])
-
-
-class LocalFilesystem(Filesystem, LocalMDSCluster):
- @property
- def admin_remote(self):
- return LocalRemote()
-
- def __init__(self, ctx, fscid=None, create=None):
- # Deliberately skip calling parent constructor
- self._ctx = ctx
-
- self.id = None
- self.name = None
- self.metadata_pool_name = None
- self.data_pools = None
-
- # Hack: cheeky inspection of ceph.conf to see what MDSs exist
- self.mds_ids = set()
- for line in open("ceph.conf").readlines():
- match = re.match("^\[mds\.(.+)\]$", line)
- if match:
- self.mds_ids.add(match.group(1))
-
- if not self.mds_ids:
- raise RuntimeError("No MDSs found in ceph.conf!")
-
- self.mds_ids = list(self.mds_ids)
-
- log.info("Discovered MDS IDs: {0}".format(self.mds_ids))
-
- self.mon_manager = LocalCephManager()
-
- self.mds_daemons = dict([(id_, LocalDaemon("mds", id_)) for id_ in self.mds_ids])
-
- self.client_remote = LocalRemote()
-
- self._conf = defaultdict(dict)
-
- if create is not None:
- if fscid is not None:
- raise RuntimeError("cannot specify fscid when creating fs")
- if create is True:
- self.name = 'cephfs'
- else:
- self.name = create
- self.create()
- elif fscid is not None:
- self.id = fscid
- self.getinfo(refresh=True)
-
- # Stash a reference to the first created filesystem on ctx, so
- # that if someone drops to the interactive shell they can easily
- # poke our methods.
- if not hasattr(self._ctx, "filesystem"):
- self._ctx.filesystem = self
-
- @property
- def _prefix(self):
- return BIN_PREFIX
-
- def set_clients_block(self, blocked, mds_id=None):
- raise NotImplementedError()
-
- def get_pgs_per_fs_pool(self):
- # FIXME: assuming there are 3 OSDs
- return 3 * int(self.get_config('mon_pg_warn_min_per_osd'))
-
-
-class InteractiveFailureResult(unittest.TextTestResult):
- """
- Specialization that implements interactive-on-error style
- behavior.
- """
- def addFailure(self, test, err):
- super(InteractiveFailureResult, self).addFailure(test, err)
- log.error(self._exc_info_to_string(err, test))
- log.error("Failure in test '{0}', going interactive".format(
- self.getDescription(test)
- ))
- interactive.task(ctx=None, config=None)
-
- def addError(self, test, err):
- super(InteractiveFailureResult, self).addError(test, err)
- log.error(self._exc_info_to_string(err, test))
- log.error("Error in test '{0}', going interactive".format(
- self.getDescription(test)
- ))
- interactive.task(ctx=None, config=None)
-
-
-def exec_test():
- # Help developers by stopping up-front if their tree isn't built enough for all the
- # tools that the tests might want to use (add more here if needed)
- require_binaries = ["ceph-dencoder", "cephfs-journal-tool", "cephfs-data-scan",
- "cephfs-table-tool", "ceph-fuse", "rados"]
- missing_binaries = [b for b in require_binaries if not os.path.exists(os.path.join(BIN_PREFIX, b))]
- if missing_binaries:
- log.error("Some ceph binaries missing, please build them: {0}".format(" ".join(missing_binaries)))
- sys.exit(-1)
-
- test_dir = tempfile.mkdtemp()
-
- # Create as many of these as the biggest test requires
- clients = ["0", "1", "2", "3"]
-
- remote = LocalRemote()
-
- # Make sure no stray MDSs or clients are running at start
- ps_txt = remote.run(
- args=["ps", "-u"+str(os.getuid())]
- ).stdout.getvalue().strip()
- lines = ps_txt.split("\n")[1:]
-
- for line in lines:
- if 'ceph-fuse' in line or 'ceph-mds' in line:
- pid = int(line.split()[0])
- log.warn("Killing stray process {0}".format(line))
- os.kill(pid, signal.SIGKILL)
-
- class LocalCluster(object):
- def __init__(self, rolename="placeholder"):
- self.remotes = {
- remote: [rolename]
- }
-
- def only(self, requested):
- return self.__class__(rolename=requested)
-
- teuth_config['test_path'] = test_dir
-
- class LocalContext(object):
- def __init__(self):
- self.config = {}
- self.teuthology_config = teuth_config
- self.cluster = LocalCluster()
- self.daemons = DaemonGroup()
-
- # Shove some LocalDaemons into the ctx.daemons DaemonGroup instance so that any
- # tests that want to look these up via ctx can do so.
- # Inspect ceph.conf to see what roles exist
- for conf_line in open("ceph.conf").readlines():
- for svc_type in ["mon", "osd", "mds", "mgr"]:
- if svc_type not in self.daemons.daemons:
- self.daemons.daemons[svc_type] = {}
- match = re.match("^\[{0}\.(.+)\]$".format(svc_type), conf_line)
- if match:
- svc_id = match.group(1)
- self.daemons.daemons[svc_type][svc_id] = LocalDaemon(svc_type, svc_id)
-
- def __del__(self):
- shutil.rmtree(self.teuthology_config['test_path'])
-
- ctx = LocalContext()
-
- mounts = []
- for client_id in clients:
- # Populate client keyring (it sucks to use client.admin for test clients
- # because it's awkward to find the logs later)
- client_name = "client.{0}".format(client_id)
-
- if client_name not in open("./keyring").read():
- p = remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "auth", "get-or-create", client_name,
- "osd", "allow rw",
- "mds", "allow",
- "mon", "allow r"])
-
- open("./keyring", "a").write(p.stdout.getvalue())
-
- mount = LocalFuseMount(test_dir, client_id)
- mounts.append(mount)
- if mount.is_mounted():
- log.warn("unmounting {0}".format(mount.mountpoint))
- mount.umount_wait()
- else:
- if os.path.exists(mount.mountpoint):
- os.rmdir(mount.mountpoint)
- ceph_cluster = LocalCephCluster(ctx)
- mds_cluster = LocalMDSCluster(ctx)
- mgr_cluster = LocalMgrCluster(ctx)
-
- from tasks.cephfs_test_runner import DecoratingLoader
-
- class LogStream(object):
- def __init__(self):
- self.buffer = ""
-
- def write(self, data):
- self.buffer += data
- if "\n" in self.buffer:
- lines = self.buffer.split("\n")
- for line in lines[:-1]:
- pass
- # sys.stderr.write(line + "\n")
- log.info(line)
- self.buffer = lines[-1]
-
- def flush(self):
- pass
-
- decorating_loader = DecoratingLoader({
- "ctx": ctx,
- "mounts": mounts,
- "ceph_cluster": ceph_cluster,
- "mds_cluster": mds_cluster,
- "mgr_cluster": mgr_cluster,
- })
-
- # For the benefit of polling tests like test_full -- in teuthology land we set this
- # in a .yaml, here it's just a hardcoded thing for the developer's pleasure.
- remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "tell", "osd.*", "injectargs", "--osd-mon-report-interval-max", "5"])
- ceph_cluster.set_ceph_conf("osd", "osd_mon_report_interval_max", "5")
-
- # Vstart defaults to two segments, which very easily gets a "behind on trimming" health warning
- # from normal IO latency. Increase it for running tests.
- ceph_cluster.set_ceph_conf("mds", "mds log max segments", "10")
-
- # Make sure the filesystem created in tests has uid/gid that will let us talk to
- # it after mounting it (without having to go root). Set in 'global' not just 'mds'
- # so that cephfs-data-scan will pick it up too.
- ceph_cluster.set_ceph_conf("global", "mds root ino uid", "%s" % os.getuid())
- ceph_cluster.set_ceph_conf("global", "mds root ino gid", "%s" % os.getgid())
-
- # Monkeypatch get_package_version to avoid having to work out what kind of distro we're on
- def _get_package_version(remote, pkg_name):
- # Used in cephfs tests to find fuse version. Your development workstation *does* have >=2.9, right?
- return "2.9"
-
- import teuthology.packaging
- teuthology.packaging.get_package_version = _get_package_version
-
- def enumerate_methods(s):
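- # Recursively walk a test suite, yielding (containing suite, test case) pairs for each leaf test.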
- for t in s._tests:
- if isinstance(t, suite.BaseTestSuite):
- for sub in enumerate_methods(t):
- yield sub
- else:
- yield s, t
-
- interactive_on_error = False
-
- args = sys.argv[1:]
- flags = [a for a in args if a.startswith("-")]
- modules = [a for a in args if not a.startswith("-")]
- for f in flags:
- if f == "--interactive":
- interactive_on_error = True
- else:
- log.error("Unknown option '{0}'".format(f))
- sys.exit(-1)
-
- if modules:
- log.info("Executing modules: {0}".format(modules))
- module_suites = []
- for mod_name in modules:
- # Test names like cephfs.test_auto_repair
- module_suites.append(decorating_loader.loadTestsFromName(mod_name))
- log.info("Loaded: {0}".format(list(module_suites)))
- overall_suite = suite.TestSuite(module_suites)
- else:
- log.info("Executing all cephfs tests")
- overall_suite = decorating_loader.discover(
- os.path.join(os.path.dirname(os.path.abspath(__file__)), "cephfs")
- )
-
- # Filter out tests that don't lend themselves to interactive running.
- victims = []
- for case, method in enumerate_methods(overall_suite):
- fn = getattr(method, method._testMethodName)
-
- drop_test = False
-
- if hasattr(fn, 'is_for_teuthology') and getattr(fn, 'is_for_teuthology') is True:
- drop_test = True
- log.warn("Dropping test because long running: ".format(method.id()))
-
- if getattr(fn, "needs_trimming", False) is True:
- drop_test = (os.getuid() != 0)
- log.warn("Dropping test because client trim unavailable: ".format(method.id()))
-
- if drop_test:
- # Don't drop the test if it was explicitly requested in arguments
- is_named = False
- for named in modules:
- if named.endswith(method.id()):
- is_named = True
- break
-
- if not is_named:
- victims.append((case, method))
-
- log.info("Disabling {0} tests because of is_for_teuthology or needs_trimming".format(len(victims)))
- for s, method in victims:
- s._tests.remove(method)
-
- if interactive_on_error:
- result_class = InteractiveFailureResult
- else:
- result_class = unittest.TextTestResult
- fail_on_skip = False
-
- class LoggingResult(result_class):
- def startTest(self, test):
- log.info("Starting test: {0}".format(self.getDescription(test)))
- test.started_at = datetime.datetime.utcnow()
- return super(LoggingResult, self).startTest(test)
-
- def stopTest(self, test):
- log.info("Stopped test: {0} in {1}s".format(
- self.getDescription(test),
- (datetime.datetime.utcnow() - test.started_at).total_seconds()
- ))
-
- def addSkip(self, test, reason):
- if fail_on_skip:
- # Don't just call addFailure because that requires a traceback
- self.failures.append((test, reason))
- else:
- super(LoggingResult, self).addSkip(test, reason)
-
- # Execute!
- result = unittest.TextTestRunner(
- stream=LogStream(),
- resultclass=LoggingResult,
- verbosity=2,
- failfast=True).run(overall_suite)
-
- if not result.wasSuccessful():
- result.printErrors() # duplicate output at end for convenience
-
- bad_tests = []
- for test, error in result.errors:
- bad_tests.append(str(test))
- for test, failure in result.failures:
- bad_tests.append(str(test))
-
- sys.exit(-1)
- else:
- sys.exit(0)
-
-
-if __name__ == "__main__":
- exec_test()
+++ /dev/null
-
-"""
-watch_notify_same_primary task
-"""
-from cStringIO import StringIO
-import contextlib
-import logging
-
-from teuthology.orchestra import run
-from teuthology.contextutil import safe_while
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run watch_notify_same_primary
-
- The config should be as follows:
-
- watch_notify_same_primary:
- clients: [client list]
-
- The client list should contain exactly one client.
-
- The test requires 3 osds.
-
- example:
-
- tasks:
- - ceph:
- - watch_notify_same_primary:
- clients: [client.0]
- - interactive:
- """
- log.info('Beginning watch_notify_same_primary...')
- assert isinstance(config, dict), \
- "please list clients to run on"
-
- clients = config.get('clients', ['client.0'])
- assert len(clients) == 1
- role = clients[0]
- assert isinstance(role, basestring)
- PREFIX = 'client.'
- assert role.startswith(PREFIX)
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
- manager = ctx.managers['ceph']
- manager.raw_cluster_cmd('osd', 'set', 'noout')
-
- pool = manager.create_pool_with_unique_name()
- def obj(n): return "foo-{num}".format(num=n)
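- # For each object: write it once, then leave a background 'rados watch' running on it.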
- def start_watch(n):
- remote.run(
- args = [
- "rados",
- "-p", pool,
- "put",
- obj(n),
- "/etc/resolv.conf"],
- logger=log.getChild('watch.{id}'.format(id=n)))
- proc = remote.run(
- args = [
- "rados",
- "-p", pool,
- "watch",
- obj(n)],
- stdin=run.PIPE,
- stdout=StringIO(),
- stderr=StringIO(),
- wait=False)
- return proc
-
- num = 20
-
- watches = [start_watch(i) for i in range(num)]
-
- # wait for them all to register
- for i in range(num):
- with safe_while() as proceed:
- while proceed():
- proc = remote.run(
- args = [
- "rados",
- "-p", pool,
- "listwatchers",
- obj(i)],
- stdout=StringIO())
- lines = proc.stdout.getvalue()
- num_watchers = lines.count('watcher=')
- log.info('i see %d watchers for %s', num_watchers, obj(i))
- if num_watchers >= 1:
- break
-
- def notify(n, msg):
- remote.run(
- args = [
- "rados",
- "-p", pool,
- "notify",
- obj(n),
- msg],
- logger=log.getChild('notify.{id}'.format(id=n)))
-
- [notify(n, 'notify1') for n in range(len(watches))]
-
- manager.kill_osd(0)
- manager.mark_down_osd(0)
-
- [notify(n, 'notify2') for n in range(len(watches))]
-
- try:
- yield
- finally:
- log.info('joining watch_notify_same_primary')
- for watch in watches:
- watch.stdin.write("\n")
-
- run.wait(watches)
-
- for watch in watches:
- lines = watch.stdout.getvalue().split("\n")
- got1 = False
- got2 = False
- for l in lines:
- if 'notify1' in l:
- got1 = True
- if 'notify2' in l:
- got2 = True
- log.info(lines)
- assert got1 and got2
-
- manager.revive_osd(0)
- manager.remove_pool(pool)
+++ /dev/null
-"""
-test_stress_watch task
-"""
-import contextlib
-import logging
-import proc_thrasher
-
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def task(ctx, config):
- """
- Run test_stress_watch
-
- The config should be as follows:
-
- test_stress_watch:
- clients: [client list]
-
- example:
-
- tasks:
- - ceph:
- - test_stress_watch:
- clients: [client.0]
- - interactive:
- """
- log.info('Beginning test_stress_watch...')
- assert isinstance(config, dict), \
- "please list clients to run on"
- testwatch = {}
-
- remotes = []
-
- for role in config.get('clients', ['client.0']):
- assert isinstance(role, basestring)
- PREFIX = 'client.'
- assert role.startswith(PREFIX)
- id_ = role[len(PREFIX):]
- (remote,) = ctx.cluster.only(role).remotes.iterkeys()
- remotes.append(remote)
-
- args =['CEPH_CLIENT_ID={id_}'.format(id_=id_),
- 'CEPH_ARGS="{flags}"'.format(flags=config.get('flags', '')),
- 'daemon-helper',
- 'kill',
- 'multi_stress_watch foo foo'
- ]
-
- log.info("args are %s" % (args,))
-
- proc = proc_thrasher.ProcThrasher({}, remote,
- args=[run.Raw(i) for i in args],
- logger=log.getChild('testwatch.{id}'.format(id=id_)),
- stdin=run.PIPE,
- wait=False
- )
- proc.start()
- testwatch[id_] = proc
-
- try:
- yield
- finally:
- log.info('joining test_stress_watch')
- for i in testwatch.itervalues():
- i.join()
+++ /dev/null
-"""
-Workunit task -- Run ceph on sets of specific clients
-"""
-import logging
-import pipes
-import os
-
-from util import get_remote_for_role
-
-from teuthology import misc
-from teuthology.config import config as teuth_config
-from teuthology.orchestra.run import CommandFailedError
-from teuthology.parallel import parallel
-from teuthology.orchestra import run
-
-log = logging.getLogger(__name__)
-
-
-def task(ctx, config):
- """
- Run ceph on all workunits found under the specified path.
-
- For example::
-
- tasks:
- - ceph:
- - ceph-fuse: [client.0]
- - workunit:
- clients:
- client.0: [direct_io, xattrs.sh]
- client.1: [snaps]
- branch: foo
-
- You can also run a list of workunits on all clients:
- tasks:
- - ceph:
- - ceph-fuse:
- - workunit:
- tag: v0.47
- clients:
- all: [direct_io, xattrs.sh, snaps]
-
- If you have an "all" section it will run all the workunits
- on each client simultaneously, AFTER running any workunits specified
- for individual clients. (This prevents unintended simultaneous runs.)
-
- To customize tests, you can specify environment variables as a dict. You
- can also specify a time limit for each work unit (defaults to 3h):
-
- tasks:
- - ceph:
- - ceph-fuse:
- - workunit:
- sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6
- clients:
- all: [snaps]
- env:
- FOO: bar
- BAZ: quux
- timeout: 3h
-
- This task supports roles that include a ceph cluster, e.g.::
-
- tasks:
- - ceph:
- - workunit:
- clients:
- backup.client.0: [foo]
- client.1: [bar] # cluster is implicitly 'ceph'
-
- :param ctx: Context
- :param config: Configuration
- """
- assert isinstance(config, dict)
- assert isinstance(config.get('clients'), dict), \
- 'configuration must contain a dictionary of clients'
-
- overrides = ctx.config.get('overrides', {})
- misc.deep_merge(config, overrides.get('workunit', {}))
-
- refspec = config.get('branch')
- if refspec is None:
- refspec = config.get('tag')
- if refspec is None:
- refspec = config.get('sha1')
- if refspec is None:
- refspec = 'HEAD'
-
- timeout = config.get('timeout', '3h')
-
- log.info('Pulling workunits from ref %s', refspec)
-
- created_mountpoint = {}
-
- if config.get('env') is not None:
- assert isinstance(config['env'], dict), 'env must be a dictionary'
- clients = config['clients']
-
- # Create scratch dirs for any non-all workunits
- log.info('Making a separate scratch dir for every client...')
- for role in clients.iterkeys():
- assert isinstance(role, basestring)
- if role == "all":
- continue
-
- assert 'client' in role
- created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir'))
- created_mountpoint[role] = created_mnt_dir
-
- # Execute any non-all workunits
- with parallel() as p:
- for role, tests in clients.iteritems():
- if role != "all":
- p.spawn(_run_tests, ctx, refspec, role, tests,
- config.get('env'), timeout=timeout)
-
- # Clean up dirs from any non-all workunits
- for role, created in created_mountpoint.items():
- _delete_dir(ctx, role, created)
-
- # Execute any 'all' workunits
- if 'all' in clients:
- all_tasks = clients["all"]
- _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
- config.get('subdir'), timeout=timeout)
-
-
-def _client_mountpoint(ctx, cluster, id_):
- """
- Returns the path to the expected mountpoint for workunits running
- on some kind of filesystem.
- """
- # for compatibility with tasks like ceph-fuse that aren't cluster-aware yet,
- # only include the cluster name in the dir if the cluster is not 'ceph'
- if cluster == 'ceph':
- dir_ = 'mnt.{0}'.format(id_)
- else:
- dir_ = 'mnt.{0}.{1}'.format(cluster, id_)
- return os.path.join(misc.get_testdir(ctx), dir_)
-
-
-def _delete_dir(ctx, role, created_mountpoint):
- """
- Delete file used by this role, and delete the directory that this
- role appeared in.
-
- :param ctx: Context
- :param role: "role.#" where # is used for the role id.
- """
- cluster, _, id_ = misc.split_role(role)
- remote = get_remote_for_role(ctx, role)
- mnt = _client_mountpoint(ctx, cluster, id_)
- client = os.path.join(mnt, 'client.{id}'.format(id=id_))
-
- # Remove the directory inside the mount where the workunit ran
- remote.run(
- args=[
- 'sudo',
- 'rm',
- '-rf',
- '--',
- client,
- ],
- )
- log.info("Deleted dir {dir}".format(dir=client))
-
- # If the mount was an artificially created dir, delete that too
- if created_mountpoint:
- remote.run(
- args=[
- 'rmdir',
- '--',
- mnt,
- ],
- )
- log.info("Deleted artificial mount point {dir}".format(dir=client))
-
-
-def _make_scratch_dir(ctx, role, subdir):
- """
- Make scratch directories for this role. This also makes the mount
- point if that directory does not exist.
-
- :param ctx: Context
- :param role: "role.#" where # is used for the role id.
- :param subdir: use this subdir (False if not used)
- """
- created_mountpoint = False
- cluster, _, id_ = misc.split_role(role)
- remote = get_remote_for_role(ctx, role)
- dir_owner = remote.user
- mnt = _client_mountpoint(ctx, cluster, id_)
- # if neither kclient nor ceph-fuse are required for a workunit,
- # mnt may not exist. Stat and create the directory if it doesn't.
- try:
- remote.run(
- args=[
- 'stat',
- '--',
- mnt,
- ],
- )
- log.info('Did not need to create dir {dir}'.format(dir=mnt))
- except CommandFailedError:
- remote.run(
- args=[
- 'mkdir',
- '--',
- mnt,
- ],
- )
- log.info('Created dir {dir}'.format(dir=mnt))
- created_mountpoint = True
-
- if not subdir:
- subdir = 'client.{id}'.format(id=id_)
-
- if created_mountpoint:
- remote.run(
- args=[
- 'cd',
- '--',
- mnt,
- run.Raw('&&'),
- 'mkdir',
- '--',
- subdir,
- ],
- )
- else:
- remote.run(
- args=[
- # cd first so this will fail if the mount point does
- # not exist; pure install -d will silently do the
- # wrong thing
- 'cd',
- '--',
- mnt,
- run.Raw('&&'),
- 'sudo',
- 'install',
- '-d',
- '-m', '0755',
- '--owner={user}'.format(user=dir_owner),
- '--',
- subdir,
- ],
- )
-
- return created_mountpoint
-
-
-def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None):
- """
- Make a scratch directory for each client in the cluster, and then for each
- test spawn _run_tests() for each role.
-
- See _run_tests() for parameter documentation.
- """
- is_client = misc.is_type('client')
- client_remotes = {}
- created_mountpoint = {}
- for remote, roles_for_host in ctx.cluster.remotes.items():
- for role in roles_for_host:
- if is_client(role):
- client_remotes[role] = remote
- created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir)
-
- for unit in tests:
- with parallel() as p:
- for role, remote in client_remotes.items():
- p.spawn(_run_tests, ctx, refspec, role, [unit], env, subdir,
- timeout=timeout)
-
- # cleanup the generated client directories
- for role, _ in client_remotes.items():
- _delete_dir(ctx, role, created_mountpoint[role])
-
-def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None):
- """
- Run the individual test. Create a scratch directory and then extract the
- workunits from git. Make the executables, and then run the tests.
- Clean up (remove files created) after the tests are finished.
-
- :param ctx: Context
- :param refspec: branch, sha1, or version tag used to identify this
- build
- :param tests: specific tests specified.
- :param env: environment set in yaml file. Could be None.
- :param subdir: subdirectory set in yaml file. Could be None
- :param timeout: If present, use the 'timeout' command on the remote host
- to limit execution time. Must be specified by a number
- followed by 's' for seconds, 'm' for minutes, 'h' for
- hours, or 'd' for days. If '0' or anything that evaluates
- to False is passed, the 'timeout' command is not used.
- """
- testdir = misc.get_testdir(ctx)
- assert isinstance(role, basestring)
- cluster, type_, id_ = misc.split_role(role)
- assert type_ == 'client'
- remote = get_remote_for_role(ctx, role)
- mnt = _client_mountpoint(ctx, cluster, id_)
- # subdir so we can remove and recreate this a lot without sudo
- if subdir is None:
- scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp')
- else:
- scratch_tmp = os.path.join(mnt, subdir)
- srcdir = '{tdir}/workunit.{role}'.format(tdir=testdir, role=role)
- clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role)
-
- git_url = teuth_config.get_ceph_git_url()
- remote.run(
- logger=log.getChild(role),
- args=[
- 'git',
- 'clone',
- git_url,
- clonedir,
- run.Raw(';'),
- 'cd', '--', clonedir,
- run.Raw('&&'),
- 'git', 'checkout', refspec,
- run.Raw('&&'),
- 'mv', 'qa/workunits', srcdir,
- ],
- )
-
- remote.run(
- logger=log.getChild(role),
- args=[
- 'cd', '--', srcdir,
- run.Raw('&&'),
- 'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
- run.Raw('&&'),
- 'find', '-executable', '-type', 'f', '-printf', r'%P\0',
- run.Raw('>{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)),
- ],
- )
-
- workunits_file = '{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)
- workunits = sorted(misc.get_file(remote, workunits_file).split('\0'))
- assert workunits
-
- try:
- assert isinstance(tests, list)
- for spec in tests:
- log.info('Running workunits matching %s on %s...', spec, role)
- prefix = '{spec}/'.format(spec=spec)
- to_run = [w for w in workunits if w == spec or w.startswith(prefix)]
- if not to_run:
- raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec))
- for workunit in to_run:
- log.info('Running workunit %s...', workunit)
- args = [
- 'mkdir', '-p', '--', scratch_tmp,
- run.Raw('&&'),
- 'cd', '--', scratch_tmp,
- run.Raw('&&'),
- run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'),
- run.Raw('CEPH_REF={ref}'.format(ref=refspec)),
- run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
- run.Raw('CEPH_ARGS="--cluster {0}"'.format(cluster)),
- run.Raw('CEPH_ID="{id}"'.format(id=id_)),
- run.Raw('PATH=$PATH:/usr/sbin')
- ]
- if env is not None:
- for var, val in env.iteritems():
- quoted_val = pipes.quote(val)
- env_arg = '{var}={val}'.format(var=var, val=quoted_val)
- args.append(run.Raw(env_arg))
- args.extend([
- 'adjust-ulimits',
- 'ceph-coverage',
- '{tdir}/archive/coverage'.format(tdir=testdir)])
- if timeout and timeout != '0':
- args.extend(['timeout', timeout])
- args.extend([
- '{srcdir}/{workunit}'.format(
- srcdir=srcdir,
- workunit=workunit,
- ),
- ])
- remote.run(
- logger=log.getChild(role),
- args=args,
- label="workunit test {workunit}".format(workunit=workunit)
- )
- remote.run(
- logger=log.getChild(role),
- args=['sudo', 'rm', '-rf', '--', scratch_tmp],
- )
- finally:
- log.info('Stopping %s on %s...', tests, role)
- remote.run(
- logger=log.getChild(role),
- args=[
- 'rm', '-rf', '--', workunits_file, srcdir, clonedir,
- ],
- )
+++ /dev/null
-tasks:
-- exec:
- all:
- - echo America/New_York | sudo tee /etc/timezone
+++ /dev/null
-tasks:
-- exec:
- all:
- - echo America/Los_Angeles | sudo tee /etc/timezone
+++ /dev/null
-tasks:
-- exec:
- all:
- - echo America/Los_Angeles | sudo tee /etc/timezone
- - "[ $RANDOM -gt 32000 ] && echo America/New_York | sudo tee /etc/timezone"
+++ /dev/null
-[tox]
-envlist = flake8
-skipsdist = True
-
-[testenv:flake8]
-deps=
- flake8
-commands=flake8 --select=F,E9 --exclude=venv,.tox