--- /dev/null
+../../../../../cephfs/clusters/3-mds.yaml
\ No newline at end of file
--- /dev/null
+../../../../cephfs/objectstore-ec/
\ No newline at end of file
--- /dev/null
+../../../../../cephfs/overrides/debug.yaml
\ No newline at end of file
--- /dev/null
+../../../../../cephfs/overrides/frag_enable.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ max_mds: 1
--- /dev/null
+../../../../../cephfs/overrides/whitelist_health.yaml
\ No newline at end of file
--- /dev/null
+../../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ install ceph/luminous latest
+tasks:
+- install:
+ branch: luminous
+- print: "**** done installing luminous"
+- ceph:
+ log-whitelist:
+ - overall HEALTH_
+ - \(FS_
+ - \(MDS_
+ - \(OSD_
+ - \(MON_DOWN\)
+ - \(CACHE_POOL_
+ - \(POOL_
+ - \(MGR_DOWN\)
+ - \(PG_
+ - \(SMALLER_PGP_NUM\)
+ - Monitor daemon marked osd
+ - Behind on trimming
+ - Manager daemon
+ conf:
+ global:
+ mon warn on pool no app: false
+- exec:
+ osd.0:
+ - ceph osd require-osd-release luminous
+ - ceph osd set-require-min-compat-client luminous
+- print: "**** done ceph"
--- /dev/null
+tasks:
+- ceph-fuse:
+- print: "**** done luminous client"
+- exec:
+ mon.a:
+ - ceph fs set cephfs allow_new_snaps true --yes-i-really-mean-it
+- workunit:
+ timeout: 5m
+ cleanup: false
+ clients:
+ client.0:
+ - fs/snaps/snap-hierarchy.sh
+- print: "**** done snap hierarchy"
--- /dev/null
+tasks:
+- mds_pre_upgrade:
+- print: "**** done mds pre-upgrade sequence"
+- install.upgrade:
+ mon.a:
+ mon.b:
+- print: "**** done install.upgrade both hosts"
+- ceph.stop: [mds.*]
+- ceph.restart:
+ daemons: [mon.*, mgr.*, osd.*, mds.*]
+ mon-health-to-clog: false
+- print: "**** done ceph.restart"
--- /dev/null
+tasks:
+- exec:
+ mon.a:
+ - ceph status
+ - ceph fs dump --format=json-pretty
+ - ceph fs set cephfs max_mds 2 && exit 1 || true
+- print: "**** confirmed cannot set max_mds=2"
+- exec:
+ mon.a:
+ - ceph fs set cephfs allow_new_snaps true
--- /dev/null
+tasks:
+- install.upgrade:
+ client.0:
+- print: "**** done install.upgrade on client.0"
+- ceph-fuse:
+ client.0:
+ mounted: false
+- ceph-fuse:
+ client.0:
+- print: "**** done remount client"
--- /dev/null
+tasks:
+- workunit:
+ timeout: 5m
+ cleanup: false
+ env:
+ VERIFY: verify
+ clients:
+ client.0:
+ - fs/snaps/snap-hierarchy.sh
+- print: "**** done verify snap hierarchy"
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - bad backtrace on inode
+tasks:
+- cephfs_upgrade_snap:
+- print: "**** upgraded snapshot metadata"
+- exec:
+ mon.a:
+ - ceph fs set cephfs max_mds 2
+- print: "**** increased max_mds=2"
+- sleep:
+ duration: 10
+- exec:
+ mon.a:
+ - ceph fs dump | grep '^max_mds.*2'
--- /dev/null
+5-client-sanity.yaml
\ No newline at end of file
all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
num_active = len([r for r in all_roles if is_active_mds(r)])
- fs.set_max_mds(num_active)
+ fs.set_max_mds(config.get('max_mds', num_active))
yield
return result
+ def get_rank(self, rank=0, status=None):
+ if status is None:
+ status = self.getinfo()
+ return status.get_rank(self.id, rank)
+
+ def get_ranks(self, status=None):
+ if status is None:
+ status = self.getinfo()
+ return status.get_ranks(self.id)
+
def get_rank_names(self, status=None):
"""
Return MDS daemon names of those daemons holding a rank,
return self.json_asok(command, 'mds', mds_id)
+ def rank_asok(self, command, rank=0):
+ info = self.get_rank(rank=rank)
+ return self.json_asok(command, 'mds', info['name'])
+
def read_cache(self, path, depth=None):
cmd = ["dump", "tree", path]
if depth is not None:
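The `get_rank`, `get_ranks`, and `rank_asok` helpers above let callers address an MDS by rank instead of by daemon name. A minimal usage sketch, assuming it runs inside a teuthology task where `ctx` is the task context and that "status" is a valid MDS admin-socket command; `log_ranks` is an illustrative name, not part of the patch:

    import logging

    from tasks.cephfs.filesystem import Filesystem

    log = logging.getLogger(__name__)

    def log_ranks(ctx):
        # Resolve every held rank to the daemon currently holding it.
        fs = Filesystem(ctx)
        status = fs.getinfo()
        for info in fs.get_ranks(status=status):
            log.info("rank %d held by mds.%s", info['rank'], info['name'])
        # Run an admin-socket command against whichever daemon holds rank 0.
        return fs.rank_asok(["status"], rank=0)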
--- /dev/null
+"""
+Upgrade cluster snap format.
+"""
+
+import logging
+import time
+
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Upgrade CephFS file system snap format.
+ """
+
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'snap-upgrade task only accepts a dict for configuration'
+
+ fs = Filesystem(ctx)
+
+ mds_map = fs.get_mds_map()
+ assert(mds_map['max_mds'] == 1)
+
+ json = fs.rank_asok(["scrub_path", "/", "force", "recursive", "repair"])
+ if not json or json['return_code'] == 0:
+ log.info("scrub / completed")
+ else:
+ log.info("scrub / failed: {}".format(json))
+
+ json = fs.rank_asok(["scrub_path", "~mdsdir", "force", "recursive", "repair"])
+ if not json or json['return_code'] == 0:
+ log.info("scrub ~mdsdir completed")
+ else:
+ log.info("scrub / failed: {}".format(json))
+
+ for i in range(0, 10):
+ mds_map = fs.get_mds_map()
+ if (mds_map['flags'] & (1<<1)) != 0 and (mds_map['flags'] & (1<<4)) != 0:
+ break
+ time.sleep(10)
+ assert((mds_map['flags'] & (1<<1)) != 0) # Test CEPH_MDSMAP_ALLOW_SNAPS
+ assert((mds_map['flags'] & (1<<4)) != 0) # Test CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
--- /dev/null
+"""
+Prepare MDS cluster for upgrade.
+"""
+
+import logging
+import time
+
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+ """
+ Prepare MDS cluster for upgrade.
+
+ This task reduces ranks to 1 and stops all standbys.
+ """
+
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+        'mds_pre_upgrade task only accepts a dict for configuration'
+
+ fs = Filesystem(ctx)
+ status = fs.getinfo()
+
+ fs.set_max_mds(1)
+ status = fs.getinfo()
+    targets = list(filter(lambda r: r['rank'] >= 1, fs.get_ranks(status=status)))
+    if len(targets) > 0:
+        # deactivate mds in descending order
+        targets = sorted(targets, key=lambda r: r['rank'], reverse=True)
+        for target in targets:
+            log.info("deactivating rank %d" % target['rank'])
+            fs.deactivate(target['rank'])
+            # wait for the rank to leave the mds map before deactivating the next
+            for i in range(0, 30):
+                status = fs.getinfo()
+                if target['rank'] not in fs.get_mds_map(status=status)['in']:
+                    break
+                time.sleep(5)
+    else:
+        status = fs.getinfo()
+
+ assert(fs.get_mds_map(status=status)['max_mds'] == 1)
+ assert(fs.get_mds_map(status=status)['in'] == [0])
+
+ # Stop standbys now to minimize time rank 0 is down in subsequent:
+ # tasks:
+ # - ceph.stop: [mds.*]
+ rank0 = fs.get_rank(rank=0, status=status)
+ for daemon in ctx.daemons.iter_daemons_of_role('mds', fs.mon_manager.cluster):
+ if rank0['name'] != daemon.id_:
+ daemon.stop()
+
+ for i in range(1, 10):
+ time.sleep(5) # time for FSMap to update
+ status = fs.getinfo()
+ if len(list(status.get_standbys())) == 0:
+ break
+ assert(len(list(status.get_standbys())) == 0)
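A later task can re-check the invariant this task establishes (a single active rank 0 and no standbys); a hedged sketch reusing the Filesystem helpers added above (`check_pre_upgrade_state` is a hypothetical name):

    def check_pre_upgrade_state(fs):
        # Mirrors the asserts in mds_pre_upgrade: max_mds back to 1,
        # only rank 0 in the map, and no standbys left running.
        status = fs.getinfo()
        assert fs.get_mds_map(status=status)['max_mds'] == 1
        assert fs.get_mds_map(status=status)['in'] == [0]
        assert len(list(status.get_standbys())) == 0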
refspec = Head()
timeout = config.get('timeout', '3h')
+ cleanup = config.get('cleanup', True)
log.info('Pulling workunits from ref %s', refspec)
created_mountpoint[role] = created_mnt_dir
# Execute any non-all workunits
+ log.info("timeout={}".format(timeout))
+ log.info("cleanup={}".format(cleanup))
with parallel() as p:
for role, tests in clients.iteritems():
if role != "all":
p.spawn(_run_tests, ctx, refspec, role, tests,
config.get('env'),
basedir=config.get('basedir','qa/workunits'),
- timeout=timeout)
+                        timeout=timeout, cleanup=cleanup)
- # Clean up dirs from any non-all workunits
- for role, created in created_mountpoint.items():
- _delete_dir(ctx, role, created)
+ if cleanup:
+ # Clean up dirs from any non-all workunits
+ for role, created in created_mountpoint.items():
+ _delete_dir(ctx, role, created)
# Execute any 'all' workunits
if 'all' in clients:
all_tasks = clients["all"]
_spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
config.get('basedir', 'qa/workunits'),
- config.get('subdir'), timeout=timeout)
+ config.get('subdir'), timeout=timeout,
+ cleanup=cleanup)
def _client_mountpoint(ctx, cluster, id_):
return created_mountpoint
-def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None):
+def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None, cleanup=True):
"""
Make a scratch directory for each client in the cluster, and then for each
test spawn _run_tests() for each role.
timeout=timeout)
# cleanup the generated client directories
- for role, _ in client_remotes.items():
- _delete_dir(ctx, role, created_mountpoint[role])
+ if cleanup:
+ for role, _ in client_remotes.items():
+ _delete_dir(ctx, role, created_mountpoint[role])
def _run_tests(ctx, refspec, role, tests, env, basedir,
- subdir=None, timeout=None):
+ subdir=None, timeout=None, cleanup=True):
"""
Run the individual test. Create a scratch directory and then extract the
workunits from git. Make the executables, and then run the tests.
args=args,
label="workunit test {workunit}".format(workunit=workunit)
)
- remote.run(
- logger=log.getChild(role),
- args=['sudo', 'rm', '-rf', '--', scratch_tmp],
- )
+ if cleanup:
+ remote.run(
+ logger=log.getChild(role),
+ args=['sudo', 'rm', '-rf', '--', scratch_tmp],
+ )
finally:
log.info('Stopping %s on %s...', tests, role)
remote.run(
--- /dev/null
+#!/bin/sh
+
+set -ex
+
+if [ -d "$1" ]; then
+ mkdir -p -- "$1" && cd "$1"
+fi
+
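+# Without VERIFY=verify: build a small hierarchy of directories, files and
+# snapshots. With VERIFY=verify (run again after the upgrade): skip the
+# mkdir/touch steps and only stat the paths, confirming the snapshots are
+# still intact.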
+[ "$VERIFY" != verify ] && mkdir 1
+[ "$VERIFY" != verify ] && mkdir 1/.snap/first
+stat 1/.snap/first
+[ "$VERIFY" != verify ] && mkdir 1/2
+stat 1/.snap/first/2 && exit 1
+[ "$VERIFY" != verify ] && mkdir 1/2/.snap/second
+stat 1/2/.snap/second
+[ "$VERIFY" != verify ] && touch 1/foo
+stat 1/.snap/first/foo && exit 1
+[ "$VERIFY" != verify ] && mkdir 1/.snap/third
+stat 1/.snap/third/foo || exit 1
+[ "$VERIFY" != verify ] && mkdir 1/2/3
+[ "$VERIFY" != verify ] && mkdir 1/2/.snap/fourth
+stat 1/2/.snap/fourth/3
+
+exit 0