From: David Zafman Date: Thu, 14 Aug 2014 18:46:29 +0000 (-0700) Subject: ceph_manager: Add test code to use export/import to move a pg X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d6782a685c18572bdbcc21cbea37c737f0b409be;p=ceph.git ceph_manager: Add test code to use export/import to move a pg Check for more than 1 osd down and randomize on chance_move_pg (100%) For now only export from older down osd to newly down osd to avoid missing map Signed-off-by: David Zafman (cherry picked from commit 05eee9fa79ef4cb432cb28e3a2eb0e66fd686a7b) Conflicts: tasks/ceph_manager.py --- diff --git a/tasks/ceph_manager.py b/tasks/ceph_manager.py index f1b72e04636..007f446b37d 100644 --- a/tasks/ceph_manager.py +++ b/tasks/ceph_manager.py @@ -10,6 +10,7 @@ import threading import os from teuthology import misc as teuthology from tasks.scrub import Scrubber +from teuthology.orchestra.remote import Remote def make_admin_daemon_dir(ctx, remote): """ @@ -76,6 +77,7 @@ class Thrasher: self.clean_wait = self.config.get('clean_wait', 0) self.minin = self.config.get("min_in", 3) self.ceph_objectstore_tool = self.config.get('ceph_objectstore_tool', True) + self.chance_move_pg = self.config.get('chance_move_pg', 1.0) num_osds = self.in_osds + self.out_osds self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds @@ -121,36 +123,62 @@ class Thrasher: (remote,) = self.ceph_manager.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys() FSPATH = self.ceph_manager.get_filepath() JPATH = os.path.join(FSPATH, "journal") + exp_osd = imp_osd = osd + exp_remote = imp_remote = remote + # If an older osd is available we'll move a pg from there + if len(self.dead_osds) > 1 and random.random() < self.chance_move_pg: + exp_osd = random.choice(self.dead_osds[:-1]) + (exp_remote,) = self.ceph_manager.ctx.cluster.only('osd.{o}'.format(o=exp_osd)).remotes.iterkeys() prefix = "sudo ceph_objectstore_tool --data-path {fpath} --journal-path {jpath} 
".format(fpath=FSPATH, jpath=JPATH) - cmd = (prefix + "--op list-pgs").format(id=osd) - proc = remote.run(args=cmd, wait=True, check_status=True, stdout=StringIO()) - if proc.exitstatus != 0: - self.log("Failed to get pg list for osd.{osd}".format(osd=osd)) - return + cmd = (prefix + "--op list-pgs").format(id=exp_osd) + proc = exp_remote.run(args=cmd, wait=True, check_status=True, stdout=StringIO()) + if proc.exitstatus: + raise Exception("ceph_objectstore_tool: exp list-pgs failure with status {ret}".format(ret=proc.exitstatus)) pgs = proc.stdout.getvalue().split('\n')[:-1] if len(pgs) == 0: - self.log("No PGs found for osd.{osd}".format(osd=osd)) + self.log("No PGs found for osd.{osd}".format(osd=exp_osd)) return pg = random.choice(pgs) - fpath = os.path.join(os.path.join(teuthology.get_testdir(self.ceph_manager.ctx), "data"), "exp.{pg}.{id}".format(pg=pg,id=osd)) + exp_path = os.path.join(os.path.join(teuthology.get_testdir(self.ceph_manager.ctx), "data"), "exp.{pg}.{id}".format(pg=pg, id=exp_osd)) # export - success = False - cmd = (prefix + "--op export --pgid {pg} --file {file}").format(id=osd, pg=pg, file=fpath) - proc = remote.run(args=cmd) - if proc.exitstatus == 0: - # remove - cmd = (prefix + "--op remove --pgid {pg}").format(id=osd, pg=pg) - proc = remote.run(args=cmd) - if proc.exitstatus == 0: - # import - cmd = (prefix + "--op import --file {file}").format(id=osd, file=fpath) - remote.run(args=cmd) - if proc.exitstatus == 0: - success = True - cmd = "rm -f {file}".format(file=fpath) - remote.run(args=cmd) - if not success: - raise Exception("ceph_objectstore_tool: failure with status {ret}".format(ret=proc.exitstatus)) + cmd = (prefix + "--op export --pgid {pg} --file {file}").format(id=exp_osd, pg=pg, file=exp_path) + proc = exp_remote.run(args=cmd) + if proc.exitstatus: + raise Exception("ceph_objectstore_tool: export failure with status {ret}".format(ret=proc.exitstatus)) + # remove + cmd = (prefix + "--op remove --pgid {pg}").format(id=exp_osd, 
pg=pg)
+        proc = exp_remote.run(args=cmd)
+        if proc.exitstatus:
+            raise Exception("ceph_objectstore_tool: remove failure with status {ret}".format(ret=proc.exitstatus))
+        # If there are at least 2 dead osds we might move the pg
+        if exp_osd != imp_osd:
+            # If pg isn't already on this osd, then we will move it there
+            cmd = (prefix + "--op list-pgs").format(id=imp_osd)
+            proc = imp_remote.run(args=cmd, wait=True, check_status=True, stdout=StringIO())
+            if proc.exitstatus:
+                raise Exception("ceph_objectstore_tool: imp list-pgs failure with status {ret}".format(ret=proc.exitstatus))
+            pgs = proc.stdout.getvalue().split('\n')[:-1]
+            if pg not in pgs:
+                self.log("Moving pg {pg} from osd.{fosd} to osd.{tosd}".format(pg=pg, fosd=exp_osd, tosd=imp_osd))
+                if imp_remote != exp_remote:
+                    # Copy export file to the other machine
+                    self.log("Transfer export file from {srem} to {trem}".format(srem=exp_remote, trem=imp_remote))
+                    tmpexport = Remote.get_file(exp_remote, exp_path)
+                    Remote.put_file(imp_remote, tmpexport, exp_path)
+                    os.remove(tmpexport)
+            else:
+                # Can't move the pg after all
+                imp_osd = exp_osd
+                imp_remote = exp_remote
+        # import
+        cmd = (prefix + "--op import --file {file}").format(id=imp_osd, file=exp_path)
+        proc = imp_remote.run(args=cmd)
+        if proc.exitstatus:
+            raise Exception("ceph_objectstore_tool: import failure with status {ret}".format(ret=proc.exitstatus))
+        cmd = "rm -f {file}".format(file=exp_path)
+        exp_remote.run(args=cmd)
+        if imp_remote != exp_remote:
+            imp_remote.run(args=cmd)
 
     def blackhole_kill_osd(self, osd=None):
diff --git a/tasks/thrashosds.py b/tasks/thrashosds.py
index c19631fbe09..7c15f9aaf43 100644
--- a/tasks/thrashosds.py
+++ b/tasks/thrashosds.py
@@ -95,6 +95,7 @@ def task(ctx, config):
     map_discontinuity_sleep_time: (40) time to wait for map trims
     ceph_objectstore_tool: (true) whether to export/import a pg
       while an osd is down
+    chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%)
 
     example: