From 9169119bd3c0ac976871e7f3321d9d7a53335c82 Mon Sep 17 00:00:00 2001
From: Tim Serong
Date: Wed, 15 Apr 2020 19:26:19 +1000
Subject: [PATCH] cephadm: handle adopting offline OSDs

The current adopt behavior expects OSDs to be online, in order to read
/var/lib/ceph/osd/ceph-$ID/fsid. To handle the case where OSDs are
offline, this change first checks to see if that file is present, and
if not, falls back to calling `ceph-volume lvm list` to see if there's
a matching OSD there, and if that doesn't work, it checks
/etc/ceph/osd/*.json to see if there's a matching old-style simple OSD
present.

For LVM OSDs, the only thing we need is the OSD's fsid; the remainder
of the adopt procedure "just works", as the various other files in
/var/lib/ceph/$FSID/osd.$ID are created by magic anyway when the OSD is
activated, so it doesn't matter if they're not present at adoption
time.

For simple (ceph-disk created) OSDs, we actually need all the files
under /var/lib/ceph/osd/ceph-$ID/ to be moved to
/var/lib/ceph/$FSID/osd.$ID, so if a simple OSD is found, it's mounted
first, so the existing move_files() a bit further down around line 3200
continues to work.

Fixes: https://tracker.ceph.com/issues/45095
Signed-off-by: Tim Serong
---
 src/cephadm/cephadm | 115 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 101 insertions(+), 14 deletions(-)

diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm
index 2c34d6e5f2c68..59154a48cf2b5 100755
--- a/src/cephadm/cephadm
+++ b/src/cephadm/cephadm
@@ -3089,6 +3089,97 @@ def command_adopt():
         raise Error('daemon type %s not recognized' % daemon_type)
 
 
+class AdoptOsd(object):
+    """Locate the fsid and objectstore type of an OSD that is being adopted.
+
+    Each check_* method returns an (osd_fsid, osd_type) tuple; a value is
+    None when that lookup could not determine it.
+    """
+
+    def __init__(self, osd_data_dir, osd_id):
+        # type: (str, str) -> None
+        self.osd_data_dir = osd_data_dir
+        self.osd_id = osd_id
+
+    def check_online_osd(self):
+        # type: () -> Tuple[Optional[str], Optional[str]]
+        """Read the fsid (and type, if present) from the OSD data dir."""
+
+        osd_fsid, osd_type = None, None
+
+        path = os.path.join(self.osd_data_dir, 'fsid')
+        try:
+            with open(path, 'r') as f:
+                osd_fsid = f.read().strip()
+            logger.info("Found online OSD at %s" % path)
+            if os.path.exists(os.path.join(self.osd_data_dir, 'type')):
+                with open(os.path.join(self.osd_data_dir, 'type')) as f:
+                    osd_type = f.read().strip()
+            else:
+                logger.info('"type" file missing for OSD data dir')
+        except IOError:
+            logger.info('Unable to read OSD fsid from %s' % path)
+
+        return osd_fsid, osd_type
+
+    def check_offline_lvm_osd(self):
+        # type: () -> Tuple[Optional[str], Optional[str]]
+        """Look the OSD up in the output of `ceph-volume lvm list`."""
+
+        osd_fsid, osd_type = None, None
+
+        c = CephContainer(
+            image=args.image,
+            entrypoint='/usr/sbin/ceph-volume',
+            args=['lvm', 'list', '--format=json'],
+            privileged=True
+        )
+        out, err, code = call_throws(c.run_cmd(), verbose=False)
+        if not code:
+            try:
+                js = json.loads(out)
+                if self.osd_id in js:
+                    logger.info("Found offline LVM OSD {}".format(self.osd_id))
+                    osd_fsid = js[self.osd_id][0]['tags']['ceph.osd_fsid']
+                    for device in js[self.osd_id]:
+                        if device['tags']['ceph.type'] == 'block':
+                            osd_type = 'bluestore'
+                            break
+                        if device['tags']['ceph.type'] == 'data':
+                            osd_type = 'filestore'
+                            break
+            except ValueError as e:
+                logger.info("Invalid JSON in ceph-volume lvm list: {}".format(e))
+
+        return osd_fsid, osd_type
+
+    def check_offline_simple_osd(self):
+        # type: () -> Tuple[Optional[str], Optional[str]]
+        """Look for an offline simple (ceph-disk) OSD in /etc/ceph/osd/*.json.
+
+        A non-FileStore OSD found this way is mounted on the OSD data dir,
+        because adoption moves files out of that directory.
+        """
+
+        osd_fsid, osd_type = None, None
+
+        osd_file = glob("/etc/ceph/osd/{}-[a-f0-9-]*.json".format(self.osd_id))
+        if len(osd_file) == 1:
+            with open(osd_file[0], 'r') as f:
+                try:
+                    js = json.loads(f.read())
+                    logger.info("Found offline simple OSD {}".format(self.osd_id))
+                    osd_fsid = js["fsid"]
+                    osd_type = js["type"]
+                    if osd_type != "filestore":
+                        # need this to be mounted for the adopt to work, as it
+                        # needs to move files from this directory
+                        call_throws(['mount', js["data"]["path"], self.osd_data_dir])
+                except ValueError as e:
+                    logger.info("Invalid JSON in {}: {}".format(osd_file[0], e))
+
+        return osd_fsid, osd_type
+
 def command_adopt_ceph(daemon_type, daemon_id, fsid):
     # type: (str, str, str) -> None
 
@@ -3101,20 +3192,16 @@ def command_adopt_ceph(daemon_type, daemon_id, fsid):
 
     osd_fsid = None
     if daemon_type == 'osd':
-        path = os.path.join(data_dir_src, 'fsid')
-        try:
-            with open(path, 'r') as f:
-                osd_fsid = f.read().strip()
-        except IOError:
-            raise Error('unable to read OSD fsid from %s' % path)
-        os_type = None
-        if os.path.exists(os.path.join(data_dir_src, 'type')):
-            with open(os.path.join(data_dir_src, 'type')) as f:
-                os_type = f.read().strip()
-        else:
-            raise Error('"type" file missing for OSD data dir')
-        logger.info('objectstore_type is %s' % os_type)
-        if os_type == 'filestore':
+        adopt_osd = AdoptOsd(data_dir_src, daemon_id)
+        osd_fsid, osd_type = adopt_osd.check_online_osd()
+        if not osd_fsid:
+            osd_fsid, osd_type = adopt_osd.check_offline_lvm_osd()
+        if not osd_fsid:
+            osd_fsid, osd_type = adopt_osd.check_offline_simple_osd()
+        if not osd_fsid:
+            raise Error('Unable to find OSD {}'.format(daemon_id))
+        logger.info('objectstore_type is %s' % osd_type)
+        if osd_type == 'filestore':
             raise Error('FileStore is not supported by cephadm')
 
         # NOTE: implicit assumption here that the units correspond to the
-- 
2.39.5