From: Igor Fedotov Date: Thu, 29 Sep 2022 11:52:45 +0000 (+0300) Subject: osd: improve OSD robustness. X-Git-Tag: v18.2.4~352^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6abacb86dc7a7f6d4ec84499bc35df811094d339;p=ceph.git osd: improve OSD robustness. Achieved by 1. osd superblock data is replicated in onode's OMAP - hence one can recover from that after onode's content is corrupted. 2. pg_num_history object gets full overwrite which eliminates the need to merge with previous data (and hence reading corrupted data wouldn't kill OSD). Signed-off-by: Igor Fedotov (cherry picked from commit 2e9c723b3eddd71b8226be790cc71f5c065e819d) --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index c61e7d33218a..1fbbeff92377 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2092,6 +2092,22 @@ int heap(CephContext& cct, } // namespace ceph::osd_cmds +void OSD::write_superblock(CephContext* cct, OSDSuperblock& sb, ObjectStore::Transaction& t) +{ + dout(10) << "write_superblock " << sb << dendl; + + //hack: at minimum it's using the baseline feature set + if (!sb.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE)) + sb.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + + bufferlist bl; + encode(sb, bl); + t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl); + std::map attrs; + attrs.emplace(OSD_SUPERBLOCK_OMAP_KEY, bl); + t.omap_setkeys(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, attrs); +} + int OSD::mkfs(CephContext *cct, std::unique_ptr store, uuid_d fsid, @@ -2153,15 +2169,11 @@ int OSD::mkfs(CephContext *cct, sb.osd_fsid = store->get_fsid(); sb.whoami = whoami; sb.compat_features = get_osd_initial_compat_set(); - - bufferlist bl; - encode(sb, bl); - ObjectStore::CollectionHandle ch = store->create_new_collection( coll_t::meta()); ObjectStore::Transaction t; t.create_collection(coll_t::meta(), 0); - t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl); + write_superblock(cct, 
sb, t); ret = store->queue_transaction(ch, std::move(t)); if (ret) { derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: " @@ -3762,7 +3774,7 @@ int OSD::init() } ObjectStore::Transaction t; - write_superblock(t); + write_superblock(cct, superblock, t); r = store->queue_transaction(service.meta_ch, std::move(t)); if (r < 0) goto out; @@ -4572,7 +4584,7 @@ int OSD::shutdown() superblock.mounted = service.get_boot_epoch(); superblock.clean_thru = get_osdmap_epoch(); ObjectStore::Transaction t; - write_superblock(t); + write_superblock(cct, superblock, t); int r = store->queue_transaction(service.meta_ch, std::move(t)); if (r) { derr << "OSD::shutdown: error writing superblock: " @@ -4769,25 +4781,35 @@ int OSD::update_crush_device_class() } } -void OSD::write_superblock(ObjectStore::Transaction& t) -{ - dout(10) << "write_superblock " << superblock << dendl; - - //hack: at minimum it's using the baseline feature set - if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE)) - superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); - - bufferlist bl; - encode(superblock, bl); - t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl); -} int OSD::read_superblock() { bufferlist bl; - int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl); - if (r < 0) - return r; + + set keys; + keys.insert(OSD_SUPERBLOCK_OMAP_KEY); + map vals; + // Let's read from OMAP first to be able to better handle + // "recover-after-an-error" case when main OSD volume data + // is partially corrupted (csums don't match for a bunch of onodes). + // As a result we might want to set bluestore_ignore_csum_error option which + // will silence disk read errors. + // Clearly such a reading from corrupted superblock will miss an error as well + // and it wouldn't attempt to use still valid OMAP's replica. + // Hence preferring omap reading over disk one. 
+ int r = store->omap_get_values( + service.meta_ch, OSD_SUPERBLOCK_GOBJECT, keys, &vals); + if (r < 0 || vals.size() == 0) { + dout(10) << __func__ << " attempt reading from disk replica" << dendl; + + r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl); + if (r < 0) { + return -ENOENT; + } + dout(10) << __func__ << " got disk replica" << dendl; + } else { + std::swap(bl, vals.begin()->second); + } auto p = bl.cbegin(); decode(superblock, p); @@ -6695,7 +6717,7 @@ void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m) m->purged_snaps); } superblock.purged_snaps_last = m->last; - write_superblock(t); + write_superblock(cct, superblock, t); store->queue_transaction( service.meta_ch, std::move(t)); @@ -7179,7 +7201,7 @@ void OSD::scrub_purged_snaps() dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl; ObjectStore::Transaction t; superblock.last_purged_snaps_scrub = ceph_clock_now(); - write_superblock(t); + write_superblock(cct, superblock, t); int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); ceph_assert(tr == 0); if (is_active()) { @@ -7892,7 +7914,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps) num++; if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) { service.publish_superblock(superblock); - write_superblock(t); + write_superblock(cct, superblock, t); int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); ceph_assert(tr == 0); num = 0; @@ -7908,7 +7930,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps) } if (num > 0) { service.publish_superblock(superblock); - write_superblock(t); + write_superblock(cct, superblock, t); int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); ceph_assert(tr == 0); } @@ -8220,7 +8242,19 @@ void OSD::handle_osd_map(MOSDMap *m) { bufferlist bl; ::encode(pg_num_history, bl); - t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl); + auto 
oid = make_pg_num_history_oid(); + t.truncate(coll_t::meta(), oid, 0); // we don't need bytes left if new data + // block is shorter than the previous + // one. And better to trim them, e.g. + // this allows to avoid csum errors + // when issuing overwrite + // (which happens to be partial) + // and original data is corrupted. + // Another side effect is that the + // superblock is not permanently + // anchored to a fixed disk location + // any more. + t.write(coll_t::meta(), oid, 0, bl.length(), bl); dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl; } @@ -8240,7 +8274,7 @@ void OSD::handle_osd_map(MOSDMap *m) } // superblock and commit - write_superblock(t); + write_superblock(cct, superblock, t); t.register_on_commit(new C_OnMapCommit(this, start, last, m)); store->queue_transaction( service.meta_ch, @@ -8558,7 +8592,7 @@ void OSD::check_osdmap_features() dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl; superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); ObjectStore::Transaction t; - write_superblock(t); + write_superblock(cct, superblock, t); int err = store->queue_transaction(service.meta_ch, std::move(t), NULL); ceph_assert(err == 0); } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 00fab7ec83ed..fc40c93a52e3 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1235,8 +1235,9 @@ private: // -- superblock -- OSDSuperblock superblock; - void write_superblock(); - void write_superblock(ObjectStore::Transaction& t); + static void write_superblock(CephContext* cct, + OSDSuperblock& sb, + ObjectStore::Transaction& t); int read_superblock(); void clear_temp_objects(); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index afed5fa83510..01da2b534a81 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -373,6 +373,7 @@ enum { // pg stuff #define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0))) +#define OSD_SUPERBLOCK_OMAP_KEY 
"osd_superblock" // placement seed (a hash value) typedef uint32_t ps_t;