]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: improve OSD robustness.
authorIgor Fedotov <ifedotov@suse.com>
Thu, 29 Sep 2022 11:52:45 +0000 (14:52 +0300)
committerIgor Fedotov <igor.fedotov@croit.io>
Thu, 21 Mar 2024 15:20:43 +0000 (18:20 +0300)
Achieved by
1. osd superblock data is replicated in onode's OMAP - hence one can
   recover from that after onode's content is corrupted.
2. pg_num_history object gets full overwrite which eliminates the need to
   merge with previous data (and hence reading corrupted data wouldn't
   kill OSD).

Signed-off-by: Igor Fedotov <ifedotov@croit.io>
(cherry picked from commit 2e9c723b3eddd71b8226be790cc71f5c065e819d)

src/osd/OSD.cc
src/osd/OSD.h
src/osd/osd_types.h

index c53f47b7c8990c66249f1c00951d259e95df409c..bc7105b87ec5f35a98df2c325d7f1cc85454b6e5 100644 (file)
@@ -2100,6 +2100,22 @@ int heap(CephContext& cct,
 
 } // namespace ceph::osd_cmds
 
+void OSD::write_superblock(CephContext* cct, OSDSuperblock& sb, ObjectStore::Transaction& t)
+{
+  dout(10) << "write_superblock " << sb << dendl;
+  // Adds to t (without submitting it) the encoded sb as both object data and an OMAP value.
+  // Hack: the baseline feature set is always in use, so make sure sb records it.
+  if (!sb.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
+    sb.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+  // sb may have been modified above; encode once and reuse the bytes for both copies.
+  bufferlist bl;
+  encode(sb, bl);
+  t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
+  std::map<std::string, ceph::buffer::list> attrs;
+  attrs.emplace(OSD_SUPERBLOCK_OMAP_KEY, bl);
+  t.omap_setkeys(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, attrs);
+}
+
 int OSD::mkfs(CephContext *cct,
              std::unique_ptr<ObjectStore> store,
              uuid_d fsid,
@@ -2161,15 +2177,11 @@ int OSD::mkfs(CephContext *cct,
     sb.osd_fsid = store->get_fsid();
     sb.whoami = whoami;
     sb.compat_features = get_osd_initial_compat_set();
-
-    bufferlist bl;
-    encode(sb, bl);
-
     ObjectStore::CollectionHandle ch = store->create_new_collection(
       coll_t::meta());
     ObjectStore::Transaction t;
     t.create_collection(coll_t::meta(), 0);
-    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
+    write_superblock(cct, sb, t);
     ret = store->queue_transaction(ch, std::move(t));
     if (ret) {
       derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
@@ -3125,7 +3137,7 @@ will start to track new ops received afterwards.";
     superblock.purged_snaps_last = 0;
     ObjectStore::Transaction t;
     dout(10) << __func__ << " updating superblock" << dendl;
-    write_superblock(t);
+    write_superblock(cct, superblock, t);
     ret = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
     if (ret < 0) {
       ss << "Error writing superblock: " << cpp_strerror(ret);
@@ -3774,7 +3786,7 @@ int OSD::init()
     }
 
     ObjectStore::Transaction t;
-    write_superblock(t);
+    write_superblock(cct, superblock, t);
     r = store->queue_transaction(service.meta_ch, std::move(t));
     if (r < 0)
       goto out;
@@ -4577,7 +4589,7 @@ int OSD::shutdown()
   superblock.mounted = service.get_boot_epoch();
   superblock.clean_thru = get_osdmap_epoch();
   ObjectStore::Transaction t;
-  write_superblock(t);
+  write_superblock(cct, superblock, t);
   int r = store->queue_transaction(service.meta_ch, std::move(t));
   if (r) {
     derr << "OSD::shutdown: error writing superblock: "
@@ -4775,25 +4787,35 @@ int OSD::update_crush_device_class()
   }
 }
 
-void OSD::write_superblock(ObjectStore::Transaction& t)
-{
-  dout(10) << "write_superblock " << superblock << dendl;
-
-  //hack: at minimum it's using the baseline feature set
-  if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
-    superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
-
-  bufferlist bl;
-  encode(superblock, bl);
-  t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
-}
 
 int OSD::read_superblock()
 {
   bufferlist bl;
-  int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
-  if (r < 0)
-    return r;
+  
+  set<string> keys;
+  keys.insert(OSD_SUPERBLOCK_OMAP_KEY);
+  map<string, bufferlist> vals;
+  // Let's read from OMAP first to be able to better handle the
+  // "recover-after-an-error" case when main OSD volume data
+  // is partially corrupted (csums don't match for a bunch of onodes).
+  // As a result we might want to set bluestore_ignore_csum_error option which
+  // will silence disk read errors.
+  // Clearly, reading a corrupted superblock that way would miss the error too
+  // and wouldn't attempt to use the still valid OMAP replica.
+  // Hence preferring the OMAP read over the disk one.
+  int r = store->omap_get_values(
+    service.meta_ch, OSD_SUPERBLOCK_GOBJECT, keys, &vals);
+  if (r < 0 || vals.size() == 0) {
+    dout(10) << __func__ << " attempt reading from disk replica" << dendl;
+
+    r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
+    if (r < 0) {
+      return -ENOENT;
+    }
+    dout(10) << __func__ << " got disk replica" << dendl;
+  } else {
+    std::swap(bl, vals.begin()->second);
+  }
 
   auto p = bl.cbegin();
   decode(superblock, p);
@@ -6774,7 +6796,7 @@ void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
                                  make_purged_snaps_oid(), &t,
                                  m->purged_snaps);
   superblock.purged_snaps_last = m->last;
-  write_superblock(t);
+  write_superblock(cct, superblock, t);
   store->queue_transaction(
     service.meta_ch,
     std::move(t));
@@ -7258,7 +7280,7 @@ void OSD::scrub_purged_snaps()
   dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
   ObjectStore::Transaction t;
   superblock.last_purged_snaps_scrub = ceph_clock_now();
-  write_superblock(t);
+  write_superblock(cct, superblock, t);
   int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
   ceph_assert(tr == 0);
   if (is_active()) {
@@ -8062,7 +8084,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
     num++;
     if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
       service.publish_superblock(superblock);
-      write_superblock(t);
+      write_superblock(cct, superblock, t);
       int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
       ceph_assert(tr == 0);
       num = 0;
@@ -8078,7 +8100,7 @@ void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
   }
   if (num > 0) {
     service.publish_superblock(superblock);
-    write_superblock(t);
+    write_superblock(cct, superblock, t);
     int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
     ceph_assert(tr == 0);
   }
@@ -8390,7 +8412,19 @@ void OSD::handle_osd_map(MOSDMap *m)
   {
     bufferlist bl;
     ::encode(pg_num_history, bl);
-    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
+    auto oid = make_pg_num_history_oid();
+    t.truncate(coll_t::meta(), oid, 0); // we don't need bytes left if new data
+                                        // block is shorter than the previous
+                                        // one. And better to trim them, e.g.
+                                        // this allows to avoid csum errors
+                                        // when issuing overwrite
+                                        // (which happens to be partial)
+                                        // and original data is corrupted.
+                                        // Another side effect is that the
+                                        // superblock is not permanently
+                                        // anchored to a fixed disk location
+                                        // any more.
+    t.write(coll_t::meta(), oid, 0, bl.length(), bl);
     dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
   }
 
@@ -8407,7 +8441,7 @@ void OSD::handle_osd_map(MOSDMap *m)
   }
 
   // superblock and commit
-  write_superblock(t);
+  write_superblock(cct, superblock, t);
   t.register_on_commit(new C_OnMapCommit(this, start, last, m));
   store->queue_transaction(
     service.meta_ch,
@@ -8725,7 +8759,7 @@ void OSD::check_osdmap_features()
       dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
       superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
       ObjectStore::Transaction t;
-      write_superblock(t);
+      write_superblock(cct, superblock, t);
       int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
       ceph_assert(err == 0);
     }
index 45bceadeecfe14a347fe72b65feebb7a71657dc3..52880bf4acea27a5142e46ee8826fb05bbcc983c 100644 (file)
@@ -1239,8 +1239,9 @@ private:
   // -- superblock --
   OSDSuperblock superblock;
 
-  void write_superblock();
-  void write_superblock(ObjectStore::Transaction& t);
+  static void write_superblock(CephContext* cct,
+                               OSDSuperblock& sb,
+                               ObjectStore::Transaction& t);
   int read_superblock();
 
   void clear_temp_objects();
index 538c3f42848e6610358ad89fbe203d268f5870e5..6e9a6cd5ab0f00fc1378bb65dd0f98960068573b 100644 (file)
@@ -376,6 +376,7 @@ enum {
 // pg stuff
 
 #define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
+#define OSD_SUPERBLOCK_OMAP_KEY "osd_superblock"
 
 // placement seed (a hash value)
 typedef uint32_t ps_t;