]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: EC optimizations: changes to rollback to support partial writes
authorBill Scales <156200352+bill-scales@users.noreply.github.com>
Thu, 6 Mar 2025 09:46:05 +0000 (09:46 +0000)
committerBill Scales <bill_scales@uk.ibm.com>
Thu, 10 Apr 2025 08:12:56 +0000 (09:12 +0100)
EC Pools create an ObjectModDesc entry in each log entry that describes
how to undo the modifcation. During peering if it is determined that
only some of the shards completed an update then these shards are
instructed to rollback the change. This ensures that each update
is either applied to all or none of the shards.

For EC optimized pools rollback becomes a bit more complicated.
Firstly because not all shards may have been updated the rollback
needs to be more selective in what is undone. Secondly optimized
pools do not pad objects to be a multiple of the stripe width
which means shards can be different sizes.

There is a single ObjectModDesc entry that contains a set of
operations applied by every shard, individial operations need
to include enough information to work out what has to be undone
on each shard.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/ECBackend.h
src/osd/ECBackendL.h
src/osd/ECSwitch.h
src/osd/PGBackend.cc
src/osd/PGBackend.h
src/osd/osd_types.cc
src/osd/osd_types.h

index 71d14a1e9fc44655afd487130db47ed95382493d..8b50cc22e489a2cb155b0fa91b81a67119335c14 100644 (file)
@@ -388,7 +388,11 @@ public:
   int get_ec_stripe_chunk_size() const {
     return sinfo.get_chunk_size();
   }
-  uint64_t object_size_to_shard_size(const uint64_t size, int shard) const {
+  uint64_t object_size_to_shard_size(const uint64_t size,
+                                    shard_id_t shard) const {
+    if (size == std::numeric_limits<uint64_t>::max()) {
+      return size;
+    }
     return sinfo.logical_to_next_chunk_offset(size);
   }
   /**
index f4f0c3afd50526e5e9f13ec0e2dcbde575dd030f..6a578057acd850f1bde67beec75f332d80932395 100644 (file)
@@ -385,6 +385,9 @@ END_IGNORE_DEPRECATED
     return sinfo.get_chunk_size();
   }
   uint64_t object_size_to_shard_size(const uint64_t size) const {
+    if (size == std::numeric_limits<uint64_t>::max()) {
+      return size;
+    }
     return sinfo.logical_to_next_chunk_offset(size);
   }
   /**
index 893869793ef12f8e5cc80095e6672f4754ac5387..a5e81e8cbb9c577ae9178bc390091bebea827f3d 100644 (file)
@@ -347,7 +347,7 @@ public:
   }
 
   uint64_t
-  object_size_to_shard_size(const uint64_t size, int shard) const override
+  object_size_to_shard_size(const uint64_t size, shard_id_t shard) const override
   {
     if (is_optimized()) {
       return optimized.object_size_to_shard_size(size, shard);
index 9bf461f2528fe2b49fbfb00f473674c75aed868b..e91614192f5c23d75f311d0d9b51181e5d63f79f 100644 (file)
@@ -204,23 +204,56 @@ void PGBackend::rollback(
   struct RollbackVisitor : public ObjectModDesc::Visitor {
     const hobject_t &hoid;
     PGBackend *pg;
+    const pg_log_entry_t &entry;
     ObjectStore::Transaction t;
     RollbackVisitor(
       const hobject_t &hoid,
-      PGBackend *pg) : hoid(hoid), pg(pg) {}
+      PGBackend *pg,
+      const pg_log_entry_t &entry) : hoid(hoid), pg(pg), entry(entry) {}
     void append(uint64_t old_size) override {
       ObjectStore::Transaction temp;
-      int s = static_cast<int>(pg->get_parent()->whoami_shard().shard);
-      const uint64_t shard_size = pg->object_size_to_shard_size(old_size, s);
+      auto dpp = pg->get_parent()->get_dpp();
+      const uint64_t shard_size = pg->object_size_to_shard_size(old_size,
+                      pg->get_parent()->whoami_shard().shard);
+      ldpp_dout(dpp, 20) << " entry " << entry.version
+                        << " rollback append object_size " << old_size
+                        << " shard_size " << shard_size << dendl;
       pg->rollback_append(hoid, shard_size, &temp);
       temp.append(t);
       temp.swap(t);
     }
     void setattrs(map<string, std::optional<bufferlist> > &attrs) override {
-      ObjectStore::Transaction temp;
-      pg->rollback_setattrs(hoid, attrs, &temp);
-      temp.append(t);
-      temp.swap(t);
+      auto dpp = pg->get_parent()->get_dpp();
+      const pg_pool_t &pool = pg->get_parent()->get_pool();
+      if (pool.is_nonprimary_shard(pg->get_parent()->whoami_shard().shard)) {
+        if (entry.is_written_shard(pg->get_parent()->whoami_shard().shard)) {
+         // Written shard - only rollback OI attr
+         ldpp_dout(dpp, 20) << " entry " << entry.version
+                            << " written shard OI attr rollback "
+                            << pg->get_parent()->whoami_shard().shard
+                            << dendl;
+         ObjectStore::Transaction temp;
+         pg->rollback_setattrs(hoid, attrs, &temp, true);
+         temp.append(t);
+         temp.swap(t);
+       } else {
+         // Unwritten shard - nothing to rollback
+         ldpp_dout(dpp, 20) << " entry " << entry.version
+                            << " unwritten shard skipping attr rollback "
+                            << pg->get_parent()->whoami_shard().shard
+                            << dendl;
+       }
+      } else {
+       // Primary shard - rollback all attrs
+       ldpp_dout(dpp, 20) << " entry " << entry.version
+                          << " primary_shard attr rollback "
+                          << pg->get_parent()->whoami_shard().shard
+                          << dendl;
+       ObjectStore::Transaction temp;
+       pg->rollback_setattrs(hoid, attrs, &temp, false);
+       temp.append(t);
+       temp.swap(t);
+      }
     }
     void rmobject(version_t old_version) override {
       ObjectStore::Transaction temp;
@@ -247,17 +280,58 @@ void PGBackend::rollback(
       temp.swap(t);
     }
     void rollback_extents(
-      version_t gen,
-      const vector<pair<uint64_t, uint64_t> > &extents) override {
+      const version_t gen,
+      const std::vector<std::pair<uint64_t, uint64_t>> &extents,
+      const uint64_t object_size,
+      const std::vector<shard_id_set> &shards) override {
       ObjectStore::Transaction temp;
-      pg->rollback_extents(gen, extents, hoid, &temp);
-      temp.append(t);
-      temp.swap(t);
+      const pg_pool_t& pool = pg->get_parent()->get_pool();
+      ceph_assert(entry.written_shards.empty() ||
+                 pool.allows_ecoptimizations());
+      auto dpp = pg->get_parent()->get_dpp();
+      bool donework = false;
+      ceph_assert(shards.empty() || shards.size() == extents.size());
+      for (unsigned int i = 0; i < extents.size(); i++) {
+        if (shards.empty() ||
+           shards[i].empty() ||
+           shards[i].contains(pg->get_parent()->whoami_shard().shard)) {
+         // Written shard - rollback extents
+         const uint64_t shard_size = pg->object_size_to_shard_size(
+                                       object_size,
+                                       pg->get_parent()->whoami_shard().shard);
+         ldpp_dout(dpp, 20) << " entry " << entry.version
+                            << " written shard rollback_extents "
+                            << entry.written_shards
+                            << " shards "
+                            << (shards.empty() ? shard_id_set() : shards[i])
+                            << " " << pg->get_parent()->whoami_shard().shard
+                            << " " << object_size
+                            << " " << shard_size
+                            << dendl;
+         pg->rollback_extents(gen, extents[i].first, extents[i].second,
+                              hoid, shard_size, &temp);
+         donework = true;
+       } else {
+         // Unwritten shard - nothing to rollback
+         ldpp_dout(dpp, 20) << " entry " << entry.version
+                            << " unwritten shard skipping rollback_extents "
+                            << entry.written_shards
+                            << " " << pg->get_parent()->whoami_shard().shard
+                            << dendl;
+       }
+      }
+      if (donework) {
+       t.remove(
+         pg->coll,
+         ghobject_t(hoid, gen, pg->get_parent()->whoami_shard().shard));
+       temp.append(t);
+       temp.swap(t);
+      }
     }
   };
 
   ceph_assert(entry.mod_desc.can_rollback());
-  RollbackVisitor vis(entry.soid, this);
+  RollbackVisitor vis(entry.soid, this, entry);
   entry.mod_desc.visit(&vis);
   t->append(vis.t);
 }
@@ -279,12 +353,28 @@ struct Trimmer : public ObjectModDesc::Visitor {
   }
   // try_rmobject defaults to rmobject
   void rollback_extents(
-    version_t gen,
-    const vector<pair<uint64_t, uint64_t> > &extents) override {
-    pg->trim_rollback_object(
-      soid,
-      gen,
-      t);
+    const version_t gen,
+    const std::vector<std::pair<uint64_t, uint64_t>> &extents,
+    const uint64_t object_size,
+    const std::vector<shard_id_set> &shards) override {
+    auto dpp = pg->get_parent()->get_dpp();
+    ceph_assert(shards.empty() || shards.size() == extents.size());
+    for (unsigned int i = 0; i < extents.size(); i++) {
+      if (shards.empty() ||
+         shards[i].empty() ||
+         shards[i].contains(pg->get_parent()->whoami_shard().shard)) {
+        ldpp_dout(dpp, 30) << __func__ << " trim " << shards << " "
+                          << pg->get_parent()->whoami_shard().shard << dendl;
+        pg->trim_rollback_object(
+          soid,
+          gen,
+          t);
+       break;
+      } else {
+       ldpp_dout(dpp, 20) << __func__ << " skipping trim " << shards << " "
+                          << pg->get_parent()->whoami_shard().shard << dendl;
+      }
+    }
   }
 };
 
@@ -481,7 +571,8 @@ int PGBackend::objects_get_attrs(
 void PGBackend::rollback_setattrs(
   const hobject_t &hoid,
   map<string, std::optional<bufferlist> > &old_attrs,
-  ObjectStore::Transaction *t) {
+  ObjectStore::Transaction *t,
+  bool only_oi) {
   map<string, bufferlist, less<>> to_set;
   ceph_assert(!hoid.is_temp());
   for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin();
@@ -489,17 +580,25 @@ void PGBackend::rollback_setattrs(
        ++i) {
     if (i->second) {
       to_set[i->first] = *(i->second);
-    } else {
+    } else if (!only_oi) {
       t->rmattr(
        coll,
        ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
        i->first);
     }
   }
-  t->setattrs(
-    coll,
-    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
-    to_set);
+  if (only_oi) {
+    t->setattr(
+      coll,
+      ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      OI_ATTR,
+      to_set[OI_ATTR]);
+  } else {
+    t->setattrs(
+      coll,
+      ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      to_set);
+  }
 }
 
 void PGBackend::rollback_append(
@@ -533,6 +632,7 @@ void PGBackend::rollback_try_stash(
   version_t old_version,
   ObjectStore::Transaction *t) {
   ceph_assert(!hoid.is_temp());
+  dout(20) << __func__ << " " << hoid << " " << old_version << dendl;
   t->remove(
     coll,
     ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
@@ -544,22 +644,33 @@ void PGBackend::rollback_try_stash(
 
 void PGBackend::rollback_extents(
   version_t gen,
-  const vector<pair<uint64_t, uint64_t> > &extents,
+  const uint64_t offset,
+  uint64_t length,
   const hobject_t &hoid,
+  const uint64_t shard_size,
   ObjectStore::Transaction *t) {
   auto shard = get_parent()->whoami_shard().shard;
-  for (auto &&extent: extents) {
+  if (offset >= shard_size) {
+    // extent on this shard is beyond the end of the object - nothing to do
+    dout(20) << __func__ << " " << hoid << " "
+            << offset << "~" << length << " is out of range "
+            << shard_size << dendl;
+  } else {
+    if (offset + length > shard_size) {
+      dout(20) << __func__ << " " << length << " is being truncated" << dendl;
+      // extent on this shard goes beyond end of the object - truncate length
+      length = shard_size - offset;
+    }
+    dout(20) << __func__ << " " << hoid << " " << offset << "~" << length
+            << dendl;
     t->clone_range(
       coll,
       ghobject_t(hoid, gen, shard),
       ghobject_t(hoid, ghobject_t::NO_GEN, shard),
-      extent.first,
-      extent.second,
-      extent.first);
+      offset,
+      length,
+      offset);
   }
-  t->remove(
-    coll,
-    ghobject_t(hoid, gen, shard));
 }
 
 void PGBackend::trim_rollback_object(
@@ -567,6 +678,7 @@ void PGBackend::trim_rollback_object(
   version_t old_version,
   ObjectStore::Transaction *t) {
   ceph_assert(!hoid.is_temp());
+  dout(20) << __func__ << " trim " << hoid << " " << old_version << dendl;
   t->remove(
     coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
 }
index 9103e7e57d92b51c590b03fec4b87f700b59d849..32ca5f17820c9086832ba23a392a8b6ee8667b49 100644 (file)
@@ -426,7 +426,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
    virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
    virtual unsigned int get_ec_data_chunk_count() const { return 0; };
    virtual int get_ec_stripe_chunk_size() const { return 0; };
-   virtual uint64_t object_size_to_shard_size(const uint64_t size, int shard) const { return size; };
+   virtual uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard) const { return size; };
    virtual void dump_recovery_info(ceph::Formatter *f) const = 0;
 
  private:
@@ -504,7 +504,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
    void rollback_setattrs(
      const hobject_t &hoid,
      std::map<std::string, std::optional<ceph::buffer::list> > &old_attrs,
-     ObjectStore::Transaction *t);
+     ObjectStore::Transaction *t,
+     bool only_oi);
 
    /// Truncate object to rollback append
    void rollback_append(
@@ -534,8 +535,10 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
    /// Clone the extents back into place
    void rollback_extents(
      version_t gen,
-     const std::vector<std::pair<uint64_t, uint64_t> > &extents,
+     const uint64_t offset,
+     uint64_t length,
      const hobject_t &hoid,
+     const uint64_t shard_size,
      ObjectStore::Transaction *t);
  public:
 
index 6068e6a1b1f9f985b68639f50e91afa4622663e4..4a4e74758fbf79b40f01d0267d6c859a4d08b7d9 100644 (file)
@@ -4668,11 +4668,25 @@ void ObjectModDesc::visit(Visitor *visitor) const
        break;
       }
       case ROLLBACK_EXTENTS: {
-       vector<pair<uint64_t, uint64_t> > extents;
+       vector<pair<uint64_t, uint64_t>> extents;
        version_t gen;
+       uint64_t object_size;
+       vector<shard_id_set> shards;
        decode(gen, bp);
        decode(extents, bp);
-       visitor->rollback_extents(gen,extents);
+       if (struct_v < 3) {
+         // Object size is used by optimized EC pools that do not pad objects to a
+         // multiple of the strip size. Rollback clone operations for each shard
+         // need to be truncated to not exceed the object size. Legacy EC pools
+         // do not store the object_size, but because objects are padded do not
+         // need to truncate the clones. Setting object_size to max avoids
+         // truncation.
+         object_size = std::numeric_limits<uint64_t>::max();
+       } else {
+         decode(object_size, bp);
+         decode(shards, bp);
+       }
+       visitor->rollback_extents(gen, extents, object_size, shards);
        break;
       }
       default:
@@ -4728,11 +4742,16 @@ struct DumpVisitor : public ObjectModDesc::Visitor {
     f->close_section();
   }
   void rollback_extents(
-    version_t gen,
-    const vector<pair<uint64_t, uint64_t> > &extents) override {
+    const version_t gen,
+    const vector<pair<uint64_t, uint64_t>> &extents,
+    const uint64_t object_size,
+    const vector<shard_id_set> &shards) override {
     f->open_object_section("op");
     f->dump_string("code", "ROLLBACK_EXTENTS");
     f->dump_unsigned("gen", gen);
+    f->dump_unsigned("object_size", object_size);
+    f->dump_stream("extents") << extents;
+    f->dump_stream("shards") << shards;
     f->dump_stream("snaps") << extents;
     f->close_section();
   }
index 1cbfef7e15ec2ef8d93a30ccfbd98adf5699c6de..be34ad0f717b74658bc93bb13d656867b8db88b2 100644 (file)
@@ -4052,8 +4052,10 @@ public:
     virtual void create() {}
     virtual void update_snaps(const std::set<snapid_t> &old_snaps) {}
     virtual void rollback_extents(
-      version_t gen,
-      const std::vector<std::pair<uint64_t, uint64_t> > &extents) {}
+      const version_t gen,
+      const std::vector<std::pair<uint64_t, uint64_t>> &extents,
+      const uint64_t object_size,
+      const std::vector<shard_id_set> &shards) {}
     virtual ~Visitor() {}
   };
   void visit(Visitor *visitor) const;
@@ -4076,8 +4078,9 @@ public:
     rollback_info_completed = other.rollback_info_completed;
   }
   void claim_append(ObjectModDesc &other) {
-    if (!can_local_rollback || rollback_info_completed)
+    if (!can_local_rollback || rollback_info_completed) {
       return;
+    }
     if (!other.can_local_rollback) {
       mark_unrollbackable();
       return;
@@ -4099,24 +4102,27 @@ public:
     encode(_id, bl);
   }
   void append(uint64_t old_size) {
-    if (!can_local_rollback || rollback_info_completed)
+    if (!can_local_rollback || rollback_info_completed) {
       return;
+    }
     ENCODE_START(1, 1, bl);
     append_id(APPEND);
     encode(old_size, bl);
     ENCODE_FINISH(bl);
   }
   void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &old_attrs) {
-    if (!can_local_rollback || rollback_info_completed)
+    if (!can_local_rollback || rollback_info_completed) {
       return;
+    }
     ENCODE_START(1, 1, bl);
     append_id(SETATTRS);
     encode(old_attrs, bl);
     ENCODE_FINISH(bl);
   }
   bool rmobject(version_t deletion_version) {
-    if (!can_local_rollback || rollback_info_completed)
+    if (!can_local_rollback || rollback_info_completed) {
       return false;
+    }
     ENCODE_START(1, 1, bl);
     append_id(DELETE);
     encode(deletion_version, bl);
@@ -4125,8 +4131,9 @@ public:
     return true;
   }
   bool try_rmobject(version_t deletion_version) {
-    if (!can_local_rollback || rollback_info_completed)
+    if (!can_local_rollback || rollback_info_completed) {
       return false;
+    }
     ENCODE_START(1, 1, bl);
     append_id(TRY_DELETE);
     encode(deletion_version, bl);
@@ -4135,27 +4142,51 @@ public:
     return true;
   }
   void create() {
-    if (!can_local_rollback || rollback_info_completed)
+    if (!can_local_rollback || rollback_info_completed) {
       return;
+    }
     rollback_info_completed = true;
     ENCODE_START(1, 1, bl);
     append_id(CREATE);
     ENCODE_FINISH(bl);
   }
   void update_snaps(const std::set<snapid_t> &old_snaps) {
-    if (!can_local_rollback || rollback_info_completed)
+    if (!can_local_rollback || rollback_info_completed) {
       return;
+    }
     ENCODE_START(1, 1, bl);
     append_id(UPDATE_SNAPS);
     encode(old_snaps, bl);
     ENCODE_FINISH(bl);
   }
   void rollback_extents(
-    version_t gen, const std::vector<std::pair<uint64_t, uint64_t> > &extents) {
+   const version_t gen,
+   const std::vector<std::pair<uint64_t, uint64_t>> &extents,
+   const uint64_t object_size,
+   const std::vector<shard_id_set> &shards) {
+    ceph_assert(can_local_rollback);
+    ceph_assert(!rollback_info_completed);
+    if (max_required_version < 2) {
+      max_required_version = 2;
+    }
+    ENCODE_START(3, 2, bl);
+    append_id(ROLLBACK_EXTENTS);
+    encode(gen, bl);
+    encode(extents, bl);
+    encode(object_size, bl);
+    encode(shards, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Version for legacy EC (can be deleted when EC*L.cc is deleted.
+  void rollback_extents(
+   const version_t gen,
+   const std::vector<std::pair<uint64_t, uint64_t>> &extents) {
     ceph_assert(can_local_rollback);
     ceph_assert(!rollback_info_completed);
-    if (max_required_version < 2)
+    if (max_required_version < 2) {
       max_required_version = 2;
+    }
     ENCODE_START(2, 2, bl);
     append_id(ROLLBACK_EXTENTS);
     encode(gen, bl);
@@ -4185,8 +4216,9 @@ public:
    * message buffer
    */
   void trim_bl() const {
-    if (bl.length() > 0)
+    if (bl.length() > 0) {
       bl.rebuild();
+    }
   }
   void encode(ceph::buffer::list &bl) const;
   void decode(ceph::buffer::list::const_iterator &bl);