git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore: introduce TransactionManager::map_existing_extent
author Zhang Song <zhangsong325@gmail.com>
Mon, 13 Jun 2022 08:40:54 +0000 (16:40 +0800)
committer Zhang Song <zhangsong325@gmail.com>
Mon, 11 Jul 2022 02:23:34 +0000 (10:23 +0800)
Signed-off-by: Zhang Song <zhangsong325@gmail.com>
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/seastore_types.h
src/crimson/os/seastore/transaction.h
src/crimson/os/seastore/transaction_manager.cc
src/crimson/os/seastore/transaction_manager.h

index 7812267d4af638a1c1cabe1083eef1fa78e7e113..64ce9217807da765b3e312ad960d5243d8e834ec 100644 (file)
@@ -979,6 +979,15 @@ CachedExtentRef Cache::duplicate_for_write(
   if (i->is_pending())
     return i;
 
+  if (i->is_exist_clean()) {
+    i->version++;
+    i->state = CachedExtent::extent_state_t::EXIST_MUTATION_PENDING;
+    i->last_committed_crc = i->get_crc32c();
+    t.add_mutated_extent(i);
+    DEBUGT("duplicate existing extent {}", t, *i);
+    return i;
+  }
+
   auto ret = i->duplicate_for_write();
   ret->prior_instance = i;
   t.add_mutated_extent(ret);
@@ -1035,16 +1044,25 @@ record_t Cache::prepare_record(
       DEBUGT("invalid mutated extent -- {}", t, *i);
       continue;
     }
-    assert(i->prior_instance);
+    assert(i->is_exist_mutation_pending() ||
+          i->prior_instance);
     get_by_ext(efforts.mutate_by_ext,
                i->get_type()).increment(i->get_length());
 
     auto delta_bl = i->get_delta();
     auto delta_length = delta_bl.length();
-    DEBUGT("mutated extent with {}B delta, commit replace extent ... -- {}, prior={}",
-           t, delta_length, *i, *i->prior_instance);
     i->set_modify_time(commit_time);
-    commit_replace_extent(t, i, i->prior_instance);
+    DEBUGT("mutated extent with {}B delta -- {}",
+          t, delta_length, *i);
+    if (!i->is_exist_mutation_pending()) {
+      DEBUGT("commit replace extent ... -- {}, prior={}",
+            t, *i, *i->prior_instance);
+      // An extent in EXIST_MUTATION_PENDING state has no
+      // prior_instance, so such extents skip this branch.
+      // The existing extents should be added into Cache
+      // during complete_commit to sync with gc transaction.
+      commit_replace_extent(t, i, i->prior_instance);
+    }
 
     i->prepare_write();
     i->set_io_wait();
@@ -1197,6 +1215,16 @@ record_t Cache::prepare_record(
        i->get_type());
     }
   }
+
+  for (auto &i: t.existing_block_list) {
+    if (i->is_valid()) {
+      alloc_delta.alloc_blk_ranges.emplace_back(
+        i->get_paddr(),
+       i->cast<LogicalCachedExtent>()->get_laddr(),
+       i->get_length(),
+       i->get_type());
+    }
+  }
   alloc_deltas.emplace_back(std::move(alloc_delta));
 
   for (auto b : alloc_deltas) {
@@ -1401,6 +1429,10 @@ void Cache::complete_commit(
          i->get_length());
       }
       if (is_backref_mapped_extent_node(i)) {
+       DEBUGT("backref_list new {} len {}",
+              t,
+              i->get_paddr(),
+              i->get_length());
        backref_list.emplace_back(
          std::make_unique<backref_buf_entry_t>(
            i->get_paddr(),
@@ -1426,7 +1458,8 @@ void Cache::complete_commit(
     if (!i->is_valid()) {
       continue;
     }
-    assert(i->prior_instance);
+    assert(i->is_exist_mutation_pending() ||
+          i->prior_instance);
     i->on_delta_write(final_block_start);
     i->prior_instance = CachedExtentRef();
     i->state = CachedExtent::extent_state_t::DIRTY;
@@ -1445,6 +1478,13 @@ void Cache::complete_commit(
        i->get_paddr(),
        i->get_length());
     }
+    for (auto &i: t.existing_block_list) {
+      if (i->is_valid()) {
+       cleaner->mark_space_used(
+         i->get_paddr(),
+         i->get_length());
+      }
+    }
   }
 
   for (auto &i: t.mutated_block_list) {
@@ -1459,6 +1499,11 @@ void Cache::complete_commit(
     i->dirty_from_or_retired_at = last_commit;
     if (is_backref_mapped_extent_node(i)
          || is_retired_placeholder(i->get_type())) {
+      DEBUGT("backref_list free {} len {} should release {}",
+            t,
+            i->get_paddr(),
+            i->get_length(),
+            t.should_record_release(i->get_paddr()));
       if (t.should_record_release(i->get_paddr())) {
        backref_list.emplace_back(
          std::make_unique<backref_buf_entry_t>(
@@ -1475,6 +1520,35 @@ void Cache::complete_commit(
       ceph_abort("not possible");
     }
   }
+
+  auto existing_stats = t.get_existing_block_stats();
+  DEBUGT("total existing blocks num: {}, exist clean num: {}, "
+        "exist mutation pending num: {}",
+        t,
+        existing_stats.valid_num,
+        existing_stats.clean_num,
+        existing_stats.mutated_num);
+  for (auto &i: t.existing_block_list) {
+    if (i->is_valid()) {
+      if (i->is_exist_clean()) {
+       i->state = CachedExtent::extent_state_t::CLEAN;
+      } else {
+       assert(i->state == CachedExtent::extent_state_t::DIRTY);
+      }
+      DEBUGT("backref_list new existing {} len {}",
+            t,
+            i->get_paddr(),
+            i->get_length());
+      backref_list.emplace_back(
+        std::make_unique<backref_buf_entry_t>(
+         i->get_paddr(),
+         i->cast<LogicalCachedExtent>()->get_laddr(),
+         i->get_length(),
+         i->get_type(),
+         seq));
+      add_extent(i);
+    }
+  }
   if (!backref_list.empty())
     backref_batch_update(std::move(backref_list), seq);
 }
index 3fceac7ed5883438f9dedbea7d5030fc933fa6d0..a49f2f989d513c5dd4f8b917ab96d514dc353fc8 100644 (file)
@@ -552,6 +552,10 @@ public:
     return !is_zero() && !is_null();
   }
 
+  bool is_absolute() const {
+    return get_device_id() <= DEVICE_ID_MAX_VALID;
+  }
+
   DENC(paddr_t, v, p) {
     DENC_START(1, 1, p);
     denc(v.dev_addr, p);
index 83c99f07bc6610ad5da1a9dc3ef8da846024eb1b..78150ddb58661269d2fd0efd06a630702f323628 100644 (file)
@@ -82,15 +82,20 @@ public:
   };
   get_extent_ret get_extent(paddr_t addr, CachedExtentRef *out) {
     LOG_PREFIX(Transaction::get_extent);
-    if (retired_set.count(addr)) {
-      return get_extent_ret::RETIRED;
-    } else if (auto iter = write_set.find_offset(addr);
+    // It's possible for both write_set and retired_set to contain
+    // this addr at the same time, when addr is absolute and the
+    // corresponding extent is used to map an existing extent on disk.
+    // So search write_set first.
+    if (auto iter = write_set.find_offset(addr);
        iter != write_set.end()) {
       if (out)
        *out = CachedExtentRef(&*iter);
       SUBTRACET(seastore_cache, "{} is present in write_set -- {}",
                 *this, addr, *iter);
+      assert((*out)->is_valid());
       return get_extent_ret::PRESENT;
+    } else if (retired_set.count(addr)) {
+      return get_extent_ret::RETIRED;
     } else if (
       auto iter = read_set.find(addr);
       iter != read_set.end()) {
@@ -109,7 +114,12 @@ public:
 
   void add_to_retired_set(CachedExtentRef ref) {
     ceph_assert(!is_weak());
-    if (ref->is_initial_pending()) {
+    if (ref->is_exist_clean() ||
+       ref->is_exist_mutation_pending()) {
+      existing_block_stats.dec(ref);
+      ref->state = CachedExtent::extent_state_t::INVALID;
+      write_set.erase(*ref);
+    } else if (ref->is_initial_pending()) {
       ref->state = CachedExtent::extent_state_t::INVALID;
       write_set.erase(*ref);
     } else if (ref->is_mutation_pending()) {
@@ -137,19 +147,23 @@ public:
   void add_fresh_extent(
     CachedExtentRef ref) {
     ceph_assert(!is_weak());
-    if (ref->get_paddr().is_delayed()) {
+    if (ref->is_exist_clean()) {
+      existing_block_stats.inc(ref);
+      existing_block_list.push_back(ref);
+    } else if (ref->get_paddr().is_delayed()) {
       assert(ref->get_paddr() == make_delayed_temp_paddr(0));
       assert(ref->is_logical());
       ref->set_paddr(make_delayed_temp_paddr(delayed_temp_offset));
       delayed_temp_offset += ref->get_length();
       delayed_alloc_list.emplace_back(ref->cast<LogicalCachedExtent>());
+      fresh_block_stats.increment(ref->get_length());
     } else {
       assert(ref->get_paddr() == make_record_relative_paddr(0));
       ref->set_paddr(make_record_relative_paddr(offset));
       offset += ref->get_length();
       inline_block_list.push_back(ref);
+      fresh_block_stats.increment(ref->get_length());
     }
-    fresh_block_stats.increment(ref->get_length());
     write_set.insert(*ref);
     if (is_backref_node(ref->get_type()))
       fresh_backref_extents++;
@@ -178,9 +192,15 @@ public:
 
   void add_mutated_extent(CachedExtentRef ref) {
     ceph_assert(!is_weak());
-    assert(read_set.count(ref->prior_instance->get_paddr()));
+    assert(ref->is_exist_mutation_pending() ||
+          read_set.count(ref->prior_instance->get_paddr()));
     mutated_block_list.push_back(ref);
-    write_set.insert(*ref);
+    if (!ref->is_exist_mutation_pending()) {
+      write_set.insert(*ref);
+    } else {
+      assert(write_set.find_offset(ref->get_paddr()) !=
+            write_set.end());
+    }
   }
 
   void replace_placeholder(CachedExtent& placeholder, CachedExtent& extent) {
@@ -233,10 +253,31 @@ public:
     return mutated_block_list;
   }
 
+  const auto &get_existing_block_list() {
+    return existing_block_list;
+  }
+
   const auto &get_retired_set() {
     return retired_set;
   }
 
+  bool is_retired(laddr_t laddr, extent_len_t len, paddr_t paddr) {
+    if (retired_set.empty()) {
+      return false;
+    }
+    auto iter = retired_set.lower_bound(paddr);
+    if (iter == retired_set.end() ||
+       (*iter)->get_paddr() > paddr) {
+      assert(iter != retired_set.begin());
+      --iter;
+    }
+
+    auto lextent = (*iter)->cast<LogicalCachedExtent>();
+    auto ext_laddr = lextent->get_laddr();
+    return ext_laddr <= laddr &&
+      ext_laddr + lextent->get_length() >= laddr + len;
+  }
+
   bool should_record_release(paddr_t addr) {
     auto count = no_release_delta_retired_set.count(addr);
 #ifndef NDEBUG
@@ -337,6 +378,8 @@ public:
     ool_block_list.clear();
     retired_set.clear();
     no_release_delta_retired_set.clear();
+    existing_block_list.clear();
+    existing_block_stats = {};
     onode_tree_stats = {};
     omap_tree_stats = {};
     lba_tree_stats = {};
@@ -404,6 +447,31 @@ public:
     return rewrite_version_stats;
   }
 
+  struct existing_block_stats_t {
+    uint64_t valid_num = 0;
+    uint64_t clean_num = 0;
+    uint64_t mutated_num = 0;
+    void inc(const CachedExtentRef &ref) {
+      valid_num++;
+      if (ref->is_exist_clean()) {
+       clean_num++;
+      } else {
+       mutated_num++;
+      }
+    }
+    void dec(const CachedExtentRef &ref) {
+      valid_num--;
+      if (ref->is_exist_clean()) {
+       clean_num--;
+      } else {
+       mutated_num--;
+      }
+    }
+  };
+  existing_block_stats_t& get_existing_block_stats() {
+    return existing_block_stats;
+  }
+
 private:
   friend class Cache;
   friend Ref make_test_transaction();
@@ -455,6 +523,10 @@ private:
   /// list of mutated blocks, holds refcounts, subset of write_set
   std::list<CachedExtentRef> mutated_block_list;
 
+  /// partial blocks of extents on disk, with data and refcounts
+  std::list<CachedExtentRef> existing_block_list;
+  existing_block_stats_t existing_block_stats;
+
   /**
    * retire_set
    *
index 434daf8dfc2f5777de291b68c6c018d58a20fbe4..14813a66848933808f6ab8e47eef26936b725dec 100644 (file)
@@ -389,7 +389,8 @@ TransactionManager::submit_transaction_direct(
       // ...but add_pin from parent->leaf
       std::vector<CachedExtentRef> lba_to_link;
       std::vector<CachedExtentRef> backref_to_link;
-      lba_to_link.reserve(tref.get_fresh_block_stats().num);
+      lba_to_link.reserve(tref.get_fresh_block_stats().num +
+                         tref.get_existing_block_stats().valid_num);
       backref_to_link.reserve(tref.get_fresh_block_stats().num);
       tref.for_each_fresh_block([&](auto &e) {
        if (e->is_valid()) {
@@ -400,6 +401,12 @@ TransactionManager::submit_transaction_direct(
        }
       });
 
+      for (auto &e: tref.get_existing_block_list()) {
+       if (e->is_valid()) {
+         lba_to_link.push_back(e);
+       }
+      }
+
       lba_manager->complete_transaction(tref, lba_to_clear, lba_to_link);
       backref_manager->complete_transaction(tref, backref_to_clear, backref_to_link);
 
index de7bb3fee4d7e97d5d5692c3c8c4e4769b784567..823b1abcef9b0a64a359a0949c299acad18ec419 100644 (file)
@@ -362,6 +362,68 @@ public:
     });
   }
 
+  /**
+   * map_existing_extent
+   *
+   * Allocates a new extent at the given existing_paddr, which must be
+   * absolute, and reads the disk to fill the extent.
+   * The common usage is to remove a LogicalCachedExtent (laddr~length at
+   * paddr) and then map its range to multiple new extents.
+   * placement_hint and generation should follow the original extent.
+   */
+  using map_existing_extent_iertr =
+    alloc_extent_iertr::extend_ertr<Device::read_ertr>;
+  template <typename T>
+  using map_existing_extent_ret =
+    map_existing_extent_iertr::future<TCachedExtentRef<T>>;
+  template <typename T>
+  map_existing_extent_ret<T> map_existing_extent(
+    Transaction &t,
+    laddr_t laddr_hint,
+    paddr_t existing_paddr,
+    extent_len_t length,
+    placement_hint_t placement_hint = placement_hint_t::HOT,
+    reclaim_gen_t gen = DIRTY_GENERATION) {
+    LOG_PREFIX(TransactionManager::map_existing_extent);
+    ceph_assert(existing_paddr.is_absolute());
+    assert(t.is_retired(laddr_hint, length, existing_paddr));
+
+    auto bp = ceph::bufferptr(buffer::create_page_aligned(length));
+    bp.zero();
+
+    // ExtentPlacementManager::alloc_new_extent would make a new
+    // (relative/temp) paddr, so construct the extent directly here.
+    auto ext = CachedExtent::make_cached_extent_ref<T>(std::move(bp));
+
+    ext->init(CachedExtent::extent_state_t::EXIST_CLEAN,
+             existing_paddr,
+             placement_hint,
+             gen);
+
+    t.add_fresh_extent(ext);
+
+    return lba_manager->alloc_extent(
+      t,
+      laddr_hint,
+      length,
+      existing_paddr
+    ).si_then([ext=std::move(ext), laddr_hint, &t, this, FNAME](auto &&ref) {
+      SUBDEBUGT(seastore_tm, "map existing extent: {}, laddr_hint: {} pin: {}",
+               t, *ext, laddr_hint, *ref);
+      ceph_assert(laddr_hint == ref->get_key());
+      ext->set_pin(std::move(ref));
+      return epm->read(
+        ext->get_paddr(),
+       ext->get_length(),
+       ext->get_bptr()
+      ).safe_then([ext=std::move(ext)] {
+       return map_existing_extent_iertr::make_ready_future<TCachedExtentRef<T>>
+         (std::move(ext));
+      });
+    });
+  }
+
+
   using reserve_extent_iertr = alloc_extent_iertr;
   using reserve_extent_ret = reserve_extent_iertr::future<LBAPinRef>;
   reserve_extent_ret reserve_region(