From: lishuhao
Date: Sun, 14 Apr 2019 11:41:55 +0000 (+0800)
Subject: osd: change build_push_op and submit_push_data base recovery strategy
X-Git-Tag: v15.1.0~2697^2~12
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2455e083478d3cf56ea7bc5c7498f133fd0ac1db;p=ceph.git

osd: change build_push_op and submit_push_data base recovery strategy

Signed-off-by: Ning Yao
Signed-off-by: lishuhao
---

diff --git a/src/messages/MOSDPGLog.h b/src/messages/MOSDPGLog.h
index 041e4bd0739b1..de3a50e0c0924 100644
--- a/src/messages/MOSDPGLog.h
+++ b/src/messages/MOSDPGLog.h
@@ -96,7 +96,7 @@ public:
     encode(epoch, payload);
     encode(info, payload);
     encode(log, payload);
-    encode(missing, payload);
+    encode(missing, payload, features);
     if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
       // pre-nautilus OSDs do not set last_peering_reset properly
       encode(epoch, payload);
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index d8fca3277e05f..3dfc5a918d45a 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -270,7 +270,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
   virtual spg_t primary_spg_t() const = 0;
   virtual pg_shard_t primary_shard() const = 0;
-
+  virtual uint64_t min_peer_features() const = 0;
   virtual hobject_t get_temp_recovery_object(const hobject_t& target,
                                              eversion_t version) = 0;
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index f445df1f0ec81..5942be6c76507 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -511,7 +511,9 @@ public:
   pg_shard_t primary_shard() const override {
     return get_primary();
   }
-
+  uint64_t min_peer_features() const override {
+    return recovery_state.get_min_peer_features();
+  }
   void send_message_osd_cluster(
     int peer, Message *m, epoch_t from_epoch) override;
   void send_message_osd_cluster(
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 8063e0a9e132a..5956750bbd70b 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -1143,6 +1143,14 @@ void ReplicatedBackend::calc_head_subsets(
   if (size)
     data_subset.insert(0, size);
 
+  if (HAVE_FEATURE(parent->min_peer_features(), SERVER_NAUTILUS)) {
+    const auto it = missing.get_items().find(head);
+    assert(it != missing.get_items().end());
+    data_subset.intersection_of(it->second.clean_regions.get_dirty_regions());
+    dout(10) << "calc_head_subsets " << head
+             << " data_subset " << data_subset << dendl;
+  }
+
   if (get_parent()->get_pool().allow_incomplete_clones()) {
     dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
     return;
@@ -1156,11 +1164,11 @@ void ReplicatedBackend::calc_head_subsets(
 
   interval_set<uint64_t> cloning;
   interval_set<uint64_t> prev;
+  hobject_t c = head;
   if (size)
     prev.insert(0, size);
 
   for (int j=snapset.clones.size()-1; j>=0; j--) {
-    hobject_t c = head;
     c.snap = snapset.clones[j];
     prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
     if (!missing.is_missing(c) &&
@@ -1168,23 +1176,29 @@ void ReplicatedBackend::calc_head_subsets(
 	get_parent()->try_lock_for_read(c, manager)) {
       dout(10) << "calc_head_subsets " << head << " has prev " << c
                << " overlap " << prev << dendl;
-      clone_subsets[c] = prev;
-      cloning.union_of(prev);
+      cloning = prev;
       break;
     }
     dout(10) << "calc_head_subsets " << head << " does not have prev " << c
              << " overlap " << prev << dendl;
   }
 
+  cloning.intersection_of(data_subset);
+  if (cloning.empty()) {
+    dout(10) << "skipping clone, nothing needs to clone" << dendl;
+    return;
+  }
+
   if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) {
     dout(10) << "skipping clone, too many holes" << dendl;
     get_parent()->release_locks(manager);
     clone_subsets.clear();
     cloning.clear();
+    return;
   }
 
   // what's left for us to push?
+  clone_subsets[c] = cloning;
   data_subset.subtract(cloning);
 
   dout(10) << "calc_head_subsets " << head
@@ -1287,9 +1301,9 @@ void ReplicatedBackend::prepare_pull(
   ObjectContextRef headctx,
   RPGHandle *h)
 {
-  ceph_assert(get_parent()->get_local_missing().get_items().count(soid));
-  eversion_t _v = get_parent()->get_local_missing().get_items().find(
-    soid)->second.need;
+  const auto missing_iter = get_parent()->get_local_missing().get_items().find(soid);
+  ceph_assert(missing_iter != get_parent()->get_local_missing().get_items().end());
+  eversion_t _v = missing_iter->second.need;
   ceph_assert(_v == v);
   const map<hobject_t, set<pg_shard_t>> &missing_loc(
     get_parent()->get_missing_loc_shards());
@@ -1363,11 +1377,15 @@ void ReplicatedBackend::prepare_pull(
     ceph_assert(ssc->snapset.clone_size.count(soid.snap));
     recovery_info.size = ssc->snapset.clone_size[soid.snap];
+    recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
   } else {
     // pulling head or unversioned object.
     // always pull the whole thing.
     recovery_info.copy_subset.insert(0, (uint64_t)-1);
+    if (HAVE_FEATURE(parent->min_peer_features(), SERVER_NAUTILUS))
+      recovery_info.copy_subset.intersection_of(missing_iter->second.clean_regions.get_dirty_regions());
     recovery_info.size = ((uint64_t)-1);
+    recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
   }
 
   h->pulls[fromshard].push_back(PullOp());
@@ -1378,7 +1396,8 @@ void ReplicatedBackend::prepare_pull(
   op.recovery_info.soid = soid;
   op.recovery_info.version = v;
   op.recovery_progress.data_complete = false;
-  op.recovery_progress.omap_complete = false;
+  op.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty()
+    && HAVE_FEATURE(parent->min_peer_features(), SERVER_NAUTILUS);
   op.recovery_progress.data_recovered_to = 0;
   op.recovery_progress.first = true;
@@ -1491,6 +1510,9 @@ int ReplicatedBackend::prep_push(
   ObcLockManager &&lock_manager)
 {
   get_parent()->begin_peer_recover(peer, soid);
+  const auto pmissing_iter = get_parent()->get_shard_missing().find(peer);
+  const auto missing_iter = pmissing_iter->second.get_items().find(soid);
+  assert(missing_iter != pmissing_iter->second.get_items().end());
   // take note.
   PushInfo &pi = pushing[soid][peer];
   pi.obc = obc;
@@ -1501,6 +1523,9 @@ int ReplicatedBackend::prep_push(
   pi.recovery_info.oi = obc->obs.oi;
   pi.recovery_info.ss = pop->recovery_info.ss;
   pi.recovery_info.version = version;
+  pi.recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
+  pi.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty() &&
+    HAVE_FEATURE(parent->min_peer_features(), SERVER_NAUTILUS);
   pi.lock_manager = std::move(lock_manager);
 
   ObjectRecoveryProgress new_progress;
@@ -1519,7 +1544,9 @@ void ReplicatedBackend::submit_push_data(
   const ObjectRecoveryInfo &recovery_info,
   bool first,
   bool complete,
+  bool clear_omap,
   bool cache_dont_need,
+  interval_set<uint64_t> &data_zeros,
   const interval_set<uint64_t> &intervals_included,
   bufferlist data_included,
   bufferlist omap_header,
@@ -1541,25 +1568,43 @@ void ReplicatedBackend::submit_push_data(
   }
 
   if (first) {
-    t->remove(coll, ghobject_t(target_oid));
-    t->touch(coll, ghobject_t(target_oid));
+    if (!complete) {
+      t->remove(coll, ghobject_t(target_oid));
+      t->touch(coll, ghobject_t(target_oid));
+      bufferlist bv = attrs.at(OI_ATTR);
+      object_info_t oi(bv);
+      t->set_alloc_hint(coll, ghobject_t(target_oid),
+                        oi.expected_object_size,
+                        oi.expected_write_size,
+                        oi.alloc_hint_flags);
+    } else {
+      if (!recovery_info.object_exist) {
+        t->remove(coll, ghobject_t(target_oid));
+        t->touch(coll, ghobject_t(target_oid));
+        bufferlist bv = attrs.at(OI_ATTR);
+        object_info_t oi(bv);
+        t->set_alloc_hint(coll, ghobject_t(target_oid),
+                          oi.expected_object_size,
+                          oi.expected_write_size,
+                          oi.alloc_hint_flags);
+      }
+      //remove xattr and update later if overwrite on original object
+      t->rmattrs(coll, ghobject_t(target_oid));
+      //if need update omap, clear the previous content first
+      if (clear_omap)
+        t->omap_clear(coll, ghobject_t(target_oid));
+    }
+    t->truncate(coll, ghobject_t(target_oid), recovery_info.size);
-    if (omap_header.length())
+    if (omap_header.length()) 
       t->omap_setheader(coll, ghobject_t(target_oid), omap_header);
-    bufferlist bv = attrs.at(OI_ATTR);
-    object_info_t oi(bv);
-    t->set_alloc_hint(coll, ghobject_t(target_oid),
-                      oi.expected_object_size,
-                      oi.expected_write_size,
-                      oi.alloc_hint_flags);
+    struct stat st;
+    int r = store->stat(ch, ghobject_t(recovery_info.soid), &st);
     if (get_parent()->pg_is_remote_backfilling()) {
-      struct stat st;
       uint64_t size = 0;
-      int r = store->stat(ch, ghobject_t(recovery_info.soid), &st);
-      if (r == 0) {
+      if (r == 0)
         size = st.st_size;
-      }
       // Don't need to do anything if object is still the same size
       if (size != recovery_info.oi.size) {
         get_parent()->pg_add_local_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
@@ -1571,11 +1616,46 @@ void ReplicatedBackend::submit_push_data(
                 << dendl;
       }
     }
+    if (!complete) {
+      //clone overlap content in local object
+      if (recovery_info.object_exist) {
+        assert(r == 0);
+        uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size);
+        interval_set<uint64_t> local_intervals_included, local_intervals_excluded;
+        if (local_size) {
+          local_intervals_included.insert(0, local_size);
+          local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset);
+          local_intervals_included.subtract(local_intervals_excluded);
+        }
+        for (interval_set<uint64_t>::const_iterator q = local_intervals_included.begin();
+             q != local_intervals_included.end();
+             ++q) {
+          dout(15) << " clone_range " << recovery_info.soid << " "
+                   << q.get_start() << "~" << q.get_len() << dendl;
+          t->clone_range(coll, ghobject_t(recovery_info.soid), ghobject_t(target_oid),
+                         q.get_start(), q.get_len(), q.get_start());
+        }
+      }
+    }
   }
   uint64_t off = 0;
   uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
   if (cache_dont_need)
     fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+  // Punch zeros for data, if fiemap indicates nothing but it is marked dirty
+  if (data_zeros.size() > 0) {
+    data_zeros.intersection_of(recovery_info.copy_subset);
+    assert(intervals_included.subset_of(data_zeros));
+    data_zeros.subtract(intervals_included);
+
+    dout(20) << __func__ << " recovering object " << recovery_info.soid
+             << " copy_subset: " << recovery_info.copy_subset
+             << " intervals_included: " << intervals_included
+             << " data_zeros: " << data_zeros << dendl;
+
+    for (auto p = data_zeros.begin(); p != data_zeros.end(); ++p)
+      t->zero(coll, ghobject_t(target_oid), p.get_start(), p.get_len());
+  }
   for (interval_set<uint64_t>::const_iterator p = intervals_included.begin();
        p != intervals_included.end();
        ++p) {
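
The hunk above does two new things on the receiving side: it clones extents that overlap the already-present local object into the temp object, and it punches zeros wherever the pushed window is marked dirty but carried no payload. Below is a toy, stand-alone model of the zeroing rule only, using a plain byte map instead of interval_set<uint64_t>; the names and sizes are illustrative, not Ceph's API.

    // Simplified model: inside the window covered by this push, a byte that is
    // dirty (copy_subset) but was not shipped (intervals_included) gets zeroed.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main()
    {
      const uint64_t object_size = 32;
      std::vector<bool> dirty(object_size, false);    // stands in for copy_subset
      std::vector<bool> shipped(object_size, false);  // stands in for intervals_included

      for (uint64_t o = 4; o < 20; ++o) dirty[o] = true;    // dirty extent 4~16
      for (uint64_t o = 4; o < 8; ++o)  shipped[o] = true;  // this push carried only 4~4

      // before/after data_recovered_to bound the window this push is responsible for
      const uint64_t window_begin = 0, window_end = 24;

      for (uint64_t o = window_begin; o < window_end; ++o) {
        if (dirty[o] && !shipped[o])
          std::cout << "zero byte " << o << "\n";  // the real code calls t->zero() per extent
      }
      return 0;
    }
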
@@ -1594,14 +1674,15 @@ void ReplicatedBackend::submit_push_data(
   if (complete) {
     if (!first) {
       dout(10) << __func__ << ": Removing oid "
-        << target_oid << " from the temp collection" << dendl;
+               << target_oid << " from the temp collection" << dendl;
       clear_temp_obj(target_oid);
       t->remove(coll, ghobject_t(recovery_info.soid));
       t->collection_move_rename(coll, ghobject_t(target_oid),
-        coll, ghobject_t(recovery_info.soid));
+                                coll, ghobject_t(recovery_info.soid));
     }
 
     submit_push_complete(recovery_info, t);
+
   }
 }
@@ -1666,7 +1747,7 @@ bool ReplicatedBackend::handle_pull_response(
   const hobject_t &hoid = pop.soid;
 
   ceph_assert((data_included.empty() && data.length() == 0) ||
-       (!data_included.empty() && data.length() > 0));
+              (!data_included.empty() && data.length() > 0));
 
   auto piter = pulling.find(hoid);
   if (piter == pulling.end()) {
@@ -1719,18 +1800,28 @@ bool ReplicatedBackend::handle_pull_response(
   pi.recovery_progress = pop.after_progress;
 
   dout(10) << "new recovery_info " << pi.recovery_info
-      << ", new progress " << pi.recovery_progress
-      << dendl;
-
+           << ", new progress " << pi.recovery_progress
+           << dendl;
+  interval_set<uint64_t> data_zeros;
+  uint64_t z_offset = pop.before_progress.data_recovered_to;
+  uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to;
+  if(z_length)
+    data_zeros.insert(z_offset, z_length);
   bool complete = pi.is_complete();
-
-  submit_push_data(pi.recovery_info, first,
-                   complete, pi.cache_dont_need,
-                   data_included, data,
-                   pop.omap_header,
-                   pop.attrset,
-                   pop.omap_entries,
-                   t);
+  bool clear_omap = !pop.before_progress.omap_complete;
+
+  submit_push_data(pi.recovery_info,
+                   first,
+                   complete,
+                   clear_omap,
+                   pi.cache_dont_need,
+                   data_zeros,
+                   data_included,
+                   data,
+                   pop.omap_header,
+                   pop.attrset,
+                   pop.omap_entries,
+                   t);
 
   pi.stat.num_keys_recovered += pop.omap_entries.size();
   pi.stat.num_bytes_recovered += data.length();
@@ -1769,12 +1860,20 @@ void ReplicatedBackend::handle_push(
   bool first = pop.before_progress.first;
   bool complete = pop.after_progress.data_complete &&
     pop.after_progress.omap_complete;
-
+  bool clear_omap = !pop.before_progress.omap_complete;
+  interval_set<uint64_t> data_zeros;
+  uint64_t z_offset = pop.before_progress.data_recovered_to;
+  uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to;
+  if(z_length)
+    data_zeros.insert(z_offset, z_length);
   response->soid = pop.recovery_info.soid;
+
   submit_push_data(pop.recovery_info, first, complete,
-                   true, // must be replicate
+                   clear_omap, true, // must be replicate
+                   data_zeros,
                    pop.data_included,
                    data,
                    pop.omap_header,
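
Both handle_pull_response() and handle_push() above derive the candidate zero window and the clear_omap flag from the progress markers carried in the op. The sketch below mirrors that computation; ObjectRecoveryProgress here is a pared-down stand-in for the Ceph struct of the same name, reduced to the fields this logic touches.

    #include <cstdint>
    #include <iostream>

    // Pared-down stand-in for Ceph's ObjectRecoveryProgress.
    struct ObjectRecoveryProgress {
      uint64_t data_recovered_to = 0;
      bool first = false;
      bool data_complete = false;
      bool omap_complete = false;
    };

    int main()
    {
      ObjectRecoveryProgress before, after;
      before.data_recovered_to = 0;
      after.data_recovered_to = 8 * 1024 * 1024;  // this op advanced recovery by 8 MiB

      // The zero window spans exactly the range this op was responsible for.
      uint64_t z_offset = before.data_recovered_to;
      uint64_t z_length = after.data_recovered_to - before.data_recovered_to;
      if (z_length)
        std::cout << "data_zeros window: " << z_offset << "~" << z_length << "\n";

      // The first op of an object whose omap is dirty clears the stale omap first.
      bool clear_omap = !before.omap_complete;
      std::cout << "clear_omap=" << clear_omap << "\n";
      return 0;
    }
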
                   pop.data_included,
                   data,
                   pop.omap_header,
@@ -2025,6 +2124,9 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
       if (get_parent()->pg_is_repair())
         stat->num_objects_repaired++;
     }
+  } else if (progress.first && progress.omap_complete) {
+    // If omap is not changed, we need recovery omap when recovery cannot be completed once
+    new_progress.omap_complete = false;
   }
 
   if (stat) {
@@ -2138,10 +2240,14 @@ void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply)
     if (progress.first && recovery_info.size == ((uint64_t)-1)) {
       // Adjust size and copy_subset
       recovery_info.size = st.st_size;
-      recovery_info.copy_subset.clear();
-      if (st.st_size)
-        recovery_info.copy_subset.insert(0, st.st_size);
-      ceph_assert(recovery_info.clone_subset.empty());
+      if (st.st_size) {
+        interval_set<uint64_t> object_range;
+        object_range.insert(0, st.st_size);
+        recovery_info.copy_subset.intersection_of(object_range);
+      } else {
+        recovery_info.copy_subset.clear();
+      }
+      assert(recovery_info.clone_subset.empty());
     }
 
     r = build_push_op(recovery_info, progress, 0, reply);
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index 8f447495a4ed1..6e89f8ca87559 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -268,7 +268,9 @@ private:
   void submit_push_data(const ObjectRecoveryInfo &recovery_info,
                         bool first,
                         bool complete,
+                        bool clear_omap,
                         bool cache_dont_need,
+                        interval_set<uint64_t> &data_zeros,
                         const interval_set<uint64_t> &intervals_included,
                         bufferlist data_included,
                         bufferlist omap_header,
@@ -335,7 +337,7 @@ private:
       ceph_tid_t tid, Context *on_commit,
       OpRequestRef op, eversion_t v)
       : RefCountedObject(nullptr, 0),
-        tid(tid), on_commit(on_commit), 
+        tid(tid), on_commit(on_commit),
        op(op), v(v) {}
     bool done() const {
       return waiting_for_commit.empty();
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 58b6f931568d0..a0ad552c0efc3 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -5891,7 +5891,7 @@ void ObjectRecoveryProgress::dump(Formatter *f) const
 
 void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
 {
-  ENCODE_START(2, 1, bl);
+  ENCODE_START(3, 1, bl);
   encode(soid, bl);
   encode(version, bl);
   encode(size, bl);
@@ -5899,13 +5899,14 @@ void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
   encode(ss, bl);
   encode(copy_subset, bl);
   encode(clone_subset, bl);
+  encode(object_exist, bl);
   ENCODE_FINISH(bl);
 }
 
 void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
                                 int64_t pool)
 {
-  DECODE_START(2, bl);
+  DECODE_START(3, bl);
   decode(soid, bl);
   decode(version, bl);
   decode(size, bl);
@@ -5913,8 +5914,9 @@ void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
   decode(ss, bl);
   decode(copy_subset, bl);
   decode(clone_subset, bl);
+  if (struct_v > 2)
+    decode(object_exist, bl);
   DECODE_FINISH(bl);
-
   if (struct_v < 2) {
     if (!soid.is_max() && soid.pool == -1)
       soid.pool = pool;
@@ -5936,6 +5938,7 @@ void ObjectRecoveryInfo::generate_test_instances(
   o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
   o.back()->version = eversion_t(0,0);
   o.back()->size = 100;
+  o.back()->object_exist = false;
 }
 
 
@@ -5956,6 +5959,7 @@ void ObjectRecoveryInfo::dump(Formatter *f) const
   }
   f->dump_stream("copy_subset") << copy_subset;
   f->dump_stream("clone_subset") << clone_subset;
+  f->dump_stream("object_exist") << object_exist;
 }
 
 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
@@ -5971,6 +5975,7 @@ ostream &ObjectRecoveryInfo::print(ostream &out) const
              << ", copy_subset: " << copy_subset
             << ", clone_subset: " << clone_subset
             << ", snapset: " << ss
+            << ", object_exist: " << object_exist
             << ")";
 }
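
The osd_types.cc hunks bump ObjectRecoveryInfo's encoding from version 2 to 3: object_exist is appended at the end and only decoded when struct_v > 2, so payloads from older OSDs still parse and the field keeps its default. Below is a self-contained illustration of that versioning pattern, using a plain byte vector rather than Ceph's bufferlist and the ENCODE_START/DECODE_START macros; the RecoveryInfoV3 type is illustrative only.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Illustrative type, not the real ObjectRecoveryInfo.
    struct RecoveryInfoV3 {
      uint64_t size = 0;
      bool object_exist = true;  // default matches the pre-upgrade assumption

      void encode(std::vector<uint8_t>& bl) const {
        bl.push_back(3);                           // struct_v
        for (int i = 0; i < 8; ++i)
          bl.push_back((size >> (8 * i)) & 0xff);
        bl.push_back(object_exist ? 1 : 0);        // appended in v3
      }

      void decode(const std::vector<uint8_t>& bl) {
        size_t p = 0;
        uint8_t struct_v = bl[p++];
        size = 0;
        for (int i = 0; i < 8; ++i)
          size |= uint64_t(bl[p++]) << (8 * i);
        if (struct_v > 2)                          // guard keeps old payloads valid
          object_exist = bl[p++] != 0;
      }
    };

    int main()
    {
      RecoveryInfoV3 in, out;
      in.size = 4096;
      in.object_exist = false;

      std::vector<uint8_t> bl;
      in.encode(bl);
      out.decode(bl);
      std::cout << "size=" << out.size << " object_exist=" << out.object_exist << "\n";
      return 0;
    }
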
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 57054e55ea5be..8f42b4c3ecc8f 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -5421,8 +5421,9 @@ struct ObjectRecoveryInfo {
   SnapSet ss;   // only populated if soid is_snap()
   interval_set<uint64_t> copy_subset;
   std::map<hobject_t, interval_set<uint64_t>> clone_subset;
+  bool object_exist;
 
-  ObjectRecoveryInfo() : size(0) { }
+  ObjectRecoveryInfo() : size(0), object_exist(true) { }
   static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o);
   void encode(ceph::buffer::list &bl, uint64_t features) const;
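
The copy_subset and clone_subset members above are interval sets, and the heart of this patch is intersecting them with the dirty regions tracked in the missing set so that only modified extents are read, shipped, and written. Below is a stand-alone sketch of that narrowing, with a hypothetical Extent/intersect pair standing in for interval_set<uint64_t> and the dirty-region tracking; it is a model of the idea, not Ceph's implementation.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Hypothetical helpers; Ceph uses interval_set<uint64_t> for this.
    struct Extent { uint64_t off, len; };

    // Intersect two sorted lists of non-overlapping extents.
    static std::vector<Extent> intersect(const std::vector<Extent>& a,
                                         const std::vector<Extent>& b)
    {
      std::vector<Extent> out;
      size_t i = 0, j = 0;
      while (i < a.size() && j < b.size()) {
        uint64_t lo = std::max(a[i].off, b[j].off);
        uint64_t hi = std::min(a[i].off + a[i].len, b[j].off + b[j].len);
        if (lo < hi)
          out.push_back({lo, hi - lo});
        // advance whichever extent ends first
        if (a[i].off + a[i].len < b[j].off + b[j].len)
          ++i;
        else
          ++j;
      }
      return out;
    }

    int main()
    {
      // copy_subset starts as the whole object ...
      std::vector<Extent> copy_subset = {{0, 4u * 1024 * 1024}};
      // ... and is cut down to the extents the missing entry recorded as dirty.
      std::vector<Extent> dirty_regions = {{0, 8192}, {1u * 1024 * 1024, 4096}};

      copy_subset = intersect(copy_subset, dirty_regions);
      for (const auto& e : copy_subset)
        std::cout << e.off << "~" << e.len << "\n";  // only dirty extents are recovered
      return 0;
    }
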