From: Haomai Wang Date: Sat, 27 Jun 2015 03:57:32 +0000 (+0800) Subject: ReplicatedPG: Don't cache recovery and scrub data X-Git-Tag: v9.1.0~332^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=fabd6357e42e526d2704d7cb80375c12d731df8d;p=ceph.git ReplicatedPG: Don't cache recovery and scrub data Signed-off-by: Haomai Wang --- diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index a64c0f2a079f..6cb63ecce12e 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1778,6 +1778,9 @@ void ECBackend::be_deep_scrub( if (stride % sinfo.get_chunk_size()) stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size()); uint64_t pos = 0; + + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + while (true) { bufferlist bl; handle.reset_tp_timeout(); @@ -1787,7 +1790,7 @@ void ECBackend::be_deep_scrub( poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), pos, stride, bl, - true); + fadvise_flags, true); if (r < 0) break; if (bl.length() % sinfo.get_chunk_size()) { diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 518fdf4be850..dad8224c8399 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -250,6 +250,9 @@ * the pending recovery operations. */ struct RecoveryHandle { + bool cache_dont_need; + + RecoveryHandle(): cache_dont_need(false) {} virtual ~RecoveryHandle() {} }; diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 2015dd0b6f5e..73906b4bec60 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -742,13 +742,16 @@ void ReplicatedBackend::be_deep_scrub( bufferlist bl, hdrbl; int r; __u64 pos = 0; + + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + while ( (r = store->read( - coll, - ghobject_t( - poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), - pos, - cct->_conf->osd_deep_scrub_stride, bl, - true)) > 0) { + coll, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + pos, + cct->_conf->osd_deep_scrub_stride, bl, + fadvise_flags, true)) > 0) { handle.reset_tp_timeout(); h << bl; pos += bl.length(); @@ -1518,6 +1521,7 @@ void ReplicatedBackend::prepare_pull( pi.head_ctx = headctx; pi.recovery_info = op.recovery_info; pi.recovery_progress = op.recovery_progress; + pi.cache_dont_need = h->cache_dont_need; } /* @@ -1526,7 +1530,7 @@ void ReplicatedBackend::prepare_pull( */ void ReplicatedBackend::prep_push_to_replica( ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, - PushOp *pop) + PushOp *pop, bool cache_dont_need) { const object_info_t& oi = obc->obs.oi; uint64_t size = obc->obs.oi.size; @@ -1582,7 +1586,7 @@ void ReplicatedBackend::prep_push_to_replica( data_subset, clone_subsets); } - prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop); + prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop, cache_dont_need); } void ReplicatedBackend::prep_push(ObjectContextRef obc, @@ -1605,7 +1609,7 @@ void ReplicatedBackend::prep_push( eversion_t version, interval_set &data_subset, map >& clone_subsets, - PushOp *pop) + PushOp *pop, bool cache_dont_need) { get_parent()->begin_peer_recover(peer, soid); // take note. @@ -1627,7 +1631,7 @@ void ReplicatedBackend::prep_push( pi.recovery_progress, &new_progress, pop, - &(pi.stat)); + &(pi.stat), cache_dont_need); assert(r == 0); pi.recovery_progress = new_progress; } @@ -1671,6 +1675,7 @@ void ReplicatedBackend::submit_push_data( ObjectRecoveryInfo &recovery_info, bool first, bool complete, + bool cache_dont_need, const interval_set &intervals_included, bufferlist data_included, bufferlist omap_header, @@ -1698,13 +1703,16 @@ void ReplicatedBackend::submit_push_data( t->omap_setheader(coll, ghobject_t(target_oid), omap_header); } uint64_t off = 0; + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL; + if (cache_dont_need) + fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; for (interval_set::const_iterator p = intervals_included.begin(); p != intervals_included.end(); ++p) { bufferlist bit; bit.substr_of(data_included, off, p.get_len()); t->write(coll, ghobject_t(target_oid), - p.get_start(), p.get_len(), bit); + p.get_start(), p.get_len(), bit, fadvise_flags); off += p.get_len(); } @@ -1827,7 +1835,7 @@ bool ReplicatedBackend::handle_pull_response( bool complete = pi.is_complete(); submit_push_data(pi.recovery_info, first, - complete, + complete, pi.cache_dont_need, data_included, data, pop.omap_header, pop.attrset, @@ -1871,6 +1879,7 @@ void ReplicatedBackend::handle_push( submit_push_data(pop.recovery_info, first, complete, + true, // must be replicate pop.data_included, data, pop.omap_header, @@ -1950,7 +1959,8 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info, const ObjectRecoveryProgress &progress, ObjectRecoveryProgress *out_progress, PushOp *out_op, - object_stat_sum_t *stat) + object_stat_sum_t *stat, + bool cache_dont_need) { ObjectRecoveryProgress _new_progress; if (!out_progress) @@ -2042,7 +2052,8 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info, ++p) { bufferlist bit; store->read(coll, ghobject_t(recovery_info.soid), - p.get_start(), p.get_len(), bit); + p.get_start(), p.get_len(), bit, + cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0); if (p.get_len() != bit.length()) { dout(10) << " extent " << p.get_start() << "~" << p.get_len() << " is actually " << p.get_start() << "~" << bit.length() @@ -2387,8 +2398,7 @@ int ReplicatedBackend::start_pushes( ++pushes; h->pushes[peer].push_back(PushOp()); prep_push_to_replica(obc, soid, peer, - &(h->pushes[peer].back()) - ); + &(h->pushes[peer].back()), h->cache_dont_need); } } return pushes; diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index 55bef8de5dab..f66c461a8684 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -191,6 +191,7 @@ private: ObjectContextRef head_ctx; ObjectContextRef obc; object_stat_sum_t stat; + bool cache_dont_need; void dump(Formatter *f) const { { @@ -262,10 +263,12 @@ private: const ObjectRecoveryProgress &progress, ObjectRecoveryProgress *out_progress, PushOp *out_op, - object_stat_sum_t *stat = 0); + object_stat_sum_t *stat = 0, + bool cache_dont_need = true); void submit_push_data(ObjectRecoveryInfo &recovery_info, bool first, bool complete, + bool cache_dont_need, const interval_set &intervals_included, bufferlist data_included, bufferlist omap_header, @@ -291,7 +294,7 @@ private: RPGHandle *h); void prep_push_to_replica( ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, - PushOp *pop); + PushOp *pop, bool cache_dont_need = true); void prep_push(ObjectContextRef obc, const hobject_t& oid, pg_shard_t dest, PushOp *op); @@ -300,7 +303,8 @@ private: eversion_t version, interval_set &data_subset, map >& clone_subsets, - PushOp *op); + PushOp *op, + bool cache = false); void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head, const pg_missing_t& missing, const hobject_t &last_backfill, diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index dd760487c8ef..d34ad92696b8 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -388,6 +388,7 @@ void ReplicatedPG::wait_for_unreadable_object( } else { dout(7) << "missing " << soid << " v " << v << ", recovering." << dendl; PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + h->cache_dont_need = false; if (is_missing_object(soid)) { recover_missing(soid, v, cct->_conf->osd_client_op_priority, h); } else { @@ -464,6 +465,7 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef } } PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); + h->cache_dont_need = false; prep_object_replica_pushes(soid, v, h); pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority); }