From: David Zafman Date: Wed, 17 Oct 2018 19:31:59 +0000 (-0700) Subject: osd: Deny reservation if expected backfill size would put us over backfill_full_ratio... X-Git-Tag: v14.1.0~344^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0474498684d089559d479f62ae16db53c1604652;p=ceph.git osd: Deny reservation if expected backfill size would put us over backfill_full_ratio (EC) Erasure Coded Pools Fixes: http://tracker.ceph.com/issues/19753 Signed-off-by: David Zafman --- diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index bc9c4c460863..1e668376c806 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -329,6 +329,15 @@ void ECBackend::handle_recovery_push( ceph_assert(op.data.length() == 0); } + if (get_parent()->pg_is_remote_backfilling()) { + get_parent()->pg_add_local_num_bytes(op.data.length()); + get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count()); + dout(10) << __func__ << " " << op.soid + << " add new actual data by " << op.data.length() + << " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count() + << dendl; + } + if (op.before_progress.first) { ceph_assert(op.attrset.count(string("_"))); m->t.setattrs( @@ -365,6 +374,20 @@ void ECBackend::handle_recovery_push( ObjectContextRef(), false, &m->t); + if (get_parent()->pg_is_remote_backfilling()) { + struct stat st; + int r = store->stat(ch, ghobject_t(op.soid, ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard), &st); + if (r == 0) { + get_parent()->pg_sub_local_num_bytes(st.st_size); + // XXX: This can be way overestimated for small objects + get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count()); + dout(10) << __func__ << " " << op.soid + << " sub actual data by " << st.st_size + << " sub num_bytes by " << st.st_size * get_ec_data_chunk_count() + << dendl; + } + } } } m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp()); diff --git a/src/osd/ECBackend.h 
b/src/osd/ECBackend.h index d8d4a71d664b..6784925df9ca 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -601,6 +601,13 @@ public: return new ECRecPred(ec_impl); } + int get_ec_data_chunk_count() const override { + return ec_impl->get_data_chunk_count(); + } + int get_ec_stripe_chunk_size() const override { + return sinfo.get_chunk_size(); + } + /** * ECReadPred * diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 170d6ae16f27..102a7598a48d 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -7688,6 +7688,14 @@ PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt) int64_t primary_num_bytes = evt.primary_num_bytes; int64_t local_num_bytes = evt.local_num_bytes; if (primary_num_bytes) { + // For erasure coded pools, overestimate by a full stripe per object + // because we don't know how each object was rounded to the nearest stripe + if (pg->pool.info.is_erasure()) { + primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count(); + primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects; + local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count(); + local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects; + } pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes); ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB" << " local " << (local_num_bytes >> 10) << "KiB" diff --git a/src/osd/PG.h b/src/osd/PG.h index c8df5c1e922f..e631860d90ae 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1280,10 +1280,18 @@ public: int64_t get_stats_num_bytes() { Mutex::Locker l(_lock); int num_bytes = info.stats.stats.sum.num_bytes; + if (pool.info.is_erasure()) { + num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count(); + // Round up each object by a stripe + num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects; + } int64_t 
lnb = local_num_bytes.load(); if (lnb && lnb != num_bytes) { lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch " - << lnb << " vs stats " << num_bytes << dendl; + << lnb << " vs stats " + << info.stats.stats.sum.num_bytes << " / chunk " + << get_pgbackend()->get_ec_data_chunk_count() + << dendl; } return num_bytes; } diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 58dd6193529a..837e2cce10f3 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -412,6 +412,7 @@ typedef std::shared_ptr OSDMapRef; virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0; virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0; virtual int get_ec_data_chunk_count() const { return 0; }; + virtual int get_ec_stripe_chunk_size() const { return 0; }; virtual void dump_recovery_info(Formatter *f) const = 0; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 4b15e7e44d90..08a2616b3748 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -4314,10 +4314,29 @@ void PrimaryLogPG::do_backfill_remove(OpRequestRef op) pg_whoami.shard) , &st); if (r == 0) { sub_local_num_bytes(st.st_size); - int chunks = 1; - sub_num_bytes(st.st_size * chunks); + int64_t usersize; + if (pool.info.is_erasure()) { + bufferlist bv; + int r = osd->store->getattr( + ch, + ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + usersize = oi.size * pgbackend->get_ec_data_chunk_count(); + } else { + dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard) + << " can't get object info" << dendl; + usersize = 0; + } + } else { + usersize = st.st_size; + } + sub_num_bytes(usersize); dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard) << " sub actual data by " << st.st_size + << " sub num_bytes by " << usersize << dendl; } }