From 421aee3aea959a526ed102035e0ef29ea427f8d8 Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Thu, 21 Sep 2017 13:44:32 +0800 Subject: [PATCH] osd: fine-grained statistics of logical object space usage To test this change, we create an image of 5GB and do rbd bench write of 100MB: ./bin/rbd create bar -s 5120 && ./bin/rbd bench --io-type write --io-size 32K --io-total 100M --io-pattern rand rbd/bar Below is the test result. Was: GLOBAL: SIZE AVAIL RAW USED %RAW USED 30911M 27052M 3859M 12.49 POOLS: NAME ID USED %USED MAX AVAIL OBJECTS rbd 0 3191M 26.36 8914M 1174 cephfs_data_a 1 0 0 8914M 0 cephfs_metadata_a 2 2246 0 8914M 21 Now: GLOBAL: SIZE AVAIL RAW USED %RAW USED 30911M 27050M 3861M 12.49 POOLS: NAME ID USED %USED MAX AVAIL OBJECTS rbd 0 101216k 1.10 8913M 1178 cephfs_data_a 1 0 0 8913M 0 cephfs_metadata_a 2 892 0 8913M 21 E.g., this change can make "osd pool set-quota max_bytes" work nicely. Signed-off-by: xie xingguo --- src/osd/PrimaryLogPG.cc | 150 ++++++++++++++++++++++++++++++++++------ src/osd/PrimaryLogPG.h | 8 +++ src/osd/osd_types.cc | 26 +++++-- src/osd/osd_types.h | 31 ++++++--- src/osdc/Objecter.h | 9 ++- 5 files changed, 188 insertions(+), 36 deletions(-) diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index d89eef9bad7..6179d63a14c 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -4890,6 +4890,21 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) g_conf->get_val("osd_skip_data_digest"); PGTransaction* t = ctx->op_t.get(); + if (!oi.has_extents() && + get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) { + assert(oi.extents.empty()); + // note that this is ok because: + // 1. for reads, this should have no effect + // 2. for writes, we check if this is a pre-mimic created object + // (with FLAG_EXTENTS off). 
And if it is, we set FLAG_EXTENTS + // and initialize extents with a whole entry - [0, oi.size) only + // to make sure we have oi.extents.size() == oi.size at the very + // beginning, which is necessary for backward compatibility. + oi.set_flag(object_info_t::FLAG_EXTENTS); + if (oi.size) { + oi.extents.insert(0, oi.size); + } + } dout(10) << "do_osd_op " << soid << " " << ops << dendl; @@ -5629,9 +5644,9 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) oi.truncate_seq = op.extent.truncate_seq; oi.truncate_size = op.extent.truncate_size; if (op.extent.truncate_size != oi.size) { - ctx->delta_stats.num_bytes -= oi.size; - ctx->delta_stats.num_bytes += op.extent.truncate_size; - oi.size = op.extent.truncate_size; + truncate_update_size_and_usage(ctx->delta_stats, + oi, + op.extent.truncate_size); } } else { dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq @@ -5741,6 +5756,15 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) ctx->modified_ranges.union_of(ch); ctx->delta_stats.num_wr++; oi.clear_data_digest(); + if (oi.has_extents()) { + int64_t old_bytes = oi.extents.size(); + interval_set to_remove; + to_remove.subset_of(oi.extents, op.extent.offset, + op.extent.offset + op.extent.length); + oi.extents.subtract(to_remove); + int64_t new_bytes = oi.extents.size(); + ctx->delta_stats.num_bytes += new_bytes - old_bytes; + } } else { // no-op } @@ -5819,9 +5843,9 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) ctx->modified_ranges.union_of(trim); } if (op.extent.offset != oi.size) { - ctx->delta_stats.num_bytes -= oi.size; - ctx->delta_stats.num_bytes += op.extent.offset; - oi.size = op.extent.offset; + truncate_update_size_and_usage(ctx->delta_stats, + oi, + op.extent.offset); } ctx->delta_stats.num_wr++; // do no set exists, or we will break above DELETE -> TRUNCATE munging. 
@@ -6016,7 +6040,12 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) obs.oi.clear_omap_digest(); obs.oi.clear_flag(object_info_t::FLAG_OMAP); } - ctx->delta_stats.num_bytes -= oi.size; + if (oi.has_extents()) { + ctx->delta_stats.num_bytes -= oi.extents.size(); + oi.extents.clear(); + } else { + ctx->delta_stats.num_bytes -= oi.size; + } oi.size = 0; oi.new_object(); oi.user_version = target_version; @@ -6723,7 +6752,12 @@ inline int PrimaryLogPG::_delete_oid( assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap)); ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap); } else { - ctx->delta_stats.num_bytes -= oi.size; + if (oi.has_extents()) { + ctx->delta_stats.num_bytes -= oi.extents.size(); + oi.extents.clear(); + } else { + ctx->delta_stats.num_bytes -= oi.size; + } } oi.size = 0; oi.new_object(); @@ -6884,8 +6918,23 @@ int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) // Adjust the cached objectcontext maybe_create_new_object(ctx, true); - ctx->delta_stats.num_bytes -= obs.oi.size; - ctx->delta_stats.num_bytes += rollback_to->obs.oi.size; + if (obs.oi.has_extents()) { + ctx->delta_stats.num_bytes -= obs.oi.extents.size(); + obs.oi.extents.clear(); + } else { + ctx->delta_stats.num_bytes -= obs.oi.size; + } + if (rollback_to->obs.oi.has_extents()) { + ctx->delta_stats.num_bytes += rollback_to->obs.oi.extents.size(); + // transfer extents map too + assert(obs.oi.has_extents()); + obs.oi.extents = rollback_to->obs.oi.extents; + } else { + ctx->delta_stats.num_bytes += rollback_to->obs.oi.size; + if (obs.oi.has_extents() && rollback_to->obs.oi.size) { + obs.oi.extents.insert(0, rollback_to->obs.oi.size); + } + } obs.oi.size = rollback_to->obs.oi.size; if (rollback_to->obs.oi.is_data_digest()) obs.oi.set_data_digest(rollback_to->obs.oi.data_digest); @@ -7095,14 +7144,56 @@ void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, o modified.union_of(ch); if (write_full || offset + 
length > oi.size) { uint64_t new_size = offset + length; - delta_stats.num_bytes -= oi.size; - delta_stats.num_bytes += new_size; + if (!oi.has_extents()) { + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += new_size; + } oi.size = new_size; } + if (length && oi.has_extents()) { + // count newly write bytes, exclude overwrites + interval_set ne; + ne.insert(offset, length); + interval_set overlap; + overlap.intersection_of(ne, oi.extents); + ne.subtract(overlap); + oi.extents.union_of(ne); + delta_stats.num_bytes += ne.size(); + } delta_stats.num_wr++; delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10); } +void PrimaryLogPG::truncate_update_size_and_usage( + object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size) +{ + if (oi.size == truncate_size) { + // no change + return; + } + if (oi.has_extents()) { + int64_t old_bytes = oi.extents.size(); + if (truncate_size > oi.size) { + // trunc up + oi.extents.insert(oi.size, truncate_size - oi.size); + } else { + // trunc down + interval_set new_extents; + new_extents.subset_of(oi.extents, 0, truncate_size); + oi.extents.swap(new_extents); + } + int64_t new_bytes = oi.extents.size(); + delta_stats.num_bytes += new_bytes - old_bytes; + } else { + // fall back to old fashion + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += truncate_size; + } + oi.size = truncate_size; +} + void PrimaryLogPG::complete_disconnect_watches( ObjectContextRef obc, const list &to_disconnect) @@ -7655,6 +7746,12 @@ int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp, // but it works... 
pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids); dout(20) << " got reqids" << dendl; + if (oi.has_extents()) { + // note that we might call this multiple times + // include extents only in the final step to make extents.insert happy + reply_obj.flags |= object_copy_data_t::FLAG_EXTENTS; + reply_obj.extents = oi.extents; + } } dout(20) << " cursor.is_complete=" << cursor.is_complete() @@ -7780,6 +7877,7 @@ void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop) &cop->results.reqids, &cop->results.truncate_seq, &cop->results.truncate_size, + &cop->results.extents, &cop->rval); op.set_last_op_flags(cop->src_obj_fadvise_flags); @@ -8122,11 +8220,12 @@ void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb) ch.insert(0, obs.oi.size); ctx->modified_ranges.union_of(ch); - if (cb->get_data_size() != obs.oi.size) { - ctx->delta_stats.num_bytes -= obs.oi.size; - obs.oi.size = cb->get_data_size(); - ctx->delta_stats.num_bytes += obs.oi.size; - } + ctx->delta_stats.num_bytes -= obs.oi.has_extents() ? 
+ obs.oi.extents.size() : obs.oi.size; + obs.oi.clear_flag(object_info_t::FLAG_EXTENTS); + obs.oi.extents.clear(); + obs.oi.size = cb->get_data_size(); + ctx->delta_stats.num_bytes += obs.oi.size; ctx->delta_stats.num_wr++; ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10); @@ -8290,6 +8389,10 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results, tctx->new_obs.oi.set_omap_digest(results->omap_digest); tctx->new_obs.oi.truncate_seq = results->truncate_seq; tctx->new_obs.oi.truncate_size = results->truncate_size; + if (results->has_extents()) { + tctx->new_obs.oi.set_flag(object_info_t::FLAG_EXTENTS); + tctx->new_obs.oi.extents = results->extents; + } if (soid.snap != CEPH_NOSNAP) { assert(obc->ssc->snapset.clone_snaps.count(soid.snap)); @@ -8300,7 +8403,8 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results, tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap); } else { - tctx->delta_stats.num_bytes += results->object_size; + tctx->delta_stats.num_bytes += results->has_extents() ? + results->extents.size() : results->object_size; } } @@ -9806,7 +9910,8 @@ void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t assert(!oi.soid.is_snapdir()); object_stat_sum_t stat; - stat.num_bytes += oi.size; + stat.num_bytes += oi.has_extents() ? + oi.extents.size() : oi.size; stat.num_objects++; if (oi.is_dirty()) stat.num_objects_dirty++; @@ -12276,6 +12381,9 @@ void PrimaryLogPG::hit_set_persist() ctx->delta_stats.num_objects++; ctx->delta_stats.num_objects_hit_set_archive++; + // we do not use extents for usage tracking + // of hit_set_archive objects, for now! 
+ assert(!obc->obs.oi.has_extents()); ctx->delta_stats.num_bytes += bl.length(); ctx->delta_stats.num_bytes_hit_set_archive += bl.length(); @@ -12341,6 +12449,7 @@ void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max) assert(obc); --ctx->delta_stats.num_objects; --ctx->delta_stats.num_objects_hit_set_archive; + assert(!obc->obs.oi.has_extents()); ctx->delta_stats.num_bytes -= obc->obs.oi.size; ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size; } @@ -13297,7 +13406,8 @@ void PrimaryLogPG::scrub_snapshot_metadata( // A clone num_bytes will be added later when we have snapset if (!soid.is_snap()) { - stat.num_bytes += oi->size; + stat.num_bytes += oi->has_extents() ? + oi->extents.size() : oi->size; } if (soid.nspace == cct->_conf->osd_hit_set_namespace) stat.num_bytes_hit_set_archive += oi->size; diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index 3e9de77ef76..c7988a0c141 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -95,12 +95,16 @@ public: map attrs; // xattrs uint64_t truncate_seq; uint64_t truncate_size; + interval_set extents; // object logical extents map bool is_data_digest() { return flags & object_copy_data_t::FLAG_DATA_DIGEST; } bool is_omap_digest() { return flags & object_copy_data_t::FLAG_OMAP_DIGEST; } + bool has_extents() { + return flags & object_copy_data_t::FLAG_EXTENTS; + } CopyResults() : object_size(0), started_temp_obj(false), user_version(0), @@ -1106,6 +1110,10 @@ protected: void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi, interval_set& modified, uint64_t offset, uint64_t length, bool write_full=false); + void truncate_update_size_and_usage( + object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size); enum class cache_result_t { NOOP, diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 8899cf71bfa..beeb963d985 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -4250,7 +4250,7 @@ void 
object_copy_cursor_t::generate_test_instances(list& void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const { - ENCODE_START(7, 5, bl); + ENCODE_START(8, 5, bl); ::encode(size, bl); ::encode(mtime, bl); ::encode(attrs, bl); @@ -4266,12 +4266,13 @@ void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const ::encode(reqids, bl); ::encode(truncate_seq, bl); ::encode(truncate_size, bl); + ::encode(extents, bl); ENCODE_FINISH(bl); } void object_copy_data_t::decode(bufferlist::iterator& bl) { - DECODE_START(7, bl); + DECODE_START(8, bl); if (struct_v < 5) { // old ::decode(size, bl); @@ -4327,6 +4328,9 @@ void object_copy_data_t::decode(bufferlist::iterator& bl) ::decode(truncate_seq, bl); ::decode(truncate_size, bl); } + if (struct_v >= 8) { + ::decode(extents, bl); + } } DECODE_FINISH(bl); } @@ -4361,6 +4365,7 @@ void object_copy_data_t::generate_test_instances(list& o) o.back()->omap_header.append("this is an omap header"); o.back()->snaps.push_back(123); o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t())); + o.back()->extents.insert(0, 123); } void object_copy_data_t::dump(Formatter *f) const @@ -4900,7 +4905,7 @@ void object_info_t::encode(bufferlist& bl, uint64_t features) const ++i) { old_watchers.insert(make_pair(i->first.second, i->second)); } - ENCODE_START(17, 8, bl); + ENCODE_START(18, 8, bl); ::encode(soid, bl); ::encode(myoloc, bl); //Retained for compatibility ::encode((__u32)0, bl); // was category, no longer used @@ -4934,13 +4939,14 @@ void object_info_t::encode(bufferlist& bl, uint64_t features) const if (has_manifest()) { ::encode(manifest, bl); } + ::encode(extents, bl); ENCODE_FINISH(bl); } void object_info_t::decode(bufferlist::iterator& bl) { object_locator_t myoloc; - DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl); + DECODE_START_LEGACY_COMPAT_LEN(18, 8, 8, bl); map old_watchers; ::decode(soid, bl); ::decode(myoloc, bl); @@ -5028,6 +5034,9 @@ void object_info_t::decode(bufferlist::iterator& bl) 
::decode(manifest, bl); } } + if (struct_v >= 18) { + ::decode(extents, bl); + } DECODE_FINISH(bl); } @@ -5053,6 +5062,15 @@ void object_info_t::dump(Formatter *f) const f->dump_unsigned("expected_write_size", expected_write_size); f->dump_unsigned("alloc_hint_flags", alloc_hint_flags); f->dump_object("manifest", manifest); + f->open_array_section("extents"); + for (interval_set::const_iterator p = extents.begin(); + p != extents.end(); ++p) { + f->open_object_section("extent"); + f->dump_unsigned("offset", p.get_start()); + f->dump_unsigned("length", p.get_len()); + f->close_section(); + } + f->close_section(); f->open_object_section("watchers"); for (map,watch_info_t>::const_iterator p = watchers.begin(); p != watchers.end(); ++p) { diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index e1429921ded..a8fa4f3738e 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -4283,6 +4283,7 @@ struct object_copy_data_t { enum { FLAG_DATA_DIGEST = 1<<0, FLAG_OMAP_DIGEST = 1<<1, + FLAG_EXTENTS = 1<<2, }; object_copy_cursor_t cursor; uint64_t size; @@ -4305,6 +4306,9 @@ struct object_copy_data_t { uint64_t truncate_seq; uint64_t truncate_size; + ///< object logical extents map + interval_set extents; + public: object_copy_data_t() : size((uint64_t)-1), data_digest(-1), @@ -4593,16 +4597,16 @@ struct object_info_t { // note: these are currently encoded into a total 16 bits; see // encode()/decode() for the weirdness. typedef enum { - FLAG_LOST = 1<<0, - FLAG_WHITEOUT = 1<<1, // object logically does not exist - FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied - FLAG_OMAP = 1 << 3, // has (or may have) some/any omap data - FLAG_DATA_DIGEST = 1 << 4, // has data crc - FLAG_OMAP_DIGEST = 1 << 5, // has omap crc - FLAG_CACHE_PIN = 1 << 6, // pin the object in cache tier - FLAG_MANIFEST = 1 << 7, // has manifest - // ... - FLAG_USES_TMAP = 1<<8, // deprecated; no longer used. 
+ FLAG_LOST = 1<<0, + FLAG_WHITEOUT = 1<<1, // object logically does not exist + FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied + FLAG_OMAP = 1<<3, // has (or may have) some/any omap data + FLAG_DATA_DIGEST = 1<<4, // has data crc + FLAG_OMAP_DIGEST = 1<<5, // has omap crc + FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier + FLAG_MANIFEST = 1<<7, // has manifest + FLAG_USES_TMAP = 1<<8, // deprecated; no longer used + FLAG_EXTENTS = 1<<9, // logical extents map is valid } flag_t; flag_t flags; @@ -4627,6 +4631,8 @@ struct object_info_t { s += "|cache_pin"; if (flags & FLAG_MANIFEST) s += "|manifest"; + if (flags & FLAG_EXTENTS) + s += "|extents"; if (s.length()) return s.substr(1); return s; @@ -4648,6 +4654,7 @@ struct object_info_t { uint32_t alloc_hint_flags; struct object_manifest_t manifest; + interval_set extents; // deduplicated logical extents map void copy_user_bits(const object_info_t& other); @@ -4684,7 +4691,9 @@ struct object_info_t { bool has_manifest() const { return test_flag(FLAG_MANIFEST); } - + bool has_extents() const { + return test_flag(FLAG_EXTENTS); + } void set_data_digest(__u32 d) { set_flag(FLAG_DATA_DIGEST); data_digest = d; diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index a32e428e28d..3358d6bb24e 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -730,6 +730,7 @@ struct ObjectOperation { mempool::osd_pglog::vector > *out_reqids; uint64_t *out_truncate_seq; uint64_t *out_truncate_size; + interval_set *out_extents; int *prval; C_ObjectOperation_copyget(object_copy_cursor_t *c, uint64_t *s, @@ -745,6 +746,7 @@ struct ObjectOperation { mempool::osd_pglog::vector > *oreqids, uint64_t *otseq, uint64_t *otsize, + interval_set *otextents, int *r) : cursor(c), out_size(s), out_mtime(m), @@ -754,6 +756,7 @@ struct ObjectOperation { out_reqids(oreqids), out_truncate_seq(otseq), out_truncate_size(otsize), + out_extents(otextents), prval(r) {} void finish(int r) override { // reqids are 
copied on ENOENT @@ -796,6 +799,9 @@ struct ObjectOperation { *out_truncate_seq = copy_reply.truncate_seq; if (out_truncate_size) *out_truncate_size = copy_reply.truncate_size; + if (out_extents) { + *out_extents = copy_reply.extents; + } *cursor = copy_reply.cursor; } catch (buffer::error& e) { if (prval) @@ -820,6 +826,7 @@ struct ObjectOperation { mempool::osd_pglog::vector > *out_reqids, uint64_t *truncate_seq, uint64_t *truncate_size, + interval_set *extents, int *prval) { OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET); osd_op.op.copy_get.max = max; @@ -833,7 +840,7 @@ struct ObjectOperation { out_omap_data, out_snaps, out_snap_seq, out_flags, out_data_digest, out_omap_digest, out_reqids, truncate_seq, - truncate_size, prval); + truncate_size, extents, prval); out_bl[p] = &h->bl; out_handler[p] = h; } -- 2.39.5