g_conf->get_val<bool>("osd_skip_data_digest");
PGTransaction* t = ctx->op_t.get();
+ if (!oi.has_extents() &&
+ get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
+ assert(oi.extents.empty());
+ // note that this is ok because:
+ // 1. for reads, this should have no effect
+ // 2. for writes, we check if this is a pre-mimic created object
+ // (with FLAG_EXTENTS off). And if it is, we set FLAG_EXTENTS
+ // and initialize extents with a whole entry - [0, oi.size) only
+ // to make sure we have oi.extents.size() == oi.size at the very
+ // beginning, which is necessary for backward compatibility.
+ oi.set_flag(object_info_t::FLAG_EXTENTS);
+ if (oi.size) {
+ oi.extents.insert(0, oi.size);
+ }
+ }
dout(10) << "do_osd_op " << soid << " " << ops << dendl;
oi.truncate_seq = op.extent.truncate_seq;
oi.truncate_size = op.extent.truncate_size;
if (op.extent.truncate_size != oi.size) {
- ctx->delta_stats.num_bytes -= oi.size;
- ctx->delta_stats.num_bytes += op.extent.truncate_size;
- oi.size = op.extent.truncate_size;
+ truncate_update_size_and_usage(ctx->delta_stats,
+ oi,
+ op.extent.truncate_size);
}
} else {
dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
ctx->modified_ranges.union_of(ch);
ctx->delta_stats.num_wr++;
oi.clear_data_digest();
+ if (oi.has_extents()) {
+ int64_t old_bytes = oi.extents.size();
+ interval_set<uint64_t> to_remove;
+ to_remove.subset_of(oi.extents, op.extent.offset,
+ op.extent.offset + op.extent.length);
+ oi.extents.subtract(to_remove);
+ int64_t new_bytes = oi.extents.size();
+ ctx->delta_stats.num_bytes += new_bytes - old_bytes;
+ }
} else {
// no-op
}
ctx->modified_ranges.union_of(trim);
}
if (op.extent.offset != oi.size) {
- ctx->delta_stats.num_bytes -= oi.size;
- ctx->delta_stats.num_bytes += op.extent.offset;
- oi.size = op.extent.offset;
+ truncate_update_size_and_usage(ctx->delta_stats,
+ oi,
+ op.extent.offset);
}
ctx->delta_stats.num_wr++;
// do no set exists, or we will break above DELETE -> TRUNCATE munging.
obs.oi.clear_omap_digest();
obs.oi.clear_flag(object_info_t::FLAG_OMAP);
}
- ctx->delta_stats.num_bytes -= oi.size;
+ if (oi.has_extents()) {
+ ctx->delta_stats.num_bytes -= oi.extents.size();
+ oi.extents.clear();
+ } else {
+ ctx->delta_stats.num_bytes -= oi.size;
+ }
oi.size = 0;
oi.new_object();
oi.user_version = target_version;
assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
} else {
- ctx->delta_stats.num_bytes -= oi.size;
+ if (oi.has_extents()) {
+ ctx->delta_stats.num_bytes -= oi.extents.size();
+ oi.extents.clear();
+ } else {
+ ctx->delta_stats.num_bytes -= oi.size;
+ }
}
oi.size = 0;
oi.new_object();
// Adjust the cached objectcontext
maybe_create_new_object(ctx, true);
- ctx->delta_stats.num_bytes -= obs.oi.size;
- ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
+ if (obs.oi.has_extents()) {
+ ctx->delta_stats.num_bytes -= obs.oi.extents.size();
+ obs.oi.extents.clear();
+ } else {
+ ctx->delta_stats.num_bytes -= obs.oi.size;
+ }
+ if (rollback_to->obs.oi.has_extents()) {
+ ctx->delta_stats.num_bytes += rollback_to->obs.oi.extents.size();
+ // transfer extents map too
+ assert(obs.oi.has_extents());
+ obs.oi.extents = rollback_to->obs.oi.extents;
+ } else {
+ ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
+ if (obs.oi.has_extents() && rollback_to->obs.oi.size) {
+ obs.oi.extents.insert(0, rollback_to->obs.oi.size);
+ }
+ }
obs.oi.size = rollback_to->obs.oi.size;
if (rollback_to->obs.oi.is_data_digest())
obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
modified.union_of(ch);
if (write_full || offset + length > oi.size) {
uint64_t new_size = offset + length;
- delta_stats.num_bytes -= oi.size;
- delta_stats.num_bytes += new_size;
+ if (!oi.has_extents()) {
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += new_size;
+ }
oi.size = new_size;
}
+ if (length && oi.has_extents()) {
+ // count newly written bytes, excluding overwrites
+ interval_set<uint64_t> ne;
+ ne.insert(offset, length);
+ interval_set<uint64_t> overlap;
+ overlap.intersection_of(ne, oi.extents);
+ ne.subtract(overlap);
+ oi.extents.union_of(ne);
+ delta_stats.num_bytes += ne.size();
+ }
delta_stats.num_wr++;
delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
}
+void PrimaryLogPG::truncate_update_size_and_usage(  // set oi.size to truncate_size, adjusting delta_stats.num_bytes (and oi.extents when the object has them) to match
+ object_stat_sum_t& delta_stats,  // stats delta; only num_bytes is modified here
+ object_info_t& oi,  // object info updated in place: size, plus extents if has_extents()
+ uint64_t truncate_size)  // new object size
+{
+ if (oi.size == truncate_size) {
+ // no change: size already matches, so stats and extents are left untouched
+ return;
+ }
+ if (oi.has_extents()) {
+ int64_t old_bytes = oi.extents.size();  // bytes covered by the extents map before truncating
+ if (truncate_size > oi.size) {
+ // trunc up: add the range starting at the old size, of length (truncate_size - oi.size), to the extents map
+ oi.extents.insert(oi.size, truncate_size - oi.size);
+ } else {
+ // trunc down: replace the extents map with its subset between 0 and truncate_size
+ interval_set<uint64_t> new_extents;
+ new_extents.subset_of(oi.extents, 0, truncate_size);
+ oi.extents.swap(new_extents);
+ }
+ int64_t new_bytes = oi.extents.size();
+ delta_stats.num_bytes += new_bytes - old_bytes;  // signed difference, so a shrink lowers num_bytes
+ } else {
+ // object has no extents map: fall back to the old size-based accounting
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += truncate_size;
+ }
+ oi.size = truncate_size;
+}
+
void PrimaryLogPG::complete_disconnect_watches(
ObjectContextRef obc,
const list<watch_disconnect_t> &to_disconnect)
// but it works...
pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
dout(20) << " got reqids" << dendl;
+ if (oi.has_extents()) {
+ // note that we might call this multiple times
+ // include extents only in the final step to make extents.insert happy
+ reply_obj.flags |= object_copy_data_t::FLAG_EXTENTS;
+ reply_obj.extents = oi.extents;
+ }
}
dout(20) << " cursor.is_complete=" << cursor.is_complete()
&cop->results.reqids,
&cop->results.truncate_seq,
&cop->results.truncate_size,
+ &cop->results.extents,
&cop->rval);
op.set_last_op_flags(cop->src_obj_fadvise_flags);
ch.insert(0, obs.oi.size);
ctx->modified_ranges.union_of(ch);
- if (cb->get_data_size() != obs.oi.size) {
- ctx->delta_stats.num_bytes -= obs.oi.size;
- obs.oi.size = cb->get_data_size();
- ctx->delta_stats.num_bytes += obs.oi.size;
- }
+ ctx->delta_stats.num_bytes -= obs.oi.has_extents() ?
+ obs.oi.extents.size() : obs.oi.size;
+ obs.oi.clear_flag(object_info_t::FLAG_EXTENTS);
+ obs.oi.extents.clear();
+ obs.oi.size = cb->get_data_size();
+ ctx->delta_stats.num_bytes += obs.oi.size;
ctx->delta_stats.num_wr++;
ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
tctx->new_obs.oi.set_omap_digest(results->omap_digest);
tctx->new_obs.oi.truncate_seq = results->truncate_seq;
tctx->new_obs.oi.truncate_size = results->truncate_size;
+ if (results->has_extents()) {
+ tctx->new_obs.oi.set_flag(object_info_t::FLAG_EXTENTS);
+ tctx->new_obs.oi.extents = results->extents;
+ }
if (soid.snap != CEPH_NOSNAP) {
assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
} else {
- tctx->delta_stats.num_bytes += results->object_size;
+ tctx->delta_stats.num_bytes += results->has_extents() ?
+ results->extents.size() : results->object_size;
}
}
assert(!oi.soid.is_snapdir());
object_stat_sum_t stat;
- stat.num_bytes += oi.size;
+ stat.num_bytes += oi.has_extents() ?
+ oi.extents.size() : oi.size;
stat.num_objects++;
if (oi.is_dirty())
stat.num_objects_dirty++;
ctx->delta_stats.num_objects++;
ctx->delta_stats.num_objects_hit_set_archive++;
+ // we do not use extents for usage tracking
+ // of hit_set_archive objects, for now!
+ assert(!obc->obs.oi.has_extents());
ctx->delta_stats.num_bytes += bl.length();
ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
assert(obc);
--ctx->delta_stats.num_objects;
--ctx->delta_stats.num_objects_hit_set_archive;
+ assert(!obc->obs.oi.has_extents());
ctx->delta_stats.num_bytes -= obc->obs.oi.size;
ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
}
// A clone num_bytes will be added later when we have snapset
if (!soid.is_snap()) {
- stat.num_bytes += oi->size;
+ stat.num_bytes += oi->has_extents() ?
+ oi->extents.size() : oi->size;
}
if (soid.nspace == cct->_conf->osd_hit_set_namespace)
stat.num_bytes_hit_set_archive += oi->size;
void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
{
- ENCODE_START(7, 5, bl);
+ ENCODE_START(8, 5, bl);
::encode(size, bl);
::encode(mtime, bl);
::encode(attrs, bl);
::encode(reqids, bl);
::encode(truncate_seq, bl);
::encode(truncate_size, bl);
+ ::encode(extents, bl);
ENCODE_FINISH(bl);
}
void object_copy_data_t::decode(bufferlist::iterator& bl)
{
- DECODE_START(7, bl);
+ DECODE_START(8, bl);
if (struct_v < 5) {
// old
::decode(size, bl);
::decode(truncate_seq, bl);
::decode(truncate_size, bl);
}
+ if (struct_v >= 8) {
+ ::decode(extents, bl);
+ }
}
DECODE_FINISH(bl);
}
o.back()->omap_header.append("this is an omap header");
o.back()->snaps.push_back(123);
o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
+ o.back()->extents.insert(0, 123);
}
void object_copy_data_t::dump(Formatter *f) const
++i) {
old_watchers.insert(make_pair(i->first.second, i->second));
}
- ENCODE_START(17, 8, bl);
+ ENCODE_START(18, 8, bl);
::encode(soid, bl);
::encode(myoloc, bl); //Retained for compatibility
::encode((__u32)0, bl); // was category, no longer used
if (has_manifest()) {
::encode(manifest, bl);
}
+ ::encode(extents, bl);
ENCODE_FINISH(bl);
}
void object_info_t::decode(bufferlist::iterator& bl)
{
object_locator_t myoloc;
- DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(18, 8, 8, bl);
map<entity_name_t, watch_info_t> old_watchers;
::decode(soid, bl);
::decode(myoloc, bl);
::decode(manifest, bl);
}
}
+ if (struct_v >= 18) {
+ ::decode(extents, bl);
+ }
DECODE_FINISH(bl);
}
f->dump_unsigned("expected_write_size", expected_write_size);
f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
f->dump_object("manifest", manifest);
+ f->open_array_section("extents");
+ for (interval_set<uint64_t>::const_iterator p = extents.begin();
+ p != extents.end(); ++p) {
+ f->open_object_section("extent");
+ f->dump_unsigned("offset", p.get_start());
+ f->dump_unsigned("length", p.get_len());
+ f->close_section();
+ }
+ f->close_section();
f->open_object_section("watchers");
for (map<pair<uint64_t, entity_name_t>,watch_info_t>::const_iterator p =
watchers.begin(); p != watchers.end(); ++p) {
enum {
FLAG_DATA_DIGEST = 1<<0,
FLAG_OMAP_DIGEST = 1<<1,
+ FLAG_EXTENTS = 1<<2,
};
object_copy_cursor_t cursor;
uint64_t size;
uint64_t truncate_seq;
uint64_t truncate_size;
+ /// object logical extents map
+ interval_set<uint64_t> extents;
+
public:
object_copy_data_t() :
size((uint64_t)-1), data_digest(-1),
// note: these are currently encoded into a total 16 bits; see
// encode()/decode() for the weirdness.
typedef enum {
- FLAG_LOST = 1<<0,
- FLAG_WHITEOUT = 1<<1, // object logically does not exist
- FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
- FLAG_OMAP = 1 << 3, // has (or may have) some/any omap data
- FLAG_DATA_DIGEST = 1 << 4, // has data crc
- FLAG_OMAP_DIGEST = 1 << 5, // has omap crc
- FLAG_CACHE_PIN = 1 << 6, // pin the object in cache tier
- FLAG_MANIFEST = 1 << 7, // has manifest
- // ...
- FLAG_USES_TMAP = 1<<8, // deprecated; no longer used.
+ FLAG_LOST = 1<<0,
+ FLAG_WHITEOUT = 1<<1, // object logically does not exist
+ FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
+ FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
+ FLAG_DATA_DIGEST = 1<<4, // has data crc
+ FLAG_OMAP_DIGEST = 1<<5, // has omap crc
+ FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
+ FLAG_MANIFEST = 1<<7, // has manifest
+ FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
+ FLAG_EXTENTS = 1<<9, // logical extents map is valid
} flag_t;
flag_t flags;
s += "|cache_pin";
if (flags & FLAG_MANIFEST)
s += "|manifest";
+ if (flags & FLAG_EXTENTS)
+ s += "|extents";
if (s.length())
return s.substr(1);
return s;
uint32_t alloc_hint_flags;
struct object_manifest_t manifest;
+ interval_set<uint64_t> extents; // deduplicated logical extents map
void copy_user_bits(const object_info_t& other);
bool has_manifest() const {
return test_flag(FLAG_MANIFEST);
}
-
+ bool has_extents() const {  // true when test_flag(FLAG_EXTENTS) holds for this object info
+ return test_flag(FLAG_EXTENTS);
+ }
void set_data_digest(__u32 d) {
set_flag(FLAG_DATA_DIGEST);
data_digest = d;