return b.release_extents(empty, logical, r);
}
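+// Try to reuse this blob for writing *length0 bytes at blob offset b_offset.
+// The blob may be extended up to target_blob_size and *length0 may be trimmed
+// so that the write still fits. Returns false if the blob is not mutable, the
+// write is not aligned to the csum chunk size, or the overlapped part of the
+// blob is already allocated.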
+bool BlueStore::Blob::try_reuse_blob(uint32_t min_alloc_size,
+                                     uint32_t target_blob_size,
+                                     uint32_t b_offset,
+                                     uint32_t *length0) {
+  assert(min_alloc_size);
+  assert(target_blob_size);
+  if (!get_blob().is_mutable()) {
+    return false;
+  }
+
+  uint32_t length = *length0;
+  uint32_t end = b_offset + length;
+
+  // For the sake of simplicity we currently skip blob reuse if the data is
+  // not aligned with the csum chunk. We could add padding later if needed.
+  if (get_blob().has_csum() &&
+      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
+       (end % get_blob().get_csum_chunk_size()) != 0)) {
+    return false;
+  }
+
+  auto blen = get_blob().get_logical_length();
+  uint32_t new_blen = blen;
+
+  // make sure target_blob_size isn't less than the current blob length
+  target_blob_size = MAX(blen, target_blob_size);
+
+  if (b_offset >= blen) {
+    // new data lies entirely beyond the existing blob
+    new_blen = b_offset + length;
+  } else {
+    // new data overlaps with the existing blob
+    new_blen = MAX(blen, length + b_offset);
+    if (!get_blob().is_unallocated(
+          b_offset,
+          new_blen > blen ? blen - b_offset : length)) {
+      return false;
+    }
+  }
+  if (new_blen > blen) {
+    int64_t overflow = int64_t(new_blen) - target_blob_size;
+    // unable to shrink the provided length enough to fit into target_blob_size
+    if (overflow >= length) {
+      return false;
+    }
+
+    if (overflow > 0) {
+      new_blen -= overflow;
+      length -= overflow;
+      *length0 = length;
+    }
+    if (new_blen > blen) {
+      dirty_blob().add_tail(new_blen);
+      used_in_blob.add_tail(new_blen,
+                            blob.get_release_size(min_alloc_size));
+    }
+  }
+  return true;
+}
+
void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
auto cct = coll->store->cct; //used by dout
}
  // We need to have completely initialized Blob to increment its ref counters.
-  // But that's not true for newly created blob and we defer the increment until
-  // blob is ready in _do_alloc_write. See Blob::get_ref BlueStore::_do_alloc_write
-  // implementations for more details.
-  if (b->get_blob().get_logical_length() != 0) {
-    b->get_ref(onode->c, blob_offset, length);
-  }
+  assert(b->get_blob().get_logical_length() != 0);
+  b->get_ref(onode->c, blob_offset, length);
+
  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
}
// =======================================================
-
+// WriteContext
+
+/// Checks for writes to the same pextent within a blob
+bool BlueStore::WriteContext::has_conflict(
+ BlobRef b,
+ uint64_t loffs,
+ uint64_t loffs_end,
+ uint64_t min_alloc_size)
+{
+ assert((loffs % min_alloc_size) == 0);
+ assert((loffs_end % min_alloc_size) == 0);
+ for (auto w : writes) {
+ if (b == w.b) {
+ auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
+      auto loffs2_end =
+        ROUND_UP_TO(w.logical_offset + w.length0, min_alloc_size);
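+      // both ranges are expanded to min_alloc_size boundaries here, so any
+      // overlap means the two writes would touch the same allocation
+      // unit(s) of this blob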
+ if ((loffs <= loffs2 && loffs_end > loffs2) ||
+ (loffs >= loffs2 && loffs < loffs2_end)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// =======================================================
+
// Collection
#undef dout_prefix
bufferlist bl;
blp.copy(length, bl);
- // look for an existing mutable blob we can use
+ // Look for an existing mutable blob we can use.
+  // NB: the current approach prevents us from reusing a blob that could be
+  // extended but has all of its extents prior to the offset. We don't handle
+  // that case for now.
auto ep = o->extent_map.seek_lextent(offset);
if (ep != o->extent_map.extent_map.begin()) {
--ep;
}
}
BlobRef b;
+ auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
while (ep != o->extent_map.extent_map.end()) {
if (ep->blob_start() >= end) {
break;
logger->inc(l_bluestore_write_small_deferred);
return;
}
+ uint32_t alloc_len = min_alloc_size;
+ auto offset0 = P2ALIGN(offset, alloc_len);
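+    // if no head/tail read-modify-write is needed, see whether the existing
+    // blob can absorb a whole allocation unit starting at offset0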
+ if (!head_read && !tail_read &&
+ b->try_reuse_blob(min_alloc_size,
+ max_bsize,
+ offset0 - bstart,
+ &alloc_len)) {
+      assert(alloc_len == min_alloc_size); // expecting the data to always
+                                           // fit into the reused blob
+      // Need to check for pending writes that want to reuse the same pextent.
+      // The rationale is that during GC two chunks from garbage (compressed?)
+      // blobs can share logical space within the same AU. That in turn might
+      // be caused by an unaligned length in clone_range2. Hence the second
+      // write would fail when attempting to reuse the blob in
+      // _do_alloc_write().
+ if (!wctx->has_conflict(b,
+ offset0,
+ offset0 + alloc_len,
+ min_alloc_size)) {
+ uint64_t b_off = offset - bstart;
+ uint64_t b_off0 = b_off - head_pad;
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (" << b_off0 << "~" << padded.length() << ")"
+ << " (" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+
+ o->extent_map.punch_hole(offset, length, &wctx->old_extents);
+ wctx->write(offset, b, alloc_len, b_off0, padded, b_off, length, false, false);
+ logger->inc(l_bluestore_write_small_unused);
+ return;
+ }
+ }
+
++ep;
}
uint64_t b_off = P2PHASE(offset, alloc_len);
uint64_t b_off0 = b_off;
_pad_zeros(&bl, &b_off0, block_size);
- _buffer_cache_write(txc, b, b_off0, bl,
- wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
- Extent *le = o->extent_map.set_lextent(offset, b_off,
- length, b, &wctx->old_extents);
- txc->statfs_delta.stored() += le->length;
- dout(20) << __func__ << " lex " << *le << dendl;
- wctx->write(b, alloc_len, b_off0, bl, b_off, length, true);
+ o->extent_map.punch_hole(offset, length, &wctx->old_extents);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
logger->inc(l_bluestore_write_small_new);
+
return;
}
<< dendl;
logger->inc(l_bluestore_write_big);
logger->inc(l_bluestore_write_big_bytes, length);
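+  // drop any existing lextents in the target range up front; the new
+  // lextents are recorded later in _do_alloc_write() once the blobs are
+  // fully initialized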
+ o->extent_map.punch_hole(offset, length, &wctx->old_extents);
+ auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
while (length > 0) {
- BlobRef b = c->new_blob();
- auto l = MIN(wctx->target_blob_size, length);
+ bool new_blob = false;
+ uint32_t l = MIN(max_bsize, length);
+ BlobRef b;
+ uint32_t b_off = 0;
+
+    // attempt to reuse an existing blob
+ if (!wctx->compress) {
+ // look for an existing mutable blob we can reuse
+ auto begin = o->extent_map.extent_map.begin();
+ auto end = o->extent_map.extent_map.end();
+ auto ep = o->extent_map.seek_lextent(offset);
+ auto prev_ep = ep;
+ if (prev_ep != begin) {
+ --prev_ep;
+ } else {
+ prev_ep = end; // to avoid this extent check as it's a duplicate
+ }
+ auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
+      // search for a suitable extent in both the forward and reverse
+      // directions within the [offset - target_max_blob_size,
+      // offset + target_max_blob_size] range, then check whether its blob
+      // can be reused via try_reuse_blob().
+ bool any_change;
+ do {
+ any_change = false;
+ if (ep != end && ep->logical_offset < offset + max_bsize) {
+ if (offset >= ep->blob_start() &&
+ ep->blob->try_reuse_blob(min_alloc_size, max_bsize,
+ offset - ep->blob_start(),
+ &l)) {
+ b = ep->blob;
+ b_off = offset - ep->blob_start();
+ prev_ep = end; // to avoid check below
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (" << b_off << "~" << l << ")" << std::dec << dendl;
+ } else {
+ ++ep;
+ any_change = true;
+ }
+ }
+
+ if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+ if (prev_ep->blob->try_reuse_blob(min_alloc_size, max_bsize,
+ offset - prev_ep->blob_start(),
+ &l)) {
+ b = prev_ep->blob;
+ b_off = offset - prev_ep->blob_start();
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (" << b_off << "~" << l << ")" << std::dec << dendl;
+ } else if (prev_ep != begin) {
+ --prev_ep;
+ any_change = true;
+ } else {
+ prev_ep = end; // to avoid useless first extent re-check
+ }
+ }
+ } while (b == nullptr && any_change);
+ }
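+    // no reusable blob was found (or compression is enabled) - allocate
+    // a fresh one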
+ if (b == nullptr) {
+ b = c->new_blob();
+ b_off = 0;
+ new_blob = true;
+ }
+
bufferlist t;
blp.copy(l, t);
- _buffer_cache_write(txc, b, 0, t, wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
- wctx->write(b, l, 0, t, 0, l, false);
- Extent *le = o->extent_map.set_lextent(offset, 0, l,
- b, &wctx->old_extents);
- txc->statfs_delta.stored() += l;
- dout(20) << __func__ << " lex " << *le << dendl;
+ wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
offset += l;
length -= l;
logger->inc(l_bluestore_write_big_blobs);
int BlueStore::_do_alloc_write(
TransContext *txc,
CollectionRef coll,
- OnodeRef& o,
+ OnodeRef o,
WriteContext *wctx)
{
dout(20) << __func__ << " txc " << txc
<< dendl;
uint64_t need = 0;
+ auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
for (auto &wi : wctx->writes) {
need += wi.blob_length;
}
logger->tinc(l_bluestore_compress_lat,
ceph_clock_now() - start);
}
- if (!compressed) {
+ if (!compressed && wi.new_blob) {
+      // only initialize newly created blobs
+ assert(!dblob.has_flag(bluestore_blob_t::FLAG_MUTABLE));
dblob.set_flag(bluestore_blob_t::FLAG_MUTABLE);
+
if (l->length() != wi.blob_length) {
- // hrm, maybe we could do better here, but let's not bother.
- dout(20) << __func__ << " forcing csum_order to block_size_order "
- << block_size_order << dendl;
- csum_order = block_size_order;
+ // hrm, maybe we could do better here, but let's not bother.
+ dout(20) << __func__ << " forcing csum_order to block_size_order "
+ << block_size_order << dendl;
+ csum_order = block_size_order;
} else {
- assert(b_off == 0);
- csum_order = std::min(wctx->csum_order, ctz(l->length()));
+ csum_order = std::min(wctx->csum_order, ctz(l->length()));
+ }
+      // try to align the blob with max_blob_size to improve its reuse
+      // ratio, e.g. in the case of reverse writes
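+      // e.g. with max_bsize 0x10000, a write into a fresh blob at logical
+      // offset 0x6000 is placed at blob offset 0x6000 rather than 0, so a
+      // later write at 0x0~0x6000 may reuse the head of the same blob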
+ uint32_t suggested_boff =
+ (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
+ if ((suggested_boff % (1 << csum_order)) == 0 &&
+ suggested_boff + final_length <= max_bsize &&
+ suggested_boff > b_off) {
+ dout(20) << __func__ << " forcing blob_offset to "
+ << std::hex << suggested_boff << std::dec << dendl;
+ assert(suggested_boff >= b_off);
+ csum_length += suggested_boff - b_off;
+ b_off = suggested_boff;
}
}
txc->allocated.insert(e.offset, e.length);
hint = p.end();
}
- dblob.allocated(extents);
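+    // for a reused blob the newly allocated pextents start at the
+    // (AU-aligned) write offset within the blob rather than at offset 0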
+ dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
dout(20) << __func__ << " blob " << *b
<< " csum_type " << Checksummer::get_csum_type_string(csum)
<< dendl;
if (csum != Checksummer::CSUM_NONE) {
- dblob.init_csum(csum, csum_order, csum_length);
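+      // a reused blob already carries csum metadata (extended by add_tail
+      // if the blob grew); only initialize it for brand new blobs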
+ if (!dblob.has_csum()) {
+ dblob.init_csum(csum, csum_order, csum_length);
+ }
dblob.calc_csum(b_off, *l);
}
if (wi.mark_unused) {
- auto b_off = wi.b_off;
auto b_end = b_off + wi.bl.length();
if (b_off) {
dblob.add_unused(0, b_off);
dblob.add_unused(b_end, wi.blob_length - b_end);
}
}
-
- // Here we reattempt get_ref call deferred at set_lextent for newly created
- // blobs. This is required since blob has logical length established at this
- // moment only. And the latter is required to initialize blob's reference
- // counting machinery.
- assert(!b->is_referenced());
- b->get_ref(coll.get(), wi.b_off0, wi.length0);
-
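+    // the blob now has its final logical length and csum settings, so it is
+    // safe to record the lextent (which takes a ref on the blob) and account
+    // the stored bytes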
+ Extent *le = o->extent_map.set_lextent(wi.logical_offset,
+ b_off + (wi.b_off0 - wi.b_off),
+ wi.length0,
+ wi.b,
+ nullptr);
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ _buffer_cache_write(txc, wi.b, b_off, wi.bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
// queue io
if (!g_conf->bluestore_debug_omit_block_device_write) {
if (l->length() <= prefer_deferred_size) {
bluestore_pextent_t() : AllocExtent() {}
bluestore_pextent_t(uint64_t o, uint64_t l) : AllocExtent(o, l) {}
- bluestore_pextent_t(AllocExtent &ext) : AllocExtent(ext.offset, ext.length) { }
+ bluestore_pextent_t(const AllocExtent &ext) :
+ AllocExtent(ext.offset, ext.length) { }
+ bluestore_pextent_t& operator=(const AllocExtent &ext) {
+ offset = ext.offset;
+ length = ext.length;
+ return *this;
+ }
bool is_valid() const {
return offset != INVALID_OFFSET;
}
assert(_num_au <= num_au);
if (_num_au) {
num_au = _num_au; // bytes_per_au array is left unmodified
} else {
clear();
}
}
}
-
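+  /// extend the tracker out to new_len bytes: previously accounted bytes are
+  /// preserved and any newly appended AUs start with zero referenced bytes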
+ void add_tail(uint32_t new_len, uint32_t _au_size) {
+ auto full_size = au_size * (num_au ? num_au : 1);
+ assert(new_len >= full_size);
+ if (new_len == full_size) {
+ return;
+ }
+ if (!num_au) {
+ uint32_t old_total = total_bytes;
+ total_bytes = 0;
+ init(new_len, _au_size);
+ assert(num_au);
+ bytes_per_au[0] = old_total;
+ } else {
+ assert(_au_size == au_size);
+ new_len = ROUND_UP_TO(new_len, au_size);
+ uint32_t _num_au = new_len / au_size;
+ assert(_num_au >= num_au);
+ if (_num_au > num_au) {
+ auto old_bytes = bytes_per_au;
+ auto old_num_au = num_au;
+ num_au = _num_au;
+ allocate();
+ for (size_t i = 0; i < old_num_au; i++) {
+ bytes_per_au[i] = old_bytes[i];
+ }
+ for (size_t i = old_num_au; i < num_au; i++) {
+ bytes_per_au[i] = 0;
+ }
+ delete[] old_bytes;
+ }
+ }
+ }
+
void init(
uint32_t full_length,
uint32_t _au_size);
assert(0 == "we should not get here");
}
+ /// return true if the entire range is unallocated
+ /// (not mapped to extents on disk)
+ bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
+ auto p = extents.begin();
+ assert(p != extents.end());
+ while (b_off >= p->length) {
+ b_off -= p->length;
+ ++p;
+ assert(p != extents.end());
+ }
+ b_len += b_off;
+ while (b_len) {
+ assert(p != extents.end());
+ if (p->is_valid()) {
+ return false;
+ }
+ if (p->length >= b_len) {
+ return true;
+ }
+ b_len -= p->length;
+ ++p;
+ }
+ assert(0 == "we should not get here");
+ }
+
/// return true if the logical range has never been used
bool is_unused(uint64_t offset, uint64_t length) const {
if (!has_unused()) {
get_csum_value_size());
}
}
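+  /// extend the blob's logical length to new_len by appending an unallocated
+  /// (invalid offset) pextent; csum data, if present, is grown and the new
+  /// tail is zero-filled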
+ void add_tail(uint32_t new_len) {
+ assert(is_mutable());
+ assert(new_len > logical_length);
+ extents.emplace_back(
+ bluestore_pextent_t(
+ bluestore_pextent_t::INVALID_OFFSET,
+ new_len - logical_length));
+ logical_length = new_len;
+ if (has_csum()) {
+ bufferptr t;
+ t.swap(csum_data);
+      csum_data = buffer::create(
+        get_csum_value_size() * logical_length / get_csum_chunk_size());
+      csum_data.copy_in(0, t.length(), t.c_str());
+      csum_data.zero(t.length(), csum_data.length() - t.length());
+ }
+ }
uint32_t get_release_size(uint32_t min_alloc_size) const {
if (is_compressed()) {
return get_logical_length();
}
void split(uint32_t blob_offset, bluestore_blob_t& rb);
- void allocated(const AllocExtentVector& allocs);
+ void allocated(uint32_t b_off, uint32_t length, const AllocExtentVector& allocs);
void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only
/// updates blob's pextents container and return unused pextents eligible