}
}
-int BlueStore::_do_write(
- TransContext *txc,
- CollectionRef& c,
- OnodeRef o,
- uint64_t offset,
- uint64_t length,
- bufferlist& bl,
- uint32_t fadvise_flags)
+void BlueStore::_choose_write_options(
+ CollectionRef& c,
+ OnodeRef o,
+ uint32_t fadvise_flags,
+ WriteContext *wctx)
{
- int r = 0;
-
- dout(20) << __func__
- << " " << o->oid
- << " 0x" << std::hex << offset << "~" << length
- << " - have 0x" << o->onode.size
- << " (" << std::dec << o->onode.size << ")"
- << " bytes"
- << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
- << dendl;
- _dump_onode(o);
-
- if (length == 0) {
- return 0;
- }
-
- uint64_t end = offset + length;
- bool was_gc = false;
- GarbageCollector gc(c->store->cct);
- int64_t benefit;
- auto dirty_start = offset;
- auto dirty_end = offset + length;
-
- WriteContext wctx, wctx_gc;
if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
dout(20) << __func__ << " will do buffered write" << dendl;
- wctx.buffered = true;
+ wctx->buffered = true;
} else if (cct->_conf->bluestore_default_buffered_write &&
(fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
dout(20) << __func__ << " defaulting to buffered write" << dendl;
- wctx.buffered = true;
+ wctx->buffered = true;
}
- // FIXME: Using the MAX of the block_size_order and preferred_csum_order
- // results in poor small random read performance when data was initially
- // written out in large chunks. Reverting to previous behavior for now.
- wctx.csum_order = block_size_order;
+ // apply basic csum block size
+ wctx->csum_order = block_size_order;
// compression parameters
unsigned alloc_hints = o->onode.alloc_hint_flags;
auto cm = select_option(
"compression_mode",
- comp_mode.load(),
+ comp_mode.load(),
[&]() {
string val;
if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
- return boost::optional<Compressor::CompressionMode>(Compressor::get_comp_mode_type(val));
+ return boost::optional<Compressor::CompressionMode>(
+ Compressor::get_comp_mode_type(val));
}
return boost::optional<Compressor::CompressionMode>();
}
);
- wctx.compress = (cm != Compressor::COMP_NONE) &&
+
+ wctx->compress = (cm != Compressor::COMP_NONE) &&
((cm == Compressor::COMP_FORCE) ||
(cm == Compressor::COMP_AGGRESSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
     (cm == Compressor::COMP_PASSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
+
if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
(alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
- (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE|
- CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
+ (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
(alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
+
dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
+
auto order = min_alloc_size_order.load();
if (o->onode.expected_write_size) {
- wctx.csum_order = std::max(order,
- (uint8_t)ctz(o->onode.expected_write_size));
+ wctx->csum_order = std::max(order,
+ (uint8_t)ctz(o->onode.expected_write_size));
} else {
- wctx.csum_order = order;
+ wctx->csum_order = order;
}
- if (wctx.compress) {
- wctx.target_blob_size = select_option(
+ if (wctx->compress) {
+ wctx->target_blob_size = select_option(
"compression_max_blob_size",
- comp_max_blob_size.load(),
+ comp_max_blob_size.load(),
[&]() {
int val;
if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
          return boost::optional<uint64_t>((uint64_t)val);
        }
        return boost::optional<uint64_t>();
      }
    );
}
} else {
- if (wctx.compress) {
- wctx.target_blob_size = select_option(
+ if (wctx->compress) {
+ wctx->target_blob_size = select_option(
"compression_min_blob_size",
- comp_min_blob_size.load(),
+ comp_min_blob_size.load(),
[&]() {
int val;
if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
          return boost::optional<uint64_t>((uint64_t)val);
        }
        return boost::optional<uint64_t>();
      }
    );
}
}
+
uint64_t max_bsize = max_blob_size.load();
- if (wctx.target_blob_size == 0 || wctx.target_blob_size > max_bsize) {
- wctx.target_blob_size = max_bsize;
+ if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
+ wctx->target_blob_size = max_bsize;
}
+
// set the min blob size floor at 2x the min_alloc_size, or else we
// won't be able to allocate a smaller extent for the compressed
// data.
- if (wctx.compress &&
- wctx.target_blob_size < min_alloc_size * 2) {
- wctx.target_blob_size = min_alloc_size * 2;
+ if (wctx->compress &&
+ wctx->target_blob_size < min_alloc_size * 2) {
+ wctx->target_blob_size = min_alloc_size * 2;
}
+
+ dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
+ << " target_blob_size 0x" << std::hex << wctx->target_blob_size
+ << std::dec << dendl;
+}
+
+int BlueStore::_do_gc(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ const GarbageCollector& gc,
+ const WriteContext& wctx,
+ uint64_t *dirty_start,
+ uint64_t *dirty_end)
+{
+ auto& extents_to_collect = gc.get_extents_to_collect();
+
+ WriteContext wctx_gc;
wctx_gc.fork(wctx); // make a clone for garbage collection
- dout(20) << __func__ << " prefer csum_order " << wctx.csum_order
- << " target_blob_size 0x" << std::hex << wctx.target_blob_size
- << std::dec << dendl;
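+  // re-read each extent chosen by the estimator and push it back through
+  // the cloned write context so its data is re-allocated (and merged) on disk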
+ for (auto it = extents_to_collect.begin();
+ it != extents_to_collect.end();
+ ++it) {
+ bufferlist bl;
+ int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
+ assert(r == (int)it->length);
+
+ o->extent_map.fault_range(db, it->offset, it->length);
+ _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
+ logger->inc(l_bluestore_gc_merged, it->length);
+
+ if (*dirty_start > it->offset) {
+ *dirty_start = it->offset;
+ }
+
+ if (*dirty_end < it->offset + it->length) {
+ *dirty_end = it->offset + it->length;
+ }
+ }
+
+ dout(30) << __func__ << " alloc write" << dendl;
+ int r = _do_alloc_write(txc, c, o, &wctx_gc);
+ if (r < 0) {
+ derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ _wctx_finish(txc, c, o, &wctx_gc);
+ return 0;
+}
+
+int BlueStore::_do_write(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ int r = 0;
+
+ dout(20) << __func__
+ << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length
+ << " - have 0x" << o->onode.size
+ << " (" << std::dec << o->onode.size << ")"
+ << " bytes"
+ << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
+ << dendl;
+ _dump_onode(o);
+
+ if (length == 0) {
+ return 0;
+ }
+
+ uint64_t end = offset + length;
+
+ GarbageCollector gc(c->store->cct);
+ int64_t benefit;
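+  // declared up front (uninitialized) so the 'goto out' error paths below
+  // do not jump over an initialized local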
+ auto dirty_start = offset;
+ auto dirty_end = end;
+
+ WriteContext wctx;
+ _choose_write_options(c, o, fadvise_flags, &wctx);
o->extent_map.fault_range(db, offset, length);
_do_write_data(txc, c, o, offset, length, bl, &wctx);
-
r = _do_alloc_write(txc, c, o, &wctx);
if (r < 0) {
derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
goto out;
}
+ // NB: _wctx_finish() will empty old_extents
+ // so we must do gc estimation before that
benefit = gc.estimate(offset,
- length,
- o->extent_map,
- wctx.old_extents,
- min_alloc_size);
+ length,
+ o->extent_map,
+ wctx.old_extents,
+ min_alloc_size);
_wctx_finish(txc, c, o, &wctx);
if (end > o->onode.size) {
dout(20) << __func__ << " extending size to 0x" << std::hex << end
- << std::dec << dendl;
+ << std::dec << dendl;
o->onode.size = end;
}
if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
- dout(20) << __func__ << " perform garbage collection, expected benefit = "
- << benefit << " AUs" << dendl;
- auto& extents_to_collect = gc.get_extents_to_collect();
- for (auto it = extents_to_collect.begin();
- it != extents_to_collect.end();
- ++it) {
- bufferlist bl;
- int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
- assert(r == (int)it->length);
- o->extent_map.fault_range(db, it->offset, it->length);
- _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
- logger->inc(l_bluestore_gc_merged, it->length);
- was_gc = true;
- if (dirty_start > it->offset) {
- dirty_start = it->offset;
- }
- if (dirty_end < it->offset + it->length) {
- dirty_end = it->offset + it->length;
+ if (!gc.get_extents_to_collect().empty()) {
+ dout(20) << __func__ << " perform garbage collection, "
+ << "expected benefit = " << benefit << " AUs" << dendl;
+ r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
+ if (r < 0) {
+ derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
+ << dendl;
+ goto out;
}
}
}
- if (was_gc) {
- dout(30) << __func__ << " alloc write for GC" << dendl;
- r = _do_alloc_write(txc, c, o, &wctx_gc);
- if (r < 0) {
- derr << __func__ << " _do_alloc_write(gc) failed with " << cpp_strerror(r)
- << dendl;
- goto out;
- }
- _wctx_finish(txc, c, o, &wctx_gc);
- }
o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
+
r = 0;
out:
int BlueStore::_write(TransContext *txc,
CollectionRef& c,
OnodeRef& o,
- uint64_t offset, size_t length,
- bufferlist& bl,
- uint32_t fadvise_flags)
+ uint64_t offset, size_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
{
dout(15) << __func__ << " " << c->cid << " " << o->oid
<< " 0x" << std::hex << offset << "~" << length << std::dec