return 0;
}
-ceph::bufferlist BlueFS::FileWriter::flush_buffer(
- CephContext* const cct,
- const bool partial,
- const unsigned length,
- const bluefs_super_t& super)
+ceph::bufferlist BlueFS::FileWriter::get_flush_buffer(
+ CephContext* cct,
+ uint64_t want_end)
{
ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
- ceph::bufferlist bl;
- if (partial) {
- tail_block.splice(0, tail_block.length(), &bl);
- }
- dout(20) << __func__ << " tail is" << std::hex << bl.length() << dendl;
- ceph_assert(length >= bl.length());
- const auto remaining_len = length - bl.length();
- buffer.splice(0, remaining_len, &bl);
- if (buffer.length()) {
- dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
- << " unflushed" << dendl;
- }
- // Append padding to fill block
- unsigned tail = bl.length() & ~super.block_mask();
- if (tail) {
- unsigned padding_len = super.block_size - tail;
- dout(20) << __func__ << " caching tail of 0x"
- << std::hex << tail
- << " and padding block with 0x" << padding_len
- << " buffer.length() " << buffer.length()
- << std::dec << dendl;
+ ceph_assert(get_pos() < want_end);
+ ceph::bufferlist bl_to_disk;
+ uint64_t data_end = get_pos() + get_buffer_length();
+ uint64_t io_end = p2roundup<uint64_t>(
+ std::min(want_end, data_end), super_block_size);
+ uint64_t io_size = io_end - buffer_pos;
+ if (io_end <= data_end) {
+ // only partial flush
+ buffer.splice(0, io_size, &bl_to_disk);
+ pos = want_end;
+ buffer_pos += io_size;
+ } else {
+ // need to pad with zeros
+ ceph_assert(io_end - data_end < super_block_size);
+ uint64_t tail = p2phase<uint64_t>(data_end, super_block_size);
// We need to go through the `buffer_appender` to get a chance to
// preserve in-memory contiguity and not mess with the alignment.
// Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
- buffer_appender.append_zero(padding_len);
- buffer.splice(buffer.length() - padding_len, padding_len, &bl);
- // Deep copy the tail here. This allows to avoid costlier copy on
- // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
- // of memory allocations.
- // The alternative approach would be to place the entire tail and
- // padding on a dedicated, 4 KB long memory chunk. This shouldn't
- // trigger the rebuild while still being less expensive.
+ buffer_appender.append_zero(super_block_size - tail);
+ // now aligned
+ buffer.splice(0, io_size, &bl_to_disk);
if (file->envelope_mode() &&
buffer.get_append_buffer_unused_tail_length() < tail + File::envelope_t::head_size()) {
// Envelope mode header must completely fit in single buffer::ptr,
// pages. The size is min 2 * super.block_size so header will fit.
buffer.clear();
}
- buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
- buffer.splice(buffer.length() - tail, tail, &tail_block);
- } else {
- tail_block.clear();
+ // append (duplicate) tail for next flush
+ buffer_appender.substr_of(bl_to_disk, (data_end - tail) - buffer_pos, tail);
+
+ pos = want_end;
+ buffer_pos += io_size - super_block_size;
+ ceph_assert(buffer.length() < super_block_size);
}
- return bl;
+ return bl_to_disk;
}
int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
ceph_assert(ceph_mutex_is_locked(h->lock));
ceph_assert(ceph_mutex_is_locked(h->file->lock));
}
- uint64_t x_off = 0;
- auto p = h->file->fnode.seek(offset, &x_off);
- ceph_assert(p != h->file->fnode.extents.end());
- dout(20) << __func__ << " in " << *p << " x_off 0x"
- << std::hex << x_off << std::dec << dendl;
- unsigned partial = x_off & ~super.block_mask();
- if (partial) {
- dout(20) << __func__ << " using partial tail 0x"
- << std::hex << partial << std::dec << dendl;
- x_off -= partial;
- offset -= partial;
- length += partial;
+ if (h->has_flushed_data()) {
dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
for (auto p : h->iocv) {
if (p) {
}
}
}
-
- auto bl = h->flush_buffer(cct, partial, length, super);
- ceph_assert(bl.length() >= length);
- h->set_pos(offset + length);
+ uint64_t write_offset = h->get_flush_offset();
+ bufferlist bl = h->get_flush_buffer(cct, offset + length);
length = bl.length();
+ uint64_t x_off = 0;
+ auto p = h->file->fnode.seek(write_offset, &x_off);
+ ceph_assert(p != h->file->fnode.extents.end());
+ dout(20) << __func__ << " in " << *p << " x_off 0x"
+ << std::hex << x_off << std::dec << dendl;
+
logger->inc(l_bluefs_write_count, 1);
logger->inc(l_bluefs_write_bytes, length);
return pos;
}
void set_pos(uint64_t new_pos) {
+ ceph_assert(buffer.length() == 0);
+ ceph_assert(p2aligned<uint32_t>(new_pos, super_block_size));
pos = new_pos;
+ buffer_pos = pos;
}
private:
- uint64_t pos = 0; ///< offset of data in file
- ceph::buffer::list buffer; ///< new data to write (at end of file)
- ceph::buffer::list tail_block; ///< existing partial block at end of file, if any
+ uint64_t pos = 0; ///< offset of unflushed data
+ ceph::buffer::list buffer; ///< new data to write (at end of file)
+ /// includes unaligned tail from last flush
+ uint32_t super_block_size;
+ uint64_t buffer_pos = 0; ///< offset of buffer in file
public:
+ // does not take into account part of `buffer` already flushed to disk
unsigned get_buffer_length() const {
- return buffer.length();
+ return buffer.length() - (pos - buffer_pos);
}
- ceph::bufferlist flush_buffer(
+ // indicates that a part of buffer has already been flushed to disk
+ bool has_flushed_data() {
+ return pos != buffer_pos;
+ }
+ // offset (in file) of `buffer` to flush
+ uint64_t get_flush_offset() {
+ return buffer_pos;
+ }
+ // create bufferlist to write to disk, modify buffer
+ ceph::bufferlist get_flush_buffer(
CephContext* cct,
- const bool partial,
- const unsigned length,
- const bluefs_super_t& super);
+ uint64_t flush_end);
ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only
bufferlist::contiguous_filler envelope_head_filler;
public:
FileWriter(FileRef f, unsigned super_block_size)
: file(std::move(f))
+ , super_block_size(super_block_size)
, buffer_appender(buffer.get_page_aligned_appender(
std::max<uint64_t>(g_conf()->bluefs_alloc_size, 2 * super_block_size) / CEPH_PAGE_SIZE))
, envelope_head_filler() {
}
uint64_t get_effective_write_pos() {
- return pos + buffer.length();
+ return buffer_pos + buffer.length();
}
};