Prefer direct aio; use the buffered fd only when a caller explicitly asks for buffered I/O.
Signed-off-by: Sage Weil <sage@redhat.com>
#define dout_prefix *_dout << "bdev(" << path << ") "
BlockDevice::BlockDevice(aio_callback_t cb, void *cbpriv)
- : fd(-1),
+ : fd_direct(-1),
+ fd_buffered(-1),
size(0), block_size(0),
fs(NULL), aio(false), dio(false),
debug_lock("BlockDevice::debug_lock"),
l.l_whence = SEEK_SET;
l.l_start = 0;
l.l_len = 0;
- int r = ::fcntl(fd, F_SETLK, &l);
+ int r = ::fcntl(fd_direct, F_SETLK, &l);
if (r < 0)
return -errno;
return 0;
int BlockDevice::open(string path)
{
+ int r = 0;
dout(1) << __func__ << " path " << path << dendl;
- fd = ::open(path.c_str(), O_RDWR | O_DIRECT);
- if (fd < 0) {
+ fd_direct = ::open(path.c_str(), O_RDWR | O_DIRECT);
+ if (fd_direct < 0) {
int r = -errno;
derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
return r;
}
+ fd_buffered = ::open(path.c_str(), O_RDWR);
+ if (fd_buffered < 0) {
+ r = -errno;
+ derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
+ goto out_direct;
+ }
dio = true;
-#ifdef HAVE_LIBAIO
aio = g_conf->bdev_aio;
if (!aio) {
assert(0 == "non-aio not supported");
}
-#endif
- int r = _lock();
+ r = _lock();
if (r < 0) {
derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
<< dendl;
- ::close(fd);
- return r;
+ goto out_fail;
}
struct stat st;
- r = ::fstat(fd, &st);
+ r = ::fstat(fd_direct, &st);
if (r < 0) {
r = -errno;
derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- fd = -1;
- return r;
+ goto out_fail;
}
if (S_ISBLK(st.st_mode)) {
int64_t s;
- r = get_block_device_size(fd, &s);
+ r = get_block_device_size(fd_direct, &s);
if (r < 0) {
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- fd = -1;
- return r;
+ goto out_fail;
}
size = s;
} else {
}
block_size = st.st_blksize;
- fs = FS::create_by_fd(fd);
+ fs = FS::create_by_fd(fd_direct);
assert(fs);
r = _aio_start();
<< " (" << pretty_si_t(block_size) << "B)"
<< dendl;
return 0;
+
+ out_fail:
+ ::close(fd_buffered);
+ fd_buffered = -1;
+ out_direct:
+ ::close(fd_direct);
+ fd_direct = -1;
+ return r;
}
void BlockDevice::close()
{
dout(1) << __func__ << dendl;
_aio_stop();
- assert(fd >= 0);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- fd = -1;
+ assert(fd_direct >= 0);
+ VOID_TEMP_FAILURE_RETRY(::close(fd_direct));
+ fd_direct = -1;
+ assert(fd_buffered >= 0);
+ VOID_TEMP_FAILURE_RETRY(::close(fd_buffered));
+ fd_buffered = -1;
}
int BlockDevice::flush()
assert(0 == "bdev_inject_crash");
}
utime_t start = ceph_clock_now(NULL);
- int r = ::fdatasync(fd);
+ int r = ::fdatasync(fd_direct);
utime_t end = ceph_clock_now(NULL);
utime_t dur = end - start;
if (r < 0) {
int BlockDevice::aio_write(
uint64_t off,
bufferlist &bl,
- IOContext *ioc)
+ IOContext *ioc,
+ bool buffered)
{
uint64_t len = bl.length();
dout(10) << __func__ << " " << off << "~" << len << dendl;
_aio_log_start(ioc, off, bl.length());
#ifdef HAVE_LIBAIO
- if (aio && dio) {
- ioc->pending_aios.push_back(FS::aio_t(ioc, fd));
+ if (aio && dio && !buffered) {
+ ioc->pending_aios.push_back(FS::aio_t(ioc, fd_direct));
ioc->num_pending.inc();
FS::aio_t& aio = ioc->pending_aios.back();
if (g_conf->bdev_inject_crash &&
#endif
{
dout(2) << __func__ << " write to " << off << "~" << len << dendl;
+ if (g_conf->bdev_inject_crash &&
+ rand() % g_conf->bdev_inject_crash == 0) {
+ derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len
+ << dendl;
+ return 0;
+ }
vector<iovec> iov;
bl.prepare_iov(&iov);
- int r = ::pwritev(fd, &iov[0], iov.size(), off);
+ int r = ::pwritev(buffered ? fd_buffered : fd_direct,
+ &iov[0], iov.size(), off);
if (r < 0) {
derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
return r;
bufferlist foo;
// note: this works with aio only because the actual buffer is
// this->zeros, which is page-aligned and never freed.
- return aio_write(off, bl, ioc);
+ return aio_write(off, bl, ioc, false);
}
-int BlockDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, IOContext *ioc)
+int BlockDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc,
+ bool buffered)
{
dout(10) << __func__ << " " << off << "~" << len << dendl;
assert(off % block_size == 0);
ioc->num_reading.inc();;
bufferptr p = buffer::create_page_aligned(len);
- int r = ::pread(fd, p.c_str(), len, off);
+ int r = ::pread(buffered ? fd_buffered : fd_direct,
+ p.c_str(), len, off);
if (r < 0) {
r = -errno;
goto out;
typedef void (*aio_callback_t)(void *handle, void *aio);
private:
- int fd;
+ int fd_direct, fd_buffered;
uint64_t size;
uint64_t block_size;
string path;
}
int read(uint64_t off, uint64_t len, bufferlist *pbl,
- IOContext *ioc);
+ IOContext *ioc,
+ bool buffered);
int aio_write(uint64_t off, bufferlist& bl,
- IOContext *ioc);
+ IOContext *ioc,
+ bool buffered);
int aio_zero(uint64_t off, uint64_t len,
IOContext *ioc);
int flush();
bl.rebuild();
IOContext ioc(NULL);
- bdev[0]->aio_write(get_super_offset(), bl, &ioc);
+ bdev[0]->aio_write(get_super_offset(), bl, &ioc, false);
bdev[0]->aio_submit(&ioc);
ioc.aio_wait();
dout(20) << __func__ << " v " << super.version << " crc " << crc
// always the second block
r = bdev[0]->read(get_super_offset(), get_super_length(),
- &bl, ioc[0]);
+ &bl, ioc[0], false);
if (r < 0)
return r;
}
dout(20) << __func__ << " fetching " << x_off << "~" << l << " of "
<< *p << dendl;
- int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev]);
+ int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
+ true);
assert(r == 0);
}
left = buf->get_buf_remaining(off);
z.zero();
t.append(z);
}
- bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev]);
+ bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], false);
bloff += x_len;
length -= x_len;
++p;
int r;
IOContext ioc(NULL); // FIXME?
+ // By default, do not buffer anything; use buffered I/O only when the client
+ // explicitly requests it via CEPH_OSD_OP_FLAG_FADVISE_WILLNEED.
+ bool buffered = false;
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered read" << dendl;
+ buffered = true;
+ }
+
dout(20) << __func__ << " " << offset << "~" << length << " size "
<< o->onode.size << dendl;
bl.clear();
uint64_t r_len = ROUND_UP_TO(x_len + front_extra, block_size);
dout(30) << __func__ << " reading " << r_off << "~" << r_len << dendl;
bufferlist t;
- r = bdev->read(r_off + bp->second.offset, r_len, &t, &ioc);
+ r = bdev->read(r_off + bp->second.offset, r_len, &t, &ioc, buffered);
if (r < 0) {
goto out;
}
offset = offset & block_mask;
dout(20) << __func__ << " reading initial partial block "
<< offset << "~" << block_size << dendl;
- bdev->read(offset, block_size, &first, ioc);
+ bdev->read(offset, block_size, &first, ioc, true);
bufferlist t;
t.substr_of(first, 0, first_len);
t.claim_append(bl);
} else {
dout(20) << __func__ << " reading trailing partial block "
<< last_offset << "~" << block_size << dendl;
- bdev->read(last_offset, block_size, &last, ioc);
+ bdev->read(last_offset, block_size, &last, ioc, true);
}
bufferlist t;
uint64_t endoff = wo.extent.end() & ~block_mask;
bl.claim_append(t);
}
assert((bl.length() & ~block_mask) == 0);
- bdev->aio_write(offset, bl, ioc);
+ bdev->aio_write(offset, bl, ioc, false);
}
break;
uint64_t first_offset = offset & block_mask;
dout(20) << __func__ << " reading initial partial block "
<< first_offset << "~" << block_size << dendl;
- bdev->read(first_offset, block_size, &first, ioc);
+ bdev->read(first_offset, block_size, &first, ioc, true);
size_t z_len = MIN(block_size - first_len, length);
memset(first.c_str() + first_len, 0, z_len);
- bdev->aio_write(first_offset, first, ioc);
+ bdev->aio_write(first_offset, first, ioc, false);
offset += block_size - first_len;
length -= z_len;
}
bufferlist last;
dout(20) << __func__ << " reading trailing partial block "
<< offset << "~" << block_size << dendl;
- bdev->read(offset, block_size, &last, ioc);
+ bdev->read(offset, block_size, &last, ioc, true);
memset(last.c_str(), 0, length);
- bdev->aio_write(offset, last, ioc);
+ bdev->aio_write(offset, last, ioc, false);
}
}
break;
return 0;
}
+ bool buffered = false;
+ if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered write" << dendl;
+ buffered = true;
+ }
+
uint64_t block_size = bdev->get_block_size();
const uint64_t block_mask = ~(block_size - 1);
uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size;
}
dout(20) << __func__ << " write " << offset << "~" << length
<< " x_off " << x_off << dendl;
- bdev->aio_write(bp->second.offset + x_off, bl, &txc->ioc);
+ bdev->aio_write(bp->second.offset + x_off, bl, &txc->ioc, buffered);
bp->second.clear_flag(bluestore_extent_t::FLAG_UNWRITTEN);
++bp;
continue;
uint64_t x_off = offset - bp->first;
dout(20) << __func__ << " write " << offset << "~" << length
<< " x_off " << x_off << dendl;
- bdev->aio_write(bp->second.offset + x_off, bl, &txc->ioc);
+ bdev->aio_write(bp->second.offset + x_off, bl, &txc->ioc, buffered);
++bp;
continue;
}
uint64_t x_off = offset - bp->first;
dout(20) << __func__ << " write " << offset << "~" << length
<< " x_off " << x_off << dendl;
- bdev->aio_write(bp->second.offset + x_off, bl, &txc->ioc);
+ bdev->aio_write(bp->second.offset + x_off, bl, &txc->ioc, buffered);
if (offset + length < bp->first + bp->second.length &&
offset + length <= o->onode.size) {
uint64_t end = offset + length;