From: Haomai Wang
Date: Sun, 7 Feb 2016 06:57:13 +0000 (+0800)
Subject: NVMEDevice: use nvme zero command instead of writing zero
X-Git-Tag: v10.1.0~347^2~9
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=525927044bf9a831f2ffe7b205a676f7dd7175fc;p=ceph.git

NVMEDevice: use nvme zero command instead of writing zero

Signed-off-by: Haomai Wang
---

diff --git a/src/os/bluestore/NVMEDevice.cc b/src/os/bluestore/NVMEDevice.cc
index 48eee088d1b4..246cb6261e31 100644
--- a/src/os/bluestore/NVMEDevice.cc
+++ b/src/os/bluestore/NVMEDevice.cc
@@ -63,9 +63,11 @@ rte_mempool *task_pool = nullptr;
 enum {
   l_bluestore_nvmedevice_first = 632430,
   l_bluestore_nvmedevice_aio_write_lat,
+  l_bluestore_nvmedevice_aio_zero_lat,
   l_bluestore_nvmedevice_read_lat,
   l_bluestore_nvmedevice_flush_lat,
   l_bluestore_nvmedevice_aio_write_queue_lat,
+  l_bluestore_nvmedevice_aio_zero_queue_lat,
   l_bluestore_nvmedevice_read_queue_lat,
   l_bluestore_nvmedevice_flush_queue_lat,
   l_bluestore_nvmedevice_queue_ops,
@@ -147,11 +149,13 @@ class SharedDriverData {
     PerfCountersBuilder b(g_ceph_context, string("NVMEDevice-AIOThread-"+stringify(this)),
                           l_bluestore_nvmedevice_first, l_bluestore_nvmedevice_last);
     b.add_time_avg(l_bluestore_nvmedevice_aio_write_lat, "aio_write_lat", "Average write completing latency");
+    b.add_time_avg(l_bluestore_nvmedevice_aio_zero_lat, "aio_zero_lat", "Average zero completing latency");
     b.add_time_avg(l_bluestore_nvmedevice_read_lat, "read_lat", "Average read completing latency");
     b.add_time_avg(l_bluestore_nvmedevice_flush_lat, "flush_lat", "Average flush completing latency");
     b.add_u64(l_bluestore_nvmedevice_queue_ops, "queue_ops", "Operations in nvme queue");
     b.add_time_avg(l_bluestore_nvmedevice_polling_lat, "polling_lat", "Average polling latency");
     b.add_time_avg(l_bluestore_nvmedevice_aio_write_queue_lat, "aio_write_queue_lat", "Average queue write request latency");
+    b.add_time_avg(l_bluestore_nvmedevice_aio_zero_queue_lat, "aio_zero_queue_lat", "Average queue zero request latency");
     b.add_time_avg(l_bluestore_nvmedevice_read_queue_lat, "read_queue_lat", "Average queue read request latency");
     b.add_time_avg(l_bluestore_nvmedevice_flush_queue_lat, "flush_queue_lat", "Average queue flush request latency");
     logger = b.create_perf_counters();
@@ -279,6 +283,24 @@ void SharedDriverData::_aio_thread()
           logger->tinc(l_bluestore_nvmedevice_aio_write_queue_lat, lat);
           break;
         }
+        case IOCommand::ZERO_COMMAND:
+        {
+          lba_off = t->offset / block_size;
+          lba_count = t->len / block_size;
+          dout(20) << __func__ << " zero command issued " << lba_off << "~" << lba_count << dendl;
+          r = nvme_ns_cmd_write_zeroes(ns, lba_off, lba_count, io_complete, t, 0);
+          if (r < 0) {
+            t->ctx->nvme_task_first = t->ctx->nvme_task_last = nullptr;
+            rte_free(t->buf);
+            rte_mempool_put(task_pool, t);
+            derr << __func__ << " failed to do zero command" << dendl;
+            assert(0);
+          }
+          lat = ceph_clock_now(g_ceph_context);
+          lat -= t->start;
+          logger->tinc(l_bluestore_nvmedevice_aio_zero_queue_lat, lat);
+          break;
+        }
         case IOCommand::READ_COMMAND:
         {
           dout(20) << __func__ << " read command issueed " << lba_off << "~" << lba_count << dendl;
@@ -523,11 +545,15 @@ void io_complete(void *t, const struct nvme_completion *completion)
   int left = driver->inflight_ops.dec();
   utime_t lat = ceph_clock_now(g_ceph_context);
   lat -= task->start;
-  if (task->command == IOCommand::WRITE_COMMAND) {
-    driver->logger->tinc(l_bluestore_nvmedevice_aio_write_lat, lat);
+  if (task->command == IOCommand::WRITE_COMMAND ||
+      task->command == IOCommand::ZERO_COMMAND) {
+    if (task->command == IOCommand::WRITE_COMMAND)
+      driver->logger->tinc(l_bluestore_nvmedevice_aio_write_lat, lat);
+    else
+      driver->logger->tinc(l_bluestore_nvmedevice_aio_zero_lat, lat);
     assert(!nvme_completion_is_error(completion));
-    dout(20) << __func__ << " write op successfully, left " << left << dendl;
-    // buffer write won't have ctx, and we will free request later, see `flush`
+    dout(20) << __func__ << " write/zero op successfully, left " << left << dendl;
+    // buffer write/zero won't have ctx, and we will free request later, see `flush`
     if (ctx) {
       // check waiting count before doing callback (which may
       // destroy this ioc).
@@ -581,8 +607,6 @@ NVMEDevice::NVMEDevice(aio_callback_t cb, void *cbpriv)
       aio_callback(cb),
       aio_callback_priv(cbpriv)
 {
-  zeros = buffer::create_page_aligned(1048576);
-  zeros.zero();
 }
 
 
@@ -779,16 +803,32 @@ int NVMEDevice::aio_zero(
   assert(off < size);
   assert(off + len <= size);
 
-  bufferlist bl;
-  while (len > 0) {
-    bufferlist t;
-    t.append(zeros, 0, MIN(zeros.length(), len));
-    len -= t.length();
-    bl.claim_append(t);
+  Task *t;
+  int r = rte_mempool_get(task_pool, (void **)&t);
+  if (r < 0) {
+    derr << __func__ << " failed to get task from mempool: " << r << dendl;
+    return r;
   }
-  // note: this works with aio only becaues the actual buffer is
-  // this->zeros, which is page-aligned and never freed.
-  return aio_write(off, bl, ioc, false);
+  t->start = ceph_clock_now(g_ceph_context);
+
+  t->command = IOCommand::ZERO_COMMAND;
+  t->offset = off;
+  t->len = len;
+  t->device = this;
+  t->return_code = 0;
+  t->next = nullptr;
+
+  t->ctx = ioc;
+  Task *first = static_cast<Task*>(ioc->nvme_task_first);
+  Task *last = static_cast<Task*>(ioc->nvme_task_last);
+  if (last)
+    last->next = t;
+  if (!first)
+    ioc->nvme_task_first = t;
+  ioc->nvme_task_last = t;
+  ioc->num_pending.inc();
+
+  return 0;
 }
 
 int NVMEDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
diff --git a/src/os/bluestore/NVMEDevice.h b/src/os/bluestore/NVMEDevice.h
index dca5837a565e..63cb69c02f4f 100644
--- a/src/os/bluestore/NVMEDevice.h
+++ b/src/os/bluestore/NVMEDevice.h
@@ -35,6 +35,7 @@
 enum class IOCommand {
   READ_COMMAND,
   WRITE_COMMAND,
+  ZERO_COMMAND,
   FLUSH_COMMAND
 };
 
@@ -67,7 +68,6 @@ class NVMEDevice : public BlockDevice {
   uint64_t block_size;
 
   bool aio_stop;
-  bufferptr zeros;
 
   struct BufferedExtents {
     struct Extent {
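
For reference, below is a small standalone sketch (not part of the patch) of the
byte-to-LBA arithmetic the new ZERO_COMMAND path performs in _aio_thread()
before issuing nvme_ns_cmd_write_zeroes(ns, lba_off, lba_count, io_complete, t, 0).
The 4096-byte block size and the sample offsets are hypothetical illustration
values; in the driver, block_size comes from the NVMe namespace sector size.

// Standalone sketch, not part of the patch: the byte-offset -> LBA
// conversion done for IOCommand::ZERO_COMMAND. block_size, off, and len
// below are hypothetical example values.
#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t block_size = 4096;  // hypothetical namespace sector size
  const uint64_t off = 1ULL << 20;   // byte offset of the region to zero
  const uint64_t len = 8ULL << 20;   // byte length of the region to zero

  // aio_zero() asserts the request lies within the device; the divisions
  // below additionally assume block alignment, which BlockDevice callers
  // are expected to guarantee.
  assert(off % block_size == 0 && len % block_size == 0);
  const uint64_t lba_off = off / block_size;    // t->offset / block_size
  const uint64_t lba_count = len / block_size;  // t->len / block_size

  std::cout << "zero " << lba_count << " LBAs starting at LBA " << lba_off
            << std::endl;
  return 0;
}

Because the zeroing is expressed as a single NVMe Write Zeroes command over an
LBA range, no zero-filled buffer has to be built or DMA-mapped, which is why
the patch can drop the page-aligned `zeros` bufferptr entirely.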