]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore/BlockDevice:support pmem device as bluestore backend. 15102/head
authorJianpeng Ma <jianpeng.ma@intel.com>
Wed, 7 Jun 2017 15:22:52 +0000 (23:22 +0800)
committerJianpeng Ma <jianpeng.ma@intel.com>
Wed, 7 Jun 2017 15:22:52 +0000 (23:22 +0800)
Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
CMakeLists.txt
cmake/modules/Findpmem.cmake [new file with mode: 0644]
src/include/config-h.in.cmake
src/os/CMakeLists.txt
src/os/bluestore/BlockDevice.cc
src/os/bluestore/PMEMDevice.cc [new file with mode: 0644]
src/os/bluestore/PMEMDevice.h [new file with mode: 0644]

index a5efb6d1d5a52a13790c7eae67b464f75fa56806..9cd4f2b76fad23fec2f7f7cac7354413b4ba9500 100644 (file)
@@ -217,6 +217,12 @@ if(WITH_SPDK)
   set(HAVE_SPDK TRUE)
 endif(WITH_SPDK)
 
+option(WITH_PMEM "Enable PMEM" OFF)
+if(WITH_PMEM)
+  find_package(pmem REQUIRED)
+  set(HAVE_PMEM ${PMEM_FOUND})
+endif(WITH_PMEM)
+
 # needs mds and? XXX
 option(WITH_LIBCEPHFS "libcephfs client library" ON)
 
diff --git a/cmake/modules/Findpmem.cmake b/cmake/modules/Findpmem.cmake
new file mode 100644 (file)
index 0000000..efcf682
--- /dev/null
@@ -0,0 +1,15 @@
+# Try to find libpmem
+#
+# Once done, this will define
+#
+# PMEM_FOUND
+# PMEM_INCLUDE_DIR
+# PMEM_LIBRARY
+
+# Look for the directory that contains libpmem.h.
+find_path(PMEM_INCLUDE_DIR NAMES libpmem.h)
+# Look for the pmem library itself.
+find_library(PMEM_LIBRARY NAMES pmem)
+
+# Both PMEM_LIBRARY and PMEM_INCLUDE_DIR are listed as required variables
+# for the package to count as found.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(pmem DEFAULT_MSG PMEM_LIBRARY PMEM_INCLUDE_DIR)
+
+# Flag the two cache variables as advanced.
+mark_as_advanced(PMEM_INCLUDE_DIR PMEM_LIBRARY)
index c12244b16b537646cd942c0e7082c4794fe39c24..e7ff0ae245af586d36229dbe5b8ad2caefca09bd 100644 (file)
 /* DPDK conditional compilation */
 #cmakedefine HAVE_DPDK
 
+/* PMEM conditional compilation */
+#cmakedefine HAVE_PMEM
+
 /* Defined if LevelDB supports bloom filters */
 #cmakedefine HAVE_LEVELDB_FILTER_POLICY
 
index b24686d811703c2b1f3a6fccaa0d5a23bab75c38..feda6a23bc1fb279b5336c50f668833c4898514b 100644 (file)
@@ -50,6 +50,11 @@ if(WITH_FUSE)
     FuseStore.cc)
 endif(WITH_FUSE)
 
+if(WITH_PMEM)
+  list(APPEND libos_srcs
+    bluestore/PMEMDevice.cc)
+endif(WITH_PMEM)
+
 if(WITH_SPDK)
   list(APPEND libos_srcs
     bluestore/NVMEDevice.cc)
@@ -67,6 +72,10 @@ if(WITH_FUSE)
   target_link_libraries(os ${FUSE_LIBRARIES})
 endif()
 
+if(WITH_PMEM)
+  target_link_libraries(os ${PMEM_LIBRARY})
+endif()
+
 if(WITH_SPDK)
   target_link_libraries(os
     ${SPDK_LIBRARIES}
index cdc13ff3637425bcecd81fa4672ec9ac42be0a95..ca81952290e029a76301fdc13113796fb08ac659 100644 (file)
 #include "NVMEDevice.h"
 #endif
 
+#if defined(HAVE_PMEM)
+#include "PMEMDevice.h"
+#include <libpmem.h>
+#endif
+
 #include "common/debug.h"
 #include "common/EventTrace.h"
+#include "common/errno.h"
+#include "include/compat.h"
 
 #define dout_context cct
 #define dout_subsys ceph_subsys_bdev
@@ -55,8 +62,27 @@ BlockDevice *BlockDevice::create(CephContext* cct, const string& path,
     if (strncmp(bname, SPDK_PREFIX, sizeof(SPDK_PREFIX)-1) == 0)
       type = "ust-nvme";
   }
+
+#if defined(HAVE_PMEM)
+  if (type == "kernel") {
+    int is_pmem = 0;
+    void *addr = pmem_map_file(path.c_str(), 1024*1024, PMEM_FILE_EXCL, O_RDONLY, NULL, &is_pmem);
+    if (addr != NULL) {
+      if (is_pmem)
+       type = "pmem";
+      pmem_unmap(addr, 1024*1024);
+    }
+  }
+#endif
+
   dout(1) << __func__ << " path " << path << " type " << type << dendl;
 
+#if defined(HAVE_PMEM)
+  if (type == "pmem") {
+    return new PMEMDevice(cct, cb, cbpriv);
+  }
+#endif
+
   if (type == "kernel") {
     return new KernelDevice(cct, cb, cbpriv);
   }
@@ -66,6 +92,7 @@ BlockDevice *BlockDevice::create(CephContext* cct, const string& path,
   }
 #endif
 
+
   derr << __func__ << " unknown backend " << type << dendl;
   ceph_abort();
   return NULL;
diff --git a/src/os/bluestore/PMEMDevice.cc b/src/os/bluestore/PMEMDevice.cc
new file mode 100644 (file)
index 0000000..262eeb1
--- /dev/null
@@ -0,0 +1,305 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel <jianpeng.ma@intel.com>
+ *
+ * Author: Jianpeng Ma <jianpeng.ma@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <libpmem.h>
+
+#include "PMEMDevice.h"
+#include "include/types.h"
+#include "include/compat.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "common/blkdev.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bdev
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev-PMEM("  << path << ") "
+
+/**
+ * Build a device object in the "not opened" state: no file descriptor,
+ * no mapping, zero size and block size.
+ *
+ * NOTE(review): cb and cbpriv are accepted but not used here.
+ */
+PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv)
+  : BlockDevice(cct),
+    fd(-1),
+    addr(nullptr),
+    size(0),
+    block_size(0),
+    debug_lock("PMEMDevice::debug_lock"),
+    injecting_crash(0)
+{}
+
+/**
+ * Request a write lock (F_WRLCK) on fd using fcntl(F_SETLK).
+ *
+ * The lock starts at offset 0 with l_len == 0.
+ *
+ * @return 0 on success, -errno if fcntl() fails
+ */
+int PMEMDevice::_lock()
+{
+  struct flock whole_file;
+  memset(&whole_file, 0, sizeof(whole_file));
+  whole_file.l_whence = SEEK_SET;
+  whole_file.l_start = 0;
+  whole_file.l_len = 0;
+  whole_file.l_type = F_WRLCK;
+
+  if (::fcntl(fd, F_SETLK, &whole_file) < 0) {
+    return -errno;
+  }
+  return 0;
+}
+
+int PMEMDevice::open(const string& p)
+{
+  path = p;
+  int r = 0;
+  dout(1) << __func__ << " path " << path << dendl;
+
+  fd = ::open(path.c_str(), O_RDWR);
+  if (fd < 0) {
+    r = -errno;
+    derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = _lock();
+  if (r < 0) {
+    derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
+        << dendl;
+    goto out_fail;
+  }
+
+  struct stat st;
+  r = ::fstat(fd, &st);
+  if (r < 0) {
+    r = -errno;
+    derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
+    goto out_fail;
+  }
+  if (S_ISBLK(st.st_mode)) {
+    int64_t s;
+    r = get_block_device_size(fd, &s);
+    if (r < 0) {
+      goto out_fail;
+    }
+    size = s;
+  } else {
+    size = st.st_size;
+  }
+
+  size_t map_len;
+  addr = (char *)pmem_map_file(path.c_str(), size, PMEM_FILE_EXCL, O_RDWR, &map_len, NULL);
+  if (addr == NULL) {
+    derr << __func__ << " pmem_map_file error" << dendl;
+    goto out_fail;
+  }
+  size = map_len;
+
+  // Operate as though the block size is 4 KB.  The backing file
+  // blksize doesn't strictly matter except that some file systems may
+  // require a read/modify/write if we write something smaller than
+  // it.
+  block_size = g_conf->bdev_block_size;
+  if (block_size != (unsigned)st.st_blksize) {
+    dout(1) << __func__ << " backing device/file reports st_blksize "
+      << st.st_blksize << ", using bdev_block_size "
+      << block_size << " anyway" << dendl;
+  }
+
+  dout(1) << __func__
+    << " size " << size
+    << " (" << pretty_si_t(size) << "B)"
+    << " block_size " << block_size
+    << " (" << pretty_si_t(block_size) << "B)"
+    << dendl;
+  return 0;
+
+ out_fail:
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  fd = -1;
+  return r;
+}
+
+/**
+ * Unmap the device and close its file descriptor.
+ *
+ * Asserts that a mapping and an open fd exist. Afterwards addr is NULL,
+ * fd is -1 and path is empty.
+ */
+void PMEMDevice::close()
+{
+  dout(1) << __func__ << dendl;
+
+  assert(addr != NULL);
+  pmem_unmap(addr, size);
+  // Drop the stale pointer so that a second close() trips the assert
+  // above instead of unmapping the same range again.
+  addr = NULL;
+  assert(fd >= 0);
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  fd = -1;
+
+  path.clear();
+}
+
+/**
+ * Fetch one string property of a block device.
+ *
+ * The result of get_block_device_string_property() is not checked; the
+ * buffer is zero-filled first, so the returned string is empty unless
+ * the helper wrote something into it.
+ *
+ * @param dev      device name passed through to the helper
+ * @param property property name passed through to the helper
+ * @return the buffer contents up to the first NUL
+ */
+static string get_dev_property(const char *dev, const char *property)
+{
+  char buf[1024];
+  memset(buf, 0, sizeof(buf));
+  get_block_device_string_property(dev, property, buf, sizeof(buf));
+  return string(buf);
+}
+
+int PMEMDevice::collect_metadata(string prefix, map<string,string> *pm) const
+{
+  (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
+  (*pm)[prefix + "size"] = stringify(get_size());
+  (*pm)[prefix + "block_size"] = stringify(get_block_size());
+  (*pm)[prefix + "driver"] = "PMEMDevice";
+  (*pm)[prefix + "type"] = "ssd";
+
+  struct stat st;
+  int r = ::fstat(fd, &st);
+  if (r < 0)
+    return -errno;
+  if (S_ISBLK(st.st_mode)) {
+    (*pm)[prefix + "access_mode"] = "blk";
+    char partition_path[PATH_MAX];
+    char dev_node[PATH_MAX];
+    int rc = get_device_by_fd(fd, partition_path, dev_node, PATH_MAX);
+    switch (rc) {
+    case -EOPNOTSUPP:
+    case -EINVAL:
+      (*pm)[prefix + "partition_path"] = "unknown";
+      (*pm)[prefix + "dev_node"] = "unknown";
+      break;
+    case -ENODEV:
+      (*pm)[prefix + "partition_path"] = string(partition_path);
+      (*pm)[prefix + "dev_node"] = "unknown";
+      break;
+    default:
+      {
+       (*pm)[prefix + "partition_path"] = string(partition_path);
+       (*pm)[prefix + "dev_node"] = string(dev_node);
+       (*pm)[prefix + "model"] = get_dev_property(dev_node, "device/model");
+       (*pm)[prefix + "dev"] = get_dev_property(dev_node, "dev");
+
+       // nvme exposes a serial number
+       string serial = get_dev_property(dev_node, "device/serial");
+       if (serial.length()) {
+         (*pm)[prefix + "serial"] = serial;
+       }
+
+       // nvme has a device/device/* structure; infer from that.  there
+       // is probably a better way?
+       string nvme_vendor = get_dev_property(dev_node, "device/device/vendor");
+       if (nvme_vendor.length()) {
+         (*pm)[prefix + "type"] = "nvme";
+       }
+      }
+    }
+  } else {
+    (*pm)[prefix + "access_mode"] = "file";
+    (*pm)[prefix + "path"] = path;
+  }
+  return 0;
+}
+
+/// Flush is a no-op for this device; always returns 0.
+int PMEMDevice::flush()
+{
+  // Nothing to do here: write() copies data with pmem_memcpy_persist().
+  return 0;
+}
+
+
+/// No-op: aio_write() and aio_read() do their work immediately, so this
+/// function has nothing to submit. ioc is not used.
+void PMEMDevice::aio_submit(IOContext *ioc)
+{
+}
+
+int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered)
+{
+  uint64_t len = bl.length();
+  dout(20) << __func__ << " " << off << "~" << len  << dendl;
+  assert(len > 0);
+  assert(off < size);
+  assert(off + len <= size);
+
+  dout(40) << "data: ";
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+
+  if (g_conf->bdev_inject_crash &&
+      rand() % g_conf->bdev_inject_crash == 0) {
+    derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len
+      << dendl;
+    ++injecting_crash;
+    return 0;
+  }
+
+  bufferlist::iterator p = bl.begin();
+  uint32_t off1 = off;
+  while (len) {
+    const char *data;
+    uint32_t l = p.get_ptr_and_advance(len, &data);
+    pmem_memcpy_persist(addr + off1, data, l);
+    len -= l;
+    off1 += l;
+  }
+
+  return 0;
+}
+
+/// Forwards directly to write(), so the data is written before this
+/// returns. ioc is not used. Returns whatever write() returns.
+int PMEMDevice::aio_write(
+  uint64_t off,
+  bufferlist &bl,
+  IOContext *ioc,
+  bool buffered)
+{
+  return write(off, bl, buffered);
+}
+
+
+int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
+                     IOContext *ioc,
+                     bool buffered)
+{
+  dout(5) << __func__ << " " << off << "~" << len  << dendl;
+  assert(len > 0);
+  assert(off < size);
+  assert(off + len <= size);
+
+  bufferptr p = buffer::create_page_aligned(len);
+  memcpy(p.c_str(), addr + off, len);
+
+  pbl->clear();
+  pbl->push_back(std::move(p));
+
+  dout(40) << "data: ";
+  pbl->hexdump(*_dout);
+  *_dout << dendl;
+
+  return 0;
+}
+
+/// Forwards directly to read() with buffered == false, so the data is
+/// in *pbl before this returns. Returns whatever read() returns.
+int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl,
+                     IOContext *ioc)
+{
+  return read(off, len, pbl, ioc, false);
+}
+
+/// Copy len bytes from the mapping at byte offset off into buf.
+/// The buffered argument is not used. Always returns 0.
+int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered)
+{
+  // The requested range must be non-empty and lie within [0, size).
+  assert(len > 0);
+  assert(off < size);
+  assert(off + len <= size);
+
+  memcpy(buf, addr + off, len);
+  return 0;
+}
+
+
+/// Logs the requested range at debug level 5 and does nothing else.
+/// Always returns 0.
+int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len)
+{
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  return 0;
+}
+
+
diff --git a/src/os/bluestore/PMEMDevice.h b/src/os/bluestore/PMEMDevice.h
new file mode 100644 (file)
index 0000000..a908c78
--- /dev/null
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ *  Copyright (C) 2015 Intel <jianpeng.ma@intel.com>
+ *
+ * Author: Jianpeng Ma <jianpeng.ma@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_BLUESTORE_PMEMDEVICE_H
+#define CEPH_OS_BLUESTORE_PMEMDEVICE_H
+
+#include <atomic>
+
+#include "os/fs/FS.h"
+#include "os/fs/aio.h"
+#include "include/interval_set.h"
+#include "BlockDevice.h"
+
+/// BlockDevice implementation that maps its backing path with libpmem
+/// and serves reads and writes by copying to and from that mapping.
+class PMEMDevice : public BlockDevice {
+  int fd;               ///< descriptor of the opened path; -1 when not open
+  char *addr;           ///< base address of the mapping returned by pmem_map_file()
+  uint64_t size;        ///< device size in bytes (the mapped length once opened)
+  uint64_t block_size;  ///< block size reported by get_block_size()
+  std::string path;     ///< path given to open(); cleared by close()
+
+  // NOTE(review): debug_lock and debug_inflight are declared here but are
+  // not referenced by any method shown alongside this class -- confirm
+  // whether they are still needed.
+  Mutex debug_lock;
+  interval_set<uint64_t> debug_inflight;
+
+  std::atomic_int injecting_crash;  ///< incremented each time write() drops an I/O on purpose
+  int _lock();  ///< lock fd with fcntl(); returns 0 or -errno
+
+public:
+  PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv);
+
+
+  void aio_submit(IOContext *ioc) override;
+
+  /// Device size in bytes.
+  uint64_t get_size() const override {
+    return size;
+  }
+  /// Block size in bytes.
+  uint64_t get_block_size() const override {
+    return block_size;
+  }
+
+  /// Fill *pm with descriptive key/value pairs; every key starts with prefix.
+  int collect_metadata(std::string prefix, map<std::string,std::string> *pm) const override;
+
+  int read(uint64_t off, uint64_t len, bufferlist *pbl,
+          IOContext *ioc,
+          bool buffered) override;
+  int aio_read(uint64_t off, uint64_t len, bufferlist *pbl,
+              IOContext *ioc) override;
+
+  int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override;
+  int write(uint64_t off, bufferlist& bl, bool buffered) override;
+  int aio_write(uint64_t off, bufferlist& bl,
+               IOContext *ioc,
+               bool buffered) override;
+  int flush() override;
+
+  // for managing buffered readers/writers
+  int invalidate_cache(uint64_t off, uint64_t len) override;
+  /// Open, lock and map the device at path; 0 on success.
+  int open(const std::string &path) override;
+  /// Unmap the device and close its descriptor.
+  void close() override;
+};
+
+#endif