From 5a1536986543cf0e424e2c6017dcc442d80287ba Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 7 Aug 2013 14:38:22 +0800 Subject: [PATCH] store: Add (experimental) ZFS parallel journal support This patch adds ZFS parallel journal support. It uses libzfs provided by zfsonlinux to access ZFS' functionalities. To enable ZFS parallel journal support, compile ceph by: ./configure --with-libzfs LIBZFS_CFLAGS="-I -I" make Add following line to osd section of ceph.conf filestore zfs_snap = 1 Note: ZFS (no matter whether parallel journal is enabled or not) does not support direct IO. To use it as backend FS for OSD, you need to add following line to osd section of ceph.conf journal aio = 0 journal dio = 0 Signed-off-by: Yan, Zheng --- configure.ac | 11 ++ src/Makefile.am | 15 +- src/common/config_opts.h | 1 + src/os/FileStore.cc | 10 ++ src/os/FileStore.h | 4 + src/os/ZFS.cc | 83 +++++++++++ src/os/ZFS.h | 39 +++++ src/os/ZFSFileStoreBackend.cc | 259 ++++++++++++++++++++++++++++++++++ src/os/ZFSFileStoreBackend.h | 30 ++++ 9 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 src/os/ZFS.cc create mode 100644 src/os/ZFS.h create mode 100644 src/os/ZFSFileStoreBackend.cc create mode 100644 src/os/ZFSFileStoreBackend.h diff --git a/configure.ac b/configure.ac index 812126664ebfd..6f3cc6a9f5f76 100644 --- a/configure.ac +++ b/configure.ac @@ -466,6 +466,17 @@ AS_IF([test "$with_libaio" = "yes"], [AC_DEFINE([HAVE_LIBAIO], [1], [Defined if you don't have atomic_ops])]) AM_CONDITIONAL(WITH_LIBAIO, [ test "$with_libaio" = "yes" ]) +# use libzfs +AC_ARG_WITH([libzfs], + [AS_HELP_STRING([--with-libzfs], [build ZFS support])], + , + [with_libzfs=no]) +AS_IF([test "x$with_libzfs" = xyes], + [PKG_CHECK_MODULES([LIBZFS], [zfs], [], [true])]) +AS_IF([test "x$with_libzfs" = xyes], + [AC_DEFINE([HAVE_LIBZFS], [1], [Defined if you have libzfs enabled])]) +AM_CONDITIONAL(WITH_LIBZFS, [ test "$with_libzfs" = "yes" ]) + # Checks for header files. 
AC_HEADER_DIRENT AC_HEADER_STDC diff --git a/src/Makefile.am b/src/Makefile.am index 3b93c9da325b4..f268cab5e2b2a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -71,6 +71,10 @@ if WITH_LIBAIO LIBOS_LDA += -laio endif +if WITH_LIBZFS +LIBOS_LDA += libos_zfs.a -lzfs +endif + # use system leveldb LIBOS_LDA += -lleveldb -lsnappy @@ -1669,10 +1673,17 @@ libos_a_SOURCES = \ os/LevelDBStore.cc \ os/WBThrottle.cc \ os/BtrfsFileStoreBackend.cc \ - os/GenericFileStoreBackend.cc + os/GenericFileStoreBackend.cc \ + os/ZFSFileStoreBackend.cc libos_a_CXXFLAGS= ${AM_CXXFLAGS} noinst_LIBRARIES += libos.a +if WITH_LIBZFS +libos_zfs_a_SOURCES = os/ZFS.cc +libos_zfs_a_CXXFLAGS= ${AM_CXXFLAGS} ${LIBZFS_CFLAGS} +noinst_LIBRARIES += libos_zfs.a +endif + libosd_a_SOURCES = \ osd/PG.cc \ osd/PGLog.cc \ @@ -2167,6 +2178,7 @@ noinst_HEADERS = \ msg/msg_types.h\ objclass/objclass.h\ os/btrfs_ioctl.h\ + os/ZFS.h\ os/chain_xattr.h\ os/hobject.h \ os/CollectionIndex.h\ @@ -2174,6 +2186,7 @@ noinst_HEADERS = \ os/FileStore.h\ os/BtrfsFileStoreBackend.h\ os/GenericFileStoreBackend.h\ + os/ZFSFileStoreBackend.h\ os/FlatIndex.h\ os/HashIndex.h\ os/FDCache.h\ diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d933250f282bc..ffc0e6094207a 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -540,6 +540,7 @@ OPTION(filestore_max_sync_interval, OPT_DOUBLE, 5) // seconds OPTION(filestore_min_sync_interval, OPT_DOUBLE, .01) // seconds OPTION(filestore_btrfs_snap, OPT_BOOL, true) OPTION(filestore_btrfs_clone_range, OPT_BOOL, true) +OPTION(filestore_zfs_snap, OPT_BOOL, false) // zfsonlinux is still unstable OPTION(filestore_fsync_flushes_journal_data, OPT_BOOL, false) OPTION(filestore_fiemap, OPT_BOOL, false) // (try to) use fiemap OPTION(filestore_journal_parallel, OPT_BOOL, false) diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index cdcb216a3f598..a84cc69858ce0 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -52,6 +52,7 @@ #include 
"FileStore.h" #include "GenericFileStoreBackend.h" #include "BtrfsFileStoreBackend.h" +#include "ZFSFileStoreBackend.h" #include "common/BackTrace.h" #include "include/types.h" #include "FileJournal.h" @@ -602,6 +603,10 @@ int FileStore::mkfs() if (basefs.f_type == BTRFS_SUPER_MAGIC) { #if defined(__linux__) backend = new BtrfsFileStoreBackend(this); +#endif + } else if (basefs.f_type == ZFS_SUPER_MAGIC) { +#ifdef HAVE_LIBZFS + backend = new ZFSFileStoreBackend(this); #endif } @@ -799,6 +804,11 @@ int FileStore::_detect_fs() } } #endif +#ifdef HAVE_LIBZFS + if (st.f_type == ZFS_SUPER_MAGIC) { + backend = new ZFSFileStoreBackend(this); + } +#endif r = backend->detect_features(); if (r < 0) { diff --git a/src/os/FileStore.h b/src/os/FileStore.h index 2ffc317827a29..9bf7f92966fd5 100644 --- a/src/os/FileStore.h +++ b/src/os/FileStore.h @@ -60,6 +60,10 @@ static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342); # endif #endif +#ifndef ZFS_SUPER_MAGIC +static const __SWORD_TYPE ZFS_SUPER_MAGIC(0x2fc12fc1); +#endif + class FileStoreBackend; class FileStore : public JournalingObjectStore, diff --git a/src/os/ZFS.cc b/src/os/ZFS.cc new file mode 100644 index 0000000000000..02520796c7307 --- /dev/null +++ b/src/os/ZFS.cc @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#define HAVE_IOCTL_IN_SYS_IOCTL_H +#include +#include "ZFS.h" + +const int ZFS::TYPE_FILESYSTEM = ZFS_TYPE_FILESYSTEM; +const int ZFS::TYPE_SNAPSHOT = ZFS_TYPE_SNAPSHOT; +const int ZFS::TYPE_VOLUME = ZFS_TYPE_VOLUME; +const int ZFS::TYPE_DATASET = ZFS_TYPE_DATASET; + +ZFS::~ZFS() +{ + if (g_zfs) + ::libzfs_fini((libzfs_handle_t*)g_zfs); +} + +int ZFS::init() +{ + g_zfs = ::libzfs_init(); + return g_zfs ? 
0 : -EINVAL; +} + +ZFS::Handle *ZFS::open(const char *n, int t) +{ + return (ZFS::Handle*)::zfs_open((libzfs_handle_t*)g_zfs, n, (zfs_type_t)t); +} + +void ZFS::close(ZFS::Handle *h) +{ + ::zfs_close((zfs_handle_t*)h); +} + +const char *ZFS::get_name(ZFS::Handle *h) +{ + return ::zfs_get_name((zfs_handle_t*)h); +} + +ZFS::Handle *ZFS::path_to_zhandle(const char *p, int t) +{ + return ::zfs_path_to_zhandle((libzfs_handle_t*)g_zfs, (char *)p, (zfs_type_t)t); +} + +int ZFS::create(const char *n, int t) +{ + return ::zfs_create((libzfs_handle_t*)g_zfs, n, (zfs_type_t)t, NULL); +} + +int ZFS::snapshot(const char *n, bool r) +{ + return ::zfs_snapshot((libzfs_handle_t*)g_zfs, n, (boolean_t)r, NULL); +} + +int ZFS::rollback(ZFS::Handle *h, ZFS::Handle *snap, bool f) +{ + return ::zfs_rollback((zfs_handle_t*)h, (zfs_handle_t*)snap, (boolean_t)f); +} + +int ZFS::destroy_snaps(ZFS::Handle *h, const char *n, bool d) +{ + return ::zfs_destroy_snaps((zfs_handle_t*)h, (char *)n, (boolean_t)d); +} + +bool ZFS::is_mounted(ZFS::Handle *h, char **p) +{ + return (bool)::zfs_is_mounted((zfs_handle_t*)h, p); +} + +int ZFS::mount(ZFS::Handle *h, const char *o, int f) +{ + return ::zfs_mount((zfs_handle_t*)h, o, f); +} + +int ZFS::umount(ZFS::Handle *h, const char *o, int f) +{ + return ::zfs_unmount((zfs_handle_t*)h, o, f); +} + +int ZFS::iter_snapshots_sorted(ZFS::Handle *h, ZFS::iter_func f, void *d) +{ + return ::zfs_iter_snapshots_sorted((zfs_handle_t*)h, (zfs_iter_f)f, d); +} diff --git a/src/os/ZFS.h b/src/os/ZFS.h new file mode 100644 index 0000000000000..3ebe11107b20c --- /dev/null +++ b/src/os/ZFS.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_ZFS_H +#define CEPH_ZFS_H + +// Simple wrapper to hide libzfs.h. 
(it conflicts with standard linux headers) +class ZFS { + void *g_zfs; +public: + + static const int TYPE_FILESYSTEM; + static const int TYPE_SNAPSHOT; + static const int TYPE_VOLUME; + static const int TYPE_POOL; + static const int TYPE_DATASET; + + typedef void Handle; + typedef int (*iter_func)(Handle *, void *); + + static const char *get_name(Handle *); + + ZFS() : g_zfs(NULL) {} + ~ZFS(); + int init(); + Handle *open(const char *, int); + void close(Handle *); + Handle *path_to_zhandle(const char *, int); + int create(const char *, int); + int snapshot(const char *, bool); + int rollback(Handle *, Handle *, bool); + int destroy_snaps(Handle *, const char *, bool); + int iter_snapshots_sorted(Handle *, iter_func, void *); + int mount(Handle *, const char *, int); + int umount(Handle *, const char *, int); + bool is_mounted(Handle *, char **); +}; + +#endif diff --git a/src/os/ZFSFileStoreBackend.cc b/src/os/ZFSFileStoreBackend.cc new file mode 100644 index 0000000000000..0f01bd20ee741 --- /dev/null +++ b/src/os/ZFSFileStoreBackend.cc @@ -0,0 +1,259 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "include/compat.h" +#include "include/linux_fiemap.h" +#include "include/types.h" +#include "include/color.h" +#include "include/buffer.h" +#include "include/assert.h" + +#include +#include +#include + +#include "common/errno.h" +#include "common/config.h" +#include "common/sync_filesystem.h" + +#ifdef HAVE_LIBZFS + +#include "ZFSFileStoreBackend.h" + +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") " + +ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs) : + GenericFileStoreBackend(fs), base_zh(NULL), current_zh(NULL), + m_filestore_zfs_snap(g_conf->filestore_zfs_snap) +{ + int ret = zfs.init(); + if (ret < 0) { + dout(0) 
<< "ZFSFileStoreBackend: failed to init libzfs" << dendl; + return; + } + + base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM); + if (!base_zh) { + dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl; + return; + } + + update_current_zh(); +} + +ZFSFileStoreBackend::~ZFSFileStoreBackend() +{ + if (base_zh) + zfs.close(base_zh); + if (current_zh) + zfs.close(current_zh); +} + +int ZFSFileStoreBackend::update_current_zh() +{ + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); + ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM); + if (zh) { + char *mnt; + if (zfs.is_mounted(zh, &mnt)) { + int ret = get_current_path() == mnt; + free(mnt); + if (ret) { + current_zh = zh; + return 0; + } + } else { + int ret = zfs.mount(zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh) + << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + } + zfs.close(zh); + } else { + dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl; + return -ENOENT; + } + + zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM); + if (zh) { + if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) { + current_zh = zh; + return 0; + } + zfs.close(zh); + dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl; + } else { + dout(0) << "update_current_zh: current/ not exist" << dendl; + } + return -ENOENT; +} + +int ZFSFileStoreBackend::detect_features() +{ + if (!current_zh) + dout(0) << "detect_features: null zfs handle for current/" << dendl; + return 0; +} + +bool ZFSFileStoreBackend::can_checkpoint() +{ + return m_filestore_zfs_snap && current_zh != NULL; +} + +int ZFSFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << 
"create_current: current/ exists but is not a directory" << dendl; + return -ENOTDIR; + } + return 0; + } else if (errno != ENOENT) { + ret = -errno; + dout(0) << "create_current: cannot stat current/ " << cpp_strerror(ret) << dendl; + return ret; + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); + ret = zfs.create(path, ZFS::TYPE_FILESYSTEM); + if (ret < 0 && errno != EEXIST) { + ret = -errno; + dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + + ret = update_current_zh(); + return ret; +} + +static int list_checkpoints_callback(ZFS::Handle *zh, void *data) +{ + list *ls = static_cast *>(data); + string str = ZFS::get_name(zh); + size_t pos = str.find('@'); + assert(pos != string::npos && pos + 1 != str.length()); + ls->push_back(str.substr(pos + 1)); + return 0; +} + +int ZFSFileStoreBackend::list_checkpoints(list& ls) +{ + dout(10) << "list_checkpoints:" << dendl; + if (!current_zh) + return -EINVAL; + + list snaps; + int ret = zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &snaps); + if (ret < 0) { + ret = -errno; + dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(ret) << dendl; + return ret; + } + ls.swap(snaps); + return 0; +} + +int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid) +{ + dout(10) << "create_checkpoint: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + // looks like zfsonlinux doesn't flush dirty data when taking snapshot + int ret = sync_filesystem(get_current_fd()); + if (ret < 0) { + ret = -errno; + dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(ret) << dendl; + return ret; + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); + ret = zfs.snapshot(path, false); + if (ret < 0) { + ret = -errno; + dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got" << 
cpp_strerror(ret) << dendl; + return ret; + } + if (cid) + *cid = 0; + return 0; +} + +int ZFSFileStoreBackend::rollback_to(const string& name) +{ + dout(10) << "rollback_to: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + // umount current to avoid triggering online rollback deadlock + int ret; + if (zfs.is_mounted(current_zh, NULL)) { + ret = zfs.umount(current_zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl; + } + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); + + ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT); + if (!snap_zh) { + dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl; + return -ENOENT; + } + + ret = zfs.rollback(current_zh, snap_zh, false); + if (ret < 0) { + ret = -errno; + dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl; + } + + if (!zfs.is_mounted(current_zh, NULL)) { + int ret = zfs.mount(current_zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + } + + zfs.close(snap_zh); + return ret; +} + +int ZFSFileStoreBackend::destroy_checkpoint(const string& name) +{ + dout(10) << "destroy_checkpoint: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + int ret = zfs.destroy_snaps(current_zh, name.c_str(), true); + if (ret < 0) { + ret = -errno; + dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl; + } + return ret; +} +#endif diff --git a/src/os/ZFSFileStoreBackend.h b/src/os/ZFSFileStoreBackend.h new file mode 100644 index 0000000000000..8186d9ca957de --- /dev/null +++ b/src/os/ZFSFileStoreBackend.h @@ -0,0 +1,30 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_ZFSFILESTOREBACKEND_H +#define CEPH_ZFSFILESTOREBACKEND_H + +#ifdef HAVE_LIBZFS +#include "GenericFileStoreBackend.h" +#include "ZFS.h" + +class ZFSFileStoreBackend : public GenericFileStoreBackend { +private: + ZFS zfs; + ZFS::Handle *base_zh; + ZFS::Handle *current_zh; + bool m_filestore_zfs_snap; + int update_current_zh(); +public: + ZFSFileStoreBackend(FileStore *fs); + ~ZFSFileStoreBackend(); + int detect_features(); + bool can_checkpoint(); + int create_current(); + int list_checkpoints(list& ls); + int create_checkpoint(const string& name, uint64_t *cid); + int rollback_to(const string& name); + int destroy_checkpoint(const string& name); +}; +#endif +#endif -- 2.39.5