--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <inttypes.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/types.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/assert.h"
+
+#ifndef __CYGWIN__
+#include "btrfs_ioctl.h"
+#endif
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "BtrfsFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+
+#if defined(__linux__)
+
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
+
+BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs):
+ GenericFileStoreBackend(fs), has_clone_range(false), has_snap_create(false),
+ has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
+ m_filestore_btrfs_clone_range(g_conf->filestore_btrfs_clone_range),
+ m_filestore_btrfs_snap (g_conf->filestore_btrfs_snap) { }
+
+int BtrfsFileStoreBackend::detect_features()
+{
+ int r;
+
+ r = GenericFileStoreBackend::detect_features();
+ if (r < 0)
+ return r;
+
+ // clone_range?
+ if (m_filestore_btrfs_clone_range) {
+ int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY, 0600);
+ if (fd >= 0) {
+ ::unlinkat(get_basedir_fd(), "clone_range_test", 0);
+ if (fd < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ btrfs_ioctl_clone_range_args clone_args;
+ memset(&clone_args, 0, sizeof(clone_args));
+ clone_args.src_fd = -1;
+ r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
+ if (r < 0 && errno == EBADF) {
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl;
+ has_clone_range = true;
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
+ }
+ TEMP_FAILURE_RETRY(::close(fd));
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ } else {
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
+ }
+
+ struct btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+
+ // create test source volume
+ vol_args.fd = 0;
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args);
+ if (r != 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+ int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY);
+ if (srcfd < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ // snap_create and snap_destroy?
+ vol_args.fd = srcfd;
+ strcpy(vol_args.name, "sync_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ int err = errno;
+ if (r == 0 || errno == EEXIST) {
+ dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl;
+ has_snap_create = true;
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r == 0) {
+ dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl;
+ has_snap_destroy = true;
+ } else {
+ err = -errno;
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+
+ if (err == -EPERM && getuid() != 0) {
+ dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
+ cerr << TEXT_YELLOW
+ << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
+ << TEXT_NORMAL << std::endl;
+ } else if (err == -EOPNOTSUPP) {
+ derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
+ }
+ }
+ } else {
+ dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap) {
+ if (has_snap_destroy)
+ stable_commits = true;
+ else
+ dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
+ }
+
+ // start_sync?
+ __u64 transid = 0;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid);
+ if (r < 0) {
+ int err = errno;
+ dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl;
+ }
+ if (r == 0 && transid > 0) {
+ dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl;
+
+ // do we have wait_sync too?
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+ if (r == 0 || errno == ERANGE) {
+ dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl;
+ has_wait_sync = true;
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+
+ if (has_wait_sync) {
+ // async snap creation?
+ struct btrfs_ioctl_vol_args_v2 async_args;
+ memset(&async_args, 0, sizeof(async_args));
+ async_args.fd = srcfd;
+ async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+ strcpy(async_args.name, "async_snap_test");
+
+ // remove old one, first
+ struct stat st;
+ strcpy(vol_args.name, async_args.name);
+ if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) {
+ dout(0) << "detect_feature: removing old async_snap_test" << dendl;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+ if (r == 0 || errno == EEXIST) {
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl;
+ has_snap_create_v2 = true;
+
+ // clean up
+ strcpy(vol_args.name, "async_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ // clean up test subvol
+ if (srcfd >= 0)
+ TEMP_FAILURE_RETRY(::close(srcfd));
+
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap && !has_snap_create_v2) {
+ dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
+ << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
+ << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
+ << " performance.\n"
+ << TEXT_NORMAL;
+ }
+
+ return 0;
+}
+
+bool BtrfsFileStoreBackend::can_checkpoint()
+{
+ return stable_commits;
+}
+
+int BtrfsFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+ return -EINVAL;
+ }
+
+ struct stat basest;
+ struct statfs currentfs;
+ ret = ::fstat(get_basedir_fd(), &basest);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = ::statfs(get_current_path().c_str(), ¤tfs);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) {
+ dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl;
+ stable_commits = true;
+ }
+ return 0;
+ }
+
+ struct btrfs_ioctl_vol_args volargs;
+ memset(&volargs, 0, sizeof(volargs));
+
+ volargs.fd = 0;
+ strcpy(volargs.name, "current");
+ if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) {
+ ret = -errno;
+ dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl;
+ if (::chmod(get_current_path().c_str(), 0755) < 0) {
+ ret = -errno;
+ dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ stable_commits = true;
+ return 0;
+}
+
+int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+ struct stat basest;
+ int ret = ::fstat(get_basedir_fd(), &basest);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // get snap list
+ DIR *dir = ::opendir(get_basedir_path().c_str());
+ if (!dir) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ list<string> snaps;
+ char path[PATH_MAX];
+ struct dirent buf, *de;
+ while (::readdir_r(dir, &buf, &de) == 0) {
+ if (!de)
+ break;
+
+ snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name);
+
+ struct stat st;
+ ret = ::stat(path, &st);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: stat '" << path << "' failed: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ if (!S_ISDIR(st.st_mode))
+ continue;
+
+ struct statfs fs;
+ ret = ::statfs(path, &fs);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: statfs '" << path << "' failed: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev)
+ snaps.push_back(string(de->d_name));
+ }
+
+ if (::closedir(dir) < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ ls.swap(snaps);
+ return 0;
+}
+
+int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid)
+{
+ dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+ if (has_snap_create_v2 && transid) {
+ struct btrfs_ioctl_vol_args_v2 async_args;
+ memset(&async_args, 0, sizeof(async_args));
+ async_args.fd = get_current_fd();
+ async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+ strcpy(async_args.name, name.c_str());
+
+ int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl;
+ *transid = async_args.transid;
+ } else {
+ struct btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = get_current_fd();
+ strcpy(vol_args.name, name.c_str());
+
+ int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (transid)
+ *transid = 0;
+ }
+ return 0;
+}
+
+int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid)
+{
+ // wait for commit
+ dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl;
+ int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl;
+ return -errno;
+ }
+ dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl;
+ return 0;
+}
+
+int BtrfsFileStoreBackend::rollback_to(const string& name)
+{
+ dout(10) << "rollback_to: to '" << name << "'" << dendl;
+ char s[PATH_MAX];
+ btrfs_ioctl_vol_args vol_args;
+
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = 0;
+ strcpy(vol_args.name, "current");
+
+ int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (ret && errno != ENOENT) {
+ dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl;
+ snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
+ if (::rename(get_current_path().c_str(), s)) {
+ ret = -errno;
+ dout(0) << "rollback_to: error renaming old current subvol: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+
+ snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str());
+
+ // roll back
+ vol_args.fd = ::open(s, O_RDONLY);
+ if (vol_args.fd < 0) {
+ ret = -errno;
+ dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ if (ret < 0 ) {
+ ret = -errno;
+ dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl;
+ }
+ TEMP_FAILURE_RETRY(::close(vol_args.fd));
+ return ret;
+}
+
+int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
+{
+ dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+ btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = 0;
+ strcpy(vol_args.name, name.c_str());
+
+ int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (ret) {
+ ret = -errno;
+ dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int BtrfsFileStoreBackend::syncfs()
+{
+ dout(15) << "syncfs" << dendl;
+ // do a full btrfs commit
+ int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl;
+ }
+ return ret;
+}
+
+int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl;
+ size_t blk_size = get_blksize();
+ if (!has_clone_range ||
+ srcoff % blk_size != dstoff % blk_size) {
+ dout(20) << "clone_range: using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ int err = 0;
+ int r = 0;
+
+ uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size);
+ uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size);
+ if (srcoffclone >= srcoff + len) {
+ dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ uint64_t lenclone = len - (srcoffclone - srcoff);
+ if (!ALIGNED(lenclone, blk_size)) {
+ struct stat from_stat, to_stat;
+ err = ::fstat(from, &from_stat);
+ if (err) return -errno;
+ err = ::fstat(to , &to_stat);
+ if (err) return -errno;
+
+ if (srcoff + len != (uint64_t)from_stat.st_size ||
+ dstoff + len < (uint64_t)to_stat.st_size) {
+ // Not to the end of the file, need to align length as well
+ lenclone = ALIGN_DOWN(lenclone, blk_size);
+ }
+ }
+ if (lenclone == 0) {
+ // too short
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone
+ << " to " << dstoffclone << " = " << r << dendl;
+ btrfs_ioctl_clone_range_args a;
+ a.src_fd = from;
+ a.src_offset = srcoffclone;
+ a.src_length = lenclone;
+ a.dest_offset = dstoffclone;
+ err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
+ if (err >= 0) {
+ r += err;
+ } else if (errno == EINVAL) {
+ // Still failed, might be compressed
+ dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ } else {
+ return -errno;
+ }
+
+ // Take care any trimmed from front
+ if (srcoffclone != srcoff) {
+ err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+
+ // Copy end
+ if (srcoffclone + lenclone != srcoff + len) {
+ err = _copy_range(from, to,
+ srcoffclone + lenclone,
+ (srcoff + len) - (srcoffclone + lenclone),
+ dstoffclone + lenclone);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+ dout(20) << "clone_range: finished " << srcoff << "~" << len
+ << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+#endif
#if defined(__linux__)
#include <linux/fs.h>
-#include <syscall.h>
#endif
#include <iostream>
#include <sstream>
#include "FileStore.h"
+#include "GenericFileStoreBackend.h"
+#include "BtrfsFileStoreBackend.h"
#include "common/BackTrace.h"
#include "include/types.h"
#include "FileJournal.h"
#include "common/ceph_crypto.h"
using ceph::crypto::SHA1;
-#ifndef __CYGWIN__
-# include "btrfs_ioctl.h"
-#endif
-
#include "include/assert.h"
#include "common/config.h"
#undef dout_prefix
#define dout_prefix *_dout << "filestore(" << basedir << ") "
-#if defined(__linux__)
-# ifndef BTRFS_SUPER_MAGIC
-static const __SWORD_TYPE BTRFS_SUPER_MAGIC(0x9123683E);
-# endif
-# ifndef XFS_SUPER_MAGIC
-static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342);
-# endif
-#endif
-
#define COMMIT_SNAP_ITEM "snap_%lld"
#define CLUSTER_SNAP_ITEM "clustersnap_%s"
#define REPLAY_GUARD_XATTR "user.cephos.seq"
#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
-/*
- * long file names will have the following format:
- *
- * prefix_hash_index_cookie
- *
- * The prefix will just be the first X bytes of the original file name.
- * The cookie is a constant string that shows whether this file name
- * is hashed
- */
-
-#define FILENAME_LFN_DIGEST_SIZE CEPH_CRYPTO_SHA1_DIGESTSIZE
-
-#define FILENAME_MAX_LEN 4096 // the long file name size
-#define FILENAME_SHORT_LEN 255 // the short file name size
-#define FILENAME_COOKIE "long" // ceph long file name
-#define FILENAME_HASH_LEN FILENAME_LFN_DIGEST_SIZE
-#define FILENAME_EXTRA 4 // underscores and digit
-
-#define LFN_ATTR "user.cephos.lfn"
-
-#define FILENAME_PREFIX_LEN (FILENAME_SHORT_LEN - FILENAME_HASH_LEN - (sizeof(FILENAME_COOKIE) - 1) - FILENAME_EXTRA)
-#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
-#define ALIGNED(x, by) (!((x) % (by)))
-#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
void FileStore::FSPerfTracker::update_from_perfcounters(
PerfCounters &logger)
/* Ensure that replay of this op doesn't result in the object_map
* going away.
*/
- if (!btrfs_stable_commits)
+ if (!backend->can_checkpoint())
object_map->sync(&o, &spos);
}
}
FileStore::FileStore(const std::string &base, const std::string &jdev, const char *name, bool do_update) :
internal_name(name),
basedir(base), journalpath(jdev),
- btrfs(false),
- btrfs_stable_commits(false),
blk_size(0),
- btrfs_trans_start_end(false), btrfs_clone_range(false),
- btrfs_snap_create(false),
- btrfs_snap_destroy(false),
- btrfs_snap_create_v2(false),
- btrfs_wait_sync(false),
- ioctl_fiemap(false),
fsid_fd(-1), op_fd(-1),
basedir_fd(-1), current_fd(-1),
+ generic_backend(NULL), backend(NULL),
index_manager(do_update),
ondisk_finisher(g_ceph_context),
lock("FileStore::lock"),
g_conf->filestore_op_thread_suicide_timeout, &op_tp),
logger(NULL),
read_error_lock("FileStore::read_error_lock"),
- m_filestore_btrfs_clone_range(g_conf->filestore_btrfs_clone_range),
- m_filestore_btrfs_snap (g_conf->filestore_btrfs_snap ),
m_filestore_commit_timeout(g_conf->filestore_commit_timeout),
- m_filestore_fiemap(g_conf->filestore_fiemap),
- m_filestore_fsync_flushes_journal_data(g_conf->filestore_fsync_flushes_journal_data),
m_filestore_journal_parallel(g_conf->filestore_journal_parallel ),
m_filestore_journal_trailing(g_conf->filestore_journal_trailing),
m_filestore_journal_writeahead(g_conf->filestore_journal_writeahead),
g_ceph_context->get_perfcounters_collection()->add(logger);
g_ceph_context->_conf->add_observer(this);
+
+ generic_backend = new GenericFileStoreBackend(this);
+ backend = generic_backend;
}
FileStore::~FileStore()
g_ceph_context->_conf->remove_observer(this);
g_ceph_context->get_perfcounters_collection()->remove(logger);
+ delete generic_backend;
+
if (journal)
journal->logger = NULL;
delete logger;
return false;
}
-static int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
-{
- struct fiemap *fiemap = NULL;
- struct fiemap *_realloc_fiemap = NULL;
- int size;
- int ret;
-
- fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1);
- if (!fiemap)
- return -ENOMEM;
-
- fiemap->fm_start = start;
- fiemap->fm_length = len;
-
- fsync(fd); /* flush extents to disk if needed */
-
- if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
- ret = -errno;
- goto done_err;
- }
-
- size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
-
- _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) +
- size);
- if (!_realloc_fiemap) {
- ret = -ENOMEM;
- goto done_err;
- } else {
- fiemap = _realloc_fiemap;
- }
-
- memset(fiemap->fm_extents, 0, size);
-
- fiemap->fm_extent_count = fiemap->fm_mapped_extents;
- fiemap->fm_mapped_extents = 0;
-
- if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
- ret = -errno;
- goto done_err;
- }
- *pfiemap = fiemap;
-
- return 0;
-
-done_err:
- *pfiemap = NULL;
- free(fiemap);
- return ret;
-}
-
int FileStore::statfs(struct statfs *buf)
{
if (::statfs(basedir.c_str(), buf) < 0) {
int FileStore::mkfs()
{
int ret = 0;
- int basedir_fd;
char fsid_fn[PATH_MAX];
- struct stat st;
uuid_d old_fsid;
-#if defined(__linux__)
- struct btrfs_ioctl_vol_args volargs;
- memset(&volargs, 0, sizeof(volargs));
-#endif
-
dout(1) << "mkfs in " << basedir << dendl;
basedir_fd = ::open(basedir.c_str(), O_RDONLY);
if (basedir_fd < 0) {
goto close_fsid_fd;
}
- // current
- ret = ::stat(current_fn.c_str(), &st);
- if (ret == 0) {
- // current/ exists
- if (!S_ISDIR(st.st_mode)) {
- ret = -EINVAL;
- derr << "mkfs current/ exists but is not a directory" << dendl;
- goto close_fsid_fd;
- }
-
+ if (basefs.f_type == BTRFS_SUPER_MAGIC) {
#if defined(__linux__)
- // is current/ a btrfs subvolume?
- // check fsid, and compare st_dev to see if it's a subvolume.
- struct stat basest;
- struct statfs currentfs;
- ret = ::fstat(basedir_fd, &basest);
- if (ret < 0) {
- ret = -errno;
- derr << "mkfs cannot fstat basedir "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
- ret = ::statfs(current_fn.c_str(), ¤tfs);
- if (ret < 0) {
- ret = -errno;
- derr << "mkfs cannot statsf basedir "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
- if (basefs.f_type == BTRFS_SUPER_MAGIC &&
- currentfs.f_type == BTRFS_SUPER_MAGIC &&
- basest.st_dev != st.st_dev) {
- dout(2) << " current appears to be a btrfs subvolume" << dendl;
- btrfs_stable_commits = true;
- }
+ backend = new BtrfsFileStoreBackend(this);
#endif
- } else {
-#if defined(__linux__)
- if (basefs.f_type == BTRFS_SUPER_MAGIC) {
- volargs.fd = 0;
- strcpy(volargs.name, "current");
- if (::ioctl(basedir_fd, BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) {
- ret = -errno;
- derr << "mkfs: BTRFS_IOC_SUBVOL_CREATE failed with error "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
+ }
- dout(2) << " created btrfs subvol " << current_fn << dendl;
- if (::chmod(current_fn.c_str(), 0755) < 0) {
- ret = -errno;
- derr << "mkfs: failed to chmod " << current_fn << " to 0755: "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
- btrfs_stable_commits = true;
- } else
-#endif
- {
- if (::mkdir(current_fn.c_str(), 0755) < 0) {
- ret = -errno;
- derr << "mkfs: mkdir " << current_fn << " failed: "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
- }
+ ret = backend->create_current();
+ if (ret < 0) {
+ derr << "mkfs: failed to create current/ " << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
}
// write initial op_seq
goto close_fsid_fd;
}
- if (btrfs_stable_commits) {
+ if (backend->can_checkpoint()) {
// create snap_1 too
- snprintf(volargs.name, sizeof(volargs.name), COMMIT_SNAP_ITEM, 1ull);
- volargs.fd = ::open(current_fn.c_str(), O_RDONLY);
- assert(volargs.fd >= 0);
- if (::ioctl(basedir_fd, BTRFS_IOC_SNAP_CREATE, (unsigned long int)&volargs)) {
- ret = -errno;
- if (ret != -EEXIST) {
- TEMP_FAILURE_RETRY(::close(fd));
- TEMP_FAILURE_RETRY(::close(volargs.fd));
- derr << "mkfs: failed to create " << volargs.name << ": "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
- }
- if (::fchmod(volargs.fd, 0755)) {
+ current_fd = ::open(current_fn.c_str(), O_RDONLY);
+ assert(current_fd >= 0);
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull);
+ ret = backend->create_checkpoint(s, NULL);
+ TEMP_FAILURE_RETRY(::close(current_fd));
+ if (ret < 0 && ret != -EEXIST) {
TEMP_FAILURE_RETRY(::close(fd));
- TEMP_FAILURE_RETRY(::close(volargs.fd));
- ret = -errno;
- derr << "mkfs: failed to chmod " << basedir << "/" << volargs.name << " to 0755: "
- << cpp_strerror(ret) << dendl;
+ derr << "mkfs: failed to create snap_1: " << cpp_strerror(ret) << dendl;
goto close_fsid_fd;
}
- TEMP_FAILURE_RETRY(::close(volargs.fd));
}
}
TEMP_FAILURE_RETRY(::close(fd));
fsid_fd = -1;
close_basedir_fd:
TEMP_FAILURE_RETRY(::close(basedir_fd));
+ if (backend != generic_backend) {
+ delete backend;
+ backend = generic_backend;
+ }
return ret;
}
return inuse;
}
-int FileStore::_test_fiemap()
-{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/fiemap_test", basedir.c_str());
-
- int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644);
- if (fd < 0) {
- fd = -errno;
- derr << "_test_fiemap unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
- return fd;
- }
-
- // ext4 has a bug in older kernels where fiemap will return an empty
- // result in some cases. this is a file layout that triggers the bug
- // on 2.6.34-rc5.
- int v[] = {
- 0x0000000000016000, 0x0000000000007000,
- 0x000000000004a000, 0x0000000000007000,
- 0x0000000000060000, 0x0000000000001000,
- 0x0000000000061000, 0x0000000000008000,
- 0x0000000000069000, 0x0000000000007000,
- 0x00000000000a3000, 0x000000000000c000,
- 0x000000000024e000, 0x000000000000c000,
- 0x000000000028b000, 0x0000000000009000,
- 0x00000000002b1000, 0x0000000000003000,
- 0, 0
- };
- for (int i=0; v[i]; i++) {
- int off = v[i++];
- int len = v[i];
-
- // write a large extent
- char buf[len];
- memset(buf, 1, sizeof(buf));
- int r = ::lseek(fd, off, SEEK_SET);
- if (r < 0) {
- r = -errno;
- derr << "_test_fiemap failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
- TEMP_FAILURE_RETRY(::close(fd));
- return r;
- }
- r = safe_write(fd, buf, sizeof(buf));
- if (r < 0) {
- derr << "_test_fiemap failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
- TEMP_FAILURE_RETRY(::close(fd));
- return r;
- }
- }
- ::fsync(fd);
-
- // fiemap an extent inside that
- struct fiemap *fiemap;
- int r = do_fiemap(fd, 2430421, 59284, &fiemap);
- if (r < 0) {
- dout(0) << "mount FIEMAP ioctl is NOT supported" << dendl;
- ioctl_fiemap = false;
- } else {
- if (fiemap->fm_mapped_extents == 0) {
- dout(0) << "mount FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
- ioctl_fiemap = false;
- } else {
- dout(0) << "mount FIEMAP ioctl is supported and appears to work" << dendl;
- ioctl_fiemap = true;
- }
- }
- if (!m_filestore_fiemap) {
- dout(0) << "mount FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
- ioctl_fiemap = false;
- }
- free(fiemap);
-
- ::unlink(fn);
- TEMP_FAILURE_RETRY(::close(fd));
- return 0;
-}
-
int FileStore::_detect_fs()
{
- int fd = ::open(basedir.c_str(), O_RDONLY);
- if (fd < 0)
- return -errno;
-
struct statfs st;
- int r = ::fstatfs(fd, &st);
- if (r < 0) {
- TEMP_FAILURE_RETRY(::close(fd));
+ int r = ::fstatfs(basedir_fd, &st);
+ if (r < 0)
return -errno;
- }
#if defined(__linux__)
- if (st.f_type == XFS_SUPER_MAGIC) {
+ if (st.f_type == BTRFS_SUPER_MAGIC) {
+ dout(0) << "mount detected btrfs" << dendl;
+ backend = new BtrfsFileStoreBackend(this);
+
+ wbthrottle.set_fs(WBThrottle::BTRFS);
+ } else if (st.f_type == XFS_SUPER_MAGIC) {
dout(1) << "mount detected xfs" << dendl;
if (m_filestore_replica_fadvise) {
dout(1) << " disabling 'filestore replica fadvise' due to known issues with fadvise(DONTNEED) on xfs" << dendl;
}
#endif
+ r = backend->detect_features();
+ if (r < 0) {
+ derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
// test xattrs
char fn[PATH_MAX];
int x = rand();
if (tmpfd < 0) {
int ret = -errno;
derr << "_detect_fs unable to create " << fn << ": " << cpp_strerror(ret) << dendl;
- TEMP_FAILURE_RETRY(::close(fd));
return ret;
}
<< "file system with the 'user_xattr' option." << dendl;
::unlink(fn);
TEMP_FAILURE_RETRY(::close(tmpfd));
- TEMP_FAILURE_RETRY(::close(fd));
return -ENOTSUP;
}
::unlink(fn);
TEMP_FAILURE_RETRY(::close(tmpfd));
- r = _test_fiemap();
- if (r < 0) {
- TEMP_FAILURE_RETRY(::close(fd));
- return -r;
- }
-
- blk_size = st.f_bsize;
-
-#if defined(__linux__)
- if (st.f_type == BTRFS_SUPER_MAGIC) {
- dout(0) << "mount detected btrfs" << dendl;
- wbthrottle.set_fs(WBThrottle::BTRFS);
- btrfs = true;
-
- btrfs_stable_commits = btrfs && m_filestore_btrfs_snap;
-
- // clone_range?
- if (m_filestore_btrfs_clone_range) {
- btrfs_clone_range = true;
- int r = _do_clone_range(fsid_fd, -1, 0, 1, 0);
- if (r == -EBADF) {
- dout(0) << "mount btrfs CLONE_RANGE ioctl is supported" << dendl;
- } else {
- btrfs_clone_range = false;
- dout(0) << "mount btrfs CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
- }
- } else {
- dout(0) << "mount btrfs CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
- }
-
- struct btrfs_ioctl_vol_args vol_args;
- memset(&vol_args, 0, sizeof(vol_args));
-
- // create test source volume
- vol_args.fd = 0;
- strcpy(vol_args.name, "test_subvol");
- r = ::ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, &vol_args);
- if (r != 0) {
- r = -errno;
- dout(0) << "mount failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
- }
- int srcfd = ::openat(fd, vol_args.name, O_RDONLY);
- if (srcfd < 0) {
- r = -errno;
- dout(0) << "mount failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
- }
-
- // snap_create and snap_destroy?
- vol_args.fd = srcfd;
- strcpy(vol_args.name, "sync_snap_test");
- r = ::ioctl(fd, BTRFS_IOC_SNAP_CREATE, &vol_args);
- int err = errno;
- if (r == 0 || errno == EEXIST) {
- dout(0) << "mount btrfs SNAP_CREATE is supported" << dendl;
- btrfs_snap_create = true;
-
- r = ::ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (r == 0) {
- dout(0) << "mount btrfs SNAP_DESTROY is supported" << dendl;
- btrfs_snap_destroy = true;
- } else {
- err = -errno;
- dout(0) << "mount btrfs SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
-
- if (err == -EPERM && getuid() != 0) {
- dout(0) << "btrfs SNAP_DESTROY failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
- cerr << TEXT_YELLOW
- << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
- << TEXT_NORMAL
- << std::endl;
- } else if (err == -EOPNOTSUPP) {
- derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
- }
- }
- } else {
- dout(0) << "mount btrfs SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
- }
-
- if (m_filestore_btrfs_snap && !btrfs_snap_destroy) {
- dout(0) << "mount btrfs snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
- btrfs_stable_commits = false;
- }
-
- // start_sync?
- __u64 transid = 0;
- r = ::ioctl(fd, BTRFS_IOC_START_SYNC, &transid);
- if (r < 0) {
- int err = errno;
- dout(0) << "mount btrfs START_SYNC got " << cpp_strerror(err) << dendl;
- }
- if (r == 0 && transid > 0) {
- dout(0) << "mount btrfs START_SYNC is supported (transid " << transid << ")" << dendl;
-
- // do we have wait_sync too?
- r = ::ioctl(fd, BTRFS_IOC_WAIT_SYNC, &transid);
- if (r == 0 || errno == ERANGE) {
- dout(0) << "mount btrfs WAIT_SYNC is supported" << dendl;
- btrfs_wait_sync = true;
- } else {
- int err = errno;
- dout(0) << "mount btrfs WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
- }
- } else {
- int err = errno;
- dout(0) << "mount btrfs START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
- }
-
- if (btrfs_wait_sync) {
- // async snap creation?
- struct btrfs_ioctl_vol_args_v2 async_args;
- memset(&async_args, 0, sizeof(async_args));
- async_args.fd = srcfd;
- async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
- strcpy(async_args.name, "async_snap_test");
-
- // remove old one, first
- struct stat st;
- strcpy(vol_args.name, async_args.name);
- if (::fstatat(fd, vol_args.name, &st, 0) == 0) {
- dout(0) << "mount btrfs removing old async_snap_test" << dendl;
- r = ::ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (r != 0) {
- int err = errno;
- dout(0) << "mount failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
- }
- }
-
- r = ::ioctl(fd, BTRFS_IOC_SNAP_CREATE_V2, &async_args);
- if (r == 0 || errno == EEXIST) {
- dout(0) << "mount btrfs SNAP_CREATE_V2 is supported" << dendl;
- btrfs_snap_create_v2 = true;
-
- // clean up
- strcpy(vol_args.name, "async_snap_test");
- r = ::ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (r != 0) {
- int err = errno;
- dout(0) << "mount btrfs SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
- }
- } else {
- int err = errno;
- dout(0) << "mount btrfs SNAP_CREATE_V2 is NOT supported: "
- << cpp_strerror(err) << dendl;
- }
- }
-
- // clean up test subvol
- if (srcfd >= 0)
- TEMP_FAILURE_RETRY(::close(srcfd));
-
- strcpy(vol_args.name, "test_subvol");
- r = ::ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (r < 0) {
- r = -errno;
- dout(0) << "mount failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
- }
-
- if (m_filestore_btrfs_snap && !btrfs_snap_create_v2) {
- dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
- cerr << TEXT_YELLOW
- << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
- << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
- << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
- << " performance.\n"
- << TEXT_NORMAL;
- }
-
- } else
-#endif /* __linux__ */
- {
- dout(0) << "mount did NOT detect btrfs" << dendl;
- btrfs = false;
- }
-
- bool have_syncfs = false;
-#ifdef HAVE_SYS_SYNCFS
- if (syncfs(fd) == 0) {
- dout(0) << "mount syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
- have_syncfs = true;
- } else {
- dout(0) << "mount syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
- }
-#elif defined(SYS_syncfs)
- if (syscall(SYS_syncfs, fd) == 0) {
- dout(0) << "mount syscall(SYS_syncfs, fd) fully supported" << dendl;
- have_syncfs = true;
- } else {
- dout(0) << "mount syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
- }
-#elif defined(__NR_syncfs)
- if (syscall(__NR_syncfs, fd) == 0) {
- dout(0) << "mount syscall(__NR_syncfs, fd) fully supported" << dendl;
- have_syncfs = true;
- } else {
- dout(0) << "mount syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
- }
-#endif
- if (!have_syncfs) {
- dout(0) << "mount syncfs(2) syscall not supported" << dendl;
- if (btrfs) {
- dout(0) << "mount no syncfs(2), but the btrfs SYNC ioctl will suffice" << dendl;
- } else if (m_filestore_fsync_flushes_journal_data) {
- dout(0) << "mount no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
- } else {
- dout(0) << "mount no syncfs(2), must use sync(2)." << dendl;
- dout(0) << "mount WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
- }
- }
-
- TEMP_FAILURE_RETRY(::close(fd));
return 0;
}
return -EINVAL;
}
- if (!btrfs) {
+ if (!backend->can_checkpoint()) {
if (!journal || !m_filestore_journal_writeahead) {
dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl;
cerr << TEXT_RED
dout(10) << "mount fsid is " << fsid << dendl;
- // test for btrfs, xattrs, etc.
- ret = _detect_fs();
- if (ret)
- goto close_fsid_fd;
uint32_t version_stamp;
ret = version_stamp_is_valid(&version_stamp);
goto close_fsid_fd;
}
+ // test for btrfs, xattrs, etc.
+ ret = _detect_fs();
+ if (ret < 0) {
+ derr << "FileStore::mount : error in _detect_fs: "
+ << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
{
- // get snap list
- DIR *dir = ::opendir(basedir.c_str());
- if (!dir) {
- ret = -errno;
- derr << "FileStore::mount: opendir '" << basedir << "' failed: "
- << cpp_strerror(ret) << dendl;
+ list<string> ls;
+ ret = backend->list_checkpoints(ls);
+ if (ret < 0) {
+ derr << "FileStore::mount : error in _list_snaps: "<< cpp_strerror(ret) << dendl;
goto close_basedir_fd;
}
- struct dirent *de;
- while (::readdir_r(dir, (struct dirent *)buf, &de) == 0) {
- if (!de)
- break;
- long long unsigned c;
- char clustersnap[PATH_MAX];
- if (sscanf(de->d_name, COMMIT_SNAP_ITEM, &c) == 1)
+ long long unsigned c, prev = 0;
+ char clustersnap[NAME_MAX];
+ for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
+ if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
+ assert(c > prev);
+ prev = c;
snaps.push_back(c);
- else if (sscanf(de->d_name, CLUSTER_SNAP_ITEM, clustersnap) == 1)
- cluster_snaps.insert(clustersnap);
- }
-
- if (::closedir(dir) < 0) {
- ret = -errno;
- derr << "FileStore::closedir(basedir) failed: error " << cpp_strerror(ret)
- << dendl;
- goto close_basedir_fd;
+ } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
+ cluster_snaps.insert(*it);
}
-
- dout(0) << "mount found snaps " << snaps << dendl;
- if (!cluster_snaps.empty())
- dout(0) << "mount found cluster snaps " << cluster_snaps << dendl;
}
if (m_osd_rollback_to_cluster_snap.length() &&
char nosnapfn[200];
snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str());
- if (btrfs_stable_commits) {
+ if (backend->can_checkpoint()) {
if (snaps.empty()) {
dout(0) << "mount WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
- } else if (!btrfs) {
- dout(0) << "mount WARNING: not btrfs, store may be in inconsistent state" << dendl;
} else {
- char s[PATH_MAX];
+ char s[NAME_MAX];
uint64_t curr_seq = 0;
if (m_osd_rollback_to_cluster_snap.length()) {
<< TEXT_NORMAL
<< dendl;
assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
- snprintf(s, sizeof(s), "%s/" CLUSTER_SNAP_ITEM, basedir.c_str(),
- m_osd_rollback_to_cluster_snap.c_str());
+ snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
} else {
{
int fd = read_op_seq(&curr_seq);
}
dout(10) << "mount rolling back to consistent snap " << cp << dendl;
- snprintf(s, sizeof(s), "%s/" COMMIT_SNAP_ITEM, basedir.c_str(), (long long unsigned)cp);
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
}
- btrfs_ioctl_vol_args vol_args;
- memset(&vol_args, 0, sizeof(vol_args));
- vol_args.fd = 0;
- strcpy(vol_args.name, "current");
-
// drop current?
- if (curr_seq > 0) {
- ret = ::ioctl(basedir_fd, BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (ret) {
- ret = -errno;
- derr << "FileStore::mount: error removing old current subvol: " << cpp_strerror(ret) << dendl;
- char s[PATH_MAX];
- snprintf(s, sizeof(s), "%s/current.remove.me.%d", basedir.c_str(), rand());
- if (::rename(current_fn.c_str(), s)) {
- ret = -errno;
- derr << "FileStore::mount: error renaming old current subvol: "
- << cpp_strerror(ret) << dendl;
- goto close_basedir_fd;
- }
- }
- }
-
- // roll back
- vol_args.fd = ::open(s, O_RDONLY);
- if (vol_args.fd < 0) {
- ret = -errno;
- derr << "FileStore::mount: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
- goto close_basedir_fd;
- }
- if (::ioctl(basedir_fd, BTRFS_IOC_SNAP_CREATE, &vol_args)) {
- ret = -errno;
- derr << "FileStore::mount: error ioctl(BTRFS_IOC_SNAP_CREATE) failed: " << cpp_strerror(ret) << dendl;
- TEMP_FAILURE_RETRY(::close(vol_args.fd));
+ ret = backend->rollback_to(s);
+ if (ret) {
+ derr << "FileStore::mount: error rolling back to " << s << ": "
+ << cpp_strerror(ret) << dendl;
goto close_basedir_fd;
}
- TEMP_FAILURE_RETRY(::close(vol_args.fd));
}
}
initial_op_seq = 0;
goto close_current_fd;
}
- if (!btrfs_stable_commits) {
+ if (!backend->can_checkpoint()) {
// mark current/ as non-snapshotted so that we don't rollback away
// from it.
int r = ::creat(nosnapfn, 0644);
if (!m_filestore_journal_writeahead &&
!m_filestore_journal_parallel &&
!m_filestore_journal_trailing) {
- if (!btrfs) {
- m_filestore_journal_writeahead = true;
- dout(0) << "mount: enabling WRITEAHEAD journal mode: btrfs not detected" << dendl;
- } else if (!btrfs_stable_commits) {
+ if (!backend->can_checkpoint()) {
m_filestore_journal_writeahead = true;
- dout(0) << "mount: enabling WRITEAHEAD journal mode: 'filestore btrfs snap' mode is not enabled" << dendl;
- } else if (!btrfs_snap_create_v2) {
- m_filestore_journal_writeahead = true;
- dout(0) << "mount: enabling WRITEAHEAD journal mode: btrfs SNAP_CREATE_V2 ioctl not detected (v2.6.37+)" << dendl;
+ dout(0) << "mount: enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
} else {
m_filestore_journal_parallel = true;
- dout(0) << "mount: enabling PARALLEL journal mode: btrfs, SNAP_CREATE_V2 detected and 'filestore btrfs snap' mode is enabled" << dendl;
+ dout(0) << "mount: enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl;
}
} else {
if (m_filestore_journal_writeahead)
TEMP_FAILURE_RETRY(::close(basedir_fd));
basedir_fd = -1;
}
+
+ if (backend != generic_backend) {
+ delete backend;
+ backend = generic_backend;
+ }
+
object_map.reset();
{
uint64_t max_ops = m_filestore_queue_max_ops;
uint64_t max_bytes = m_filestore_queue_max_bytes;
- if (btrfs_stable_commits && is_committing()) {
+ if (backend->can_checkpoint() && is_committing()) {
max_ops += m_filestore_queue_committing_max_ops;
max_bytes += m_filestore_queue_committing_max_bytes;
}
void FileStore::_set_global_replay_guard(coll_t cid,
const SequencerPosition &spos)
{
- if (btrfs_stable_commits)
+ if (backend->can_checkpoint())
return;
// sync all previous operations on this sequencer
int FileStore::_check_global_replay_guard(coll_t cid,
const SequencerPosition& spos)
{
- if (!replaying || btrfs_stable_commits)
+ if (!replaying || backend->can_checkpoint())
return 1;
char fn[PATH_MAX];
const hobject_t *hoid,
bool in_progress)
{
- if (btrfs_stable_commits)
+ if (backend->can_checkpoint())
return;
dout(10) << "_set_replay_guard " << spos << (in_progress ? " START" : "") << dendl;
void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
{
- if (btrfs_stable_commits)
+ if (backend->can_checkpoint())
return;
dout(10) << "_close_replay_guard " << spos << dendl;
int FileStore::_check_replay_guard(coll_t cid, hobject_t oid, const SequencerPosition& spos)
{
- if (!replaying || btrfs_stable_commits)
+ if (!replaying || backend->can_checkpoint())
return 1;
int r = _check_global_replay_guard(cid, spos);
int FileStore::_check_replay_guard(coll_t cid, const SequencerPosition& spos)
{
- if (!replaying || btrfs_stable_commits)
+ if (!replaying || backend->can_checkpoint())
return 1;
char fn[PATH_MAX];
int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
{
- if (!replaying || btrfs_stable_commits)
+ if (!replaying || backend->can_checkpoint())
return 1;
char buf[100];
op == Transaction::OP_CLONERANGE2 ||
op == Transaction::OP_COLL_ADD))
// -ENOENT is normally okay
- // ...including on a replayed OP_RMCOLL with !stable_commits
+ // ...including on a replayed OP_RMCOLL with checkpoint mode
ok = true;
if (r == -ENOENT && (
op == Transaction::OP_COLL_ADD &&
if (r == -ENODATA)
ok = true;
- if (replaying && !btrfs_stable_commits) {
+ if (replaying && !backend->can_checkpoint()) {
if (r == -EEXIST && op == Transaction::OP_MKCOLL) {
- dout(10) << "tolerating EEXIST during journal replay on non-btrfs" << dendl;
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
ok = true;
}
if (r == -EEXIST && op == Transaction::OP_COLL_ADD) {
- dout(10) << "tolerating EEXIST during journal replay since btrfs_snap is not enabled" << dendl;
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
ok = true;
}
if (r == -EEXIST && op == Transaction::OP_COLL_MOVE) {
- dout(10) << "tolerating EEXIST during journal replay since btrfs_snap is not enabled" << dendl;
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
ok = true;
}
if (r == -ERANGE) {
uint64_t offset, size_t len,
bufferlist& bl)
{
- if (!ioctl_fiemap || len <= (size_t)m_filestore_fiemap_threshold) {
+ if (!backend->has_fiemap() || len <= (size_t)m_filestore_fiemap_threshold) {
map<uint64_t, uint64_t> m;
m[offset] = len;
::encode(m, bl);
} else {
uint64_t i;
- r = do_fiemap(**fd, offset, len, &fiemap);
+ r = backend->do_fiemap(**fd, offset, len, &fiemap);
if (r < 0)
goto done;
int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
{
- dout(20) << "_do_clone_range " << srcoff << "~" << len << " to " << dstoff << dendl;
- if (!btrfs_clone_range ||
- srcoff % blk_size != dstoff % blk_size) {
- dout(20) << "_do_clone_range using copy" << dendl;
- return _do_copy_range(from, to, srcoff, len, dstoff);
- }
- int err = 0;
- int r = 0;
-
- uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size);
- uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size);
- if (srcoffclone >= srcoff + len) {
- dout(20) << "_do_clone_range using copy, extent too short to align srcoff" << dendl;
- return _do_copy_range(from, to, srcoff, len, dstoff);
- }
-
- uint64_t lenclone = len - (srcoffclone - srcoff);
- if (!ALIGNED(lenclone, blk_size)) {
- struct stat from_stat, to_stat;
- err = ::fstat(from, &from_stat);
- if (err) return -errno;
- err = ::fstat(to , &to_stat);
- if (err) return -errno;
-
- if (srcoff + len != (uint64_t)from_stat.st_size ||
- dstoff + len < (uint64_t)to_stat.st_size) {
- // Not to the end of the file, need to align length as well
- lenclone = ALIGN_DOWN(lenclone, blk_size);
- }
- }
- if (lenclone == 0) {
- // too short
- return _do_copy_range(from, to, srcoff, len, dstoff);
- }
-
- dout(20) << "_do_clone_range cloning " << srcoffclone << "~" << lenclone
- << " to " << dstoffclone << " = " << r << dendl;
- btrfs_ioctl_clone_range_args a;
- a.src_fd = from;
- a.src_offset = srcoffclone;
- a.src_length = lenclone;
- a.dest_offset = dstoffclone;
- err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
- if (err >= 0) {
- r += err;
- } else if (errno == EINVAL) {
- // Still failed, might be compressed
- dout(20) << "_do_clone_range failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
- return _do_copy_range(from, to, srcoff, len, dstoff);
- } else {
- return -errno;
- }
-
- // Take care any trimmed from front
- if (srcoffclone != srcoff) {
- err = _do_copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
- if (err >= 0) {
- r += err;
- } else {
- return err;
- }
- }
-
- // Copy end
- if (srcoffclone + lenclone != srcoff + len) {
- err = _do_copy_range(from, to,
- srcoffclone + lenclone,
- (srcoff + len) - (srcoffclone + lenclone),
- dstoffclone + lenclone);
- if (err >= 0) {
- r += err;
- } else {
- return err;
- }
- }
- dout(20) << "_do_clone_range finished " << srcoff << "~" << len
- << " to " << dstoff << " = " << r << dendl;
- return r;
+ dout(20) << "_do_clone_range copy " << srcoff << "~" << len << " to " << dstoff << dendl;
+ return backend->clone_range(from, to, srcoff, len, dstoff);
}
int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
assert(0);
}
- if (btrfs_stable_commits) {
+ if (backend->can_checkpoint()) {
int err = write_op_seq(op_fd, cp);
if (err < 0) {
derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
assert(0 == "error during write_op_seq");
}
- if (btrfs_snap_create_v2) {
- // be smart!
- struct btrfs_ioctl_vol_args_v2 async_args;
- memset(&async_args, 0, sizeof(async_args));
- async_args.fd = current_fd;
- async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
- snprintf(async_args.name, sizeof(async_args.name), COMMIT_SNAP_ITEM,
- (long long unsigned)cp);
-
- dout(10) << "taking async snap '" << async_args.name << "'" << dendl;
- int r = ::ioctl(basedir_fd, BTRFS_IOC_SNAP_CREATE_V2, &async_args);
- if (r < 0) {
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+ uint64_t cid = 0;
+ err = backend->create_checkpoint(s, &cid);
+ if (err < 0) {
int err = errno;
- derr << "async snap create '" << async_args.name << "' transid " << async_args.transid
- << " got " << cpp_strerror(err) << dendl;
- assert(0 == "async snap ioctl error");
- }
- dout(20) << "async snap create '" << async_args.name << "' transid " << async_args.transid << dendl;
-
- snaps.push_back(cp);
+ derr << "snap create '" << s << "' got error " << err << dendl;
+ assert(err == 0);
+ }
- apply_manager.commit_started();
- op_tp.unpause();
+ snaps.push_back(cp);
+ apply_manager.commit_started();
+ op_tp.unpause();
- // wait for commit
- dout(20) << " waiting for transid " << async_args.transid << " to complete" << dendl;
- r = ::ioctl(op_fd, BTRFS_IOC_WAIT_SYNC, &async_args.transid);
- if (r < 0) {
- int err = errno;
+ if (cid > 0) {
+ dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl;
+ err = backend->sync_checkpoint(cid);
+ if (err < 0) {
derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl;
assert(0 == "wait_sync got error");
}
- dout(20) << " done waiting for transid " << async_args.transid << " to complete" << dendl;
-
- } else {
- // the synchronous snap create does a sync.
- struct btrfs_ioctl_vol_args vol_args;
- memset(&vol_args, 0, sizeof(vol_args));
- vol_args.fd = current_fd;
- snprintf(vol_args.name, sizeof(vol_args.name), COMMIT_SNAP_ITEM,
- (long long unsigned)cp);
-
- dout(10) << "taking snap '" << vol_args.name << "'" << dendl;
- int r = ::ioctl(basedir_fd, BTRFS_IOC_SNAP_CREATE, &vol_args);
- if (r != 0) {
- int err = errno;
- derr << "snap create '" << vol_args.name << "' got error " << err << dendl;
- assert(r == 0);
- }
- dout(20) << "snap create '" << vol_args.name << "' succeeded." << dendl;
- assert(r == 0);
- snaps.push_back(cp);
-
- apply_manager.commit_started();
- op_tp.unpause();
+ dout(20) << " done waiting for checkpoint" << cid << " to complete" << dendl;
}
} else
{
apply_manager.commit_started();
op_tp.unpause();
- if (btrfs) {
- dout(15) << "sync_entry doing btrfs SYNC" << dendl;
- // do a full btrfs commit
- int r = ::ioctl(op_fd, BTRFS_IOC_SYNC);
- if (r < 0) {
- r = -errno;
- derr << "sync_entry btrfs IOC_SYNC got " << cpp_strerror(r) << dendl;
- assert(0 == "btrfs sync ioctl returned error");
- }
- } else
- if (m_filestore_fsync_flushes_journal_data) {
- dout(15) << "sync_entry doing fsync on " << current_op_seq_fn << dendl;
- // make the file system's journal commit.
- // this works with ext3, but NOT ext4
- ::fsync(op_fd);
- } else {
- dout(15) << "sync_entry doing a full sync (syncfs(2) if possible)" << dendl;
- sync_filesystem(basedir_fd);
+ int err = backend->syncfs();
+ if (err < 0) {
+ derr << "syncfs got " << cpp_strerror(err) << dendl;
+ assert(0 == "syncfs returned error");
}
- int err = write_op_seq(op_fd, cp);
+ err = write_op_seq(op_fd, cp);
if (err < 0) {
derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
assert(0 == "error during write_op_seq");
logger->set(l_os_committing, 0);
// remove old snaps?
- if (btrfs_stable_commits) {
+ if (backend->can_checkpoint()) {
+ char s[NAME_MAX];
while (snaps.size() > 2) {
- btrfs_ioctl_vol_args vol_args;
- memset(&vol_args, 0, sizeof(vol_args));
- vol_args.fd = 0;
- snprintf(vol_args.name, sizeof(vol_args.name), COMMIT_SNAP_ITEM,
- (long long unsigned)snaps.front());
-
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front());
snaps.pop_front();
- dout(10) << "removing snap '" << vol_args.name << "'" << dendl;
- int r = ::ioctl(basedir_fd, BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ dout(10) << "removing snap '" << s << "'" << dendl;
+ int r = backend->destroy_checkpoint(s);
if (r) {
int err = errno;
- derr << "unable to destroy snap '" << vol_args.name << "' got " << cpp_strerror(err) << dendl;
+ derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl;
}
}
}
dout(10) << "snapshot " << name << dendl;
sync_and_flush();
- if (!btrfs) {
- dout(0) << "snapshot " << name << " failed, no btrfs" << dendl;
+ if (!backend->can_checkpoint()) {
+ dout(0) << "snapshot " << name << " failed, not supported" << dendl;
return -EOPNOTSUPP;
}
- btrfs_ioctl_vol_args vol_args;
- vol_args.fd = current_fd;
- snprintf(vol_args.name, sizeof(vol_args.name), CLUSTER_SNAP_ITEM, name.c_str());
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, name.c_str());
- int r = ::ioctl(basedir_fd, BTRFS_IOC_SNAP_CREATE, &vol_args);
+ int r = backend->create_checkpoint(s, NULL);
if (r) {
r = -errno;
derr << "snapshot " << name << " failed: " << cpp_strerror(r) << dendl;
int ret = 0;
if (::rename(old_coll, new_coll)) {
- if (replaying && !btrfs_stable_commits &&
+ if (replaying && !backend->can_checkpoint() &&
(errno == EEXIST || errno == ENOTEMPTY))
ret = _collection_remove_recursive(cid, spos);
else
}
r = lfn_link(oldcid, c, o);
- if (replaying && !btrfs_stable_commits &&
+ if (replaying && !backend->can_checkpoint() &&
r == -EEXIST) // crashed between link() and set_replay_guard()
r = 0;