From: Sage Weil Date: Wed, 23 Dec 2015 15:58:30 +0000 (-0500) Subject: os/filestore: move FileStore to os/filestore/* X-Git-Tag: v10.0.3~154^2~40 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ba2cc1eb6e0e869bf66f884290a40e99ae5fbc37;p=ceph.git os/filestore: move FileStore to os/filestore/* Signed-off-by: Sage Weil --- diff --git a/src/os/BtrfsFileStoreBackend.cc b/src/os/BtrfsFileStoreBackend.cc deleted file mode 100644 index 8c2273344a49..000000000000 --- a/src/os/BtrfsFileStoreBackend.cc +++ /dev/null @@ -1,578 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "include/int_types.h" -#include "include/types.h" - -#include -#include -#include -#include -#include -#include -#include -#include "include/compat.h" -#include "include/linux_fiemap.h" -#include "include/color.h" -#include "include/buffer.h" -#include "include/assert.h" - -#ifndef __CYGWIN__ -#include "btrfs_ioctl.h" -#endif - -#include -#include -#include - -#include "BtrfsFileStoreBackend.h" - -#include "common/errno.h" -#include "common/config.h" - -#if defined(__linux__) - -#define dout_subsys ceph_subsys_filestore -#undef dout_prefix -#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") " - -#define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) -#define ALIGNED(x, by) (!((x) % (by))) -#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? 
(x) : (ALIGN_DOWN((x), (by)) + (by))) - -BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs): - GenericFileStoreBackend(fs), has_clone_range(false), - has_snap_create(false), has_snap_destroy(false), - has_snap_create_v2(false), has_wait_sync(false), stable_commits(false), - m_filestore_btrfs_clone_range(g_conf->filestore_btrfs_clone_range), - m_filestore_btrfs_snap (g_conf->filestore_btrfs_snap) { } - -int BtrfsFileStoreBackend::detect_features() -{ - int r; - - r = GenericFileStoreBackend::detect_features(); - if (r < 0) - return r; - - // clone_range? - if (m_filestore_btrfs_clone_range) { - int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY, 0600); - if (fd >= 0) { - if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) { - r = -errno; - dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: " - << cpp_strerror(r) << dendl; - } - btrfs_ioctl_clone_range_args clone_args; - memset(&clone_args, 0, sizeof(clone_args)); - clone_args.src_fd = -1; - r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args); - if (r < 0 && errno == EBADF) { - dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl; - has_clone_range = true; - } else { - r = -errno; - dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl; - } - TEMP_FAILURE_RETRY(::close(fd)); - } else { - r = -errno; - dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: " - << cpp_strerror(r) << dendl; - } - } else { - dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl; - } - - struct btrfs_ioctl_vol_args vol_args; - memset(&vol_args, 0, sizeof(vol_args)); - - // create test source volume - vol_args.fd = 0; - strcpy(vol_args.name, "test_subvol"); - r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args); - if (r != 0) { - r = -errno; - dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": 
" << cpp_strerror(r) << dendl; - } - int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY); - if (srcfd < 0) { - r = -errno; - dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl; - } - - // snap_create and snap_destroy? - vol_args.fd = srcfd; - strcpy(vol_args.name, "sync_snap_test"); - r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); - int err = errno; - if (r == 0 || errno == EEXIST) { - dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl; - has_snap_create = true; - - r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); - if (r == 0) { - dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl; - has_snap_destroy = true; - } else { - err = -errno; - dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl; - - if (err == -EPERM && getuid() != 0) { - dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl; - cerr << TEXT_YELLOW - << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed" - << TEXT_NORMAL << std::endl; - } else if (err == -EOPNOTSUPP) { - derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl; - } - } - } else { - dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl; - } - - if (m_filestore_btrfs_snap) { - if (has_snap_destroy) - stable_commits = true; - else - dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl; - } - - // start_sync? - __u64 transid = 0; - r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid); - if (r < 0) { - int err = errno; - dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl; - } - if (r == 0 && transid > 0) { - dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl; - - // do we have wait_sync too? 
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid); - if (r == 0 || errno == ERANGE) { - dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl; - has_wait_sync = true; - } else { - int err = errno; - dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl; - } - } else { - int err = errno; - dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl; - } - - if (has_wait_sync) { - // async snap creation? - struct btrfs_ioctl_vol_args_v2 async_args; - memset(&async_args, 0, sizeof(async_args)); - async_args.fd = srcfd; - async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC; - strcpy(async_args.name, "async_snap_test"); - - // remove old one, first - struct stat st; - strcpy(vol_args.name, async_args.name); - if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) { - dout(0) << "detect_feature: removing old async_snap_test" << dendl; - r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); - if (r != 0) { - int err = errno; - dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl; - } - } - - r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args); - if (r == 0 || errno == EEXIST) { - dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl; - has_snap_create_v2 = true; - - // clean up - strcpy(vol_args.name, "async_snap_test"); - r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); - if (r != 0) { - int err = errno; - dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl; - } - } else { - int err = errno; - dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl; - } - } - - // clean up test subvol - if (srcfd >= 0) - TEMP_FAILURE_RETRY(::close(srcfd)); - - strcpy(vol_args.name, "test_subvol"); - r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); - if (r < 0) { - r = -errno; - dout(0) << "detect_feature: failed to remove " 
<< vol_args.name << ": " << cpp_strerror(r) << dendl; - } - - if (m_filestore_btrfs_snap && !has_snap_create_v2) { - dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl; - cerr << TEXT_YELLOW - << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n" - << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n" - << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n" - << " performance.\n" - << TEXT_NORMAL; - } - - return 0; -} - -bool BtrfsFileStoreBackend::can_checkpoint() -{ - return stable_commits; -} - -int BtrfsFileStoreBackend::create_current() -{ - struct stat st; - int ret = ::stat(get_current_path().c_str(), &st); - if (ret == 0) { - // current/ exists - if (!S_ISDIR(st.st_mode)) { - dout(0) << "create_current: current/ exists but is not a directory" << dendl; - return -EINVAL; - } - - struct stat basest; - struct statfs currentfs; - ret = ::fstat(get_basedir_fd(), &basest); - if (ret < 0) { - ret = -errno; - dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl; - return ret; - } - ret = ::statfs(get_current_path().c_str(), ¤tfs); - if (ret < 0) { - ret = -errno; - dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl; - return ret; - } - if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) { - dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl; - stable_commits = true; - } - return 0; - } - - struct btrfs_ioctl_vol_args volargs; - memset(&volargs, 0, sizeof(volargs)); - - volargs.fd = 0; - strcpy(volargs.name, "current"); - if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) { - ret = -errno; - dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error " - << cpp_strerror(ret) << dendl; - return ret; - } - - dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl; - if 
(::chmod(get_current_path().c_str(), 0755) < 0) { - ret = -errno; - dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: " - << cpp_strerror(ret) << dendl; - return ret; - } - - stable_commits = true; - return 0; -} - -int BtrfsFileStoreBackend::list_checkpoints(list& ls) -{ - int ret, err = 0; - - struct stat basest; - ret = ::fstat(get_basedir_fd(), &basest); - if (ret < 0) { - ret = -errno; - dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl; - return ret; - } - - // get snap list - DIR *dir = ::opendir(get_basedir_path().c_str()); - if (!dir) { - ret = -errno; - dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: " - << cpp_strerror(ret) << dendl; - return ret; - } - - list snaps; - char path[PATH_MAX]; - char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1]; - struct dirent *de; - while (::readdir_r(dir, (struct dirent *)&buf, &de) == 0) { - if (!de) - break; - - snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name); - - struct stat st; - ret = ::stat(path, &st); - if (ret < 0) { - err = -errno; - dout(0) << "list_checkpoints: stat '" << path << "' failed: " - << cpp_strerror(err) << dendl; - break; - } - - if (!S_ISDIR(st.st_mode)) - continue; - - struct statfs fs; - ret = ::statfs(path, &fs); - if (ret < 0) { - err = -errno; - dout(0) << "list_checkpoints: statfs '" << path << "' failed: " - << cpp_strerror(err) << dendl; - break; - } - - if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) - snaps.push_back(string(de->d_name)); - } - - if (::closedir(dir) < 0) { - ret = -errno; - dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl; - if (!err) - err = ret; - } - - if (err) - return err; - - ls.swap(snaps); - return 0; -} - -int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid) -{ - dout(10) << "create_checkpoint: '" << name << "'" << dendl; - if (has_snap_create_v2 && transid) 
{ - struct btrfs_ioctl_vol_args_v2 async_args; - memset(&async_args, 0, sizeof(async_args)); - async_args.fd = get_current_fd(); - async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC; - - size_t name_size = sizeof(async_args.name); - strncpy(async_args.name, name.c_str(), name_size); - async_args.name[name_size-1] = '\0'; - - int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args); - if (r < 0) { - r = -errno; - dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl; - return r; - } - dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl; - *transid = async_args.transid; - } else { - struct btrfs_ioctl_vol_args vol_args; - memset(&vol_args, 0, sizeof(vol_args)); - vol_args.fd = get_current_fd(); - - size_t name_size = sizeof(vol_args.name); - strncpy(vol_args.name, name.c_str(), name_size); - vol_args.name[name_size-1] = '\0'; - - int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); - if (r < 0) { - r = -errno; - dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl; - return r; - } - if (transid) - *transid = 0; - } - return 0; -} - -int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid) -{ - // wait for commit - dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl; - int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid); - if (ret < 0) { - ret = -errno; - dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl; - return -errno; - } - dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl; - return 0; -} - -int BtrfsFileStoreBackend::rollback_to(const string& name) -{ - dout(10) << "rollback_to: to '" << name << "'" << dendl; - char s[PATH_MAX]; - btrfs_ioctl_vol_args vol_args; - - memset(&vol_args, 0, sizeof(vol_args)); - vol_args.fd = 0; - strcpy(vol_args.name, "current"); - - int ret = ::ioctl(get_basedir_fd(), 
BTRFS_IOC_SNAP_DESTROY, &vol_args); - if (ret && errno != ENOENT) { - dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl; - snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand()); - if (::rename(get_current_path().c_str(), s)) { - ret = -errno; - dout(0) << "rollback_to: error renaming old current subvol: " - << cpp_strerror(ret) << dendl; - return ret; - } - } - - snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str()); - - // roll back - vol_args.fd = ::open(s, O_RDONLY); - if (vol_args.fd < 0) { - ret = -errno; - dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl; - return ret; - } - ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); - if (ret < 0 ) { - ret = -errno; - dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl; - } - TEMP_FAILURE_RETRY(::close(vol_args.fd)); - return ret; -} - -int BtrfsFileStoreBackend::destroy_checkpoint(const string& name) -{ - dout(10) << "destroy_checkpoint: '" << name << "'" << dendl; - btrfs_ioctl_vol_args vol_args; - memset(&vol_args, 0, sizeof(vol_args)); - vol_args.fd = 0; - strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name)); - - int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); - if (ret) { - ret = -errno; - dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl; - return ret; - } - return 0; -} - -int BtrfsFileStoreBackend::syncfs() -{ - dout(15) << "syncfs" << dendl; - // do a full btrfs commit - int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC); - if (ret < 0) { - ret = -errno; - dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl; - } - return ret; -} - -int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) -{ - dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl; - size_t blk_size = get_blksize(); - 
if (!has_clone_range || - srcoff % blk_size != dstoff % blk_size) { - dout(20) << "clone_range: using copy" << dendl; - return _copy_range(from, to, srcoff, len, dstoff); - } - - int err = 0; - int r = 0; - - uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size); - uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size); - if (srcoffclone >= srcoff + len) { - dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl; - return _copy_range(from, to, srcoff, len, dstoff); - } - - uint64_t lenclone = len - (srcoffclone - srcoff); - if (!ALIGNED(lenclone, blk_size)) { - struct stat from_stat, to_stat; - err = ::fstat(from, &from_stat); - if (err) return -errno; - err = ::fstat(to , &to_stat); - if (err) return -errno; - - if (srcoff + len != (uint64_t)from_stat.st_size || - dstoff + len < (uint64_t)to_stat.st_size) { - // Not to the end of the file, need to align length as well - lenclone = ALIGN_DOWN(lenclone, blk_size); - } - } - if (lenclone == 0) { - // too short - return _copy_range(from, to, srcoff, len, dstoff); - } - - dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone - << " to " << dstoffclone << " = " << r << dendl; - btrfs_ioctl_clone_range_args a; - a.src_fd = from; - a.src_offset = srcoffclone; - a.src_length = lenclone; - a.dest_offset = dstoffclone; - err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a); - if (err >= 0) { - r += err; - } else if (errno == EINVAL) { - // Still failed, might be compressed - dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl; - return _copy_range(from, to, srcoff, len, dstoff); - } else { - return -errno; - } - - // Take care any trimmed from front - if (srcoffclone != srcoff) { - err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff); - if (err >= 0) { - r += err; - } else { - return err; - } - } - - // Copy end - if (srcoffclone + lenclone != srcoff + len) { - err = _copy_range(from, to, - srcoffclone + lenclone, - (srcoff + len) - (srcoffclone + 
lenclone), - dstoffclone + lenclone); - if (err >= 0) { - r += err; - } else { - return err; - } - } - dout(20) << "clone_range: finished " << srcoff << "~" << len - << " to " << dstoff << " = " << r << dendl; - return r; -} -#endif diff --git a/src/os/BtrfsFileStoreBackend.h b/src/os/BtrfsFileStoreBackend.h deleted file mode 100644 index 9bc878f77676..000000000000 --- a/src/os/BtrfsFileStoreBackend.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef CEPH_BTRFSFILESTOREBACKEDN_H -#define CEPH_BTRFSFILESTOREBACKEDN_H - -#if defined(__linux__) -#include "GenericFileStoreBackend.h" - -class BtrfsFileStoreBackend : public GenericFileStoreBackend { -private: - bool has_clone_range; ///< clone range ioctl is supported - bool has_snap_create; ///< snap create ioctl is supported - bool has_snap_destroy; ///< snap destroy ioctl is supported - bool has_snap_create_v2; ///< snap create v2 ioctl (async!) 
is supported - bool has_wait_sync; ///< wait sync ioctl is supported - bool stable_commits; - bool m_filestore_btrfs_clone_range; - bool m_filestore_btrfs_snap; -public: - BtrfsFileStoreBackend(FileStore *fs); - ~BtrfsFileStoreBackend() {} - const char *get_name() { - return "btrfs"; - } - int detect_features(); - bool can_checkpoint(); - int create_current(); - int list_checkpoints(list& ls); - int create_checkpoint(const string& name, uint64_t *cid); - int sync_checkpoint(uint64_t cid); - int rollback_to(const string& name); - int destroy_checkpoint(const string& name); - int syncfs(); - int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); -}; -#endif -#endif diff --git a/src/os/CollectionIndex.h b/src/os/CollectionIndex.h deleted file mode 100644 index 97e17bae1b7d..000000000000 --- a/src/os/CollectionIndex.h +++ /dev/null @@ -1,199 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef OS_COLLECTIONINDEX_H -#define OS_COLLECTIONINDEX_H - -#include -#include -#include "include/memory.h" - -#include "osd/osd_types.h" -#include "include/object.h" -#include "common/RWLock.h" - -/** - * CollectionIndex provides an interface for manipulating indexed collections - */ -class CollectionIndex { -protected: - /** - * Object encapsulating a returned path. - * - * A path to an object (existent or non-existent) becomes invalid - * when a different object is created in the index. Path stores - * a shared_ptr to the CollectionIndex to keep the index alive - * during its lifetime. 
- * @see IndexManager - * @see self_ref - * @see set_ref - */ - class Path { - public: - /// Returned path - string full_path; - /// Ref to parent Index - CollectionIndex* parent_ref; - /// coll_t for parent Index - coll_t parent_coll; - - /// Normal Constructor - Path( - string path, ///< [in] Path to return. - CollectionIndex* ref) - : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {} - - /// Debugging Constructor - Path( - string path, ///< [in] Path to return. - coll_t coll) ///< [in] collection - : full_path(path), parent_coll(coll) {} - - /// Getter for the stored path. - const char *path() const { return full_path.c_str(); } - - /// Getter for collection - coll_t coll() const { return parent_coll; } - - /// Getter for parent - CollectionIndex* get_index() const { - return parent_ref; - } - }; - public: - - string access_lock_name; - RWLock access_lock; - /// Type of returned paths - typedef ceph::shared_ptr IndexedPath; - - static IndexedPath get_testing_path(string path, coll_t collection) { - return IndexedPath(new Path(path, collection)); - } - - static const uint32_t FLAT_INDEX_TAG = 0; - static const uint32_t HASH_INDEX_TAG = 1; - static const uint32_t HASH_INDEX_TAG_2 = 2; - static const uint32_t HOBJECT_WITH_POOL = 3; - /** - * For tracking Filestore collection versions. - * - * @return Collection version represented by the Index implementation - */ - virtual uint32_t collection_version() = 0; - - /** - * Returns the collection managed by this CollectionIndex - */ - virtual coll_t coll() const = 0; - - - /** - * Initializes the index. - * - * @return Error Code, 0 for success - */ - virtual int init() = 0; - - /** - * Cleanup before replaying journal - * - * Index implemenations may need to perform compound operations - * which may leave the collection unstable if interupted. cleanup - * is called on mount to allow the CollectionIndex implementation - * to stabilize. 
- * - * @see HashIndex - * @return Error Code, 0 for success - */ - virtual int cleanup() = 0; - - /** - * Call when a file is created using a path returned from lookup. - * - * @return Error Code, 0 for success - */ - virtual int created( - const ghobject_t &oid, ///< [in] Created object. - const char *path ///< [in] Path to created object. - ) = 0; - - /** - * Removes oid from the collection - * - * @return Error Code, 0 for success - */ - virtual int unlink( - const ghobject_t &oid ///< [in] Object to remove - ) = 0; - - /** - * Gets the IndexedPath for oid. - * - * @return Error Code, 0 for success - */ - virtual int lookup( - const ghobject_t &oid, ///< [in] Object to lookup - IndexedPath *path, ///< [out] Path to object - int *hardlink ///< [out] number of hard links of this object. *hardlink=0 mean object no-exist. - ) = 0; - - /** - * Moves objects matching @e match in the lsb @e bits - * - * dest and this must be the same subclass - * - * @return Error Code, 0 for success - */ - virtual int split( - uint32_t match, //< [in] value to match - uint32_t bits, //< [in] bits to check - CollectionIndex* dest //< [in] destination index - ) { assert(0); return 0; } - - - /// List contents of collection by hash - virtual int collection_list_partial( - const ghobject_t &start, ///< [in] object at which to start - const ghobject_t &end, ///< [in] list only objects < end - bool sort_bitwise, ///< [in] use bitwise sort - int max_count, ///< [in] return at most max_count objects - vector *ls, ///< [out] Listed objects - ghobject_t *next ///< [out] Next object to list - ) = 0; - - /// Call prior to removing directory - virtual int prep_delete() { return 0; } - - CollectionIndex(coll_t collection): - access_lock_name ("CollectionIndex::access_lock::" + collection.to_str()), - access_lock(access_lock_name.c_str()) {} - - /* - * Pre-hash the collection, this collection should map to a PG folder. - * - * @param pg_num - pg number of the pool this collection belongs to. 
- * @param expected_num_objs - expected number of objects in this collection. - * @Return 0 on success, an error code otherwise. - */ - virtual int pre_hash_collection( - uint32_t pg_num, ///< [in] pg number of the pool this collection belongs to - uint64_t expected_num_objs ///< [in] expected number of objects this collection has - ) { assert(0); return 0; } - - /// Virtual destructor - virtual ~CollectionIndex() {} -}; - -#endif diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc deleted file mode 100644 index 449a5de2fb76..000000000000 --- a/src/os/DBObjectMap.cc +++ /dev/null @@ -1,1264 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- - -#include "include/int_types.h" -#include "include/buffer.h" - -#include -#include -#include -#include -#include "include/memory.h" -#include - -#include "ObjectMap.h" -#include "kv/KeyValueDB.h" -#include "DBObjectMap.h" -#include - -#include "common/debug.h" -#include "common/config.h" -#include "include/assert.h" - -#define dout_subsys ceph_subsys_filestore -#undef dout_prefix -#define dout_prefix *_dout << "filestore " - -const string DBObjectMap::USER_PREFIX = "_USER_"; -const string DBObjectMap::XATTR_PREFIX = "_AXATTR_"; -const string DBObjectMap::SYS_PREFIX = "_SYS_"; -const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_"; -const string DBObjectMap::HEADER_KEY = "HEADER"; -const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER"; -const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER"; -const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_"; - -// Legacy -const string DBObjectMap::LEAF_PREFIX = "_LEAF_"; -const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_"; - -static void append_escaped(const string &in, string *out) -{ - for (string::const_iterator i = in.begin(); i != in.end(); ++i) { - if (*i == '%') { - out->push_back('%'); - out->push_back('p'); - } else if (*i == '.') { - out->push_back('%'); - out->push_back('e'); - } else if (*i == '_') { - out->push_back('%'); - 
out->push_back('u'); - } else { - out->push_back(*i); - } - } -} - -bool DBObjectMap::check(std::ostream &out) -{ - bool retval = true; - map parent_to_num_children; - map parent_to_actual_num_children; - KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - _Header header; - assert(header.num_children == 1); - header.num_children = 0; // Hack for leaf node - bufferlist bl = iter->value(); - while (true) { - bufferlist::iterator bliter = bl.begin(); - header.decode(bliter); - if (header.seq != 0) - parent_to_actual_num_children[header.seq] = header.num_children; - if (header.parent == 0) - break; - - if (!parent_to_num_children.count(header.parent)) - parent_to_num_children[header.parent] = 0; - parent_to_num_children[header.parent]++; - if (parent_to_actual_num_children.count(header.parent)) - break; - - set to_get; - map got; - to_get.insert(HEADER_KEY); - db->get(sys_parent_prefix(header), to_get, &got); - if (got.empty()) { - out << "Missing: seq " << header.parent << std::endl; - retval = false; - break; - } else { - bl = got.begin()->second; - } - } - } - - for (map::iterator i = parent_to_num_children.begin(); - i != parent_to_num_children.end(); - parent_to_num_children.erase(i++)) { - if (!parent_to_actual_num_children.count(i->first)) - continue; - if (parent_to_actual_num_children[i->first] != i->second) { - out << "Invalid: seq " << i->first << " recorded children: " - << parent_to_actual_num_children[i->first] << " found: " - << i->second << std::endl; - retval = false; - } - parent_to_actual_num_children.erase(i->first); - } - return retval; -} - -string DBObjectMap::ghobject_key(const ghobject_t &oid) -{ - string out; - append_escaped(oid.hobj.oid.name, &out); - out.push_back('.'); - append_escaped(oid.hobj.get_key(), &out); - out.push_back('.'); - append_escaped(oid.hobj.nspace, &out); - out.push_back('.'); - - char snap_with_hash[1000]; - char *t = snap_with_hash; - char *end = 
t + sizeof(snap_with_hash); - if (oid.hobj.snap == CEPH_NOSNAP) - t += snprintf(t, end - t, "head"); - else if (oid.hobj.snap == CEPH_SNAPDIR) - t += snprintf(t, end - t, "snapdir"); - else - t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); - - if (oid.hobj.pool == -1) - t += snprintf(t, end - t, ".none"); - else - t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool); - t += snprintf(t, end - t, ".%.*X", (int)(sizeof(uint32_t)*2), oid.hobj.get_hash()); - - if (oid.generation != ghobject_t::NO_GEN || - oid.shard_id != shard_id_t::NO_SHARD) { - t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation); - t += snprintf(t, end - t, ".%x", (int)oid.shard_id); - } - out += string(snap_with_hash); - return out; -} - -// ok: pglog%u3%efs1...0.none.0017B237 -// bad: plana8923501-10...4c.3.ffffffffffffffff.2 -// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2 -// returns 0 for false, 1 for true, negative for error -int DBObjectMap::is_buggy_ghobject_key_v1(const string &in) -{ - int dots = 5; // skip 5 .'s - const char *s = in.c_str(); - do { - while (*s && *s != '.') - ++s; - if (!*s) { - derr << "unexpected null at " << (int)(s-in.c_str()) << dendl; - return -EINVAL; - } - ++s; - } while (*s && --dots); - if (!*s) { - derr << "unexpected null at " << (int)(s-in.c_str()) << dendl; - return -EINVAL; - } - // we are now either at a hash value (32 bits, 8 chars) or a generation - // value (64 bits) '.' and shard id. count the dots! - int len = 0; - while (*s && *s != '.') { - ++s; - ++len; - } - if (*s == '\0') { - if (len != 8) { - derr << "hash value is not 8 chars" << dendl; - return -EINVAL; // the hash value is always 8 chars. - } - return 0; - } - if (*s != '.') { // the shard follows. - derr << "missing final . 
and shard id at " << (int)(s-in.c_str()) << dendl; - return -EINVAL; - } - return 1; -} - - -string DBObjectMap::map_header_key(const ghobject_t &oid) -{ - return ghobject_key(oid); -} - -string DBObjectMap::header_key(uint64_t seq) -{ - char buf[100]; - snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq); - return string(buf); -} - -string DBObjectMap::complete_prefix(Header header) -{ - return USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX; -} - -string DBObjectMap::user_prefix(Header header) -{ - return USER_PREFIX + header_key(header->seq) + USER_PREFIX; -} - -string DBObjectMap::sys_prefix(Header header) -{ - return USER_PREFIX + header_key(header->seq) + SYS_PREFIX; -} - -string DBObjectMap::xattr_prefix(Header header) -{ - return USER_PREFIX + header_key(header->seq) + XATTR_PREFIX; -} - -string DBObjectMap::sys_parent_prefix(_Header header) -{ - return USER_PREFIX + header_key(header.parent) + SYS_PREFIX; -} - -int DBObjectMap::DBObjectMapIteratorImpl::init() -{ - invalid = false; - if (ready) { - return 0; - } - assert(!parent_iter); - if (header->parent) { - Header parent = map->lookup_parent(header); - if (!parent) { - assert(0); - return -EINVAL; - } - parent_iter.reset(new DBObjectMapIteratorImpl(map, parent)); - } - key_iter = map->db->get_iterator(map->user_prefix(header)); - assert(key_iter); - complete_iter = map->db->get_iterator(map->complete_prefix(header)); - assert(complete_iter); - cur_iter = key_iter; - assert(cur_iter); - ready = true; - return 0; -} - -ObjectMap::ObjectMapIterator DBObjectMap::get_iterator( - const ghobject_t &oid) -{ - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return ObjectMapIterator(new EmptyIteratorImpl()); - DBObjectMapIterator iter = _get_iterator(header); - iter->hlock.swap(hl); - return iter; -} - -int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first() -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->seek_to_first(); - if (r 
< 0) - return r; - } - r = key_iter->seek_to_first(); - if (r < 0) - return r; - return adjust(); -} - -int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last() -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->seek_to_last(); - if (r < 0) - return r; - if (parent_iter->valid()) - r = parent_iter->next(); - if (r < 0) - return r; - } - r = key_iter->seek_to_last(); - if (r < 0) - return r; - if (key_iter->valid()) - r = key_iter->next(); - if (r < 0) - return r; - return adjust(); -} - -int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to) -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->lower_bound(to); - if (r < 0) - return r; - } - r = key_iter->lower_bound(to); - if (r < 0) - return r; - return adjust(); -} - -int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after) -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->upper_bound(after); - if (r < 0) - return r; - } - r = key_iter->upper_bound(after); - if (r < 0) - return r; - return adjust(); -} - -bool DBObjectMap::DBObjectMapIteratorImpl::valid() -{ - bool valid = !invalid && ready; - assert(!valid || cur_iter->valid()); - return valid; -} - -bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent() -{ - if (parent_iter && parent_iter->valid() && - (!key_iter->valid() || key_iter->key() > parent_iter->key())) - return true; - return false; -} - -int DBObjectMap::DBObjectMapIteratorImpl::next(bool validate) -{ - assert(cur_iter->valid()); - assert(valid()); - cur_iter->next(); - return adjust(); -} - -int DBObjectMap::DBObjectMapIteratorImpl::next_parent() -{ - if (!parent_iter || !parent_iter->valid()) { - invalid = true; - return 0; - } - r = next(); - if (r < 0) - return r; - if (!valid() || on_parent() || !parent_iter->valid()) - return 0; - - return lower_bound(parent_iter->key()); -} - -int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test, - string *begin, - string *end) -{ - 
complete_iter->upper_bound(to_test); - if (complete_iter->valid()) - complete_iter->prev(); - else - complete_iter->seek_to_last(); - - if (!complete_iter->valid()) - return false; - - string _end; - if (begin) - *begin = complete_iter->key(); - _end = string(complete_iter->value().c_str()); - if (end) - *end = _end; - return (to_test >= complete_iter->key()) && (!_end.size() || _end > to_test); -} - -/** - * Moves parent_iter to the next position both out of the complete_region and - * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and - * less than key_iter and key_iter otherwise. - */ -int DBObjectMap::DBObjectMapIteratorImpl::adjust() -{ - string begin, end; - while (parent_iter && parent_iter->valid()) { - if (in_complete_region(parent_iter->key(), &begin, &end)) { - if (end.size() == 0) { - parent_iter->seek_to_last(); - if (parent_iter->valid()) - parent_iter->next(); - } else - parent_iter->lower_bound(end); - } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) { - parent_iter->next(); - } else { - break; - } - } - if (valid_parent()) { - cur_iter = parent_iter; - } else if (key_iter->valid()) { - cur_iter = key_iter; - } else { - invalid = true; - } - assert(invalid || cur_iter->valid()); - return 0; -} - - -string DBObjectMap::DBObjectMapIteratorImpl::key() -{ - return cur_iter->key(); -} - -bufferlist DBObjectMap::DBObjectMapIteratorImpl::value() -{ - return cur_iter->value(); -} - -int DBObjectMap::DBObjectMapIteratorImpl::status() -{ - return r; -} - -int DBObjectMap::set_keys(const ghobject_t &oid, - const map &set, - const SequencerPosition *spos) -{ - KeyValueDB::Transaction t = db->get_transaction(); - MapHeaderLock hl(this, oid); - Header header = lookup_create_map_header(hl, oid, t); - if (!header) - return -EINVAL; - if (check_spos(oid, header, spos)) - return 0; - - t->set(user_prefix(header), set); - - return db->submit_transaction(t); -} - -int DBObjectMap::set_header(const ghobject_t &oid, - const 
bufferlist &bl, - const SequencerPosition *spos) -{ - KeyValueDB::Transaction t = db->get_transaction(); - MapHeaderLock hl(this, oid); - Header header = lookup_create_map_header(hl, oid, t); - if (!header) - return -EINVAL; - if (check_spos(oid, header, spos)) - return 0; - _set_header(header, bl, t); - return db->submit_transaction(t); -} - -void DBObjectMap::_set_header(Header header, const bufferlist &bl, - KeyValueDB::Transaction t) -{ - map to_set; - to_set[USER_HEADER_KEY] = bl; - t->set(sys_prefix(header), to_set); -} - -int DBObjectMap::get_header(const ghobject_t &oid, - bufferlist *bl) -{ - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) { - return 0; - } - return _get_header(header, bl); -} - -int DBObjectMap::_get_header(Header header, - bufferlist *bl) -{ - map out; - while (true) { - out.clear(); - set to_get; - to_get.insert(USER_HEADER_KEY); - int r = db->get(sys_prefix(header), to_get, &out); - if (r == 0 && !out.empty()) - break; - if (r < 0) - return r; - Header current(header); - if (!current->parent) - break; - header = lookup_parent(current); - } - - if (!out.empty()) - bl->swap(out.begin()->second); - return 0; -} - -int DBObjectMap::clear(const ghobject_t &oid, - const SequencerPosition *spos) -{ - KeyValueDB::Transaction t = db->get_transaction(); - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - if (check_spos(oid, header, spos)) - return 0; - remove_map_header(hl, oid, header, t); - assert(header->num_children > 0); - header->num_children--; - int r = _clear(header, t); - if (r < 0) - return r; - return db->submit_transaction(t); -} - -int DBObjectMap::_clear(Header header, - KeyValueDB::Transaction t) -{ - while (1) { - if (header->num_children) { - set_header(header, t); - break; - } - clear_header(header, t); - if (!header->parent) - break; - Header parent = lookup_parent(header); - if (!parent) { - return -EINVAL; - } - 
assert(parent->num_children > 0); - parent->num_children--; - header.swap(parent); - } - return 0; -} - -int DBObjectMap::merge_new_complete(Header header, - const map &new_complete, - DBObjectMapIterator iter, - KeyValueDB::Transaction t) -{ - KeyValueDB::Iterator complete_iter = db->get_iterator( - complete_prefix(header) - ); - map::const_iterator i = new_complete.begin(); - set to_remove; - map to_add; - - string begin, end; - while (i != new_complete.end()) { - string new_begin = i->first; - string new_end = i->second; - int r = iter->in_complete_region(new_begin, &begin, &end); - if (r < 0) - return r; - if (r) { - to_remove.insert(begin); - new_begin = begin; - } - ++i; - while (i != new_complete.end()) { - if (!new_end.size() || i->first <= new_end) { - if (!new_end.size() && i->second > new_end) { - new_end = i->second; - } - ++i; - continue; - } - - r = iter->in_complete_region(new_end, &begin, &end); - if (r < 0) - return r; - if (r) { - to_remove.insert(begin); - new_end = end; - continue; - } - break; - } - bufferlist bl; - bl.append(bufferptr(new_end.c_str(), new_end.size() + 1)); - to_add.insert(make_pair(new_begin, bl)); - } - t->rmkeys(complete_prefix(header), to_remove); - t->set(complete_prefix(header), to_add); - return 0; -} - -int DBObjectMap::copy_up_header(Header header, - KeyValueDB::Transaction t) -{ - bufferlist bl; - int r = _get_header(header, &bl); - if (r < 0) - return r; - - _set_header(header, bl, t); - return 0; -} - -int DBObjectMap::need_parent(DBObjectMapIterator iter) -{ - int r = iter->seek_to_first(); - if (r < 0) - return r; - - if (!iter->valid()) - return 0; - - string begin, end; - if (iter->in_complete_region(iter->key(), &begin, &end) && end == "") { - return 0; - } - return 1; -} - -int DBObjectMap::rm_keys(const ghobject_t &oid, - const set &to_clear, - const SequencerPosition *spos) -{ - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - 
KeyValueDB::Transaction t = db->get_transaction(); - if (check_spos(oid, header, spos)) - return 0; - t->rmkeys(user_prefix(header), to_clear); - if (!header->parent) { - return db->submit_transaction(t); - } - - // Copy up keys from parent around to_clear - int keep_parent; - { - DBObjectMapIterator iter = _get_iterator(header); - iter->seek_to_first(); - map new_complete; - map to_write; - for(set::const_iterator i = to_clear.begin(); - i != to_clear.end(); - ) { - unsigned copied = 0; - iter->lower_bound(*i); - ++i; - if (!iter->valid()) - break; - string begin = iter->key(); - if (!iter->on_parent()) - iter->next_parent(); - if (new_complete.size() && new_complete.rbegin()->second == begin) { - begin = new_complete.rbegin()->first; - } - while (iter->valid() && copied < 20) { - if (!to_clear.count(iter->key())) - to_write[iter->key()].append(iter->value()); - if (i != to_clear.end() && *i <= iter->key()) { - ++i; - copied = 0; - } - - iter->next_parent(); - copied++; - } - if (iter->valid()) { - new_complete[begin] = iter->key(); - } else { - new_complete[begin] = ""; - break; - } - } - t->set(user_prefix(header), to_write); - merge_new_complete(header, new_complete, iter, t); - keep_parent = need_parent(iter); - if (keep_parent < 0) - return keep_parent; - } - if (!keep_parent) { - copy_up_header(header, t); - Header parent = lookup_parent(header); - if (!parent) - return -EINVAL; - parent->num_children--; - _clear(parent, t); - header->parent = 0; - set_map_header(hl, oid, *header, t); - t->rmkeys_by_prefix(complete_prefix(header)); - } - return db->submit_transaction(t); -} - -int DBObjectMap::clear_keys_header(const ghobject_t &oid, - const SequencerPosition *spos) -{ - KeyValueDB::Transaction t = db->get_transaction(); - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - if (check_spos(oid, header, spos)) - return 0; - - // save old attrs - KeyValueDB::Iterator iter = 
db->get_iterator(xattr_prefix(header)); - if (!iter) - return -EINVAL; - map attrs; - for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next()) - attrs.insert(make_pair(iter->key(), iter->value())); - if (iter->status()) - return iter->status(); - - // remove current header - remove_map_header(hl, oid, header, t); - assert(header->num_children > 0); - header->num_children--; - int r = _clear(header, t); - if (r < 0) - return r; - - // create new header - Header newheader = generate_new_header(oid, Header()); - set_map_header(hl, oid, *newheader, t); - if (!attrs.empty()) - t->set(xattr_prefix(newheader), attrs); - return db->submit_transaction(t); -} - -int DBObjectMap::get(const ghobject_t &oid, - bufferlist *_header, - map *out) -{ - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - _get_header(header, _header); - ObjectMapIterator iter = _get_iterator(header); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - if (iter->status()) - return iter->status(); - out->insert(make_pair(iter->key(), iter->value())); - } - return 0; -} - -int DBObjectMap::get_keys(const ghobject_t &oid, - set *keys) -{ - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - ObjectMapIterator iter = _get_iterator(header); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - if (iter->status()) - return iter->status(); - keys->insert(iter->key()); - } - return 0; -} - -int DBObjectMap::scan(Header header, - const set &in_keys, - set *out_keys, - map *out_values) -{ - ObjectMapIterator db_iter = _get_iterator(header); - for (set::const_iterator key_iter = in_keys.begin(); - key_iter != in_keys.end(); - ++key_iter) { - db_iter->lower_bound(*key_iter); - if (db_iter->status()) - return db_iter->status(); - if (db_iter->valid() && db_iter->key() == *key_iter) { - if (out_keys) - out_keys->insert(*key_iter); - if (out_values) - 
out_values->insert(make_pair(db_iter->key(), db_iter->value())); - } - } - return 0; -} - -int DBObjectMap::get_values(const ghobject_t &oid, - const set &keys, - map *out) -{ - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - return scan(header, keys, 0, out); -} - -int DBObjectMap::check_keys(const ghobject_t &oid, - const set &keys, - set *out) -{ - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - return scan(header, keys, out, 0); -} - -int DBObjectMap::get_xattrs(const ghobject_t &oid, - const set &to_get, - map *out) -{ - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - return db->get(xattr_prefix(header), to_get, out); -} - -int DBObjectMap::get_all_xattrs(const ghobject_t &oid, - set *out) -{ - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header)); - if (!iter) - return -EINVAL; - for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next()) - out->insert(iter->key()); - return iter->status(); -} - -int DBObjectMap::set_xattrs(const ghobject_t &oid, - const map &to_set, - const SequencerPosition *spos) -{ - KeyValueDB::Transaction t = db->get_transaction(); - MapHeaderLock hl(this, oid); - Header header = lookup_create_map_header(hl, oid, t); - if (!header) - return -EINVAL; - if (check_spos(oid, header, spos)) - return 0; - t->set(xattr_prefix(header), to_set); - return db->submit_transaction(t); -} - -int DBObjectMap::remove_xattrs(const ghobject_t &oid, - const set &to_remove, - const SequencerPosition *spos) -{ - KeyValueDB::Transaction t = db->get_transaction(); - MapHeaderLock hl(this, oid); - Header header = lookup_map_header(hl, oid); - if (!header) - return -ENOENT; - if (check_spos(oid, header, spos)) - return 0; - 
t->rmkeys(xattr_prefix(header), to_remove); - return db->submit_transaction(t); -} - -int DBObjectMap::clone(const ghobject_t &oid, - const ghobject_t &target, - const SequencerPosition *spos) -{ - if (oid == target) - return 0; - - MapHeaderLock _l1(this, MIN_GHOBJ(oid, target, true)); - MapHeaderLock _l2(this, MAX_GHOBJ(oid, target, true)); - MapHeaderLock *lsource, *ltarget; - if (cmp_bitwise(oid, target) > 0) { - lsource = &_l2; - ltarget= &_l1; - } else { - lsource = &_l1; - ltarget= &_l2; - } - - KeyValueDB::Transaction t = db->get_transaction(); - { - Header destination = lookup_map_header(*ltarget, target); - if (destination) { - remove_map_header(*ltarget, target, destination, t); - if (check_spos(target, destination, spos)) - return 0; - destination->num_children--; - _clear(destination, t); - } - } - - Header parent = lookup_map_header(*lsource, oid); - if (!parent) - return db->submit_transaction(t); - - Header source = generate_new_header(oid, parent); - Header destination = generate_new_header(target, parent); - if (spos) - destination->spos = *spos; - - parent->num_children = 2; - set_header(parent, t); - set_map_header(*lsource, oid, *source, t); - set_map_header(*ltarget, target, *destination, t); - - map to_set; - KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent)); - for (xattr_iter->seek_to_first(); - xattr_iter->valid(); - xattr_iter->next()) - to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value())); - t->set(xattr_prefix(source), to_set); - t->set(xattr_prefix(destination), to_set); - t->rmkeys_by_prefix(xattr_prefix(parent)); - return db->submit_transaction(t); -} - -int DBObjectMap::upgrade_to_v2() -{ - dout(1) << __func__ << " start" << dendl; - KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); - iter->seek_to_first(); - while (iter->valid()) { - unsigned count = 0; - KeyValueDB::Transaction t = db->get_transaction(); - set remove; - map add; - for (; - iter->valid() && count < 300; - iter->next()) 
{ - dout(20) << __func__ << " key is " << iter->key() << dendl; - int r = is_buggy_ghobject_key_v1(iter->key()); - if (r < 0) { - derr << __func__ << " bad key '" << iter->key() << "'" << dendl; - return r; - } - if (!r) { - dout(20) << __func__ << " " << iter->key() << " ok" << dendl; - continue; - } - - // decode header to get oid - _Header hdr; - bufferlist bl = iter->value(); - bufferlist::iterator bliter = bl.begin(); - hdr.decode(bliter); - - string newkey(ghobject_key(hdr.oid)); - dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl; - add[newkey] = iter->value(); - remove.insert(iter->key()); - ++count; - } - - if (!remove.empty()) { - dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl; - t->rmkeys(HOBJECT_TO_SEQ, remove); - t->set(HOBJECT_TO_SEQ, add); - int r = db->submit_transaction(t); - if (r < 0) - return r; - } - } - - state.v = 2; - - Mutex::Locker l(header_lock); - KeyValueDB::Transaction t = db->get_transaction(); - write_state(t); - db->submit_transaction_sync(t); - dout(1) << __func__ << " done" << dendl; - return 0; -} - -int DBObjectMap::init(bool do_upgrade) -{ - map result; - set to_get; - to_get.insert(GLOBAL_STATE_KEY); - int r = db->get(SYS_PREFIX, to_get, &result); - if (r < 0) - return r; - if (!result.empty()) { - bufferlist::iterator bliter = result.begin()->second.begin(); - state.decode(bliter); - if (state.v < 1) { - dout(1) << "DBObjectMap is *very* old; upgrade to an older version first" - << dendl; - return -ENOTSUP; - } - if (state.v < 2) { // Needs upgrade - if (!do_upgrade) { - dout(1) << "DOBjbectMap requires an upgrade," - << " set filestore_update_to" - << dendl; - return -ENOTSUP; - } else { - r = upgrade_to_v2(); - if (r < 0) - return r; - } - } - } else { - // New store - state.v = 2; - state.seq = 1; - } - dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl; - return 0; -} - -int DBObjectMap::sync(const ghobject_t *oid, - const SequencerPosition *spos) { - 
KeyValueDB::Transaction t = db->get_transaction(); - if (oid) { - assert(spos); - MapHeaderLock hl(this, *oid); - Header header = lookup_map_header(hl, *oid); - if (header) { - dout(10) << "oid: " << *oid << " setting spos to " - << *spos << dendl; - header->spos = *spos; - set_map_header(hl, *oid, *header, t); - } - /* It may appear that this and the identical portion of the else - * block can combined below, but in this block, the transaction - * must be submitted under *both* the MapHeaderLock and the full - * header_lock. - * - * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891. - */ - Mutex::Locker l(header_lock); - write_state(t); - return db->submit_transaction_sync(t); - } else { - Mutex::Locker l(header_lock); - write_state(t); - return db->submit_transaction_sync(t); - } -} - -int DBObjectMap::write_state(KeyValueDB::Transaction _t) { - assert(header_lock.is_locked_by_me()); - dout(20) << "dbobjectmap: seq is " << state.seq << dendl; - KeyValueDB::Transaction t = _t ? _t : db->get_transaction(); - bufferlist bl; - state.encode(bl); - map to_write; - to_write[GLOBAL_STATE_KEY] = bl; - t->set(SYS_PREFIX, to_write); - return _t ? 
0 : db->submit_transaction(t); -} - - -DBObjectMap::Header DBObjectMap::_lookup_map_header( - const MapHeaderLock &l, - const ghobject_t &oid) -{ - assert(l.get_locked() == oid); - - _Header *header = new _Header(); - { - Mutex::Locker l(cache_lock); - if (caches.lookup(oid, header)) { - assert(!in_use.count(header->seq)); - in_use.insert(header->seq); - return Header(header, RemoveOnDelete(this)); - } - } - - bufferlist out; - int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out); - if (r < 0 || out.length()==0) { - delete header; - return Header(); - } - - Header ret(header, RemoveOnDelete(this)); - bufferlist::iterator iter = out.begin(); - - ret->decode(iter); - { - Mutex::Locker l(cache_lock); - caches.add(oid, *ret); - } - - assert(!in_use.count(header->seq)); - in_use.insert(header->seq); - return ret; -} - -DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid, - Header parent) -{ - Header header = Header(new _Header(), RemoveOnDelete(this)); - header->seq = state.seq++; - if (parent) { - header->parent = parent->seq; - header->spos = parent->spos; - } - header->num_children = 1; - header->oid = oid; - assert(!in_use.count(header->seq)); - in_use.insert(header->seq); - - write_state(); - return header; -} - -DBObjectMap::Header DBObjectMap::lookup_parent(Header input) -{ - Mutex::Locker l(header_lock); - while (in_use.count(input->parent)) - header_cond.Wait(header_lock); - map out; - set keys; - keys.insert(HEADER_KEY); - - dout(20) << "lookup_parent: parent " << input->parent - << " for seq " << input->seq << dendl; - int r = db->get(sys_parent_prefix(input), keys, &out); - if (r < 0) { - assert(0); - return Header(); - } - if (out.empty()) { - assert(0); - return Header(); - } - - Header header = Header(new _Header(), RemoveOnDelete(this)); - header->seq = input->parent; - bufferlist::iterator iter = out.begin()->second.begin(); - header->decode(iter); - dout(20) << "lookup_parent: parent seq is " << header->seq << " with 
parent " - << header->parent << dendl; - in_use.insert(header->seq); - return header; -} - -DBObjectMap::Header DBObjectMap::lookup_create_map_header( - const MapHeaderLock &hl, - const ghobject_t &oid, - KeyValueDB::Transaction t) -{ - Mutex::Locker l(header_lock); - Header header = _lookup_map_header(hl, oid); - if (!header) { - header = _generate_new_header(oid, Header()); - set_map_header(hl, oid, *header, t); - } - return header; -} - -void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t) -{ - dout(20) << "clear_header: clearing seq " << header->seq << dendl; - t->rmkeys_by_prefix(user_prefix(header)); - t->rmkeys_by_prefix(sys_prefix(header)); - t->rmkeys_by_prefix(complete_prefix(header)); - t->rmkeys_by_prefix(xattr_prefix(header)); - set keys; - keys.insert(header_key(header->seq)); - t->rmkeys(USER_PREFIX, keys); -} - -void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t) -{ - dout(20) << "set_header: setting seq " << header->seq << dendl; - map to_write; - header->encode(to_write[HEADER_KEY]); - t->set(sys_prefix(header), to_write); -} - -void DBObjectMap::remove_map_header( - const MapHeaderLock &l, - const ghobject_t &oid, - Header header, - KeyValueDB::Transaction t) -{ - assert(l.get_locked() == oid); - dout(20) << "remove_map_header: removing " << header->seq - << " oid " << oid << dendl; - set to_remove; - to_remove.insert(map_header_key(oid)); - t->rmkeys(HOBJECT_TO_SEQ, to_remove); - { - Mutex::Locker l(cache_lock); - caches.clear(oid); - } -} - -void DBObjectMap::set_map_header( - const MapHeaderLock &l, - const ghobject_t &oid, _Header header, - KeyValueDB::Transaction t) -{ - assert(l.get_locked() == oid); - dout(20) << "set_map_header: setting " << header.seq - << " oid " << oid << " parent seq " - << header.parent << dendl; - map to_set; - header.encode(to_set[map_header_key(oid)]); - t->set(HOBJECT_TO_SEQ, to_set); - { - Mutex::Locker l(cache_lock); - caches.add(oid, header); - } -} - -bool 
DBObjectMap::check_spos(const ghobject_t &oid, - Header header, - const SequencerPosition *spos) -{ - if (!spos || *spos > header->spos) { - stringstream out; - if (spos) - dout(10) << "oid: " << oid << " not skipping op, *spos " - << *spos << dendl; - else - dout(10) << "oid: " << oid << " not skipping op, *spos " - << "empty" << dendl; - dout(10) << " > header.spos " << header->spos << dendl; - return false; - } else { - dout(10) << "oid: " << oid << " skipping op, *spos " << *spos - << " <= header.spos " << header->spos << dendl; - return true; - } -} - -int DBObjectMap::list_objects(vector *out) -{ - KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - bufferlist bl = iter->value(); - bufferlist::iterator bliter = bl.begin(); - _Header header; - header.decode(bliter); - out->push_back(header.oid); - } - return 0; -} diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h deleted file mode 100644 index 4b81acbd9480..000000000000 --- a/src/os/DBObjectMap.h +++ /dev/null @@ -1,532 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -#ifndef DBOBJECTMAP_DB_H -#define DBOBJECTMAP_DB_H - -#include "include/buffer_fwd.h" -#include -#include -#include - -#include -#include "include/memory.h" -#include - -#include "ObjectMap.h" -#include "kv/KeyValueDB.h" -#include "osd/osd_types.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/simple_cache.hpp" -#include - -/** - * DBObjectMap: Implements ObjectMap in terms of KeyValueDB - * - * Prefix space structure: - * - * @see complete_prefix - * @see user_prefix - * @see sys_prefix - * - * - GHOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->hobj.seq and - * corresponding omap header - * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number - * @see State - * @see write_state - * @see init - * @see generate_new_header - * - USER_PREFIX + header_key(header->seq) + USER_PREFIX - * : key->value for 
header->seq - * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below - * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs - * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX - * : USER_HEADER_KEY - omap header for header->seq - * : HEADER_KEY - encoding of header for header->seq - * - * For each node (represented by a header), we - * store three mappings: the key mapping, the complete mapping, and the parent. - * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in - * this mapping indicates that the key mapping contains all entries on [x,y). - * Note, max string is represented by "", so ""->"" indicates that the parent - * is unnecessary (@see rm_keys). When looking up a key not contained in the - * the complete set, we have to check the parent if we don't find it in the - * key set. During rm_keys, we copy keys from the parent and update the - * complete set to reflect the change @see rm_keys. - */ -class DBObjectMap : public ObjectMap { -public: - boost::scoped_ptr db; - - /** - * Serializes access to next_seq as well as the in_use set - */ - Mutex header_lock; - Cond header_cond; - Cond map_header_cond; - - /** - * Set of headers currently in use - */ - set in_use; - set map_header_in_use; - - /** - * Takes the map_header_in_use entry in constructor, releases in - * destructor - */ - class MapHeaderLock { - DBObjectMap *db; - boost::optional locked; - - MapHeaderLock(const MapHeaderLock &); - MapHeaderLock &operator=(const MapHeaderLock &); - public: - MapHeaderLock(DBObjectMap *db) : db(db) {} - MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) { - Mutex::Locker l(db->header_lock); - while (db->map_header_in_use.count(*locked)) - db->map_header_cond.Wait(db->header_lock); - db->map_header_in_use.insert(*locked); - } - - const ghobject_t &get_locked() const { - assert(locked); - return *locked; - } - - void swap(MapHeaderLock &o) { - assert(db == o.db); - - // centos6's boost 
optional doesn't seem to have swap :( - boost::optional _locked = o.locked; - o.locked = locked; - locked = _locked; - } - - ~MapHeaderLock() { - if (locked) { - Mutex::Locker l(db->header_lock); - assert(db->map_header_in_use.count(*locked)); - db->map_header_cond.Signal(); - db->map_header_in_use.erase(*locked); - } - } - }; - - DBObjectMap(KeyValueDB *db) : db(db), header_lock("DBOBjectMap"), - cache_lock("DBObjectMap::CacheLock"), - caches(g_conf->filestore_omap_header_cache_size) - {} - - int set_keys( - const ghobject_t &oid, - const map &set, - const SequencerPosition *spos=0 - ); - - int set_header( - const ghobject_t &oid, - const bufferlist &bl, - const SequencerPosition *spos=0 - ); - - int get_header( - const ghobject_t &oid, - bufferlist *bl - ); - - int clear( - const ghobject_t &oid, - const SequencerPosition *spos=0 - ); - - int clear_keys_header( - const ghobject_t &oid, - const SequencerPosition *spos=0 - ); - - int rm_keys( - const ghobject_t &oid, - const set &to_clear, - const SequencerPosition *spos=0 - ); - - int get( - const ghobject_t &oid, - bufferlist *header, - map *out - ); - - int get_keys( - const ghobject_t &oid, - set *keys - ); - - int get_values( - const ghobject_t &oid, - const set &keys, - map *out - ); - - int check_keys( - const ghobject_t &oid, - const set &keys, - set *out - ); - - int get_xattrs( - const ghobject_t &oid, - const set &to_get, - map *out - ); - - int get_all_xattrs( - const ghobject_t &oid, - set *out - ); - - int set_xattrs( - const ghobject_t &oid, - const map &to_set, - const SequencerPosition *spos=0 - ); - - int remove_xattrs( - const ghobject_t &oid, - const set &to_remove, - const SequencerPosition *spos=0 - ); - - int clone( - const ghobject_t &oid, - const ghobject_t &target, - const SequencerPosition *spos=0 - ); - - /// Read initial state from backing store - int init(bool upgrade = false); - - /// Upgrade store to current version - int upgrade_to_v2(); - - /// Consistency check, debug, there must 
be no parallel writes - bool check(std::ostream &out); - - /// Ensure that all previous operations are durable - int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0); - - /// Util, list all objects, there must be no other concurrent access - int list_objects(vector *objs ///< [out] objects - ); - - ObjectMapIterator get_iterator(const ghobject_t &oid); - - static const string USER_PREFIX; - static const string XATTR_PREFIX; - static const string SYS_PREFIX; - static const string COMPLETE_PREFIX; - static const string HEADER_KEY; - static const string USER_HEADER_KEY; - static const string GLOBAL_STATE_KEY; - static const string HOBJECT_TO_SEQ; - - /// Legacy - static const string LEAF_PREFIX; - static const string REVERSE_LEAF_PREFIX; - - /// persistent state for store @see generate_header - struct State { - __u8 v; - uint64_t seq; - State() : v(0), seq(1) {} - State(uint64_t seq) : v(0), seq(seq) {} - - void encode(bufferlist &bl) const { - ENCODE_START(2, 1, bl); - ::encode(v, bl); - ::encode(seq, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::iterator &bl) { - DECODE_START(2, bl); - if (struct_v >= 2) - ::decode(v, bl); - else - v = 0; - ::decode(seq, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const { - f->dump_unsigned("seq", seq); - } - - static void generate_test_instances(list &o) { - o.push_back(new State(0)); - o.push_back(new State(20)); - } - } state; - - struct _Header { - uint64_t seq; - uint64_t parent; - uint64_t num_children; - - coll_t c; - ghobject_t oid; - - SequencerPosition spos; - - void encode(bufferlist &bl) const { - ENCODE_START(2, 1, bl); - ::encode(seq, bl); - ::encode(parent, bl); - ::encode(num_children, bl); - ::encode(c, bl); - ::encode(oid, bl); - ::encode(spos, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::iterator &bl) { - DECODE_START(2, bl); - ::decode(seq, bl); - ::decode(parent, bl); - ::decode(num_children, bl); - ::decode(c, bl); - ::decode(oid, bl); - if (struct_v >= 2) - 
::decode(spos, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const { - f->dump_unsigned("seq", seq); - f->dump_unsigned("parent", parent); - f->dump_unsigned("num_children", num_children); - f->dump_stream("coll") << c; - f->dump_stream("oid") << oid; - } - - static void generate_test_instances(list<_Header*> &o) { - o.push_back(new _Header); - o.push_back(new _Header); - o.back()->parent = 20; - o.back()->seq = 30; - } - - _Header() : seq(0), parent(0), num_children(1) {} - }; - - /// String munging (public for testing) - static string ghobject_key(const ghobject_t &oid); - static string ghobject_key_v0(coll_t c, const ghobject_t &oid); - static int is_buggy_ghobject_key_v1(const string &in); -private: - /// Implicit lock on Header->seq - typedef ceph::shared_ptr<_Header> Header; - Mutex cache_lock; - SimpleLRU caches; - - string map_header_key(const ghobject_t &oid); - string header_key(uint64_t seq); - string complete_prefix(Header header); - string user_prefix(Header header); - string sys_prefix(Header header); - string xattr_prefix(Header header); - string sys_parent_prefix(_Header header); - string sys_parent_prefix(Header header) { - return sys_parent_prefix(*header); - } - - class EmptyIteratorImpl : public ObjectMapIteratorImpl { - public: - int seek_to_first() { return 0; } - int seek_to_last() { return 0; } - int upper_bound(const string &after) { return 0; } - int lower_bound(const string &to) { return 0; } - bool valid() { return false; } - int next(bool validate=true) { assert(0); return 0; } - string key() { assert(0); return ""; } - bufferlist value() { assert(0); return bufferlist(); } - int status() { return 0; } - }; - - - /// Iterator - class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl { - public: - DBObjectMap *map; - - /// NOTE: implicit lock hlock->get_locked() when returned out of the class - MapHeaderLock hlock; - /// NOTE: implicit lock on header->seq AND for all ancestors - Header header; - - /// parent_iter == NULL 
iff no parent - ceph::shared_ptr parent_iter; - KeyValueDB::Iterator key_iter; - KeyValueDB::Iterator complete_iter; - - /// cur_iter points to currently valid iterator - ceph::shared_ptr cur_iter; - int r; - - /// init() called, key_iter, complete_iter, parent_iter filled in - bool ready; - /// past end - bool invalid; - - DBObjectMapIteratorImpl(DBObjectMap *map, Header header) : - map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {} - int seek_to_first(); - int seek_to_last(); - int upper_bound(const string &after); - int lower_bound(const string &to); - bool valid(); - int next(bool validate=true); - string key(); - bufferlist value(); - int status(); - - bool on_parent() { - return cur_iter == parent_iter; - } - - /// skips to next valid parent entry - int next_parent(); - - /// Tests whether to_test is in complete region - int in_complete_region(const string &to_test, ///< [in] key to test - string *begin, ///< [out] beginning of region - string *end ///< [out] end of region - ); ///< @returns true if to_test is in the complete region, else false - - private: - int init(); - bool valid_parent(); - int adjust(); - }; - - typedef ceph::shared_ptr DBObjectMapIterator; - DBObjectMapIterator _get_iterator(Header header) { - return DBObjectMapIterator(new DBObjectMapIteratorImpl(this, header)); - } - - /// sys - - /// Removes node corresponding to header - void clear_header(Header header, KeyValueDB::Transaction t); - - /// Set node containing input to new contents - void set_header(Header input, KeyValueDB::Transaction t); - - /// Remove leaf node corresponding to oid in c - void remove_map_header( - const MapHeaderLock &l, - const ghobject_t &oid, - Header header, - KeyValueDB::Transaction t); - - /// Set leaf node for c and oid to the value of header - void set_map_header( - const MapHeaderLock &l, - const ghobject_t &oid, _Header header, - KeyValueDB::Transaction t); - - /// Set leaf node for c and oid to the value of header - bool 
check_spos(const ghobject_t &oid, - Header header, - const SequencerPosition *spos); - - /// Lookup or create header for c oid - Header lookup_create_map_header( - const MapHeaderLock &l, - const ghobject_t &oid, - KeyValueDB::Transaction t); - - /** - * Generate new header for c oid with new seq number - * - * Has the side effect of syncronously saving the new DBObjectMap state - */ - Header _generate_new_header(const ghobject_t &oid, Header parent); - Header generate_new_header(const ghobject_t &oid, Header parent) { - Mutex::Locker l(header_lock); - return _generate_new_header(oid, parent); - } - - /// Lookup leaf header for c oid - Header _lookup_map_header( - const MapHeaderLock &l, - const ghobject_t &oid); - Header lookup_map_header( - const MapHeaderLock &l2, - const ghobject_t &oid) { - Mutex::Locker l(header_lock); - return _lookup_map_header(l2, oid); - } - - /// Lookup header node for input - Header lookup_parent(Header input); - - - /// Helpers - int _get_header(Header header, bufferlist *bl); - - /// Scan keys in header into out_keys and out_values (if nonnull) - int scan(Header header, - const set &in_keys, - set *out_keys, - map *out_values); - - /// Remove header and all related prefixes - int _clear(Header header, - KeyValueDB::Transaction t); - /// Adds to t operations necessary to add new_complete to the complete set - int merge_new_complete(Header header, - const map &new_complete, - DBObjectMapIterator iter, - KeyValueDB::Transaction t); - - /// Writes out State (mainly next_seq) - int write_state(KeyValueDB::Transaction _t = - KeyValueDB::Transaction()); - - /// 0 if the complete set now contains all of key space, < 0 on error, 1 else - int need_parent(DBObjectMapIterator iter); - - /// Copies header entry from parent @see rm_keys - int copy_up_header(Header header, - KeyValueDB::Transaction t); - - /// Sets header @see set_header - void _set_header(Header header, const bufferlist &bl, - KeyValueDB::Transaction t); - - /** - * Removes header 
seq lock and possibly object lock - * once Header is out of scope - * @see lookup_parent - * @see generate_new_header - */ - class RemoveOnDelete { - public: - DBObjectMap *db; - RemoveOnDelete(DBObjectMap *db) : - db(db) {} - void operator() (_Header *header) { - Mutex::Locker l(db->header_lock); - assert(db->in_use.count(header->seq)); - db->in_use.erase(header->seq); - db->header_cond.Signal(); - delete header; - } - }; - friend class RemoveOnDelete; -}; -WRITE_CLASS_ENCODER(DBObjectMap::_Header) -WRITE_CLASS_ENCODER(DBObjectMap::State) - -#endif diff --git a/src/os/FDCache.h b/src/os/FDCache.h deleted file mode 100644 index 635043b7e061..000000000000 --- a/src/os/FDCache.h +++ /dev/null @@ -1,111 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 Inktank Storage, Inc. - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef CEPH_FDCACHE_H -#define CEPH_FDCACHE_H - -#include -#include -#include -#include "common/hobject.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/shared_cache.hpp" -#include "include/compat.h" -#include "include/intarith.h" - -/** - * FD Cache - */ -class FDCache : public md_config_obs_t { -public: - /** - * FD - * - * Wrapper for an fd. Destructor closes the fd. 
- */ - class FD { - public: - const int fd; - FD(int _fd) : fd(_fd) { - assert(_fd >= 0); - } - int operator*() const { - return fd; - } - ~FD() { - VOID_TEMP_FAILURE_RETRY(::close(fd)); - } - }; - -private: - CephContext *cct; - const int registry_shards; - SharedLRU *registry; - -public: - FDCache(CephContext *cct) : cct(cct), - registry_shards(cct->_conf->filestore_fd_cache_shards) { - assert(cct); - cct->_conf->add_observer(this); - registry = new SharedLRU[registry_shards]; - for (int i = 0; i < registry_shards; ++i) { - registry[i].set_cct(cct); - registry[i].set_size( - MAX((cct->_conf->filestore_fd_cache_size / registry_shards), 1)); - } - } - ~FDCache() { - cct->_conf->remove_observer(this); - delete[] registry; - } - typedef ceph::shared_ptr FDRef; - - FDRef lookup(const ghobject_t &hoid) { - int registry_id = hoid.hobj.get_hash() % registry_shards; - return registry[registry_id].lookup(hoid); - } - - FDRef add(const ghobject_t &hoid, int fd, bool *existed) { - int registry_id = hoid.hobj.get_hash() % registry_shards; - return registry[registry_id].add(hoid, new FD(fd), existed); - } - - /// clear cached fd for hoid, subsequent lookups will get an empty FD - void clear(const ghobject_t &hoid) { - int registry_id = hoid.hobj.get_hash() % registry_shards; - registry[registry_id].purge(hoid); - } - - /// md_config_obs_t - const char** get_tracked_conf_keys() const { - static const char* KEYS[] = { - "filestore_fd_cache_size", - NULL - }; - return KEYS; - } - void handle_conf_change(const md_config_t *conf, - const std::set &changed) { - if (changed.count("filestore_fd_cache_size")) { - for (int i = 0; i < registry_shards; ++i) - registry[i].set_size( - MAX((conf->filestore_fd_cache_size / registry_shards), 1)); - } - } - -}; -typedef FDCache::FDRef FDRef; - -#endif diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc deleted file mode 100644 index 698f2b31c079..000000000000 --- a/src/os/FileJournal.cc +++ /dev/null @@ -1,2146 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ -#include "acconfig.h" - -#include "common/debug.h" -#include "common/errno.h" -#include "common/safe_io.h" -#include "FileJournal.h" -#include "include/color.h" -#include "common/perf_counters.h" -#include "os/FileStore.h" - -#include "include/compat.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common/blkdev.h" -#include "common/linux_version.h" - -#if defined(__FreeBSD__) -#define O_DSYNC O_SYNC -#endif - -#define dout_subsys ceph_subsys_journal -#undef dout_prefix -#define dout_prefix *_dout << "journal " - -const static int64_t ONE_MEG(1 << 20); -const static int CEPH_MINIMUM_BLOCK_SIZE(4096); - -int FileJournal::_open(bool forwrite, bool create) -{ - int flags, ret; - - if (forwrite) { - flags = O_RDWR; - if (directio) - flags |= O_DIRECT | O_DSYNC; - } else { - flags = O_RDONLY; - } - if (create) - flags |= O_CREAT; - - if (fd >= 0) { - if (TEMP_FAILURE_RETRY(::close(fd))) { - int err = errno; - derr << "FileJournal::_open: error closing old fd: " - << cpp_strerror(err) << dendl; - } - } - fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags, 0644)); - if (fd < 0) { - int err = errno; - dout(2) << "FileJournal::_open unable to open journal " - << fn << ": " << cpp_strerror(err) << dendl; - return -err; - } - - struct stat st; - ret = ::fstat(fd, &st); - if (ret) { - ret = errno; - derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl; - ret = -ret; - goto out_fd; - } - - if (S_ISBLK(st.st_mode)) { - ret = _open_block_device(); - } else { - if (aio && !force_aio) { - derr << 
"FileJournal::_open: disabling aio for non-block journal. Use " - << "journal_force_aio to force use of aio anyway" << dendl; - aio = false; - } - ret = _open_file(st.st_size, st.st_blksize, create); - } - - if (ret) - goto out_fd; - -#ifdef HAVE_LIBAIO - if (aio) { - aio_ctx = 0; - ret = io_setup(128, &aio_ctx); - if (ret < 0) { - ret = errno; - derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(ret) << dendl; - ret = -ret; - goto out_fd; - } - } -#endif - - /* We really want max_size to be a multiple of block_size. */ - max_size -= max_size % block_size; - - dout(1) << "_open " << fn << " fd " << fd - << ": " << max_size - << " bytes, block size " << block_size - << " bytes, directio = " << directio - << ", aio = " << aio - << dendl; - return 0; - - out_fd: - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return ret; -} - -int FileJournal::_open_block_device() -{ - int64_t bdev_sz = 0; - int ret = get_block_device_size(fd, &bdev_sz); - if (ret) { - dout(0) << __func__ << ": failed to read block device size." << dendl; - return -EIO; - } - - /* Check for bdev_sz too small */ - if (bdev_sz < ONE_MEG) { - dout(0) << __func__ << ": your block device must be at least " - << ONE_MEG << " bytes to be used for a Ceph journal." << dendl; - return -EINVAL; - } - - dout(10) << __func__ << ": ignoring osd journal size. 
" - << "We'll use the entire block device (size: " << bdev_sz << ")" - << dendl; - max_size = bdev_sz; - - block_size = CEPH_MINIMUM_BLOCK_SIZE; - - if (g_conf->journal_discard) { - discard = block_device_support_discard(fn.c_str()); - dout(10) << fn << " support discard: " << (int)discard << dendl; - } - _check_disk_write_cache(); - return 0; -} - -void FileJournal::_check_disk_write_cache() const -{ - ostringstream hdparm_cmd; - FILE *fp = NULL; - - if (geteuid() != 0) { - dout(10) << "_check_disk_write_cache: not root, NOT checking disk write " - << "cache on raw block device " << fn << dendl; - goto done; - } - - hdparm_cmd << "/sbin/hdparm -W " << fn; - fp = popen(hdparm_cmd.str().c_str(), "r"); - if (!fp) { - dout(10) << "_check_disk_write_cache: failed to run /sbin/hdparm: NOT " - << "checking disk write cache on raw block device " << fn << dendl; - goto done; - } - - while (true) { - char buf[256]; - memset(buf, 0, sizeof(buf)); - char *line = fgets(buf, sizeof(buf) - 1, fp); - if (!line) { - if (ferror(fp)) { - int ret = -errno; - derr << "_check_disk_write_cache: fgets error: " << cpp_strerror(ret) - << dendl; - goto close_f; - } - else { - // EOF. - break; - } - } - - int on; - if (sscanf(line, " write-caching = %d", &on) != 1) - continue; - if (!on) { - dout(10) << "_check_disk_write_cache: disk write cache is off (good) on " - << fn << dendl; - break; - } - - // is our kernel new enough? - int ver = get_linux_version(); - if (ver == 0) { - dout(10) << "_check_disk_write_cache: get_linux_version failed" << dendl; - } else if (ver >= KERNEL_VERSION(2, 6, 33)) { - dout(20) << "_check_disk_write_cache: disk write cache is on, but your " - << "kernel is new enough to handle it correctly. (fn:" - << fn << ")" << dendl; - break; - } - derr << TEXT_RED - << " ** WARNING: disk write cache is ON on " << fn << ".\n" - << " Journaling will not be reliable on kernels prior to 2.6.33\n" - << " (recent kernels are safe). 
You can disable the write cache with\n" - << " 'hdparm -W 0 " << fn << "'" - << TEXT_NORMAL - << dendl; - break; - } - -close_f: - if (pclose(fp)) { - int ret = -errno; - derr << "_check_disk_write_cache: pclose failed: " << cpp_strerror(ret) - << dendl; - } -done: - ; -} - -int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, - bool create) -{ - int ret; - int64_t conf_journal_sz(g_conf->osd_journal_size); - conf_journal_sz <<= 20; - - if ((g_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) { - derr << "I'm sorry, I don't know how large of a journal to create." - << "Please specify a block device to use as the journal OR " - << "set osd_journal_size in your ceph.conf" << dendl; - return -EINVAL; - } - - if (create && (oldsize < conf_journal_sz)) { - uint64_t newsize(g_conf->osd_journal_size); - newsize <<= 20; - dout(10) << "_open extending to " << newsize << " bytes" << dendl; - ret = ::ftruncate(fd, newsize); - if (ret < 0) { - int err = errno; - derr << "FileJournal::_open_file : unable to extend journal to " - << newsize << " bytes: " << cpp_strerror(err) << dendl; - return -err; - } -#ifdef HAVE_POSIX_FALLOCATE - ret = ::posix_fallocate(fd, 0, newsize); - if (ret) { - derr << "FileJournal::_open_file : unable to preallocation journal to " - << newsize << " bytes: " << cpp_strerror(ret) << dendl; - return -ret; - } - max_size = newsize; -#elif defined(__APPLE__) - fstore_t store; - store.fst_flags = F_ALLOCATECONTIG; - store.fst_posmode = F_PEOFPOSMODE; - store.fst_offset = 0; - store.fst_length = newsize; - - ret = ::fcntl(fd, F_PREALLOCATE, &store); - if (ret == -1) { - ret = -errno; - derr << "FileJournal::_open_file : unable to preallocation journal to " - << newsize << " bytes: " << cpp_strerror(ret) << dendl; - return ret; - } - max_size = newsize; -#else -# error "Journal pre-allocation not supported on platform." 
-#endif - } - else { - max_size = oldsize; - } - block_size = MAX(blksize, (blksize_t)CEPH_MINIMUM_BLOCK_SIZE); - - if (create && g_conf->journal_zero_on_create) { - derr << "FileJournal::_open_file : zeroing journal" << dendl; - uint64_t write_size = 1 << 20; - char *buf; - ret = ::posix_memalign((void **)&buf, block_size, write_size); - if (ret != 0) { - return -ret; - } - memset(static_cast(buf), 0, write_size); - uint64_t i = 0; - for (; (i + write_size) <= (uint64_t)max_size; i += write_size) { - ret = ::pwrite(fd, static_cast(buf), write_size, i); - if (ret < 0) { - free(buf); - return -errno; - } - } - if (i < (uint64_t)max_size) { - ret = ::pwrite(fd, static_cast(buf), max_size - i, i); - if (ret < 0) { - free(buf); - return -errno; - } - } - free(buf); - } - - - dout(10) << "_open journal is not a block device, NOT checking disk " - << "write cache on '" << fn << "'" << dendl; - - return 0; -} - -// This can not be used on an active journal -int FileJournal::check() -{ - int ret; - - assert(fd == -1); - ret = _open(false, false); - if (ret) - return ret; - - ret = read_header(&header); - if (ret < 0) - goto done; - - if (header.fsid != fsid) { - derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid - << ", invalid (someone else's?) journal" << dendl; - ret = -EINVAL; - goto done; - } - - dout(1) << "check: header looks ok" << dendl; - ret = 0; - - done: - close(); - return ret; -} - - -int FileJournal::create() -{ - void *buf = 0; - int64_t needed_space; - int ret; - buffer::ptr bp; - dout(2) << "create " << fn << " fsid " << fsid << dendl; - - ret = _open(true, true); - if (ret) - goto done; - - // write empty header - header = header_t(); - header.flags = header_t::FLAG_CRC; // enable crcs on any new journal. 
- header.fsid = fsid; - header.max_size = max_size; - header.block_size = block_size; - if (g_conf->journal_block_align || directio) - header.alignment = block_size; - else - header.alignment = 16; // at least stay word aligned on 64bit machines... - - header.start = get_top(); - header.start_seq = 0; - - print_header(header); - - // static zeroed buffer for alignment padding - delete [] zero_buf; - zero_buf = new char[header.alignment]; - memset(zero_buf, 0, header.alignment); - - bp = prepare_header(); - if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) { - ret = -errno; - derr << "FileJournal::create : create write header error " - << cpp_strerror(ret) << dendl; - goto close_fd; - } - - // zero first little bit, too. - ret = posix_memalign(&buf, block_size, block_size); - if (ret) { - ret = -ret; - derr << "FileJournal::create: failed to allocate " << block_size - << " bytes of memory: " << cpp_strerror(ret) << dendl; - goto close_fd; - } - memset(buf, 0, block_size); - if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) { - ret = -errno; - derr << "FileJournal::create: error zeroing first " << block_size - << " bytes " << cpp_strerror(ret) << dendl; - goto free_buf; - } - - needed_space = ((int64_t)g_conf->osd_max_write_size) << 20; - needed_space += (2 * sizeof(entry_header_t)) + get_top(); - if (header.max_size - header.start < needed_space) { - derr << "FileJournal::create: OSD journal is not large enough to hold " - << "osd_max_write_size bytes!" 
<< dendl; - ret = -ENOSPC; - goto free_buf; - } - - dout(2) << "create done" << dendl; - ret = 0; - -free_buf: - free(buf); - buf = 0; -close_fd: - if (TEMP_FAILURE_RETRY(::close(fd)) < 0) { - ret = -errno; - derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret) - << dendl; - } -done: - fd = -1; - return ret; -} - -// This can not be used on an active journal -int FileJournal::peek_fsid(uuid_d& fsid) -{ - assert(fd == -1); - int r = _open(false, false); - if (r) - return r; - r = read_header(&header); - if (r < 0) - goto out; - fsid = header.fsid; -out: - close(); - return r; -} - -int FileJournal::open(uint64_t fs_op_seq) -{ - dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl; - - uint64_t next_seq = fs_op_seq + 1; - - int err = _open(false); - if (err) - return err; - - // assume writeable, unless... - read_pos = 0; - write_pos = get_top(); - - // read header? - err = read_header(&header); - if (err < 0) - return err; - - // static zeroed buffer for alignment padding - delete [] zero_buf; - zero_buf = new char[header.alignment]; - memset(zero_buf, 0, header.alignment); - - dout(10) << "open header.fsid = " << header.fsid - //<< " vs expected fsid = " << fsid - << dendl; - if (header.fsid != fsid) { - derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid - << ", invalid (someone else's?) 
journal" << dendl; - return -EINVAL; - } - if (header.max_size > max_size) { - dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl; - return -EINVAL; - } - if (header.block_size != block_size) { - dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl; - return -EINVAL; - } - if (header.max_size % header.block_size) { - dout(2) << "open journal max size " << header.max_size - << " not a multiple of block size " << header.block_size << dendl; - return -EINVAL; - } - if (header.alignment != block_size && directio) { - dout(0) << "open journal alignment " << header.alignment << " does not match block size " - << block_size << " (required for direct_io journal mode)" << dendl; - return -EINVAL; - } - if ((header.alignment % CEPH_MINIMUM_BLOCK_SIZE) && directio) { - dout(0) << "open journal alignment " << header.alignment << " is not multiple of minimum block size " - << CEPH_MINIMUM_BLOCK_SIZE << " (required for direct_io journal mode)" << dendl; - return -EINVAL; - } - - // looks like a valid header. - write_pos = 0; // not writeable yet - - journaled_seq = header.committed_up_to; - - // find next entry - read_pos = header.start; - uint64_t seq = header.start_seq; - - // last_committed_seq is 1 before the start of the journal or - // 0 if the start is 0 - last_committed_seq = seq > 0 ? seq - 1 : seq; - if (last_committed_seq < fs_op_seq) { - dout(2) << "open advancing committed_seq " << last_committed_seq - << " to fs op_seq " << fs_op_seq << dendl; - last_committed_seq = fs_op_seq; - } - - while (1) { - bufferlist bl; - off64_t old_pos = read_pos; - if (!read_entry(bl, seq)) { - dout(10) << "open reached end of journal." 
<< dendl; - break; - } - if (seq > next_seq) { - dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq - << ", ignoring journal contents" - << dendl; - read_pos = -1; - last_committed_seq = 0; - seq = 0; - return 0; - } - if (seq == next_seq) { - dout(10) << "open reached seq " << seq << dendl; - read_pos = old_pos; - break; - } - seq++; // next event should follow. - } - - return 0; -} - -void FileJournal::_close(int fd) const -{ - VOID_TEMP_FAILURE_RETRY(::close(fd)); -} - -void FileJournal::close() -{ - dout(1) << "close " << fn << dendl; - - // stop writer thread - stop_writer(); - - // close - assert(writeq_empty()); - assert(!must_write_header); - assert(fd >= 0); - _close(fd); - fd = -1; -} - - -int FileJournal::dump(ostream& out) -{ - return _dump(out, false); -} - -int FileJournal::simple_dump(ostream& out) -{ - return _dump(out, true); -} - -int FileJournal::_dump(ostream& out, bool simple) -{ - JSONFormatter f(true); - int ret = _fdump(f, simple); - f.flush(out); - return ret; -} - -int FileJournal::_fdump(Formatter &f, bool simple) -{ - dout(10) << "_fdump" << dendl; - - assert(fd == -1); - int err = _open(false, false); - if (err) - return err; - - err = read_header(&header); - if (err < 0) { - close(); - return err; - } - - off64_t next_pos = header.start; - - f.open_object_section("journal"); - - f.open_object_section("header"); - f.dump_unsigned("flags", header.flags); - ostringstream os; - os << header.fsid; - f.dump_string("fsid", os.str()); - f.dump_unsigned("block_size", header.block_size); - f.dump_unsigned("alignment", header.alignment); - f.dump_int("max_size", header.max_size); - f.dump_int("start", header.start); - f.dump_unsigned("committed_up_to", header.committed_up_to); - f.dump_unsigned("start_seq", header.start_seq); - f.close_section(); - - f.open_array_section("entries"); - uint64_t seq = header.start_seq; - while (1) { - bufferlist bl; - off64_t pos = next_pos; - - if (!pos) { - dout(2) << "_dump 
-- not readable" << dendl; - err = -EINVAL; - break; - } - stringstream ss; - read_entry_result result = do_read_entry( - pos, - &next_pos, - &bl, - &seq, - &ss); - if (result != SUCCESS) { - if (seq < header.committed_up_to) { - dout(2) << "Unable to read past sequence " << seq - << " but header indicates the journal has committed up through " - << header.committed_up_to << ", journal is corrupt" << dendl; - err = -EINVAL; - } - dout(25) << ss.str() << dendl; - dout(25) << "No further valid entries found, journal is most likely valid" - << dendl; - break; - } - - f.open_object_section("entry"); - f.dump_unsigned("offset", pos); - f.dump_unsigned("seq", seq); - if (simple) { - f.dump_unsigned("bl.length", bl.length()); - } else { - f.open_array_section("transactions"); - bufferlist::iterator p = bl.begin(); - int trans_num = 0; - while (!p.end()) { - ObjectStore::Transaction t(p); - f.open_object_section("transaction"); - f.dump_unsigned("trans_num", trans_num); - t.dump(&f); - f.close_section(); - trans_num++; - } - f.close_section(); - } - f.close_section(); - } - - f.close_section(); - f.close_section(); - dout(10) << "dump finish" << dendl; - - close(); - return err; -} - - -void FileJournal::start_writer() -{ - write_stop = false; - aio_stop = false; - write_thread.create(); -#ifdef HAVE_LIBAIO - if (aio) - write_finish_thread.create(); -#endif -} - -void FileJournal::stop_writer() -{ - // Do nothing if writer already stopped or never started - if (!write_stop) - { - { - Mutex::Locker l(write_lock); - Mutex::Locker p(writeq_lock); - write_stop = true; - writeq_cond.Signal(); - // Doesn't hurt to signal commit_cond in case thread is waiting there - // and caller didn't use committed_thru() first. 
- commit_cond.Signal(); - } - write_thread.join(); - - // write journal header now so that we have less to replay on remount - write_header_sync(); - } - -#ifdef HAVE_LIBAIO - // stop aio completeion thread *after* writer thread has stopped - // and has submitted all of its io - if (aio && !aio_stop) { - aio_lock.Lock(); - aio_stop = true; - aio_cond.Signal(); - write_finish_cond.Signal(); - aio_lock.Unlock(); - write_finish_thread.join(); - } -#endif -} - - - -void FileJournal::print_header(const header_t &header) const -{ - dout(10) << "header: block_size " << header.block_size - << " alignment " << header.alignment - << " max_size " << header.max_size - << dendl; - dout(10) << "header: start " << header.start << dendl; - dout(10) << " write_pos " << write_pos << dendl; -} - -int FileJournal::read_header(header_t *hdr) const -{ - dout(10) << "read_header" << dendl; - bufferlist bl; - - buffer::ptr bp = buffer::create_page_aligned(block_size); - char* bpdata = bp.c_str(); - int r = ::pread(fd, bpdata, bp.length(), 0); - - if (r < 0) { - int err = errno; - dout(0) << "read_header got " << cpp_strerror(err) << dendl; - return -err; - } - - // don't use bp.zero() here, because it also invalidates - // crc cache (which is not yet populated anyway) - if (bp.length() != (size_t)r) { - // r will be always less or equal than bp.length - bpdata += r; - memset(bpdata, 0, bp.length() - r); - } - - bl.push_back(bp); - - try { - bufferlist::iterator p = bl.begin(); - ::decode(*hdr, p); - } - catch (buffer::error& e) { - derr << "read_header error decoding journal header" << dendl; - return -EINVAL; - } - - - /* - * Unfortunately we weren't initializing the flags field for new - * journals! Aie. This is safe(ish) now that we have only one - * flag. Probably around when we add the next flag we need to - * remove this or else this (eventually old) code will clobber newer - * code's flags. 
- */ - if (hdr->flags > 3) { - derr << "read_header appears to have gibberish flags; assuming 0" << dendl; - hdr->flags = 0; - } - - print_header(*hdr); - - return 0; -} - -bufferptr FileJournal::prepare_header() -{ - bufferlist bl; - { - Mutex::Locker l(finisher_lock); - header.committed_up_to = journaled_seq; - } - ::encode(header, bl); - bufferptr bp = buffer::create_page_aligned(get_top()); - // don't use bp.zero() here, because it also invalidates - // crc cache (which is not yet populated anyway) - char* data = bp.c_str(); - memcpy(data, bl.c_str(), bl.length()); - data += bl.length(); - memset(data, 0, bp.length()-bl.length()); - return bp; -} - -void FileJournal::write_header_sync() -{ - Mutex::Locker locker(write_lock); - must_write_header = true; - bufferlist bl; - do_write(bl); - dout(20) << __func__ << " finish" << dendl; -} - -int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size) -{ - // already full? - if (full_state != FULL_NOTFULL) - return -ENOSPC; - - // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL. 
- off64_t room; - if (pos >= header.start) - room = (header.max_size - pos) + (header.start - get_top()) - 1; - else - room = header.start - pos - 1; - dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start - << " top " << get_top() << dendl; - - if (do_sync_cond) { - if (room >= (header.max_size >> 1) && - room - size < (header.max_size >> 1)) { - dout(10) << " passing half full mark, triggering commit" << dendl; - do_sync_cond->SloppySignal(); // initiate a real commit so we can trim - } - } - - if (room >= size) { - dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl; - if (pos + size > header.max_size) - must_write_header = true; - return 0; - } - - // full - dout(1) << "check_for_full at " << pos << " : JOURNAL FULL " - << pos << " >= " << room - << " (max_size " << header.max_size << " start " << header.start << ")" - << dendl; - - off64_t max = header.max_size - get_top(); - if (size > max) - dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl; - - return -ENOSPC; -} - -int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes) -{ - // gather queued writes - off64_t queue_pos = write_pos; - - int eleft = g_conf->journal_max_write_entries; - unsigned bmax = g_conf->journal_max_write_bytes; - - if (full_state != FULL_NOTFULL) - return -ENOSPC; - - while (!writeq_empty()) { - list items; - batch_pop_write(items); - list::iterator it = items.begin(); - while (it != items.end()) { - int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes); - if (r == 0) { // prepare ok, delete it - items.erase(it++); - } - if (r == -ENOSPC) { - // the journal maybe full, insert the left item to writeq - batch_unpop_write(items); - if (orig_ops) - goto out; // commit what we have - - if (logger) - logger->inc(l_os_j_full); - - if (wait_on_full) { - dout(20) << "prepare_multi_write 
full on first entry, need to wait" << dendl; - } else { - dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl; - - // throw out what we have so far - full_state = FULL_FULL; - while (!writeq_empty()) { - put_throttle(1, peek_write().orig_len); - pop_write(); - } - print_header(header); - } - - return -ENOSPC; // hrm, full on first op - } - if (eleft) { - if (--eleft == 0) { - dout(20) << "prepare_multi_write hit max events per write " << g_conf->journal_max_write_entries << dendl; - batch_unpop_write(items); - goto out; - } - } - if (bmax) { - if (bl.length() >= bmax) { - dout(20) << "prepare_multi_write hit max write size " << g_conf->journal_max_write_bytes << dendl; - batch_unpop_write(items); - goto out; - } - } - } - } - -out: - dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl; - assert((write_pos + bl.length() == queue_pos) || - (write_pos + bl.length() - header.max_size + get_top() == queue_pos)); - return 0; -} - -/* -void FileJournal::queue_write_fin(uint64_t seq, Context *fin) -{ - writing_seq.push_back(seq); - if (!waiting_for_notfull.empty()) { - // make sure previously unjournaled stuff waiting for UNFULL triggers - // _before_ newly journaled stuff does - dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin - << " until after UNFULL" << dendl; - C_Gather *g = new C_Gather(writeq.front().fin); - writing_fin.push_back(g->new_sub()); - waiting_for_notfull.push_back(g->new_sub()); - } else { - writing_fin.push_back(writeq.front().fin); - dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl; - } -} -*/ - -void FileJournal::queue_completions_thru(uint64_t seq) -{ - assert(finisher_lock.is_locked()); - utime_t now = ceph_clock_now(g_ceph_context); - list items; - batch_pop_completions(items); - list::iterator it = items.begin(); - while (it != items.end()) { - completion_item& next = *it; - if (next.seq > seq) - break; - utime_t lat = now; - lat -= next.start; - 
dout(10) << "queue_completions_thru seq " << seq - << " queueing seq " << next.seq - << " " << next.finish - << " lat " << lat << dendl; - if (logger) { - logger->tinc(l_os_j_lat, lat); - } - if (next.finish) - finisher->queue(next.finish); - if (next.tracked_op) - next.tracked_op->mark_event("journaled_completion_queued"); - items.erase(it++); - } - batch_unpop_completions(items); - finisher_cond.Signal(); -} - - -int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes) -{ - uint64_t seq = next_write.seq; - bufferlist &ebl = next_write.bl; - off64_t size = ebl.length(); - - int r = check_for_full(seq, queue_pos, size); - if (r < 0) - return r; // ENOSPC or EAGAIN - - uint32_t orig_len = next_write.orig_len; - orig_bytes += orig_len; - orig_ops++; - - // add to write buffer - dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq - << " len " << orig_len << " -> " << size << dendl; - - unsigned seq_offset = offsetof(entry_header_t, seq); - unsigned magic1_offset = offsetof(entry_header_t, magic1); - unsigned magic2_offset = offsetof(entry_header_t, magic2); - - bufferptr headerptr = ebl.buffers().front(); - uint64_t _seq = seq; - uint64_t _queue_pos = queue_pos; - uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64()); - headerptr.copy_in(seq_offset, sizeof(uint64_t), (char *)&_seq); - headerptr.copy_in(magic1_offset, sizeof(uint64_t), (char *)&_queue_pos); - headerptr.copy_in(magic2_offset, sizeof(uint64_t), (char *)&magic2); - - bufferptr footerptr = ebl.buffers().back(); - unsigned post_offset = footerptr.length() - sizeof(entry_header_t); - footerptr.copy_in(post_offset + seq_offset, sizeof(uint64_t), (char *)&_seq); - footerptr.copy_in(post_offset + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos); - footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2); - - 
bl.claim_append(ebl); - if (next_write.tracked_op) - next_write.tracked_op->mark_event("write_thread_in_journal_buffer"); - - journalq.push_back(pair(seq, queue_pos)); - writing_seq = seq; - - queue_pos += size; - if (queue_pos >= header.max_size) - queue_pos = queue_pos + get_top() - header.max_size; - - return 0; -} - -void FileJournal::align_bl(off64_t pos, bufferlist& bl) -{ - // make sure list segments are page aligned - if (directio && (!bl.is_aligned(block_size) || - !bl.is_n_align_sized(CEPH_MINIMUM_BLOCK_SIZE))) { - assert(0 == "bl should be align"); - if ((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0 || - (pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0) - dout(0) << "rebuild_page_aligned failed, " << bl << dendl; - assert((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0); - assert((pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0); - } -} - -int FileJournal::write_bl(off64_t& pos, bufferlist& bl) -{ - int ret; - - off64_t spos = ::lseek64(fd, pos, SEEK_SET); - if (spos < 0) { - ret = -errno; - derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(ret) << dendl; - return ret; - } - ret = bl.write_fd(fd); - if (ret) { - derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(ret) << dendl; - return ret; - } - pos += bl.length(); - if (pos == header.max_size) - pos = get_top(); - return 0; -} - -void FileJournal::do_write(bufferlist& bl) -{ - // nothing to do? - if (bl.length() == 0 && !must_write_header) - return; - - buffer::ptr hbp; - if (g_conf->journal_write_header_frequency && - (((++journaled_since_start) % - g_conf->journal_write_header_frequency) == 0)) { - must_write_header = true; - } - - if (must_write_header) { - must_write_header = false; - hbp = prepare_header(); - } - - dout(15) << "do_write writing " << write_pos << "~" << bl.length() - << (hbp.length() ? 
" + header":"") - << dendl; - - utime_t from = ceph_clock_now(g_ceph_context); - - // entry - off64_t pos = write_pos; - - // Adjust write_pos - align_bl(pos, bl); - write_pos += bl.length(); - if (write_pos >= header.max_size) - write_pos = write_pos - header.max_size + get_top(); - - write_lock.Unlock(); - - // split? - off64_t split = 0; - if (pos + bl.length() > header.max_size) { - bufferlist first, second; - split = header.max_size - pos; - first.substr_of(bl, 0, split); - second.substr_of(bl, split, bl.length() - split); - assert(first.length() + second.length() == bl.length()); - dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length() - << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl; - - //Save pos to write first piece second - off64_t first_pos = pos; - off64_t orig_pos; - pos = get_top(); - // header too? - if (hbp.length()) { - // be sneaky: include the header in the second fragment - second.push_front(hbp); - pos = 0; // we included the header - } - // Write the second portion first possible with the header, so - // do_read_entry() won't even get a valid entry_header_t if there - // is a crash between the two writes. - orig_pos = pos; - if (write_bl(pos, second)) { - derr << "FileJournal::do_write: write_bl(pos=" << orig_pos - << ") failed" << dendl; - ceph_abort(); - } - orig_pos = first_pos; - if (write_bl(first_pos, first)) { - derr << "FileJournal::do_write: write_bl(pos=" << orig_pos - << ") failed" << dendl; - ceph_abort(); - } - assert(first_pos == get_top()); - } else { - // header too? 
- if (hbp.length()) { - if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) { - int err = errno; - derr << "FileJournal::do_write: pwrite(fd=" << fd - << ", hbp.length=" << hbp.length() << ") failed :" - << cpp_strerror(err) << dendl; - ceph_abort(); - } - } - - if (write_bl(pos, bl)) { - derr << "FileJournal::do_write: write_bl(pos=" << pos - << ") failed" << dendl; - ceph_abort(); - } - } - - if (!directio) { - dout(20) << "do_write fsync" << dendl; - - /* - * We'd really love to have a fsync_range or fdatasync_range and do a: - * - * if (split) { - * ::fsync_range(fd, header.max_size - split, split)l - * ::fsync_range(fd, get_top(), bl.length() - split); - * else - * ::fsync_range(fd, write_pos, bl.length()) - * - * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be - * too hard given all the underlying infrastructure already exist. - * - * NOTE: using sync_file_range here would not be safe as it does not - * flush disk caches or commits any sort of metadata. - */ - int ret = 0; -#if defined(DARWIN) || defined(__FreeBSD__) - ret = ::fsync(fd); -#else - ret = ::fdatasync(fd); -#endif - if (ret < 0) { - derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl; - ceph_abort(); - } -#ifdef HAVE_POSIX_FADVISE - if (g_conf->filestore_fadvise) - posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); -#endif - } - - utime_t lat = ceph_clock_now(g_ceph_context) - from; - dout(20) << "do_write latency " << lat << dendl; - - write_lock.Lock(); - - assert(write_pos == pos); - assert(write_pos % header.alignment == 0); - - { - Mutex::Locker locker(finisher_lock); - journaled_seq = writing_seq; - - // kick finisher? - // only if we haven't filled up recently! 
- if (full_state != FULL_NOTFULL) { - dout(10) << "do_write NOT queueing finisher seq " << journaled_seq - << ", full_commit_seq|full_restart_seq" << dendl; - } else { - if (plug_journal_completions) { - dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq - << " due to completion plug" << dendl; - } else { - dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl; - queue_completions_thru(journaled_seq); - } - } - } -} - -void FileJournal::flush() -{ - dout(10) << "waiting for completions to empty" << dendl; - { - Mutex::Locker l(finisher_lock); - while (!completions_empty()) - finisher_cond.Wait(finisher_lock); - } - dout(10) << "flush waiting for finisher" << dendl; - finisher->wait_for_empty(); - dout(10) << "flush done" << dendl; -} - - -void FileJournal::write_thread_entry() -{ - dout(10) << "write_thread_entry start" << dendl; - while (1) { - { - Mutex::Locker locker(writeq_lock); - if (writeq.empty() && !must_write_header) { - if (write_stop) - break; - dout(20) << "write_thread_entry going to sleep" << dendl; - writeq_cond.Wait(writeq_lock); - dout(20) << "write_thread_entry woke up" << dendl; - continue; - } - } - -#ifdef HAVE_LIBAIO - if (aio) { - Mutex::Locker locker(aio_lock); - // should we back off to limit aios in flight? try to do this - // adaptively so that we submit larger aios once we have lots of - // them in flight. - // - // NOTE: our condition here is based on aio_num (protected by - // aio_lock) and throttle_bytes (part of the write queue). when - // we sleep, we *only* wait for aio_num to change, and do not - // wake when more data is queued. this is not strictly correct, - // but should be fine given that we will have plenty of aios in - // flight if we hit this limit to ensure we keep the device - // saturated. 
- while (aio_num > 0) { - int exp = MIN(aio_num * 2, 24); - long unsigned min_new = 1ull << exp; - long unsigned cur = throttle_bytes.get_current(); - dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes - << " ... exp " << exp << " min_new " << min_new - << " ... pending " << cur << dendl; - if (cur >= min_new) - break; - dout(20) << "write_thread_entry deferring until more aios complete: " - << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new - << " bytes to start a new aio (currently " << cur << " pending)" << dendl; - aio_cond.Wait(aio_lock); - dout(20) << "write_thread_entry woke up" << dendl; - } - } -#endif - - Mutex::Locker locker(write_lock); - uint64_t orig_ops = 0; - uint64_t orig_bytes = 0; - - bufferlist bl; - int r = prepare_multi_write(bl, orig_ops, orig_bytes); - // Don't care about journal full if stoppping, so drop queue and - // possibly let header get written and loop above to notice stop - if (r == -ENOSPC) { - if (write_stop) { - dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl; - while (!writeq_empty()) { - put_throttle(1, peek_write().orig_len); - pop_write(); - } - print_header(header); - r = 0; - } else { - dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl; - commit_cond.Wait(write_lock); - dout(20) << "write_thread_entry woke up" << dendl; - continue; - } - } - assert(r == 0); - - if (logger) { - logger->inc(l_os_j_wr); - logger->inc(l_os_j_wr_bytes, bl.length()); - } - -#ifdef HAVE_LIBAIO - if (aio) - do_aio_write(bl); - else - do_write(bl); -#else - do_write(bl); -#endif - put_throttle(orig_ops, orig_bytes); - } - - dout(10) << "write_thread_entry finish" << dendl; -} - -#ifdef HAVE_LIBAIO -void FileJournal::do_aio_write(bufferlist& bl) -{ - - if (g_conf->journal_write_header_frequency && - (((++journaled_since_start) % - g_conf->journal_write_header_frequency) == 0)) { - must_write_header = true; - } - 
- // nothing to do? - if (bl.length() == 0 && !must_write_header) - return; - - buffer::ptr hbp; - if (must_write_header) { - must_write_header = false; - hbp = prepare_header(); - } - - // entry - off64_t pos = write_pos; - - dout(15) << "do_aio_write writing " << pos << "~" << bl.length() - << (hbp.length() ? " + header":"") - << dendl; - - // split? - off64_t split = 0; - if (pos + bl.length() > header.max_size) { - bufferlist first, second; - split = header.max_size - pos; - first.substr_of(bl, 0, split); - second.substr_of(bl, split, bl.length() - split); - assert(first.length() + second.length() == bl.length()); - dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl; - - if (write_aio_bl(pos, first, 0)) { - derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos - << ") failed" << dendl; - ceph_abort(); - } - assert(pos == header.max_size); - if (hbp.length()) { - // be sneaky: include the header in the second fragment - second.push_front(hbp); - pos = 0; // we included the header - } else - pos = get_top(); // no header, start after that - if (write_aio_bl(pos, second, writing_seq)) { - derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos - << ") failed" << dendl; - ceph_abort(); - } - } else { - // header too? - if (hbp.length()) { - bufferlist hbl; - hbl.push_back(hbp); - loff_t pos = 0; - if (write_aio_bl(pos, hbl, 0)) { - derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl; - ceph_abort(); - } - } - - if (write_aio_bl(pos, bl, writing_seq)) { - derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos - << ") failed" << dendl; - ceph_abort(); - } - } - - write_pos = pos; - if (write_pos == header.max_size) - write_pos = get_top(); - assert(write_pos % header.alignment == 0); -} - -/** - * write a buffer using aio - * - * @param seq seq to trigger when this aio completes. if 0, do not update any state - * on completion. 
- */ -int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq) -{ - align_bl(pos, bl); - - dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl; - - while (bl.length() > 0) { - int max = MIN(bl.buffers().size(), IOV_MAX-1); - iovec *iov = new iovec[max]; - int n = 0; - unsigned len = 0; - for (std::list::const_iterator p = bl.buffers().begin(); - n < max; - ++p, ++n) { - assert(p != bl.buffers().end()); - iov[n].iov_base = (void *)p->c_str(); - iov[n].iov_len = p->length(); - len += p->length(); - } - - bufferlist tbl; - bl.splice(0, len, &tbl); // move bytes from bl -> tbl - - // lock only aio_queue, current aio, aio_num, aio_bytes, which may be - // modified in check_aio_completion - aio_lock.Lock(); - aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq)); - aio_info& aio = aio_queue.back(); - aio.iov = iov; - - io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos); - - dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len - << " in " << n << dendl; - - aio_num++; - aio_bytes += aio.len; - - // need to save current aio len to update write_pos later because current - // aio could be ereased from aio_queue once it is done - uint64_t cur_len = aio.len; - // unlock aio_lock because following io_submit might take time to return - aio_lock.Unlock(); - - iocb *piocb = &aio.iocb; - int attempts = 10; - do { - int r = io_submit(aio_ctx, 1, &piocb); - dout(20) << "write_aio_bl io_submit return value: " << r << dendl; - if (r < 0) { - derr << "io_submit to " << aio.off << "~" << cur_len - << " got " << cpp_strerror(r) << dendl; - if (r == -EAGAIN && attempts-- > 0) { - usleep(500); - continue; - } - assert(0 == "io_submit got unexpected error"); - } else { - break; - } - } while (true); - pos += cur_len; - } - aio_lock.Lock(); - write_finish_cond.Signal(); - aio_lock.Unlock(); - return 0; -} -#endif - -void FileJournal::write_finish_thread_entry() -{ -#ifdef HAVE_LIBAIO - dout(10) << "write_finish_thread_entry 
enter" << dendl; - while (true) { - { - Mutex::Locker locker(aio_lock); - if (aio_queue.empty()) { - if (aio_stop) - break; - dout(20) << "write_finish_thread_entry sleeping" << dendl; - write_finish_cond.Wait(aio_lock); - continue; - } - } - - dout(20) << "write_finish_thread_entry waiting for aio(s)" << dendl; - io_event event[16]; - int r = io_getevents(aio_ctx, 1, 16, event, NULL); - if (r < 0) { - if (r == -EINTR) { - dout(0) << "io_getevents got " << cpp_strerror(r) << dendl; - continue; - } - derr << "io_getevents got " << cpp_strerror(r) << dendl; - assert(0 == "got unexpected error from io_getevents"); - } - - { - Mutex::Locker locker(aio_lock); - for (int i=0; ilen) { - derr << "aio to " << ai->off << "~" << ai->len - << " wrote " << event[i].res << dendl; - assert(0 == "unexpected aio error"); - } - dout(10) << "write_finish_thread_entry aio " << ai->off - << "~" << ai->len << " done" << dendl; - ai->done = true; - } - check_aio_completion(); - } - } - dout(10) << "write_finish_thread_entry exit" << dendl; -#endif -} - -#ifdef HAVE_LIBAIO -/** - * check aio_wait for completed aio, and update state appropriately. - */ -void FileJournal::check_aio_completion() -{ - assert(aio_lock.is_locked()); - dout(20) << "check_aio_completion" << dendl; - - bool completed_something = false, signal = false; - uint64_t new_journaled_seq = 0; - - list::iterator p = aio_queue.begin(); - while (p != aio_queue.end() && p->done) { - dout(20) << "check_aio_completion completed seq " << p->seq << " " - << p->off << "~" << p->len << dendl; - if (p->seq) { - new_journaled_seq = p->seq; - completed_something = true; - } - aio_num--; - aio_bytes -= p->len; - aio_queue.erase(p++); - signal = true; - } - - if (completed_something) { - // kick finisher? - // only if we haven't filled up recently! 
- Mutex::Locker locker(finisher_lock); - journaled_seq = new_journaled_seq; - if (full_state != FULL_NOTFULL) { - dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq - << ", full_commit_seq|full_restart_seq" << dendl; - } else { - if (plug_journal_completions) { - dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq - << " due to completion plug" << dendl; - } else { - dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl; - queue_completions_thru(journaled_seq); - } - } - } - if (signal) { - // maybe write queue was waiting for aio count to drop? - aio_cond.Signal(); - } -} -#endif - -int FileJournal::prepare_entry(list& tls, bufferlist* tbl) { - dout(10) << "prepare_entry " << tls << dendl; - unsigned data_len = 0; - int data_align = -1; // -1 indicates that we don't care about the alignment - bufferlist bl; - for (list::iterator p = tls.begin(); - p != tls.end(); ++p) { - ObjectStore::Transaction *t = *p; - if (t->get_data_length() > data_len && - (int)t->get_data_length() >= g_conf->journal_align_min_size) { - data_len = t->get_data_length(); - data_align = (t->get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK; - } - ::encode(*t, bl); - } - if (tbl->length()) { - bl.claim_append(*tbl); - } - // add it this entry - entry_header_t h; - unsigned head_size = sizeof(entry_header_t); - off64_t base_size = 2*head_size + bl.length(); - memset(&h, 0, sizeof(h)); - if (data_align >= 0) - h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK; - off64_t size = ROUND_UP_TO(base_size + h.pre_pad, header.alignment); - unsigned post_pad = size - base_size - h.pre_pad; - h.len = bl.length(); - h.post_pad = post_pad; - h.crc32c = bl.crc32c(0); - dout(10) << " len " << bl.length() << " -> " << size - << " (head " << head_size << " pre_pad " << h.pre_pad - << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")" - << " 
(bl alignment " << data_align << ")" - << dendl; - bufferlist ebl; - // header - ebl.append((const char*)&h, sizeof(h)); - if (h.pre_pad) { - ebl.push_back(buffer::create_static(h.pre_pad, zero_buf)); - } - // payload - ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy - if (h.post_pad) { - ebl.push_back(buffer::create_static(h.post_pad, zero_buf)); - } - // footer - ebl.append((const char*)&h, sizeof(h)); - ebl.rebuild_aligned(CEPH_MINIMUM_BLOCK_SIZE); - tbl->claim(ebl); - return h.len; -} - -void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len, - Context *oncommit, TrackedOpRef osd_op) -{ - // dump on queue - dout(5) << "submit_entry seq " << seq - << " len " << e.length() - << " (" << oncommit << ")" << dendl; - assert(e.length() > 0); - - throttle_ops.take(1); - throttle_bytes.take(orig_len); - if (osd_op) - osd_op->mark_event("commit_queued_for_journal_write"); - if (logger) { - logger->set(l_os_jq_max_ops, throttle_ops.get_max()); - logger->set(l_os_jq_max_bytes, throttle_bytes.get_max()); - logger->set(l_os_jq_ops, throttle_ops.get_current()); - logger->set(l_os_jq_bytes, throttle_bytes.get_current()); - } - - { - Mutex::Locker l1(writeq_lock); // ** lock ** - Mutex::Locker l2(completions_lock); // ** lock ** - completions.push_back( - completion_item( - seq, oncommit, ceph_clock_now(g_ceph_context), osd_op)); - if (writeq.empty()) - writeq_cond.Signal(); - writeq.push_back(write_item(seq, e, orig_len, osd_op)); - } -} - -bool FileJournal::writeq_empty() -{ - Mutex::Locker locker(writeq_lock); - return writeq.empty(); -} - -FileJournal::write_item &FileJournal::peek_write() -{ - assert(write_lock.is_locked()); - Mutex::Locker locker(writeq_lock); - return writeq.front(); -} - -void FileJournal::pop_write() -{ - assert(write_lock.is_locked()); - Mutex::Locker locker(writeq_lock); - writeq.pop_front(); -} - -void FileJournal::batch_pop_write(list &items) -{ - assert(write_lock.is_locked()); - 
Mutex::Locker locker(writeq_lock); - writeq.swap(items); -} - -void FileJournal::batch_unpop_write(list &items) -{ - assert(write_lock.is_locked()); - Mutex::Locker locker(writeq_lock); - writeq.splice(writeq.begin(), items); -} - -void FileJournal::commit_start(uint64_t seq) -{ - dout(10) << "commit_start" << dendl; - - // was full? - switch (full_state) { - case FULL_NOTFULL: - break; // all good - - case FULL_FULL: - if (seq >= journaled_seq) { - dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq " - << seq << " > journaled_seq " << journaled_seq - << ", moving to FULL_WAIT." - << dendl; - full_state = FULL_WAIT; - } else { - dout(1) << "FULL_FULL commit_start on seq " - << seq << " < journaled_seq " << journaled_seq - << ", remaining in FULL_FULL" - << dendl; - } - break; - - case FULL_WAIT: - dout(1) << " FULL_WAIT -> FULL_NOTFULL. journal now active, setting completion plug." << dendl; - full_state = FULL_NOTFULL; - plug_journal_completions = true; - break; - } -} - -/* - *send discard command to joural block deivce - */ -void FileJournal::do_discard(int64_t offset, int64_t end) -{ - dout(10) << __func__ << "trim(" << offset << ", " << end << dendl; - - offset = ROUND_UP_TO(offset, block_size); - if (offset >= end) - return; - end = ROUND_UP_TO(end - block_size, block_size); - assert(end >= offset); - if (offset < end) - if (block_device_discard(fd, offset, end - offset) < 0) - dout(1) << __func__ << "ioctl(BLKDISCARD) error:" << cpp_strerror(errno) << dendl; -} - -void FileJournal::committed_thru(uint64_t seq) -{ - Mutex::Locker locker(write_lock); - - if (seq < last_committed_seq) { - dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl; - assert(seq >= last_committed_seq); - return; - } - if (seq == last_committed_seq) { - dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl; - return; - } - - dout(5) << "committed_thru " << seq << " (last_committed_seq " << 
last_committed_seq << ")" << dendl; - last_committed_seq = seq; - - // completions! - { - Mutex::Locker locker(finisher_lock); - queue_completions_thru(seq); - if (plug_journal_completions && seq >= header.start_seq) { - dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl; - plug_journal_completions = false; - queue_completions_thru(journaled_seq); - } - } - - // adjust start pointer - while (!journalq.empty() && journalq.front().first <= seq) { - journalq.pop_front(); - } - - int64_t old_start = header.start; - if (!journalq.empty()) { - header.start = journalq.front().second; - header.start_seq = journalq.front().first; - } else { - header.start = write_pos; - header.start_seq = seq + 1; - } - - if (discard) { - dout(10) << __func__ << " will trim (" << old_start << ", " << header.start << ")" << dendl; - if (old_start < header.start) - do_discard(old_start, header.start - 1); - else { - do_discard(old_start, header.max_size - 1); - do_discard(get_top(), header.start - 1); - } - } - - must_write_header = true; - print_header(header); - - // committed but unjournaled items - while (!writeq_empty() && peek_write().seq <= seq) { - dout(15) << " dropping committed but unwritten seq " << peek_write().seq - << " len " << peek_write().bl.length() - << dendl; - put_throttle(1, peek_write().orig_len); - pop_write(); - } - - commit_cond.Signal(); - - dout(10) << "committed_thru done" << dendl; -} - - -void FileJournal::put_throttle(uint64_t ops, uint64_t bytes) -{ - uint64_t new_ops = throttle_ops.put(ops); - uint64_t new_bytes = throttle_bytes.put(bytes); - dout(5) << "put_throttle finished " << ops << " ops and " - << bytes << " bytes, now " - << new_ops << " ops and " << new_bytes << " bytes" - << dendl; - - if (logger) { - logger->inc(l_os_j_ops, ops); - logger->inc(l_os_j_bytes, bytes); - logger->set(l_os_jq_ops, new_ops); - logger->set(l_os_jq_bytes, new_bytes); - logger->set(l_os_jq_max_ops, 
throttle_ops.get_max()); - logger->set(l_os_jq_max_bytes, throttle_bytes.get_max()); - } -} - -int FileJournal::make_writeable() -{ - dout(10) << __func__ << dendl; - int r = _open(true); - if (r < 0) - return r; - - if (read_pos > 0) - write_pos = read_pos; - else - write_pos = get_top(); - read_pos = 0; - - must_write_header = true; - start_writer(); - return 0; -} - -void FileJournal::wrap_read_bl( - off64_t pos, - int64_t olen, - bufferlist* bl, - off64_t *out_pos - ) const -{ - while (olen > 0) { - while (pos >= header.max_size) - pos = pos + get_top() - header.max_size; - - int64_t len; - if (pos + olen > header.max_size) - len = header.max_size - pos; // partial - else - len = olen; // rest - - int64_t actual = ::lseek64(fd, pos, SEEK_SET); - assert(actual == pos); - - bufferptr bp = buffer::create(len); - int r = safe_read_exact(fd, bp.c_str(), len); - if (r) { - derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << len << " returned " - << r << dendl; - ceph_abort(); - } - bl->push_back(bp); - pos += len; - olen -= len; - } - if (pos >= header.max_size) - pos = pos + get_top() - header.max_size; - if (out_pos) - *out_pos = pos; -} - -bool FileJournal::read_entry( - bufferlist &bl, - uint64_t &next_seq, - bool *corrupt) -{ - if (corrupt) - *corrupt = false; - uint64_t seq = next_seq; - - if (!read_pos) { - dout(2) << "read_entry -- not readable" << dendl; - return false; - } - - off64_t pos = read_pos; - off64_t next_pos = pos; - stringstream ss; - read_entry_result result = do_read_entry( - pos, - &next_pos, - &bl, - &seq, - &ss); - if (result == SUCCESS) { - journalq.push_back( pair(seq, pos)); - if (next_seq > seq) { - return false; - } else { - read_pos = next_pos; - next_seq = seq; - if (seq > journaled_seq) - journaled_seq = seq; - return true; - } - } - - if (seq && seq < header.committed_up_to) { - derr << "Unable to read past sequence " << seq - << " but header indicates the journal has committed up through " - << 
header.committed_up_to << ", journal is corrupt" << dendl; - if (g_conf->journal_ignore_corruption) { - if (corrupt) - *corrupt = true; - return false; - } else { - assert(0); - } - } - - dout(25) << ss.str() << dendl; - dout(2) << "No further valid entries found, journal is most likely valid" - << dendl; - return false; -} - -FileJournal::read_entry_result FileJournal::do_read_entry( - off64_t init_pos, - off64_t *next_pos, - bufferlist *bl, - uint64_t *seq, - ostream *ss, - entry_header_t *_h) const -{ - off64_t cur_pos = init_pos; - bufferlist _bl; - if (!bl) - bl = &_bl; - - // header - entry_header_t *h; - bufferlist hbl; - off64_t _next_pos; - wrap_read_bl(cur_pos, sizeof(*h), &hbl, &_next_pos); - h = reinterpret_cast(hbl.c_str()); - - if (!h->check_magic(cur_pos, header.get_fsid64())) { - dout(25) << "read_entry " << init_pos - << " : bad header magic, end of journal" << dendl; - if (ss) - *ss << "bad header magic"; - if (next_pos) - *next_pos = init_pos + (4<<10); // check 4k ahead - return MAYBE_CORRUPT; - } - cur_pos = _next_pos; - - // pad + body + pad - if (h->pre_pad) - cur_pos += h->pre_pad; - - bl->clear(); - wrap_read_bl(cur_pos, h->len, bl, &cur_pos); - - if (h->post_pad) - cur_pos += h->post_pad; - - // footer - entry_header_t *f; - bufferlist fbl; - wrap_read_bl(cur_pos, sizeof(*f), &fbl, &cur_pos); - f = reinterpret_cast(fbl.c_str()); - if (memcmp(f, h, sizeof(*f))) { - if (ss) - *ss << "bad footer magic, partial entry"; - if (next_pos) - *next_pos = cur_pos; - return MAYBE_CORRUPT; - } - - if ((header.flags & header_t::FLAG_CRC) || // if explicitly enabled (new journal) - h->crc32c != 0) { // newer entry in old journal - uint32_t actual_crc = bl->crc32c(0); - if (actual_crc != h->crc32c) { - if (ss) - *ss << "header crc (" << h->crc32c - << ") doesn't match body crc (" << actual_crc << ")"; - if (next_pos) - *next_pos = cur_pos; - return MAYBE_CORRUPT; - } - } - - // yay! 
- dout(2) << "read_entry " << init_pos << " : seq " << h->seq - << " " << h->len << " bytes" - << dendl; - - // ok! - if (seq) - *seq = h->seq; - - - if (next_pos) - *next_pos = cur_pos; - - if (_h) - *_h = *h; - - assert(cur_pos % header.alignment == 0); - return SUCCESS; -} - -void FileJournal::throttle() -{ - if (throttle_ops.wait(g_conf->journal_queue_max_ops)) - dout(2) << "throttle: waited for ops" << dendl; - if (throttle_bytes.wait(g_conf->journal_queue_max_bytes)) - dout(2) << "throttle: waited for bytes" << dendl; -} - -void FileJournal::get_header( - uint64_t wanted_seq, - off64_t *_pos, - entry_header_t *h) -{ - off64_t pos = header.start; - off64_t next_pos = pos; - bufferlist bl; - uint64_t seq = 0; - dout(2) << __func__ << dendl; - while (1) { - bl.clear(); - pos = next_pos; - read_entry_result result = do_read_entry( - pos, - &next_pos, - &bl, - &seq, - 0, - h); - if (result == FAILURE || result == MAYBE_CORRUPT) - assert(0); - if (seq == wanted_seq) { - if (_pos) - *_pos = pos; - return; - } - } - assert(0); // not reachable -} - -void FileJournal::corrupt( - int wfd, - off64_t corrupt_at) -{ - dout(2) << __func__ << dendl; - if (corrupt_at >= header.max_size) - corrupt_at = corrupt_at + get_top() - header.max_size; - - int64_t actual = ::lseek64(fd, corrupt_at, SEEK_SET); - assert(actual == corrupt_at); - - char buf[10]; - int r = safe_read_exact(fd, buf, 1); - assert(r == 0); - - actual = ::lseek64(wfd, corrupt_at, SEEK_SET); - assert(actual == corrupt_at); - - buf[0]++; - r = safe_write(wfd, buf, 1); - assert(r == 0); -} - -void FileJournal::corrupt_payload( - int wfd, - uint64_t seq) -{ - dout(2) << __func__ << dendl; - off64_t pos = 0; - entry_header_t h; - get_header(seq, &pos, &h); - off64_t corrupt_at = - pos + sizeof(entry_header_t) + h.pre_pad; - corrupt(wfd, corrupt_at); -} - - -void FileJournal::corrupt_footer_magic( - int wfd, - uint64_t seq) -{ - dout(2) << __func__ << dendl; - off64_t pos = 0; - entry_header_t h; - get_header(seq, 
&pos, &h); - off64_t corrupt_at = - pos + sizeof(entry_header_t) + h.pre_pad + - h.len + h.post_pad + - (reinterpret_cast(&h.magic2) - reinterpret_cast(&h)); - corrupt(wfd, corrupt_at); -} - - -void FileJournal::corrupt_header_magic( - int wfd, - uint64_t seq) -{ - dout(2) << __func__ << dendl; - off64_t pos = 0; - entry_header_t h; - get_header(seq, &pos, &h); - off64_t corrupt_at = - pos + - (reinterpret_cast(&h.magic2) - reinterpret_cast(&h)); - corrupt(wfd, corrupt_at); -} diff --git a/src/os/FileJournal.h b/src/os/FileJournal.h deleted file mode 100644 index 602a03480575..000000000000 --- a/src/os/FileJournal.h +++ /dev/null @@ -1,516 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef CEPH_FILEJOURNAL_H -#define CEPH_FILEJOURNAL_H - -#include -using std::deque; - -#include "Journal.h" -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Thread.h" -#include "common/Throttle.h" - -#ifdef HAVE_LIBAIO -# include -#endif - -/** - * Implements journaling on top of block device or file. 
- * - * Lock ordering is write_lock > aio_lock > finisher_lock - */ -class FileJournal : public Journal { -public: - /// Protected by finisher_lock - struct completion_item { - uint64_t seq; - Context *finish; - utime_t start; - TrackedOpRef tracked_op; - completion_item(uint64_t o, Context *c, utime_t s, - TrackedOpRef opref) - : seq(o), finish(c), start(s), tracked_op(opref) {} - completion_item() : seq(0), finish(0), start(0) {} - }; - struct write_item { - uint64_t seq; - bufferlist bl; - uint32_t orig_len; - TrackedOpRef tracked_op; - write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) : - seq(s), orig_len(ol), tracked_op(opref) { - bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy - } - write_item() : seq(0), orig_len(0) {} - }; - - Mutex finisher_lock; - Cond finisher_cond; - uint64_t journaled_seq; - bool plug_journal_completions; - - Mutex writeq_lock; - Cond writeq_cond; - list writeq; - bool writeq_empty(); - write_item &peek_write(); - void pop_write(); - void batch_pop_write(list &items); - void batch_unpop_write(list &items); - - Mutex completions_lock; - list completions; - bool completions_empty() { - Mutex::Locker l(completions_lock); - return completions.empty(); - } - void batch_pop_completions(list &items) { - Mutex::Locker l(completions_lock); - completions.swap(items); - } - void batch_unpop_completions(list &items) { - Mutex::Locker l(completions_lock); - completions.splice(completions.begin(), items); - } - completion_item completion_peek_front() { - Mutex::Locker l(completions_lock); - assert(!completions.empty()); - return completions.front(); - } - void completion_pop_front() { - Mutex::Locker l(completions_lock); - assert(!completions.empty()); - completions.pop_front(); - } - - int prepare_entry(list& tls, bufferlist* tbl); - - void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len, - Context *oncommit, - TrackedOpRef osd_op = TrackedOpRef()); - /// End protected by finisher_lock - - /* 
- * journal header - */ - struct header_t { - enum { - FLAG_CRC = (1<<0), - // NOTE: remove kludgey weirdness in read_header() next time a flag is added. - }; - - uint64_t flags; - uuid_d fsid; - __u32 block_size; - __u32 alignment; - int64_t max_size; // max size of journal ring buffer - int64_t start; // offset of first entry - uint64_t committed_up_to; // committed up to - - /** - * start_seq - * - * entry at header.start has sequence >= start_seq - * - * Generally, the entry at header.start will have sequence - * start_seq if it exists. The only exception is immediately - * after journal creation since the first sequence number is - * not known. - * - * If the first read on open fails, we can assume corruption - * if start_seq > committed_up_thru because the entry would have - * a sequence >= start_seq and therefore > committed_up_thru. - */ - uint64_t start_seq; - - header_t() : - flags(0), block_size(0), alignment(0), max_size(0), start(0), - committed_up_to(0), start_seq(0) {} - - void clear() { - start = block_size; - } - - uint64_t get_fsid64() const { - return *(uint64_t*)fsid.bytes(); - } - - void encode(bufferlist& bl) const { - __u32 v = 4; - ::encode(v, bl); - bufferlist em; - { - ::encode(flags, em); - ::encode(fsid, em); - ::encode(block_size, em); - ::encode(alignment, em); - ::encode(max_size, em); - ::encode(start, em); - ::encode(committed_up_to, em); - ::encode(start_seq, em); - } - ::encode(em, bl); - } - void decode(bufferlist::iterator& bl) { - __u32 v; - ::decode(v, bl); - if (v < 2) { // normally 0, but concievably 1 - // decode old header_t struct (pre v0.40). 
- bl.advance(4); // skip __u32 flags (it was unused by any old code) - flags = 0; - uint64_t tfsid; - ::decode(tfsid, bl); - *(uint64_t*)&fsid.bytes()[0] = tfsid; - *(uint64_t*)&fsid.bytes()[8] = tfsid; - ::decode(block_size, bl); - ::decode(alignment, bl); - ::decode(max_size, bl); - ::decode(start, bl); - committed_up_to = 0; - start_seq = 0; - return; - } - bufferlist em; - ::decode(em, bl); - bufferlist::iterator t = em.begin(); - ::decode(flags, t); - ::decode(fsid, t); - ::decode(block_size, t); - ::decode(alignment, t); - ::decode(max_size, t); - ::decode(start, t); - - if (v > 2) - ::decode(committed_up_to, t); - else - committed_up_to = 0; - - if (v > 3) - ::decode(start_seq, t); - else - start_seq = 0; - } - } header; - - struct entry_header_t { - uint64_t seq; // fs op seq # - uint32_t crc32c; // payload only. not header, pre_pad, post_pad, or footer. - uint32_t len; - uint32_t pre_pad, post_pad; - uint64_t magic1; - uint64_t magic2; - - static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) { - return (fsid ^ seq ^ len); - } - bool check_magic(off64_t pos, uint64_t fsid) { - return - magic1 == (uint64_t)pos && - magic2 == (fsid ^ seq ^ len); - } - } __attribute__((__packed__, aligned(4))); - - bool journalq_empty() { return journalq.empty(); } - -private: - string fn; - - char *zero_buf; - off64_t max_size; - size_t block_size; - bool directio, aio, force_aio; - bool must_write_header; - off64_t write_pos; // byte where the next entry to be written will go - off64_t read_pos; // - bool discard; //for block journal whether support discard - -#ifdef HAVE_LIBAIO - /// state associated with an in-flight aio request - /// Protected by aio_lock - struct aio_info { - struct iocb iocb; - bufferlist bl; - struct iovec *iov; - bool done; - uint64_t off, len; ///< these are for debug only - uint64_t seq; ///< seq number to complete on aio completion, if non-zero - - aio_info(bufferlist& b, uint64_t o, uint64_t s) - : iov(NULL), done(false), off(o), 
len(b.length()), seq(s) { - bl.claim(b); - memset((void*)&iocb, 0, sizeof(iocb)); - } - ~aio_info() { - delete[] iov; - } - }; - Mutex aio_lock; - Cond aio_cond; - Cond write_finish_cond; - io_context_t aio_ctx; - list aio_queue; - int aio_num, aio_bytes; - /// End protected by aio_lock -#endif - - uint64_t last_committed_seq; - uint64_t journaled_since_start; - - /* - * full states cycle at the beginnging of each commit epoch, when commit_start() - * is called. - * FULL - we just filled up during this epoch. - * WAIT - we filled up last epoch; now we have to wait until everything during - * that epoch commits to the fs before we can start writing over it. - * NOTFULL - all good, journal away. - */ - enum { - FULL_NOTFULL = 0, - FULL_FULL = 1, - FULL_WAIT = 2, - } full_state; - - int fd; - - // in journal - deque > journalq; // track seq offsets, so we can trim later. - uint64_t writing_seq; - - - // throttle - Throttle throttle_ops, throttle_bytes; - - void put_throttle(uint64_t ops, uint64_t bytes); - - // write thread - Mutex write_lock; - bool write_stop; - bool aio_stop; - - Cond commit_cond; - - int _open(bool wr, bool create=false); - int _open_block_device(); - void _close(int fd) const; - void _check_disk_write_cache() const; - int _open_file(int64_t oldsize, blksize_t blksize, bool create); - int _dump(ostream& out, bool simple); - void print_header(const header_t &hdr) const; - int read_header(header_t *hdr) const; - bufferptr prepare_header(); - void start_writer(); - void stop_writer(); - void write_thread_entry(); - - void queue_completions_thru(uint64_t seq); - - int check_for_full(uint64_t seq, off64_t pos, off64_t size); - int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytee); - int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, - uint64_t& orig_ops, uint64_t& orig_bytes); - void do_write(bufferlist& bl); - - void write_finish_thread_entry(); - void check_aio_completion(); - void 
do_aio_write(bufferlist& bl); - int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq); - - - void align_bl(off64_t pos, bufferlist& bl); - int write_bl(off64_t& pos, bufferlist& bl); - - /// read len from journal starting at in_pos and wrapping up to len - void wrap_read_bl( - off64_t in_pos, ///< [in] start position - int64_t len, ///< [in] length to read - bufferlist* bl, ///< [out] result - off64_t *out_pos ///< [out] next position to read, will be wrapped - ) const; - - void do_discard(int64_t offset, int64_t end); - - class Writer : public Thread { - FileJournal *journal; - public: - Writer(FileJournal *fj) : journal(fj) {} - void *entry() { - journal->write_thread_entry(); - return 0; - } - } write_thread; - - class WriteFinisher : public Thread { - FileJournal *journal; - public: - WriteFinisher(FileJournal *fj) : journal(fj) {} - void *entry() { - journal->write_finish_thread_entry(); - return 0; - } - } write_finish_thread; - - off64_t get_top() const { - return ROUND_UP_TO(sizeof(header), block_size); - } - - public: - FileJournal(uuid_d fsid, Finisher *fin, Cond *sync_cond, const char *f, bool dio=false, bool ai=true, bool faio=false) : - Journal(fsid, fin, sync_cond), - finisher_lock("FileJournal::finisher_lock", false, true, false, g_ceph_context), - journaled_seq(0), - plug_journal_completions(false), - writeq_lock("FileJournal::writeq_lock", false, true, false, g_ceph_context), - completions_lock( - "FileJournal::completions_lock", false, true, false, g_ceph_context), - fn(f), - zero_buf(NULL), - max_size(0), block_size(0), - directio(dio), aio(ai), force_aio(faio), - must_write_header(false), - write_pos(0), read_pos(0), - discard(false), -#ifdef HAVE_LIBAIO - aio_lock("FileJournal::aio_lock"), - aio_ctx(0), - aio_num(0), aio_bytes(0), -#endif - last_committed_seq(0), - journaled_since_start(0), - full_state(FULL_NOTFULL), - fd(-1), - writing_seq(0), - throttle_ops(g_ceph_context, "journal_ops", g_conf->journal_queue_max_ops), - 
throttle_bytes(g_ceph_context, "journal_bytes", g_conf->journal_queue_max_bytes), - write_lock("FileJournal::write_lock", false, true, false, g_ceph_context), - write_stop(true), - aio_stop(true), - write_thread(this), - write_finish_thread(this) { - - if (aio && !directio) { - derr << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl; - aio = false; - } -#ifndef HAVE_LIBAIO - if (aio) { - derr << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl; - aio = false; - } -#endif - } - ~FileJournal() { - assert(fd == -1); - delete[] zero_buf; - } - - int check(); - int create(); - int open(uint64_t fs_op_seq); - void close(); - int peek_fsid(uuid_d& fsid); - - int dump(ostream& out); - int simple_dump(ostream& out); - int _fdump(Formatter &f, bool simple); - - void flush(); - - void throttle(); - - bool is_writeable() { - return read_pos == 0; - } - int make_writeable(); - - // writes - void commit_start(uint64_t seq); - void committed_thru(uint64_t seq); - bool should_commit_now() { - return full_state != FULL_NOTFULL && !write_stop; - } - - void write_header_sync(); - - void set_wait_on_full(bool b) { wait_on_full = b; } - - // reads - - /// Result code for read_entry - enum read_entry_result { - SUCCESS, - FAILURE, - MAYBE_CORRUPT - }; - - /** - * read_entry - * - * Reads next entry starting at pos. If the entry appears - * clean, *bl will contain the payload, *seq will contain - * the sequence number, and *out_pos will reflect the next - * read position. If the entry is invalid *ss will contain - * debug text, while *seq, *out_pos, and *bl will be unchanged. - * - * If the entry suggests a corrupt log, *ss will contain debug - * text, *out_pos will contain the next index to check. If - * we find an entry in this way that returns SUCCESS, the journal - * is most likely corrupt. 
- */ - read_entry_result do_read_entry( - off64_t pos, ///< [in] position to read - off64_t *next_pos, ///< [out] next position to read - bufferlist* bl, ///< [out] payload for successful read - uint64_t *seq, ///< [out] seq of successful read - ostream *ss, ///< [out] error output - entry_header_t *h = 0 ///< [out] header - ) const; ///< @return result code - - bool read_entry( - bufferlist &bl, - uint64_t &last_seq, - bool *corrupt - ); - - bool read_entry( - bufferlist &bl, - uint64_t &last_seq) { - return read_entry(bl, last_seq, 0); - } - - // Debug/Testing - void get_header( - uint64_t wanted_seq, - off64_t *_pos, - entry_header_t *h); - void corrupt( - int wfd, - off64_t corrupt_at); - void corrupt_payload( - int wfd, - uint64_t seq); - void corrupt_footer_magic( - int wfd, - uint64_t seq); - void corrupt_header_magic( - int wfd, - uint64_t seq); -}; - -WRITE_CLASS_ENCODER(FileJournal::header_t) - -#endif diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc deleted file mode 100644 index 0eca9d90eac1..000000000000 --- a/src/os/FileStore.cc +++ /dev/null @@ -1,5615 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * Copyright (c) 2015 Hewlett-Packard Development Company, L.P. - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ -#include "include/int_types.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__linux__) -#include -#endif - -#include -#include - -#include "include/compat.h" -#include "include/linux_fiemap.h" - -#include "common/xattr.h" -#include "chain_xattr.h" - -#if defined(DARWIN) || defined(__FreeBSD__) -#include -#include -#endif // DARWIN - - -#include -#include - -#include "FileStore.h" -#include "GenericFileStoreBackend.h" -#include "BtrfsFileStoreBackend.h" -#include "XfsFileStoreBackend.h" -#include "ZFSFileStoreBackend.h" -#include "common/BackTrace.h" -#include "include/types.h" -#include "FileJournal.h" - -#include "osd/osd_types.h" -#include "include/color.h" -#include "include/buffer.h" - -#include "common/Timer.h" -#include "common/debug.h" -#include "common/errno.h" -#include "common/run_cmd.h" -#include "common/safe_io.h" -#include "common/perf_counters.h" -#include "common/sync_filesystem.h" -#include "common/fd.h" -#include "HashIndex.h" -#include "DBObjectMap.h" -#include "kv/KeyValueDB.h" - -#include "common/ceph_crypto.h" -using ceph::crypto::SHA1; - -#include "include/assert.h" - -#include "common/config.h" -#include "common/blkdev.h" - -#ifdef WITH_LTTNG -#define TRACEPOINT_DEFINE -#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE -#include "tracing/objectstore.h" -#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE -#undef TRACEPOINT_DEFINE -#else -#define tracepoint(...) 
-#endif - -#define dout_subsys ceph_subsys_filestore -#undef dout_prefix -#define dout_prefix *_dout << "filestore(" << basedir << ") " - -#define COMMIT_SNAP_ITEM "snap_%llu" -#define CLUSTER_SNAP_ITEM "clustersnap_%s" - -#define REPLAY_GUARD_XATTR "user.cephos.seq" -#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq" - -// XATTR_SPILL_OUT_NAME as a xattr is used to maintain that indicates whether -// xattrs spill over into DBObjectMap, if XATTR_SPILL_OUT_NAME exists in file -// xattrs and the value is "no", it indicates no xattrs in DBObjectMap -#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out" -#define XATTR_NO_SPILL_OUT "0" -#define XATTR_SPILL_OUT "1" - -//Initial features in new superblock. -static CompatSet get_fs_initial_compat_set() { - CompatSet::FeatureSet ceph_osd_feature_compat; - CompatSet::FeatureSet ceph_osd_feature_ro_compat; - CompatSet::FeatureSet ceph_osd_feature_incompat; - return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, - ceph_osd_feature_incompat); -} - -//Features are added here that this FileStore supports. 
-static CompatSet get_fs_supported_compat_set() { - CompatSet compat = get_fs_initial_compat_set(); - //Any features here can be set in code, but not in initial superblock - compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); - return compat; -} - -int FileStore::get_block_device_fsid(const string& path, uuid_d *fsid) -{ - // make sure we don't try to use aio or direct_io (and get annoying - // error messages from failing to do so); performance implications - // should be irrelevant for this use - FileJournal j(*fsid, 0, 0, path.c_str(), false, false); - return j.peek_fsid(*fsid); -} - -void FileStore::FSPerfTracker::update_from_perfcounters( - PerfCounters &logger) -{ - os_commit_latency.consume_next( - logger.get_tavg_ms( - l_os_j_lat)); - os_apply_latency.consume_next( - logger.get_tavg_ms( - l_os_apply_lat)); -} - - -ostream& operator<<(ostream& out, const FileStore::OpSequencer& s) -{ - assert(&out); - return out << *s.parent; -} - -int FileStore::get_cdir(coll_t cid, char *s, int len) -{ - const string &cid_str(cid.to_str()); - return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_str.c_str()); -} - -int FileStore::get_index(coll_t cid, Index *index) -{ - int r = index_manager.get_index(cid, basedir, index); - assert(!m_filestore_fail_eio || r != -EIO); - return r; -} - -int FileStore::init_index(coll_t cid) -{ - char path[PATH_MAX]; - get_cdir(cid, path, sizeof(path)); - int r = index_manager.init_index(cid, path, target_version); - assert(!m_filestore_fail_eio || r != -EIO); - return r; -} - -int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path) -{ - IndexedPath path2; - if (!path) - path = &path2; - int r, exist; - assert(NULL != index.index); - r = (index.index)->lookup(oid, path, &exist); - if (r < 0) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - if (!exist) - return -ENOENT; - return 0; -} - -int FileStore::lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length) -{ - FDRef fd; - int 
r = lfn_open(cid, oid, false, &fd); - if (r < 0) - return r; - r = ::ftruncate(**fd, length); - if (r < 0) - r = -errno; - if (r >= 0 && m_filestore_sloppy_crc) { - int rc = backend->_crc_update_truncate(**fd, length); - assert(rc >= 0); - } - assert(!m_filestore_fail_eio || r != -EIO); - return r; -} - -int FileStore::lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf) -{ - IndexedPath path; - Index index; - int r = get_index(cid, &index); - if (r < 0) - return r; - - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - - r = lfn_find(oid, index, &path); - if (r < 0) - return r; - r = ::stat(path->path(), buf); - if (r < 0) - r = -errno; - return r; -} - -int FileStore::lfn_open(coll_t cid, - const ghobject_t& oid, - bool create, - FDRef *outfd, - Index *index) -{ - assert(outfd); - int r = 0; - bool need_lock = true; - int flags = O_RDWR; - - if (create) - flags |= O_CREAT; - - Index index2; - if (!index) { - index = &index2; - } - if (!((*index).index)) { - r = get_index(cid, index); - if (r < 0) { - dout(10) << __func__ << " could not get index r = " << r << dendl; - return r; - } - } else { - need_lock = false; - } - - int fd, exist; - assert(NULL != (*index).index); - if (need_lock) { - ((*index).index)->access_lock.get_write(); - } - if (!replaying) { - *outfd = fdcache.lookup(oid); - if (*outfd) { - if (need_lock) { - ((*index).index)->access_lock.put_write(); - } - return 0; - } - } - - - IndexedPath path2; - IndexedPath *path = &path2; - - r = (*index)->lookup(oid, path, &exist); - if (r < 0) { - derr << "could not find " << oid << " in index: " - << cpp_strerror(-r) << dendl; - goto fail; - } - - r = ::open((*path)->path(), flags, 0644); - if (r < 0) { - r = -errno; - dout(10) << "error opening file " << (*path)->path() << " with flags=" - << flags << ": " << cpp_strerror(-r) << dendl; - goto fail; - } - fd = r; - if (create && (!exist)) { - r = (*index)->created(oid, (*path)->path()); - if (r < 0) { - 
VOID_TEMP_FAILURE_RETRY(::close(fd)); - derr << "error creating " << oid << " (" << (*path)->path() - << ") in index: " << cpp_strerror(-r) << dendl; - goto fail; - } - r = chain_fsetxattr(fd, XATTR_SPILL_OUT_NAME, - XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT), true); - if (r < 0) { - VOID_TEMP_FAILURE_RETRY(::close(fd)); - derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path() - << "):" << cpp_strerror(-r) << dendl; - goto fail; - } - } - - if (!replaying) { - bool existed; - *outfd = fdcache.add(oid, fd, &existed); - if (existed) { - TEMP_FAILURE_RETRY(::close(fd)); - } - } else { - *outfd = FDRef(new FDCache::FD(fd)); - } - - if (need_lock) { - ((*index).index)->access_lock.put_write(); - } - - return 0; - - fail: - - if (need_lock) { - ((*index).index)->access_lock.put_write(); - } - - assert(!m_filestore_fail_eio || r != -EIO); - return r; -} - -void FileStore::lfn_close(FDRef fd) -{ -} - -int FileStore::lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid) -{ - Index index_new, index_old; - IndexedPath path_new, path_old; - int exist; - int r; - bool index_same = false; - if (c < newcid) { - r = get_index(newcid, &index_new); - if (r < 0) - return r; - r = get_index(c, &index_old); - if (r < 0) - return r; - } else if (c == newcid) { - r = get_index(c, &index_old); - if (r < 0) - return r; - index_new = index_old; - index_same = true; - } else { - r = get_index(c, &index_old); - if (r < 0) - return r; - r = get_index(newcid, &index_new); - if (r < 0) - return r; - } - - assert(NULL != index_old.index); - assert(NULL != index_new.index); - - if (!index_same) { - - RWLock::RLocker l1((index_old.index)->access_lock); - - r = index_old->lookup(o, &path_old, &exist); - if (r < 0) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - if (!exist) - return -ENOENT; - - RWLock::WLocker l2((index_new.index)->access_lock); - - r = index_new->lookup(newoid, &path_new, &exist); - if (r < 0) { - 
assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - if (exist) - return -EEXIST; - - dout(25) << "lfn_link path_old: " << path_old << dendl; - dout(25) << "lfn_link path_new: " << path_new << dendl; - r = ::link(path_old->path(), path_new->path()); - if (r < 0) - return -errno; - - r = index_new->created(newoid, path_new->path()); - if (r < 0) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - } else { - RWLock::WLocker l1((index_old.index)->access_lock); - - r = index_old->lookup(o, &path_old, &exist); - if (r < 0) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - if (!exist) - return -ENOENT; - - r = index_new->lookup(newoid, &path_new, &exist); - if (r < 0) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - if (exist) - return -EEXIST; - - dout(25) << "lfn_link path_old: " << path_old << dendl; - dout(25) << "lfn_link path_new: " << path_new << dendl; - r = ::link(path_old->path(), path_new->path()); - if (r < 0) - return -errno; - - // make sure old fd for unlinked/overwritten file is gone - fdcache.clear(newoid); - - r = index_new->created(newoid, path_new->path()); - if (r < 0) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - } - return 0; -} - -int FileStore::lfn_unlink(coll_t cid, const ghobject_t& o, - const SequencerPosition &spos, - bool force_clear_omap) -{ - Index index; - int r = get_index(cid, &index); - if (r < 0) { - dout(25) << __func__ << " get_index failed " << cpp_strerror(r) << dendl; - return r; - } - - assert(NULL != index.index); - RWLock::WLocker l((index.index)->access_lock); - - { - IndexedPath path; - int hardlink; - r = index->lookup(o, &path, &hardlink); - if (r < 0) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - - if (!force_clear_omap) { - if (hardlink == 0) { - wbthrottle.clear_object(o); // should be only non-cache ref - fdcache.clear(o); - return 0; - } else if (hardlink == 1) { - force_clear_omap = true; - } - } - if (force_clear_omap) { 
- dout(20) << __func__ << ": clearing omap on " << o - << " in cid " << cid << dendl; - r = object_map->clear(o, &spos); - if (r < 0 && r != -ENOENT) { - dout(25) << __func__ << " omap clear failed " << cpp_strerror(r) << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - if (g_conf->filestore_debug_inject_read_err) { - debug_obj_on_delete(o); - } - wbthrottle.clear_object(o); // should be only non-cache ref - fdcache.clear(o); - } else { - /* Ensure that replay of this op doesn't result in the object_map - * going away. - */ - if (!backend->can_checkpoint()) - object_map->sync(&o, &spos); - } - } - r = index->unlink(o); - if (r < 0) { - dout(25) << __func__ << " index unlink failed " << cpp_strerror(r) << dendl; - return r; - } - return 0; -} - -FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbits_t flags, const char *name, bool do_update) : - JournalingObjectStore(base), - internal_name(name), - basedir(base), journalpath(jdev), - generic_flags(flags), - blk_size(0), - fsid_fd(-1), op_fd(-1), - basedir_fd(-1), current_fd(-1), - backend(NULL), - index_manager(do_update), - lock("FileStore::lock"), - force_sync(false), - sync_entry_timeo_lock("sync_entry_timeo_lock"), - timer(g_ceph_context, sync_entry_timeo_lock), - stop(false), sync_thread(this), - fdcache(g_ceph_context), - wbthrottle(g_ceph_context), - next_osr_id(0), - throttle_ops(g_ceph_context, "filestore_ops",g_conf->filestore_queue_max_ops), - throttle_bytes(g_ceph_context, "filestore_bytes",g_conf->filestore_queue_max_bytes), - m_ondisk_finisher_num(g_conf->filestore_ondisk_finisher_threads), - m_apply_finisher_num(g_conf->filestore_apply_finisher_threads), - op_tp(g_ceph_context, "FileStore::op_tp", g_conf->filestore_op_threads, "filestore_op_threads"), - op_wq(this, g_conf->filestore_op_thread_timeout, - g_conf->filestore_op_thread_suicide_timeout, &op_tp), - logger(NULL), - read_error_lock("FileStore::read_error_lock"), - 
m_filestore_commit_timeout(g_conf->filestore_commit_timeout), - m_filestore_journal_parallel(g_conf->filestore_journal_parallel ), - m_filestore_journal_trailing(g_conf->filestore_journal_trailing), - m_filestore_journal_writeahead(g_conf->filestore_journal_writeahead), - m_filestore_fiemap_threshold(g_conf->filestore_fiemap_threshold), - m_filestore_max_sync_interval(g_conf->filestore_max_sync_interval), - m_filestore_min_sync_interval(g_conf->filestore_min_sync_interval), - m_filestore_fail_eio(g_conf->filestore_fail_eio), - m_filestore_fadvise(g_conf->filestore_fadvise), - do_update(do_update), - m_journal_dio(g_conf->journal_dio), - m_journal_aio(g_conf->journal_aio), - m_journal_force_aio(g_conf->journal_force_aio), - m_osd_rollback_to_cluster_snap(g_conf->osd_rollback_to_cluster_snap), - m_osd_use_stale_snap(g_conf->osd_use_stale_snap), - m_filestore_queue_max_ops(g_conf->filestore_queue_max_ops), - m_filestore_queue_max_bytes(g_conf->filestore_queue_max_bytes), - m_filestore_queue_committing_max_ops(g_conf->filestore_queue_committing_max_ops), - m_filestore_queue_committing_max_bytes(g_conf->filestore_queue_committing_max_bytes), - m_filestore_do_dump(false), - m_filestore_dump_fmt(true), - m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc), - m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size), - m_filestore_max_alloc_hint_size(g_conf->filestore_max_alloc_hint_size), - m_fs_type(0), - m_filestore_max_inline_xattr_size(0), - m_filestore_max_inline_xattrs(0) -{ - m_filestore_kill_at.set(g_conf->filestore_kill_at); - for (int i = 0; i < m_ondisk_finisher_num; ++i) { - ostringstream oss; - oss << "filestore-ondisk-" << i; - Finisher *f = new Finisher(g_ceph_context, oss.str()); - ondisk_finishers.push_back(f); - } - for (int i = 0; i < m_apply_finisher_num; ++i) { - ostringstream oss; - oss << "filestore-apply-" << i; - Finisher *f = new Finisher(g_ceph_context, oss.str()); - apply_finishers.push_back(f); - } - - ostringstream oss; - 
oss << basedir << "/current"; - current_fn = oss.str(); - - ostringstream sss; - sss << basedir << "/current/commit_op_seq"; - current_op_seq_fn = sss.str(); - - ostringstream omss; - omss << basedir << "/current/omap"; - omap_dir = omss.str(); - - // initialize logger - PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_first, l_os_last); - - plb.add_u64(l_os_jq_max_ops, "journal_queue_max_ops", "Max operations in journal queue"); - plb.add_u64(l_os_jq_ops, "journal_queue_ops", "Operations in journal queue"); - plb.add_u64_counter(l_os_j_ops, "journal_ops", "Total journal entries written"); - plb.add_u64(l_os_jq_max_bytes, "journal_queue_max_bytes", "Max data in journal queue"); - plb.add_u64(l_os_jq_bytes, "journal_queue_bytes", "Size of journal queue"); - plb.add_u64_counter(l_os_j_bytes, "journal_bytes", "Total operations size in journal"); - plb.add_time_avg(l_os_j_lat, "journal_latency", "Average journal queue completing latency"); - plb.add_u64_counter(l_os_j_wr, "journal_wr", "Journal write IOs"); - plb.add_u64_avg(l_os_j_wr_bytes, "journal_wr_bytes", "Journal data written"); - plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue"); - plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations in writing to FS queue"); - plb.add_u64_counter(l_os_ops, "ops", "Operations written to store"); - plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue"); - plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of writing to FS queue"); - plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store"); - plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency"); - plb.add_u64(l_os_committing, "committing", "Is currently committing"); - - plb.add_u64_counter(l_os_commit, "commitcycle", "Commit cycles"); - plb.add_time_avg(l_os_commit_len, "commitcycle_interval", "Average interval between commits"); - plb.add_time_avg(l_os_commit_lat, "commitcycle_latency", "Average latency of commit"); - 
plb.add_u64_counter(l_os_j_full, "journal_full", "Journal writes while full"); - plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency"); - - logger = plb.create_perf_counters(); - - g_ceph_context->get_perfcounters_collection()->add(logger); - g_ceph_context->_conf->add_observer(this); - - superblock.compat_features = get_fs_initial_compat_set(); -} - -FileStore::~FileStore() -{ - for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { - delete *it; - *it = NULL; - } - for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { - delete *it; - *it = NULL; - } - g_ceph_context->_conf->remove_observer(this); - g_ceph_context->get_perfcounters_collection()->remove(logger); - - if (journal) - journal->logger = NULL; - delete logger; - - if (m_filestore_do_dump) { - dump_stop(); - } -} - -static void get_attrname(const char *name, char *buf, int len) -{ - snprintf(buf, len, "user.ceph.%s", name); -} - -bool parse_attrname(char **name) -{ - if (strncmp(*name, "user.ceph.", 10) == 0) { - *name += 10; - return true; - } - return false; -} - -void FileStore::collect_metadata(map *pm) -{ - char partition_path[PATH_MAX]; - char dev_node[PATH_MAX]; - int rc = 0; - - (*pm)["filestore_backend"] = backend->get_name(); - ostringstream ss; - ss << "0x" << std::hex << m_fs_type << std::dec; - (*pm)["filestore_f_type"] = ss.str(); - - if (g_conf->filestore_collect_device_partition_information) { - rc = get_device_by_uuid(get_fsid(), "PARTUUID", partition_path, - dev_node); - } else { - rc = -EINVAL; - } - - switch (rc) { - case -EOPNOTSUPP: - case -EINVAL: - (*pm)["backend_filestore_partition_path"] = "unknown"; - (*pm)["backend_filestore_dev_node"] = "unknown"; - break; - case -ENODEV: - (*pm)["backend_filestore_partition_path"] = string(partition_path); - (*pm)["backend_filestore_dev_node"] = "unknown"; - break; - default: - (*pm)["backend_filestore_partition_path"] = 
string(partition_path); - (*pm)["backend_filestore_dev_node"] = string(dev_node); - } -} - -int FileStore::statfs(struct statfs *buf) -{ - if (::statfs(basedir.c_str(), buf) < 0) { - int r = -errno; - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - return 0; -} - - -void FileStore::new_journal() -{ - if (journalpath.length()) { - dout(10) << "open_journal at " << journalpath << dendl; - journal = new FileJournal(fsid, &finisher, &sync_cond, journalpath.c_str(), - m_journal_dio, m_journal_aio, m_journal_force_aio); - if (journal) - journal->logger = logger; - } - return; -} - -int FileStore::dump_journal(ostream& out) -{ - int r; - - if (!journalpath.length()) - return -EINVAL; - - FileJournal *journal = new FileJournal(fsid, &finisher, &sync_cond, journalpath.c_str(), m_journal_dio); - r = journal->dump(out); - delete journal; - return r; -} - -FileStoreBackend *FileStoreBackend::create(long f_type, FileStore *fs) -{ - switch (f_type) { -#if defined(__linux__) - case BTRFS_SUPER_MAGIC: - return new BtrfsFileStoreBackend(fs); -# ifdef HAVE_LIBXFS - case XFS_SUPER_MAGIC: - return new XfsFileStoreBackend(fs); -# endif -#endif -#ifdef HAVE_LIBZFS - case ZFS_SUPER_MAGIC: - return new ZFSFileStoreBackend(fs); -#endif - default: - return new GenericFileStoreBackend(fs); - } -} - -void FileStore::create_backend(long f_type) -{ - m_fs_type = f_type; - - assert(backend == NULL); - backend = FileStoreBackend::create(f_type, this); - - dout(0) << "backend " << backend->get_name() - << " (magic 0x" << std::hex << f_type << std::dec << ")" - << dendl; - - switch (f_type) { -#if defined(__linux__) - case BTRFS_SUPER_MAGIC: - wbthrottle.set_fs(WBThrottle::BTRFS); - break; - - case XFS_SUPER_MAGIC: - // wbthrottle is constructed with fs(WBThrottle::XFS) - break; -#endif - } - - set_xattr_limits_via_conf(); -} - -int FileStore::mkfs() -{ - int ret = 0; - char fsid_fn[PATH_MAX]; - uuid_d old_fsid; - - dout(1) << "mkfs in " << basedir << dendl; - basedir_fd = 
::open(basedir.c_str(), O_RDONLY); - if (basedir_fd < 0) { - ret = -errno; - derr << "mkfs failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl; - return ret; - } - - // open+lock fsid - snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str()); - fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT, 0644); - if (fsid_fd < 0) { - ret = -errno; - derr << "mkfs: failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl; - goto close_basedir_fd; - } - - if (lock_fsid() < 0) { - ret = -EBUSY; - goto close_fsid_fd; - } - - if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) { - if (fsid.is_zero()) { - fsid.generate_random(); - dout(1) << "mkfs generated fsid " << fsid << dendl; - } else { - dout(1) << "mkfs using provided fsid " << fsid << dendl; - } - - char fsid_str[40]; - fsid.print(fsid_str); - strcat(fsid_str, "\n"); - ret = ::ftruncate(fsid_fd, 0); - if (ret < 0) { - ret = -errno; - derr << "mkfs: failed to truncate fsid: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str)); - if (ret < 0) { - derr << "mkfs: failed to write fsid: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - if (::fsync(fsid_fd) < 0) { - ret = errno; - derr << "mkfs: close failed: can't write fsid: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - dout(10) << "mkfs fsid is " << fsid << dendl; - } else { - if (!fsid.is_zero() && fsid != old_fsid) { - derr << "mkfs on-disk fsid " << old_fsid << " != provided " << fsid << dendl; - ret = -EINVAL; - goto close_fsid_fd; - } - fsid = old_fsid; - dout(1) << "mkfs fsid is already set to " << fsid << dendl; - } - - // version stamp - ret = write_version_stamp(); - if (ret < 0) { - derr << "mkfs: write_version_stamp() failed: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - // superblock - superblock.omap_backend = g_conf->filestore_omap_backend; - ret = write_superblock(); - if (ret < 0) { - derr << "mkfs: 
write_superblock() failed: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - struct statfs basefs; - ret = ::fstatfs(basedir_fd, &basefs); - if (ret < 0) { - ret = -errno; - derr << "mkfs cannot fstatfs basedir " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - create_backend(basefs.f_type); - - ret = backend->create_current(); - if (ret < 0) { - derr << "mkfs: failed to create current/ " << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - // write initial op_seq - { - uint64_t initial_seq = 0; - int fd = read_op_seq(&initial_seq); - if (fd < 0) { - derr << "mkfs: failed to create " << current_op_seq_fn << ": " - << cpp_strerror(fd) << dendl; - goto close_fsid_fd; - } - if (initial_seq == 0) { - int err = write_op_seq(fd, 1); - if (err < 0) { - VOID_TEMP_FAILURE_RETRY(::close(fd)); - derr << "mkfs: failed to write to " << current_op_seq_fn << ": " - << cpp_strerror(err) << dendl; - goto close_fsid_fd; - } - - if (backend->can_checkpoint()) { - // create snap_1 too - current_fd = ::open(current_fn.c_str(), O_RDONLY); - assert(current_fd >= 0); - char s[NAME_MAX]; - snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull); - ret = backend->create_checkpoint(s, NULL); - VOID_TEMP_FAILURE_RETRY(::close(current_fd)); - if (ret < 0 && ret != -EEXIST) { - VOID_TEMP_FAILURE_RETRY(::close(fd)); - derr << "mkfs: failed to create snap_1: " << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - } - } - VOID_TEMP_FAILURE_RETRY(::close(fd)); - } - ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir); - if (ret < 0) { - derr << "mkfs failed to create " << g_conf->filestore_omap_backend << dendl; - ret = -1; - goto close_fsid_fd; - } - dout(1) << g_conf->filestore_omap_backend << " db exists/created" << dendl; - - // journal? 
- ret = mkjournal(); - if (ret) - goto close_fsid_fd; - - ret = write_meta("type", "filestore"); - if (ret) - goto close_fsid_fd; - - dout(1) << "mkfs done in " << basedir << dendl; - ret = 0; - - close_fsid_fd: - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; - close_basedir_fd: - VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); - delete backend; - backend = NULL; - return ret; -} - -int FileStore::mkjournal() -{ - // read fsid - int ret; - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); - int fd = ::open(fn, O_RDONLY, 0644); - if (fd < 0) { - int err = errno; - derr << "FileStore::mkjournal: open error: " << cpp_strerror(err) << dendl; - return -err; - } - ret = read_fsid(fd, &fsid); - if (ret < 0) { - derr << "FileStore::mkjournal: read error: " << cpp_strerror(ret) << dendl; - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return ret; - } - VOID_TEMP_FAILURE_RETRY(::close(fd)); - - ret = 0; - - new_journal(); - if (journal) { - ret = journal->check(); - if (ret < 0) { - ret = journal->create(); - if (ret) - derr << "mkjournal error creating journal on " << journalpath - << ": " << cpp_strerror(ret) << dendl; - else - dout(0) << "mkjournal created journal on " << journalpath << dendl; - } - delete journal; - journal = 0; - } - return ret; -} - -int FileStore::read_fsid(int fd, uuid_d *uuid) -{ - char fsid_str[40]; - int ret = safe_read(fd, fsid_str, sizeof(fsid_str)); - if (ret < 0) - return ret; - if (ret == 8) { - // old 64-bit fsid... mirror it. 
- *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str; - *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str; - return 0; - } - - if (ret > 36) - fsid_str[36] = 0; - if (!uuid->parse(fsid_str)) - return -EINVAL; - return 0; -} - -int FileStore::lock_fsid() -{ - struct flock l; - memset(&l, 0, sizeof(l)); - l.l_type = F_WRLCK; - l.l_whence = SEEK_SET; - l.l_start = 0; - l.l_len = 0; - int r = ::fcntl(fsid_fd, F_SETLK, &l); - if (r < 0) { - int err = errno; - dout(0) << "lock_fsid failed to lock " << basedir << "/fsid, is another ceph-osd still running? " - << cpp_strerror(err) << dendl; - return -err; - } - return 0; -} - -bool FileStore::test_mount_in_use() -{ - dout(5) << "test_mount basedir " << basedir << " journal " << journalpath << dendl; - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); - - // verify fs isn't in use - - fsid_fd = ::open(fn, O_RDWR, 0644); - if (fsid_fd < 0) - return 0; // no fsid, ok. - bool inuse = lock_fsid() < 0; - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; - return inuse; -} - -int FileStore::_detect_fs() -{ - struct statfs st; - int r = ::fstatfs(basedir_fd, &st); - if (r < 0) - return -errno; - - blk_size = st.f_bsize; - - create_backend(st.f_type); - - r = backend->detect_features(); - if (r < 0) { - derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl; - return r; - } - - // test xattrs - char fn[PATH_MAX]; - int x = rand(); - int y = x+1; - snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str()); - int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC, 0700); - if (tmpfd < 0) { - int ret = -errno; - derr << "_detect_fs unable to create " << fn << ": " << cpp_strerror(ret) << dendl; - return ret; - } - - int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x)); - if (ret >= 0) - ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y)); - if ((ret < 0) || (x != y)) { - derr << "Extended attributes don't appear to work. 
"; - if (ret) - *_dout << "Got error " + cpp_strerror(ret) + ". "; - *_dout << "If you are using ext3 or ext4, be sure to mount the underlying " - << "file system with the 'user_xattr' option." << dendl; - ::unlink(fn); - VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); - return -ENOTSUP; - } - - char buf[1000]; - memset(buf, 0, sizeof(buf)); // shut up valgrind - chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf)); - chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf)); - chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf)); - chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf)); - ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf)); - if (ret == -ENOSPC) { - dout(0) << "limited size xattrs" << dendl; - } - chain_fremovexattr(tmpfd, "user.test"); - chain_fremovexattr(tmpfd, "user.test2"); - chain_fremovexattr(tmpfd, "user.test3"); - chain_fremovexattr(tmpfd, "user.test4"); - chain_fremovexattr(tmpfd, "user.test5"); - - ::unlink(fn); - VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); - - return 0; -} - -int FileStore::_sanity_check_fs() -{ - // sanity check(s) - - if (((int)m_filestore_journal_writeahead + - (int)m_filestore_journal_parallel + - (int)m_filestore_journal_trailing) > 1) { - dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl; - cerr << TEXT_RED - << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n" - << " is enabled in ceph.conf. You must choose a single journal mode." - << TEXT_NORMAL << std::endl; - return -EINVAL; - } - - if (!backend->can_checkpoint()) { - if (!journal || !m_filestore_journal_writeahead) { - dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl; - cerr << TEXT_RED - << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n" - << " For non-btrfs volumes, a writeahead journal is required to\n" - << " maintain on-disk consistency in the event of a crash. 
Your conf\n" - << " should include something like:\n" - << " osd journal = /path/to/journal_device_or_file\n" - << " filestore journal writeahead = true\n" - << TEXT_NORMAL; - } - } - - if (!journal) { - dout(0) << "mount WARNING: no journal" << dendl; - cerr << TEXT_YELLOW - << " ** WARNING: No osd journal is configured: write latency may be high.\n" - << " If you will not be using an osd journal, write latency may be\n" - << " relatively high. It can be reduced somewhat by lowering\n" - << " filestore_max_sync_interval, but lower values mean lower write\n" - << " throughput, especially with spinning disks.\n" - << TEXT_NORMAL; - } - - return 0; -} - -int FileStore::write_superblock() -{ - bufferlist bl; - ::encode(superblock, bl); - return safe_write_file(basedir.c_str(), "superblock", - bl.c_str(), bl.length()); -} - -int FileStore::read_superblock() -{ - bufferptr bp(PATH_MAX); - int ret = safe_read_file(basedir.c_str(), "superblock", - bp.c_str(), bp.length()); - if (ret < 0) { - if (ret == -ENOENT) { - // If the file doesn't exist write initial CompatSet - return write_superblock(); - } - return ret; - } - - bufferlist bl; - bl.push_back(bp); - bufferlist::iterator i = bl.begin(); - ::decode(superblock, i); - return 0; -} - -int FileStore::update_version_stamp() -{ - return write_version_stamp(); -} - -int FileStore::version_stamp_is_valid(uint32_t *version) -{ - bufferptr bp(PATH_MAX); - int ret = safe_read_file(basedir.c_str(), "store_version", - bp.c_str(), bp.length()); - if (ret < 0) { - if (ret == -ENOENT) - return 0; - return ret; - } - bufferlist bl; - bl.push_back(bp); - bufferlist::iterator i = bl.begin(); - ::decode(*version, i); - dout(10) << __func__ << " was " << *version << " vs target " - << target_version << dendl; - if (*version == target_version) - return 1; - else - return 0; -} - -int FileStore::write_version_stamp() -{ - dout(1) << __func__ << " " << target_version << dendl; - bufferlist bl; - ::encode(target_version, bl); - - return 
safe_write_file(basedir.c_str(), "store_version", - bl.c_str(), bl.length()); -} - -int FileStore::upgrade() -{ - dout(1) << "upgrade" << dendl; - uint32_t version; - int r = version_stamp_is_valid(&version); - if (r < 0) - return r; - if (r == 1) - return 0; - - if (version < 3) { - derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl; - return -EINVAL; - } - - // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to - // open up DBObjectMap with the do_upgrade flag, which we already did. - update_version_stamp(); - return 0; -} - -int FileStore::read_op_seq(uint64_t *seq) -{ - int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR, 0644); - if (op_fd < 0) { - int r = -errno; - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - char s[40]; - memset(s, 0, sizeof(s)); - int ret = safe_read(op_fd, s, sizeof(s) - 1); - if (ret < 0) { - derr << "error reading " << current_op_seq_fn << ": " << cpp_strerror(ret) << dendl; - VOID_TEMP_FAILURE_RETRY(::close(op_fd)); - assert(!m_filestore_fail_eio || ret != -EIO); - return ret; - } - *seq = atoll(s); - return op_fd; -} - -int FileStore::write_op_seq(int fd, uint64_t seq) -{ - char s[30]; - snprintf(s, sizeof(s), "%" PRId64 "\n", seq); - int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0)); - if (ret < 0) { - ret = -errno; - assert(!m_filestore_fail_eio || ret != -EIO); - } - return ret; -} - -int FileStore::mount() -{ - int ret; - char buf[PATH_MAX]; - uint64_t initial_op_seq; - set cluster_snaps; - CompatSet supported_compat_set = get_fs_supported_compat_set(); - - dout(5) << "basedir " << basedir << " journal " << journalpath << dendl; - - // make sure global base dir exists - if (::access(basedir.c_str(), R_OK | W_OK)) { - ret = -errno; - derr << "FileStore::mount: unable to access basedir '" << basedir << "': " - << cpp_strerror(ret) << dendl; - goto done; - } - - // get fsid - snprintf(buf, 
sizeof(buf), "%s/fsid", basedir.c_str()); - fsid_fd = ::open(buf, O_RDWR, 0644); - if (fsid_fd < 0) { - ret = -errno; - derr << "FileStore::mount: error opening '" << buf << "': " - << cpp_strerror(ret) << dendl; - goto done; - } - - ret = read_fsid(fsid_fd, &fsid); - if (ret < 0) { - derr << "FileStore::mount: error reading fsid_fd: " << cpp_strerror(ret) - << dendl; - goto close_fsid_fd; - } - - if (lock_fsid() < 0) { - derr << "FileStore::mount: lock_fsid failed" << dendl; - ret = -EBUSY; - goto close_fsid_fd; - } - - dout(10) << "mount fsid is " << fsid << dendl; - - - uint32_t version_stamp; - ret = version_stamp_is_valid(&version_stamp); - if (ret < 0) { - derr << "FileStore::mount : error in version_stamp_is_valid: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } else if (ret == 0) { - if (do_update || (int)version_stamp < g_conf->filestore_update_to) { - derr << "FileStore::mount : stale version stamp detected: " - << version_stamp - << ". Proceeding, do_update " - << "is set, performing disk format upgrade." - << dendl; - do_update = true; - } else { - ret = -EINVAL; - derr << "FileStore::mount : stale version stamp " << version_stamp - << ". 
Please run the FileStore update script before starting the " - << "OSD, or set filestore_update_to to " << target_version - << " (currently " << g_conf->filestore_update_to << ")" - << dendl; - goto close_fsid_fd; - } - } - - ret = read_superblock(); - if (ret < 0) { - ret = -EINVAL; - goto close_fsid_fd; - } - - // Check if this FileStore supports all the necessary features to mount - if (supported_compat_set.compare(superblock.compat_features) == -1) { - derr << "FileStore::mount : Incompatible features set " - << superblock.compat_features << dendl; - ret = -EINVAL; - goto close_fsid_fd; - } - - // open some dir handles - basedir_fd = ::open(basedir.c_str(), O_RDONLY); - if (basedir_fd < 0) { - ret = -errno; - derr << "FileStore::mount: failed to open " << basedir << ": " - << cpp_strerror(ret) << dendl; - basedir_fd = -1; - goto close_fsid_fd; - } - - // test for btrfs, xattrs, etc. - ret = _detect_fs(); - if (ret < 0) { - derr << "FileStore::mount : error in _detect_fs: " - << cpp_strerror(ret) << dendl; - goto close_basedir_fd; - } - - { - list ls; - ret = backend->list_checkpoints(ls); - if (ret < 0) { - derr << "FileStore::mount : error in _list_snaps: "<< cpp_strerror(ret) << dendl; - goto close_basedir_fd; - } - - long long unsigned c, prev = 0; - char clustersnap[NAME_MAX]; - for (list::iterator it = ls.begin(); it != ls.end(); ++it) { - if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) { - assert(c > prev); - prev = c; - snaps.push_back(c); - } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1) - cluster_snaps.insert(*it); - } - } - - if (m_osd_rollback_to_cluster_snap.length() && - cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) { - derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl; - ret = -ENOENT; - goto close_basedir_fd; - } - - char nosnapfn[200]; - snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str()); - - if (backend->can_checkpoint()) { - if 
(snaps.empty()) { - dout(0) << "mount WARNING: no consistent snaps found, store may be in inconsistent state" << dendl; - } else { - char s[NAME_MAX]; - uint64_t curr_seq = 0; - - if (m_osd_rollback_to_cluster_snap.length()) { - derr << TEXT_RED - << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **" - << TEXT_NORMAL - << dendl; - assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap)); - snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str()); - } else { - { - int fd = read_op_seq(&curr_seq); - if (fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(fd)); - } - } - if (curr_seq) - dout(10) << " current/ seq was " << curr_seq << dendl; - else - dout(10) << " current/ missing entirely (unusual, but okay)" << dendl; - - uint64_t cp = snaps.back(); - dout(10) << " most recent snap from " << snaps << " is " << cp << dendl; - - // if current/ is marked as non-snapshotted, refuse to roll - // back (without clear direction) to avoid throwing out new - // data. - struct stat st; - if (::stat(nosnapfn, &st) == 0) { - if (!m_osd_use_stale_snap) { - derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl; - derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl; - derr << "config option for --osd-use-stale-snap startup argument." << dendl; - ret = -ENOTSUP; - goto close_basedir_fd; - } - derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq - << ", newest snap is " << cp << dendl; - cerr << TEXT_YELLOW - << " ** WARNING: forcing the use of stale snapshot data **" - << TEXT_NORMAL << std::endl; - } - - dout(10) << "mount rolling back to consistent snap " << cp << dendl; - snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp); - } - - // drop current? 
- ret = backend->rollback_to(s); - if (ret) { - derr << "FileStore::mount: error rolling back to " << s << ": " - << cpp_strerror(ret) << dendl; - goto close_basedir_fd; - } - } - } - initial_op_seq = 0; - - current_fd = ::open(current_fn.c_str(), O_RDONLY); - if (current_fd < 0) { - ret = -errno; - derr << "FileStore::mount: error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl; - goto close_basedir_fd; - } - - assert(current_fd >= 0); - - op_fd = read_op_seq(&initial_op_seq); - if (op_fd < 0) { - derr << "FileStore::mount: read_op_seq failed" << dendl; - goto close_current_fd; - } - - dout(5) << "mount op_seq is " << initial_op_seq << dendl; - if (initial_op_seq == 0) { - derr << "mount initial op seq is 0; something is wrong" << dendl; - ret = -EINVAL; - goto close_current_fd; - } - - if (!backend->can_checkpoint()) { - // mark current/ as non-snapshotted so that we don't rollback away - // from it. - int r = ::creat(nosnapfn, 0644); - if (r < 0) { - derr << "FileStore::mount: failed to create current/nosnap" << dendl; - goto close_current_fd; - } - VOID_TEMP_FAILURE_RETRY(::close(r)); - } else { - // clear nosnap marker, if present. 
- ::unlink(nosnapfn); - } - - if (!(generic_flags & SKIP_MOUNT_OMAP)) { - KeyValueDB * omap_store = KeyValueDB::create(g_ceph_context, - superblock.omap_backend, - omap_dir); - if (omap_store == NULL) - { - derr << "Error creating " << superblock.omap_backend << dendl; - ret = -1; - goto close_current_fd; - } - - if (superblock.omap_backend == "rocksdb") - omap_store->init(g_conf->filestore_rocksdb_options); - else - omap_store->init(); - - stringstream err; - if (omap_store->create_and_open(err)) { - delete omap_store; - derr << "Error initializing " << superblock.omap_backend - << " : " << err.str() << dendl; - ret = -1; - goto close_current_fd; - } - - DBObjectMap *dbomap = new DBObjectMap(omap_store); - ret = dbomap->init(do_update); - if (ret < 0) { - delete dbomap; - derr << "Error initializing DBObjectMap: " << ret << dendl; - goto close_current_fd; - } - stringstream err2; - - if (g_conf->filestore_debug_omap_check && !dbomap->check(err2)) { - derr << err2.str() << dendl; - delete dbomap; - ret = -EINVAL; - goto close_current_fd; - } - object_map.reset(dbomap); - } - - // journal - new_journal(); - - // select journal mode? 
- if (journal) { - if (!m_filestore_journal_writeahead && - !m_filestore_journal_parallel && - !m_filestore_journal_trailing) { - if (!backend->can_checkpoint()) { - m_filestore_journal_writeahead = true; - dout(0) << "mount: enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl; - } else { - m_filestore_journal_parallel = true; - dout(0) << "mount: enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl; - } - } else { - if (m_filestore_journal_writeahead) - dout(0) << "mount: WRITEAHEAD journal mode explicitly enabled in conf" << dendl; - if (m_filestore_journal_parallel) - dout(0) << "mount: PARALLEL journal mode explicitly enabled in conf" << dendl; - if (m_filestore_journal_trailing) - dout(0) << "mount: TRAILING journal mode explicitly enabled in conf" << dendl; - } - if (m_filestore_journal_writeahead) - journal->set_wait_on_full(true); - } else { - dout(0) << "mount: no journal" << dendl; - } - - ret = _sanity_check_fs(); - if (ret) { - derr << "FileStore::mount: _sanity_check_fs failed with error " - << ret << dendl; - goto close_current_fd; - } - - // Cleanup possibly invalid collections - { - vector collections; - ret = list_collections(collections, true); - if (ret < 0) { - derr << "Error " << ret << " while listing collections" << dendl; - goto close_current_fd; - } - for (vector::iterator i = collections.begin(); - i != collections.end(); - ++i) { - Index index; - ret = get_index(*i, &index); - if (ret < 0) { - derr << "Unable to mount index " << *i - << " with error: " << ret << dendl; - goto close_current_fd; - } - assert(NULL != index.index); - RWLock::WLocker l((index.index)->access_lock); - - index->cleanup(); - } - } - - wbthrottle.start(); - sync_thread.create(); - - if (!(generic_flags & SKIP_JOURNAL_REPLAY)) { - ret = journal_replay(initial_op_seq); - if (ret < 0) { - derr << "mount failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl; - if (ret == -ENOTTY) { - derr << "maybe journal is not 
pointing to a block device and its size " - << "wasn't configured?" << dendl; - } - - // stop sync thread - lock.Lock(); - stop = true; - sync_cond.Signal(); - lock.Unlock(); - sync_thread.join(); - - wbthrottle.stop(); - - goto close_current_fd; - } - } - - { - stringstream err2; - if (g_conf->filestore_debug_omap_check && !object_map->check(err2)) { - derr << err2.str() << dendl; - ret = -EINVAL; - goto close_current_fd; - } - } - - init_temp_collections(); - - journal_start(); - - op_tp.start(); - for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { - (*it)->start(); - } - for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { - (*it)->start(); - } - - timer.init(); - - // upgrade? - if (g_conf->filestore_update_to >= (int)get_target_version()) { - int err = upgrade(); - if (err < 0) { - derr << "error converting store" << dendl; - umount(); - return err; - } - } - - // all okay. - return 0; - -close_current_fd: - VOID_TEMP_FAILURE_RETRY(::close(current_fd)); - current_fd = -1; -close_basedir_fd: - VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); - basedir_fd = -1; -close_fsid_fd: - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; -done: - assert(!m_filestore_fail_eio || ret != -EIO); - return ret; -} - -void FileStore::init_temp_collections() -{ - dout(10) << __func__ << dendl; - vector ls; - int r = list_collections(ls, true); - assert(r >= 0); - - dout(20) << " ls " << ls << dendl; - - SequencerPosition spos; - - set temps; - for (vector::iterator p = ls.begin(); p != ls.end(); ++p) - if (p->is_temp()) - temps.insert(*p); - dout(20) << " temps " << temps << dendl; - - for (vector::iterator p = ls.begin(); p != ls.end(); ++p) { - if (p->is_temp()) - continue; - if (p->is_meta()) - continue; - coll_t temp = p->get_temp(); - if (temps.count(temp)) { - temps.erase(temp); - } else { - dout(10) << __func__ << " creating " << temp << dendl; - r = _create_collection(temp, spos); - assert(r 
== 0); - } - } - - for (set::iterator p = temps.begin(); p != temps.end(); ++p) { - dout(10) << __func__ << " removing stray " << *p << dendl; - r = _collection_remove_recursive(*p, spos); - assert(r == 0); - } -} - -int FileStore::umount() -{ - dout(5) << "umount " << basedir << dendl; - - flush(); - sync(); - do_force_sync(); - - lock.Lock(); - stop = true; - sync_cond.Signal(); - lock.Unlock(); - sync_thread.join(); - wbthrottle.stop(); - op_tp.stop(); - - journal_stop(); - if (!(generic_flags & SKIP_JOURNAL_REPLAY)) - journal_write_close(); - - for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { - (*it)->stop(); - } - for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { - (*it)->stop(); - } - - if (fsid_fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; - } - if (op_fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(op_fd)); - op_fd = -1; - } - if (current_fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(current_fd)); - current_fd = -1; - } - if (basedir_fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); - basedir_fd = -1; - } - - force_sync = false; - - delete backend; - backend = NULL; - - object_map.reset(); - - { - Mutex::Locker l(sync_entry_timeo_lock); - timer.shutdown(); - } - - // nothing - return 0; -} - - - - -/// ----------------------------- - -FileStore::Op *FileStore::build_op(list& tls, - Context *onreadable, - Context *onreadable_sync, - TrackedOpRef osd_op) -{ - uint64_t bytes = 0, ops = 0; - for (list::iterator p = tls.begin(); - p != tls.end(); - ++p) { - bytes += (*p)->get_num_bytes(); - ops += (*p)->get_num_ops(); - } - - Op *o = new Op; - o->start = ceph_clock_now(g_ceph_context); - o->tls.swap(tls); - o->onreadable = onreadable; - o->onreadable_sync = onreadable_sync; - o->ops = ops; - o->bytes = bytes; - o->osd_op = osd_op; - return o; -} - - - -void FileStore::queue_op(OpSequencer *osr, Op *o) -{ - // queue op on sequencer, then queue sequencer 
for the threadpool, - // so that regardless of which order the threads pick up the - // sequencer, the op order will be preserved. - - osr->queue(o); - - logger->inc(l_os_ops); - logger->inc(l_os_bytes, o->bytes); - - dout(5) << "queue_op " << o << " seq " << o->op - << " " << *osr - << " " << o->bytes << " bytes" - << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)" - << dendl; - op_wq.queue(osr); -} - -void FileStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle) -{ - // Do not call while holding the journal lock! - uint64_t max_ops = m_filestore_queue_max_ops; - uint64_t max_bytes = m_filestore_queue_max_bytes; - - if (backend->can_checkpoint() && is_committing()) { - max_ops += m_filestore_queue_committing_max_ops; - max_bytes += m_filestore_queue_committing_max_bytes; - } - - logger->set(l_os_oq_max_ops, max_ops); - logger->set(l_os_oq_max_bytes, max_bytes); - - if (handle) - handle->suspend_tp_timeout(); - if (throttle_ops.should_wait(1) || - (throttle_bytes.get_current() // let single large ops through! - && throttle_bytes.should_wait(o->bytes))) { - dout(2) << "waiting " << throttle_ops.get_current() + 1 << " > " << max_ops << " ops || " - << throttle_bytes.get_current() + o->bytes << " > " << max_bytes << dendl; - } - throttle_ops.get(); - throttle_bytes.get(o->bytes); - if (handle) - handle->reset_tp_timeout(); - - logger->set(l_os_oq_ops, throttle_ops.get_current()); - logger->set(l_os_oq_bytes, throttle_bytes.get_current()); -} - -void FileStore::op_queue_release_throttle(Op *o) -{ - throttle_ops.put(); - throttle_bytes.put(o->bytes); - logger->set(l_os_oq_ops, throttle_ops.get_current()); - logger->set(l_os_oq_bytes, throttle_bytes.get_current()); -} - -void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle) -{ - wbthrottle.throttle(); - // inject a stall? 
- if (g_conf->filestore_inject_stall) { - int orig = g_conf->filestore_inject_stall; - dout(5) << "_do_op filestore_inject_stall " << orig << ", sleeping" << dendl; - for (int n = 0; n < g_conf->filestore_inject_stall; n++) - sleep(1); - g_conf->set_val("filestore_inject_stall", "0"); - dout(5) << "_do_op done stalling" << dendl; - } - - osr->apply_lock.Lock(); - Op *o = osr->peek_queue(); - apply_manager.op_apply_start(o->op); - dout(5) << "_do_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " start" << dendl; - int r = _do_transactions(o->tls, o->op, &handle); - apply_manager.op_apply_finish(o->op); - dout(10) << "_do_op " << o << " seq " << o->op << " r = " << r - << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl; -} - -void FileStore::_finish_op(OpSequencer *osr) -{ - list to_queue; - Op *o = osr->dequeue(&to_queue); - - utime_t lat = ceph_clock_now(g_ceph_context); - lat -= o->start; - - dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl; - osr->apply_lock.Unlock(); // locked in _do_op - - // called with tp lock held - op_queue_release_throttle(o); - - logger->tinc(l_os_apply_lat, lat); - - if (o->onreadable_sync) { - o->onreadable_sync->complete(0); - } - if (o->onreadable) { - apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable); - } - if (!to_queue.empty()) { - apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue); - } - delete o; -} - - -struct C_JournaledAhead : public Context { - FileStore *fs; - FileStore::OpSequencer *osr; - FileStore::Op *o; - Context *ondisk; - - C_JournaledAhead(FileStore *f, FileStore::OpSequencer *os, FileStore::Op *o, Context *ondisk): - fs(f), osr(os), o(o), ondisk(ondisk) { } - void finish(int r) { - fs->_journaled_ahead(osr, o, ondisk); - } -}; - -int FileStore::queue_transactions(Sequencer *posr, list &tls, - TrackedOpRef osd_op, - ThreadPool::TPHandle *handle) -{ - Context *onreadable; - 
Context *ondisk; - Context *onreadable_sync; - ObjectStore::Transaction::collect_contexts( - tls, &onreadable, &ondisk, &onreadable_sync); - if (g_conf->filestore_blackhole) { - dout(0) << "queue_transactions filestore_blackhole = TRUE, dropping transaction" << dendl; - delete ondisk; - delete onreadable; - delete onreadable_sync; - return 0; - } - - utime_t start = ceph_clock_now(g_ceph_context); - // set up the sequencer - OpSequencer *osr; - assert(posr); - if (posr->p) { - osr = static_cast(posr->p.get()); - dout(5) << "queue_transactions existing " << osr << " " << *osr << dendl; - } else { - osr = new OpSequencer(next_osr_id.inc()); - osr->set_cct(g_ceph_context); - osr->parent = posr; - posr->p = osr; - dout(5) << "queue_transactions new " << osr << " " << *osr << dendl; - } - - // used to include osr information in tracepoints during transaction apply - for (list::iterator i = tls.begin(); i != tls.end(); ++i) { - (*i)->set_osr(osr); - } - - if (journal && journal->is_writeable() && !m_filestore_journal_trailing) { - Op *o = build_op(tls, onreadable, onreadable_sync, osd_op); - op_queue_reserve_throttle(o, handle); - journal->throttle(); - //prepare and encode transactions data out of lock - bufferlist tbl; - int orig_len = journal->prepare_entry(o->tls, &tbl); - uint64_t op_num = submit_manager.op_submit_start(); - o->op = op_num; - - if (m_filestore_do_dump) - dump_transactions(o->tls, o->op, osr); - - if (m_filestore_journal_parallel) { - dout(5) << "queue_transactions (parallel) " << o->op << " " << o->tls << dendl; - - _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op); - - // queue inside submit_manager op submission lock - queue_op(osr, o); - } else if (m_filestore_journal_writeahead) { - dout(5) << "queue_transactions (writeahead) " << o->op << " " << o->tls << dendl; - - osr->queue_journal(o->op); - - _op_journal_transactions(tbl, orig_len, o->op, - new C_JournaledAhead(this, osr, o, ondisk), - osd_op); - } else { - assert(0); - } - 
submit_manager.op_submit_finish(op_num); - utime_t end = ceph_clock_now(g_ceph_context); - logger->tinc(l_os_queue_lat, end - start); - return 0; - } - - if (!journal) { - Op *o = build_op(tls, onreadable, onreadable_sync, osd_op); - dout(5) << __func__ << " (no journal) " << o << " " << tls << dendl; - - op_queue_reserve_throttle(o, handle); - - uint64_t op_num = submit_manager.op_submit_start(); - o->op = op_num; - - if (m_filestore_do_dump) - dump_transactions(o->tls, o->op, osr); - - queue_op(osr, o); - - if (ondisk) - apply_manager.add_waiter(op_num, ondisk); - submit_manager.op_submit_finish(op_num); - utime_t end = ceph_clock_now(g_ceph_context); - logger->tinc(l_os_queue_lat, end - start); - return 0; - } - - assert(journal); - //prepare and encode transactions data out of lock - bufferlist tbl; - int orig_len = -1; - if (journal->is_writeable()) { - orig_len = journal->prepare_entry(tls, &tbl); - } - uint64_t op = submit_manager.op_submit_start(); - dout(5) << "queue_transactions (trailing journal) " << op << " " << tls << dendl; - - if (m_filestore_do_dump) - dump_transactions(tls, op, osr); - - apply_manager.op_apply_start(op); - int r = do_transactions(tls, op); - - if (r >= 0) { - _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op); - } else { - delete ondisk; - } - - // start on_readable finisher after we queue journal item, as on_readable callback - // is allowed to delete the Transaction - if (onreadable_sync) { - onreadable_sync->complete(r); - } - apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r); - - submit_manager.op_submit_finish(op); - apply_manager.op_apply_finish(op); - - utime_t end = ceph_clock_now(g_ceph_context); - logger->tinc(l_os_queue_lat, end - start); - return r; -} - -void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk) -{ - dout(5) << "_journaled_ahead " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl; - - // this should queue in order because the journal does 
it's completions in order. - queue_op(osr, o); - - list to_queue; - osr->dequeue_journal(&to_queue); - - // do ondisk completions async, to prevent any onreadable_sync completions - // getting blocked behind an ondisk completion. - if (ondisk) { - dout(10) << " queueing ondisk " << ondisk << dendl; - ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk); - } - if (!to_queue.empty()) { - ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue); - } -} - -int FileStore::_do_transactions( - list &tls, - uint64_t op_seq, - ThreadPool::TPHandle *handle) -{ - int r = 0; - int trans_num = 0; - - for (list::iterator p = tls.begin(); - p != tls.end(); - ++p, trans_num++) { - r = _do_transaction(**p, op_seq, trans_num, handle); - if (r < 0) - break; - if (handle) - handle->reset_tp_timeout(); - } - - return r; -} - -void FileStore::_set_global_replay_guard(coll_t cid, - const SequencerPosition &spos) -{ - if (backend->can_checkpoint()) - return; - - // sync all previous operations on this sequencer - int ret = object_map->sync(); - if (ret < 0) { - derr << __func__ << " : omap sync error " << cpp_strerror(ret) << dendl; - assert(0 == "_set_global_replay_guard failed"); - } - ret = sync_filesystem(basedir_fd); - if (ret < 0) { - derr << __func__ << " :sync_filesytem error " << cpp_strerror(ret) << dendl; - assert(0 == "_set_global_replay_guard failed"); - } - - char fn[PATH_MAX]; - get_cdir(cid, fn, sizeof(fn)); - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - int err = errno; - derr << __func__ << ": " << cid << " error " << cpp_strerror(err) << dendl; - assert(0 == "_set_global_replay_guard failed"); - } - - _inject_failure(); - - // then record that we did it - bufferlist v; - ::encode(spos, v); - int r = chain_fsetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length(), true); - if (r < 0) { - derr << __func__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR - << " got " << cpp_strerror(r) << dendl; - assert(0 == "fsetxattr failed"); - } - - // and 
make sure our xattr is durable. - ::fsync(fd); - - _inject_failure(); - - VOID_TEMP_FAILURE_RETRY(::close(fd)); - dout(10) << __func__ << ": " << spos << " done" << dendl; -} - -int FileStore::_check_global_replay_guard(coll_t cid, - const SequencerPosition& spos) -{ - char fn[PATH_MAX]; - get_cdir(cid, fn, sizeof(fn)); - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - dout(10) << __func__ << ": " << cid << " dne" << dendl; - return 1; // if collection does not exist, there is no guard, and we can replay. - } - - char buf[100]; - int r = chain_fgetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, buf, sizeof(buf)); - if (r < 0) { - dout(20) << __func__ << " no xattr" << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return 1; // no xattr - } - bufferlist bl; - bl.append(buf, r); - - SequencerPosition opos; - bufferlist::iterator p = bl.begin(); - ::decode(opos, p); - - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return spos >= opos ? 1 : -1; -} - - -void FileStore::_set_replay_guard(coll_t cid, - const SequencerPosition &spos, - bool in_progress=false) -{ - char fn[PATH_MAX]; - get_cdir(cid, fn, sizeof(fn)); - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - int err = errno; - derr << "_set_replay_guard " << cid << " error " << cpp_strerror(err) << dendl; - assert(0 == "_set_replay_guard failed"); - } - _set_replay_guard(fd, spos, 0, in_progress); - VOID_TEMP_FAILURE_RETRY(::close(fd)); -} - - -void FileStore::_set_replay_guard(int fd, - const SequencerPosition& spos, - const ghobject_t *hoid, - bool in_progress) -{ - if (backend->can_checkpoint()) - return; - - dout(10) << "_set_replay_guard " << spos << (in_progress ? " START" : "") << dendl; - - _inject_failure(); - - // first make sure the previous operation commits - ::fsync(fd); - - // sync object_map too. even if this object has a header or keys, - // it have had them in the past and then removed them, so always - // sync. 
- object_map->sync(hoid, &spos); - - _inject_failure(); - - // then record that we did it - bufferlist v(40); - ::encode(spos, v); - ::encode(in_progress, v); - int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true); - if (r < 0) { - derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl; - assert(0 == "fsetxattr failed"); - } - - // and make sure our xattr is durable. - ::fsync(fd); - - _inject_failure(); - - dout(10) << "_set_replay_guard " << spos << " done" << dendl; -} - -void FileStore::_close_replay_guard(coll_t cid, - const SequencerPosition &spos) -{ - char fn[PATH_MAX]; - get_cdir(cid, fn, sizeof(fn)); - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - int err = errno; - derr << "_close_replay_guard " << cid << " error " << cpp_strerror(err) << dendl; - assert(0 == "_close_replay_guard failed"); - } - _close_replay_guard(fd, spos); - VOID_TEMP_FAILURE_RETRY(::close(fd)); -} - -void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos) -{ - if (backend->can_checkpoint()) - return; - - dout(10) << "_close_replay_guard " << spos << dendl; - - _inject_failure(); - - // then record that we are done with this operation - bufferlist v(40); - ::encode(spos, v); - bool in_progress = false; - ::encode(in_progress, v); - int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true); - if (r < 0) { - derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl; - assert(0 == "fsetxattr failed"); - } - - // and make sure our xattr is durable. 
- ::fsync(fd); - - _inject_failure(); - - dout(10) << "_close_replay_guard " << spos << " done" << dendl; -} - -int FileStore::_check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& spos) -{ - if (!replaying || backend->can_checkpoint()) - return 1; - - int r = _check_global_replay_guard(cid, spos); - if (r < 0) - return r; - - FDRef fd; - r = lfn_open(cid, oid, false, &fd); - if (r < 0) { - dout(10) << "_check_replay_guard " << cid << " " << oid << " dne" << dendl; - return 1; // if file does not exist, there is no guard, and we can replay. - } - int ret = _check_replay_guard(**fd, spos); - lfn_close(fd); - return ret; -} - -int FileStore::_check_replay_guard(coll_t cid, const SequencerPosition& spos) -{ - if (!replaying || backend->can_checkpoint()) - return 1; - - char fn[PATH_MAX]; - get_cdir(cid, fn, sizeof(fn)); - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - dout(10) << "_check_replay_guard " << cid << " dne" << dendl; - return 1; // if collection does not exist, there is no guard, and we can replay. 
- } - int ret = _check_replay_guard(fd, spos); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return ret; -} - -int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos) -{ - if (!replaying || backend->can_checkpoint()) - return 1; - - char buf[100]; - int r = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, buf, sizeof(buf)); - if (r < 0) { - dout(20) << "_check_replay_guard no xattr" << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return 1; // no xattr - } - bufferlist bl; - bl.append(buf, r); - - SequencerPosition opos; - bufferlist::iterator p = bl.begin(); - ::decode(opos, p); - bool in_progress = false; - if (!p.end()) // older journals don't have this - ::decode(in_progress, p); - if (opos > spos) { - dout(10) << "_check_replay_guard object has " << opos << " > current pos " << spos - << ", now or in future, SKIPPING REPLAY" << dendl; - return -1; - } else if (opos == spos) { - if (in_progress) { - dout(10) << "_check_replay_guard object has " << opos << " == current pos " << spos - << ", in_progress=true, CONDITIONAL REPLAY" << dendl; - return 0; - } else { - dout(10) << "_check_replay_guard object has " << opos << " == current pos " << spos - << ", in_progress=false, SKIPPING REPLAY" << dendl; - return -1; - } - } else { - dout(10) << "_check_replay_guard object has " << opos << " < current pos " << spos - << ", in past, will replay" << dendl; - return 1; - } -} - -unsigned FileStore::_do_transaction( - Transaction& t, uint64_t op_seq, int trans_num, - ThreadPool::TPHandle *handle) -{ - dout(10) << "_do_transaction on " << &t << dendl; - -#ifdef WITH_LTTNG - const char *osr_name = t.get_osr() ? 
static_cast(t.get_osr())->get_name().c_str() : ""; -#endif - - Transaction::iterator i = t.begin(); - - SequencerPosition spos(op_seq, trans_num, 0); - while (i.have_op()) { - if (handle) - handle->reset_tp_timeout(); - - Transaction::Op *op = i.decode_op(); - int r = 0; - - _inject_failure(); - - switch (op->op) { - case Transaction::OP_NOP: - break; - case Transaction::OP_TOUCH: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - tracepoint(objectstore, touch_enter, osr_name); - if (_check_replay_guard(cid, oid, spos) > 0) - r = _touch(cid, oid); - tracepoint(objectstore, touch_exit, r); - } - break; - - case Transaction::OP_WRITE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - uint64_t off = op->off; - uint64_t len = op->len; - uint32_t fadvise_flags = i.get_fadvise_flags(); - bufferlist bl; - i.decode_bl(bl); - tracepoint(objectstore, write_enter, osr_name, off, len); - if (_check_replay_guard(cid, oid, spos) > 0) - r = _write(cid, oid, off, len, bl, fadvise_flags); - tracepoint(objectstore, write_exit, r); - } - break; - - case Transaction::OP_ZERO: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - uint64_t off = op->off; - uint64_t len = op->len; - tracepoint(objectstore, zero_enter, osr_name, off, len); - if (_check_replay_guard(cid, oid, spos) > 0) - r = _zero(cid, oid, off, len); - tracepoint(objectstore, zero_exit, r); - } - break; - - case Transaction::OP_TRIMCACHE: - { - // deprecated, no-op - } - break; - - case Transaction::OP_TRUNCATE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - uint64_t off = op->off; - tracepoint(objectstore, truncate_enter, osr_name, off); - if (_check_replay_guard(cid, oid, spos) > 0) - r = _truncate(cid, oid, off); - 
tracepoint(objectstore, truncate_exit, r); - } - break; - - case Transaction::OP_REMOVE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - tracepoint(objectstore, remove_enter, osr_name); - if (_check_replay_guard(cid, oid, spos) > 0) - r = _remove(cid, oid, spos); - tracepoint(objectstore, remove_exit, r); - } - break; - - case Transaction::OP_SETATTR: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - string name = i.decode_string(); - bufferlist bl; - i.decode_bl(bl); - tracepoint(objectstore, setattr_enter, osr_name); - if (_check_replay_guard(cid, oid, spos) > 0) { - map to_set; - to_set[name] = bufferptr(bl.c_str(), bl.length()); - r = _setattrs(cid, oid, to_set, spos); - if (r == -ENOSPC) - dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid - << " name " << name << " size " << bl.length() << dendl; - } - tracepoint(objectstore, setattr_exit, r); - } - break; - - case Transaction::OP_SETATTRS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - map aset; - i.decode_attrset(aset); - tracepoint(objectstore, setattrs_enter, osr_name); - if (_check_replay_guard(cid, oid, spos) > 0) - r = _setattrs(cid, oid, aset, spos); - tracepoint(objectstore, setattrs_exit, r); - if (r == -ENOSPC) - dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl; - } - break; - - case Transaction::OP_RMATTR: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - string name = i.decode_string(); - tracepoint(objectstore, rmattr_enter, osr_name); - if (_check_replay_guard(cid, oid, spos) > 0) - r = _rmattr(cid, oid, name.c_str(), spos); - tracepoint(objectstore, rmattr_exit, r); - } - break; - - case Transaction::OP_RMATTRS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = 
i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - tracepoint(objectstore, rmattrs_enter, osr_name); - if (_check_replay_guard(cid, oid, spos) > 0) - r = _rmattrs(cid, oid, spos); - tracepoint(objectstore, rmattrs_exit, r); - } - break; - - case Transaction::OP_CLONE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - ghobject_t noid = i.get_oid(op->dest_oid); - tracepoint(objectstore, clone_enter, osr_name); - r = _clone(cid, oid, noid, spos); - tracepoint(objectstore, clone_exit, r); - } - break; - - case Transaction::OP_CLONERANGE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - ghobject_t noid = i.get_oid(op->dest_oid); - _kludge_temp_object_collection(cid, noid); - uint64_t off = op->off; - uint64_t len = op->len; - tracepoint(objectstore, clone_range_enter, osr_name, len); - r = _clone_range(cid, oid, noid, off, len, off, spos); - tracepoint(objectstore, clone_range_exit, r); - } - break; - - case Transaction::OP_CLONERANGE2: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - ghobject_t noid = i.get_oid(op->dest_oid); - _kludge_temp_object_collection(cid, noid); - uint64_t srcoff = op->off; - uint64_t len = op->len; - uint64_t dstoff = op->dest_off; - tracepoint(objectstore, clone_range2_enter, osr_name, len); - r = _clone_range(cid, oid, noid, srcoff, len, dstoff, spos); - tracepoint(objectstore, clone_range2_exit, r); - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid = i.get_cid(op->cid); - tracepoint(objectstore, mkcoll_enter, osr_name); - if (_check_replay_guard(cid, spos) > 0) - r = _create_collection(cid, spos); - tracepoint(objectstore, mkcoll_exit, r); - } - break; - - case Transaction::OP_COLL_HINT: - { - coll_t cid = i.get_cid(op->cid); - uint32_t type = op->hint_type; - bufferlist hint; - 
i.decode_bl(hint); - bufferlist::iterator hiter = hint.begin(); - if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { - uint32_t pg_num; - uint64_t num_objs; - ::decode(pg_num, hiter); - ::decode(num_objs, hiter); - if (_check_replay_guard(cid, spos) > 0) { - r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos); - } - } else { - // Ignore the hint - dout(10) << "Unrecognized collection hint type: " << type << dendl; - } - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid = i.get_cid(op->cid); - tracepoint(objectstore, rmcoll_enter, osr_name); - if (_check_replay_guard(cid, spos) > 0) - r = _destroy_collection(cid); - tracepoint(objectstore, rmcoll_exit, r); - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t ocid = i.get_cid(op->cid); - coll_t ncid = i.get_cid(op->dest_cid); - ghobject_t oid = i.get_oid(op->oid); - - assert(oid.hobj.pool >= -1); - - // always followed by OP_COLL_REMOVE - Transaction::Op *op2 = i.decode_op(); - coll_t ocid2 = i.get_cid(op2->cid); - ghobject_t oid2 = i.get_oid(op2->oid); - assert(op2->op == Transaction::OP_COLL_REMOVE); - assert(ocid2 == ocid); - assert(oid2 == oid); - - tracepoint(objectstore, coll_add_enter); - r = _collection_add(ncid, ocid, oid, spos); - tracepoint(objectstore, coll_add_exit, r); - spos.op++; - if (r < 0) - break; - tracepoint(objectstore, coll_remove_enter, osr_name); - if (_check_replay_guard(ocid, oid, spos) > 0) - r = _remove(ocid, oid, spos); - tracepoint(objectstore, coll_remove_exit, r); - } - break; - - case Transaction::OP_COLL_MOVE: - { - // WARNING: this is deprecated and buggy; only here to replay old journals. 
- coll_t ocid = i.get_cid(op->cid); - coll_t ncid = i.get_cid(op->dest_cid); - ghobject_t oid = i.get_oid(op->oid); - tracepoint(objectstore, coll_move_enter); - r = _collection_add(ocid, ncid, oid, spos); - if (r == 0 && - (_check_replay_guard(ocid, oid, spos) > 0)) - r = _remove(ocid, oid, spos); - tracepoint(objectstore, coll_move_exit, r); - } - break; - - case Transaction::OP_COLL_MOVE_RENAME: - { - coll_t oldcid = i.get_cid(op->cid); - ghobject_t oldoid = i.get_oid(op->oid); - coll_t newcid = i.get_cid(op->dest_cid); - ghobject_t newoid = i.get_oid(op->dest_oid); - _kludge_temp_object_collection(oldcid, oldoid); - _kludge_temp_object_collection(newcid, newoid); - tracepoint(objectstore, coll_move_rename_enter); - r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos); - tracepoint(objectstore, coll_move_rename_exit, r); - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid = i.get_cid(op->cid); - string name = i.decode_string(); - bufferlist bl; - i.decode_bl(bl); - tracepoint(objectstore, coll_setattr_enter, osr_name); - if (_check_replay_guard(cid, spos) > 0) - r = _collection_setattr(cid, name.c_str(), bl.c_str(), bl.length()); - tracepoint(objectstore, coll_setattr_exit, r); - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid = i.get_cid(op->cid); - string name = i.decode_string(); - tracepoint(objectstore, coll_rmattr_enter, osr_name); - if (_check_replay_guard(cid, spos) > 0) - r = _collection_rmattr(cid, name.c_str()); - tracepoint(objectstore, coll_rmattr_exit, r); - } - break; - - case Transaction::OP_STARTSYNC: - tracepoint(objectstore, startsync_enter, osr_name); - _start_sync(); - tracepoint(objectstore, startsync_exit); - break; - - case Transaction::OP_COLL_RENAME: - { - r = -EOPNOTSUPP; - } - break; - - case Transaction::OP_OMAP_CLEAR: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - tracepoint(objectstore, omap_clear_enter, 
osr_name); - r = _omap_clear(cid, oid, spos); - tracepoint(objectstore, omap_clear_exit, r); - } - break; - case Transaction::OP_OMAP_SETKEYS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - map aset; - i.decode_attrset(aset); - tracepoint(objectstore, omap_setkeys_enter, osr_name); - r = _omap_setkeys(cid, oid, aset, spos); - tracepoint(objectstore, omap_setkeys_exit, r); - } - break; - case Transaction::OP_OMAP_RMKEYS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - set keys; - i.decode_keyset(keys); - tracepoint(objectstore, omap_rmkeys_enter, osr_name); - r = _omap_rmkeys(cid, oid, keys, spos); - tracepoint(objectstore, omap_rmkeys_exit, r); - } - break; - case Transaction::OP_OMAP_RMKEYRANGE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - string first, last; - first = i.decode_string(); - last = i.decode_string(); - tracepoint(objectstore, omap_rmkeyrange_enter, osr_name); - r = _omap_rmkeyrange(cid, oid, first, last, spos); - tracepoint(objectstore, omap_rmkeyrange_exit, r); - } - break; - case Transaction::OP_OMAP_SETHEADER: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - bufferlist bl; - i.decode_bl(bl); - tracepoint(objectstore, omap_setheader_enter, osr_name); - r = _omap_setheader(cid, oid, bl, spos); - tracepoint(objectstore, omap_setheader_exit, r); - } - break; - case Transaction::OP_SPLIT_COLLECTION: - { - assert(0 == "not legacy journal; upgrade to firefly first"); - } - break; - case Transaction::OP_SPLIT_COLLECTION2: - { - coll_t cid = i.get_cid(op->cid); - uint32_t bits = op->split_bits; - uint32_t rem = op->split_rem; - coll_t dest = i.get_cid(op->dest_cid); - tracepoint(objectstore, split_coll2_enter, osr_name); - r = _split_collection(cid, 
bits, rem, dest, spos); - tracepoint(objectstore, split_coll2_exit, r); - } - break; - - case Transaction::OP_SETALLOCHINT: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - _kludge_temp_object_collection(cid, oid); - uint64_t expected_object_size = op->expected_object_size; - uint64_t expected_write_size = op->expected_write_size; - tracepoint(objectstore, setallochint_enter, osr_name); - if (_check_replay_guard(cid, oid, spos) > 0) - r = _set_alloc_hint(cid, oid, expected_object_size, - expected_write_size); - tracepoint(objectstore, setallochint_exit, r); - } - break; - - default: - derr << "bad op " << op->op << dendl; - assert(0); - } - - if (r < 0) { - bool ok = false; - - if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || - op->op == Transaction::OP_CLONE || - op->op == Transaction::OP_CLONERANGE2 || - op->op == Transaction::OP_COLL_ADD)) - // -ENOENT is normally okay - // ...including on a replayed OP_RMCOLL with checkpoint mode - ok = true; - if (r == -ENODATA) - ok = true; - - if (op->op == Transaction::OP_SETALLOCHINT) - // Either EOPNOTSUPP or EINVAL most probably. EINVAL in most - // cases means invalid hint size (e.g. too big, not a multiple - // of block size, etc) or, at least on xfs, an attempt to set - // or change it when the file is not empty. However, - // OP_SETALLOCHINT is advisory, so ignore all errors. 
- ok = true; - - if (replaying && !backend->can_checkpoint()) { - if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) { - dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; - ok = true; - } - if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) { - dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; - ok = true; - } - if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) { - dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; - ok = true; - } - if (r == -ERANGE) { - dout(10) << "tolerating ERANGE on replay" << dendl; - ok = true; - } - if (r == -ENOENT) { - dout(10) << "tolerating ENOENT on replay" << dendl; - ok = true; - } - } - - if (!ok) { - const char *msg = "unexpected error code"; - - if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || - op->op == Transaction::OP_CLONE || - op->op == Transaction::OP_CLONERANGE2)) - msg = "ENOENT on clone suggests osd bug"; - - if (r == -ENOSPC) - // For now, if we hit _any_ ENOSPC, crash, before we do any damage - // by partially applying transactions. 
- msg = "ENOSPC handling not implemented"; - - if (r == -ENOTEMPTY) { - msg = "ENOTEMPTY suggests garbage data in osd data dir"; - } - - dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op - << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl; - dout(0) << msg << dendl; - dout(0) << " transaction dump:\n"; - JSONFormatter f(true); - f.open_object_section("transaction"); - t.dump(&f); - f.close_section(); - f.flush(*_dout); - *_dout << dendl; - - if (r == -EMFILE) { - dump_open_fds(g_ceph_context); - } - - assert(0 == "unexpected error"); - } - } - - spos.op++; - } - - _inject_failure(); - - return 0; // FIXME count errors -} - - /*********************************************/ - - - -// -------------------- -// objects - -bool FileStore::exists(coll_t cid, const ghobject_t& oid) -{ - tracepoint(objectstore, exists_enter, cid.c_str()); - _kludge_temp_object_collection(cid, oid); - struct stat st; - bool retval = stat(cid, oid, &st) == 0; - tracepoint(objectstore, exists_exit, retval); - return retval; -} - -int FileStore::stat( - coll_t cid, const ghobject_t& oid, struct stat *st, bool allow_eio) -{ - tracepoint(objectstore, stat_enter, cid.c_str()); - _kludge_temp_object_collection(cid, oid); - int r = lfn_stat(cid, oid, st); - assert(allow_eio || !m_filestore_fail_eio || r != -EIO); - if (r < 0) { - dout(10) << "stat " << cid << "/" << oid - << " = " << r << dendl; - } else { - dout(10) << "stat " << cid << "/" << oid - << " = " << r - << " (size " << st->st_size << ")" << dendl; - } - if (g_conf->filestore_debug_inject_read_err && - debug_mdata_eio(oid)) { - return -EIO; - } else { - tracepoint(objectstore, stat_exit, r); - return r; - } -} - -int FileStore::read( - coll_t cid, - const ghobject_t& oid, - uint64_t offset, - size_t len, - bufferlist& bl, - uint32_t op_flags, - bool allow_eio) -{ - int got; - tracepoint(objectstore, read_enter, cid.c_str(), offset, len); - _kludge_temp_object_collection(cid, oid); - - 
dout(15) << "read " << cid << "/" << oid << " " << offset << "~" << len << dendl; - - FDRef fd; - int r = lfn_open(cid, oid, false, &fd); - if (r < 0) { - dout(10) << "FileStore::read(" << cid << "/" << oid << ") open error: " - << cpp_strerror(r) << dendl; - return r; - } - - if (len == 0) { - struct stat st; - memset(&st, 0, sizeof(struct stat)); - int r = ::fstat(**fd, &st); - assert(r == 0); - len = st.st_size; - } - -#ifdef HAVE_POSIX_FADVISE - if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM) - posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM); - if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) - posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL); -#endif - - bufferptr bptr(len); // prealloc space for entire read - got = safe_pread(**fd, bptr.c_str(), len, offset); - if (got < 0) { - dout(10) << "FileStore::read(" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl; - lfn_close(fd); - assert(allow_eio || !m_filestore_fail_eio || got != -EIO); - return got; - } - bptr.set_length(got); // properly size the buffer - bl.push_back(bptr); // put it in the target bufferlist - -#ifdef HAVE_POSIX_FADVISE - if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) - posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED); - if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)) - posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL); -#endif - - if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) { - ostringstream ss; - int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss); - if (errors > 0) { - dout(0) << "FileStore::read " << cid << "/" << oid << " " << offset << "~" - << got << " ... 
BAD CRC:\n" << ss.str() << dendl; - assert(0 == "bad crc on read"); - } - } - - lfn_close(fd); - - dout(10) << "FileStore::read " << cid << "/" << oid << " " << offset << "~" - << got << "/" << len << dendl; - if (g_conf->filestore_debug_inject_read_err && - debug_data_eio(oid)) { - return -EIO; - } else { - tracepoint(objectstore, read_exit, got); - return got; - } -} - -int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len, - map *m) -{ - struct fiemap *fiemap = NULL; - uint64_t i; - struct fiemap_extent *extent = NULL; - int r = 0; - - r = backend->do_fiemap(fd, offset, len, &fiemap); - if (r < 0) - return r; - - if (fiemap->fm_mapped_extents == 0) { - free(fiemap); - return r; - } - - extent = &fiemap->fm_extents[0]; - - /* start where we were asked to start */ - if (extent->fe_logical < offset) { - extent->fe_length -= offset - extent->fe_logical; - extent->fe_logical = offset; - } - - i = 0; - - while (i < fiemap->fm_mapped_extents) { - struct fiemap_extent *next = extent + 1; - - dout(10) << "FileStore::fiemap() fm_mapped_extents=" << fiemap->fm_mapped_extents - << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl; - - /* try to merge extents */ - while ((i < fiemap->fm_mapped_extents - 1) && - (extent->fe_logical + extent->fe_length == next->fe_logical)) { - next->fe_length += extent->fe_length; - next->fe_logical = extent->fe_logical; - extent = next; - next = extent + 1; - i++; - } - - if (extent->fe_logical + extent->fe_length > offset + len) - extent->fe_length = offset + len - extent->fe_logical; - (*m)[extent->fe_logical] = extent->fe_length; - i++; - extent++; - } - free(fiemap); - - return r; -} - -int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len, - map *m) -{ -#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) - off_t hole_pos, data_pos; - int r = 0; - - // If lseek fails with errno setting to be ENXIO, this means the current - // file offset is beyond the end of the 
file. - off_t start = offset; - while(start < (off_t)(offset + len)) { - data_pos = lseek(fd, start, SEEK_DATA); - if (data_pos < 0) { - if (errno == ENXIO) - break; - else { - r = -errno; - dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl; - return r; - } - } else if (data_pos > (off_t)(offset + len)) { - break; - } - - hole_pos = lseek(fd, data_pos, SEEK_HOLE); - if (hole_pos < 0) { - if (errno == ENXIO) { - break; - } else { - r = -errno; - dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl; - return r; - } - } - - if (hole_pos >= (off_t)(offset + len)) { - (*m)[data_pos] = offset + len - data_pos; - break; - } - (*m)[data_pos] = hole_pos - data_pos; - start = hole_pos; - } - - return r; -#else - (*m)[offset] = len; - return 0; -#endif -} - -int FileStore::fiemap(coll_t cid, const ghobject_t& oid, - uint64_t offset, size_t len, - bufferlist& bl) -{ - tracepoint(objectstore, fiemap_enter, cid.c_str(), offset, len); - _kludge_temp_object_collection(cid, oid); - - if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) || - len <= (size_t)m_filestore_fiemap_threshold) { - map m; - m[offset] = len; - ::encode(m, bl); - return 0; - } - - dout(15) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << dendl; - - map exomap; - FDRef fd; - - int r = lfn_open(cid, oid, false, &fd); - if (r < 0) { - dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl; - goto done; - } - - if (backend->has_seek_data_hole()) { - dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl; - r = _do_seek_hole_data(**fd, offset, len, &exomap); - } else if (backend->has_fiemap()) { - dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl; - r = _do_fiemap(**fd, offset, len, &exomap); - } - -done: - if (r >= 0) { - lfn_close(fd); - ::encode(exomap, bl); - } - - dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << " = " << r 
<< " num_extents=" << exomap.size() << " " << exomap << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - tracepoint(objectstore, fiemap_exit, r); - return r; -} - - -int FileStore::_remove(coll_t cid, const ghobject_t& oid, - const SequencerPosition &spos) -{ - dout(15) << "remove " << cid << "/" << oid << dendl; - int r = lfn_unlink(cid, oid, spos); - dout(10) << "remove " << cid << "/" << oid << " = " << r << dendl; - return r; -} - -int FileStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size) -{ - dout(15) << "truncate " << cid << "/" << oid << " size " << size << dendl; - int r = lfn_truncate(cid, oid, size); - dout(10) << "truncate " << cid << "/" << oid << " size " << size << " = " << r << dendl; - return r; -} - - -int FileStore::_touch(coll_t cid, const ghobject_t& oid) -{ - dout(15) << "touch " << cid << "/" << oid << dendl; - - FDRef fd; - int r = lfn_open(cid, oid, true, &fd); - if (r < 0) { - return r; - } else { - lfn_close(fd); - } - dout(10) << "touch " << cid << "/" << oid << " = " << r << dendl; - return r; -} - -int FileStore::_write(coll_t cid, const ghobject_t& oid, - uint64_t offset, size_t len, - const bufferlist& bl, uint32_t fadvise_flags) -{ - dout(15) << "write " << cid << "/" << oid << " " << offset << "~" << len << dendl; - int r; - - int64_t actual; - - FDRef fd; - r = lfn_open(cid, oid, true, &fd); - if (r < 0) { - dout(0) << "write couldn't open " << cid << "/" - << oid << ": " - << cpp_strerror(r) << dendl; - goto out; - } - - // seek - actual = ::lseek64(**fd, offset, SEEK_SET); - if (actual < 0) { - r = -errno; - dout(0) << "write lseek64 to " << offset << " failed: " << cpp_strerror(r) << dendl; - lfn_close(fd); - goto out; - } - if (actual != (int64_t)offset) { - dout(0) << "write lseek64 to " << offset << " gave bad offset " << actual << dendl; - r = -EIO; - lfn_close(fd); - goto out; - } - - // write - r = bl.write_fd(**fd); - if (r == 0) - r = bl.length(); - - if (r >= 0 && m_filestore_sloppy_crc) { - int 
rc = backend->_crc_update_write(**fd, offset, len, bl); - assert(rc >= 0); - } - - // flush? - if (!replaying && - g_conf->filestore_wbthrottle_enable) - wbthrottle.queue_wb(fd, oid, offset, len, - fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); - lfn_close(fd); - - out: - dout(10) << "write " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl; - return r; -} - -int FileStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len) -{ - dout(15) << "zero " << cid << "/" << oid << " " << offset << "~" << len << dendl; - int ret = 0; - -#ifdef CEPH_HAVE_FALLOCATE -# if !defined(DARWIN) && !defined(__FreeBSD__) - // first try to punch a hole. - FDRef fd; - ret = lfn_open(cid, oid, false, &fd); - if (ret < 0) { - goto out; - } - - // first try fallocate - ret = fallocate(**fd, FALLOC_FL_PUNCH_HOLE, offset, len); - if (ret < 0) - ret = -errno; - lfn_close(fd); - - if (ret >= 0 && m_filestore_sloppy_crc) { - int rc = backend->_crc_update_zero(**fd, offset, len); - assert(rc >= 0); - } - - if (ret == 0) - goto out; // yay! - if (ret != -EOPNOTSUPP) - goto out; // some other error -# endif -#endif - - // lame, kernel is old and doesn't support it. - // write zeros.. yuck! 
- dout(20) << "zero FALLOC_FL_PUNCH_HOLE not supported, falling back to writing zeros" << dendl; - { - bufferptr bp(len); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - ret = _write(cid, oid, offset, len, bl); - } - - out: - dout(20) << "zero " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl; - return ret; -} - -int FileStore::_clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, - const SequencerPosition& spos) -{ - dout(15) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl; - - if (_check_replay_guard(cid, newoid, spos) < 0) - return 0; - - int r; - FDRef o, n; - { - Index index; - r = lfn_open(cid, oldoid, false, &o, &index); - if (r < 0) { - goto out2; - } - assert(NULL != (index.index)); - RWLock::WLocker l((index.index)->access_lock); - - r = lfn_open(cid, newoid, true, &n, &index); - if (r < 0) { - goto out; - } - r = ::ftruncate(**n, 0); - if (r < 0) { - goto out3; - } - struct stat st; - ::fstat(**o, &st); - r = _do_clone_range(**o, **n, 0, st.st_size, 0); - if (r < 0) { - r = -errno; - goto out3; - } - - dout(20) << "objectmap clone" << dendl; - r = object_map->clone(oldoid, newoid, &spos); - if (r < 0 && r != -ENOENT) - goto out3; - } - - { - char buf[2]; - map aset; - r = _fgetattrs(**o, aset); - if (r < 0) - goto out3; - - r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); - if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { - r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, - sizeof(XATTR_NO_SPILL_OUT), true); - } else { - r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, - sizeof(XATTR_SPILL_OUT), true); - } - if (r < 0) - goto out3; - - r = _fsetattrs(**n, aset); - if (r < 0) - goto out3; - } - - // clone is non-idempotent; record our work. 
- _set_replay_guard(**n, spos, &newoid); - - out3: - lfn_close(n); - out: - lfn_close(o); - out2: - dout(10) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return r; -} - -int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) -{ - dout(20) << "_do_clone_range copy " << srcoff << "~" << len << " to " << dstoff << dendl; - return backend->clone_range(from, to, srcoff, len, dstoff); -} - -int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) -{ - dout(20) << __func__ << " " << srcoff << "~" << len << " to " << dstoff << dendl; - int r = 0; - map exomap; - // fiemap doesn't allow zero length - if (len == 0) - return 0; - - if (backend->has_seek_data_hole()) { - dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl; - r = _do_seek_hole_data(from, srcoff, len, &exomap); - } else if (backend->has_fiemap()) { - dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl; - r = _do_fiemap(from, srcoff, len, &exomap); - } - - int64_t written = 0; - for (map::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) { - uint64_t it_off = miter->first - srcoff + dstoff; - r = _do_copy_range(from, to, miter->first, miter->second, it_off, true); - if (r < 0) { - r = -errno; - derr << "FileStore::_do_copy_range: copy error at " << miter->first << "~" << miter->second - << " to " << it_off << ", " << cpp_strerror(r) << dendl; - break; - } - written += miter->second; - } - - if (r >= 0) { - if (m_filestore_sloppy_crc) { - int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); - assert(rc >= 0); - } - struct stat st; - r = ::fstat(to, &st); - if (r < 0) { - r = -errno; - derr << __func__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl; - goto out; - } - if (st.st_size < (int)(dstoff + len)) { - r = ::ftruncate(to, 
dstoff + len); - if (r < 0) { - r = -errno; - derr << __func__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl; - goto out; - } - } - r = written; - } - - out: - dout(20) << __func__ << " " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; - return r; -} - -int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc) -{ - dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << dendl; - int r = 0; - loff_t pos = srcoff; - loff_t end = srcoff + len; - int buflen = 4096 * 16; //limit by pipe max size.see fcntl - -#ifdef CEPH_HAVE_SPLICE - if (backend->has_splice()) { - int pipefd[2]; - if (pipe(pipefd) < 0) { - r = errno; - derr << " pipe " << " got " << cpp_strerror(r) << dendl; - return r; - } - - loff_t dstpos = dstoff; - while (pos < end) { - int l = MIN(end-pos, buflen); - r = safe_splice(from, &pos, pipefd[1], NULL, l, SPLICE_F_NONBLOCK); - dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl; - if (r < 0) { - derr << "FileStore::_do_copy_range: safe_splice read error at " << pos << "~" << len - << ", " << cpp_strerror(r) << dendl; - break; - } - if (r == 0) { - // hrm, bad source range, wtf. 
- r = -ERANGE; - derr << "FileStore::_do_copy_range got short read result at " << pos - << " of fd " << from << " len " << len << dendl; - break; - } - - r = safe_splice(pipefd[0], NULL, to, &dstpos, r, 0); - dout(10) << " safe_splice write to " << to << " len " << r - << " got " << r << dendl; - if (r < 0) { - derr << "FileStore::_do_copy_range: write error at " << pos << "~" - << r << ", " << cpp_strerror(r) << dendl; - break; - } - } - close(pipefd[0]); - close(pipefd[1]); - } else -#endif - { - int64_t actual; - - actual = ::lseek64(from, srcoff, SEEK_SET); - if (actual != (int64_t)srcoff) { - r = errno; - derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl; - return r; - } - actual = ::lseek64(to, dstoff, SEEK_SET); - if (actual != (int64_t)dstoff) { - r = errno; - derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl; - return r; - } - - char buf[buflen]; - while (pos < end) { - int l = MIN(end-pos, buflen); - r = ::read(from, buf, l); - dout(25) << " read from " << pos << "~" << l << " got " << r << dendl; - if (r < 0) { - if (errno == EINTR) { - continue; - } else { - r = -errno; - derr << "FileStore::_do_copy_range: read error at " << pos << "~" << len - << ", " << cpp_strerror(r) << dendl; - break; - } - } - if (r == 0) { - // hrm, bad source range, wtf. 
- r = -ERANGE; - derr << "FileStore::_do_copy_range got short read result at " << pos - << " of fd " << from << " len " << len << dendl; - break; - } - int op = 0; - while (op < r) { - int r2 = safe_write(to, buf+op, r-op); - dout(25) << " write to " << to << " len " << (r-op) - << " got " << r2 << dendl; - if (r2 < 0) { - r = r2; - derr << "FileStore::_do_copy_range: write error at " << pos << "~" - << r-op << ", " << cpp_strerror(r) << dendl; - - break; - } - op += (r-op); - } - if (r < 0) - break; - pos += r; - } - } - - assert(pos == end); - if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) { - int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); - assert(rc >= 0); - } - dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; - return r; -} - -int FileStore::_clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, - uint64_t srcoff, uint64_t len, uint64_t dstoff, - const SequencerPosition& spos) -{ - dout(15) << "clone_range " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl; - - if (_check_replay_guard(cid, newoid, spos) < 0) - return 0; - - int r; - FDRef o, n; - r = lfn_open(cid, oldoid, false, &o); - if (r < 0) { - goto out2; - } - r = lfn_open(cid, newoid, true, &n); - if (r < 0) { - goto out; - } - r = _do_clone_range(**o, **n, srcoff, len, dstoff); - if (r < 0) { - r = -errno; - goto out3; - } - - // clone is non-idempotent; record our work. 
- _set_replay_guard(**n, spos, &newoid); - - out3: - lfn_close(n); - out: - lfn_close(o); - out2: - dout(10) << "clone_range " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " " - << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; - return r; -} - -class SyncEntryTimeout : public Context { -public: - SyncEntryTimeout(int commit_timeo) - : m_commit_timeo(commit_timeo) - { - } - - void finish(int r) { - BackTrace *bt = new BackTrace(1); - generic_dout(-1) << "FileStore: sync_entry timed out after " - << m_commit_timeo << " seconds.\n"; - bt->print(*_dout); - *_dout << dendl; - delete bt; - ceph_abort(); - } -private: - int m_commit_timeo; -}; - -void FileStore::sync_entry() -{ - lock.Lock(); - while (!stop) { - utime_t max_interval; - max_interval.set_from_double(m_filestore_max_sync_interval); - utime_t min_interval; - min_interval.set_from_double(m_filestore_min_sync_interval); - - utime_t startwait = ceph_clock_now(g_ceph_context); - if (!force_sync) { - dout(20) << "sync_entry waiting for max_interval " << max_interval << dendl; - sync_cond.WaitInterval(g_ceph_context, lock, max_interval); - } else { - dout(20) << "sync_entry not waiting, force_sync set" << dendl; - } - - if (force_sync) { - dout(20) << "sync_entry force_sync set" << dendl; - force_sync = false; - } else { - // wait for at least the min interval - utime_t woke = ceph_clock_now(g_ceph_context); - woke -= startwait; - dout(20) << "sync_entry woke after " << woke << dendl; - if (woke < min_interval) { - utime_t t = min_interval; - t -= woke; - dout(20) << "sync_entry waiting for another " << t - << " to reach min interval " << min_interval << dendl; - sync_cond.WaitInterval(g_ceph_context, lock, t); - } - } - - list fin; - again: - fin.swap(sync_waiters); - lock.Unlock(); - - op_tp.pause(); - if (apply_manager.commit_start()) { - utime_t start = ceph_clock_now(g_ceph_context); - uint64_t cp = apply_manager.get_committing_seq(); - - sync_entry_timeo_lock.Lock(); - 
SyncEntryTimeout *sync_entry_timeo = - new SyncEntryTimeout(m_filestore_commit_timeout); - timer.add_event_after(m_filestore_commit_timeout, sync_entry_timeo); - sync_entry_timeo_lock.Unlock(); - - logger->set(l_os_committing, 1); - - dout(15) << "sync_entry committing " << cp << dendl; - stringstream errstream; - if (g_conf->filestore_debug_omap_check && !object_map->check(errstream)) { - derr << errstream.str() << dendl; - assert(0); - } - - if (backend->can_checkpoint()) { - int err = write_op_seq(op_fd, cp); - if (err < 0) { - derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl; - assert(0 == "error during write_op_seq"); - } - - char s[NAME_MAX]; - snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp); - uint64_t cid = 0; - err = backend->create_checkpoint(s, &cid); - if (err < 0) { - int err = errno; - derr << "snap create '" << s << "' got error " << err << dendl; - assert(err == 0); - } - - snaps.push_back(cp); - apply_manager.commit_started(); - op_tp.unpause(); - - if (cid > 0) { - dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl; - err = backend->sync_checkpoint(cid); - if (err < 0) { - derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl; - assert(0 == "wait_sync got error"); - } - dout(20) << " done waiting for checkpoint" << cid << " to complete" << dendl; - } - } else - { - apply_manager.commit_started(); - op_tp.unpause(); - - object_map->sync(); - int err = backend->syncfs(); - if (err < 0) { - derr << "syncfs got " << cpp_strerror(err) << dendl; - assert(0 == "syncfs returned error"); - } - - err = write_op_seq(op_fd, cp); - if (err < 0) { - derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl; - assert(0 == "error during write_op_seq"); - } - err = ::fsync(op_fd); - if (err < 0) { - derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl; - assert(0 == "error during fsync of op_seq"); - } - } - - utime_t done = ceph_clock_now(g_ceph_context); - utime_t lat = 
done - start; - utime_t dur = done - startwait; - dout(10) << "sync_entry commit took " << lat << ", interval was " << dur << dendl; - - logger->inc(l_os_commit); - logger->tinc(l_os_commit_lat, lat); - logger->tinc(l_os_commit_len, dur); - - apply_manager.commit_finish(); - wbthrottle.clear(); - - logger->set(l_os_committing, 0); - - // remove old snaps? - if (backend->can_checkpoint()) { - char s[NAME_MAX]; - while (snaps.size() > 2) { - snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front()); - snaps.pop_front(); - dout(10) << "removing snap '" << s << "'" << dendl; - int r = backend->destroy_checkpoint(s); - if (r) { - int err = errno; - derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl; - } - } - } - - dout(15) << "sync_entry committed to op_seq " << cp << dendl; - - sync_entry_timeo_lock.Lock(); - timer.cancel_event(sync_entry_timeo); - sync_entry_timeo_lock.Unlock(); - } else { - op_tp.unpause(); - } - - lock.Lock(); - finish_contexts(g_ceph_context, fin, 0); - fin.clear(); - if (!sync_waiters.empty()) { - dout(10) << "sync_entry more waiters, committing again" << dendl; - goto again; - } - if (!stop && journal && journal->should_commit_now()) { - dout(10) << "sync_entry journal says we should commit again (probably is/was full)" << dendl; - goto again; - } - } - stop = false; - lock.Unlock(); -} - -void FileStore::_start_sync() -{ - if (!journal) { // don't do a big sync if the journal is on - dout(10) << "start_sync" << dendl; - sync_cond.Signal(); - } else { - dout(10) << "start_sync - NOOP (journal is on)" << dendl; - } -} - -void FileStore::do_force_sync() -{ - dout(10) << __func__ << dendl; - Mutex::Locker l(lock); - force_sync = true; - sync_cond.Signal(); -} - -void FileStore::start_sync(Context *onsafe) -{ - Mutex::Locker l(lock); - sync_waiters.push_back(onsafe); - sync_cond.Signal(); - force_sync = true; - dout(10) << "start_sync" << dendl; -} - -void FileStore::sync() -{ - Mutex 
l("FileStore::sync"); - Cond c; - bool done; - C_SafeCond *fin = new C_SafeCond(&l, &c, &done); - - start_sync(fin); - - l.Lock(); - while (!done) { - dout(10) << "sync waiting" << dendl; - c.Wait(l); - } - l.Unlock(); - dout(10) << "sync done" << dendl; -} - -void FileStore::_flush_op_queue() -{ - dout(10) << "_flush_op_queue draining op tp" << dendl; - op_wq.drain(); - dout(10) << "_flush_op_queue waiting for apply finisher" << dendl; - for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { - (*it)->wait_for_empty(); - } -} - -/* - * flush - make every queued write readable - */ -void FileStore::flush() -{ - dout(10) << "flush" << dendl; - - if (g_conf->filestore_blackhole) { - // wait forever - Mutex lock("FileStore::flush::lock"); - Cond cond; - lock.Lock(); - while (true) - cond.Wait(lock); - assert(0); - } - - if (m_filestore_journal_writeahead) { - if (journal) - journal->flush(); - dout(10) << "flush draining ondisk finisher" << dendl; - for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { - (*it)->wait_for_empty(); - } - } - - _flush_op_queue(); - dout(10) << "flush complete" << dendl; -} - -/* - * sync_and_flush - make every queued write readable AND committed to disk - */ -void FileStore::sync_and_flush() -{ - dout(10) << "sync_and_flush" << dendl; - - if (m_filestore_journal_writeahead) { - if (journal) - journal->flush(); - _flush_op_queue(); - } else { - // includes m_filestore_journal_parallel - _flush_op_queue(); - sync(); - } - dout(10) << "sync_and_flush done" << dendl; -} - -int FileStore::flush_journal() -{ - dout(10) << __func__ << dendl; - sync_and_flush(); - sync(); - return 0; -} - -int FileStore::snapshot(const string& name) -{ - dout(10) << "snapshot " << name << dendl; - sync_and_flush(); - - if (!backend->can_checkpoint()) { - dout(0) << "snapshot " << name << " failed, not supported" << dendl; - return -EOPNOTSUPP; - } - - char s[NAME_MAX]; - snprintf(s, sizeof(s), 
CLUSTER_SNAP_ITEM, name.c_str()); - - int r = backend->create_checkpoint(s, NULL); - if (r) { - r = -errno; - derr << "snapshot " << name << " failed: " << cpp_strerror(r) << dendl; - } - - return r; -} - -// ------------------------------- -// attributes - -int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp) -{ - char val[CHAIN_XATTR_MAX_BLOCK_LEN]; - int l = chain_fgetxattr(fd, name, val, sizeof(val)); - if (l >= 0) { - bp = buffer::create(l); - memcpy(bp.c_str(), val, l); - } else if (l == -ERANGE) { - l = chain_fgetxattr(fd, name, 0, 0); - if (l > 0) { - bp = buffer::create(l); - l = chain_fgetxattr(fd, name, bp.c_str(), l); - } - } - assert(!m_filestore_fail_eio || l != -EIO); - return l; -} - -int FileStore::_fgetattrs(int fd, map& aset) -{ - // get attr list - char names1[100]; - int len = chain_flistxattr(fd, names1, sizeof(names1)-1); - char *names2 = 0; - char *name = 0; - if (len == -ERANGE) { - len = chain_flistxattr(fd, 0, 0); - if (len < 0) { - assert(!m_filestore_fail_eio || len != -EIO); - return len; - } - dout(10) << " -ERANGE, len is " << len << dendl; - names2 = new char[len+1]; - len = chain_flistxattr(fd, names2, len); - dout(10) << " -ERANGE, got " << len << dendl; - if (len < 0) { - assert(!m_filestore_fail_eio || len != -EIO); - delete[] names2; - return len; - } - name = names2; - } else if (len < 0) { - assert(!m_filestore_fail_eio || len != -EIO); - return len; - } else { - name = names1; - } - name[len] = 0; - - char *end = name + len; - while (name < end) { - char *attrname = name; - if (parse_attrname(&name)) { - if (*name) { - dout(20) << "fgetattrs " << fd << " getting '" << name << "'" << dendl; - int r = _fgetattr(fd, attrname, aset[name]); - if (r < 0) { - delete[] names2; - return r; - } - } - } - name += strlen(name) + 1; - } - - delete[] names2; - return 0; -} - -int FileStore::_fsetattrs(int fd, map &aset) -{ - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - char n[CHAIN_XATTR_MAX_NAME_LEN]; - 
get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); - const char *val; - if (p->second.length()) - val = p->second.c_str(); - else - val = ""; - // ??? Why do we skip setting all the other attrs if one fails? - int r = chain_fsetxattr(fd, n, val, p->second.length()); - if (r < 0) { - derr << "FileStore::_setattrs: chain_setxattr returned " << r << dendl; - return r; - } - } - return 0; -} - -// debug EIO injection -void FileStore::inject_data_error(const ghobject_t &oid) { - Mutex::Locker l(read_error_lock); - dout(10) << __func__ << ": init error on " << oid << dendl; - data_error_set.insert(oid); -} -void FileStore::inject_mdata_error(const ghobject_t &oid) { - Mutex::Locker l(read_error_lock); - dout(10) << __func__ << ": init error on " << oid << dendl; - mdata_error_set.insert(oid); -} -void FileStore::debug_obj_on_delete(const ghobject_t &oid) { - Mutex::Locker l(read_error_lock); - dout(10) << __func__ << ": clear error on " << oid << dendl; - data_error_set.erase(oid); - mdata_error_set.erase(oid); -} -bool FileStore::debug_data_eio(const ghobject_t &oid) { - Mutex::Locker l(read_error_lock); - if (data_error_set.count(oid)) { - dout(10) << __func__ << ": inject error on " << oid << dendl; - return true; - } else { - return false; - } -} -bool FileStore::debug_mdata_eio(const ghobject_t &oid) { - Mutex::Locker l(read_error_lock); - if (mdata_error_set.count(oid)) { - dout(10) << __func__ << ": inject error on " << oid << dendl; - return true; - } else { - return false; - } -} - - -// objects - -int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp) -{ - tracepoint(objectstore, getattr_enter, cid.c_str()); - _kludge_temp_object_collection(cid, oid); - dout(15) << "getattr " << cid << "/" << oid << " '" << name << "'" << dendl; - FDRef fd; - int r = lfn_open(cid, oid, false, &fd); - if (r < 0) { - goto out; - } - char n[CHAIN_XATTR_MAX_NAME_LEN]; - get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); - r = 
_fgetattr(**fd, n, bp); - lfn_close(fd); - if (r == -ENODATA) { - map got; - set to_get; - to_get.insert(string(name)); - Index index; - r = get_index(cid, &index); - if (r < 0) { - dout(10) << __func__ << " could not get index r = " << r << dendl; - goto out; - } - r = object_map->get_xattrs(oid, to_get, &got); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " get_xattrs err r =" << r << dendl; - goto out; - } - if (got.empty()) { - dout(10) << __func__ << " got.size() is 0" << dendl; - return -ENODATA; - } - bp = bufferptr(got.begin()->second.c_str(), - got.begin()->second.length()); - r = bp.length(); - } - out: - dout(10) << "getattr " << cid << "/" << oid << " '" << name << "' = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - if (g_conf->filestore_debug_inject_read_err && - debug_mdata_eio(oid)) { - return -EIO; - } else { - tracepoint(objectstore, getattr_exit, r); - return r < 0 ? r : 0; - } -} - -int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map& aset) -{ - tracepoint(objectstore, getattrs_enter, cid.c_str()); - _kludge_temp_object_collection(cid, oid); - set omap_attrs; - map omap_aset; - Index index; - dout(15) << "getattrs " << cid << "/" << oid << dendl; - FDRef fd; - bool spill_out = true; - char buf[2]; - - int r = lfn_open(cid, oid, false, &fd); - if (r < 0) { - goto out; - } - - r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); - if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) - spill_out = false; - - r = _fgetattrs(**fd, aset); - if (r < 0) { - goto out; - } - lfn_close(fd); - - if (!spill_out) { - dout(10) << __func__ << " no xattr exists in object_map r = " << r << dendl; - goto out; - } - - r = get_index(cid, &index); - if (r < 0) { - dout(10) << __func__ << " could not get index r = " << r << dendl; - goto out; - } - { - r = object_map->get_all_xattrs(oid, &omap_attrs); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not get omap_attrs r = " << 
r << dendl; - goto out; - } - - r = object_map->get_xattrs(oid, omap_attrs, &omap_aset); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; - goto out; - } - if (r == -ENOENT) - r = 0; - } - assert(omap_attrs.size() == omap_aset.size()); - for (map::iterator i = omap_aset.begin(); - i != omap_aset.end(); - ++i) { - string key(i->first); - aset.insert(make_pair(key, - bufferptr(i->second.c_str(), i->second.length()))); - } - out: - dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - - if (g_conf->filestore_debug_inject_read_err && - debug_mdata_eio(oid)) { - return -EIO; - } else { - tracepoint(objectstore, getattrs_exit, r); - return r; - } -} - -int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map& aset, - const SequencerPosition &spos) -{ - map omap_set; - set omap_remove; - map inline_set; - map inline_to_set; - FDRef fd; - int spill_out = -1; - bool incomplete_inline = false; - - int r = lfn_open(cid, oid, false, &fd); - if (r < 0) { - goto out; - } - - char buf[2]; - r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); - if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) - spill_out = 0; - else - spill_out = 1; - - r = _fgetattrs(**fd, inline_set); - incomplete_inline = (r == -E2BIG); - assert(!m_filestore_fail_eio || r != -EIO); - dout(15) << "setattrs " << cid << "/" << oid - << (incomplete_inline ? 
" (incomplete_inline, forcing omap)" : "") - << dendl; - - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - char n[CHAIN_XATTR_MAX_NAME_LEN]; - get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); - - if (incomplete_inline) { - chain_fremovexattr(**fd, n); // ignore any error - omap_set[p->first].push_back(p->second); - continue; - } - - if (p->second.length() > m_filestore_max_inline_xattr_size) { - if (inline_set.count(p->first)) { - inline_set.erase(p->first); - r = chain_fremovexattr(**fd, n); - if (r < 0) - goto out_close; - } - omap_set[p->first].push_back(p->second); - continue; - } - - if (!inline_set.count(p->first) && - inline_set.size() >= m_filestore_max_inline_xattrs) { - omap_set[p->first].push_back(p->second); - continue; - } - omap_remove.insert(p->first); - inline_set.insert(*p); - - inline_to_set.insert(*p); - } - - if (spill_out != 1 && !omap_set.empty()) { - chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, - sizeof(XATTR_SPILL_OUT)); - } - - r = _fsetattrs(**fd, inline_to_set); - if (r < 0) - goto out_close; - - if (spill_out && !omap_remove.empty()) { - r = object_map->remove_xattrs(oid, omap_remove, &spos); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - goto out_close; - } else { - r = 0; // don't confuse the debug output - } - } - - if (!omap_set.empty()) { - r = object_map->set_xattrs(oid, omap_set, &spos); - if (r < 0) { - dout(10) << __func__ << " could not set_xattrs r = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - goto out_close; - } - } - out_close: - lfn_close(fd); - out: - dout(10) << "setattrs " << cid << "/" << oid << " = " << r << dendl; - return r; -} - - -int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name, - const SequencerPosition &spos) -{ - dout(15) << "rmattr " << cid << "/" << oid << " '" << name << "'" << dendl; - FDRef fd; - bool spill_out 
= true; - bufferptr bp; - - int r = lfn_open(cid, oid, false, &fd); - if (r < 0) { - goto out; - } - - char buf[2]; - r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); - if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { - spill_out = false; - } - - char n[CHAIN_XATTR_MAX_NAME_LEN]; - get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); - r = chain_fremovexattr(**fd, n); - if (r == -ENODATA && spill_out) { - Index index; - r = get_index(cid, &index); - if (r < 0) { - dout(10) << __func__ << " could not get index r = " << r << dendl; - goto out_close; - } - set to_remove; - to_remove.insert(string(name)); - r = object_map->remove_xattrs(oid, to_remove, &spos); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not remove_xattrs index r = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - goto out_close; - } - } - out_close: - lfn_close(fd); - out: - dout(10) << "rmattr " << cid << "/" << oid << " '" << name << "' = " << r << dendl; - return r; -} - -int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid, - const SequencerPosition &spos) -{ - dout(15) << "rmattrs " << cid << "/" << oid << dendl; - - map aset; - FDRef fd; - set omap_attrs; - Index index; - bool spill_out = true; - - int r = lfn_open(cid, oid, false, &fd); - if (r < 0) { - goto out; - } - - char buf[2]; - r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); - if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { - spill_out = false; - } - - r = _fgetattrs(**fd, aset); - if (r >= 0) { - for (map::iterator p = aset.begin(); p != aset.end(); ++p) { - char n[CHAIN_XATTR_MAX_NAME_LEN]; - get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); - r = chain_fremovexattr(**fd, n); - if (r < 0) - break; - } - } - - if (!spill_out) { - dout(10) << __func__ << " no xattr exists in object_map r = " << r << dendl; - goto out_close; - } - - r = get_index(cid, &index); - if (r < 0) { - dout(10) << 
__func__ << " could not get index r = " << r << dendl; - goto out_close; - } - { - r = object_map->get_all_xattrs(oid, &omap_attrs); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - goto out_close; - } - r = object_map->remove_xattrs(oid, omap_attrs, &spos); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl; - goto out_close; - } - if (r == -ENOENT) - r = 0; - chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, - sizeof(XATTR_NO_SPILL_OUT)); - } - - out_close: - lfn_close(fd); - out: - dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl; - return r; -} - - - -// collections - -int FileStore::collection_getattr(coll_t c, const char *name, - void *value, size_t size) -{ - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(15) << "collection_getattr " << fn << " '" << name << "' len " << size << dendl; - int r; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - char n[PATH_MAX]; - get_attrname(name, n, PATH_MAX); - r = chain_fgetxattr(fd, n, value, size); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_getattr " << fn << " '" << name << "' len " << size << " = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return r; -} - -int FileStore::collection_getattr(coll_t c, const char *name, bufferlist& bl) -{ - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(15) << "collection_getattr " << fn << " '" << name << "'" << dendl; - char n[PATH_MAX]; - get_attrname(name, n, PATH_MAX); - buffer::ptr bp; - int r; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - r = _fgetattr(fd, n, bp); - bl.push_back(bp); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_getattr " << fn << " '" << name << "' = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - 
return r; -} - -int FileStore::collection_getattrs(coll_t cid, map& aset) -{ - char fn[PATH_MAX]; - get_cdir(cid, fn, sizeof(fn)); - dout(10) << "collection_getattrs " << fn << dendl; - int r = 0; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - r = _fgetattrs(fd, aset); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_getattrs " << fn << " = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return r; -} - - -int FileStore::_collection_setattr(coll_t c, const char *name, - const void *value, size_t size) -{ - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << dendl; - char n[PATH_MAX]; - int r; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - get_attrname(name, n, PATH_MAX); - r = chain_fsetxattr(fd, n, value, size); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << " = " << r << dendl; - return r; -} - -int FileStore::_collection_rmattr(coll_t c, const char *name) -{ - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(15) << "collection_rmattr " << fn << dendl; - char n[PATH_MAX]; - get_attrname(name, n, PATH_MAX); - int r; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - r = chain_fremovexattr(fd, n); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_rmattr " << fn << " = " << r << dendl; - return r; -} - - -int FileStore::_collection_setattrs(coll_t cid, map& aset) -{ - char fn[PATH_MAX]; - get_cdir(cid, fn, sizeof(fn)); - dout(15) << "collection_setattrs " << fn << dendl; - int r = 0; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) { - r = -errno; - goto out; - } - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - char n[PATH_MAX]; - get_attrname(p->first.c_str(), n, PATH_MAX); - r = chain_fsetxattr(fd, n, p->second.c_str(), 
p->second.length()); - if (r < 0) - break; - } - VOID_TEMP_FAILURE_RETRY(::close(fd)); - out: - dout(10) << "collection_setattrs " << fn << " = " << r << dendl; - return r; -} - -int FileStore::_collection_remove_recursive(const coll_t &cid, - const SequencerPosition &spos) -{ - struct stat st; - int r = collection_stat(cid, &st); - if (r < 0) { - if (r == -ENOENT) - return 0; - return r; - } - - vector objects; - ghobject_t max; - while (!max.is_max()) { - r = collection_list(cid, max, ghobject_t::get_max(), true, - 300, &objects, &max); - if (r < 0) - return r; - for (vector::iterator i = objects.begin(); - i != objects.end(); - ++i) { - assert(_check_replay_guard(cid, *i, spos)); - r = _remove(cid, *i, spos); - if (r < 0) - return r; - } - } - return _destroy_collection(cid); -} - -// -------------------------- -// collections - -int FileStore::collection_version_current(coll_t c, uint32_t *version) -{ - Index index; - int r = get_index(c, &index); - if (r < 0) - return r; - - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - - *version = index->collection_version(); - if (*version == target_version) - return 1; - else - return 0; -} - -int FileStore::list_collections(vector& ls) -{ - return list_collections(ls, false); -} - -int FileStore::list_collections(vector& ls, bool include_temp) -{ - tracepoint(objectstore, list_collections_enter); - dout(10) << "list_collections" << dendl; - - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/current", basedir.c_str()); - - int r = 0; - DIR *dir = ::opendir(fn); - if (!dir) { - r = -errno; - derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - - char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1]; - struct dirent *de; - while ((r = ::readdir_r(dir, (struct dirent *)&buf, &de)) == 0) { - if (!de) - break; - if (de->d_type == DT_UNKNOWN) { - // d_type not supported (non-ext[234], btrfs), must stat 
- struct stat sb; - char filename[PATH_MAX]; - snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name); - - r = ::stat(filename, &sb); - if (r < 0) { - r = -errno; - derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - break; - } - if (!S_ISDIR(sb.st_mode)) { - continue; - } - } else if (de->d_type != DT_DIR) { - continue; - } - if (strcmp(de->d_name, "omap") == 0) { - continue; - } - if (de->d_name[0] == '.' && - (de->d_name[1] == '\0' || - (de->d_name[1] == '.' && - de->d_name[2] == '\0'))) - continue; - coll_t cid; - if (!cid.parse(de->d_name)) { - derr << "ignoging invalid collection '" << de->d_name << "'" << dendl; - continue; - } - if (!cid.is_temp() || include_temp) - ls.push_back(cid); - } - - if (r > 0) { - derr << "trying readdir_r " << fn << ": " << cpp_strerror(r) << dendl; - r = -r; - } - - ::closedir(dir); - assert(!m_filestore_fail_eio || r != -EIO); - tracepoint(objectstore, list_collections_exit, r); - return r; -} - -int FileStore::collection_stat(coll_t c, struct stat *st) -{ - tracepoint(objectstore, collection_stat_enter, c.c_str()); - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(15) << "collection_stat " << fn << dendl; - int r = ::stat(fn, st); - if (r < 0) - r = -errno; - dout(10) << "collection_stat " << fn << " = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - tracepoint(objectstore, collection_stat_exit, r); - return r; -} - -bool FileStore::collection_exists(coll_t c) -{ - tracepoint(objectstore, collection_exists_enter, c.c_str()); - struct stat st; - bool ret = collection_stat(c, &st) == 0; - tracepoint(objectstore, collection_exists_exit, ret); - return ret; -} - -bool FileStore::collection_empty(coll_t c) -{ - tracepoint(objectstore, collection_empty_enter, c.c_str()); - dout(15) << "collection_empty " << c << dendl; - Index index; - int r = get_index(c, &index); - if (r < 0) - return false; - - assert(NULL != index.index); - 
RWLock::RLocker l((index.index)->access_lock); - - vector ls; - r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(), true, - 1, &ls, NULL); - if (r < 0) { - assert(!m_filestore_fail_eio || r != -EIO); - return false; - } - bool ret = ls.empty(); - tracepoint(objectstore, collection_empty_exit, ret); - return ret; -} -int FileStore::collection_list(coll_t c, ghobject_t start, ghobject_t end, - bool sort_bitwise, int max, - vector *ls, ghobject_t *next) -{ - if (start.is_max()) - return 0; - - ghobject_t temp_next; - if (!next) - next = &temp_next; - // figure out the pool id. we need this in order to generate a - // meaningful 'next' value. - int64_t pool = -1; - shard_id_t shard; - { - spg_t pgid; - if (c.is_temp(&pgid)) { - pool = -2 - pgid.pool(); - shard = pgid.shard; - } else if (c.is_pg(&pgid)) { - pool = pgid.pool(); - shard = pgid.shard; - } else if (c.is_meta()) { - pool = -1; - shard = shard_id_t::NO_SHARD; - } else { - // hrm, the caller is test code! we should get kill it off. for now, - // tolerate it. 
- pool = 0; - shard = shard_id_t::NO_SHARD; - } - dout(20) << __func__ << " pool is " << pool << " shard is " << shard - << " pgid " << pgid << dendl; - } - ghobject_t sep; - sep.hobj.pool = -1; - sep.set_shard(shard); - if (!c.is_temp() && !c.is_meta()) { - if (cmp_bitwise(start, sep) < 0) { // bitwise vs nibble doesn't matter here - dout(10) << __func__ << " first checking temp pool" << dendl; - coll_t temp = c.get_temp(); - int r = collection_list(temp, start, end, sort_bitwise, max, ls, next); - if (r < 0) - return r; - if (*next != ghobject_t::get_max()) - return r; - start = sep; - dout(10) << __func__ << " fall through to non-temp collection, start " - << start << dendl; - } else { - dout(10) << __func__ << " start " << start << " >= sep " << sep << dendl; - } - } - - Index index; - int r = get_index(c, &index); - if (r < 0) - return r; - - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - - r = index->collection_list_partial(start, end, sort_bitwise, max, ls, next); - - if (r < 0) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - dout(20) << "objects: " << ls << dendl; - - // HashIndex doesn't know the pool when constructing a 'next' value - if (next && !next->is_max()) { - next->hobj.pool = pool; - next->set_shard(shard); - dout(20) << " next " << *next << dendl; - } - - return 0; -} - -int FileStore::omap_get(coll_t c, const ghobject_t &hoid, - bufferlist *header, - map *out) -{ - tracepoint(objectstore, omap_get_enter, c.c_str()); - _kludge_temp_object_collection(c, hoid); - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - Index index; - int r = get_index(c, &index); - if (r < 0) - return r; - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) - return r; - } - r = object_map->get(hoid, header, out); - if (r < 0 && r != -ENOENT) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - tracepoint(objectstore, 
omap_get_exit, 0); - return 0; -} - -int FileStore::omap_get_header( - coll_t c, - const ghobject_t &hoid, - bufferlist *bl, - bool allow_eio) -{ - tracepoint(objectstore, omap_get_header_enter, c.c_str()); - _kludge_temp_object_collection(c, hoid); - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - Index index; - int r = get_index(c, &index); - if (r < 0) - return r; - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) - return r; - } - r = object_map->get_header(hoid, bl); - if (r < 0 && r != -ENOENT) { - assert(allow_eio || !m_filestore_fail_eio || r != -EIO); - return r; - } - tracepoint(objectstore, omap_get_header_exit, 0); - return 0; -} - -int FileStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set *keys) -{ - tracepoint(objectstore, omap_get_keys_enter, c.c_str()); - _kludge_temp_object_collection(c, hoid); - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - Index index; - int r = get_index(c, &index); - if (r < 0) - return r; - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) - return r; - } - r = object_map->get_keys(hoid, keys); - if (r < 0 && r != -ENOENT) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - tracepoint(objectstore, omap_get_keys_exit, 0); - return 0; -} - -int FileStore::omap_get_values(coll_t c, const ghobject_t &hoid, - const set &keys, - map *out) -{ - tracepoint(objectstore, omap_get_values_enter, c.c_str()); - _kludge_temp_object_collection(c, hoid); - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - Index index; - const char *where = 0; - int r = get_index(c, &index); - if (r < 0) { - where = " (get_index)"; - goto out; - } - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) { - where = " (lfn_find)"; - goto out; - } - } - r = object_map->get_values(hoid, keys, 
out); - if (r < 0 && r != -ENOENT) { - assert(!m_filestore_fail_eio || r != -EIO); - goto out; - } - r = 0; - out: - tracepoint(objectstore, omap_get_values_exit, r); - dout(15) << __func__ << " " << c << "/" << hoid << " = " << r - << where << dendl; - return r; -} - -int FileStore::omap_check_keys(coll_t c, const ghobject_t &hoid, - const set &keys, - set *out) -{ - tracepoint(objectstore, omap_check_keys_enter, c.c_str()); - _kludge_temp_object_collection(c, hoid); - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - Index index; - int r = get_index(c, &index); - if (r < 0) - return r; - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) - return r; - } - r = object_map->check_keys(hoid, keys, out); - if (r < 0 && r != -ENOENT) { - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - tracepoint(objectstore, omap_check_keys_exit, 0); - return 0; -} - -ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c, - const ghobject_t &hoid) -{ - tracepoint(objectstore, get_omap_iterator, c.c_str()); - _kludge_temp_object_collection(c, hoid); - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - Index index; - int r = get_index(c, &index); - if (r < 0) { - dout(10) << __func__ << " " << c << "/" << hoid << " = 0 " - << "(get_index failed with " << cpp_strerror(r) << ")" << dendl; - return ObjectMap::ObjectMapIterator(); - } - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) { - dout(10) << __func__ << " " << c << "/" << hoid << " = 0 " - << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl; - return ObjectMap::ObjectMapIterator(); - } - } - return object_map->get_iterator(hoid); -} - -int FileStore::_collection_hint_expected_num_objs(coll_t c, uint32_t pg_num, - uint64_t expected_num_objs, - const SequencerPosition &spos) -{ - dout(15) << __func__ << " collection: " << c << " pg 
number: " - << pg_num << " expected number of objects: " << expected_num_objs << dendl; - - if (!collection_empty(c) && !replaying) { - dout(0) << "Failed to give an expected number of objects hint to collection : " - << c << ", only empty collection can take such type of hint. " << dendl; - return 0; - } - - int ret; - Index index; - ret = get_index(c, &index); - if (ret < 0) - return ret; - // Pre-hash the collection - ret = index->pre_hash_collection(pg_num, expected_num_objs); - dout(10) << "pre_hash_collection " << c << " = " << ret << dendl; - if (ret < 0) - return ret; - _set_replay_guard(c, spos); - - return 0; -} - -int FileStore::_create_collection( - coll_t c, - const SequencerPosition &spos) -{ - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(15) << "create_collection " << fn << dendl; - int r = ::mkdir(fn, 0755); - if (r < 0) - r = -errno; - if (r == -EEXIST && replaying) - r = 0; - dout(10) << "create_collection " << fn << " = " << r << dendl; - - if (r < 0) - return r; - r = init_index(c); - if (r < 0) - return r; - - // create parallel temp collection, too - if (!c.is_meta() && !c.is_temp()) { - coll_t temp = c.get_temp(); - r = _create_collection(temp, spos); - if (r < 0) - return r; - } - - _set_replay_guard(c, spos); - return 0; -} - -int FileStore::_destroy_collection(coll_t c) -{ - int r = 0; - char fn[PATH_MAX]; - get_cdir(c, fn, sizeof(fn)); - dout(15) << "_destroy_collection " << fn << dendl; - { - Index from; - r = get_index(c, &from); - if (r < 0) - goto out; - assert(NULL != from.index); - RWLock::WLocker l((from.index)->access_lock); - - r = from->prep_delete(); - if (r < 0) - goto out; - } - r = ::rmdir(fn); - if (r < 0) { - r = -errno; - goto out; - } - - out: - // destroy parallel temp collection, too - if (!c.is_meta() && !c.is_temp()) { - coll_t temp = c.get_temp(); - int r2 = _destroy_collection(temp); - if (r2 < 0) { - r = r2; - goto out_final; - } - } - - out_final: - dout(10) << "_destroy_collection " << fn << " = " << 
r << dendl; - return r; -} - - -int FileStore::_collection_add(coll_t c, coll_t oldcid, const ghobject_t& o, - const SequencerPosition& spos) -{ - dout(15) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << dendl; - - int dstcmp = _check_replay_guard(c, o, spos); - if (dstcmp < 0) - return 0; - - // check the src name too; it might have a newer guard, and we don't - // want to clobber it - int srccmp = _check_replay_guard(oldcid, o, spos); - if (srccmp < 0) - return 0; - - // open guard on object so we don't any previous operations on the - // new name that will modify the source inode. - FDRef fd; - int r = lfn_open(oldcid, o, 0, &fd); - if (r < 0) { - // the source collection/object does not exist. If we are replaying, we - // should be safe, so just return 0 and move on. - assert(replaying); - dout(10) << "collection_add " << c << "/" << o << " from " - << oldcid << "/" << o << " (dne, continue replay) " << dendl; - return 0; - } - if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" - _set_replay_guard(**fd, spos, &o, true); - } - - r = lfn_link(oldcid, c, o, o); - if (replaying && !backend->can_checkpoint() && - r == -EEXIST) // crashed between link() and set_replay_guard() - r = 0; - - _inject_failure(); - - // close guard on object so we don't do this again - if (r == 0) { - _close_replay_guard(**fd, spos); - } - lfn_close(fd); - - dout(10) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl; - return r; -} - -int FileStore::_collection_move_rename(coll_t oldcid, const ghobject_t& oldoid, - coll_t c, const ghobject_t& o, - const SequencerPosition& spos) -{ - dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl; - int r = 0; - int dstcmp, srccmp; - - if (replaying) { - /* If the destination collection doesn't exist during replay, - * we need to delete the src object and continue on - */ - if (!collection_exists(c)) - goto out_rm_src; 
- } - - dstcmp = _check_replay_guard(c, o, spos); - if (dstcmp < 0) - goto out_rm_src; - - // check the src name too; it might have a newer guard, and we don't - // want to clobber it - srccmp = _check_replay_guard(oldcid, oldoid, spos); - if (srccmp < 0) - return 0; - - { - // open guard on object so we don't any previous operations on the - // new name that will modify the source inode. - FDRef fd; - r = lfn_open(oldcid, oldoid, 0, &fd); - if (r < 0) { - // the source collection/object does not exist. If we are replaying, we - // should be safe, so just return 0 and move on. - assert(replaying); - dout(10) << __func__ << " " << c << "/" << o << " from " - << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl; - return 0; - } - if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" - _set_replay_guard(**fd, spos, &o, true); - } - - r = lfn_link(oldcid, c, oldoid, o); - if (replaying && !backend->can_checkpoint() && - r == -EEXIST) // crashed between link() and set_replay_guard() - r = 0; - - _inject_failure(); - - if (r == 0) { - // the name changed; link the omap content - r = object_map->clone(oldoid, o, &spos); - if (r == -ENOENT) - r = 0; - } - - _inject_failure(); - - lfn_close(fd); - fd = FDRef(); - - if (r == 0) - r = lfn_unlink(oldcid, oldoid, spos, true); - - if (r == 0) - r = lfn_open(c, o, 0, &fd); - - // close guard on object so we don't do this again - if (r == 0) - _close_replay_guard(**fd, spos); - - lfn_close(fd); - } - - dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid - << " = " << r << dendl; - return r; - - out_rm_src: - // remove source - if (_check_replay_guard(oldcid, oldoid, spos) > 0) { - r = lfn_unlink(oldcid, oldoid, spos, true); - } - - dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid - << " = " << r << dendl; - return r; -} - -void FileStore::_inject_failure() -{ - if (m_filestore_kill_at.read()) { - int final = 
m_filestore_kill_at.dec(); - dout(5) << "_inject_failure " << (final+1) << " -> " << final << dendl; - if (final == 0) { - derr << "_inject_failure KILLING" << dendl; - g_ceph_context->_log->flush(); - _exit(1); - } - } -} - -int FileStore::_omap_clear(coll_t cid, const ghobject_t &hoid, - const SequencerPosition &spos) { - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - Index index; - int r = get_index(cid, &index); - if (r < 0) - return r; - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) - return r; - } - r = object_map->clear_keys_header(hoid, &spos); - if (r < 0 && r != -ENOENT) - return r; - return 0; -} - -int FileStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid, - const map &aset, - const SequencerPosition &spos) { - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - Index index; - int r; - //treat pgmeta as a logical object, skip to check exist - if (hoid.is_pgmeta()) - goto skip; - - r = get_index(cid, &index); - if (r < 0) { - dout(20) << __func__ << " get_index got " << cpp_strerror(r) << dendl; - return r; - } - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) { - dout(20) << __func__ << " lfn_find got " << cpp_strerror(r) << dendl; - return r; - } - } -skip: - r = object_map->set_keys(hoid, aset, &spos); - dout(20) << __func__ << " " << cid << "/" << hoid << " = " << r << dendl; - return r; -} - -int FileStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid, - const set &keys, - const SequencerPosition &spos) { - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - Index index; - int r; - //treat pgmeta as a logical object, skip to check exist - if (hoid.is_pgmeta()) - goto skip; - - r = get_index(cid, &index); - if (r < 0) - return r; - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) - 
return r; - } -skip: - r = object_map->rm_keys(hoid, keys, &spos); - if (r < 0 && r != -ENOENT) - return r; - return 0; -} - -int FileStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid, - const string& first, const string& last, - const SequencerPosition &spos) { - dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl; - set keys; - { - ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid); - if (!iter) - return -ENOENT; - for (iter->lower_bound(first); iter->valid() && iter->key() < last; - iter->next()) { - keys.insert(iter->key()); - } - } - return _omap_rmkeys(cid, hoid, keys, spos); -} - -int FileStore::_omap_setheader(coll_t cid, const ghobject_t &hoid, - const bufferlist &bl, - const SequencerPosition &spos) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - Index index; - int r = get_index(cid, &index); - if (r < 0) - return r; - { - assert(NULL != index.index); - RWLock::RLocker l((index.index)->access_lock); - r = lfn_find(hoid, index); - if (r < 0) - return r; - } - return object_map->set_header(hoid, bl, &spos); -} - -int FileStore::_split_collection(coll_t cid, - uint32_t bits, - uint32_t rem, - coll_t dest, - const SequencerPosition &spos) -{ - int r; - { - dout(15) << __func__ << " " << cid << " bits: " << bits << dendl; - if (!collection_exists(cid)) { - dout(2) << __func__ << ": " << cid << " DNE" << dendl; - assert(replaying); - return 0; - } - if (!collection_exists(dest)) { - dout(2) << __func__ << ": " << dest << " DNE" << dendl; - assert(replaying); - return 0; - } - - int dstcmp = _check_replay_guard(dest, spos); - if (dstcmp < 0) - return 0; - - int srccmp = _check_replay_guard(cid, spos); - if (srccmp < 0) - return 0; - - _set_global_replay_guard(cid, spos); - _set_replay_guard(cid, spos, true); - _set_replay_guard(dest, spos, true); - - Index from; - r = get_index(cid, &from); - - Index to; - if (!r) - r = get_index(dest, &to); - - if (!r) { - assert(NULL != 
from.index); - RWLock::WLocker l1((from.index)->access_lock); - - assert(NULL != to.index); - RWLock::WLocker l2((to.index)->access_lock); - - r = from->split(rem, bits, to.index); - } - - _close_replay_guard(cid, spos); - _close_replay_guard(dest, spos); - } - if (g_conf->filestore_debug_verify_split) { - vector objects; - ghobject_t next; - while (1) { - collection_list( - cid, - next, ghobject_t::get_max(), - true, - get_ideal_list_max(), - &objects, - &next); - if (objects.empty()) - break; - for (vector::iterator i = objects.begin(); - i != objects.end(); - ++i) { - dout(20) << __func__ << ": " << *i << " still in source " - << cid << dendl; - assert(!i->match(bits, rem)); - } - objects.clear(); - } - next = ghobject_t(); - while (1) { - collection_list( - dest, - next, ghobject_t::get_max(), - true, - get_ideal_list_max(), - &objects, - &next); - if (objects.empty()) - break; - for (vector::iterator i = objects.begin(); - i != objects.end(); - ++i) { - dout(20) << __func__ << ": " << *i << " now in dest " - << *i << dendl; - assert(i->match(bits, rem)); - } - objects.clear(); - } - } - return r; -} - -int FileStore::_set_alloc_hint(coll_t cid, const ghobject_t& oid, - uint64_t expected_object_size, - uint64_t expected_write_size) -{ - dout(15) << "set_alloc_hint " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl; - - FDRef fd; - int ret; - - ret = lfn_open(cid, oid, false, &fd); - if (ret < 0) - goto out; - - { - // TODO: a more elaborate hint calculation - uint64_t hint = MIN(expected_write_size, m_filestore_max_alloc_hint_size); - - ret = backend->set_alloc_hint(**fd, hint); - dout(20) << "set_alloc_hint hint " << hint << " ret " << ret << dendl; - } - - lfn_close(fd); -out: - dout(10) << "set_alloc_hint " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl; - assert(!m_filestore_fail_eio || ret != -EIO); - return 
ret; -} - -const char** FileStore::get_tracked_conf_keys() const -{ - static const char* KEYS[] = { - "filestore_min_sync_interval", - "filestore_max_sync_interval", - "filestore_queue_max_ops", - "filestore_queue_max_bytes", - "filestore_queue_committing_max_ops", - "filestore_queue_committing_max_bytes", - "filestore_commit_timeout", - "filestore_dump_file", - "filestore_kill_at", - "filestore_fail_eio", - "filestore_fadvise", - "filestore_sloppy_crc", - "filestore_sloppy_crc_block_size", - "filestore_max_alloc_hint_size", - NULL - }; - return KEYS; -} - -void FileStore::handle_conf_change(const struct md_config_t *conf, - const std::set &changed) -{ - if (changed.count("filestore_max_inline_xattr_size") || - changed.count("filestore_max_inline_xattr_size_xfs") || - changed.count("filestore_max_inline_xattr_size_btrfs") || - changed.count("filestore_max_inline_xattr_size_other") || - changed.count("filestore_max_inline_xattrs") || - changed.count("filestore_max_inline_xattrs_xfs") || - changed.count("filestore_max_inline_xattrs_btrfs") || - changed.count("filestore_max_inline_xattrs_other")) { - Mutex::Locker l(lock); - set_xattr_limits_via_conf(); - } - if (changed.count("filestore_min_sync_interval") || - changed.count("filestore_max_sync_interval") || - changed.count("filestore_queue_max_ops") || - changed.count("filestore_queue_max_bytes") || - changed.count("filestore_queue_committing_max_ops") || - changed.count("filestore_queue_committing_max_bytes") || - changed.count("filestore_kill_at") || - changed.count("filestore_fail_eio") || - changed.count("filestore_sloppy_crc") || - changed.count("filestore_sloppy_crc_block_size") || - changed.count("filestore_max_alloc_hint_size") || - changed.count("filestore_fadvise")) { - Mutex::Locker l(lock); - m_filestore_min_sync_interval = conf->filestore_min_sync_interval; - m_filestore_max_sync_interval = conf->filestore_max_sync_interval; - m_filestore_queue_max_ops = conf->filestore_queue_max_ops; - 
m_filestore_queue_max_bytes = conf->filestore_queue_max_bytes; - m_filestore_queue_committing_max_ops = conf->filestore_queue_committing_max_ops; - m_filestore_queue_committing_max_bytes = conf->filestore_queue_committing_max_bytes; - m_filestore_kill_at.set(conf->filestore_kill_at); - m_filestore_fail_eio = conf->filestore_fail_eio; - m_filestore_fadvise = conf->filestore_fadvise; - m_filestore_sloppy_crc = conf->filestore_sloppy_crc; - m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size; - m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size; - throttle_ops.reset_max(conf->filestore_queue_max_ops); - throttle_bytes.reset_max(conf->filestore_queue_max_bytes); - } - if (changed.count("filestore_commit_timeout")) { - Mutex::Locker l(sync_entry_timeo_lock); - m_filestore_commit_timeout = conf->filestore_commit_timeout; - } - if (changed.count("filestore_dump_file")) { - if (conf->filestore_dump_file.length() && - conf->filestore_dump_file != "-") { - dump_start(conf->filestore_dump_file); - } else { - dump_stop(); - } - } -} - -void FileStore::dump_start(const std::string& file) -{ - dout(10) << "dump_start " << file << dendl; - if (m_filestore_do_dump) { - dump_stop(); - } - m_filestore_dump_fmt.reset(); - m_filestore_dump_fmt.open_array_section("dump"); - m_filestore_dump.open(file.c_str()); - m_filestore_do_dump = true; -} - -void FileStore::dump_stop() -{ - dout(10) << "dump_stop" << dendl; - m_filestore_do_dump = false; - if (m_filestore_dump.is_open()) { - m_filestore_dump_fmt.close_section(); - m_filestore_dump_fmt.flush(m_filestore_dump); - m_filestore_dump.flush(); - m_filestore_dump.close(); - } -} - -void FileStore::dump_transactions(list& ls, uint64_t seq, OpSequencer *osr) -{ - m_filestore_dump_fmt.open_array_section("transactions"); - unsigned trans_num = 0; - for (list::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) { - m_filestore_dump_fmt.open_object_section("transaction"); - 
m_filestore_dump_fmt.dump_string("osr", osr->get_name()); - m_filestore_dump_fmt.dump_unsigned("seq", seq); - m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num); - (*i)->dump(&m_filestore_dump_fmt); - m_filestore_dump_fmt.close_section(); - } - m_filestore_dump_fmt.close_section(); - m_filestore_dump_fmt.flush(m_filestore_dump); - m_filestore_dump.flush(); -} - -void FileStore::set_xattr_limits_via_conf() -{ - uint32_t fs_xattr_size; - uint32_t fs_xattrs; - - switch (m_fs_type) { -#if defined(__linux__) - case XFS_SUPER_MAGIC: - fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs; - fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs; - break; - case BTRFS_SUPER_MAGIC: - fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs; - fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs; - break; -#endif - default: - fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other; - fs_xattrs = g_conf->filestore_max_inline_xattrs_other; - break; - } - - // Use override value if set - if (g_conf->filestore_max_inline_xattr_size) - m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size; - else - m_filestore_max_inline_xattr_size = fs_xattr_size; - - // Use override value if set - if (g_conf->filestore_max_inline_xattrs) - m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs; - else - m_filestore_max_inline_xattrs = fs_xattrs; -} - -// -- FSSuperblock -- - -void FSSuperblock::encode(bufferlist &bl) const -{ - ENCODE_START(2, 1, bl); - compat_features.encode(bl); - ::encode(omap_backend, bl); - ENCODE_FINISH(bl); -} - -void FSSuperblock::decode(bufferlist::iterator &bl) -{ - DECODE_START(2, bl); - compat_features.decode(bl); - if (struct_v >= 2) - ::decode(omap_backend, bl); - else - omap_backend = "leveldb"; - DECODE_FINISH(bl); -} - -void FSSuperblock::dump(Formatter *f) const -{ - f->open_object_section("compat"); - compat_features.dump(f); - f->dump_string("omap_backend", omap_backend); - f->close_section(); -} - 
-void FSSuperblock::generate_test_instances(list& o) -{ - FSSuperblock z; - o.push_back(new FSSuperblock(z)); - CompatSet::FeatureSet feature_compat; - CompatSet::FeatureSet feature_ro_compat; - CompatSet::FeatureSet feature_incompat; - feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); - z.compat_features = CompatSet(feature_compat, feature_ro_compat, - feature_incompat); - o.push_back(new FSSuperblock(z)); - z.omap_backend = "rocksdb"; - o.push_back(new FSSuperblock(z)); -} diff --git a/src/os/FileStore.h b/src/os/FileStore.h deleted file mode 100644 index ac7490b5ee48..000000000000 --- a/src/os/FileStore.h +++ /dev/null @@ -1,816 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef CEPH_FILESTORE_H -#define CEPH_FILESTORE_H - -#include "include/types.h" - -#include -#include -#include -#include -using namespace std; - -#include "include/unordered_map.h" - -#include "include/assert.h" - -#include "ObjectStore.h" -#include "JournalingObjectStore.h" - -#include "common/Timer.h" -#include "common/WorkQueue.h" - -#include "common/Mutex.h" -#include "HashIndex.h" -#include "IndexManager.h" -#include "ObjectMap.h" -#include "SequencerPosition.h" -#include "FDCache.h" -#include "WBThrottle.h" - -#include "include/uuid.h" - - -// from include/linux/falloc.h: -#ifndef FALLOC_FL_PUNCH_HOLE -# define FALLOC_FL_PUNCH_HOLE 0x2 -#endif - -#if defined(__linux__) -# ifndef BTRFS_SUPER_MAGIC -#define BTRFS_SUPER_MAGIC 0x9123683E -# endif -# ifndef XFS_SUPER_MAGIC -#define XFS_SUPER_MAGIC 0x58465342 -# endif -# ifndef ZFS_SUPER_MAGIC -#define ZFS_SUPER_MAGIC 0x2fc12fc1 -# endif -#endif - - -class FileStoreBackend; - -#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects") - -class FSSuperblock { -public: - CompatSet compat_features; - string omap_backend; - - FSSuperblock() { } - - void encode(bufferlist &bl) const; - void decode(bufferlist::iterator &bl); - void dump(Formatter *f) const; - static void generate_test_instances(list& o); -}; -WRITE_CLASS_ENCODER(FSSuperblock) - -inline ostream& operator<<(ostream& out, const FSSuperblock& sb) -{ - return out << "sb(" << sb.compat_features << "): " - << sb.omap_backend; -} - -class FileStore : public JournalingObjectStore, - public md_config_obs_t -{ - static const uint32_t target_version = 4; -public: - uint32_t get_target_version() { - return target_version; - } - - static int get_block_device_fsid(const string& path, uuid_d *fsid); - - struct FSPerfTracker { - PerfCounters::avg_tracker os_commit_latency; - PerfCounters::avg_tracker os_apply_latency; - - objectstore_perf_stat_t get_cur_stats() const { - objectstore_perf_stat_t ret; - ret.filestore_commit_latency = 
os_commit_latency.avg(); - ret.filestore_apply_latency = os_apply_latency.avg(); - return ret; - } - - void update_from_perfcounters(PerfCounters &logger); - } perf_tracker; - objectstore_perf_stat_t get_cur_stats() { - perf_tracker.update_from_perfcounters(*logger); - return perf_tracker.get_cur_stats(); - } - -private: - string internal_name; ///< internal name, used to name the perfcounter instance - string basedir, journalpath; - osflagbits_t generic_flags; - std::string current_fn; - std::string current_op_seq_fn; - std::string omap_dir; - uuid_d fsid; - - size_t blk_size; ///< fs block size - - int fsid_fd, op_fd, basedir_fd, current_fd; - - FileStoreBackend *backend; - - void create_backend(long f_type); - - deque snaps; - - // Indexed Collections - IndexManager index_manager; - int get_index(coll_t c, Index *index); - int init_index(coll_t c); - - void _kludge_temp_object_collection(coll_t& cid, const ghobject_t& oid) { - // - normal temp case: cid is pg, object is temp (pool < -1) - // - hammer temp case: cid is pg (or already temp), object pool is -1 - if (cid.is_pg() && (oid.hobj.pool < -1 || - oid.hobj.pool == -1)) - cid = cid.get_temp(); - } - void init_temp_collections(); - - // ObjectMap - boost::scoped_ptr object_map; - - // helper fns - int get_cdir(coll_t cid, char *s, int len); - - /// read a uuid from fd - int read_fsid(int fd, uuid_d *uuid); - - /// lock fsid_fd - int lock_fsid(); - - // sync thread - Mutex lock; - bool force_sync; - Cond sync_cond; - - Mutex sync_entry_timeo_lock; - SafeTimer timer; - - list sync_waiters; - bool stop; - void sync_entry(); - struct SyncThread : public Thread { - FileStore *fs; - SyncThread(FileStore *f) : fs(f) {} - void *entry() { - fs->sync_entry(); - return 0; - } - } sync_thread; - - // -- op workqueue -- - struct Op { - utime_t start; - uint64_t op; - list tls; - Context *onreadable, *onreadable_sync; - uint64_t ops, bytes; - TrackedOpRef osd_op; - }; - class OpSequencer : public Sequencer_impl { - Mutex 
qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock) - list q; - list jq; - list > flush_commit_waiters; - Cond cond; - public: - Sequencer *parent; - Mutex apply_lock; // for apply mutual exclusion - int id; - - /// get_max_uncompleted - bool _get_max_uncompleted( - uint64_t *seq ///< [out] max uncompleted seq - ) { - assert(qlock.is_locked()); - assert(seq); - *seq = 0; - if (q.empty() && jq.empty()) - return true; - - if (!q.empty()) - *seq = q.back()->op; - if (!jq.empty() && jq.back() > *seq) - *seq = jq.back(); - - return false; - } /// @returns true if both queues are empty - - /// get_min_uncompleted - bool _get_min_uncompleted( - uint64_t *seq ///< [out] min uncompleted seq - ) { - assert(qlock.is_locked()); - assert(seq); - *seq = 0; - if (q.empty() && jq.empty()) - return true; - - if (!q.empty()) - *seq = q.front()->op; - if (!jq.empty() && jq.front() < *seq) - *seq = jq.front(); - - return false; - } /// @returns true if both queues are empty - - void _wake_flush_waiters(list *to_queue) { - uint64_t seq; - if (_get_min_uncompleted(&seq)) - seq = -1; - - for (list >::iterator i = - flush_commit_waiters.begin(); - i != flush_commit_waiters.end() && i->first < seq; - flush_commit_waiters.erase(i++)) { - to_queue->push_back(i->second); - } - } - - void queue_journal(uint64_t s) { - Mutex::Locker l(qlock); - jq.push_back(s); - } - void dequeue_journal(list *to_queue) { - Mutex::Locker l(qlock); - jq.pop_front(); - cond.Signal(); - _wake_flush_waiters(to_queue); - } - void queue(Op *o) { - Mutex::Locker l(qlock); - q.push_back(o); - } - Op *peek_queue() { - Mutex::Locker l(qlock); - assert(apply_lock.is_locked()); - return q.front(); - } - - Op *dequeue(list *to_queue) { - assert(to_queue); - assert(apply_lock.is_locked()); - Mutex::Locker l(qlock); - Op *o = q.front(); - q.pop_front(); - cond.Signal(); - - _wake_flush_waiters(to_queue); - return o; - } - - void flush() { - Mutex::Locker l(qlock); - - while 
(g_conf->filestore_blackhole) - cond.Wait(qlock); // wait forever - - - // get max for journal _or_ op queues - uint64_t seq = 0; - if (!q.empty()) - seq = q.back()->op; - if (!jq.empty() && jq.back() > seq) - seq = jq.back(); - - if (seq) { - // everything prior to our watermark to drain through either/both queues - while ((!q.empty() && q.front()->op <= seq) || - (!jq.empty() && jq.front() <= seq)) - cond.Wait(qlock); - } - } - bool flush_commit(Context *c) { - Mutex::Locker l(qlock); - uint64_t seq = 0; - if (_get_max_uncompleted(&seq)) { - return true; - } else { - flush_commit_waiters.push_back(make_pair(seq, c)); - return false; - } - } - - OpSequencer(int i) - : qlock("FileStore::OpSequencer::qlock", false, false), - parent(0), - apply_lock("FileStore::OpSequencer::apply_lock", false, false), - id(i) {} - ~OpSequencer() { - assert(q.empty()); - } - - const string& get_name() const { - return parent->get_name(); - } - }; - - friend ostream& operator<<(ostream& out, const OpSequencer& s); - - FDCache fdcache; - WBThrottle wbthrottle; - - atomic_t next_osr_id; - deque op_queue; - Throttle throttle_ops, throttle_bytes; - const int m_ondisk_finisher_num; - const int m_apply_finisher_num; - vector ondisk_finishers; - vector apply_finishers; - - ThreadPool op_tp; - struct OpWQ : public ThreadPool::WorkQueue { - FileStore *store; - OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp) - : ThreadPool::WorkQueue("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {} - - bool _enqueue(OpSequencer *osr) { - store->op_queue.push_back(osr); - return true; - } - void _dequeue(OpSequencer *o) { - assert(0); - } - bool _empty() { - return store->op_queue.empty(); - } - OpSequencer *_dequeue() { - if (store->op_queue.empty()) - return NULL; - OpSequencer *osr = store->op_queue.front(); - store->op_queue.pop_front(); - return osr; - } - void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) { - store->_do_op(osr, handle); - } - using 
ThreadPool::WorkQueue::_process; - void _process_finish(OpSequencer *osr) { - store->_finish_op(osr); - } - void _clear() { - assert(store->op_queue.empty()); - } - } op_wq; - - void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle); - void _finish_op(OpSequencer *o); - Op *build_op(list& tls, - Context *onreadable, Context *onreadable_sync, - TrackedOpRef osd_op); - void queue_op(OpSequencer *osr, Op *o); - void op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle = NULL); - void op_queue_release_throttle(Op *o); - void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk); - friend struct C_JournaledAhead; - - void new_journal(); - - PerfCounters *logger; - -public: - int lfn_find(const ghobject_t& oid, const Index& index, - IndexedPath *path = NULL); - int lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length); - int lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf); - int lfn_open( - coll_t cid, - const ghobject_t& oid, - bool create, - FDRef *outfd, - Index *index = 0); - - void lfn_close(FDRef fd); - int lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid) ; - int lfn_unlink(coll_t cid, const ghobject_t& o, const SequencerPosition &spos, - bool force_clear_omap=false); - -public: - FileStore(const std::string &base, const std::string &jdev, - osflagbits_t flags = 0, - const char *internal_name = "filestore", bool update_to=false); - ~FileStore(); - - int _detect_fs(); - int _sanity_check_fs(); - - bool test_mount_in_use(); - int read_op_seq(uint64_t *seq); - int write_op_seq(int, uint64_t seq); - int mount(); - int umount(); - unsigned get_max_object_name_length() { - // not safe for all file systems, btw! use the tunable to limit this. 
- return 4096; - } - unsigned get_max_attr_name_length() { - // xattr limit is 128; leave room for our prefixes (user.ceph._), - // some margin, and cap at 100 - return 100; - } - int mkfs(); - int mkjournal(); - bool wants_journal() { - return true; - } - bool allows_journal() { - return true; - } - bool needs_journal() { - return false; - } - - int write_version_stamp(); - int version_stamp_is_valid(uint32_t *version); - int update_version_stamp(); - int upgrade(); - - bool can_sort_nibblewise() { - return true; // i support legacy sort order - } - - void collect_metadata(map *pm); - - int statfs(struct statfs *buf); - - int _do_transactions( - list &tls, uint64_t op_seq, - ThreadPool::TPHandle *handle); - int do_transactions(list &tls, uint64_t op_seq) { - return _do_transactions(tls, op_seq, 0); - } - unsigned _do_transaction( - Transaction& t, uint64_t op_seq, int trans_num, - ThreadPool::TPHandle *handle); - - int queue_transactions(Sequencer *osr, list& tls, - TrackedOpRef op = TrackedOpRef(), - ThreadPool::TPHandle *handle = NULL); - - /** - * set replay guard xattr on given file - * - * This will ensure that we will not replay this (or any previous) operation - * against this particular inode/object. 
- * - * @param fd open file descriptor for the file/object - * @param spos sequencer position of the last operation we should not replay - */ - void _set_replay_guard(int fd, - const SequencerPosition& spos, - const ghobject_t *oid=0, - bool in_progress=false); - void _set_replay_guard(coll_t cid, - const SequencerPosition& spos, - bool in_progress); - void _set_global_replay_guard(coll_t cid, - const SequencerPosition &spos); - - /// close a replay guard opened with in_progress=true - void _close_replay_guard(int fd, const SequencerPosition& spos); - void _close_replay_guard(coll_t cid, const SequencerPosition& spos); - - /** - * check replay guard xattr on given file - * - * Check the current position against any marker on the file that - * indicates which operations have already been applied. If the - * current or a newer operation has been marked as applied, we - * should not replay the current operation again. - * - * If we are not replaying the journal, we already return true. It - * is only on replay that we might return false, indicated that the - * operation should not be performed (again). 
- * - * @param fd open fd on the file/object in question - * @param spos sequencerposition for an operation we could apply/replay - * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress - */ - int _check_replay_guard(int fd, const SequencerPosition& spos); - int _check_replay_guard(coll_t cid, const SequencerPosition& spos); - int _check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& pos); - int _check_global_replay_guard(coll_t cid, const SequencerPosition& spos); - - // ------------------ - // objects - int pick_object_revision_lt(ghobject_t& oid) { - return 0; - } - bool exists(coll_t cid, const ghobject_t& oid); - int stat( - coll_t cid, - const ghobject_t& oid, - struct stat *st, - bool allow_eio = false); - int read( - coll_t cid, - const ghobject_t& oid, - uint64_t offset, - size_t len, - bufferlist& bl, - uint32_t op_flags = 0, - bool allow_eio = false); - int _do_fiemap(int fd, uint64_t offset, size_t len, - map *m); - int _do_seek_hole_data(int fd, uint64_t offset, size_t len, - map *m); - int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl); - - int _touch(coll_t cid, const ghobject_t& oid); - int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, - const bufferlist& bl, uint32_t fadvise_flags = 0); - int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len); - int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size); - int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, - const SequencerPosition& spos); - int _clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, - uint64_t srcoff, uint64_t len, uint64_t dstoff, - const SequencerPosition& spos); - int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); - int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); - int 
_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false); - int _remove(coll_t cid, const ghobject_t& oid, const SequencerPosition &spos); - - int _fgetattr(int fd, const char *name, bufferptr& bp); - int _fgetattrs(int fd, map& aset); - int _fsetattrs(int fd, map &aset); - - void _start_sync(); - - void do_force_sync(); - void start_sync(Context *onsafe); - void sync(); - void _flush_op_queue(); - void flush(); - void sync_and_flush(); - - int flush_journal(); - int dump_journal(ostream& out); - - void set_fsid(uuid_d u) { - fsid = u; - } - uuid_d get_fsid() { return fsid; } - - // DEBUG read error injection, an object is removed from both on delete() - Mutex read_error_lock; - set data_error_set; // read() will return -EIO - set mdata_error_set; // getattr(),stat() will return -EIO - void inject_data_error(const ghobject_t &oid); - void inject_mdata_error(const ghobject_t &oid); - void debug_obj_on_delete(const ghobject_t &oid); - bool debug_data_eio(const ghobject_t &oid); - bool debug_mdata_eio(const ghobject_t &oid); - - int snapshot(const string& name); - - // attrs - int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp); - int getattrs(coll_t cid, const ghobject_t& oid, map& aset); - - int _setattrs(coll_t cid, const ghobject_t& oid, map& aset, - const SequencerPosition &spos); - int _rmattr(coll_t cid, const ghobject_t& oid, const char *name, - const SequencerPosition &spos); - int _rmattrs(coll_t cid, const ghobject_t& oid, - const SequencerPosition &spos); - - int collection_getattr(coll_t c, const char *name, void *value, size_t size); - int collection_getattr(coll_t c, const char *name, bufferlist& bl); - int collection_getattrs(coll_t cid, map &aset); - - int _collection_setattr(coll_t c, const char *name, const void *value, size_t size); - int _collection_rmattr(coll_t c, const char *name); - int _collection_setattrs(coll_t cid, map &aset); - int 
_collection_remove_recursive(const coll_t &cid, - const SequencerPosition &spos); - - // collections - int collection_list(coll_t c, ghobject_t start, ghobject_t end, - bool sort_bitwise, int max, - vector *ls, ghobject_t *next); - int list_collections(vector& ls); - int list_collections(vector& ls, bool include_temp); - int collection_version_current(coll_t c, uint32_t *version); - int collection_stat(coll_t c, struct stat *st); - bool collection_exists(coll_t c); - bool collection_empty(coll_t c); - - // omap (see ObjectStore.h for documentation) - int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header, - map *out); - int omap_get_header( - coll_t c, - const ghobject_t &oid, - bufferlist *out, - bool allow_eio = false); - int omap_get_keys(coll_t c, const ghobject_t &oid, set *keys); - int omap_get_values(coll_t c, const ghobject_t &oid, const set &keys, - map *out); - int omap_check_keys(coll_t c, const ghobject_t &oid, const set &keys, - set *out); - ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const ghobject_t &oid); - - int _create_collection(coll_t c, const SequencerPosition &spos); - int _destroy_collection(coll_t c); - /** - * Give an expected number of objects hint to the collection. - * - * @param c - collection id. 
- * @param pg_num - pg number of the pool this collection belongs to - * @param expected_num_objs - expected number of objects in this collection - * @param spos - sequence position - * - * @return 0 on success, an error code otherwise - */ - int _collection_hint_expected_num_objs(coll_t c, uint32_t pg_num, - uint64_t expected_num_objs, - const SequencerPosition &spos); - int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid, - const SequencerPosition& spos); - int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid, - coll_t c, const ghobject_t& o, - const SequencerPosition& spos); - - int _set_alloc_hint(coll_t cid, const ghobject_t& oid, - uint64_t expected_object_size, - uint64_t expected_write_size); - - void dump_start(const std::string& file); - void dump_stop(); - void dump_transactions(list& ls, uint64_t seq, OpSequencer *osr); - -private: - void _inject_failure(); - - // omap - int _omap_clear(coll_t cid, const ghobject_t &oid, - const SequencerPosition &spos); - int _omap_setkeys(coll_t cid, const ghobject_t &oid, - const map &aset, - const SequencerPosition &spos); - int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set &keys, - const SequencerPosition &spos); - int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid, - const string& first, const string& last, - const SequencerPosition &spos); - int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl, - const SequencerPosition &spos); - int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest, - const SequencerPosition &spos); - int _split_collection_create(coll_t cid, uint32_t bits, uint32_t rem, - coll_t dest, - const SequencerPosition &spos); - - virtual const char** get_tracked_conf_keys() const; - virtual void handle_conf_change(const struct md_config_t *conf, - const std::set &changed); - float m_filestore_commit_timeout; - bool m_filestore_journal_parallel; - bool m_filestore_journal_trailing; - bool 
m_filestore_journal_writeahead; - int m_filestore_fiemap_threshold; - double m_filestore_max_sync_interval; - double m_filestore_min_sync_interval; - bool m_filestore_fail_eio; - bool m_filestore_fadvise; - int do_update; - bool m_journal_dio, m_journal_aio, m_journal_force_aio; - std::string m_osd_rollback_to_cluster_snap; - bool m_osd_use_stale_snap; - int m_filestore_queue_max_ops; - int m_filestore_queue_max_bytes; - int m_filestore_queue_committing_max_ops; - int m_filestore_queue_committing_max_bytes; - bool m_filestore_do_dump; - std::ofstream m_filestore_dump; - JSONFormatter m_filestore_dump_fmt; - atomic_t m_filestore_kill_at; - bool m_filestore_sloppy_crc; - int m_filestore_sloppy_crc_block_size; - uint64_t m_filestore_max_alloc_hint_size; - long m_fs_type; - - //Determined xattr handling based on fs type - void set_xattr_limits_via_conf(); - uint32_t m_filestore_max_inline_xattr_size; - uint32_t m_filestore_max_inline_xattrs; - - FSSuperblock superblock; - - /** - * write_superblock() - * - * Write superblock to persisent storage - * - * return value: 0 on success, otherwise negative errno - */ - int write_superblock(); - - /** - * read_superblock() - * - * Fill in FileStore::superblock by reading persistent storage - * - * return value: 0 on success, otherwise negative errno - */ - int read_superblock(); - - friend class FileStoreBackend; - friend class TestFileStore; -}; - -ostream& operator<<(ostream& out, const FileStore::OpSequencer& s); - -struct fiemap; - -class FileStoreBackend { -private: - FileStore *filestore; -protected: - int get_basedir_fd() { - return filestore->basedir_fd; - } - int get_current_fd() { - return filestore->current_fd; - } - int get_op_fd() { - return filestore->op_fd; - } - size_t get_blksize() { - return filestore->blk_size; - } - const string& get_basedir_path() { - return filestore->basedir; - } - const string& get_current_path() { - return filestore->current_fn; - } - int _copy_range(int from, int to, uint64_t srcoff, 
uint64_t len, uint64_t dstoff) { - if (has_fiemap() || has_seek_data_hole()) { - return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff); - } else { - return filestore->_do_copy_range(from, to, srcoff, len, dstoff); - } - } - int get_crc_block_size() { - return filestore->m_filestore_sloppy_crc_block_size; - } - -public: - FileStoreBackend(FileStore *fs) : filestore(fs) {} - virtual ~FileStoreBackend() {} - - static FileStoreBackend *create(long f_type, FileStore *fs); - - virtual const char *get_name() = 0; - virtual int detect_features() = 0; - virtual int create_current() = 0; - virtual bool can_checkpoint() = 0; - virtual int list_checkpoints(list& ls) = 0; - virtual int create_checkpoint(const string& name, uint64_t *cid) = 0; - virtual int sync_checkpoint(uint64_t id) = 0; - virtual int rollback_to(const string& name) = 0; - virtual int destroy_checkpoint(const string& name) = 0; - virtual int syncfs() = 0; - virtual bool has_fiemap() = 0; - virtual bool has_seek_data_hole() = 0; - virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0; - virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0; - virtual int set_alloc_hint(int fd, uint64_t hint) = 0; - virtual bool has_splice() const = 0; - - // hooks for (sloppy) crc tracking - virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0; - virtual int _crc_update_truncate(int fd, loff_t off) = 0; - virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0; - virtual int _crc_update_clone_range(int srcfd, int destfd, - loff_t srcoff, size_t len, loff_t dstoff) = 0; - virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, - ostream *out) = 0; -}; - -#endif diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc deleted file mode 100644 index 4bba4130ccfe..000000000000 --- a/src/os/GenericFileStoreBackend.cc +++ /dev/null @@ -1,431 +0,0 @@ 
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "include/int_types.h" -#include "include/types.h" - -#include -#include -#include -#include -#include -#include -#include - -#if defined(__linux__) -#include -#endif - -#include "include/compat.h" -#include "include/linux_fiemap.h" - -#include -#include -#include - -#include "GenericFileStoreBackend.h" - -#include "common/errno.h" -#include "common/config.h" -#include "common/sync_filesystem.h" - -#include "common/SloppyCRCMap.h" -#include "os/chain_xattr.h" - -#define SLOPPY_CRC_XATTR "user.cephos.scrc" - - -#define dout_subsys ceph_subsys_filestore -#undef dout_prefix -#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") " - -#define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) -#define ALIGNED(x, by) (!((x) % (by))) -#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? 
(x) : (ALIGN_DOWN((x), (by)) + (by))) - -GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs): - FileStoreBackend(fs), - ioctl_fiemap(false), - seek_data_hole(false), - m_filestore_fiemap(g_conf->filestore_fiemap), - m_filestore_seek_data_hole(g_conf->filestore_seek_data_hole), - m_filestore_fsync_flushes_journal_data(g_conf->filestore_fsync_flushes_journal_data), - m_filestore_splice(false) {} - -int GenericFileStoreBackend::detect_features() -{ - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str()); - - int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644); - if (fd < 0) { - fd = -errno; - derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl; - return fd; - } - - // ext4 has a bug in older kernels where fiemap will return an empty - // result in some cases. this is a file layout that triggers the bug - // on 2.6.34-rc5. - int v[] = { - 0x0000000000016000, 0x0000000000007000, - 0x000000000004a000, 0x0000000000007000, - 0x0000000000060000, 0x0000000000001000, - 0x0000000000061000, 0x0000000000008000, - 0x0000000000069000, 0x0000000000007000, - 0x00000000000a3000, 0x000000000000c000, - 0x000000000024e000, 0x000000000000c000, - 0x000000000028b000, 0x0000000000009000, - 0x00000000002b1000, 0x0000000000003000, - 0, 0 - }; - for (int i=0; v[i]; i++) { - int off = v[i++]; - int len = v[i]; - - // write a large extent - char buf[len]; - memset(buf, 1, sizeof(buf)); - int r = ::lseek(fd, off, SEEK_SET); - if (r < 0) { - r = -errno; - derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl; - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return r; - } - r = write(fd, buf, sizeof(buf)); - if (r < 0) { - derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl; - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return r; - } - } - - // fiemap an extent inside that - if (!m_filestore_fiemap) { - dout(0) << "detect_features: FIEMAP ioctl is disabled 
via 'filestore fiemap' config option" << dendl; - ioctl_fiemap = false; - } else { - struct fiemap *fiemap; - int r = do_fiemap(fd, 2430421, 59284, &fiemap); - if (r < 0) { - dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl; - ioctl_fiemap = false; - } else { - if (fiemap->fm_mapped_extents == 0) { - dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl; - ioctl_fiemap = false; - } else { - dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl; - ioctl_fiemap = true; - } - free(fiemap); - } - } - - // SEEK_DATA/SEEK_HOLE detection - if (!m_filestore_seek_data_hole) { - dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl; - seek_data_hole = false; - } else { -#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) - // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running - // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned. - // Fall back to use fiemap. 
- off_t hole_pos; - - hole_pos = lseek(fd, 0, SEEK_HOLE); - if (hole_pos < 0) { - if (errno == EINVAL) { - dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl; - seek_data_hole = false; - } else { - derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl; - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return -errno; - } - } else { - dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl; - seek_data_hole = true; - } -#endif - } - - //splice detection -#ifdef CEPH_HAVE_SPLICE - if (!m_filestore_splice) { - int pipefd[2]; - loff_t off_in = 0; - int r; - if ((r = pipe(pipefd)) < 0) - dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno) << dendl; - else { - lseek(fd, 0, SEEK_SET); - r = splice(fd, &off_in, pipefd[1], NULL, 10, 0); - if (!(r < 0 && errno == EINVAL)) { - m_filestore_splice = true; - dout(0) << "detect_features: splice is supported" << dendl; - } else - dout(0) << "detect_features: splice is NOT supported" << dendl; - close(pipefd[0]); - close(pipefd[1]); - } - } -#endif - ::unlink(fn); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - - - bool have_syncfs = false; -#ifdef HAVE_SYS_SYNCFS - if (::syncfs(get_basedir_fd()) == 0) { - dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl; - have_syncfs = true; - } else { - dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl; - } -#elif defined(SYS_syncfs) - if (syscall(SYS_syncfs, get_basedir_fd()) == 0) { - dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl; - have_syncfs = true; - } else { - dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; - } -#elif defined(__NR_syncfs) - if (syscall(__NR_syncfs, get_basedir_fd()) == 0) { - dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl; - have_syncfs = true; - } else { - dout(0) << 
"detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; - } -#endif - if (!have_syncfs) { - dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl; - if (m_filestore_fsync_flushes_journal_data) { - dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl; - } else { - dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl; - dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl; - } - } - - return 0; -} - -int GenericFileStoreBackend::create_current() -{ - struct stat st; - int ret = ::stat(get_current_path().c_str(), &st); - if (ret == 0) { - // current/ exists - if (!S_ISDIR(st.st_mode)) { - dout(0) << "_create_current: current/ exists but is not a directory" << dendl; - ret = -EINVAL; - } - } else { - ret = ::mkdir(get_current_path().c_str(), 0755); - if (ret < 0) { - ret = -errno; - dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl; - } - } - return ret; -} - -int GenericFileStoreBackend::syncfs() -{ - int ret; - if (m_filestore_fsync_flushes_journal_data) { - dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl; - // make the file system's journal commit. - // this works with ext3, but NOT ext4 - ret = ::fsync(get_op_fd()); - if (ret < 0) - ret = -errno; - } else { - dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl; - ret = sync_filesystem(get_current_fd()); - } - return ret; -} - -int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) -{ - struct fiemap *fiemap = NULL; - struct fiemap *_realloc_fiemap = NULL; - int size; - int ret; - - fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1); - if (!fiemap) - return -ENOMEM; - /* - * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096), - * the result is (logical=4096, len=4096). 
It leak the [3990, 4096). - * Commit:"xfs: fix rounding error of fiemap length parameter - * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug. - * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug. - */ - fiemap->fm_start = start - start % CEPH_PAGE_SIZE; - fiemap->fm_length = len + start % CEPH_PAGE_SIZE; - fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */ - -#if defined(DARWIN) || defined(__FreeBSD__) - ret = -ENOTSUP; - goto done_err; -#else - if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { - ret = -errno; - goto done_err; - } -#endif - size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); - - _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size); - if (!_realloc_fiemap) { - ret = -ENOMEM; - goto done_err; - } else { - fiemap = _realloc_fiemap; - } - - memset(fiemap->fm_extents, 0, size); - - fiemap->fm_extent_count = fiemap->fm_mapped_extents; - fiemap->fm_mapped_extents = 0; - -#if defined(DARWIN) || defined(__FreeBSD__) - ret = -ENOTSUP; - goto done_err; -#else - if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { - ret = -errno; - goto done_err; - } - *pfiemap = fiemap; -#endif - return 0; - -done_err: - *pfiemap = NULL; - free(fiemap); - return ret; -} - - -int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm) -{ - char buf[100]; - bufferptr bp; - int r = 0; - int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf)); - if (l == -ENODATA) { - return 0; - } - if (l >= 0) { - bp = buffer::create(l); - memcpy(bp.c_str(), buf, l); - } else if (l == -ERANGE) { - l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0); - if (l > 0) { - bp = buffer::create(l); - l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l); - } - } - bufferlist bl; - bl.append(bp); - bufferlist::iterator p = bl.begin(); - try { - ::decode(*cm, p); - } - catch (buffer::error &e) { - r = -EIO; - } - if (r < 0) - derr << __func__ << " got " << cpp_strerror(r) << dendl; - return r; -} - -int 
GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm) -{ - bufferlist bl; - ::encode(*cm, bl); - int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length()); - if (r < 0) - derr << __func__ << " got " << cpp_strerror(r) << dendl; - return r; -} - -int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) -{ - SloppyCRCMap scm(get_crc_block_size()); - int r = _crc_load_or_init(fd, &scm); - if (r < 0) - return r; - ostringstream ss; - scm.write(off, len, bl, &ss); - dout(30) << __func__ << "\n" << ss.str() << dendl; - r = _crc_save(fd, &scm); - return r; -} - -int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off) -{ - SloppyCRCMap scm(get_crc_block_size()); - int r = _crc_load_or_init(fd, &scm); - if (r < 0) - return r; - scm.truncate(off); - r = _crc_save(fd, &scm); - return r; -} - -int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len) -{ - SloppyCRCMap scm(get_crc_block_size()); - int r = _crc_load_or_init(fd, &scm); - if (r < 0) - return r; - scm.zero(off, len); - r = _crc_save(fd, &scm); - return r; -} - -int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd, - loff_t srcoff, size_t len, loff_t dstoff) -{ - SloppyCRCMap scm_src(get_crc_block_size()); - SloppyCRCMap scm_dst(get_crc_block_size()); - int r = _crc_load_or_init(srcfd, &scm_src); - if (r < 0) - return r; - r = _crc_load_or_init(destfd, &scm_dst); - if (r < 0) - return r; - ostringstream ss; - scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss); - dout(30) << __func__ << "\n" << ss.str() << dendl; - r = _crc_save(destfd, &scm_dst); - return r; -} - -int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, - ostream *out) -{ - SloppyCRCMap scm(get_crc_block_size()); - int r = _crc_load_or_init(fd, &scm); - if (r < 0) - return r; - return scm.read(off, len, bl, out); -} diff --git a/src/os/GenericFileStoreBackend.h 
b/src/os/GenericFileStoreBackend.h deleted file mode 100644 index f31e2029a652..000000000000 --- a/src/os/GenericFileStoreBackend.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef CEPH_GENERICFILESTOREBACKEDN_H -#define CEPH_GENERICFILESTOREBACKEDN_H - -#include "FileStore.h" - -class SloppyCRCMap; - -class GenericFileStoreBackend : public FileStoreBackend { -private: - bool ioctl_fiemap; - bool seek_data_hole; - bool m_filestore_fiemap; - bool m_filestore_seek_data_hole; - bool m_filestore_fsync_flushes_journal_data; - bool m_filestore_splice; -public: - GenericFileStoreBackend(FileStore *fs); - virtual ~GenericFileStoreBackend() {} - - virtual const char *get_name() { - return "generic"; - } - virtual int detect_features(); - virtual int create_current(); - virtual bool can_checkpoint() { return false; } - virtual int list_checkpoints(list& ls) { return 0; } - virtual int create_checkpoint(const string& name, uint64_t *cid) { return -EOPNOTSUPP; } - virtual int sync_checkpoint(uint64_t id) { return -EOPNOTSUPP; } - virtual int rollback_to(const string& name) { return -EOPNOTSUPP; } - virtual int destroy_checkpoint(const string& name) { return -EOPNOTSUPP; } - virtual int syncfs(); - virtual bool has_fiemap() { return ioctl_fiemap; } - virtual bool has_seek_data_hole() { return seek_data_hole; } - virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap); - virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) { - return _copy_range(from, to, srcoff, len, dstoff); - } - virtual int set_alloc_hint(int fd, 
uint64_t hint) { return -EOPNOTSUPP; } - virtual bool has_splice() const { return m_filestore_splice; } -private: - int _crc_load_or_init(int fd, SloppyCRCMap *cm); - int _crc_save(int fd, SloppyCRCMap *cm); -public: - virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl); - virtual int _crc_update_truncate(int fd, loff_t off); - virtual int _crc_update_zero(int fd, loff_t off, size_t len); - virtual int _crc_update_clone_range(int srcfd, int destfd, - loff_t srcoff, size_t len, loff_t dstoff); - virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, - ostream *out); -}; -#endif diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc deleted file mode 100644 index 27edf0de6d29..000000000000 --- a/src/os/HashIndex.cc +++ /dev/null @@ -1,1085 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "include/types.h" -#include "include/buffer.h" -#include "osd/osd_types.h" -#include - -#include "HashIndex.h" - -#include "common/debug.h" -#define dout_subsys ceph_subsys_filestore - -const string HashIndex::SUBDIR_ATTR = "contents"; -const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op"; - -/// hex digit to integer value -int hex_to_int(char c) -{ - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'A' && c <= 'F') - return c - 'A' + 10; - assert(0); -} - -/// int value to hex digit -char int_to_hex(int v) -{ - assert(v < 16); - if (v < 10) - return '0' + v; - return 'A' + v - 10; -} - -/// reverse bits in a nibble (0..15) -int reverse_nibble_bits(int in) -{ - assert(in < 16); - return - ((in & 8) >> 3) | - ((in & 4) >> 1) | - ((in & 2) << 1) | - ((in & 1) << 3); -} - -/// reverse nibble bits in a hex digit -char reverse_hexdigit_bits(char c) -{ - return int_to_hex(reverse_nibble_bits(hex_to_int(c))); -} - -/// reverse nibble bits in a hex string -string reverse_hexdigit_bits_string(string s) -{ - for (unsigned i=0; i(), IN_PROGRESS_OP_TAG, bl); - if (r < 0) { - // No in progress operations! 
- return 0; - } - bufferlist::iterator i = bl.begin(); - InProgressOp in_progress(i); - subdir_info_s info; - r = get_info(in_progress.path, &info); - if (r == -ENOENT) { - return end_split_or_merge(in_progress.path); - } else if (r < 0) { - return r; - } - - if (in_progress.is_split()) - return complete_split(in_progress.path, info); - else if (in_progress.is_merge()) - return complete_merge(in_progress.path, info); - else if (in_progress.is_col_split()) { - for (vector::iterator i = in_progress.path.begin(); - i != in_progress.path.end(); - ++i) { - vector path(in_progress.path.begin(), i); - int r = reset_attr(path); - if (r < 0) - return r; - } - return 0; - } - else - return -EINVAL; -} - -int HashIndex::reset_attr( - const vector &path) -{ - int exists = 0; - int r = path_exists(path, &exists); - if (r < 0) - return r; - if (!exists) - return 0; - map objects; - vector subdirs; - r = list_objects(path, 0, 0, &objects); - if (r < 0) - return r; - r = list_subdirs(path, &subdirs); - if (r < 0) - return r; - - subdir_info_s info; - info.hash_level = path.size(); - info.objs = objects.size(); - info.subdirs = subdirs.size(); - return set_info(path, info); -} - -int HashIndex::col_split_level( - HashIndex &from, - HashIndex &to, - const vector &path, - uint32_t inbits, - uint32_t match, - unsigned *mkdirred) -{ - /* For each subdir, move, recurse, or ignore based on comparing the low order - * bits of the hash represented by the subdir path with inbits, match passed - * in. 
- */ - vector subdirs; - int r = from.list_subdirs(path, &subdirs); - if (r < 0) - return r; - map objects; - r = from.list_objects(path, 0, 0, &objects); - if (r < 0) - return r; - - set to_move; - for (vector::iterator i = subdirs.begin(); - i != subdirs.end(); - ++i) { - uint32_t bits = 0; - uint32_t hash = 0; - vector sub_path(path.begin(), path.end()); - sub_path.push_back(*i); - path_to_hobject_hash_prefix(sub_path, &bits, &hash); - if (bits < inbits) { - if (hobject_t::match_hash(hash, bits, match)) { - r = col_split_level( - from, - to, - sub_path, - inbits, - match, - mkdirred); - if (r < 0) - return r; - if (*mkdirred > path.size()) - *mkdirred = path.size(); - } // else, skip, doesn't need to be moved or recursed into - } else { - if (hobject_t::match_hash(hash, inbits, match)) { - to_move.insert(*i); - } - } // else, skip, doesn't need to be moved or recursed into - } - - /* Then, do the same for each object */ - map objs_to_move; - for (map::iterator i = objects.begin(); - i != objects.end(); - ++i) { - if (i->second.match(inbits, match)) { - objs_to_move.insert(*i); - } - } - - if (objs_to_move.empty() && to_move.empty()) - return 0; - - // Make parent directories as needed - while (*mkdirred < path.size()) { - ++*mkdirred; - int exists = 0; - vector creating_path(path.begin(), path.begin()+*mkdirred); - r = to.path_exists(creating_path, &exists); - if (r < 0) - return r; - if (exists) - continue; - subdir_info_s info; - info.objs = 0; - info.subdirs = 0; - info.hash_level = creating_path.size(); - if (*mkdirred < path.size() - 1) - info.subdirs = 1; - r = to.start_col_split(creating_path); - if (r < 0) - return r; - r = to.create_path(creating_path); - if (r < 0) - return r; - r = to.set_info(creating_path, info); - if (r < 0) - return r; - r = to.end_split_or_merge(creating_path); - if (r < 0) - return r; - } - - subdir_info_s from_info; - subdir_info_s to_info; - r = from.get_info(path, &from_info); - if (r < 0) - return r; - r = to.get_info(path, 
&to_info); - if (r < 0) - return r; - - from.start_col_split(path); - to.start_col_split(path); - - // Do subdir moves - for (set::iterator i = to_move.begin(); - i != to_move.end(); - ++i) { - from_info.subdirs--; - to_info.subdirs++; - r = move_subdir(from, to, path, *i); - if (r < 0) - return r; - } - - for (map::iterator i = objs_to_move.begin(); - i != objs_to_move.end(); - ++i) { - from_info.objs--; - to_info.objs++; - r = move_object(from, to, path, *i); - if (r < 0) - return r; - } - - - r = to.set_info(path, to_info); - if (r < 0) - return r; - r = from.set_info(path, from_info); - if (r < 0) - return r; - from.end_split_or_merge(path); - to.end_split_or_merge(path); - return 0; -} - -int HashIndex::_split( - uint32_t match, - uint32_t bits, - CollectionIndex* dest) { - assert(collection_version() == dest->collection_version()); - unsigned mkdirred = 0; - return col_split_level( - *this, - *static_cast(dest), - vector(), - bits, - match, - &mkdirred); -} - -int HashIndex::_init() { - subdir_info_s info; - vector path; - return set_info(path, info); -} - -/* LFNIndex virtual method implementations */ -int HashIndex::_created(const vector &path, - const ghobject_t &oid, - const string &mangled_name) { - subdir_info_s info; - int r; - r = get_info(path, &info); - if (r < 0) - return r; - info.objs++; - r = set_info(path, info); - if (r < 0) - return r; - - if (must_split(info)) { - int r = initiate_split(path, info); - if (r < 0) - return r; - return complete_split(path, info); - } else { - return 0; - } -} - -int HashIndex::_remove(const vector &path, - const ghobject_t &oid, - const string &mangled_name) { - int r; - r = remove_object(path, oid); - if (r < 0) - return r; - subdir_info_s info; - r = get_info(path, &info); - if (r < 0) - return r; - info.objs--; - r = set_info(path, info); - if (r < 0) - return r; - if (must_merge(info)) { - r = initiate_merge(path, info); - if (r < 0) - return r; - return complete_merge(path, info); - } else { - return 0; - 
} -} - -int HashIndex::_lookup(const ghobject_t &oid, - vector *path, - string *mangled_name, - int *hardlink) { - vector path_comp; - get_path_components(oid, &path_comp); - vector::iterator next = path_comp.begin(); - int exists; - while (1) { - int r = path_exists(*path, &exists); - if (r < 0) - return r; - if (!exists) { - if (path->empty()) - return -ENOENT; - path->pop_back(); - break; - } - if (next == path_comp.end()) - break; - path->push_back(*(next++)); - } - return get_mangled_name(*path, oid, mangled_name, hardlink); -} - -int HashIndex::_collection_list_partial(const ghobject_t &start, - const ghobject_t &end, - bool sort_bitwise, - int max_count, - vector *ls, - ghobject_t *next) { - vector path; - ghobject_t _next; - if (!next) - next = &_next; - *next = start; - dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl; - return list_by_hash(path, end, sort_bitwise, max_count, next, ls); -} - -int HashIndex::prep_delete() { - return recursive_remove(vector()); -} - -int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) { - int ret; - vector path; - subdir_info_s root_info; - // Make sure there is neither objects nor sub-folders - // in this collection - ret = get_info(path, &root_info); - if (ret < 0) - return ret; - - // Do the folder splitting first - ret = pre_split_folder(pg_num, expected_num_objs); - if (ret < 0) - return ret; - // Initialize the folder info starting from root - return init_split_folder(path, 0); -} - -int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs) -{ - // If folder merging is enabled (by setting the threshold positive), - // no need to split - if (merge_threshold > 0) - return 0; - const coll_t c = coll(); - // Do not split if the expected number of objects in this collection is zero (by default) - if (expected_num_objs == 0) - return 0; - - // Calculate the number of leaf folders (which actually store files) 
- // need to be created - const uint64_t objs_per_folder = (uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier * 16; - uint64_t leavies = expected_num_objs / objs_per_folder ; - // No need to split - if (leavies == 0 || expected_num_objs == objs_per_folder) - return 0; - - spg_t spgid; - if (!c.is_pg_prefix(&spgid)) - return -EINVAL; - const ps_t ps = spgid.pgid.ps(); - - // the most significant bits of pg_num - const int pg_num_bits = calc_num_bits(pg_num - 1); - ps_t tmp_id = ps; - // calculate the number of levels we only create one sub folder - int num = pg_num_bits / 4; - // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111, - // so that splitting starts at level 3 - if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) { - --num; - } - - int ret; - // Start with creation that only has one subfolder - vector paths; - int dump_num = num; - while (num-- > 0) { - ps_t v = tmp_id & 0x0000000f; - paths.push_back(to_hex(v)); - ret = create_path(paths); - if (ret < 0 && ret != -EEXIST) - return ret; - tmp_id = tmp_id >> 4; - } - - // Starting from here, we can split by creating multiple subfolders - const int left_bits = pg_num_bits - dump_num * 4; - // this variable denotes how many bits (for this level) that can be - // used for sub folder splitting - int split_bits = 4 - left_bits; - // the below logic is inspired by rados.h#ceph_stable_mod, - // it basically determines how many sub-folders should we - // create for splitting - assert(pg_num_bits > 0); // otherwise BAD_SHIFT - if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) { - ++split_bits; - } - const uint32_t subs = (1 << split_bits); - // Calculate how many levels we create starting from here - int level = 0; - leavies /= subs; - while (leavies > 1) { - ++level; - leavies = leavies >> 4; - } - for (uint32_t i = 0; i < subs; ++i) { - assert(split_bits <= 4); // otherwise BAD_SHIFT - int v = tmp_id | (i << ((4 - split_bits) % 4)); - paths.push_back(to_hex(v)); - ret = 
create_path(paths); - if (ret < 0 && ret != -EEXIST) - return ret; - ret = recursive_create_path(paths, level); - if (ret < 0) - return ret; - paths.pop_back(); - } - return 0; -} - -int HashIndex::init_split_folder(vector &path, uint32_t hash_level) -{ - // Get the number of sub directories for the current path - vector subdirs; - int ret = list_subdirs(path, &subdirs); - if (ret < 0) - return ret; - subdir_info_s info; - info.subdirs = subdirs.size(); - info.hash_level = hash_level; - ret = set_info(path, info); - if (ret < 0) - return ret; - ret = fsync_dir(path); - if (ret < 0) - return ret; - - // Do the same for subdirs - vector::const_iterator iter; - for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) { - path.push_back(*iter); - ret = init_split_folder(path, hash_level + 1); - if (ret < 0) - return ret; - path.pop_back(); - } - return 0; -} - -int HashIndex::recursive_create_path(vector& path, int level) -{ - if (level == 0) - return 0; - for (int i = 0; i < 16; ++i) { - path.push_back(to_hex(i)); - int ret = create_path(path); - if (ret < 0 && ret != -EEXIST) - return ret; - ret = recursive_create_path(path, level - 1); - if (ret < 0) - return ret; - path.pop_back(); - } - return 0; -} - -int HashIndex::recursive_remove(const vector &path) { - vector subdirs; - int r = list_subdirs(path, &subdirs); - if (r < 0) - return r; - map objects; - r = list_objects(path, 0, 0, &objects); - if (r < 0) - return r; - if (!objects.empty()) - return -ENOTEMPTY; - vector subdir(path); - for (vector::iterator i = subdirs.begin(); - i != subdirs.end(); - ++i) { - subdir.push_back(*i); - r = recursive_remove(subdir); - if (r < 0) - return r; - subdir.pop_back(); - } - return remove_path(path); -} - -int HashIndex::start_col_split(const vector &path) { - bufferlist bl; - InProgressOp op_tag(InProgressOp::COL_SPLIT, path); - op_tag.encode(bl); - int r = add_attr_path(vector(), IN_PROGRESS_OP_TAG, bl); - if (r < 0) - return r; - return fsync_dir(vector()); -} - -int 
HashIndex::start_split(const vector &path) { - bufferlist bl; - InProgressOp op_tag(InProgressOp::SPLIT, path); - op_tag.encode(bl); - int r = add_attr_path(vector(), IN_PROGRESS_OP_TAG, bl); - if (r < 0) - return r; - return fsync_dir(vector()); -} - -int HashIndex::start_merge(const vector &path) { - bufferlist bl; - InProgressOp op_tag(InProgressOp::MERGE, path); - op_tag.encode(bl); - int r = add_attr_path(vector(), IN_PROGRESS_OP_TAG, bl); - if (r < 0) - return r; - return fsync_dir(vector()); -} - -int HashIndex::end_split_or_merge(const vector &path) { - return remove_attr_path(vector(), IN_PROGRESS_OP_TAG); -} - -int HashIndex::get_info(const vector &path, subdir_info_s *info) { - bufferlist buf; - int r = get_attr_path(path, SUBDIR_ATTR, buf); - if (r < 0) - return r; - bufferlist::iterator bufiter = buf.begin(); - info->decode(bufiter); - assert(path.size() == (unsigned)info->hash_level); - return 0; -} - -int HashIndex::set_info(const vector &path, const subdir_info_s &info) { - bufferlist buf; - assert(path.size() == (unsigned)info.hash_level); - info.encode(buf); - return add_attr_path(path, SUBDIR_ATTR, buf); -} - -bool HashIndex::must_merge(const subdir_info_s &info) { - return (info.hash_level > 0 && - merge_threshold > 0 && - info.objs < (unsigned)merge_threshold && - info.subdirs == 0); -} - -bool HashIndex::must_split(const subdir_info_s &info) { - return (info.hash_level < (unsigned)MAX_HASH_LEVEL && - info.objs > ((unsigned)(abs(merge_threshold)) * 16 * split_multiplier)); - -} - -int HashIndex::initiate_merge(const vector &path, subdir_info_s info) { - return start_merge(path); -} - -int HashIndex::complete_merge(const vector &path, subdir_info_s info) { - vector dst = path; - dst.pop_back(); - subdir_info_s dstinfo; - int r, exists; - r = path_exists(path, &exists); - if (r < 0) - return r; - r = get_info(dst, &dstinfo); - if (r < 0) - return r; - if (exists) { - r = move_objects(path, dst); - if (r < 0) - return r; - r = reset_attr(dst); - 
if (r < 0) - return r; - r = remove_path(path); - if (r < 0) - return r; - } - if (must_merge(dstinfo)) { - r = initiate_merge(dst, dstinfo); - if (r < 0) - return r; - r = fsync_dir(dst); - if (r < 0) - return r; - return complete_merge(dst, dstinfo); - } - r = fsync_dir(dst); - if (r < 0) - return r; - return end_split_or_merge(path); -} - -int HashIndex::initiate_split(const vector &path, subdir_info_s info) { - return start_split(path); -} - -int HashIndex::complete_split(const vector &path, subdir_info_s info) { - int level = info.hash_level; - map objects; - vector dst = path; - int r; - dst.push_back(""); - r = list_objects(path, 0, 0, &objects); - if (r < 0) - return r; - vector subdirs_vec; - r = list_subdirs(path, &subdirs_vec); - if (r < 0) - return r; - set subdirs; - subdirs.insert(subdirs_vec.begin(), subdirs_vec.end()); - map > mapped; - map moved; - int num_moved = 0; - for (map::iterator i = objects.begin(); - i != objects.end(); - ++i) { - vector new_path; - get_path_components(i->second, &new_path); - mapped[new_path[level]][i->first] = i->second; - } - for (map >::iterator i = mapped.begin(); - i != mapped.end(); - ) { - dst[level] = i->first; - /* If the info already exists, it must be correct, - * we may be picking up a partially finished split */ - subdir_info_s temp; - // subdir has already been fully copied - if (subdirs.count(i->first) && !get_info(dst, &temp)) { - for (map::iterator j = i->second.begin(); - j != i->second.end(); - ++j) { - moved[j->first] = j->second; - num_moved++; - objects.erase(j->first); - } - ++i; - continue; - } - - subdir_info_s info_new; - info_new.objs = i->second.size(); - info_new.subdirs = 0; - info_new.hash_level = level + 1; - if (must_merge(info_new) && !subdirs.count(i->first)) { - mapped.erase(i++); - continue; - } - - // Subdir doesn't yet exist - if (!subdirs.count(i->first)) { - info.subdirs += 1; - r = create_path(dst); - if (r < 0) - return r; - } // else subdir has been created but only partially 
copied - - for (map::iterator j = i->second.begin(); - j != i->second.end(); - ++j) { - moved[j->first] = j->second; - num_moved++; - objects.erase(j->first); - r = link_object(path, dst, j->second, j->first); - // May be a partially finished split - if (r < 0 && r != -EEXIST) { - return r; - } - } - - r = fsync_dir(dst); - if (r < 0) - return r; - - // Presence of info must imply that all objects have been copied - r = set_info(dst, info_new); - if (r < 0) - return r; - - r = fsync_dir(dst); - if (r < 0) - return r; - - ++i; - } - r = remove_objects(path, moved, &objects); - if (r < 0) - return r; - info.objs = objects.size(); - r = reset_attr(path); - if (r < 0) - return r; - r = fsync_dir(path); - if (r < 0) - return r; - return end_split_or_merge(path); -} - -void HashIndex::get_path_components(const ghobject_t &oid, - vector *path) { - char buf[MAX_HASH_LEVEL + 1]; - snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key()); - - // Path components are the hex characters of oid.hobj.hash, least - // significant first - for (int i = 0; i < MAX_HASH_LEVEL; ++i) { - path->push_back(string(&buf[i], 1)); - } -} - -string HashIndex::get_hash_str(uint32_t hash) { - char buf[MAX_HASH_LEVEL + 1]; - snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash); - string retval; - for (int i = 0; i < MAX_HASH_LEVEL; ++i) { - retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]); - } - return retval; -} - -string HashIndex::get_path_str(const ghobject_t &oid) { - assert(!oid.is_max()); - return get_hash_str(oid.hobj.get_hash()); -} - -uint32_t HashIndex::hash_prefix_to_hash(string prefix) { - while (prefix.size() < sizeof(uint32_t) * 2) { - prefix.push_back('0'); - } - uint32_t hash; - sscanf(prefix.c_str(), "%x", &hash); - // nibble reverse - hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4); - hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8); - hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16); - return hash; 
-} - -int HashIndex::get_path_contents_by_hash_bitwise( - const vector &path, - const ghobject_t *next_object, - set *hash_prefixes, - set, CmpPairBitwise> *objects) -{ - map rev_objects; - int r; - r = list_objects(path, 0, 0, &rev_objects); - if (r < 0) - return r; - // bitwise sort - for (map::iterator i = rev_objects.begin(); - i != rev_objects.end(); - ++i) { - if (next_object && cmp_bitwise(i->second, *next_object) < 0) - continue; - string hash_prefix = get_path_str(i->second); - hash_prefixes->insert(hash_prefix); - objects->insert(pair(hash_prefix, i->second)); - } - vector subdirs; - r = list_subdirs(path, &subdirs); - if (r < 0) - return r; - - // sort subdirs bitwise (by reversing hex digit nibbles) - std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise); - - // Local to this function, we will convert the prefix strings - // (previously simply the reversed hex digits) to also have each - // digit's nibbles reversed. This will make the strings sort - // bitwise. - string cur_prefix; - for (vector::const_iterator i = path.begin(); - i != path.end(); - ++i) { - cur_prefix.append(reverse_hexdigit_bits_string(*i)); - } - string next_object_string; - if (next_object) - next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object)); - for (vector::iterator i = subdirs.begin(); - i != subdirs.end(); - ++i) { - string candidate = cur_prefix + reverse_hexdigit_bits_string(*i); - if (next_object) { - if (next_object->is_max()) - continue; - if (candidate < next_object_string.substr(0, candidate.size())) - continue; - } - // re-reverse the hex digit nibbles for the caller - hash_prefixes->insert(reverse_hexdigit_bits_string(candidate)); - } - return 0; -} - -int HashIndex::get_path_contents_by_hash_nibblewise( - const vector &path, - const ghobject_t *next_object, - set *hash_prefixes, - set, CmpPairNibblewise > *objects) -{ - map rev_objects; - int r; - r = list_objects(path, 0, 0, &rev_objects); - if (r < 0) - return r; - - for 
(map::iterator i = rev_objects.begin(); - i != rev_objects.end(); - ++i) { - string hash_prefix = get_path_str(i->second); - if (next_object && cmp_nibblewise(i->second, *next_object) < 0) - continue; - hash_prefixes->insert(hash_prefix); - objects->insert(pair(hash_prefix, i->second)); - } - - vector subdirs; - r = list_subdirs(path, &subdirs); - if (r < 0) - return r; - - // sort nibblewise (string sort of (reversed) hex digits) - std::sort(subdirs.begin(), subdirs.end()); - - string cur_prefix; - for (vector::const_iterator i = path.begin(); - i != path.end(); - ++i) { - cur_prefix.append(*i); - } - string next_object_string; - if (next_object) - next_object_string = get_path_str(*next_object); - - for (vector::iterator i = subdirs.begin(); - i != subdirs.end(); - ++i) { - string candidate = cur_prefix + *i; - if (next_object) { - if (next_object->is_max()) - continue; - if (candidate < next_object_string.substr(0, candidate.size())) - continue; - } - hash_prefixes->insert(cur_prefix + *i); - } - return 0; -} - -int HashIndex::list_by_hash(const vector &path, - const ghobject_t &end, - bool sort_bitwise, - int max_count, - ghobject_t *next, - vector *out) -{ - assert(out); - if (sort_bitwise) - return list_by_hash_bitwise(path, end, max_count, next, out); - else - return list_by_hash_nibblewise(path, end, max_count, next, out); -} - -int HashIndex::list_by_hash_bitwise( - const vector &path, - const ghobject_t& end, - int max_count, - ghobject_t *next, - vector *out) -{ - vector next_path = path; - next_path.push_back(""); - set hash_prefixes; - set, CmpPairBitwise> objects; - int r = get_path_contents_by_hash_bitwise(path, - next, - &hash_prefixes, - &objects); - if (r < 0) - return r; - for (set::iterator i = hash_prefixes.begin(); - i != hash_prefixes.end(); - ++i) { - dout(20) << __func__ << " prefix " << *i << dendl; - set, CmpPairBitwise>::iterator j = objects.lower_bound( - make_pair(*i, ghobject_t())); - if (j == objects.end() || j->first != *i) { - 
*(next_path.rbegin()) = *(i->rbegin()); - ghobject_t next_recurse; - if (next) - next_recurse = *next; - r = list_by_hash_bitwise(next_path, - end, - max_count, - &next_recurse, - out); - - if (r < 0) - return r; - if (!next_recurse.is_max()) { - if (next) - *next = next_recurse; - return 0; - } - } else { - while (j != objects.end() && j->first == *i) { - if (max_count > 0 && out->size() == (unsigned)max_count) { - if (next) - *next = j->second; - return 0; - } - if (cmp_bitwise(j->second, end) >= 0) { - if (next) - *next = ghobject_t::get_max(); - return 0; - } - if (!next || cmp_bitwise(j->second, *next) >= 0) { - dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl; - out->push_back(j->second); - } - ++j; - } - } - } - if (next) - *next = ghobject_t::get_max(); - return 0; -} - -int HashIndex::list_by_hash_nibblewise( - const vector &path, - const ghobject_t& end, - int max_count, - ghobject_t *next, - vector *out) -{ - vector next_path = path; - next_path.push_back(""); - set hash_prefixes; - set, CmpPairNibblewise> objects; - int r = get_path_contents_by_hash_nibblewise(path, - next, - &hash_prefixes, - &objects); - if (r < 0) - return r; - for (set::iterator i = hash_prefixes.begin(); - i != hash_prefixes.end(); - ++i) { - dout(20) << __func__ << " prefix " << *i << dendl; - set, CmpPairNibblewise >::iterator j = - objects.lower_bound(make_pair(*i, ghobject_t())); - if (j == objects.end() || j->first != *i) { - *(next_path.rbegin()) = *(i->rbegin()); - ghobject_t next_recurse; - if (next) - next_recurse = *next; - r = list_by_hash_nibblewise(next_path, - end, - max_count, - &next_recurse, - out); - - if (r < 0) - return r; - if (!next_recurse.is_max()) { - if (next) - *next = next_recurse; - return 0; - } - } else { - while (j != objects.end() && j->first == *i) { - if (max_count > 0 && out->size() == (unsigned)max_count) { - if (next) - *next = j->second; - return 0; - } - if (cmp_nibblewise(j->second, end) >= 0) { - if (next) - *next = 
ghobject_t::get_max(); - return 0; - } - if (!next || cmp_nibblewise(j->second, *next) >= 0) { - out->push_back(j->second); - } - ++j; - } - } - } - if (next) - *next = ghobject_t::get_max(); - return 0; -} diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h deleted file mode 100644 index c3808bd96673..000000000000 --- a/src/os/HashIndex.h +++ /dev/null @@ -1,432 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef CEPH_HASHINDEX_H -#define CEPH_HASHINDEX_H - -#include "include/buffer_fwd.h" -#include "include/encoding.h" -#include "LFNIndex.h" - -extern string reverse_hexdigit_bits_string(string l); - -/** - * Implements collection prehashing. - * - * @verbatim - * (root) - 0 - 0 - * - 1 - * - E - * - 1 - * - 2 - D - 0 - * . - * . - * . - * - F - 0 - * @endverbatim - * - * A file is located at the longest existing directory from the root - * given by the hex characters in the hash beginning with the least - * significant. - * - * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2) - * would be located in (root)/2/D/0/ - * - * Subdirectories are created when the number of objects in a directory - * exceed (abs(merge_threshhold)) * 16 * split_multiplier. The number of objects in a directory - * is encoded as subdir_info_s in an xattr on the directory. 
- */ -class HashIndex : public LFNIndex { -private: - /// Attribute name for storing subdir info @see subdir_info_s - static const string SUBDIR_ATTR; - /// Attribute name for storing in progress op tag - static const string IN_PROGRESS_OP_TAG; - /// Size (bits) in object hash - static const int PATH_HASH_LEN = 32; - /// Max length of hashed path - static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4); - - /** - * Merges occur when the number of object drops below - * merge_threshold and splits occur when the number of objects - * exceeds 16 * abs(merge_threshold) * split_multiplier. - * Please note if merge_threshold is less than zero, it will never do merging - */ - int merge_threshold; - int split_multiplier; - - /// Encodes current subdir state for determining when to split/merge. - struct subdir_info_s { - uint64_t objs; ///< Objects in subdir. - uint32_t subdirs; ///< Subdirs in subdir. - uint32_t hash_level; ///< Hashlevel of subdir. - - subdir_info_s() : objs(0), subdirs(0), hash_level(0) {} - - void encode(bufferlist &bl) const - { - __u8 v = 1; - ::encode(v, bl); - ::encode(objs, bl); - ::encode(subdirs, bl); - ::encode(hash_level, bl); - } - - void decode(bufferlist::iterator &bl) - { - __u8 v; - ::decode(v, bl); - assert(v == 1); - ::decode(objs, bl); - ::decode(subdirs, bl); - ::decode(hash_level, bl); - } - }; - - /// Encodes in progress split or merge - struct InProgressOp { - static const int SPLIT = 0; - static const int MERGE = 1; - static const int COL_SPLIT = 2; - int op; - vector path; - - InProgressOp(int op, const vector &path) - : op(op), path(path) {} - - InProgressOp(bufferlist::iterator &bl) { - decode(bl); - } - - bool is_split() const { return op == SPLIT; } - bool is_col_split() const { return op == COL_SPLIT; } - bool is_merge() const { return op == MERGE; } - - void encode(bufferlist &bl) const { - __u8 v = 1; - ::encode(v, bl); - ::encode(op, bl); - ::encode(path, bl); - } - - void decode(bufferlist::iterator &bl) { - __u8 v; - 
::decode(v, bl); - assert(v == 1); - ::decode(op, bl); - ::decode(path, bl); - } - }; - - -public: - /// Constructor. - HashIndex( - coll_t collection, ///< [in] Collection - const char *base_path, ///< [in] Path to the index root. - int merge_at, ///< [in] Merge threshhold. - int split_multiple, ///< [in] Split threshhold. - uint32_t index_version,///< [in] Index version - double retry_probability=0) ///< [in] retry probability - : LFNIndex(collection, base_path, index_version, retry_probability), - merge_threshold(merge_at), - split_multiplier(split_multiple) {} - - /// @see CollectionIndex - uint32_t collection_version() { return index_version; } - - /// @see CollectionIndex - int cleanup(); - - /// @see CollectionIndex - int prep_delete(); - - /// @see CollectionIndex - int _split( - uint32_t match, - uint32_t bits, - CollectionIndex* dest - ); - -protected: - int _init(); - - int _created( - const vector &path, - const ghobject_t &oid, - const string &mangled_name - ); - int _remove( - const vector &path, - const ghobject_t &oid, - const string &mangled_name - ); - int _lookup( - const ghobject_t &oid, - vector *path, - string *mangled_name, - int *hardlink - ); - - /** - * Pre-hash the collection to create folders according to the expected number - * of objects in this collection. 
- */ - int _pre_hash_collection( - uint32_t pg_num, - uint64_t expected_num_objs - ); - - int _collection_list_partial( - const ghobject_t &start, - const ghobject_t &end, - bool sort_bitwise, - int max_count, - vector *ls, - ghobject_t *next - ); -private: - /// Recursively remove path and its subdirs - int recursive_remove( - const vector &path ///< [in] path to remove - ); /// @return Error Code, 0 on success - /// Tag root directory at beginning of col_split - int start_col_split( - const vector &path ///< [in] path to split - ); ///< @return Error Code, 0 on success - /// Tag root directory at beginning of split - int start_split( - const vector &path ///< [in] path to split - ); ///< @return Error Code, 0 on success - /// Tag root directory at beginning of split - int start_merge( - const vector &path ///< [in] path to merge - ); ///< @return Error Code, 0 on success - /// Remove tag at end of split or merge - int end_split_or_merge( - const vector &path ///< [in] path to split or merged - ); ///< @return Error Code, 0 on success - /// Gets info from the xattr on the subdir represented by path - int get_info( - const vector &path, ///< [in] Path from which to read attribute. - subdir_info_s *info ///< [out] Attribute value - ); /// @return Error Code, 0 on success - - /// Sets info to the xattr on the subdir represented by path - int set_info( - const vector &path, ///< [in] Path on which to set attribute. - const subdir_info_s &info ///< [in] Value to set - ); /// @return Error Code, 0 on success - - /// Encapsulates logic for when to split. - bool must_merge( - const subdir_info_s &info ///< [in] Info to check - ); /// @return True if info must be merged, False otherwise - - /// Encapsulates logic for when to merge. 
- bool must_split( - const subdir_info_s &info ///< [in] Info to check - ); /// @return True if info must be split, False otherwise - - /// Initiates merge - int initiate_merge( - const vector &path, ///< [in] Subdir to merge - subdir_info_s info ///< [in] Info attached to path - ); /// @return Error Code, 0 on success - - /// Completes merge - int complete_merge( - const vector &path, ///< [in] Subdir to merge - subdir_info_s info ///< [in] Info attached to path - ); /// @return Error Code, 0 on success - - /// Resets attr to match actual subdir contents - int reset_attr( - const vector &path ///< [in] path to cleanup - ); - - /// Initiate Split - int initiate_split( - const vector &path, ///< [in] Subdir to split - subdir_info_s info ///< [in] Info attached to path - ); /// @return Error Code, 0 on success - - /// Completes Split - int complete_split( - const vector &path, ///< [in] Subdir to split - subdir_info_s info ///< [in] Info attached to path - ); /// @return Error Code, 0 on success - - /// Determine path components from hoid hash - void get_path_components( - const ghobject_t &oid, ///< [in] Object for which to get path components - vector *path ///< [out] Path components for hoid. - ); - - /// Pre-hash and split folders to avoid runtime splitting - /// according to the given expected object number. - int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs); - - /// Initialize the folder (dir info) with the given hash - /// level and number of its subdirs. 
- int init_split_folder(vector &path, uint32_t hash_level); - - /// do collection split for path - static int col_split_level( - HashIndex &from, ///< [in] from index - HashIndex &dest, ///< [in] to index - const vector &path, ///< [in] path to split - uint32_t bits, ///< [in] num bits to match - uint32_t match, ///< [in] bits to match - unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred - ); - - - /** - * Get string representation of ghobject_t/hash - * - * e.g: 0x01234567 -> "76543210" - */ - static string get_path_str( - const ghobject_t &oid ///< [in] Object to get hash string for - ); ///< @return Hash string for hoid. - - /// Get string from hash, @see get_path_str - static string get_hash_str( - uint32_t hash ///< [in] Hash to convert to a string. - ); ///< @return String representation of hash - - /// Get hash from hash prefix string e.g. "FFFFAB" -> 0xFFFFAB00 - static uint32_t hash_prefix_to_hash( - string prefix ///< [in] string to convert - ); ///< @return Hash - - /// Get hash mod from path - static void path_to_hobject_hash_prefix( - const vector &path,///< [in] path to convert - uint32_t *bits, ///< [out] bits - uint32_t *hash ///< [out] hash - ) { - string hash_str; - for (vector::const_iterator i = path.begin(); - i != path.end(); - ++i) { - hash_str.push_back(*i->begin()); - } - uint32_t rev_hash = hash_prefix_to_hash(hash_str); - if (hash) - *hash = rev_hash; - if (bits) - *bits = path.size() * 4; - } - - /// Calculate the number of bits. - static int calc_num_bits(uint64_t n) { - int ret = 0; - while (n > 0) { - n = n >> 1; - ret++; - } - return ret; - } - - /// Convert a number to hex string (upper case). - static string to_hex(int n) { - assert(n >= 0 && n < 16); - char c = (n <= 9 ? 
('0' + n) : ('A' + n - 10)); - string str; - str.append(1, c); - return str; - } - - struct CmpPairNibblewise { - bool operator()(const pair& l, - const pair& r) - { - if (l.first < r.first) - return true; - if (l.first > r.first) - return false; - if (cmp_nibblewise(l.second, r.second) < 0) - return true; - return false; - } - }; - - struct CmpPairBitwise { - bool operator()(const pair& l, - const pair& r) - { - if (l.first < r.first) - return true; - if (l.first > r.first) - return false; - if (cmp_bitwise(l.second, r.second) < 0) - return true; - return false; - } - }; - - struct CmpHexdigitStringBitwise { - bool operator()(const string& l, const string& r) { - return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r); - } - }; - - /// Get path contents by hash - int get_path_contents_by_hash_bitwise( - const vector &path, /// [in] Path to list - const ghobject_t *next_object, /// [in] list > *next_object - set *hash_prefixes, /// [out] prefixes in dir - set, CmpPairBitwise> *objects /// [out] objects - ); - int get_path_contents_by_hash_nibblewise( - const vector &path, /// [in] Path to list - const ghobject_t *next_object, /// [in] list > *next_object - set *hash_prefixes, /// [out] prefixes in dir - set, CmpPairNibblewise> *objects /// [out] objects - ); - - /// List objects in collection in ghobject_t order - int list_by_hash( - const vector &path, /// [in] Path to list - const ghobject_t &end, /// [in] List only objects < end - bool sort_bitwise, /// [in] sort bitwise - int max_count, /// [in] List at most max_count - ghobject_t *next, /// [in,out] List objects >= *next - vector *out /// [out] Listed objects - ); ///< @return Error Code, 0 on success - /// List objects in collection in ghobject_t order - int list_by_hash_bitwise( - const vector &path, /// [in] Path to list - const ghobject_t &end, /// [in] List only objects < end - int max_count, /// [in] List at most max_count - ghobject_t *next, /// [in,out] List objects >= *next - vector 
*out /// [out] Listed objects - ); ///< @return Error Code, 0 on success - int list_by_hash_nibblewise( - const vector &path, /// [in] Path to list - const ghobject_t &end, /// [in] List only objects < end - int max_count, /// [in] List at most max_count - ghobject_t *next, /// [in,out] List objects >= *next - vector *out /// [out] Listed objects - ); ///< @return Error Code, 0 on success - - /// Create the given levels of sub directories from the given root. - /// The contents of *path* is not changed after calling this function. - int recursive_create_path(vector& path, int level); -}; - -#endif diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc deleted file mode 100644 index 1415939f92db..000000000000 --- a/src/os/IndexManager.cc +++ /dev/null @@ -1,136 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "include/memory.h" -#include "include/unordered_map.h" - -#if defined(__FreeBSD__) -#include -#endif - -#include - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/config.h" -#include "common/debug.h" -#include "include/buffer.h" - -#include "IndexManager.h" -#include "HashIndex.h" -#include "CollectionIndex.h" - -#include "chain_xattr.h" - -static int set_version(const char *path, uint32_t version) { - bufferlist bl; - ::encode(version, bl); - return chain_setxattr(path, "user.cephos.collection_version", bl.c_str(), - bl.length(), true); -} - -static int get_version(const char *path, uint32_t *version) { - bufferptr bp(PATH_MAX); - int r = chain_getxattr(path, "user.cephos.collection_version", - bp.c_str(), bp.length()); - if (r < 0) { - if (r != -ENOENT) { - *version = 0; - return 0; - } else { - return r; - } - } - bp.set_length(r); - bufferlist bl; - bl.push_back(bp); - bufferlist::iterator i = bl.begin(); - ::decode(*version, i); - return 0; -} - -IndexManager::~IndexManager() { - - for (ceph::unordered_map ::iterator it = col_indices.begin(); - it != col_indices.end(); ++it) { - - delete it->second; - it->second = NULL; - } - col_indices.clear(); -} - - -int IndexManager::init_index(coll_t c, const char *path, uint32_t version) { - Mutex::Locker l(lock); - int r = set_version(path, version); - if (r < 0) - return r; - HashIndex index(c, path, g_conf->filestore_merge_threshold, - g_conf->filestore_split_multiple, - version, - g_conf->filestore_index_retry_probability); - return index.init(); -} - -int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) { - if (upgrade) { - // Need to check the collection generation - int r; - uint32_t version = 0; - r = get_version(path, &version); - if (r < 0) - return r; - - switch (version) { - case CollectionIndex::FLAT_INDEX_TAG: - case CollectionIndex::HASH_INDEX_TAG: // fall through - case CollectionIndex::HASH_INDEX_TAG_2: // fall through - case 
CollectionIndex::HOBJECT_WITH_POOL: { - // Must be a HashIndex - *index = new HashIndex(c, path, g_conf->filestore_merge_threshold, - g_conf->filestore_split_multiple, version); - return 0; - } - default: assert(0); - } - - } else { - // No need to check - *index = new HashIndex(c, path, g_conf->filestore_merge_threshold, - g_conf->filestore_split_multiple, - CollectionIndex::HOBJECT_WITH_POOL, - g_conf->filestore_index_retry_probability); - return 0; - } -} - -int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) { - - Mutex::Locker l(lock); - ceph::unordered_map ::iterator it = col_indices.find(c); - if (it == col_indices.end()) { - char path[PATH_MAX]; - snprintf(path, sizeof(path), "%s/current/%s", baseDir.c_str(), c.to_str().c_str()); - CollectionIndex* colIndex = NULL; - int r = build_index(c, path, &colIndex); - if (r < 0) - return r; - col_indices[c] = colIndex; - index->index = colIndex; - } else { - index->index = it->second; - } - return 0; -} diff --git a/src/os/IndexManager.h b/src/os/IndexManager.h deleted file mode 100644 index b167e7de28f8..000000000000 --- a/src/os/IndexManager.h +++ /dev/null @@ -1,96 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ -#ifndef OS_INDEXMANAGER_H -#define OS_INDEXMANAGER_H - -#include "include/memory.h" -#include "include/unordered_map.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/config.h" -#include "common/debug.h" - -#include "CollectionIndex.h" -#include "HashIndex.h" - - -/// Public type for Index -struct Index { - CollectionIndex *index; - - Index() : index(NULL) {} - Index(CollectionIndex* index) : index(index) {} - - CollectionIndex *operator->() { return index; } - CollectionIndex &operator*() { return *index; } -}; - - -/** - * Encapsulates mutual exclusion for CollectionIndexes. - * - * Allowing a modification (removal or addition of an object) to occur - * while a read is occuring (lookup of an object's path and use of - * that path) may result in the path becoming invalid. Thus, during - * the lifetime of a CollectionIndex object and any paths returned - * by it, no other concurrent accesses may be allowed. - * This is enforced by using CollectionIndex::access_lock - */ -class IndexManager { - Mutex lock; ///< Lock for Index Manager - bool upgrade; - ceph::unordered_map col_indices; - - /** - * Index factory - * - * Encapsulates logic for handling legacy FileStore - * layouts - * - * @param [in] c Collection for which to get index - * @param [in] path Path to collection - * @param [out] index Index for c - * @return error code - */ - int build_index(coll_t c, const char *path, CollectionIndex **index); -public: - /// Constructor - IndexManager(bool upgrade) : lock("IndexManager lock"), - upgrade(upgrade) {} - - ~IndexManager(); - - /** - * Reserve and return index for c - * - * @param [in] c Collection for which to get index - * @param [in] baseDir base directory of collections - * @param [out] index Index for c - * @return error code - */ - int get_index(coll_t c, const string& baseDir, Index *index); - - /** - * Initialize index for collection c at path - * - * @param [in] c Collection for which to init Index - * @param [in] 
path Path to collection - * @param [in] filestore_version version of containing FileStore - * @return error code - */ - int init_index(coll_t c, const char *path, uint32_t filestore_version); -}; - -#endif diff --git a/src/os/Journal.h b/src/os/Journal.h deleted file mode 100644 index 400b1ea8b15f..000000000000 --- a/src/os/Journal.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef CEPH_JOURNAL_H -#define CEPH_JOURNAL_H - -#include - -#include "include/buffer_fwd.h" -#include "include/Context.h" -#include "common/Finisher.h" -#include "common/TrackedOp.h" -#include "os/ObjectStore.h" - -class PerfCounters; - -class Journal { -protected: - uuid_d fsid; - Finisher *finisher; -public: - PerfCounters *logger; -protected: - Cond *do_sync_cond; - bool wait_on_full; - -public: - Journal(uuid_d f, Finisher *fin, Cond *c=0) : - fsid(f), finisher(fin), logger(NULL), - do_sync_cond(c), - wait_on_full(false) { } - virtual ~Journal() { } - - virtual int check() = 0; ///< check if journal appears valid - virtual int create() = 0; ///< create a fresh journal - virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal - virtual void close() = 0; ///< close an open journal - - virtual void flush() = 0; - virtual void throttle() = 0; - - virtual int dump(ostream& out) { return -EOPNOTSUPP; } - - void set_wait_on_full(bool b) { wait_on_full = b; } - - // writes - virtual bool is_writeable() = 0; - virtual int make_writeable() = 0; - virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len, - Context *oncommit, - TrackedOpRef osd_op = TrackedOpRef()) 
= 0; - virtual void commit_start(uint64_t seq) = 0; - virtual void committed_thru(uint64_t seq) = 0; - - /// Read next journal entry - asserts on invalid journal - virtual bool read_entry( - bufferlist &bl, ///< [out] payload on successful read - uint64_t &seq ///< [in,out] sequence number on last successful read - ) = 0; ///< @return true on successful read, false on journal end - - virtual bool should_commit_now() = 0; - - virtual int prepare_entry(list& tls, bufferlist* tbl) = 0; - - // reads/recovery - -}; - -#endif diff --git a/src/os/JournalingObjectStore.cc b/src/os/JournalingObjectStore.cc deleted file mode 100644 index 599a1b568cb6..000000000000 --- a/src/os/JournalingObjectStore.cc +++ /dev/null @@ -1,269 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- - -#include "JournalingObjectStore.h" - -#include "common/errno.h" -#include "common/debug.h" - -#define dout_subsys ceph_subsys_journal -#undef dout_prefix -#define dout_prefix *_dout << "journal " - - - -void JournalingObjectStore::journal_start() -{ - dout(10) << "journal_start" << dendl; - finisher.start(); -} - -void JournalingObjectStore::journal_stop() -{ - dout(10) << "journal_stop" << dendl; - finisher.stop(); -} - -// A journal_replay() makes journal writeable, this closes that out. 
-void JournalingObjectStore::journal_write_close() -{ - if (journal) { - journal->close(); - delete journal; - journal = 0; - } - apply_manager.reset(); -} - -int JournalingObjectStore::journal_replay(uint64_t fs_op_seq) -{ - dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl; - - if (g_conf->journal_replay_from) { - dout(0) << "journal_replay forcing replay from " << g_conf->journal_replay_from - << " instead of " << fs_op_seq << dendl; - // the previous op is the last one committed - fs_op_seq = g_conf->journal_replay_from - 1; - } - - uint64_t op_seq = fs_op_seq; - apply_manager.init_seq(fs_op_seq); - - if (!journal) { - submit_manager.set_op_seq(op_seq); - return 0; - } - - int err = journal->open(op_seq); - if (err < 0) { - dout(3) << "journal_replay open failed with " - << cpp_strerror(err) << dendl; - delete journal; - journal = 0; - return err; - } - - replaying = true; - - int count = 0; - while (1) { - bufferlist bl; - uint64_t seq = op_seq + 1; - if (!journal->read_entry(bl, seq)) { - dout(3) << "journal_replay: end of journal, done." << dendl; - break; - } - - if (seq <= op_seq) { - dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << op_seq << dendl; - continue; - } - assert(op_seq == seq-1); - - dout(3) << "journal_replay: applying op seq " << seq << dendl; - bufferlist::iterator p = bl.begin(); - list tls; - while (!p.end()) { - Transaction *t = new Transaction(p); - tls.push_back(t); - } - - apply_manager.op_apply_start(seq); - int r = do_transactions(tls, seq); - apply_manager.op_apply_finish(seq); - - op_seq = seq; - - while (!tls.empty()) { - delete tls.front(); - tls.pop_front(); - } - - dout(3) << "journal_replay: r = " << r << ", op_seq now " << op_seq << dendl; - } - - replaying = false; - - submit_manager.set_op_seq(op_seq); - - // done reading, make writeable. 
- err = journal->make_writeable(); - if (err < 0) - return err; - - return count; -} - - -// ------------------------------------ - -uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op) -{ - Mutex::Locker l(apply_lock); - while (blocked) { - // note: this only happens during journal replay - dout(10) << "op_apply_start blocked, waiting" << dendl; - blocked_cond.Wait(apply_lock); - } - dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> " << (open_ops+1) << dendl; - assert(!blocked); - assert(op > committed_seq); - open_ops++; - return op; -} - -void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op) -{ - Mutex::Locker l(apply_lock); - dout(10) << "op_apply_finish " << op << " open_ops " << open_ops - << " -> " << (open_ops-1) - << ", max_applied_seq " << max_applied_seq << " -> " << MAX(op, max_applied_seq) - << dendl; - --open_ops; - assert(open_ops >= 0); - - // signal a blocked commit_start (only needed during journal replay) - if (blocked) { - blocked_cond.Signal(); - } - - // there can be multiple applies in flight; track the max value we - // note. note that we can't _read_ this value and learn anything - // meaningful unless/until we've quiesced all in-flight applies. 
- if (op > max_applied_seq) - max_applied_seq = op; -} - -uint64_t JournalingObjectStore::SubmitManager::op_submit_start() -{ - lock.Lock(); - uint64_t op = ++op_seq; - dout(10) << "op_submit_start " << op << dendl; - return op; -} - -void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op) -{ - dout(10) << "op_submit_finish " << op << dendl; - if (op != op_submitted + 1) { - dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1) - << ", OUT OF ORDER" << dendl; - assert(0 == "out of order op_submit_finish"); - } - op_submitted = op; - lock.Unlock(); -} - - -// ------------------------------------------ - -void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c) -{ - Mutex::Locker l(com_lock); - assert(c); - commit_waiters[op].push_back(c); -} - -bool JournalingObjectStore::ApplyManager::commit_start() -{ - bool ret = false; - - uint64_t _committing_seq = 0; - { - Mutex::Locker l(apply_lock); - dout(10) << "commit_start max_applied_seq " << max_applied_seq - << ", open_ops " << open_ops - << dendl; - blocked = true; - while (open_ops > 0) { - dout(10) << "commit_start waiting for " << open_ops << " open ops to drain" << dendl; - blocked_cond.Wait(apply_lock); - } - assert(open_ops == 0); - dout(10) << "commit_start blocked, all open_ops have completed" << dendl; - { - Mutex::Locker l(com_lock); - if (max_applied_seq == committed_seq) { - dout(10) << "commit_start nothing to do" << dendl; - blocked = false; - assert(commit_waiters.empty()); - goto out; - } - - _committing_seq = committing_seq = max_applied_seq; - - dout(10) << "commit_start committing " << committing_seq - << ", still blocked" << dendl; - } - } - ret = true; - - out: - if (journal) - journal->commit_start(_committing_seq); // tell the journal too - return ret; -} - -void JournalingObjectStore::ApplyManager::commit_started() -{ - Mutex::Locker l(apply_lock); - // allow new ops. 
(underlying fs should now be committing all prior ops) - dout(10) << "commit_started committing " << committing_seq << ", unblocking" << dendl; - blocked = false; - blocked_cond.Signal(); -} - -void JournalingObjectStore::ApplyManager::commit_finish() -{ - Mutex::Locker l(com_lock); - dout(10) << "commit_finish thru " << committing_seq << dendl; - - if (journal) - journal->committed_thru(committing_seq); - - committed_seq = committing_seq; - - map >::iterator p = commit_waiters.begin(); - while (p != commit_waiters.end() && - p->first <= committing_seq) { - finisher.queue(p->second); - commit_waiters.erase(p++); - } -} - -void JournalingObjectStore::_op_journal_transactions( - bufferlist& tbl, uint32_t orig_len, uint64_t op, - Context *onjournal, TrackedOpRef osd_op) -{ - if (osd_op.get()) - dout(10) << "op_journal_transactions " << op << " reqid_t " - << (static_cast(osd_op.get()))->get_reqid() << dendl; - else - dout(10) << "op_journal_transactions " << op << dendl; - - if (journal && journal->is_writeable()) { - journal->submit_entry(op, tbl, orig_len, onjournal, osd_op); - } else if (onjournal) { - apply_manager.add_waiter(op, onjournal); - } -} - diff --git a/src/os/JournalingObjectStore.h b/src/os/JournalingObjectStore.h deleted file mode 100644 index bba3767b2c3d..000000000000 --- a/src/os/JournalingObjectStore.h +++ /dev/null @@ -1,143 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef CEPH_JOURNALINGOBJECTSTORE_H -#define CEPH_JOURNALINGOBJECTSTORE_H - -#include "ObjectStore.h" -#include "Journal.h" -#include "FileJournal.h" -#include "common/RWLock.h" - -class JournalingObjectStore : public ObjectStore { -protected: - Journal *journal; - Finisher finisher; - - - class SubmitManager { - Mutex lock; - uint64_t op_seq; - uint64_t op_submitted; - public: - SubmitManager() : - lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context), - op_seq(0), op_submitted(0) - {} - uint64_t op_submit_start(); - void op_submit_finish(uint64_t op); - void set_op_seq(uint64_t seq) { - Mutex::Locker l(lock); - op_submitted = op_seq = seq; - } - uint64_t get_op_seq() { - return op_seq; - } - } submit_manager; - - class ApplyManager { - Journal *&journal; - Finisher &finisher; - - Mutex apply_lock; - bool blocked; - Cond blocked_cond; - int open_ops; - uint64_t max_applied_seq; - - Mutex com_lock; - map > commit_waiters; - uint64_t committing_seq, committed_seq; - - public: - ApplyManager(Journal *&j, Finisher &f) : - journal(j), finisher(f), - apply_lock("JOS::ApplyManager::apply_lock", false, true, false, g_ceph_context), - blocked(false), - open_ops(0), - max_applied_seq(0), - com_lock("JOS::ApplyManager::com_lock", false, true, false, g_ceph_context), - committing_seq(0), committed_seq(0) {} - void reset() { - assert(open_ops == 0); - assert(blocked == false); - max_applied_seq = 0; - committing_seq = 0; - committed_seq = 0; - } - void add_waiter(uint64_t, Context*); - uint64_t op_apply_start(uint64_t op); - void op_apply_finish(uint64_t op); - bool commit_start(); - void commit_started(); - void commit_finish(); - bool is_committing() { - Mutex::Locker l(com_lock); - return committing_seq != committed_seq; - } - uint64_t get_committed_seq() { - Mutex::Locker l(com_lock); - return committed_seq; - } - uint64_t get_committing_seq() { - Mutex::Locker l(com_lock); - return committing_seq; - } - void init_seq(uint64_t fs_op_seq) { - { - 
Mutex::Locker l(com_lock); - committed_seq = fs_op_seq; - committing_seq = fs_op_seq; - } - { - Mutex::Locker l(apply_lock); - max_applied_seq = fs_op_seq; - } - } - } apply_manager; - - bool replaying; - -protected: - void journal_start(); - void journal_stop(); - void journal_write_close(); - int journal_replay(uint64_t fs_op_seq); - - void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op, - Context *onjournal, TrackedOpRef osd_op); - - virtual int do_transactions(list& tls, uint64_t op_seq) = 0; - -public: - bool is_committing() { - return apply_manager.is_committing(); - } - uint64_t get_committed_seq() { - return apply_manager.get_committed_seq(); - } - -public: - JournalingObjectStore(const std::string& path) - : ObjectStore(path), - journal(NULL), - finisher(g_ceph_context, "JournalObjectStore"), - apply_manager(journal, finisher), - replaying(false) {} - - ~JournalingObjectStore() { - } -}; - -#endif diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc deleted file mode 100644 index af2df6a92682..000000000000 --- a/src/os/LFNIndex.cc +++ /dev/null @@ -1,1356 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include -#include -#include -#include - -#if defined(__FreeBSD__) -#include -#endif - -#include "osd/osd_types.h" -#include "include/object.h" -#include "common/config.h" -#include "common/debug.h" -#include "include/buffer.h" -#include "common/ceph_crypto.h" -#include "include/compat.h" -#include "chain_xattr.h" - -#include "LFNIndex.h" -using ceph::crypto::SHA1; - -#define dout_subsys ceph_subsys_filestore -#undef dout_prefix -#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") " - - -const string LFNIndex::LFN_ATTR = "user.cephos.lfn"; -const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash."; -const string LFNIndex::SUBDIR_PREFIX = "DIR_"; -const string LFNIndex::FILENAME_COOKIE = "long"; -const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN - - FILENAME_COOKIE.size() - - FILENAME_EXTRA; -void LFNIndex::maybe_inject_failure() -{ - if (error_injection_enabled) { - if (current_failure > last_failure && - (((double)(rand() % 10000))/((double)(10000)) - < error_injection_probability)) { - last_failure = current_failure; - current_failure = 0; - throw RetryException(); - } - ++current_failure; - } -} - -// Helper to close fd's when we leave scope. This is useful when used -// in combination with RetryException, thrown by the above. 
-struct FDCloser { - int fd; - FDCloser(int f) : fd(f) {} - ~FDCloser() { - VOID_TEMP_FAILURE_RETRY(::close(fd)); - } -}; - - -/* Public methods */ - - -int LFNIndex::init() -{ - return _init(); -} - -int LFNIndex::created(const ghobject_t &oid, const char *path) -{ - WRAP_RETRY( - vector path_comp; - string short_name; - r = decompose_full_path(path, &path_comp, 0, &short_name); - if (r < 0) - goto out; - r = lfn_created(path_comp, oid, short_name); - if (r < 0) - goto out; - r = _created(path_comp, oid, short_name); - if (r < 0) - goto out; - ); -} - -int LFNIndex::unlink(const ghobject_t &oid) -{ - WRAP_RETRY( - vector path; - string short_name; - r = _lookup(oid, &path, &short_name, NULL); - if (r < 0) { - goto out; - } - r = _remove(path, oid, short_name); - if (r < 0) { - goto out; - } - ); -} - -int LFNIndex::lookup(const ghobject_t &oid, - IndexedPath *out_path, - int *hardlink) -{ - WRAP_RETRY( - vector path; - string short_name; - r = _lookup(oid, &path, &short_name, hardlink); - if (r < 0) - goto out; - string full_path = get_full_path(path, short_name); - *out_path = IndexedPath(new Path(full_path, this)); - r = 0; - ); -} - -int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) -{ - return _pre_hash_collection(pg_num, expected_num_objs); -} - - -int LFNIndex::collection_list_partial(const ghobject_t &start, - const ghobject_t &end, - bool sort_bitwise, - int max_count, - vector *ls, - ghobject_t *next) -{ - return _collection_list_partial(start, end, sort_bitwise, max_count, ls, next); -} - -/* Derived class utility methods */ - -int LFNIndex::fsync_dir(const vector &path) -{ - maybe_inject_failure(); - int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY); - if (fd < 0) - return -errno; - FDCloser f(fd); - maybe_inject_failure(); - int r = ::fsync(fd); - maybe_inject_failure(); - if (r < 0) - return -errno; - else - return 0; -} - -int LFNIndex::link_object(const vector &from, - const vector &to, - const ghobject_t 
&oid, - const string &from_short_name) -{ - int r; - string from_path = get_full_path(from, from_short_name); - string to_path; - maybe_inject_failure(); - r = lfn_get_name(to, oid, 0, &to_path, 0); - if (r < 0) - return r; - maybe_inject_failure(); - r = ::link(from_path.c_str(), to_path.c_str()); - maybe_inject_failure(); - if (r < 0) - return -errno; - else - return 0; -} - -int LFNIndex::remove_objects(const vector &dir, - const map &to_remove, - map *remaining) -{ - set clean_chains; - for (map::const_iterator to_clean = to_remove.begin(); - to_clean != to_remove.end(); - ++to_clean) { - if (!lfn_is_hashed_filename(to_clean->first)) { - maybe_inject_failure(); - int r = ::unlink(get_full_path(dir, to_clean->first).c_str()); - maybe_inject_failure(); - if (r < 0) - return -errno; - continue; - } - if (clean_chains.count(lfn_get_short_name(to_clean->second, 0))) - continue; - set holes; - map > chain; - for (int i = 0; ; ++i) { - string short_name = lfn_get_short_name(to_clean->second, i); - if (remaining->count(short_name)) { - chain[i] = *(remaining->find(short_name)); - } else if (to_remove.count(short_name)) { - holes.insert(i); - } else { - break; - } - } - - map >::reverse_iterator candidate = chain.rbegin(); - for (set::iterator i = holes.begin(); - i != holes.end(); - ++i) { - if (candidate == chain.rend() || *i > candidate->first) { - string remove_path_name = - get_full_path(dir, lfn_get_short_name(to_clean->second, *i)); - maybe_inject_failure(); - int r = ::unlink(remove_path_name.c_str()); - maybe_inject_failure(); - if (r < 0) - return -errno; - continue; - } - string from = get_full_path(dir, candidate->second.first); - string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i)); - maybe_inject_failure(); - int r = ::rename(from.c_str(), to.c_str()); - maybe_inject_failure(); - if (r < 0) - return -errno; - remaining->erase(candidate->second.first); - remaining->insert(pair( - lfn_get_short_name(candidate->second.second, *i), 
- candidate->second.second)); - ++candidate; - } - if (!holes.empty()) - clean_chains.insert(lfn_get_short_name(to_clean->second, 0)); - } - return 0; -} - -int LFNIndex::move_objects(const vector &from, - const vector &to) -{ - map to_move; - int r; - r = list_objects(from, 0, NULL, &to_move); - if (r < 0) - return r; - for (map::iterator i = to_move.begin(); - i != to_move.end(); - ++i) { - string from_path = get_full_path(from, i->first); - string to_path, to_name; - r = lfn_get_name(to, i->second, &to_name, &to_path, 0); - if (r < 0) - return r; - maybe_inject_failure(); - r = ::link(from_path.c_str(), to_path.c_str()); - if (r < 0 && errno != EEXIST) - return -errno; - maybe_inject_failure(); - r = lfn_created(to, i->second, to_name); - maybe_inject_failure(); - if (r < 0) - return r; - } - r = fsync_dir(to); - if (r < 0) - return r; - for (map::iterator i = to_move.begin(); - i != to_move.end(); - ++i) { - maybe_inject_failure(); - r = ::unlink(get_full_path(from, i->first).c_str()); - maybe_inject_failure(); - if (r < 0) - return -errno; - } - return fsync_dir(from); -} - -int LFNIndex::remove_object(const vector &from, - const ghobject_t &oid) -{ - string short_name; - int r, exist; - maybe_inject_failure(); - r = get_mangled_name(from, oid, &short_name, &exist); - maybe_inject_failure(); - if (r < 0) - return r; - if (exist == 0) - return -ENOENT; - return lfn_unlink(from, oid, short_name); -} - -int LFNIndex::get_mangled_name(const vector &from, - const ghobject_t &oid, - string *mangled_name, int *hardlink) -{ - return lfn_get_name(from, oid, mangled_name, 0, hardlink); -} - -int LFNIndex::move_subdir( - LFNIndex &from, - LFNIndex &dest, - const vector &path, - string dir - ) -{ - vector sub_path(path.begin(), path.end()); - sub_path.push_back(dir); - string from_path(from.get_full_path_subdir(sub_path)); - string to_path(dest.get_full_path_subdir(sub_path)); - int r = ::rename(from_path.c_str(), to_path.c_str()); - if (r < 0) - return -errno; - return 
0; -} - -int LFNIndex::move_object( - LFNIndex &from, - LFNIndex &dest, - const vector &path, - const pair &obj - ) -{ - string from_path(from.get_full_path(path, obj.first)); - string to_path; - string to_name; - int exists; - int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists); - if (r < 0) - return r; - if (!exists) { - r = ::link(from_path.c_str(), to_path.c_str()); - if (r < 0) - return r; - } - r = dest.lfn_created(path, obj.second, to_name); - if (r < 0) - return r; - r = dest.fsync_dir(path); - if (r < 0) - return r; - r = from.remove_object(path, obj.second); - if (r < 0) - return r; - return from.fsync_dir(path); -} - - -static int get_hobject_from_oinfo(const char *dir, const char *file, - ghobject_t *o) -{ - char path[PATH_MAX]; - bufferptr bp(PATH_MAX); - snprintf(path, sizeof(path), "%s/%s", dir, file); - // Hack, user.ceph._ is the attribute used to store the object info - int r = chain_getxattr(path, "user.ceph._", bp.c_str(), bp.length()); - if (r < 0) - return r; - bufferlist bl; - bl.push_back(bp); - object_info_t oi(bl); - *o = ghobject_t(oi.soid); - return 0; -} - - -int LFNIndex::list_objects(const vector &to_list, int max_objs, - long *handle, map *out) -{ - string to_list_path = get_full_path_subdir(to_list); - DIR *dir = ::opendir(to_list_path.c_str()); - char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1]; - int r; - if (!dir) { - return -errno; - } - - if (handle && *handle) { - seekdir(dir, *handle); - } - - struct dirent *de; - int listed = 0; - bool end = false; - while (!::readdir_r(dir, reinterpret_cast(buf), &de)) { - if (!de) { - end = true; - break; - } - if (max_objs > 0 && listed >= max_objs) { - break; - } - if (de->d_name[0] == '.') - continue; - string short_name(de->d_name); - ghobject_t obj; - if (lfn_is_object(short_name)) { - r = lfn_translate(to_list, short_name, &obj); - if (r < 0) { - r = -errno; - goto cleanup; - } else if (r > 0) { - string long_name = lfn_generate_object_name(obj); - if 
(!lfn_must_hash(long_name)) { - assert(long_name == short_name); - } - if (index_version == HASH_INDEX_TAG) - get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj); - - out->insert(pair(short_name, obj)); - ++listed; - } else { - continue; - } - } - } - - if (handle && !end) { - *handle = telldir(dir); - } - - r = 0; - cleanup: - ::closedir(dir); - return r; -} - -int LFNIndex::list_subdirs(const vector &to_list, - vector *out) -{ - string to_list_path = get_full_path_subdir(to_list); - DIR *dir = ::opendir(to_list_path.c_str()); - char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1]; - if (!dir) - return -errno; - - struct dirent *de; - while (!::readdir_r(dir, reinterpret_cast(buf), &de)) { - if (!de) { - break; - } - string short_name(de->d_name); - string demangled_name; - if (lfn_is_subdir(short_name, &demangled_name)) { - out->push_back(demangled_name); - } - } - - ::closedir(dir); - return 0; -} - -int LFNIndex::create_path(const vector &to_create) -{ - maybe_inject_failure(); - int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777); - maybe_inject_failure(); - if (r < 0) - return -errno; - else - return 0; -} - -int LFNIndex::remove_path(const vector &to_remove) -{ - maybe_inject_failure(); - int r = ::rmdir(get_full_path_subdir(to_remove).c_str()); - maybe_inject_failure(); - if (r < 0) - return -errno; - else - return 0; -} - -int LFNIndex::path_exists(const vector &to_check, int *exists) -{ - string full_path = get_full_path_subdir(to_check); - struct stat buf; - if (::stat(full_path.c_str(), &buf)) { - int r = -errno; - if (r == -ENOENT) { - *exists = 0; - return 0; - } else { - return r; - } - } else { - *exists = 1; - return 0; - } -} - -int LFNIndex::add_attr_path(const vector &path, - const string &attr_name, - bufferlist &attr_value) -{ - string full_path = get_full_path_subdir(path); - maybe_inject_failure(); - return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(), - 
reinterpret_cast(attr_value.c_str()), - attr_value.length()); -} - -int LFNIndex::get_attr_path(const vector &path, - const string &attr_name, - bufferlist &attr_value) -{ - string full_path = get_full_path_subdir(path); - size_t size = 1024; // Initial - while (1) { - bufferptr buf(size); - int r = chain_getxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(), - reinterpret_cast(buf.c_str()), - size); - if (r > 0) { - buf.set_length(r); - attr_value.push_back(buf); - break; - } else { - r = -errno; - if (r == -ERANGE) { - size *= 2; - } else { - return r; - } - } - } - return 0; -} - -int LFNIndex::remove_attr_path(const vector &path, - const string &attr_name) -{ - string full_path = get_full_path_subdir(path); - string mangled_attr_name = mangle_attr_name(attr_name); - maybe_inject_failure(); - return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str()); -} - -string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid) -{ - char s[FILENAME_MAX_LEN]; - char *end = s + sizeof(s); - char *t = s; - - assert(oid.generation == ghobject_t::NO_GEN); - const char *i = oid.hobj.oid.name.c_str(); - // Escape subdir prefix - if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { - *t++ = '\\'; - *t++ = 'd'; - i += 4; - } - while (*i && t < end) { - if (*i == '\\') { - *t++ = '\\'; - *t++ = '\\'; - } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading . 
- *t++ = '\\'; - *t++ = '.'; - } else if (*i == '/') { - *t++ = '\\'; - *t++ = 's'; - } else - *t++ = *i; - i++; - } - - if (oid.hobj.snap == CEPH_NOSNAP) - t += snprintf(t, end - t, "_head"); - else if (oid.hobj.snap == CEPH_SNAPDIR) - t += snprintf(t, end - t, "_snapdir"); - else - t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap); - snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); - - return string(s); -} - -static void append_escaped(string::const_iterator begin, - string::const_iterator end, - string *out) -{ - for (string::const_iterator i = begin; i != end; ++i) { - if (*i == '\\') { - out->append("\\\\"); - } else if (*i == '/') { - out->append("\\s"); - } else if (*i == '_') { - out->append("\\u"); - } else if (*i == '\0') { - out->append("\\n"); - } else { - out->append(i, i+1); - } - } -} - -string LFNIndex::lfn_generate_object_name(const ghobject_t &oid) -{ - if (index_version == HASH_INDEX_TAG) - return lfn_generate_object_name_keyless(oid); - if (index_version == HASH_INDEX_TAG_2) - return lfn_generate_object_name_poolless(oid); - - string full_name; - string::const_iterator i = oid.hobj.oid.name.begin(); - if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { - full_name.append("\\d"); - i += 4; - } else if (oid.hobj.oid.name[0] == '.') { - full_name.append("\\."); - ++i; - } - append_escaped(i, oid.hobj.oid.name.end(), &full_name); - full_name.append("_"); - append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name); - full_name.append("_"); - - char buf[PATH_MAX]; - char *t = buf; - char *end = t + sizeof(buf); - if (oid.hobj.snap == CEPH_NOSNAP) - t += snprintf(t, end - t, "head"); - else if (oid.hobj.snap == CEPH_SNAPDIR) - t += snprintf(t, end - t, "snapdir"); - else - t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); - snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); - full_name += string(buf); - 
full_name.append("_"); - - append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name); - full_name.append("_"); - - t = buf; - end = t + sizeof(buf); - if (oid.hobj.pool == -1) - t += snprintf(t, end - t, "none"); - else - t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool); - full_name += string(buf); - - if (oid.generation != ghobject_t::NO_GEN || - oid.shard_id != shard_id_t::NO_SHARD) { - full_name.append("_"); - - t = buf; - end = t + sizeof(buf); - t += snprintf(t, end - t, "%llx", (long long unsigned)oid.generation); - full_name += string(buf); - - full_name.append("_"); - - t = buf; - end = t + sizeof(buf); - t += snprintf(t, end - t, "%x", (int)oid.shard_id); - full_name += string(buf); - } - - return full_name; -} - -string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid) -{ - if (index_version == HASH_INDEX_TAG) - return lfn_generate_object_name_keyless(oid); - - assert(oid.generation == ghobject_t::NO_GEN); - string full_name; - string::const_iterator i = oid.hobj.oid.name.begin(); - if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { - full_name.append("\\d"); - i += 4; - } else if (oid.hobj.oid.name[0] == '.') { - full_name.append("\\."); - ++i; - } - append_escaped(i, oid.hobj.oid.name.end(), &full_name); - full_name.append("_"); - append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name); - full_name.append("_"); - - char snap_with_hash[PATH_MAX]; - char *t = snap_with_hash; - char *end = t + sizeof(snap_with_hash); - if (oid.hobj.snap == CEPH_NOSNAP) - t += snprintf(t, end - t, "head"); - else if (oid.hobj.snap == CEPH_SNAPDIR) - t += snprintf(t, end - t, "snapdir"); - else - t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); - snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); - full_name += string(snap_with_hash); - return full_name; -} - -int LFNIndex::lfn_get_name(const vector &path, - const ghobject_t &oid, - string 
*mangled_name, string *out_path, - int *hardlink) -{ - string subdir_path = get_full_path_subdir(path); - string full_name = lfn_generate_object_name(oid); - int r; - - if (!lfn_must_hash(full_name)) { - if (mangled_name) - *mangled_name = full_name; - if (out_path) - *out_path = get_full_path(path, full_name); - if (hardlink) { - struct stat buf; - string full_path = get_full_path(path, full_name); - maybe_inject_failure(); - r = ::stat(full_path.c_str(), &buf); - if (r < 0) { - if (errno == ENOENT) - *hardlink = 0; - else - return -errno; - } else { - *hardlink = buf.st_nlink; - } - } - return 0; - } - - int i = 0; - string candidate; - string candidate_path; - char buf[FILENAME_MAX_LEN + 1]; - for ( ; ; ++i) { - candidate = lfn_get_short_name(oid, i); - candidate_path = get_full_path(path, candidate); - r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), - buf, sizeof(buf)); - if (r < 0) { - if (errno != ENODATA && errno != ENOENT) - return -errno; - if (errno == ENODATA) { - // Left over from incomplete transaction, it'll be replayed - maybe_inject_failure(); - r = ::unlink(candidate_path.c_str()); - maybe_inject_failure(); - if (r < 0) - return -errno; - } - if (mangled_name) - *mangled_name = candidate; - if (out_path) - *out_path = candidate_path; - if (hardlink) - *hardlink = 0; - return 0; - } - assert(r > 0); - buf[MIN((int)sizeof(buf) - 1, r)] = '\0'; - if (!strcmp(buf, full_name.c_str())) { - if (mangled_name) - *mangled_name = candidate; - if (out_path) - *out_path = candidate_path; - if (hardlink) { - struct stat st; - r = ::stat(candidate_path.c_str(), &st); - *hardlink = st.st_nlink; - } - return 0; - } - r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(), - buf, sizeof(buf)); - if (r > 0) { - // only consider alt name if nlink > 1 - struct stat st; - int rc = ::stat(candidate_path.c_str(), &st); - if (rc < 0) - return -errno; - if (st.st_nlink <= 1) { - // left over from incomplete unlink, remove - 
maybe_inject_failure(); - dout(20) << __func__ << " found extra alt attr for " << candidate_path - << ", long name " << string(buf, r) << dendl; - rc = chain_removexattr(candidate_path.c_str(), - get_alt_lfn_attr().c_str()); - maybe_inject_failure(); - if (rc < 0) - return rc; - continue; - } - buf[MIN((int)sizeof(buf) - 1, r)] = '\0'; - if (!strcmp(buf, full_name.c_str())) { - dout(20) << __func__ << " used alt attr for " << full_name << dendl; - if (mangled_name) - *mangled_name = candidate; - if (out_path) - *out_path = candidate_path; - if (hardlink) - *hardlink = st.st_nlink; - return 0; - } - } - } - assert(0); // Unreachable - return 0; -} - -int LFNIndex::lfn_created(const vector &path, - const ghobject_t &oid, - const string &mangled_name) -{ - if (!lfn_is_hashed_filename(mangled_name)) - return 0; - string full_path = get_full_path(path, mangled_name); - string full_name = lfn_generate_object_name(oid); - maybe_inject_failure(); - - // if the main attr exists and is different, move it to the alt attr. 
- char buf[FILENAME_MAX_LEN + 1]; - int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), - buf, sizeof(buf)); - if (r >= 0 && (r != (int)full_name.length() || - memcmp(buf, full_name.c_str(), full_name.length()))) { - dout(20) << __func__ << " " << mangled_name - << " moving old name to alt attr " - << string(buf, r) - << ", new name is " << full_name << dendl; - r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(), - buf, r); - if (r < 0) - return r; - } - - return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(), - full_name.c_str(), full_name.size()); -} - -int LFNIndex::lfn_unlink(const vector &path, - const ghobject_t &oid, - const string &mangled_name) -{ - if (!lfn_is_hashed_filename(mangled_name)) { - string full_path = get_full_path(path, mangled_name); - maybe_inject_failure(); - int r = ::unlink(full_path.c_str()); - maybe_inject_failure(); - if (r < 0) - return -errno; - return 0; - } - string subdir_path = get_full_path_subdir(path); - - - int i = 0; - for ( ; ; ++i) { - string candidate = lfn_get_short_name(oid, i); - if (candidate == mangled_name) - break; - } - int removed_index = i; - ++i; - for ( ; ; ++i) { - struct stat buf; - string to_check = lfn_get_short_name(oid, i); - string to_check_path = get_full_path(path, to_check); - int r = ::stat(to_check_path.c_str(), &buf); - if (r < 0) { - if (errno == ENOENT) { - break; - } else { - return -errno; - } - } - } - string full_path = get_full_path(path, mangled_name); - int fd = ::open(full_path.c_str(), O_RDONLY); - if (fd < 0) - return -errno; - FDCloser f(fd); - if (i == removed_index + 1) { - maybe_inject_failure(); - int r = ::unlink(full_path.c_str()); - maybe_inject_failure(); - if (r < 0) - return -errno; - } else { - string& rename_to = full_path; - string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1)); - maybe_inject_failure(); - int r = ::rename(rename_from.c_str(), rename_to.c_str()); - maybe_inject_failure(); - if (r < 0) - return 
-errno; - } - struct stat st; - int r = ::fstat(fd, &st); - if (r == 0 && st.st_nlink > 0) { - // remove alt attr - dout(20) << __func__ << " removing alt attr from " << full_path << dendl; - fsync_dir(path); - chain_fremovexattr(fd, get_alt_lfn_attr().c_str()); - } - return r; -} - -int LFNIndex::lfn_translate(const vector &path, - const string &short_name, - ghobject_t *out) -{ - if (!lfn_is_hashed_filename(short_name)) { - return lfn_parse_object_name(short_name, out); - } - // Get lfn_attr - string full_path = get_full_path(path, short_name); - char attr[PATH_MAX]; - int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1); - if (r < 0) - return -errno; - if (r < (int)sizeof(attr)) - attr[r] = '\0'; - - string long_name(attr); - return lfn_parse_object_name(long_name, out); -} - -bool LFNIndex::lfn_is_object(const string &short_name) -{ - return lfn_is_hashed_filename(short_name) || !lfn_is_subdir(short_name, 0); -} - -bool LFNIndex::lfn_is_subdir(const string &name, string *demangled) -{ - if (name.substr(0, SUBDIR_PREFIX.size()) == SUBDIR_PREFIX) { - if (demangled) - *demangled = demangle_path_component(name); - return 1; - } - return 0; -} - -static int parse_object(const char *s, ghobject_t& o) -{ - const char *hash = s + strlen(s) - 1; - while (*hash != '_' && - hash > s) - hash--; - const char *bar = hash - 1; - while (*bar != '_' && - bar > s) - bar--; - if (*bar == '_') { - char buf[bar-s + 1]; - char *t = buf; - const char *i = s; - while (i < bar) { - if (*i == '\\') { - i++; - switch (*i) { - case '\\': *t++ = '\\'; break; - case '.': *t++ = '.'; break; - case 's': *t++ = '/'; break; - case 'd': { - *t++ = 'D'; - *t++ = 'I'; - *t++ = 'R'; - *t++ = '_'; - break; - } - default: assert(0); - } - } else { - *t++ = *i; - } - i++; - } - *t = 0; - o.hobj.oid.name = string(buf, t-buf); - if (strncmp(bar+1, "head", 4) == 0) - o.hobj.snap = CEPH_NOSNAP; - else if (strncmp(bar+1, "snapdir", 7) == 0) - o.hobj.snap = CEPH_SNAPDIR; 
- else - o.hobj.snap = strtoull(bar+1, NULL, 16); - - uint32_t hobject_hash_input; - sscanf(hash, "_%X", &hobject_hash_input); - o.hobj.set_hash(hobject_hash_input); - - return 1; - } - return 0; -} - -bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out) -{ - bool r = parse_object(long_name.c_str(), *out); - int64_t pool = -1; - spg_t pg; - if (coll().is_pg_prefix(&pg)) - pool = (int64_t)pg.pgid.pool(); - out->hobj.pool = pool; - if (!r) return r; - string temp = lfn_generate_object_name(*out); - return r; -} - -static bool append_unescaped(string::const_iterator begin, - string::const_iterator end, - string *out) -{ - for (string::const_iterator i = begin; i != end; ++i) { - if (*i == '\\') { - ++i; - if (*i == '\\') - out->append("\\"); - else if (*i == 's') - out->append("/"); - else if (*i == 'n') - (*out) += '\0'; - else if (*i == 'u') - out->append("_"); - else - return false; - } else { - out->append(i, i+1); - } - } - return true; -} - -bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name, - ghobject_t *out) -{ - string name; - string key; - uint32_t hash; - snapid_t snap; - - string::const_iterator current = long_name.begin(); - if (*current == '\\') { - ++current; - if (current == long_name.end()) { - return false; - } else if (*current == 'd') { - name.append("DIR_"); - ++current; - } else if (*current == '.') { - name.append("."); - ++current; - } else { - --current; - } - } - - string::const_iterator end = current; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &name)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &key)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end == long_name.end()) - return false; - string 
snap_str(current, end); - - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end != long_name.end()) - return false; - string hash_str(current, end); - - if (snap_str == "head") - snap = CEPH_NOSNAP; - else if (snap_str == "snapdir") - snap = CEPH_SNAPDIR; - else - snap = strtoull(snap_str.c_str(), NULL, 16); - sscanf(hash_str.c_str(), "%X", &hash); - - - int64_t pool = -1; - spg_t pg; - if (coll().is_pg_prefix(&pg)) - pool = (int64_t)pg.pgid.pool(); - (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, "")); - return true; -} - - -bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out) -{ - string name; - string key; - string ns; - uint32_t hash; - snapid_t snap; - uint64_t pool; - gen_t generation = ghobject_t::NO_GEN; - shard_id_t shard_id = shard_id_t::NO_SHARD; - - if (index_version == HASH_INDEX_TAG) - return lfn_parse_object_name_keyless(long_name, out); - if (index_version == HASH_INDEX_TAG_2) - return lfn_parse_object_name_poolless(long_name, out); - - string::const_iterator current = long_name.begin(); - if (*current == '\\') { - ++current; - if (current == long_name.end()) { - return false; - } else if (*current == 'd') { - name.append("DIR_"); - ++current; - } else if (*current == '.') { - name.append("."); - ++current; - } else { - --current; - } - } - - string::const_iterator end = current; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &name)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &key)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end == long_name.end()) - return false; - string snap_str(current, end); - - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end == 
long_name.end()) - return false; - string hash_str(current, end); - - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &ns)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - string pstring(current, end); - - // Optional generation/shard_id - string genstring, shardstring; - if (end != long_name.end()) { - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end == long_name.end()) - return false; - genstring = string(current, end); - - generation = (gen_t)strtoull(genstring.c_str(), NULL, 16); - - current = ++end; - for ( ; end != long_name.end() && *end != '_'; ++end) ; - if (end != long_name.end()) - return false; - shardstring = string(current, end); - - shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16); - } - - if (snap_str == "head") - snap = CEPH_NOSNAP; - else if (snap_str == "snapdir") - snap = CEPH_SNAPDIR; - else - snap = strtoull(snap_str.c_str(), NULL, 16); - sscanf(hash_str.c_str(), "%X", &hash); - - if (pstring == "none") - pool = (uint64_t)-1; - else - pool = strtoull(pstring.c_str(), NULL, 16); - - (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id); - return true; -} - -bool LFNIndex::lfn_is_hashed_filename(const string &name) -{ - if (name.size() < (unsigned)FILENAME_SHORT_LEN) { - return 0; - } - if (name.substr(name.size() - FILENAME_COOKIE.size(), FILENAME_COOKIE.size()) - == FILENAME_COOKIE) { - return 1; - } else { - return 0; - } -} - -bool LFNIndex::lfn_must_hash(const string &long_name) -{ - return (int)long_name.size() >= FILENAME_SHORT_LEN; -} - -static inline void buf_to_hex(const unsigned char *buf, int len, char *str) -{ - int i; - str[0] = '\0'; - for (i = 0; i < len; i++) { - sprintf(&str[i*2], "%02x", (int)buf[i]); - } -} - -int LFNIndex::hash_filename(const char *filename, char *hash, int 
buf_len) -{ - if (buf_len < FILENAME_HASH_LEN + 1) - return -EINVAL; - - char buf[FILENAME_LFN_DIGEST_SIZE]; - char hex[FILENAME_LFN_DIGEST_SIZE * 2]; - - SHA1 h; - h.Update((const byte *)filename, strlen(filename)); - h.Final((byte *)buf); - - buf_to_hex((byte *)buf, (FILENAME_HASH_LEN + 1) / 2, hex); - strncpy(hash, hex, FILENAME_HASH_LEN); - hash[FILENAME_HASH_LEN] = '\0'; - return 0; -} - -void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len) -{ - char hash[FILENAME_HASH_LEN + 1]; - - assert(len >= FILENAME_SHORT_LEN + 4); - - strncpy(filename, old_filename, FILENAME_PREFIX_LEN); - filename[FILENAME_PREFIX_LEN] = '\0'; - if ((int)strlen(filename) < FILENAME_PREFIX_LEN) - return; - if (old_filename[FILENAME_PREFIX_LEN] == '\0') - return; - - hash_filename(old_filename, hash, sizeof(hash)); - int ofs = FILENAME_PREFIX_LEN; - while (1) { - int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str()); - if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs) - break; - ofs--; - } -} - -string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i) -{ - string long_name = lfn_generate_object_name(oid); - assert(lfn_must_hash(long_name)); - char buf[FILENAME_SHORT_LEN + 4]; - build_filename(long_name.c_str(), i, buf, sizeof(buf)); - return string(buf); -} - -const string &LFNIndex::get_base_path() -{ - return base_path; -} - -string LFNIndex::get_full_path_subdir(const vector &rel) -{ - string retval = get_base_path(); - for (vector::const_iterator i = rel.begin(); - i != rel.end(); - ++i) { - retval += "/"; - retval += mangle_path_component(*i); - } - return retval; -} - -string LFNIndex::get_full_path(const vector &rel, const string &name) -{ - return get_full_path_subdir(rel) + "/" + name; -} - -string LFNIndex::mangle_path_component(const string &component) -{ - return SUBDIR_PREFIX + component; -} - -string LFNIndex::demangle_path_component(const string &component) -{ - return 
component.substr(SUBDIR_PREFIX.size(), component.size() - SUBDIR_PREFIX.size()); -} - -int LFNIndex::decompose_full_path(const char *in, vector *out, - ghobject_t *oid, string *shortname) -{ - const char *beginning = in + get_base_path().size(); - const char *end = beginning; - while (1) { - end++; - beginning = end++; - for ( ; *end != '\0' && *end != '/'; ++end) ; - if (*end != '\0') { - out->push_back(demangle_path_component(string(beginning, end - beginning))); - continue; - } else { - break; - } - } - *shortname = string(beginning, end - beginning); - if (oid) { - int r = lfn_translate(*out, *shortname, oid); - if (r < 0) - return r; - } - return 0; -} - -string LFNIndex::mangle_attr_name(const string &attr) -{ - return PHASH_ATTR_PREFIX + attr; -} diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h deleted file mode 100644 index 31a100a7e8da..000000000000 --- a/src/os/LFNIndex.h +++ /dev/null @@ -1,578 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef OS_LFNINDEX_H -#define OS_LFNINDEX_H - -#include -#include -#include -#include -#include "include/memory.h" -#include - -#include "osd/osd_types.h" -#include "include/object.h" -#include "common/ceph_crypto.h" - -#include "CollectionIndex.h" - -/** - * LFNIndex also encapsulates logic for manipulating - * subdirectories of of a collection as well as the long filename - * logic. - * - * The protected methods provide machinery for derived classes to - * manipulate subdirectories and objects. - * - * The virtual methods are to be overridden to provide the actual - * hashed layout. 
- * - * User must call created when an object is created. - * - * Syncronization: Calling code must ensure that there are no object - * creations or deletions during the lifetime of a Path object (except - * of an object at that path). - * - * Unless otherwise noted, methods which return an int return 0 on sucess - * and a negative error code on failure. - */ -#define WRAP_RETRY(x) { \ - bool failed = false; \ - int r = 0; \ - init_inject_failure(); \ - while (1) { \ - try { \ - if (failed) { \ - r = cleanup(); \ - assert(r == 0); \ - } \ - { x } \ - out: \ - complete_inject_failure(); \ - return r; \ - } catch (RetryException) { \ - failed = true; \ - } catch (...) { \ - assert(0); \ - } \ - } \ - return -1; \ - } \ - - - -class LFNIndex : public CollectionIndex { - /// Hash digest output size. - static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE; - /// Length of filename hash. - static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE; - /// Max filename size. - static const int FILENAME_MAX_LEN = 4096; - /// Length of hashed filename. - static const int FILENAME_SHORT_LEN = 255; - /// Length of hashed filename prefix. - static const int FILENAME_PREFIX_LEN; - /// Length of hashed filename cookie. - static const int FILENAME_EXTRA = 4; - /// Lfn cookie value. - static const string FILENAME_COOKIE; - /// Name of LFN attribute for storing full name. - static const string LFN_ATTR; - /// Prefix for subdir index attributes. - static const string PHASH_ATTR_PREFIX; - /// Prefix for index subdirectories. - static const string SUBDIR_PREFIX; - - /// Path to Index base. 
- const string base_path; - -protected: - const uint32_t index_version; - - /// true if retry injection is enabled - struct RetryException : public exception {}; - bool error_injection_enabled; - bool error_injection_on; - double error_injection_probability; - uint64_t last_failure; - uint64_t current_failure; - void init_inject_failure() { - if (error_injection_on) { - error_injection_enabled = true; - last_failure = current_failure = 0; - } - } - void maybe_inject_failure(); - void complete_inject_failure() { - error_injection_enabled = false; - } - -private: - string lfn_attribute, lfn_alt_attribute; - coll_t collection; - -public: - /// Constructor - LFNIndex( - coll_t collection, - const char *base_path, ///< [in] path to Index root - uint32_t index_version, - double _error_injection_probability=0) - : CollectionIndex(collection), - base_path(base_path), - index_version(index_version), - error_injection_enabled(false), - error_injection_on(_error_injection_probability != 0), - error_injection_probability(_error_injection_probability), - last_failure(0), current_failure(0), - collection(collection) { - if (index_version == HASH_INDEX_TAG) { - lfn_attribute = LFN_ATTR; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "%d", index_version); - lfn_attribute = LFN_ATTR + string(buf); - lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt"; - } - } - - coll_t coll() const { return collection; } - - /// Virtual destructor - virtual ~LFNIndex() {} - - /// @see CollectionIndex - int init(); - - /// @see CollectionIndex - int cleanup() = 0; - - /// @see CollectionIndex - int created( - const ghobject_t &oid, - const char *path - ); - - /// @see CollectionIndex - int unlink( - const ghobject_t &oid - ); - - /// @see CollectionIndex - int lookup( - const ghobject_t &oid, - IndexedPath *path, - int *hardlink - ); - - /// @see CollectionIndex; - int pre_hash_collection( - uint32_t pg_num, - uint64_t expected_num_objs - ); - - /// @see CollectionIndex - int 
collection_list_partial( - const ghobject_t &start, - const ghobject_t &end, - bool sort_bitwise, - int max_count, - vector *ls, - ghobject_t *next - ); - - virtual int _split( - uint32_t match, //< [in] value to match - uint32_t bits, //< [in] bits to check - CollectionIndex* dest //< [in] destination index - ) = 0; - - /// @see CollectionIndex - int split( - uint32_t match, - uint32_t bits, - CollectionIndex* dest - ) { - WRAP_RETRY( - r = _split(match, bits, dest); - goto out; - ); - } - - -protected: - virtual int _init() = 0; - - /// Will be called upon object creation - virtual int _created( - const vector &path, ///< [in] Path to subdir. - const ghobject_t &oid, ///< [in] Object created. - const string &mangled_name ///< [in] Mangled filename. - ) = 0; - - /// Will be called to remove an object - virtual int _remove( - const vector &path, ///< [in] Path to subdir. - const ghobject_t &oid, ///< [in] Object to remove. - const string &mangled_name ///< [in] Mangled filename. - ) = 0; - - /// Return the path and mangled_name for oid. - virtual int _lookup( - const ghobject_t &oid,///< [in] Object for lookup. - vector *path, ///< [out] Path to the object. - string *mangled_name, ///< [out] Mangled filename. - int *exists ///< [out] True if the object exists. - ) = 0; - - /// Pre-hash the collection with the given pg number and - /// expected number of objects in the collection. - virtual int _pre_hash_collection( - uint32_t pg_num, - uint64_t expected_num_objs - ) = 0; - - /// @see CollectionIndex - virtual int _collection_list_partial( - const ghobject_t &start, - const ghobject_t &end, - bool sort_bitwise, - int max_count, - vector *ls, - ghobject_t *next - ) = 0; - -protected: - - /* Non-virtual utility methods */ - - /// Sync a subdirectory - int fsync_dir( - const vector &path ///< [in] Path to sync - ); ///< @return Error Code, 0 on success - - /// Link an object from from into to - int link_object( - const vector &from, ///< [in] Source subdirectory. 
- const vector &to, ///< [in] Dest subdirectory. - const ghobject_t &oid, ///< [in] Object to move. - const string &from_short_name ///< [in] Mangled filename of oid. - ); ///< @return Error Code, 0 on success - - /** - * Efficiently remove objects from a subdirectory - * - * remove_object invalidates mangled names in the directory requiring - * the mangled name of each additional object to be looked up a second - * time. remove_objects removes the need for additional lookups - * - * @param [in] dir Directory from which to remove. - * @param [in] map of objects to remove to mangle names - * @param [in,out] map of filenames to objects - * @return Error Code, 0 on success. - */ - int remove_objects( - const vector &dir, - const map &to_remove, - map *remaining - ); - - - /** - * Moves contents of from into to. - * - * Invalidates mangled names in to. If interupted, all objects will be - * present in to before objects are removed from from. Ignores EEXIST - * while linking into to. - * @return Error Code, 0 on success - */ - int move_objects( - const vector &from, ///< [in] Source subdirectory. - const vector &to ///< [in] Dest subdirectory. - ); - - /** - * Remove an object from from. - * - * Invalidates mangled names in from. - * @return Error Code, 0 on success - */ - int remove_object( - const vector &from, ///< [in] Directory from which to remove. - const ghobject_t &to_remove ///< [in] Object to remove. - ); - - /** - * Gets the filename corresponding to oid in from. - * - * The filename may differ between subdirectories. Furthermore, - * file creations ore removals in from may invalidate the name. 
- * @return Error code on failure, 0 on success - */ - int get_mangled_name( - const vector &from, ///< [in] Subdirectory - const ghobject_t &oid, ///< [in] Object - string *mangled_name, ///< [out] Filename - int *hardlink ///< [out] hardlink for this file, hardlink=0 mean no-exist - ); - - /// do move subdir from from to dest - static int move_subdir( - LFNIndex &from, ///< [in] from index - LFNIndex &dest, ///< [in] to index - const vector &path, ///< [in] path containing dir - string dir ///< [in] dir to move - ); - - /// do move object from from to dest - static int move_object( - LFNIndex &from, ///< [in] from index - LFNIndex &dest, ///< [in] to index - const vector &path, ///< [in] path to split - const pair &obj ///< [in] obj to move - ); - - /** - * Lists objects in to_list. - * - * @param [in] to_list Directory to list. - * @param [in] max_objects Max number to list. - * @param [in,out] handle Cookie for continuing the listing. - * Initialize to zero to start at the beginning of the directory. - * @param [out] out Mapping of listed object filenames to objects. - * @return Error code on failure, 0 on success - */ - int list_objects( - const vector &to_list, - int max_objects, - long *handle, - map *out - ); - - /// Lists subdirectories. - int list_subdirs( - const vector &to_list, ///< [in] Directory to list. - vector *out ///< [out] Subdirectories listed. - ); - - /// Create subdirectory. - int create_path( - const vector &to_create ///< [in] Subdirectory to create. - ); - - /// Remove subdirectory. - int remove_path( - const vector &to_remove ///< [in] Subdirectory to remove. - ); - - /// Check whether to_check exists. - int path_exists( - const vector &to_check, ///< [in] Subdirectory to check. - int *exists ///< [out] 1 if it exists, 0 else - ); - - /// Save attr_value to attr_name attribute on path. - int add_attr_path( - const vector &path, ///< [in] Path to modify. - const string &attr_name, ///< [in] Name of attribute. 
- bufferlist &attr_value ///< [in] Value to save. - ); - - /// Read into attr_value atribute attr_name on path. - int get_attr_path( - const vector &path, ///< [in] Path to read. - const string &attr_name, ///< [in] Attribute to read. - bufferlist &attr_value ///< [out] Attribute value read. - ); - - /// Remove attr from path - int remove_attr_path( - const vector &path, ///< [in] path from which to remove attr - const string &attr_name ///< [in] attr to remove - ); ///< @return Error code, 0 on success - -private: - /* lfn translation functions */ - - /** - * Gets the version specific lfn attribute tag - */ - const string &get_lfn_attr() const { - return lfn_attribute; - } - const string &get_alt_lfn_attr() const { - return lfn_alt_attribute; - } - - /** - * Gets the filename corresponsing to oid in path. - * - * @param [in] path Path in which to get filename for oid. - * @param [in] oid Object for which to get filename. - * @param [out] mangled_name Filename for oid, pass NULL if not needed. - * @param [out] full_path Fullpath for oid, pass NULL if not needed. - * @param [out] hardlink of this file, 0 mean no-exist, pass NULL if - * not needed - * @return Error Code, 0 on success. - */ - int lfn_get_name( - const vector &path, - const ghobject_t &oid, - string *mangled_name, - string *full_path, - int *hardlink - ); - - /// Adjusts path contents when oid is created at name mangled_name. - int lfn_created( - const vector &path, ///< [in] Path to adjust. - const ghobject_t &oid, ///< [in] Object created. - const string &mangled_name ///< [in] Filename of created object. - ); - - /// Removes oid from path while adjusting path contents - int lfn_unlink( - const vector &path, ///< [in] Path containing oid. - const ghobject_t &oid, ///< [in] Object to remove. - const string &mangled_name ///< [in] Filename of object to remove. - ); - - ///Transate a file into and ghobject_t. - int lfn_translate( - const vector &path, ///< [in] Path containing the file. 
- const string &short_name, ///< [in] Filename to translate. - ghobject_t *out ///< [out] Object found. - ); ///< @return Negative error code on error, 0 if not an object, 1 else - - /* manglers/demanglers */ - /// Filters object filenames - bool lfn_is_object( - const string &short_name ///< [in] Filename to check - ); ///< True if short_name is an object, false otherwise - - /// Filters subdir filenames - bool lfn_is_subdir( - const string &short_name, ///< [in] Filename to check. - string *demangled_name ///< [out] Demangled subdir name. - ); ///< @return True if short_name is a subdir, false otherwise - - /// Generate object name - string lfn_generate_object_name_keyless( - const ghobject_t &oid ///< [in] Object for which to generate. - ); ///< @return Generated object name. - - /// Generate object name - string lfn_generate_object_name_poolless( - const ghobject_t &oid ///< [in] Object for which to generate. - ); ///< @return Generated object name. - - /// Generate object name - string lfn_generate_object_name( - const ghobject_t &oid ///< [in] Object for which to generate. - ); ///< @return Generated object name. - - /// Parse object name - bool lfn_parse_object_name_keyless( - const string &long_name, ///< [in] Name to parse - ghobject_t *out ///< [out] Resulting Object - ); ///< @return True if successfull, False otherwise. - - /// Parse object name - bool lfn_parse_object_name_poolless( - const string &long_name, ///< [in] Name to parse - ghobject_t *out ///< [out] Resulting Object - ); ///< @return True if successfull, False otherwise. - - /// Parse object name - bool lfn_parse_object_name( - const string &long_name, ///< [in] Name to parse - ghobject_t *out ///< [out] Resulting Object - ); ///< @return True if successfull, False otherwise. - - /// Checks whether short_name is a hashed filename. - bool lfn_is_hashed_filename( - const string &short_name ///< [in] Name to check. - ); ///< @return True if short_name is hashed, False otherwise. 
- - /// Checks whether long_name must be hashed. - bool lfn_must_hash( - const string &long_name ///< [in] Name to check. - ); ///< @return True if long_name must be hashed, False otherwise. - - /// Generate hashed name. - string lfn_get_short_name( - const ghobject_t &oid, ///< [in] Object for which to generate. - int i ///< [in] Index of hashed name to generate. - ); ///< @return Hashed filename. - - /* other common methods */ - /// Gets the base path - const string &get_base_path(); ///< @return Index base_path - - /// Get full path the subdir - string get_full_path_subdir( - const vector &rel ///< [in] The subdir. - ); ///< @return Full path to rel. - - /// Get full path to object - string get_full_path( - const vector &rel, ///< [in] Path to object. - const string &name ///< [in] Filename of object. - ); ///< @return Fullpath to object at name in rel. - - /// Get mangled path component - string mangle_path_component( - const string &component ///< [in] Component to mangle - ); /// @return Mangled component - - /// Demangle component - string demangle_path_component( - const string &component ///< [in] Subdir name to demangle - ); ///< @return Demangled path component. - - /// Decompose full path into object name and filename. - int decompose_full_path( - const char *in, ///< [in] Full path to object. - vector *out, ///< [out] Path to object at in. - ghobject_t *oid, ///< [out] Object at in. - string *shortname ///< [out] Filename of object at in. - ); ///< @return Error Code, 0 on success. - - /// Mangle attribute name - string mangle_attr_name( - const string &attr ///< [in] Attribute to mangle. - ); ///< @return Mangled attribute name. - - /// Builds hashed filename - void build_filename( - const char *old_filename, ///< [in] Filename to convert. - int i, ///< [in] Index of hash. - char *filename, ///< [out] Resulting filename. 
- int len ///< [in] Size of buffer for filename - ); ///< @return Error Code, 0 on success - - /// Get hash of filename - int hash_filename( - const char *filename, ///< [in] Filename to hash. - char *hash, ///< [out] Hash of filename. - int len ///< [in] Size of hash buffer. - ); ///< @return Error Code, 0 on success. - - friend class TestWrapLFNIndex; -}; -typedef LFNIndex::IndexedPath IndexedPath; - -#endif diff --git a/src/os/Makefile.am b/src/os/Makefile.am index 1a6aa5739960..a5cd08deaa16 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -9,7 +9,6 @@ noinst_LIBRARIES += libos_types.a if ENABLE_SERVER libos_a_SOURCES = \ - os/chain_xattr.cc \ os/fs/FS.cc \ os/bluestore/kv.cc \ os/bluestore/Allocator.cc \ @@ -19,34 +18,35 @@ libos_a_SOURCES = \ os/bluestore/BlueStore.cc \ os/bluestore/FreelistManager.cc \ os/bluestore/StupidAllocator.cc \ + os/filestore/chain_xattr.cc \ + os/filestore/DBObjectMap.cc \ + os/filestore/FileJournal.cc \ + os/filestore/FileStore.cc \ + os/filestore/GenericFileStoreBackend.cc \ + os/filestore/HashIndex.cc \ + os/filestore/IndexManager.cc \ + os/filestore/JournalingObjectStore.cc \ + os/filestore/LFNIndex.cc \ + os/filestore/WBThrottle.cc \ os/kstore/kv.cc \ os/kstore/KStore.cc \ - os/DBObjectMap.cc \ os/GenericObjectMap.cc \ - os/FileJournal.cc \ - os/FileStore.cc \ - os/GenericFileStoreBackend.cc \ - os/HashIndex.cc \ - os/IndexManager.cc \ - os/JournalingObjectStore.cc \ - os/LFNIndex.cc \ os/MemStore.cc \ os/KeyValueStore.cc \ - os/ObjectStore.cc \ - os/WBThrottle.cc + os/ObjectStore.cc if LINUX -libos_a_SOURCES += os/BtrfsFileStoreBackend.cc +libos_a_SOURCES += os/filestore/BtrfsFileStoreBackend.cc endif if WITH_LIBXFS libos_a_SOURCES += \ os/fs/XFS.cc \ - os/XfsFileStoreBackend.cc + os/filestore/XfsFileStoreBackend.cc endif if WITH_LIBZFS -libos_a_SOURCES += os/ZFSFileStoreBackend.cc +libos_a_SOURCES += os/filestore/ZFSFileStoreBackend.cc endif libos_a_CXXFLAGS = ${AM_CXXFLAGS} -I rocksdb/include -fPIC @@ -58,8 +58,6 
@@ libos_a_LIBADD += $(LIBOS_TP) endif noinst_HEADERS += \ - os/btrfs_ioctl.h \ - os/chain_xattr.h \ os/bluestore/bluefs_types.h \ os/bluestore/bluestore_types.h \ os/bluestore/kv.h \ @@ -70,33 +68,35 @@ noinst_HEADERS += \ os/bluestore/BlueStore.h \ os/bluestore/FreelistManager.h \ os/bluestore/StupidAllocator.h \ + os/btrfs_ioctl.h \ + os/filestore/chain_xattr.h \ + os/filestore/BtrfsFileStoreBackend.h \ + os/filestore/CollectionIndex.h \ + os/filestore/DBObjectMap.h \ + os/filestore/FileJournal.h \ + os/filestore/FileStore.h \ + os/filestore/FDCache.h \ + os/filestore/GenericFileStoreBackend.h \ + os/filestore/HashIndex.h \ + os/filestore/IndexManager.h \ + os/filestore/Journal.h \ + os/filestore/JournalingObjectStore.h \ + os/filestore/LFNIndex.h \ + os/filestore/SequencerPosition.h \ + os/filestore/WBThrottle.h \ + os/filestore/XfsFileStoreBackend.h \ + os/filestore/ZFSFileStoreBackend.h os/kstore/kstore_types.h \ os/kstore/KStore.h \ os/kstore/kv.h \ - os/BtrfsFileStoreBackend.h \ - os/CollectionIndex.h \ - os/DBObjectMap.h \ os/GenericObjectMap.h \ - os/FileJournal.h \ - os/FileStore.h \ - os/FDCache.h \ os/fs/FS.h \ os/fs/XFS.h \ - os/GenericFileStoreBackend.h \ - os/HashIndex.h \ - os/IndexManager.h \ - os/Journal.h \ - os/JournalingObjectStore.h \ - os/LFNIndex.h \ os/MemStore.h \ os/KeyValueStore.h \ os/ObjectMap.h \ os/ObjectStore.h \ - os/PageSet.h \ - os/SequencerPosition.h \ - os/WBThrottle.h \ - os/XfsFileStoreBackend.h \ - os/ZFSFileStoreBackend.h + os/PageSet.h if WITH_LIBZFS libos_zfs_a_SOURCES = os/ZFS.cc diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h index e7a64a47f9db..c4efc7fbc202 100644 --- a/src/os/ObjectMap.h +++ b/src/os/ObjectMap.h @@ -15,12 +15,13 @@ #ifndef OS_KEYVALUESTORE_H #define OS_KEYVALUESTORE_H -#include "IndexManager.h" -#include "SequencerPosition.h" #include #include #include "include/memory.h" #include "kv/KeyValueDB.h" +#include "common/hobject.h" + +class SequencerPosition; /** * Encapsulates the FileStore key value 
store diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc index 32923af4b1a3..55f9f7fd01f4 100644 --- a/src/os/ObjectStore.cc +++ b/src/os/ObjectStore.cc @@ -18,7 +18,7 @@ #include "common/Formatter.h" #include "common/safe_io.h" -#include "FileStore.h" +#include "filestore/FileStore.h" #include "MemStore.h" #include "KeyValueStore.h" #if defined(HAVE_LIBAIO) diff --git a/src/os/SequencerPosition.h b/src/os/SequencerPosition.h deleted file mode 100644 index 38f11f08dc40..000000000000 --- a/src/os/SequencerPosition.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef __CEPH_OS_SEQUENCERPOSITION_H -#define __CEPH_OS_SEQUENCERPOSITION_H - -#include "include/types.h" -#include "include/cmp.h" -#include "include/encoding.h" -#include "common/Formatter.h" - -#include - -/** - * transaction and op offset - */ -struct SequencerPosition { - uint64_t seq; ///< seq - uint32_t trans; ///< transaction in that seq (0-based) - uint32_t op; ///< op in that transaction (0-based) - - SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {} - - void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); - ::encode(seq, bl); - ::encode(trans, bl); - ::encode(op, bl); - ENCODE_FINISH(bl); - } - void decode(bufferlist::iterator& p) { - DECODE_START(1, p); - ::decode(seq, p); - ::decode(trans, p); - ::decode(op, p); - DECODE_FINISH(p); - } - void dump(Formatter *f) const { - f->dump_unsigned("seq", seq); - f->dump_unsigned("trans", trans); - f->dump_unsigned("op", op); - } - static void generate_test_instances(list& o) { - o.push_back(new SequencerPosition); - o.push_back(new SequencerPosition(1, 2, 3)); - o.push_back(new SequencerPosition(4, 5, 6)); - } -}; -WRITE_CLASS_ENCODER(SequencerPosition) - -inline ostream& operator<<(ostream& out, const SequencerPosition& t) { - return out << t.seq << "." << t.trans << "." 
<< t.op; -} - -WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op) -WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op) - - -#endif diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc deleted file mode 100644 index 04c6922dfa1e..000000000000 --- a/src/os/WBThrottle.cc +++ /dev/null @@ -1,267 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "acconfig.h" - -#include "os/WBThrottle.h" -#include "common/perf_counters.h" - -WBThrottle::WBThrottle(CephContext *cct) : - cur_ios(0), cur_size(0), - cct(cct), - logger(NULL), - stopping(true), - lock("WBThrottle::lock", false, true, false, cct), - fs(XFS) -{ - { - Mutex::Locker l(lock); - set_from_conf(); - } - assert(cct); - PerfCountersBuilder b( - cct, string("WBThrottle"), - l_wbthrottle_first, l_wbthrottle_last); - b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data"); - b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data"); - b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations"); - b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations"); - b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write"); - b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries"); - logger = b.create_perf_counters(); - cct->get_perfcounters_collection()->add(logger); - for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i) - logger->set(i, 0); - - cct->_conf->add_observer(this); -} - -WBThrottle::~WBThrottle() { - assert(cct); - cct->get_perfcounters_collection()->remove(logger); - delete logger; - cct->_conf->remove_observer(this); -} - -void WBThrottle::start() -{ - { - Mutex::Locker l(lock); - stopping = false; - } - create(); -} - -void WBThrottle::stop() -{ - { - Mutex::Locker l(lock); - stopping = true; - cond.Signal(); - } - - join(); -} - -const char** WBThrottle::get_tracked_conf_keys() const -{ - static const char* KEYS[] = { - 
"filestore_wbthrottle_btrfs_bytes_start_flusher", - "filestore_wbthrottle_btrfs_bytes_hard_limit", - "filestore_wbthrottle_btrfs_ios_start_flusher", - "filestore_wbthrottle_btrfs_ios_hard_limit", - "filestore_wbthrottle_btrfs_inodes_start_flusher", - "filestore_wbthrottle_btrfs_inodes_hard_limit", - "filestore_wbthrottle_xfs_bytes_start_flusher", - "filestore_wbthrottle_xfs_bytes_hard_limit", - "filestore_wbthrottle_xfs_ios_start_flusher", - "filestore_wbthrottle_xfs_ios_hard_limit", - "filestore_wbthrottle_xfs_inodes_start_flusher", - "filestore_wbthrottle_xfs_inodes_hard_limit", - NULL - }; - return KEYS; -} - -void WBThrottle::set_from_conf() -{ - assert(lock.is_locked()); - if (fs == BTRFS) { - size_limits.first = - cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher; - size_limits.second = - cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit; - io_limits.first = - cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher; - io_limits.second = - cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit; - fd_limits.first = - cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher; - fd_limits.second = - cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit; - } else if (fs == XFS) { - size_limits.first = - cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher; - size_limits.second = - cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit; - io_limits.first = - cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher; - io_limits.second = - cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit; - fd_limits.first = - cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher; - fd_limits.second = - cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit; - } else { - assert(0 == "invalid value for fs"); - } - cond.Signal(); -} - -void WBThrottle::handle_conf_change(const md_config_t *conf, - const std::set &changed) -{ - Mutex::Locker l(lock); - for (const char** i = get_tracked_conf_keys(); *i; ++i) { - if (changed.count(*i)) { - set_from_conf(); - 
return; - } - } -} - -bool WBThrottle::get_next_should_flush( - boost::tuple *next) -{ - assert(lock.is_locked()); - assert(next); - while (!stopping && !beyond_limit()) - cond.Wait(lock); - if (stopping) - return false; - assert(!pending_wbs.empty()); - ghobject_t obj(pop_object()); - - ceph::unordered_map >::iterator i = - pending_wbs.find(obj); - *next = boost::make_tuple(obj, i->second.second, i->second.first); - pending_wbs.erase(i); - return true; -} - - -void *WBThrottle::entry() -{ - Mutex::Locker l(lock); - boost::tuple wb; - while (get_next_should_flush(&wb)) { - clearing = wb.get<0>(); - cur_ios -= wb.get<2>().ios; - logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios); - logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios); - cur_size -= wb.get<2>().size; - logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size); - logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size); - logger->dec(l_wbthrottle_inodes_dirtied); - logger->inc(l_wbthrottle_inodes_wb); - lock.Unlock(); -#ifdef HAVE_FDATASYNC - ::fdatasync(**wb.get<1>()); -#else - ::fsync(**wb.get<1>()); -#endif -#ifdef HAVE_POSIX_FADVISE - if (g_conf->filestore_fadvise && wb.get<2>().nocache) { - int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED); - assert(fa_r == 0); - } -#endif - lock.Lock(); - clearing = ghobject_t(); - cond.Signal(); - wb = boost::tuple(); - } - return 0; -} - -void WBThrottle::queue_wb( - FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len, - bool nocache) -{ - Mutex::Locker l(lock); - ceph::unordered_map >::iterator wbiter = - pending_wbs.find(hoid); - if (wbiter == pending_wbs.end()) { - wbiter = pending_wbs.insert( - make_pair(hoid, - make_pair( - PendingWB(), - fd))).first; - logger->inc(l_wbthrottle_inodes_dirtied); - } else { - remove_object(hoid); - } - - cur_ios++; - logger->inc(l_wbthrottle_ios_dirtied); - cur_size += len; - logger->inc(l_wbthrottle_bytes_dirtied, len); - - wbiter->second.first.add(nocache, len, 1); - insert_object(hoid); - if 
(beyond_limit()) - cond.Signal(); -} - -void WBThrottle::clear() -{ - Mutex::Locker l(lock); - for (ceph::unordered_map >::iterator i = - pending_wbs.begin(); - i != pending_wbs.end(); - ++i) { -#ifdef HAVE_POSIX_FADVISE - if (g_conf->filestore_fadvise && i->second.first.nocache) { - int fa_r = posix_fadvise(**i->second.second, 0, 0, POSIX_FADV_DONTNEED); - assert(fa_r == 0); - } -#endif - - } - cur_ios = cur_size = 0; - logger->set(l_wbthrottle_ios_dirtied, 0); - logger->set(l_wbthrottle_bytes_dirtied, 0); - logger->set(l_wbthrottle_inodes_dirtied, 0); - pending_wbs.clear(); - lru.clear(); - rev_lru.clear(); - cond.Signal(); -} - -void WBThrottle::clear_object(const ghobject_t &hoid) -{ - Mutex::Locker l(lock); - while (clearing == hoid) - cond.Wait(lock); - ceph::unordered_map >::iterator i = - pending_wbs.find(hoid); - if (i == pending_wbs.end()) - return; - - cur_ios -= i->second.first.ios; - logger->dec(l_wbthrottle_ios_dirtied, i->second.first.ios); - cur_size -= i->second.first.size; - logger->dec(l_wbthrottle_bytes_dirtied, i->second.first.size); - logger->dec(l_wbthrottle_inodes_dirtied); - - pending_wbs.erase(i); - remove_object(hoid); - cond.Signal(); -} - -void WBThrottle::throttle() -{ - Mutex::Locker l(lock); - while (!stopping && need_flush()) - cond.Wait(lock); -} diff --git a/src/os/WBThrottle.h b/src/os/WBThrottle.h deleted file mode 100644 index f06ec877b2d4..000000000000 --- a/src/os/WBThrottle.h +++ /dev/null @@ -1,188 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 Inktank Storage, Inc. - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef WBTHROTTLE_H -#define WBTHROTTLE_H - -#include "include/unordered_map.h" -#include -#include "include/memory.h" -#include "common/Formatter.h" -#include "common/hobject.h" -#include "include/interval_set.h" -#include "FDCache.h" -#include "common/Thread.h" -#include "common/ceph_context.h" - -class PerfCounters; -enum { - l_wbthrottle_first = 999090, - l_wbthrottle_bytes_dirtied, - l_wbthrottle_bytes_wb, - l_wbthrottle_ios_dirtied, - l_wbthrottle_ios_wb, - l_wbthrottle_inodes_dirtied, - l_wbthrottle_inodes_wb, - l_wbthrottle_last -}; - -/** - * WBThrottle - * - * Tracks, throttles, and flushes outstanding IO - */ -class WBThrottle : Thread, public md_config_obs_t { - ghobject_t clearing; - /* *_limits.first is the start_flusher limit and - * *_limits.second is the hard limit - */ - - /// Limits on unflushed bytes - pair size_limits; - - /// Limits on unflushed ios - pair io_limits; - - /// Limits on unflushed objects - pair fd_limits; - - uint64_t cur_ios; /// Currently unflushed IOs - uint64_t cur_size; /// Currently unflushed bytes - - /** - * PendingWB tracks the ios pending on an object. 
- */ - class PendingWB { - public: - bool nocache; - uint64_t size; - uint64_t ios; - PendingWB() : nocache(true), size(0), ios(0) {} - void add(bool _nocache, uint64_t _size, uint64_t _ios) { - if (!_nocache) - nocache = false; // only nocache if all writes are nocache - size += _size; - ios += _ios; - } - }; - - CephContext *cct; - PerfCounters *logger; - bool stopping; - Mutex lock; - Cond cond; - - - /** - * Flush objects in lru order - */ - list lru; - ceph::unordered_map::iterator> rev_lru; - void remove_object(const ghobject_t &oid) { - assert(lock.is_locked()); - ceph::unordered_map::iterator>::iterator iter = - rev_lru.find(oid); - if (iter == rev_lru.end()) - return; - - lru.erase(iter->second); - rev_lru.erase(iter); - } - ghobject_t pop_object() { - assert(!lru.empty()); - ghobject_t oid(lru.front()); - lru.pop_front(); - rev_lru.erase(oid); - return oid; - } - void insert_object(const ghobject_t &oid) { - assert(rev_lru.find(oid) == rev_lru.end()); - lru.push_back(oid); - rev_lru.insert(make_pair(oid, --lru.end())); - } - - ceph::unordered_map > pending_wbs; - - /// get next flush to perform - bool get_next_should_flush( - boost::tuple *next ///< [out] next to flush - ); ///< @return false if we are shutting down -public: - enum FS { - BTRFS, - XFS - }; - -private: - FS fs; - - void set_from_conf(); - bool beyond_limit() const { - if (cur_ios < io_limits.first && - pending_wbs.size() < fd_limits.first && - cur_size < size_limits.first) - return false; - else - return true; - } - bool need_flush() const { - if (cur_ios < io_limits.second && - pending_wbs.size() < fd_limits.second && - cur_size < size_limits.second) - return false; - else - return true; - } - -public: - WBThrottle(CephContext *cct); - ~WBThrottle(); - - void start(); - void stop(); - /// Set fs as XFS or BTRFS - void set_fs(FS new_fs) { - Mutex::Locker l(lock); - fs = new_fs; - set_from_conf(); - } - - /// Queue wb on oid, fd taking throttle (does not block) - void queue_wb( - FDRef fd, 
///< [in] FDRef to oid - const ghobject_t &oid, ///< [in] object - uint64_t offset, ///< [in] offset written - uint64_t len, ///< [in] length written - bool nocache ///< [in] try to clear out of cache after write - ); - - /// Clear all wb (probably due to sync) - void clear(); - - /// Clear object - void clear_object(const ghobject_t &oid); - - /// Block until there is throttle available - void throttle(); - - /// md_config_obs_t - const char** get_tracked_conf_keys() const; - void handle_conf_change(const md_config_t *conf, - const std::set &changed); - - /// Thread - void *entry(); -}; - -#endif diff --git a/src/os/XfsFileStoreBackend.cc b/src/os/XfsFileStoreBackend.cc deleted file mode 100644 index cf8bfe193886..000000000000 --- a/src/os/XfsFileStoreBackend.cc +++ /dev/null @@ -1,148 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2014 Inktank, Inc - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "XfsFileStoreBackend.h" - -#include -#include -#include -#include -#include -#include - -#include - -#include "common/errno.h" -#include "common/linux_version.h" -#include "include/assert.h" -#include "include/compat.h" - -#define dout_subsys ceph_subsys_filestore -#undef dout_prefix -#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") " - -XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs): - GenericFileStoreBackend(fs), m_has_extsize(false) { } - -/* - * Set extsize attr on a file to val. Should be a free-standing - * function, but dout_prefix expanding to a call to get_basedir_path() - * protected member function won't let it. 
- */ -int XfsFileStoreBackend::set_extsize(int fd, unsigned int val) -{ - struct fsxattr fsx; - struct stat sb; - int ret; - - if (fstat(fd, &sb) < 0) { - ret = -errno; - dout(0) << "set_extsize: fstat: " << cpp_strerror(ret) << dendl; - return ret; - } - if (!S_ISREG(sb.st_mode)) { - dout(0) << "set_extsize: invalid target file type" << dendl; - return -EINVAL; - } - - if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) { - ret = -errno; - dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(ret) << dendl; - return ret; - } - - // already set? - if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val) - return 0; - - // xfs won't change extent size if any extents are allocated - if (fsx.fsx_nextents != 0) - return 0; - - fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE; - fsx.fsx_extsize = val; - - if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) { - ret = -errno; - dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(ret) << dendl; - return ret; - } - - return 0; -} - -int XfsFileStoreBackend::detect_features() -{ - int ret; - - ret = GenericFileStoreBackend::detect_features(); - if (ret < 0) - return ret; - - // extsize? 
- int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600); - if (fd < 0) { - ret = -errno; - dout(0) << "detect_feature: failed to create test file for extsize attr: " - << cpp_strerror(ret) << dendl; - goto out; - } - if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) { - ret = -errno; - dout(0) << "detect_feature: failed to unlink test file for extsize attr: " - << cpp_strerror(ret) << dendl; - goto out_close; - } - - if (g_conf->filestore_xfs_extsize) { - ret = set_extsize(fd, 1U << 15); // a few pages - if (ret) { - ret = 0; - dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl; - goto out_close; - } - - // make sure we have 3.5 or newer, which includes this fix - // aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d - // for this set_extsize bug - // http://oss.sgi.com/bugzilla/show_bug.cgi?id=874 - int ver = get_linux_version(); - if (ver == 0) { - dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl; - m_has_extsize = false; - } else if (ver < KERNEL_VERSION(3, 5, 0)) { - dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl; - m_has_extsize = false; - } else { - dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl; - m_has_extsize = true; - } - } else { - dout(0) << "detect_feature: extsize is disabled by conf" << dendl; - } - -out_close: - TEMP_FAILURE_RETRY(::close(fd)); -out: - return ret; -} - -int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint) -{ - if (!m_has_extsize) - return -EOPNOTSUPP; - - assert(hint < UINT_MAX); - return set_extsize(fd, hint); -} diff --git a/src/os/XfsFileStoreBackend.h b/src/os/XfsFileStoreBackend.h deleted file mode 100644 index 282fc1c9ba12..000000000000 --- a/src/os/XfsFileStoreBackend.h +++ /dev/null @@ -1,36 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - 
scalable distributed file system - * - * Copyright (C) 2014 Inktank, Inc - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef CEPH_XFSFILESTOREBACKEND_H -#define CEPH_XFSFILESTOREBACKEND_H - -#include "GenericFileStoreBackend.h" - -#include "include/int_types.h" - -class XfsFileStoreBackend : public GenericFileStoreBackend { -private: - bool m_has_extsize; - int set_extsize(int fd, unsigned int val); -public: - XfsFileStoreBackend(FileStore *fs); - ~XfsFileStoreBackend() {} - const char *get_name() { - return "xfs"; - } - int detect_features(); - int set_alloc_hint(int fd, uint64_t hint); -}; - -#endif /* CEPH_XFSFILESTOREBACKEND_H */ diff --git a/src/os/ZFSFileStoreBackend.cc b/src/os/ZFSFileStoreBackend.cc deleted file mode 100644 index aa52b8d29339..000000000000 --- a/src/os/ZFSFileStoreBackend.cc +++ /dev/null @@ -1,260 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "include/int_types.h" -#include "include/types.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "include/compat.h" -#include "include/linux_fiemap.h" -#include "include/color.h" -#include "include/buffer.h" -#include "include/assert.h" - -#include -#include -#include - -#include "common/errno.h" -#include "common/config.h" -#include "common/sync_filesystem.h" - -#ifdef HAVE_LIBZFS - -#include "ZFSFileStoreBackend.h" - -#define dout_subsys ceph_subsys_filestore -#undef dout_prefix -#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") " - -ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs) : - GenericFileStoreBackend(fs), base_zh(NULL), current_zh(NULL), - m_filestore_zfs_snap(g_conf->filestore_zfs_snap) -{ - int ret = zfs.init(); - if (ret < 0) { - dout(0) << 
"ZFSFileStoreBackend: failed to init libzfs" << dendl; - return; - } - - base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM); - if (!base_zh) { - dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl; - return; - } - - update_current_zh(); -} - -ZFSFileStoreBackend::~ZFSFileStoreBackend() -{ - if (base_zh) - zfs.close(base_zh); - if (current_zh) - zfs.close(current_zh); -} - -int ZFSFileStoreBackend::update_current_zh() -{ - char path[PATH_MAX]; - snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); - ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM); - if (zh) { - char *mnt; - if (zfs.is_mounted(zh, &mnt)) { - int ret = get_current_path() == mnt; - free(mnt); - if (ret) { - current_zh = zh; - return 0; - } - } else { - int ret = zfs.mount(zh, NULL, 0); - if (ret < 0) { - ret = -errno; - dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh) - << "' got " << cpp_strerror(ret) << dendl; - return ret; - } - } - zfs.close(zh); - } else { - dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl; - return -ENOENT; - } - - zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM); - if (zh) { - if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) { - current_zh = zh; - return 0; - } - zfs.close(zh); - dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl; - } else { - dout(0) << "update_current_zh: current/ not exist" << dendl; - } - return -ENOENT; -} - -int ZFSFileStoreBackend::detect_features() -{ - if (!current_zh) - dout(0) << "detect_features: null zfs handle for current/" << dendl; - return 0; -} - -bool ZFSFileStoreBackend::can_checkpoint() -{ - return m_filestore_zfs_snap && current_zh != NULL; -} - -int ZFSFileStoreBackend::create_current() -{ - struct stat st; - int ret = ::stat(get_current_path().c_str(), &st); - if (ret == 0) { - // current/ exists - if (!S_ISDIR(st.st_mode)) { - dout(0) << "create_current: 
current/ exists but is not a directory" << dendl; - return -ENOTDIR; - } - return 0; - } else if (errno != ENOENT) { - ret = -errno; - dout(0) << "create_current: cannot stat current/ " << cpp_strerror(ret) << dendl; - return ret; - } - - char path[PATH_MAX]; - snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); - ret = zfs.create(path, ZFS::TYPE_FILESYSTEM); - if (ret < 0 && errno != EEXIST) { - ret = -errno; - dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl; - return ret; - } - - ret = update_current_zh(); - return ret; -} - -static int list_checkpoints_callback(ZFS::Handle *zh, void *data) -{ - list *ls = static_cast *>(data); - string str = ZFS::get_name(zh); - size_t pos = str.find('@'); - assert(pos != string::npos && pos + 1 != str.length()); - ls->push_back(str.substr(pos + 1)); - return 0; -} - -int ZFSFileStoreBackend::list_checkpoints(list& ls) -{ - dout(10) << "list_checkpoints:" << dendl; - if (!current_zh) - return -EINVAL; - - list snaps; - int ret = zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &snaps); - if (ret < 0) { - ret = -errno; - dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(ret) << dendl; - return ret; - } - ls.swap(snaps); - return 0; -} - -int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid) -{ - dout(10) << "create_checkpoint: '" << name << "'" << dendl; - if (!current_zh) - return -EINVAL; - - // looks like zfsonlinux doesn't flush dirty data when taking snapshot - int ret = sync_filesystem(get_current_fd()); - if (ret < 0) { - ret = -errno; - dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(ret) << dendl; - return ret; - } - - char path[PATH_MAX]; - snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); - ret = zfs.snapshot(path, false); - if (ret < 0) { - ret = -errno; - dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got" << cpp_strerror(ret) << 
dendl; - return ret; - } - if (cid) - *cid = 0; - return 0; -} - -int ZFSFileStoreBackend::rollback_to(const string& name) -{ - dout(10) << "rollback_to: '" << name << "'" << dendl; - if (!current_zh) - return -EINVAL; - - // umount current to avoid triggering online rollback deadlock - int ret; - if (zfs.is_mounted(current_zh, NULL)) { - ret = zfs.umount(current_zh, NULL, 0); - if (ret < 0) { - ret = -errno; - dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl; - } - } - - char path[PATH_MAX]; - snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); - - ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT); - if (!snap_zh) { - dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl; - return -ENOENT; - } - - ret = zfs.rollback(current_zh, snap_zh, false); - if (ret < 0) { - ret = -errno; - dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl; - } - - if (!zfs.is_mounted(current_zh, NULL)) { - int ret = zfs.mount(current_zh, NULL, 0); - if (ret < 0) { - ret = -errno; - dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl; - return ret; - } - } - - zfs.close(snap_zh); - return ret; -} - -int ZFSFileStoreBackend::destroy_checkpoint(const string& name) -{ - dout(10) << "destroy_checkpoint: '" << name << "'" << dendl; - if (!current_zh) - return -EINVAL; - - int ret = zfs.destroy_snaps(current_zh, name.c_str(), true); - if (ret < 0) { - ret = -errno; - dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl; - } - return ret; -} -#endif diff --git a/src/os/ZFSFileStoreBackend.h b/src/os/ZFSFileStoreBackend.h deleted file mode 100644 index 8186d9ca957d..000000000000 --- a/src/os/ZFSFileStoreBackend.h +++ /dev/null @@ -1,30 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 
sw=2 smarttab - -#ifndef CEPH_ZFSFILESTOREBACKEND_H -#define CEPH_ZFSFILESTOREBACKEND_H - -#ifdef HAVE_LIBZFS -#include "GenericFileStoreBackend.h" -#include "ZFS.h" - -class ZFSFileStoreBackend : public GenericFileStoreBackend { -private: - ZFS zfs; - ZFS::Handle *base_zh; - ZFS::Handle *current_zh; - bool m_filestore_zfs_snap; - int update_current_zh(); -public: - ZFSFileStoreBackend(FileStore *fs); - ~ZFSFileStoreBackend(); - int detect_features(); - bool can_checkpoint(); - int create_current(); - int list_checkpoints(list& ls); - int create_checkpoint(const string& name, uint64_t *cid); - int rollback_to(const string& name); - int destroy_checkpoint(const string& name); -}; -#endif -#endif diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc deleted file mode 100644 index 5351abf440b4..000000000000 --- a/src/os/chain_xattr.cc +++ /dev/null @@ -1,467 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "chain_xattr.h" - -#include "include/int_types.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "include/assert.h" - -#if defined(__linux__) -#include -#endif - -#include "common/xattr.h" -#include "include/compat.h" - -/* - * chaining xattrs - * - * In order to support xattrs that are larger than the xattr size limit that some file systems - * impose, we use multiple xattrs to store the value of a single xattr. The xattrs keys - * are set as follows: - * The first xattr in the chain, has a key that holds the original xattr name, with any '@' char - * being esacped ("@@"). - * The chained keys will have the first xattr's key (with the escaping), and a suffix: "@" - * where marks the num of xattr in the chain. 
- */ - -static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len) -{ - int pos = 0; - - while (*name) { - switch (*name) { - case '@': /* escape it */ - pos += 2; - assert (pos < raw_len - 1); - *raw_name = '@'; - raw_name++; - *raw_name = '@'; - break; - default: - pos++; - assert(pos < raw_len - 1); - *raw_name = *name; - break; - } - name++; - raw_name++; - } - - if (!i) { - *raw_name = '\0'; - } else { - int r = snprintf(raw_name, raw_len - pos, "@%d", i); - assert(r < raw_len - pos); - } -} - -static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first) -{ - int pos = 0; - - *is_first = true; - while (*raw_name) { - switch (*raw_name) { - case '@': /* escape it */ - raw_name++; - if (!*raw_name) - break; - if (*raw_name != '@') { - *is_first = false; - goto done; - } - - /* fall through */ - default: - *name = *raw_name; - break; - } - pos++; - assert(pos < name_len); - name++; - raw_name++; - } -done: - *name = '\0'; - return pos; -} - - -// setxattr - -static int getxattr_len(const char *fn, const char *name) -{ - int i = 0, total = 0; - char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int r; - - do { - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - r = sys_getxattr(fn, raw_name, 0, 0); - if (!i && r < 0) - return r; - if (r < 0) - break; - total += r; - i++; - } while (r == CHAIN_XATTR_MAX_BLOCK_LEN || - r == CHAIN_XATTR_SHORT_BLOCK_LEN); - - return total; -} - -int chain_getxattr(const char *fn, const char *name, void *val, size_t size) -{ - int i = 0, pos = 0; - char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int ret = 0; - int r; - size_t chunk_size; - - if (!size) - return getxattr_len(fn, name); - - do { - chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? 
size : CHAIN_XATTR_MAX_BLOCK_LEN); - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - - r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size); - if (i && r == -ENODATA) { - ret = pos; - break; - } - if (r < 0) { - ret = r; - break; - } - - if (r > 0) { - pos += r; - size -= r; - } - - i++; - } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN || - r == CHAIN_XATTR_SHORT_BLOCK_LEN)); - - if (r >= 0) { - ret = pos; - /* is there another chunk? that can happen if the last read size span over - exactly one block */ - if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN || - chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) { - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - r = sys_getxattr(fn, raw_name, 0, 0); - if (r > 0) { // there's another chunk.. the original buffer was too small - ret = -ERANGE; - } - } - } - return ret; -} - -static int chain_fgetxattr_len(int fd, const char *name) -{ - int i = 0, total = 0; - char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int r; - - do { - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - r = sys_fgetxattr(fd, raw_name, 0, 0); - if (!i && r < 0) - return r; - if (r < 0) - break; - total += r; - i++; - } while (r == CHAIN_XATTR_MAX_BLOCK_LEN || - r == CHAIN_XATTR_SHORT_BLOCK_LEN); - - return total; -} - -int chain_fgetxattr(int fd, const char *name, void *val, size_t size) -{ - int i = 0, pos = 0; - char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int ret = 0; - int r; - size_t chunk_size; - - if (!size) - return chain_fgetxattr_len(fd, name); - - do { - chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? 
size : CHAIN_XATTR_MAX_BLOCK_LEN); - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - - r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size); - if (i && r == -ENODATA) { - ret = pos; - break; - } - if (r < 0) { - ret = r; - break; - } - - if (r > 0) { - pos += r; - size -= r; - } - - i++; - } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN || - r == CHAIN_XATTR_SHORT_BLOCK_LEN)); - - if (r >= 0) { - ret = pos; - /* is there another chunk? that can happen if the last read size span over - exactly one block */ - if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN || - chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) { - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - r = sys_fgetxattr(fd, raw_name, 0, 0); - if (r > 0) { // there's another chunk.. the original buffer was too small - ret = -ERANGE; - } - } - } - return ret; -} - - -// setxattr - -static int get_xattr_block_size(size_t size) -{ - if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD) - // this may fit in the inode; stripe over short attrs so that XFS - // won't kick it out. - return CHAIN_XATTR_SHORT_BLOCK_LEN; - return CHAIN_XATTR_MAX_BLOCK_LEN; -} - -int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk) -{ - int i = 0, pos = 0; - char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int ret = 0; - size_t max_chunk_size = get_xattr_block_size(size); - - do { - size_t chunk_size = (size < max_chunk_size ? 
size : max_chunk_size); - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - size -= chunk_size; - - int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size); - if (r < 0) { - ret = r; - break; - } - pos += chunk_size; - ret = pos; - i++; - } while (size); - - if (ret >= 0 && !onechunk) { - int r; - do { - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - r = sys_removexattr(fn, raw_name); - if (r < 0 && r != -ENODATA) - ret = r; - i++; - } while (r != -ENODATA); - } - - return ret; -} - -int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk) -{ - int i = 0, pos = 0; - char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int ret = 0; - size_t max_chunk_size = get_xattr_block_size(size); - - do { - size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size); - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - size -= chunk_size; - - int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size); - if (r < 0) { - ret = r; - break; - } - pos += chunk_size; - ret = pos; - i++; - } while (size); - - if (ret >= 0 && !onechunk) { - int r; - do { - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - r = sys_fremovexattr(fd, raw_name); - if (r < 0 && r != -ENODATA) - ret = r; - i++; - } while (r != -ENODATA); - } - - return ret; -} - - -// removexattr - -int chain_removexattr(const char *fn, const char *name) -{ - int i = 0; - char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int r; - - do { - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - r = sys_removexattr(fn, raw_name); - if (!i && r < 0) { - return r; - } - i++; - } while (r >= 0); - return 0; -} - -int chain_fremovexattr(int fd, const char *name) -{ - int i = 0; - char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int r; - - do { - get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); - r = sys_fremovexattr(fd, raw_name); - if (!i && r < 0) { - return r; - } - i++; - } while (r >= 0); - return 0; -} - - -// 
listxattr - -int chain_listxattr(const char *fn, char *names, size_t len) { - int r; - - if (!len) - return sys_listxattr(fn, names, len) * 2; - - r = sys_listxattr(fn, 0, 0); - if (r < 0) - return r; - - size_t total_len = r * 2; // should be enough - char *full_buf = (char *)malloc(total_len); - if (!full_buf) - return -ENOMEM; - - r = sys_listxattr(fn, full_buf, total_len); - if (r < 0) { - free(full_buf); - return r; - } - - char *p = full_buf; - const char *end = full_buf + r; - char *dest = names; - char *dest_end = names + len; - - while (p < end) { - char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int attr_len = strlen(p); - bool is_first; - int name_len = translate_raw_name(p, name, sizeof(name), &is_first); - if (is_first) { - if (dest + name_len > dest_end) { - r = -ERANGE; - goto done; - } - strcpy(dest, name); - dest += name_len + 1; - } - p += attr_len + 1; - } - r = dest - names; - -done: - free(full_buf); - return r; -} - -int chain_flistxattr(int fd, char *names, size_t len) { - int r; - char *p; - const char * end; - char *dest; - char *dest_end; - - if (!len) - return sys_flistxattr(fd, names, len) * 2; - - r = sys_flistxattr(fd, 0, 0); - if (r < 0) - return r; - - size_t total_len = r * 2; // should be enough - char *full_buf = (char *)malloc(total_len); - if (!full_buf) - return -ENOMEM; - - r = sys_flistxattr(fd, full_buf, total_len); - if (r < 0) - goto done; - - p = full_buf; - end = full_buf + r; - dest = names; - dest_end = names + len; - - while (p < end) { - char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; - int attr_len = strlen(p); - bool is_first; - int name_len = translate_raw_name(p, name, sizeof(name), &is_first); - if (is_first) { - if (dest + name_len > dest_end) { - r = -ERANGE; - goto done; - } - strcpy(dest, name); - dest += name_len + 1; - } - p += attr_len + 1; - } - r = dest - names; - -done: - free(full_buf); - return r; -} diff --git a/src/os/chain_xattr.h b/src/os/chain_xattr.h deleted file mode 100644 index 
6ee80508d094..000000000000 --- a/src/os/chain_xattr.h +++ /dev/null @@ -1,88 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef __CEPH_OSD_CHAIN_XATTR_H -#define __CEPH_OSD_CHAIN_XATTR_H - -#include "common/xattr.h" - -#include - -#if defined(__linux__) -#include -#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2) -#elif defined(__APPLE__) -#include -#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2) -#else -#define CHAIN_XATTR_MAX_NAME_LEN 128 -#endif - -#define CHAIN_XATTR_MAX_BLOCK_LEN 2048 - -/* - * XFS will only inline xattrs < 255 bytes, so for xattrs that are - * likely to fit in the inode, stripe over short xattrs. - */ -#define CHAIN_XATTR_SHORT_BLOCK_LEN 250 -#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000 - -// wrappers to hide annoying errno handling. - -static inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size) -{ - int r = ::ceph_os_fgetxattr(fd, name, val, size); - return (r < 0 ? -errno : r); -} -static inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size) -{ - int r = ::ceph_os_getxattr(fn, name, val, size); - return (r < 0 ? -errno : r); -} - -static inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size) -{ - int r = ::ceph_os_setxattr(fn, name, val, size); - return (r < 0 ? -errno : r); -} -static inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size) -{ - int r = ::ceph_os_fsetxattr(fd, name, val, size); - return (r < 0 ? -errno : r); -} - -static inline int sys_listxattr(const char *fn, char *names, size_t len) -{ - int r = ::ceph_os_listxattr(fn, names, len); - return (r < 0 ? -errno : r); -} -static inline int sys_flistxattr(int fd, char *names, size_t len) -{ - int r = ::ceph_os_flistxattr(fd, names, len); - return (r < 0 ? 
-errno : r); -} - -static inline int sys_removexattr(const char *fn, const char *name) -{ - int r = ::ceph_os_removexattr(fn, name); - return (r < 0 ? -errno : r); -} -static inline int sys_fremovexattr(int fd, const char *name) -{ - int r = ::ceph_os_fremovexattr(fd, name); - return (r < 0 ? -errno : r); -} - - -// wrappers to chain large values across multiple xattrs - -int chain_getxattr(const char *fn, const char *name, void *val, size_t size); -int chain_fgetxattr(int fd, const char *name, void *val, size_t size); -int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk=false); -int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk=false); -int chain_listxattr(const char *fn, char *names, size_t len); -int chain_flistxattr(int fd, char *names, size_t len); -int chain_removexattr(const char *fn, const char *name); -int chain_fremovexattr(int fd, const char *name); - -#endif diff --git a/src/os/filestore/BtrfsFileStoreBackend.cc b/src/os/filestore/BtrfsFileStoreBackend.cc new file mode 100644 index 000000000000..19a6e353aab3 --- /dev/null +++ b/src/os/filestore/BtrfsFileStoreBackend.cc @@ -0,0 +1,578 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/int_types.h" +#include "include/types.h" + +#include +#include +#include +#include +#include +#include +#include +#include "include/compat.h" +#include "include/linux_fiemap.h" +#include "include/color.h" +#include "include/buffer.h" +#include "include/assert.h" + +#ifndef __CYGWIN__ +#include "os/btrfs_ioctl.h" +#endif + +#include +#include +#include + +#include "BtrfsFileStoreBackend.h" + +#include "common/errno.h" +#include "common/config.h" + +#if defined(__linux__) + +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") " + +#define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) +#define ALIGNED(x, by) (!((x) % (by))) +#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by))) + +BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs): + GenericFileStoreBackend(fs), has_clone_range(false), + has_snap_create(false), has_snap_destroy(false), + has_snap_create_v2(false), has_wait_sync(false), stable_commits(false), + m_filestore_btrfs_clone_range(g_conf->filestore_btrfs_clone_range), + m_filestore_btrfs_snap (g_conf->filestore_btrfs_snap) { } + +int BtrfsFileStoreBackend::detect_features() +{ + int r; + + r = GenericFileStoreBackend::detect_features(); + if (r < 0) + return r; + + // clone_range? 
+ if (m_filestore_btrfs_clone_range) { + int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY, 0600); + if (fd >= 0) { + if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) { + r = -errno; + dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: " + << cpp_strerror(r) << dendl; + } + btrfs_ioctl_clone_range_args clone_args; + memset(&clone_args, 0, sizeof(clone_args)); + clone_args.src_fd = -1; + r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args); + if (r < 0 && errno == EBADF) { + dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl; + has_clone_range = true; + } else { + r = -errno; + dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl; + } + TEMP_FAILURE_RETRY(::close(fd)); + } else { + r = -errno; + dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: " + << cpp_strerror(r) << dendl; + } + } else { + dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl; + } + + struct btrfs_ioctl_vol_args vol_args; + memset(&vol_args, 0, sizeof(vol_args)); + + // create test source volume + vol_args.fd = 0; + strcpy(vol_args.name, "test_subvol"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args); + if (r != 0) { + r = -errno; + dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl; + } + int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY); + if (srcfd < 0) { + r = -errno; + dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl; + } + + // snap_create and snap_destroy? 
+ vol_args.fd = srcfd; + strcpy(vol_args.name, "sync_snap_test"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); + int err = errno; + if (r == 0 || errno == EEXIST) { + dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl; + has_snap_create = true; + + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r == 0) { + dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl; + has_snap_destroy = true; + } else { + err = -errno; + dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl; + + if (err == -EPERM && getuid() != 0) { + dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl; + cerr << TEXT_YELLOW + << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed" + << TEXT_NORMAL << std::endl; + } else if (err == -EOPNOTSUPP) { + derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl; + } + } + } else { + dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl; + } + + if (m_filestore_btrfs_snap) { + if (has_snap_destroy) + stable_commits = true; + else + dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl; + } + + // start_sync? + __u64 transid = 0; + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid); + if (r < 0) { + int err = errno; + dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl; + } + if (r == 0 && transid > 0) { + dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl; + + // do we have wait_sync too? 
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid); + if (r == 0 || errno == ERANGE) { + dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl; + has_wait_sync = true; + } else { + int err = errno; + dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl; + } + } else { + int err = errno; + dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl; + } + + if (has_wait_sync) { + // async snap creation? + struct btrfs_ioctl_vol_args_v2 async_args; + memset(&async_args, 0, sizeof(async_args)); + async_args.fd = srcfd; + async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC; + strcpy(async_args.name, "async_snap_test"); + + // remove old one, first + struct stat st; + strcpy(vol_args.name, async_args.name); + if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) { + dout(0) << "detect_feature: removing old async_snap_test" << dendl; + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r != 0) { + int err = errno; + dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl; + } + } + + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args); + if (r == 0 || errno == EEXIST) { + dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl; + has_snap_create_v2 = true; + + // clean up + strcpy(vol_args.name, "async_snap_test"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r != 0) { + int err = errno; + dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl; + } + } else { + int err = errno; + dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl; + } + } + + // clean up test subvol + if (srcfd >= 0) + TEMP_FAILURE_RETRY(::close(srcfd)); + + strcpy(vol_args.name, "test_subvol"); + r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (r < 0) { + r = -errno; + dout(0) << "detect_feature: failed to remove " 
<< vol_args.name << ": " << cpp_strerror(r) << dendl; + } + + if (m_filestore_btrfs_snap && !has_snap_create_v2) { + dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n" + << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n" + << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n" + << " performance.\n" + << TEXT_NORMAL; + } + + return 0; +} + +bool BtrfsFileStoreBackend::can_checkpoint() +{ + return stable_commits; +} + +int BtrfsFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << "create_current: current/ exists but is not a directory" << dendl; + return -EINVAL; + } + + struct stat basest; + struct statfs currentfs; + ret = ::fstat(get_basedir_fd(), &basest); + if (ret < 0) { + ret = -errno; + dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl; + return ret; + } + ret = ::statfs(get_current_path().c_str(), ¤tfs); + if (ret < 0) { + ret = -errno; + dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl; + return ret; + } + if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) { + dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl; + stable_commits = true; + } + return 0; + } + + struct btrfs_ioctl_vol_args volargs; + memset(&volargs, 0, sizeof(volargs)); + + volargs.fd = 0; + strcpy(volargs.name, "current"); + if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) { + ret = -errno; + dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error " + << cpp_strerror(ret) << dendl; + return ret; + } + + dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl; + if 
(::chmod(get_current_path().c_str(), 0755) < 0) { + ret = -errno; + dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: " + << cpp_strerror(ret) << dendl; + return ret; + } + + stable_commits = true; + return 0; +} + +int BtrfsFileStoreBackend::list_checkpoints(list& ls) +{ + int ret, err = 0; + + struct stat basest; + ret = ::fstat(get_basedir_fd(), &basest); + if (ret < 0) { + ret = -errno; + dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl; + return ret; + } + + // get snap list + DIR *dir = ::opendir(get_basedir_path().c_str()); + if (!dir) { + ret = -errno; + dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: " + << cpp_strerror(ret) << dendl; + return ret; + } + + list snaps; + char path[PATH_MAX]; + char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1]; + struct dirent *de; + while (::readdir_r(dir, (struct dirent *)&buf, &de) == 0) { + if (!de) + break; + + snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name); + + struct stat st; + ret = ::stat(path, &st); + if (ret < 0) { + err = -errno; + dout(0) << "list_checkpoints: stat '" << path << "' failed: " + << cpp_strerror(err) << dendl; + break; + } + + if (!S_ISDIR(st.st_mode)) + continue; + + struct statfs fs; + ret = ::statfs(path, &fs); + if (ret < 0) { + err = -errno; + dout(0) << "list_checkpoints: statfs '" << path << "' failed: " + << cpp_strerror(err) << dendl; + break; + } + + if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) + snaps.push_back(string(de->d_name)); + } + + if (::closedir(dir) < 0) { + ret = -errno; + dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl; + if (!err) + err = ret; + } + + if (err) + return err; + + ls.swap(snaps); + return 0; +} + +int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid) +{ + dout(10) << "create_checkpoint: '" << name << "'" << dendl; + if (has_snap_create_v2 && transid) 
{ + struct btrfs_ioctl_vol_args_v2 async_args; + memset(&async_args, 0, sizeof(async_args)); + async_args.fd = get_current_fd(); + async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC; + + size_t name_size = sizeof(async_args.name); + strncpy(async_args.name, name.c_str(), name_size); + async_args.name[name_size-1] = '\0'; + + int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args); + if (r < 0) { + r = -errno; + dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl; + return r; + } + dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl; + *transid = async_args.transid; + } else { + struct btrfs_ioctl_vol_args vol_args; + memset(&vol_args, 0, sizeof(vol_args)); + vol_args.fd = get_current_fd(); + + size_t name_size = sizeof(vol_args.name); + strncpy(vol_args.name, name.c_str(), name_size); + vol_args.name[name_size-1] = '\0'; + + int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); + if (r < 0) { + r = -errno; + dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl; + return r; + } + if (transid) + *transid = 0; + } + return 0; +} + +int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid) +{ + // wait for commit + dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl; + int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid); + if (ret < 0) { + ret = -errno; + dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl; + return -errno; + } + dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl; + return 0; +} + +int BtrfsFileStoreBackend::rollback_to(const string& name) +{ + dout(10) << "rollback_to: to '" << name << "'" << dendl; + char s[PATH_MAX]; + btrfs_ioctl_vol_args vol_args; + + memset(&vol_args, 0, sizeof(vol_args)); + vol_args.fd = 0; + strcpy(vol_args.name, "current"); + + int ret = ::ioctl(get_basedir_fd(), 
BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (ret && errno != ENOENT) { + dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl; + snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand()); + if (::rename(get_current_path().c_str(), s)) { + ret = -errno; + dout(0) << "rollback_to: error renaming old current subvol: " + << cpp_strerror(ret) << dendl; + return ret; + } + } + + snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str()); + + // roll back + vol_args.fd = ::open(s, O_RDONLY); + if (vol_args.fd < 0) { + ret = -errno; + dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl; + return ret; + } + ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args); + if (ret < 0 ) { + ret = -errno; + dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl; + } + TEMP_FAILURE_RETRY(::close(vol_args.fd)); + return ret; +} + +int BtrfsFileStoreBackend::destroy_checkpoint(const string& name) +{ + dout(10) << "destroy_checkpoint: '" << name << "'" << dendl; + btrfs_ioctl_vol_args vol_args; + memset(&vol_args, 0, sizeof(vol_args)); + vol_args.fd = 0; + strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name)); + + int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args); + if (ret) { + ret = -errno; + dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl; + return ret; + } + return 0; +} + +int BtrfsFileStoreBackend::syncfs() +{ + dout(15) << "syncfs" << dendl; + // do a full btrfs commit + int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC); + if (ret < 0) { + ret = -errno; + dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl; + } + return ret; +} + +int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl; + size_t blk_size = get_blksize(); + 
if (!has_clone_range || + srcoff % blk_size != dstoff % blk_size) { + dout(20) << "clone_range: using copy" << dendl; + return _copy_range(from, to, srcoff, len, dstoff); + } + + int err = 0; + int r = 0; + + uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size); + uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size); + if (srcoffclone >= srcoff + len) { + dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl; + return _copy_range(from, to, srcoff, len, dstoff); + } + + uint64_t lenclone = len - (srcoffclone - srcoff); + if (!ALIGNED(lenclone, blk_size)) { + struct stat from_stat, to_stat; + err = ::fstat(from, &from_stat); + if (err) return -errno; + err = ::fstat(to , &to_stat); + if (err) return -errno; + + if (srcoff + len != (uint64_t)from_stat.st_size || + dstoff + len < (uint64_t)to_stat.st_size) { + // Not to the end of the file, need to align length as well + lenclone = ALIGN_DOWN(lenclone, blk_size); + } + } + if (lenclone == 0) { + // too short + return _copy_range(from, to, srcoff, len, dstoff); + } + + dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone + << " to " << dstoffclone << " = " << r << dendl; + btrfs_ioctl_clone_range_args a; + a.src_fd = from; + a.src_offset = srcoffclone; + a.src_length = lenclone; + a.dest_offset = dstoffclone; + err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a); + if (err >= 0) { + r += err; + } else if (errno == EINVAL) { + // Still failed, might be compressed + dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl; + return _copy_range(from, to, srcoff, len, dstoff); + } else { + return -errno; + } + + // Take care any trimmed from front + if (srcoffclone != srcoff) { + err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff); + if (err >= 0) { + r += err; + } else { + return err; + } + } + + // Copy end + if (srcoffclone + lenclone != srcoff + len) { + err = _copy_range(from, to, + srcoffclone + lenclone, + (srcoff + len) - (srcoffclone + 
lenclone), + dstoffclone + lenclone); + if (err >= 0) { + r += err; + } else { + return err; + } + } + dout(20) << "clone_range: finished " << srcoff << "~" << len + << " to " << dstoff << " = " << r << dendl; + return r; +} +#endif diff --git a/src/os/filestore/BtrfsFileStoreBackend.h b/src/os/filestore/BtrfsFileStoreBackend.h new file mode 100644 index 000000000000..9bc878f77676 --- /dev/null +++ b/src/os/filestore/BtrfsFileStoreBackend.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_BTRFSFILESTOREBACKEDN_H +#define CEPH_BTRFSFILESTOREBACKEDN_H + +#if defined(__linux__) +#include "GenericFileStoreBackend.h" + +class BtrfsFileStoreBackend : public GenericFileStoreBackend { +private: + bool has_clone_range; ///< clone range ioctl is supported + bool has_snap_create; ///< snap create ioctl is supported + bool has_snap_destroy; ///< snap destroy ioctl is supported + bool has_snap_create_v2; ///< snap create v2 ioctl (async!) 
is supported + bool has_wait_sync; ///< wait sync ioctl is supported + bool stable_commits; + bool m_filestore_btrfs_clone_range; + bool m_filestore_btrfs_snap; +public: + BtrfsFileStoreBackend(FileStore *fs); + ~BtrfsFileStoreBackend() {} + const char *get_name() { + return "btrfs"; + } + int detect_features(); + bool can_checkpoint(); + int create_current(); + int list_checkpoints(list& ls); + int create_checkpoint(const string& name, uint64_t *cid); + int sync_checkpoint(uint64_t cid); + int rollback_to(const string& name); + int destroy_checkpoint(const string& name); + int syncfs(); + int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); +}; +#endif +#endif diff --git a/src/os/filestore/CollectionIndex.h b/src/os/filestore/CollectionIndex.h new file mode 100644 index 000000000000..a9947cce531d --- /dev/null +++ b/src/os/filestore/CollectionIndex.h @@ -0,0 +1,199 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef OS_COLLECTIONINDEX_H +#define OS_COLLECTIONINDEX_H + +#include +#include +#include "include/memory.h" + +#include "osd/osd_types.h" +#include "include/object.h" +#include "common/RWLock.h" + +/** + * CollectionIndex provides an interface for manipulating indexed collections + */ +class CollectionIndex { +protected: + /** + * Object encapsulating a returned path. + * + * A path to an object (existent or non-existent) becomes invalid + * when a different object is created in the index. Path stores + * a shared_ptr to the CollectionIndex to keep the index alive + * during its lifetime. 
+ * @see IndexManager + * @see self_ref + * @see set_ref + */ + class Path { + public: + /// Returned path + string full_path; + /// Ref to parent Index + CollectionIndex* parent_ref; + /// coll_t for parent Index + coll_t parent_coll; + + /// Normal Constructor + Path( + string path, ///< [in] Path to return. + CollectionIndex* ref) + : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {} + + /// Debugging Constructor + Path( + string path, ///< [in] Path to return. + coll_t coll) ///< [in] collection + : full_path(path), parent_coll(coll) {} + + /// Getter for the stored path. + const char *path() const { return full_path.c_str(); } + + /// Getter for collection + coll_t coll() const { return parent_coll; } + + /// Getter for parent + CollectionIndex* get_index() const { + return parent_ref; + } + }; + public: + + string access_lock_name; + RWLock access_lock; + /// Type of returned paths + typedef ceph::shared_ptr IndexedPath; + + static IndexedPath get_testing_path(string path, coll_t collection) { + return IndexedPath(new Path(path, collection)); + } + + static const uint32_t FLAT_INDEX_TAG = 0; + static const uint32_t HASH_INDEX_TAG = 1; + static const uint32_t HASH_INDEX_TAG_2 = 2; + static const uint32_t HOBJECT_WITH_POOL = 3; + /** + * For tracking Filestore collection versions. + * + * @return Collection version represented by the Index implementation + */ + virtual uint32_t collection_version() = 0; + + /** + * Returns the collection managed by this CollectionIndex + */ + virtual coll_t coll() const = 0; + + + /** + * Initializes the index. + * + * @return Error Code, 0 for success + */ + virtual int init() = 0; + + /** + * Cleanup before replaying journal + * + * Index implemenations may need to perform compound operations + * which may leave the collection unstable if interupted. cleanup + * is called on mount to allow the CollectionIndex implementation + * to stabilize. 
+ * + * @see HashIndex + * @return Error Code, 0 for success + */ + virtual int cleanup() = 0; + + /** + * Call when a file is created using a path returned from lookup. + * + * @return Error Code, 0 for success + */ + virtual int created( + const ghobject_t &oid, ///< [in] Created object. + const char *path ///< [in] Path to created object. + ) = 0; + + /** + * Removes oid from the collection + * + * @return Error Code, 0 for success + */ + virtual int unlink( + const ghobject_t &oid ///< [in] Object to remove + ) = 0; + + /** + * Gets the IndexedPath for oid. + * + * @return Error Code, 0 for success + */ + virtual int lookup( + const ghobject_t &oid, ///< [in] Object to lookup + IndexedPath *path, ///< [out] Path to object + int *hardlink ///< [out] number of hard links of this object. *hardlink=0 mean object no-exist. + ) = 0; + + /** + * Moves objects matching @e match in the lsb @e bits + * + * dest and this must be the same subclass + * + * @return Error Code, 0 for success + */ + virtual int split( + uint32_t match, //< [in] value to match + uint32_t bits, //< [in] bits to check + CollectionIndex* dest //< [in] destination index + ) { assert(0); return 0; } + + + /// List contents of collection by hash + virtual int collection_list_partial( + const ghobject_t &start, ///< [in] object at which to start + const ghobject_t &end, ///< [in] list only objects < end + bool sort_bitwise, ///< [in] use bitwise sort + int max_count, ///< [in] return at most max_count objects + vector *ls, ///< [out] Listed objects + ghobject_t *next ///< [out] Next object to list + ) = 0; + + /// Call prior to removing directory + virtual int prep_delete() { return 0; } + + CollectionIndex(coll_t collection): + access_lock_name ("CollectionIndex::access_lock::" + collection.to_str()), + access_lock(access_lock_name.c_str()) {} + + /* + * Pre-hash the collection, this collection should map to a PG folder. + * + * @param pg_num - pg number of the pool this collection belongs to. 
+ * @param expected_num_objs - expected number of objects in this collection. + * @Return 0 on success, an error code otherwise. + */ + virtual int pre_hash_collection( + uint32_t pg_num, ///< [in] pg number of the pool this collection belongs to + uint64_t expected_num_objs ///< [in] expected number of objects this collection has + ) { assert(0); return 0; } + + /// Virtual destructor + virtual ~CollectionIndex() {} +}; + +#endif diff --git a/src/os/filestore/DBObjectMap.cc b/src/os/filestore/DBObjectMap.cc new file mode 100644 index 000000000000..04340439c7a9 --- /dev/null +++ b/src/os/filestore/DBObjectMap.cc @@ -0,0 +1,1264 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include "include/int_types.h" +#include "include/buffer.h" + +#include +#include +#include +#include +#include "include/memory.h" +#include + +#include "os/ObjectMap.h" +#include "kv/KeyValueDB.h" +#include "DBObjectMap.h" +#include + +#include "common/debug.h" +#include "common/config.h" +#include "include/assert.h" + +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "filestore " + +const string DBObjectMap::USER_PREFIX = "_USER_"; +const string DBObjectMap::XATTR_PREFIX = "_AXATTR_"; +const string DBObjectMap::SYS_PREFIX = "_SYS_"; +const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_"; +const string DBObjectMap::HEADER_KEY = "HEADER"; +const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER"; +const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER"; +const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_"; + +// Legacy +const string DBObjectMap::LEAF_PREFIX = "_LEAF_"; +const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_"; + +static void append_escaped(const string &in, string *out) +{ + for (string::const_iterator i = in.begin(); i != in.end(); ++i) { + if (*i == '%') { + out->push_back('%'); + out->push_back('p'); + } else if (*i == '.') { + out->push_back('%'); + out->push_back('e'); + } else if (*i == 
'_') { + out->push_back('%'); + out->push_back('u'); + } else { + out->push_back(*i); + } + } +} + +bool DBObjectMap::check(std::ostream &out) +{ + bool retval = true; + map parent_to_num_children; + map parent_to_actual_num_children; + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + _Header header; + assert(header.num_children == 1); + header.num_children = 0; // Hack for leaf node + bufferlist bl = iter->value(); + while (true) { + bufferlist::iterator bliter = bl.begin(); + header.decode(bliter); + if (header.seq != 0) + parent_to_actual_num_children[header.seq] = header.num_children; + if (header.parent == 0) + break; + + if (!parent_to_num_children.count(header.parent)) + parent_to_num_children[header.parent] = 0; + parent_to_num_children[header.parent]++; + if (parent_to_actual_num_children.count(header.parent)) + break; + + set to_get; + map got; + to_get.insert(HEADER_KEY); + db->get(sys_parent_prefix(header), to_get, &got); + if (got.empty()) { + out << "Missing: seq " << header.parent << std::endl; + retval = false; + break; + } else { + bl = got.begin()->second; + } + } + } + + for (map::iterator i = parent_to_num_children.begin(); + i != parent_to_num_children.end(); + parent_to_num_children.erase(i++)) { + if (!parent_to_actual_num_children.count(i->first)) + continue; + if (parent_to_actual_num_children[i->first] != i->second) { + out << "Invalid: seq " << i->first << " recorded children: " + << parent_to_actual_num_children[i->first] << " found: " + << i->second << std::endl; + retval = false; + } + parent_to_actual_num_children.erase(i->first); + } + return retval; +} + +string DBObjectMap::ghobject_key(const ghobject_t &oid) +{ + string out; + append_escaped(oid.hobj.oid.name, &out); + out.push_back('.'); + append_escaped(oid.hobj.get_key(), &out); + out.push_back('.'); + append_escaped(oid.hobj.nspace, &out); + out.push_back('.'); + + char snap_with_hash[1000]; + char *t 
= snap_with_hash; + char *end = t + sizeof(snap_with_hash); + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); + + if (oid.hobj.pool == -1) + t += snprintf(t, end - t, ".none"); + else + t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool); + t += snprintf(t, end - t, ".%.*X", (int)(sizeof(uint32_t)*2), oid.hobj.get_hash()); + + if (oid.generation != ghobject_t::NO_GEN || + oid.shard_id != shard_id_t::NO_SHARD) { + t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation); + t += snprintf(t, end - t, ".%x", (int)oid.shard_id); + } + out += string(snap_with_hash); + return out; +} + +// ok: pglog%u3%efs1...0.none.0017B237 +// bad: plana8923501-10...4c.3.ffffffffffffffff.2 +// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2 +// returns 0 for false, 1 for true, negative for error +int DBObjectMap::is_buggy_ghobject_key_v1(const string &in) +{ + int dots = 5; // skip 5 .'s + const char *s = in.c_str(); + do { + while (*s && *s != '.') + ++s; + if (!*s) { + derr << "unexpected null at " << (int)(s-in.c_str()) << dendl; + return -EINVAL; + } + ++s; + } while (*s && --dots); + if (!*s) { + derr << "unexpected null at " << (int)(s-in.c_str()) << dendl; + return -EINVAL; + } + // we are now either at a hash value (32 bits, 8 chars) or a generation + // value (64 bits) '.' and shard id. count the dots! + int len = 0; + while (*s && *s != '.') { + ++s; + ++len; + } + if (*s == '\0') { + if (len != 8) { + derr << "hash value is not 8 chars" << dendl; + return -EINVAL; // the hash value is always 8 chars. + } + return 0; + } + if (*s != '.') { // the shard follows. + derr << "missing final . 
and shard id at " << (int)(s-in.c_str()) << dendl; + return -EINVAL; + } + return 1; +} + + +string DBObjectMap::map_header_key(const ghobject_t &oid) +{ + return ghobject_key(oid); +} + +string DBObjectMap::header_key(uint64_t seq) +{ + char buf[100]; + snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq); + return string(buf); +} + +string DBObjectMap::complete_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX; +} + +string DBObjectMap::user_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + USER_PREFIX; +} + +string DBObjectMap::sys_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + SYS_PREFIX; +} + +string DBObjectMap::xattr_prefix(Header header) +{ + return USER_PREFIX + header_key(header->seq) + XATTR_PREFIX; +} + +string DBObjectMap::sys_parent_prefix(_Header header) +{ + return USER_PREFIX + header_key(header.parent) + SYS_PREFIX; +} + +int DBObjectMap::DBObjectMapIteratorImpl::init() +{ + invalid = false; + if (ready) { + return 0; + } + assert(!parent_iter); + if (header->parent) { + Header parent = map->lookup_parent(header); + if (!parent) { + assert(0); + return -EINVAL; + } + parent_iter.reset(new DBObjectMapIteratorImpl(map, parent)); + } + key_iter = map->db->get_iterator(map->user_prefix(header)); + assert(key_iter); + complete_iter = map->db->get_iterator(map->complete_prefix(header)); + assert(complete_iter); + cur_iter = key_iter; + assert(cur_iter); + ready = true; + return 0; +} + +ObjectMap::ObjectMapIterator DBObjectMap::get_iterator( + const ghobject_t &oid) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return ObjectMapIterator(new EmptyIteratorImpl()); + DBObjectMapIterator iter = _get_iterator(header); + iter->hlock.swap(hl); + return iter; +} + +int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first() +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->seek_to_first(); + if (r 
< 0) + return r; + } + r = key_iter->seek_to_first(); + if (r < 0) + return r; + return adjust(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last() +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->seek_to_last(); + if (r < 0) + return r; + if (parent_iter->valid()) + r = parent_iter->next(); + if (r < 0) + return r; + } + r = key_iter->seek_to_last(); + if (r < 0) + return r; + if (key_iter->valid()) + r = key_iter->next(); + if (r < 0) + return r; + return adjust(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to) +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->lower_bound(to); + if (r < 0) + return r; + } + r = key_iter->lower_bound(to); + if (r < 0) + return r; + return adjust(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after) +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->upper_bound(after); + if (r < 0) + return r; + } + r = key_iter->upper_bound(after); + if (r < 0) + return r; + return adjust(); +} + +bool DBObjectMap::DBObjectMapIteratorImpl::valid() +{ + bool valid = !invalid && ready; + assert(!valid || cur_iter->valid()); + return valid; +} + +bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent() +{ + if (parent_iter && parent_iter->valid() && + (!key_iter->valid() || key_iter->key() > parent_iter->key())) + return true; + return false; +} + +int DBObjectMap::DBObjectMapIteratorImpl::next(bool validate) +{ + assert(cur_iter->valid()); + assert(valid()); + cur_iter->next(); + return adjust(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::next_parent() +{ + if (!parent_iter || !parent_iter->valid()) { + invalid = true; + return 0; + } + r = next(); + if (r < 0) + return r; + if (!valid() || on_parent() || !parent_iter->valid()) + return 0; + + return lower_bound(parent_iter->key()); +} + +int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test, + string *begin, + string *end) +{ + 
complete_iter->upper_bound(to_test); + if (complete_iter->valid()) + complete_iter->prev(); + else + complete_iter->seek_to_last(); + + if (!complete_iter->valid()) + return false; + + string _end; + if (begin) + *begin = complete_iter->key(); + _end = string(complete_iter->value().c_str()); + if (end) + *end = _end; + return (to_test >= complete_iter->key()) && (!_end.size() || _end > to_test); +} + +/** + * Moves parent_iter to the next position both out of the complete_region and + * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and + * less than key_iter and key_iter otherwise. + */ +int DBObjectMap::DBObjectMapIteratorImpl::adjust() +{ + string begin, end; + while (parent_iter && parent_iter->valid()) { + if (in_complete_region(parent_iter->key(), &begin, &end)) { + if (end.size() == 0) { + parent_iter->seek_to_last(); + if (parent_iter->valid()) + parent_iter->next(); + } else + parent_iter->lower_bound(end); + } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) { + parent_iter->next(); + } else { + break; + } + } + if (valid_parent()) { + cur_iter = parent_iter; + } else if (key_iter->valid()) { + cur_iter = key_iter; + } else { + invalid = true; + } + assert(invalid || cur_iter->valid()); + return 0; +} + + +string DBObjectMap::DBObjectMapIteratorImpl::key() +{ + return cur_iter->key(); +} + +bufferlist DBObjectMap::DBObjectMapIteratorImpl::value() +{ + return cur_iter->value(); +} + +int DBObjectMap::DBObjectMapIteratorImpl::status() +{ + return r; +} + +int DBObjectMap::set_keys(const ghobject_t &oid, + const map &set, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_create_map_header(hl, oid, t); + if (!header) + return -EINVAL; + if (check_spos(oid, header, spos)) + return 0; + + t->set(user_prefix(header), set); + + return db->submit_transaction(t); +} + +int DBObjectMap::set_header(const ghobject_t &oid, + const 
bufferlist &bl, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_create_map_header(hl, oid, t); + if (!header) + return -EINVAL; + if (check_spos(oid, header, spos)) + return 0; + _set_header(header, bl, t); + return db->submit_transaction(t); +} + +void DBObjectMap::_set_header(Header header, const bufferlist &bl, + KeyValueDB::Transaction t) +{ + map to_set; + to_set[USER_HEADER_KEY] = bl; + t->set(sys_prefix(header), to_set); +} + +int DBObjectMap::get_header(const ghobject_t &oid, + bufferlist *bl) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) { + return 0; + } + return _get_header(header, bl); +} + +int DBObjectMap::_get_header(Header header, + bufferlist *bl) +{ + map out; + while (true) { + out.clear(); + set to_get; + to_get.insert(USER_HEADER_KEY); + int r = db->get(sys_prefix(header), to_get, &out); + if (r == 0 && !out.empty()) + break; + if (r < 0) + return r; + Header current(header); + if (!current->parent) + break; + header = lookup_parent(current); + } + + if (!out.empty()) + bl->swap(out.begin()->second); + return 0; +} + +int DBObjectMap::clear(const ghobject_t &oid, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + if (check_spos(oid, header, spos)) + return 0; + remove_map_header(hl, oid, header, t); + assert(header->num_children > 0); + header->num_children--; + int r = _clear(header, t); + if (r < 0) + return r; + return db->submit_transaction(t); +} + +int DBObjectMap::_clear(Header header, + KeyValueDB::Transaction t) +{ + while (1) { + if (header->num_children) { + set_header(header, t); + break; + } + clear_header(header, t); + if (!header->parent) + break; + Header parent = lookup_parent(header); + if (!parent) { + return -EINVAL; + } + 
assert(parent->num_children > 0); + parent->num_children--; + header.swap(parent); + } + return 0; +} + +int DBObjectMap::merge_new_complete(Header header, + const map &new_complete, + DBObjectMapIterator iter, + KeyValueDB::Transaction t) +{ + KeyValueDB::Iterator complete_iter = db->get_iterator( + complete_prefix(header) + ); + map::const_iterator i = new_complete.begin(); + set to_remove; + map to_add; + + string begin, end; + while (i != new_complete.end()) { + string new_begin = i->first; + string new_end = i->second; + int r = iter->in_complete_region(new_begin, &begin, &end); + if (r < 0) + return r; + if (r) { + to_remove.insert(begin); + new_begin = begin; + } + ++i; + while (i != new_complete.end()) { + if (!new_end.size() || i->first <= new_end) { + if (!new_end.size() && i->second > new_end) { + new_end = i->second; + } + ++i; + continue; + } + + r = iter->in_complete_region(new_end, &begin, &end); + if (r < 0) + return r; + if (r) { + to_remove.insert(begin); + new_end = end; + continue; + } + break; + } + bufferlist bl; + bl.append(bufferptr(new_end.c_str(), new_end.size() + 1)); + to_add.insert(make_pair(new_begin, bl)); + } + t->rmkeys(complete_prefix(header), to_remove); + t->set(complete_prefix(header), to_add); + return 0; +} + +int DBObjectMap::copy_up_header(Header header, + KeyValueDB::Transaction t) +{ + bufferlist bl; + int r = _get_header(header, &bl); + if (r < 0) + return r; + + _set_header(header, bl, t); + return 0; +} + +int DBObjectMap::need_parent(DBObjectMapIterator iter) +{ + int r = iter->seek_to_first(); + if (r < 0) + return r; + + if (!iter->valid()) + return 0; + + string begin, end; + if (iter->in_complete_region(iter->key(), &begin, &end) && end == "") { + return 0; + } + return 1; +} + +int DBObjectMap::rm_keys(const ghobject_t &oid, + const set &to_clear, + const SequencerPosition *spos) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + 
KeyValueDB::Transaction t = db->get_transaction(); + if (check_spos(oid, header, spos)) + return 0; + t->rmkeys(user_prefix(header), to_clear); + if (!header->parent) { + return db->submit_transaction(t); + } + + // Copy up keys from parent around to_clear + int keep_parent; + { + DBObjectMapIterator iter = _get_iterator(header); + iter->seek_to_first(); + map new_complete; + map to_write; + for(set::const_iterator i = to_clear.begin(); + i != to_clear.end(); + ) { + unsigned copied = 0; + iter->lower_bound(*i); + ++i; + if (!iter->valid()) + break; + string begin = iter->key(); + if (!iter->on_parent()) + iter->next_parent(); + if (new_complete.size() && new_complete.rbegin()->second == begin) { + begin = new_complete.rbegin()->first; + } + while (iter->valid() && copied < 20) { + if (!to_clear.count(iter->key())) + to_write[iter->key()].append(iter->value()); + if (i != to_clear.end() && *i <= iter->key()) { + ++i; + copied = 0; + } + + iter->next_parent(); + copied++; + } + if (iter->valid()) { + new_complete[begin] = iter->key(); + } else { + new_complete[begin] = ""; + break; + } + } + t->set(user_prefix(header), to_write); + merge_new_complete(header, new_complete, iter, t); + keep_parent = need_parent(iter); + if (keep_parent < 0) + return keep_parent; + } + if (!keep_parent) { + copy_up_header(header, t); + Header parent = lookup_parent(header); + if (!parent) + return -EINVAL; + parent->num_children--; + _clear(parent, t); + header->parent = 0; + set_map_header(hl, oid, *header, t); + t->rmkeys_by_prefix(complete_prefix(header)); + } + return db->submit_transaction(t); +} + +int DBObjectMap::clear_keys_header(const ghobject_t &oid, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + if (check_spos(oid, header, spos)) + return 0; + + // save old attrs + KeyValueDB::Iterator iter = 
db->get_iterator(xattr_prefix(header)); + if (!iter) + return -EINVAL; + map attrs; + for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next()) + attrs.insert(make_pair(iter->key(), iter->value())); + if (iter->status()) + return iter->status(); + + // remove current header + remove_map_header(hl, oid, header, t); + assert(header->num_children > 0); + header->num_children--; + int r = _clear(header, t); + if (r < 0) + return r; + + // create new header + Header newheader = generate_new_header(oid, Header()); + set_map_header(hl, oid, *newheader, t); + if (!attrs.empty()) + t->set(xattr_prefix(newheader), attrs); + return db->submit_transaction(t); +} + +int DBObjectMap::get(const ghobject_t &oid, + bufferlist *_header, + map *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + _get_header(header, _header); + ObjectMapIterator iter = _get_iterator(header); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + if (iter->status()) + return iter->status(); + out->insert(make_pair(iter->key(), iter->value())); + } + return 0; +} + +int DBObjectMap::get_keys(const ghobject_t &oid, + set *keys) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + ObjectMapIterator iter = _get_iterator(header); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + if (iter->status()) + return iter->status(); + keys->insert(iter->key()); + } + return 0; +} + +int DBObjectMap::scan(Header header, + const set &in_keys, + set *out_keys, + map *out_values) +{ + ObjectMapIterator db_iter = _get_iterator(header); + for (set::const_iterator key_iter = in_keys.begin(); + key_iter != in_keys.end(); + ++key_iter) { + db_iter->lower_bound(*key_iter); + if (db_iter->status()) + return db_iter->status(); + if (db_iter->valid() && db_iter->key() == *key_iter) { + if (out_keys) + out_keys->insert(*key_iter); + if (out_values) + 
out_values->insert(make_pair(db_iter->key(), db_iter->value())); + } + } + return 0; +} + +int DBObjectMap::get_values(const ghobject_t &oid, + const set &keys, + map *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + return scan(header, keys, 0, out); +} + +int DBObjectMap::check_keys(const ghobject_t &oid, + const set &keys, + set *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + return scan(header, keys, out, 0); +} + +int DBObjectMap::get_xattrs(const ghobject_t &oid, + const set &to_get, + map *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + return db->get(xattr_prefix(header), to_get, out); +} + +int DBObjectMap::get_all_xattrs(const ghobject_t &oid, + set *out) +{ + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header)); + if (!iter) + return -EINVAL; + for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next()) + out->insert(iter->key()); + return iter->status(); +} + +int DBObjectMap::set_xattrs(const ghobject_t &oid, + const map &to_set, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_create_map_header(hl, oid, t); + if (!header) + return -EINVAL; + if (check_spos(oid, header, spos)) + return 0; + t->set(xattr_prefix(header), to_set); + return db->submit_transaction(t); +} + +int DBObjectMap::remove_xattrs(const ghobject_t &oid, + const set &to_remove, + const SequencerPosition *spos) +{ + KeyValueDB::Transaction t = db->get_transaction(); + MapHeaderLock hl(this, oid); + Header header = lookup_map_header(hl, oid); + if (!header) + return -ENOENT; + if (check_spos(oid, header, spos)) + return 0; + 
t->rmkeys(xattr_prefix(header), to_remove); + return db->submit_transaction(t); +} + +int DBObjectMap::clone(const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos) +{ + if (oid == target) + return 0; + + MapHeaderLock _l1(this, MIN_GHOBJ(oid, target, true)); + MapHeaderLock _l2(this, MAX_GHOBJ(oid, target, true)); + MapHeaderLock *lsource, *ltarget; + if (cmp_bitwise(oid, target) > 0) { + lsource = &_l2; + ltarget= &_l1; + } else { + lsource = &_l1; + ltarget= &_l2; + } + + KeyValueDB::Transaction t = db->get_transaction(); + { + Header destination = lookup_map_header(*ltarget, target); + if (destination) { + remove_map_header(*ltarget, target, destination, t); + if (check_spos(target, destination, spos)) + return 0; + destination->num_children--; + _clear(destination, t); + } + } + + Header parent = lookup_map_header(*lsource, oid); + if (!parent) + return db->submit_transaction(t); + + Header source = generate_new_header(oid, parent); + Header destination = generate_new_header(target, parent); + if (spos) + destination->spos = *spos; + + parent->num_children = 2; + set_header(parent, t); + set_map_header(*lsource, oid, *source, t); + set_map_header(*ltarget, target, *destination, t); + + map to_set; + KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent)); + for (xattr_iter->seek_to_first(); + xattr_iter->valid(); + xattr_iter->next()) + to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value())); + t->set(xattr_prefix(source), to_set); + t->set(xattr_prefix(destination), to_set); + t->rmkeys_by_prefix(xattr_prefix(parent)); + return db->submit_transaction(t); +} + +int DBObjectMap::upgrade_to_v2() +{ + dout(1) << __func__ << " start" << dendl; + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + iter->seek_to_first(); + while (iter->valid()) { + unsigned count = 0; + KeyValueDB::Transaction t = db->get_transaction(); + set remove; + map add; + for (; + iter->valid() && count < 300; + iter->next()) 
{ + dout(20) << __func__ << " key is " << iter->key() << dendl; + int r = is_buggy_ghobject_key_v1(iter->key()); + if (r < 0) { + derr << __func__ << " bad key '" << iter->key() << "'" << dendl; + return r; + } + if (!r) { + dout(20) << __func__ << " " << iter->key() << " ok" << dendl; + continue; + } + + // decode header to get oid + _Header hdr; + bufferlist bl = iter->value(); + bufferlist::iterator bliter = bl.begin(); + hdr.decode(bliter); + + string newkey(ghobject_key(hdr.oid)); + dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl; + add[newkey] = iter->value(); + remove.insert(iter->key()); + ++count; + } + + if (!remove.empty()) { + dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl; + t->rmkeys(HOBJECT_TO_SEQ, remove); + t->set(HOBJECT_TO_SEQ, add); + int r = db->submit_transaction(t); + if (r < 0) + return r; + } + } + + state.v = 2; + + Mutex::Locker l(header_lock); + KeyValueDB::Transaction t = db->get_transaction(); + write_state(t); + db->submit_transaction_sync(t); + dout(1) << __func__ << " done" << dendl; + return 0; +} + +int DBObjectMap::init(bool do_upgrade) +{ + map result; + set to_get; + to_get.insert(GLOBAL_STATE_KEY); + int r = db->get(SYS_PREFIX, to_get, &result); + if (r < 0) + return r; + if (!result.empty()) { + bufferlist::iterator bliter = result.begin()->second.begin(); + state.decode(bliter); + if (state.v < 1) { + dout(1) << "DBObjectMap is *very* old; upgrade to an older version first" + << dendl; + return -ENOTSUP; + } + if (state.v < 2) { // Needs upgrade + if (!do_upgrade) { + dout(1) << "DOBjbectMap requires an upgrade," + << " set filestore_update_to" + << dendl; + return -ENOTSUP; + } else { + r = upgrade_to_v2(); + if (r < 0) + return r; + } + } + } else { + // New store + state.v = 2; + state.seq = 1; + } + dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl; + return 0; +} + +int DBObjectMap::sync(const ghobject_t *oid, + const SequencerPosition *spos) { + 
KeyValueDB::Transaction t = db->get_transaction(); + if (oid) { + assert(spos); + MapHeaderLock hl(this, *oid); + Header header = lookup_map_header(hl, *oid); + if (header) { + dout(10) << "oid: " << *oid << " setting spos to " + << *spos << dendl; + header->spos = *spos; + set_map_header(hl, *oid, *header, t); + } + /* It may appear that this and the identical portion of the else + * block can combined below, but in this block, the transaction + * must be submitted under *both* the MapHeaderLock and the full + * header_lock. + * + * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891. + */ + Mutex::Locker l(header_lock); + write_state(t); + return db->submit_transaction_sync(t); + } else { + Mutex::Locker l(header_lock); + write_state(t); + return db->submit_transaction_sync(t); + } +} + +int DBObjectMap::write_state(KeyValueDB::Transaction _t) { + assert(header_lock.is_locked_by_me()); + dout(20) << "dbobjectmap: seq is " << state.seq << dendl; + KeyValueDB::Transaction t = _t ? _t : db->get_transaction(); + bufferlist bl; + state.encode(bl); + map to_write; + to_write[GLOBAL_STATE_KEY] = bl; + t->set(SYS_PREFIX, to_write); + return _t ? 
0 : db->submit_transaction(t); +} + + +DBObjectMap::Header DBObjectMap::_lookup_map_header( + const MapHeaderLock &l, + const ghobject_t &oid) +{ + assert(l.get_locked() == oid); + + _Header *header = new _Header(); + { + Mutex::Locker l(cache_lock); + if (caches.lookup(oid, header)) { + assert(!in_use.count(header->seq)); + in_use.insert(header->seq); + return Header(header, RemoveOnDelete(this)); + } + } + + bufferlist out; + int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out); + if (r < 0 || out.length()==0) { + delete header; + return Header(); + } + + Header ret(header, RemoveOnDelete(this)); + bufferlist::iterator iter = out.begin(); + + ret->decode(iter); + { + Mutex::Locker l(cache_lock); + caches.add(oid, *ret); + } + + assert(!in_use.count(header->seq)); + in_use.insert(header->seq); + return ret; +} + +DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid, + Header parent) +{ + Header header = Header(new _Header(), RemoveOnDelete(this)); + header->seq = state.seq++; + if (parent) { + header->parent = parent->seq; + header->spos = parent->spos; + } + header->num_children = 1; + header->oid = oid; + assert(!in_use.count(header->seq)); + in_use.insert(header->seq); + + write_state(); + return header; +} + +DBObjectMap::Header DBObjectMap::lookup_parent(Header input) +{ + Mutex::Locker l(header_lock); + while (in_use.count(input->parent)) + header_cond.Wait(header_lock); + map out; + set keys; + keys.insert(HEADER_KEY); + + dout(20) << "lookup_parent: parent " << input->parent + << " for seq " << input->seq << dendl; + int r = db->get(sys_parent_prefix(input), keys, &out); + if (r < 0) { + assert(0); + return Header(); + } + if (out.empty()) { + assert(0); + return Header(); + } + + Header header = Header(new _Header(), RemoveOnDelete(this)); + header->seq = input->parent; + bufferlist::iterator iter = out.begin()->second.begin(); + header->decode(iter); + dout(20) << "lookup_parent: parent seq is " << header->seq << " with 
parent " + << header->parent << dendl; + in_use.insert(header->seq); + return header; +} + +DBObjectMap::Header DBObjectMap::lookup_create_map_header( + const MapHeaderLock &hl, + const ghobject_t &oid, + KeyValueDB::Transaction t) +{ + Mutex::Locker l(header_lock); + Header header = _lookup_map_header(hl, oid); + if (!header) { + header = _generate_new_header(oid, Header()); + set_map_header(hl, oid, *header, t); + } + return header; +} + +void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t) +{ + dout(20) << "clear_header: clearing seq " << header->seq << dendl; + t->rmkeys_by_prefix(user_prefix(header)); + t->rmkeys_by_prefix(sys_prefix(header)); + t->rmkeys_by_prefix(complete_prefix(header)); + t->rmkeys_by_prefix(xattr_prefix(header)); + set keys; + keys.insert(header_key(header->seq)); + t->rmkeys(USER_PREFIX, keys); +} + +void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t) +{ + dout(20) << "set_header: setting seq " << header->seq << dendl; + map to_write; + header->encode(to_write[HEADER_KEY]); + t->set(sys_prefix(header), to_write); +} + +void DBObjectMap::remove_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, + Header header, + KeyValueDB::Transaction t) +{ + assert(l.get_locked() == oid); + dout(20) << "remove_map_header: removing " << header->seq + << " oid " << oid << dendl; + set to_remove; + to_remove.insert(map_header_key(oid)); + t->rmkeys(HOBJECT_TO_SEQ, to_remove); + { + Mutex::Locker l(cache_lock); + caches.clear(oid); + } +} + +void DBObjectMap::set_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, _Header header, + KeyValueDB::Transaction t) +{ + assert(l.get_locked() == oid); + dout(20) << "set_map_header: setting " << header.seq + << " oid " << oid << " parent seq " + << header.parent << dendl; + map to_set; + header.encode(to_set[map_header_key(oid)]); + t->set(HOBJECT_TO_SEQ, to_set); + { + Mutex::Locker l(cache_lock); + caches.add(oid, header); + } +} + +bool 
DBObjectMap::check_spos(const ghobject_t &oid, + Header header, + const SequencerPosition *spos) +{ + if (!spos || *spos > header->spos) { + stringstream out; + if (spos) + dout(10) << "oid: " << oid << " not skipping op, *spos " + << *spos << dendl; + else + dout(10) << "oid: " << oid << " not skipping op, *spos " + << "empty" << dendl; + dout(10) << " > header.spos " << header->spos << dendl; + return false; + } else { + dout(10) << "oid: " << oid << " skipping op, *spos " << *spos + << " <= header.spos " << header->spos << dendl; + return true; + } +} + +int DBObjectMap::list_objects(vector *out) +{ + KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + bufferlist bl = iter->value(); + bufferlist::iterator bliter = bl.begin(); + _Header header; + header.decode(bliter); + out->push_back(header.oid); + } + return 0; +} diff --git a/src/os/filestore/DBObjectMap.h b/src/os/filestore/DBObjectMap.h new file mode 100644 index 000000000000..1b5748548232 --- /dev/null +++ b/src/os/filestore/DBObjectMap.h @@ -0,0 +1,534 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +#ifndef DBOBJECTMAP_DB_H +#define DBOBJECTMAP_DB_H + +#include "include/buffer_fwd.h" +#include +#include +#include + +#include +#include "include/memory.h" +#include + +#include "os/ObjectMap.h" +#include "kv/KeyValueDB.h" +#include "osd/osd_types.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/simple_cache.hpp" +#include + +#include "SequencerPosition.h" + +/** + * DBObjectMap: Implements ObjectMap in terms of KeyValueDB + * + * Prefix space structure: + * + * @see complete_prefix + * @see user_prefix + * @see sys_prefix + * + * - GHOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->hobj.seq and + * corresponding omap header + * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number + * @see State + * @see write_state + * @see init + * @see generate_new_header + * - USER_PREFIX 
+ header_key(header->seq) + USER_PREFIX + * : key->value for header->seq + * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below + * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs + * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX + * : USER_HEADER_KEY - omap header for header->seq + * : HEADER_KEY - encoding of header for header->seq + * + * For each node (represented by a header), we + * store three mappings: the key mapping, the complete mapping, and the parent. + * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in + * this mapping indicates that the key mapping contains all entries on [x,y). + * Note, max string is represented by "", so ""->"" indicates that the parent + * is unnecessary (@see rm_keys). When looking up a key not contained in the + * the complete set, we have to check the parent if we don't find it in the + * key set. During rm_keys, we copy keys from the parent and update the + * complete set to reflect the change @see rm_keys. 
+ */ +class DBObjectMap : public ObjectMap { +public: + boost::scoped_ptr db; + + /** + * Serializes access to next_seq as well as the in_use set + */ + Mutex header_lock; + Cond header_cond; + Cond map_header_cond; + + /** + * Set of headers currently in use + */ + set in_use; + set map_header_in_use; + + /** + * Takes the map_header_in_use entry in constructor, releases in + * destructor + */ + class MapHeaderLock { + DBObjectMap *db; + boost::optional locked; + + MapHeaderLock(const MapHeaderLock &); + MapHeaderLock &operator=(const MapHeaderLock &); + public: + MapHeaderLock(DBObjectMap *db) : db(db) {} + MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) { + Mutex::Locker l(db->header_lock); + while (db->map_header_in_use.count(*locked)) + db->map_header_cond.Wait(db->header_lock); + db->map_header_in_use.insert(*locked); + } + + const ghobject_t &get_locked() const { + assert(locked); + return *locked; + } + + void swap(MapHeaderLock &o) { + assert(db == o.db); + + // centos6's boost optional doesn't seem to have swap :( + boost::optional _locked = o.locked; + o.locked = locked; + locked = _locked; + } + + ~MapHeaderLock() { + if (locked) { + Mutex::Locker l(db->header_lock); + assert(db->map_header_in_use.count(*locked)); + db->map_header_cond.Signal(); + db->map_header_in_use.erase(*locked); + } + } + }; + + DBObjectMap(KeyValueDB *db) : db(db), header_lock("DBOBjectMap"), + cache_lock("DBObjectMap::CacheLock"), + caches(g_conf->filestore_omap_header_cache_size) + {} + + int set_keys( + const ghobject_t &oid, + const map &set, + const SequencerPosition *spos=0 + ); + + int set_header( + const ghobject_t &oid, + const bufferlist &bl, + const SequencerPosition *spos=0 + ); + + int get_header( + const ghobject_t &oid, + bufferlist *bl + ); + + int clear( + const ghobject_t &oid, + const SequencerPosition *spos=0 + ); + + int clear_keys_header( + const ghobject_t &oid, + const SequencerPosition *spos=0 + ); + + int rm_keys( + const 
ghobject_t &oid, + const set &to_clear, + const SequencerPosition *spos=0 + ); + + int get( + const ghobject_t &oid, + bufferlist *header, + map *out + ); + + int get_keys( + const ghobject_t &oid, + set *keys + ); + + int get_values( + const ghobject_t &oid, + const set &keys, + map *out + ); + + int check_keys( + const ghobject_t &oid, + const set &keys, + set *out + ); + + int get_xattrs( + const ghobject_t &oid, + const set &to_get, + map *out + ); + + int get_all_xattrs( + const ghobject_t &oid, + set *out + ); + + int set_xattrs( + const ghobject_t &oid, + const map &to_set, + const SequencerPosition *spos=0 + ); + + int remove_xattrs( + const ghobject_t &oid, + const set &to_remove, + const SequencerPosition *spos=0 + ); + + int clone( + const ghobject_t &oid, + const ghobject_t &target, + const SequencerPosition *spos=0 + ); + + /// Read initial state from backing store + int init(bool upgrade = false); + + /// Upgrade store to current version + int upgrade_to_v2(); + + /// Consistency check, debug, there must be no parallel writes + bool check(std::ostream &out); + + /// Ensure that all previous operations are durable + int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0); + + /// Util, list all objects, there must be no other concurrent access + int list_objects(vector *objs ///< [out] objects + ); + + ObjectMapIterator get_iterator(const ghobject_t &oid); + + static const string USER_PREFIX; + static const string XATTR_PREFIX; + static const string SYS_PREFIX; + static const string COMPLETE_PREFIX; + static const string HEADER_KEY; + static const string USER_HEADER_KEY; + static const string GLOBAL_STATE_KEY; + static const string HOBJECT_TO_SEQ; + + /// Legacy + static const string LEAF_PREFIX; + static const string REVERSE_LEAF_PREFIX; + + /// persistent state for store @see generate_header + struct State { + __u8 v; + uint64_t seq; + State() : v(0), seq(1) {} + State(uint64_t seq) : v(0), seq(seq) {} + + void encode(bufferlist &bl) const 
{ + ENCODE_START(2, 1, bl); + ::encode(v, bl); + ::encode(seq, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator &bl) { + DECODE_START(2, bl); + if (struct_v >= 2) + ::decode(v, bl); + else + v = 0; + ::decode(seq, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + f->dump_unsigned("seq", seq); + } + + static void generate_test_instances(list &o) { + o.push_back(new State(0)); + o.push_back(new State(20)); + } + } state; + + struct _Header { + uint64_t seq; + uint64_t parent; + uint64_t num_children; + + coll_t c; + ghobject_t oid; + + SequencerPosition spos; + + void encode(bufferlist &bl) const { + ENCODE_START(2, 1, bl); + ::encode(seq, bl); + ::encode(parent, bl); + ::encode(num_children, bl); + ::encode(c, bl); + ::encode(oid, bl); + ::encode(spos, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator &bl) { + DECODE_START(2, bl); + ::decode(seq, bl); + ::decode(parent, bl); + ::decode(num_children, bl); + ::decode(c, bl); + ::decode(oid, bl); + if (struct_v >= 2) + ::decode(spos, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + f->dump_unsigned("seq", seq); + f->dump_unsigned("parent", parent); + f->dump_unsigned("num_children", num_children); + f->dump_stream("coll") << c; + f->dump_stream("oid") << oid; + } + + static void generate_test_instances(list<_Header*> &o) { + o.push_back(new _Header); + o.push_back(new _Header); + o.back()->parent = 20; + o.back()->seq = 30; + } + + _Header() : seq(0), parent(0), num_children(1) {} + }; + + /// String munging (public for testing) + static string ghobject_key(const ghobject_t &oid); + static string ghobject_key_v0(coll_t c, const ghobject_t &oid); + static int is_buggy_ghobject_key_v1(const string &in); +private: + /// Implicit lock on Header->seq + typedef ceph::shared_ptr<_Header> Header; + Mutex cache_lock; + SimpleLRU caches; + + string map_header_key(const ghobject_t &oid); + string header_key(uint64_t seq); + string complete_prefix(Header 
header); + string user_prefix(Header header); + string sys_prefix(Header header); + string xattr_prefix(Header header); + string sys_parent_prefix(_Header header); + string sys_parent_prefix(Header header) { + return sys_parent_prefix(*header); + } + + class EmptyIteratorImpl : public ObjectMapIteratorImpl { + public: + int seek_to_first() { return 0; } + int seek_to_last() { return 0; } + int upper_bound(const string &after) { return 0; } + int lower_bound(const string &to) { return 0; } + bool valid() { return false; } + int next(bool validate=true) { assert(0); return 0; } + string key() { assert(0); return ""; } + bufferlist value() { assert(0); return bufferlist(); } + int status() { return 0; } + }; + + + /// Iterator + class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl { + public: + DBObjectMap *map; + + /// NOTE: implicit lock hlock->get_locked() when returned out of the class + MapHeaderLock hlock; + /// NOTE: implicit lock on header->seq AND for all ancestors + Header header; + + /// parent_iter == NULL iff no parent + ceph::shared_ptr parent_iter; + KeyValueDB::Iterator key_iter; + KeyValueDB::Iterator complete_iter; + + /// cur_iter points to currently valid iterator + ceph::shared_ptr cur_iter; + int r; + + /// init() called, key_iter, complete_iter, parent_iter filled in + bool ready; + /// past end + bool invalid; + + DBObjectMapIteratorImpl(DBObjectMap *map, Header header) : + map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {} + int seek_to_first(); + int seek_to_last(); + int upper_bound(const string &after); + int lower_bound(const string &to); + bool valid(); + int next(bool validate=true); + string key(); + bufferlist value(); + int status(); + + bool on_parent() { + return cur_iter == parent_iter; + } + + /// skips to next valid parent entry + int next_parent(); + + /// Tests whether to_test is in complete region + int in_complete_region(const string &to_test, ///< [in] key to test + string *begin, ///< [out] 
beginning of region + string *end ///< [out] end of region + ); ///< @returns true if to_test is in the complete region, else false + + private: + int init(); + bool valid_parent(); + int adjust(); + }; + + typedef ceph::shared_ptr DBObjectMapIterator; + DBObjectMapIterator _get_iterator(Header header) { + return DBObjectMapIterator(new DBObjectMapIteratorImpl(this, header)); + } + + /// sys + + /// Removes node corresponding to header + void clear_header(Header header, KeyValueDB::Transaction t); + + /// Set node containing input to new contents + void set_header(Header input, KeyValueDB::Transaction t); + + /// Remove leaf node corresponding to oid in c + void remove_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, + Header header, + KeyValueDB::Transaction t); + + /// Set leaf node for c and oid to the value of header + void set_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, _Header header, + KeyValueDB::Transaction t); + + /// Set leaf node for c and oid to the value of header + bool check_spos(const ghobject_t &oid, + Header header, + const SequencerPosition *spos); + + /// Lookup or create header for c oid + Header lookup_create_map_header( + const MapHeaderLock &l, + const ghobject_t &oid, + KeyValueDB::Transaction t); + + /** + * Generate new header for c oid with new seq number + * + * Has the side effect of syncronously saving the new DBObjectMap state + */ + Header _generate_new_header(const ghobject_t &oid, Header parent); + Header generate_new_header(const ghobject_t &oid, Header parent) { + Mutex::Locker l(header_lock); + return _generate_new_header(oid, parent); + } + + /// Lookup leaf header for c oid + Header _lookup_map_header( + const MapHeaderLock &l, + const ghobject_t &oid); + Header lookup_map_header( + const MapHeaderLock &l2, + const ghobject_t &oid) { + Mutex::Locker l(header_lock); + return _lookup_map_header(l2, oid); + } + + /// Lookup header node for input + Header lookup_parent(Header input); + + + /// 
Helpers + int _get_header(Header header, bufferlist *bl); + + /// Scan keys in header into out_keys and out_values (if nonnull) + int scan(Header header, + const set &in_keys, + set *out_keys, + map *out_values); + + /// Remove header and all related prefixes + int _clear(Header header, + KeyValueDB::Transaction t); + /// Adds to t operations necessary to add new_complete to the complete set + int merge_new_complete(Header header, + const map &new_complete, + DBObjectMapIterator iter, + KeyValueDB::Transaction t); + + /// Writes out State (mainly next_seq) + int write_state(KeyValueDB::Transaction _t = + KeyValueDB::Transaction()); + + /// 0 if the complete set now contains all of key space, < 0 on error, 1 else + int need_parent(DBObjectMapIterator iter); + + /// Copies header entry from parent @see rm_keys + int copy_up_header(Header header, + KeyValueDB::Transaction t); + + /// Sets header @see set_header + void _set_header(Header header, const bufferlist &bl, + KeyValueDB::Transaction t); + + /** + * Removes header seq lock and possibly object lock + * once Header is out of scope + * @see lookup_parent + * @see generate_new_header + */ + class RemoveOnDelete { + public: + DBObjectMap *db; + RemoveOnDelete(DBObjectMap *db) : + db(db) {} + void operator() (_Header *header) { + Mutex::Locker l(db->header_lock); + assert(db->in_use.count(header->seq)); + db->in_use.erase(header->seq); + db->header_cond.Signal(); + delete header; + } + }; + friend class RemoveOnDelete; +}; +WRITE_CLASS_ENCODER(DBObjectMap::_Header) +WRITE_CLASS_ENCODER(DBObjectMap::State) + +#endif diff --git a/src/os/filestore/FDCache.h b/src/os/filestore/FDCache.h new file mode 100644 index 000000000000..635043b7e061 --- /dev/null +++ b/src/os/filestore/FDCache.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_FDCACHE_H +#define CEPH_FDCACHE_H + +#include +#include +#include +#include "common/hobject.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/shared_cache.hpp" +#include "include/compat.h" +#include "include/intarith.h" + +/** + * FD Cache + */ +class FDCache : public md_config_obs_t { +public: + /** + * FD + * + * Wrapper for an fd. Destructor closes the fd. + */ + class FD { + public: + const int fd; + FD(int _fd) : fd(_fd) { + assert(_fd >= 0); + } + int operator*() const { + return fd; + } + ~FD() { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + }; + +private: + CephContext *cct; + const int registry_shards; + SharedLRU *registry; + +public: + FDCache(CephContext *cct) : cct(cct), + registry_shards(cct->_conf->filestore_fd_cache_shards) { + assert(cct); + cct->_conf->add_observer(this); + registry = new SharedLRU[registry_shards]; + for (int i = 0; i < registry_shards; ++i) { + registry[i].set_cct(cct); + registry[i].set_size( + MAX((cct->_conf->filestore_fd_cache_size / registry_shards), 1)); + } + } + ~FDCache() { + cct->_conf->remove_observer(this); + delete[] registry; + } + typedef ceph::shared_ptr FDRef; + + FDRef lookup(const ghobject_t &hoid) { + int registry_id = hoid.hobj.get_hash() % registry_shards; + return registry[registry_id].lookup(hoid); + } + + FDRef add(const ghobject_t &hoid, int fd, bool *existed) { + int registry_id = hoid.hobj.get_hash() % registry_shards; + return registry[registry_id].add(hoid, new FD(fd), existed); + } + + /// clear cached fd for hoid, subsequent lookups will get an empty FD + void clear(const ghobject_t &hoid) { + int registry_id = hoid.hobj.get_hash() % registry_shards; + registry[registry_id].purge(hoid); + } + + /// md_config_obs_t + const char** 
get_tracked_conf_keys() const { + static const char* KEYS[] = { + "filestore_fd_cache_size", + NULL + }; + return KEYS; + } + void handle_conf_change(const md_config_t *conf, + const std::set &changed) { + if (changed.count("filestore_fd_cache_size")) { + for (int i = 0; i < registry_shards; ++i) + registry[i].set_size( + MAX((conf->filestore_fd_cache_size / registry_shards), 1)); + } + } + +}; +typedef FDCache::FDRef FDRef; + +#endif diff --git a/src/os/filestore/FileJournal.cc b/src/os/filestore/FileJournal.cc new file mode 100644 index 000000000000..f9e0cc5131b0 --- /dev/null +++ b/src/os/filestore/FileJournal.cc @@ -0,0 +1,2146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include "acconfig.h" + +#include "common/debug.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "FileJournal.h" +#include "include/color.h" +#include "common/perf_counters.h" +#include "FileStore.h" + +#include "include/compat.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/blkdev.h" +#include "common/linux_version.h" + +#if defined(__FreeBSD__) +#define O_DSYNC O_SYNC +#endif + +#define dout_subsys ceph_subsys_journal +#undef dout_prefix +#define dout_prefix *_dout << "journal " + +const static int64_t ONE_MEG(1 << 20); +const static int CEPH_MINIMUM_BLOCK_SIZE(4096); + +int FileJournal::_open(bool forwrite, bool create) +{ + int flags, ret; + + if (forwrite) { + flags = O_RDWR; + if (directio) + flags |= O_DIRECT | O_DSYNC; + } else { + flags = O_RDONLY; + } + if (create) + flags |= O_CREAT; + + if (fd >= 0) { + if (TEMP_FAILURE_RETRY(::close(fd))) { + int err = errno; + derr << "FileJournal::_open: error closing old fd: " + << cpp_strerror(err) << dendl; + } + } + fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags, 0644)); + if (fd < 0) { + int err = errno; + dout(2) << "FileJournal::_open unable to open journal " + << fn << ": " << cpp_strerror(err) << dendl; + return -err; + } + + struct stat st; + ret = ::fstat(fd, &st); + if (ret) { + ret = errno; + derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl; + ret = -ret; + goto out_fd; + } + + if (S_ISBLK(st.st_mode)) { + ret = _open_block_device(); + } else { + if (aio && !force_aio) { + derr << "FileJournal::_open: disabling aio for non-block journal. 
Use " + << "journal_force_aio to force use of aio anyway" << dendl; + aio = false; + } + ret = _open_file(st.st_size, st.st_blksize, create); + } + + if (ret) + goto out_fd; + +#ifdef HAVE_LIBAIO + if (aio) { + aio_ctx = 0; + ret = io_setup(128, &aio_ctx); + if (ret < 0) { + ret = errno; + derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(ret) << dendl; + ret = -ret; + goto out_fd; + } + } +#endif + + /* We really want max_size to be a multiple of block_size. */ + max_size -= max_size % block_size; + + dout(1) << "_open " << fn << " fd " << fd + << ": " << max_size + << " bytes, block size " << block_size + << " bytes, directio = " << directio + << ", aio = " << aio + << dendl; + return 0; + + out_fd: + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return ret; +} + +int FileJournal::_open_block_device() +{ + int64_t bdev_sz = 0; + int ret = get_block_device_size(fd, &bdev_sz); + if (ret) { + dout(0) << __func__ << ": failed to read block device size." << dendl; + return -EIO; + } + + /* Check for bdev_sz too small */ + if (bdev_sz < ONE_MEG) { + dout(0) << __func__ << ": your block device must be at least " + << ONE_MEG << " bytes to be used for a Ceph journal." << dendl; + return -EINVAL; + } + + dout(10) << __func__ << ": ignoring osd journal size. 
" + << "We'll use the entire block device (size: " << bdev_sz << ")" + << dendl; + max_size = bdev_sz; + + block_size = CEPH_MINIMUM_BLOCK_SIZE; + + if (g_conf->journal_discard) { + discard = block_device_support_discard(fn.c_str()); + dout(10) << fn << " support discard: " << (int)discard << dendl; + } + _check_disk_write_cache(); + return 0; +} + +void FileJournal::_check_disk_write_cache() const +{ + ostringstream hdparm_cmd; + FILE *fp = NULL; + + if (geteuid() != 0) { + dout(10) << "_check_disk_write_cache: not root, NOT checking disk write " + << "cache on raw block device " << fn << dendl; + goto done; + } + + hdparm_cmd << "/sbin/hdparm -W " << fn; + fp = popen(hdparm_cmd.str().c_str(), "r"); + if (!fp) { + dout(10) << "_check_disk_write_cache: failed to run /sbin/hdparm: NOT " + << "checking disk write cache on raw block device " << fn << dendl; + goto done; + } + + while (true) { + char buf[256]; + memset(buf, 0, sizeof(buf)); + char *line = fgets(buf, sizeof(buf) - 1, fp); + if (!line) { + if (ferror(fp)) { + int ret = -errno; + derr << "_check_disk_write_cache: fgets error: " << cpp_strerror(ret) + << dendl; + goto close_f; + } + else { + // EOF. + break; + } + } + + int on; + if (sscanf(line, " write-caching = %d", &on) != 1) + continue; + if (!on) { + dout(10) << "_check_disk_write_cache: disk write cache is off (good) on " + << fn << dendl; + break; + } + + // is our kernel new enough? + int ver = get_linux_version(); + if (ver == 0) { + dout(10) << "_check_disk_write_cache: get_linux_version failed" << dendl; + } else if (ver >= KERNEL_VERSION(2, 6, 33)) { + dout(20) << "_check_disk_write_cache: disk write cache is on, but your " + << "kernel is new enough to handle it correctly. (fn:" + << fn << ")" << dendl; + break; + } + derr << TEXT_RED + << " ** WARNING: disk write cache is ON on " << fn << ".\n" + << " Journaling will not be reliable on kernels prior to 2.6.33\n" + << " (recent kernels are safe). 
You can disable the write cache with\n" + << " 'hdparm -W 0 " << fn << "'" + << TEXT_NORMAL + << dendl; + break; + } + +close_f: + if (pclose(fp)) { + int ret = -errno; + derr << "_check_disk_write_cache: pclose failed: " << cpp_strerror(ret) + << dendl; + } +done: + ; +} + +int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, + bool create) +{ + int ret; + int64_t conf_journal_sz(g_conf->osd_journal_size); + conf_journal_sz <<= 20; + + if ((g_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) { + derr << "I'm sorry, I don't know how large of a journal to create." + << "Please specify a block device to use as the journal OR " + << "set osd_journal_size in your ceph.conf" << dendl; + return -EINVAL; + } + + if (create && (oldsize < conf_journal_sz)) { + uint64_t newsize(g_conf->osd_journal_size); + newsize <<= 20; + dout(10) << "_open extending to " << newsize << " bytes" << dendl; + ret = ::ftruncate(fd, newsize); + if (ret < 0) { + int err = errno; + derr << "FileJournal::_open_file : unable to extend journal to " + << newsize << " bytes: " << cpp_strerror(err) << dendl; + return -err; + } +#ifdef HAVE_POSIX_FALLOCATE + ret = ::posix_fallocate(fd, 0, newsize); + if (ret) { + derr << "FileJournal::_open_file : unable to preallocation journal to " + << newsize << " bytes: " << cpp_strerror(ret) << dendl; + return -ret; + } + max_size = newsize; +#elif defined(__APPLE__) + fstore_t store; + store.fst_flags = F_ALLOCATECONTIG; + store.fst_posmode = F_PEOFPOSMODE; + store.fst_offset = 0; + store.fst_length = newsize; + + ret = ::fcntl(fd, F_PREALLOCATE, &store); + if (ret == -1) { + ret = -errno; + derr << "FileJournal::_open_file : unable to preallocation journal to " + << newsize << " bytes: " << cpp_strerror(ret) << dendl; + return ret; + } + max_size = newsize; +#else +# error "Journal pre-allocation not supported on platform." 
+#endif + } + else { + max_size = oldsize; + } + block_size = MAX(blksize, (blksize_t)CEPH_MINIMUM_BLOCK_SIZE); + + if (create && g_conf->journal_zero_on_create) { + derr << "FileJournal::_open_file : zeroing journal" << dendl; + uint64_t write_size = 1 << 20; + char *buf; + ret = ::posix_memalign((void **)&buf, block_size, write_size); + if (ret != 0) { + return -ret; + } + memset(static_cast(buf), 0, write_size); + uint64_t i = 0; + for (; (i + write_size) <= (uint64_t)max_size; i += write_size) { + ret = ::pwrite(fd, static_cast(buf), write_size, i); + if (ret < 0) { + free(buf); + return -errno; + } + } + if (i < (uint64_t)max_size) { + ret = ::pwrite(fd, static_cast(buf), max_size - i, i); + if (ret < 0) { + free(buf); + return -errno; + } + } + free(buf); + } + + + dout(10) << "_open journal is not a block device, NOT checking disk " + << "write cache on '" << fn << "'" << dendl; + + return 0; +} + +// This can not be used on an active journal +int FileJournal::check() +{ + int ret; + + assert(fd == -1); + ret = _open(false, false); + if (ret) + return ret; + + ret = read_header(&header); + if (ret < 0) + goto done; + + if (header.fsid != fsid) { + derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid + << ", invalid (someone else's?) journal" << dendl; + ret = -EINVAL; + goto done; + } + + dout(1) << "check: header looks ok" << dendl; + ret = 0; + + done: + close(); + return ret; +} + + +int FileJournal::create() +{ + void *buf = 0; + int64_t needed_space; + int ret; + buffer::ptr bp; + dout(2) << "create " << fn << " fsid " << fsid << dendl; + + ret = _open(true, true); + if (ret) + goto done; + + // write empty header + header = header_t(); + header.flags = header_t::FLAG_CRC; // enable crcs on any new journal. 
+ header.fsid = fsid; + header.max_size = max_size; + header.block_size = block_size; + if (g_conf->journal_block_align || directio) + header.alignment = block_size; + else + header.alignment = 16; // at least stay word aligned on 64bit machines... + + header.start = get_top(); + header.start_seq = 0; + + print_header(header); + + // static zeroed buffer for alignment padding + delete [] zero_buf; + zero_buf = new char[header.alignment]; + memset(zero_buf, 0, header.alignment); + + bp = prepare_header(); + if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) { + ret = -errno; + derr << "FileJournal::create : create write header error " + << cpp_strerror(ret) << dendl; + goto close_fd; + } + + // zero first little bit, too. + ret = posix_memalign(&buf, block_size, block_size); + if (ret) { + ret = -ret; + derr << "FileJournal::create: failed to allocate " << block_size + << " bytes of memory: " << cpp_strerror(ret) << dendl; + goto close_fd; + } + memset(buf, 0, block_size); + if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) { + ret = -errno; + derr << "FileJournal::create: error zeroing first " << block_size + << " bytes " << cpp_strerror(ret) << dendl; + goto free_buf; + } + + needed_space = ((int64_t)g_conf->osd_max_write_size) << 20; + needed_space += (2 * sizeof(entry_header_t)) + get_top(); + if (header.max_size - header.start < needed_space) { + derr << "FileJournal::create: OSD journal is not large enough to hold " + << "osd_max_write_size bytes!" 
<< dendl; + ret = -ENOSPC; + goto free_buf; + } + + dout(2) << "create done" << dendl; + ret = 0; + +free_buf: + free(buf); + buf = 0; +close_fd: + if (TEMP_FAILURE_RETRY(::close(fd)) < 0) { + ret = -errno; + derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret) + << dendl; + } +done: + fd = -1; + return ret; +} + +// This can not be used on an active journal +int FileJournal::peek_fsid(uuid_d& fsid) +{ + assert(fd == -1); + int r = _open(false, false); + if (r) + return r; + r = read_header(&header); + if (r < 0) + goto out; + fsid = header.fsid; +out: + close(); + return r; +} + +int FileJournal::open(uint64_t fs_op_seq) +{ + dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl; + + uint64_t next_seq = fs_op_seq + 1; + + int err = _open(false); + if (err) + return err; + + // assume writeable, unless... + read_pos = 0; + write_pos = get_top(); + + // read header? + err = read_header(&header); + if (err < 0) + return err; + + // static zeroed buffer for alignment padding + delete [] zero_buf; + zero_buf = new char[header.alignment]; + memset(zero_buf, 0, header.alignment); + + dout(10) << "open header.fsid = " << header.fsid + //<< " vs expected fsid = " << fsid + << dendl; + if (header.fsid != fsid) { + derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid + << ", invalid (someone else's?) 
journal" << dendl; + return -EINVAL; + } + if (header.max_size > max_size) { + dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl; + return -EINVAL; + } + if (header.block_size != block_size) { + dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl; + return -EINVAL; + } + if (header.max_size % header.block_size) { + dout(2) << "open journal max size " << header.max_size + << " not a multiple of block size " << header.block_size << dendl; + return -EINVAL; + } + if (header.alignment != block_size && directio) { + dout(0) << "open journal alignment " << header.alignment << " does not match block size " + << block_size << " (required for direct_io journal mode)" << dendl; + return -EINVAL; + } + if ((header.alignment % CEPH_MINIMUM_BLOCK_SIZE) && directio) { + dout(0) << "open journal alignment " << header.alignment << " is not multiple of minimum block size " + << CEPH_MINIMUM_BLOCK_SIZE << " (required for direct_io journal mode)" << dendl; + return -EINVAL; + } + + // looks like a valid header. + write_pos = 0; // not writeable yet + + journaled_seq = header.committed_up_to; + + // find next entry + read_pos = header.start; + uint64_t seq = header.start_seq; + + // last_committed_seq is 1 before the start of the journal or + // 0 if the start is 0 + last_committed_seq = seq > 0 ? seq - 1 : seq; + if (last_committed_seq < fs_op_seq) { + dout(2) << "open advancing committed_seq " << last_committed_seq + << " to fs op_seq " << fs_op_seq << dendl; + last_committed_seq = fs_op_seq; + } + + while (1) { + bufferlist bl; + off64_t old_pos = read_pos; + if (!read_entry(bl, seq)) { + dout(10) << "open reached end of journal." 
<< dendl; + break; + } + if (seq > next_seq) { + dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq + << ", ignoring journal contents" + << dendl; + read_pos = -1; + last_committed_seq = 0; + seq = 0; + return 0; + } + if (seq == next_seq) { + dout(10) << "open reached seq " << seq << dendl; + read_pos = old_pos; + break; + } + seq++; // next event should follow. + } + + return 0; +} + +void FileJournal::_close(int fd) const +{ + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + +void FileJournal::close() +{ + dout(1) << "close " << fn << dendl; + + // stop writer thread + stop_writer(); + + // close + assert(writeq_empty()); + assert(!must_write_header); + assert(fd >= 0); + _close(fd); + fd = -1; +} + + +int FileJournal::dump(ostream& out) +{ + return _dump(out, false); +} + +int FileJournal::simple_dump(ostream& out) +{ + return _dump(out, true); +} + +int FileJournal::_dump(ostream& out, bool simple) +{ + JSONFormatter f(true); + int ret = _fdump(f, simple); + f.flush(out); + return ret; +} + +int FileJournal::_fdump(Formatter &f, bool simple) +{ + dout(10) << "_fdump" << dendl; + + assert(fd == -1); + int err = _open(false, false); + if (err) + return err; + + err = read_header(&header); + if (err < 0) { + close(); + return err; + } + + off64_t next_pos = header.start; + + f.open_object_section("journal"); + + f.open_object_section("header"); + f.dump_unsigned("flags", header.flags); + ostringstream os; + os << header.fsid; + f.dump_string("fsid", os.str()); + f.dump_unsigned("block_size", header.block_size); + f.dump_unsigned("alignment", header.alignment); + f.dump_int("max_size", header.max_size); + f.dump_int("start", header.start); + f.dump_unsigned("committed_up_to", header.committed_up_to); + f.dump_unsigned("start_seq", header.start_seq); + f.close_section(); + + f.open_array_section("entries"); + uint64_t seq = header.start_seq; + while (1) { + bufferlist bl; + off64_t pos = next_pos; + + if (!pos) { + dout(2) << "_dump 
-- not readable" << dendl; + err = -EINVAL; + break; + } + stringstream ss; + read_entry_result result = do_read_entry( + pos, + &next_pos, + &bl, + &seq, + &ss); + if (result != SUCCESS) { + if (seq < header.committed_up_to) { + dout(2) << "Unable to read past sequence " << seq + << " but header indicates the journal has committed up through " + << header.committed_up_to << ", journal is corrupt" << dendl; + err = -EINVAL; + } + dout(25) << ss.str() << dendl; + dout(25) << "No further valid entries found, journal is most likely valid" + << dendl; + break; + } + + f.open_object_section("entry"); + f.dump_unsigned("offset", pos); + f.dump_unsigned("seq", seq); + if (simple) { + f.dump_unsigned("bl.length", bl.length()); + } else { + f.open_array_section("transactions"); + bufferlist::iterator p = bl.begin(); + int trans_num = 0; + while (!p.end()) { + ObjectStore::Transaction t(p); + f.open_object_section("transaction"); + f.dump_unsigned("trans_num", trans_num); + t.dump(&f); + f.close_section(); + trans_num++; + } + f.close_section(); + } + f.close_section(); + } + + f.close_section(); + f.close_section(); + dout(10) << "dump finish" << dendl; + + close(); + return err; +} + + +void FileJournal::start_writer() +{ + write_stop = false; + aio_stop = false; + write_thread.create(); +#ifdef HAVE_LIBAIO + if (aio) + write_finish_thread.create(); +#endif +} + +void FileJournal::stop_writer() +{ + // Do nothing if writer already stopped or never started + if (!write_stop) + { + { + Mutex::Locker l(write_lock); + Mutex::Locker p(writeq_lock); + write_stop = true; + writeq_cond.Signal(); + // Doesn't hurt to signal commit_cond in case thread is waiting there + // and caller didn't use committed_thru() first. 
+ commit_cond.Signal(); + } + write_thread.join(); + + // write journal header now so that we have less to replay on remount + write_header_sync(); + } + +#ifdef HAVE_LIBAIO + // stop aio completeion thread *after* writer thread has stopped + // and has submitted all of its io + if (aio && !aio_stop) { + aio_lock.Lock(); + aio_stop = true; + aio_cond.Signal(); + write_finish_cond.Signal(); + aio_lock.Unlock(); + write_finish_thread.join(); + } +#endif +} + + + +void FileJournal::print_header(const header_t &header) const +{ + dout(10) << "header: block_size " << header.block_size + << " alignment " << header.alignment + << " max_size " << header.max_size + << dendl; + dout(10) << "header: start " << header.start << dendl; + dout(10) << " write_pos " << write_pos << dendl; +} + +int FileJournal::read_header(header_t *hdr) const +{ + dout(10) << "read_header" << dendl; + bufferlist bl; + + buffer::ptr bp = buffer::create_page_aligned(block_size); + char* bpdata = bp.c_str(); + int r = ::pread(fd, bpdata, bp.length(), 0); + + if (r < 0) { + int err = errno; + dout(0) << "read_header got " << cpp_strerror(err) << dendl; + return -err; + } + + // don't use bp.zero() here, because it also invalidates + // crc cache (which is not yet populated anyway) + if (bp.length() != (size_t)r) { + // r will be always less or equal than bp.length + bpdata += r; + memset(bpdata, 0, bp.length() - r); + } + + bl.push_back(bp); + + try { + bufferlist::iterator p = bl.begin(); + ::decode(*hdr, p); + } + catch (buffer::error& e) { + derr << "read_header error decoding journal header" << dendl; + return -EINVAL; + } + + + /* + * Unfortunately we weren't initializing the flags field for new + * journals! Aie. This is safe(ish) now that we have only one + * flag. Probably around when we add the next flag we need to + * remove this or else this (eventually old) code will clobber newer + * code's flags. 
+ */ + if (hdr->flags > 3) { + derr << "read_header appears to have gibberish flags; assuming 0" << dendl; + hdr->flags = 0; + } + + print_header(*hdr); + + return 0; +} + +bufferptr FileJournal::prepare_header() +{ + bufferlist bl; + { + Mutex::Locker l(finisher_lock); + header.committed_up_to = journaled_seq; + } + ::encode(header, bl); + bufferptr bp = buffer::create_page_aligned(get_top()); + // don't use bp.zero() here, because it also invalidates + // crc cache (which is not yet populated anyway) + char* data = bp.c_str(); + memcpy(data, bl.c_str(), bl.length()); + data += bl.length(); + memset(data, 0, bp.length()-bl.length()); + return bp; +} + +void FileJournal::write_header_sync() +{ + Mutex::Locker locker(write_lock); + must_write_header = true; + bufferlist bl; + do_write(bl); + dout(20) << __func__ << " finish" << dendl; +} + +int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size) +{ + // already full? + if (full_state != FULL_NOTFULL) + return -ENOSPC; + + // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL. 
+ off64_t room; + if (pos >= header.start) + room = (header.max_size - pos) + (header.start - get_top()) - 1; + else + room = header.start - pos - 1; + dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start + << " top " << get_top() << dendl; + + if (do_sync_cond) { + if (room >= (header.max_size >> 1) && + room - size < (header.max_size >> 1)) { + dout(10) << " passing half full mark, triggering commit" << dendl; + do_sync_cond->SloppySignal(); // initiate a real commit so we can trim + } + } + + if (room >= size) { + dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl; + if (pos + size > header.max_size) + must_write_header = true; + return 0; + } + + // full + dout(1) << "check_for_full at " << pos << " : JOURNAL FULL " + << pos << " >= " << room + << " (max_size " << header.max_size << " start " << header.start << ")" + << dendl; + + off64_t max = header.max_size - get_top(); + if (size > max) + dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl; + + return -ENOSPC; +} + +int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes) +{ + // gather queued writes + off64_t queue_pos = write_pos; + + int eleft = g_conf->journal_max_write_entries; + unsigned bmax = g_conf->journal_max_write_bytes; + + if (full_state != FULL_NOTFULL) + return -ENOSPC; + + while (!writeq_empty()) { + list items; + batch_pop_write(items); + list::iterator it = items.begin(); + while (it != items.end()) { + int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes); + if (r == 0) { // prepare ok, delete it + items.erase(it++); + } + if (r == -ENOSPC) { + // the journal maybe full, insert the left item to writeq + batch_unpop_write(items); + if (orig_ops) + goto out; // commit what we have + + if (logger) + logger->inc(l_os_j_full); + + if (wait_on_full) { + dout(20) << "prepare_multi_write 
full on first entry, need to wait" << dendl; + } else { + dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl; + + // throw out what we have so far + full_state = FULL_FULL; + while (!writeq_empty()) { + put_throttle(1, peek_write().orig_len); + pop_write(); + } + print_header(header); + } + + return -ENOSPC; // hrm, full on first op + } + if (eleft) { + if (--eleft == 0) { + dout(20) << "prepare_multi_write hit max events per write " << g_conf->journal_max_write_entries << dendl; + batch_unpop_write(items); + goto out; + } + } + if (bmax) { + if (bl.length() >= bmax) { + dout(20) << "prepare_multi_write hit max write size " << g_conf->journal_max_write_bytes << dendl; + batch_unpop_write(items); + goto out; + } + } + } + } + +out: + dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl; + assert((write_pos + bl.length() == queue_pos) || + (write_pos + bl.length() - header.max_size + get_top() == queue_pos)); + return 0; +} + +/* +void FileJournal::queue_write_fin(uint64_t seq, Context *fin) +{ + writing_seq.push_back(seq); + if (!waiting_for_notfull.empty()) { + // make sure previously unjournaled stuff waiting for UNFULL triggers + // _before_ newly journaled stuff does + dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin + << " until after UNFULL" << dendl; + C_Gather *g = new C_Gather(writeq.front().fin); + writing_fin.push_back(g->new_sub()); + waiting_for_notfull.push_back(g->new_sub()); + } else { + writing_fin.push_back(writeq.front().fin); + dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl; + } +} +*/ + +void FileJournal::queue_completions_thru(uint64_t seq) +{ + assert(finisher_lock.is_locked()); + utime_t now = ceph_clock_now(g_ceph_context); + list items; + batch_pop_completions(items); + list::iterator it = items.begin(); + while (it != items.end()) { + completion_item& next = *it; + if (next.seq > seq) + break; + utime_t lat = now; + lat -= next.start; + 
dout(10) << "queue_completions_thru seq " << seq + << " queueing seq " << next.seq + << " " << next.finish + << " lat " << lat << dendl; + if (logger) { + logger->tinc(l_os_j_lat, lat); + } + if (next.finish) + finisher->queue(next.finish); + if (next.tracked_op) + next.tracked_op->mark_event("journaled_completion_queued"); + items.erase(it++); + } + batch_unpop_completions(items); + finisher_cond.Signal(); +} + + +int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes) +{ + uint64_t seq = next_write.seq; + bufferlist &ebl = next_write.bl; + off64_t size = ebl.length(); + + int r = check_for_full(seq, queue_pos, size); + if (r < 0) + return r; // ENOSPC or EAGAIN + + uint32_t orig_len = next_write.orig_len; + orig_bytes += orig_len; + orig_ops++; + + // add to write buffer + dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq + << " len " << orig_len << " -> " << size << dendl; + + unsigned seq_offset = offsetof(entry_header_t, seq); + unsigned magic1_offset = offsetof(entry_header_t, magic1); + unsigned magic2_offset = offsetof(entry_header_t, magic2); + + bufferptr headerptr = ebl.buffers().front(); + uint64_t _seq = seq; + uint64_t _queue_pos = queue_pos; + uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64()); + headerptr.copy_in(seq_offset, sizeof(uint64_t), (char *)&_seq); + headerptr.copy_in(magic1_offset, sizeof(uint64_t), (char *)&_queue_pos); + headerptr.copy_in(magic2_offset, sizeof(uint64_t), (char *)&magic2); + + bufferptr footerptr = ebl.buffers().back(); + unsigned post_offset = footerptr.length() - sizeof(entry_header_t); + footerptr.copy_in(post_offset + seq_offset, sizeof(uint64_t), (char *)&_seq); + footerptr.copy_in(post_offset + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos); + footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2); + + 
bl.claim_append(ebl); + if (next_write.tracked_op) + next_write.tracked_op->mark_event("write_thread_in_journal_buffer"); + + journalq.push_back(pair(seq, queue_pos)); + writing_seq = seq; + + queue_pos += size; + if (queue_pos >= header.max_size) + queue_pos = queue_pos + get_top() - header.max_size; + + return 0; +} + +void FileJournal::align_bl(off64_t pos, bufferlist& bl) +{ + // make sure list segments are page aligned + if (directio && (!bl.is_aligned(block_size) || + !bl.is_n_align_sized(CEPH_MINIMUM_BLOCK_SIZE))) { + assert(0 == "bl should be align"); + if ((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0 || + (pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0) + dout(0) << "rebuild_page_aligned failed, " << bl << dendl; + assert((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0); + assert((pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0); + } +} + +int FileJournal::write_bl(off64_t& pos, bufferlist& bl) +{ + int ret; + + off64_t spos = ::lseek64(fd, pos, SEEK_SET); + if (spos < 0) { + ret = -errno; + derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(ret) << dendl; + return ret; + } + ret = bl.write_fd(fd); + if (ret) { + derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(ret) << dendl; + return ret; + } + pos += bl.length(); + if (pos == header.max_size) + pos = get_top(); + return 0; +} + +void FileJournal::do_write(bufferlist& bl) +{ + // nothing to do? + if (bl.length() == 0 && !must_write_header) + return; + + buffer::ptr hbp; + if (g_conf->journal_write_header_frequency && + (((++journaled_since_start) % + g_conf->journal_write_header_frequency) == 0)) { + must_write_header = true; + } + + if (must_write_header) { + must_write_header = false; + hbp = prepare_header(); + } + + dout(15) << "do_write writing " << write_pos << "~" << bl.length() + << (hbp.length() ? 
" + header":"") + << dendl; + + utime_t from = ceph_clock_now(g_ceph_context); + + // entry + off64_t pos = write_pos; + + // Adjust write_pos + align_bl(pos, bl); + write_pos += bl.length(); + if (write_pos >= header.max_size) + write_pos = write_pos - header.max_size + get_top(); + + write_lock.Unlock(); + + // split? + off64_t split = 0; + if (pos + bl.length() > header.max_size) { + bufferlist first, second; + split = header.max_size - pos; + first.substr_of(bl, 0, split); + second.substr_of(bl, split, bl.length() - split); + assert(first.length() + second.length() == bl.length()); + dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length() + << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl; + + //Save pos to write first piece second + off64_t first_pos = pos; + off64_t orig_pos; + pos = get_top(); + // header too? + if (hbp.length()) { + // be sneaky: include the header in the second fragment + second.push_front(hbp); + pos = 0; // we included the header + } + // Write the second portion first possible with the header, so + // do_read_entry() won't even get a valid entry_header_t if there + // is a crash between the two writes. + orig_pos = pos; + if (write_bl(pos, second)) { + derr << "FileJournal::do_write: write_bl(pos=" << orig_pos + << ") failed" << dendl; + ceph_abort(); + } + orig_pos = first_pos; + if (write_bl(first_pos, first)) { + derr << "FileJournal::do_write: write_bl(pos=" << orig_pos + << ") failed" << dendl; + ceph_abort(); + } + assert(first_pos == get_top()); + } else { + // header too? 
+ if (hbp.length()) { + if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) { + int err = errno; + derr << "FileJournal::do_write: pwrite(fd=" << fd + << ", hbp.length=" << hbp.length() << ") failed :" + << cpp_strerror(err) << dendl; + ceph_abort(); + } + } + + if (write_bl(pos, bl)) { + derr << "FileJournal::do_write: write_bl(pos=" << pos + << ") failed" << dendl; + ceph_abort(); + } + } + + if (!directio) { + dout(20) << "do_write fsync" << dendl; + + /* + * We'd really love to have a fsync_range or fdatasync_range and do a: + * + * if (split) { + * ::fsync_range(fd, header.max_size - split, split)l + * ::fsync_range(fd, get_top(), bl.length() - split); + * else + * ::fsync_range(fd, write_pos, bl.length()) + * + * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be + * too hard given all the underlying infrastructure already exist. + * + * NOTE: using sync_file_range here would not be safe as it does not + * flush disk caches or commits any sort of metadata. + */ + int ret = 0; +#if defined(DARWIN) || defined(__FreeBSD__) + ret = ::fsync(fd); +#else + ret = ::fdatasync(fd); +#endif + if (ret < 0) { + derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } +#ifdef HAVE_POSIX_FADVISE + if (g_conf->filestore_fadvise) + posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); +#endif + } + + utime_t lat = ceph_clock_now(g_ceph_context) - from; + dout(20) << "do_write latency " << lat << dendl; + + write_lock.Lock(); + + assert(write_pos == pos); + assert(write_pos % header.alignment == 0); + + { + Mutex::Locker locker(finisher_lock); + journaled_seq = writing_seq; + + // kick finisher? + // only if we haven't filled up recently! 
+ if (full_state != FULL_NOTFULL) { + dout(10) << "do_write NOT queueing finisher seq " << journaled_seq + << ", full_commit_seq|full_restart_seq" << dendl; + } else { + if (plug_journal_completions) { + dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq + << " due to completion plug" << dendl; + } else { + dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl; + queue_completions_thru(journaled_seq); + } + } + } +} + +void FileJournal::flush() +{ + dout(10) << "waiting for completions to empty" << dendl; + { + Mutex::Locker l(finisher_lock); + while (!completions_empty()) + finisher_cond.Wait(finisher_lock); + } + dout(10) << "flush waiting for finisher" << dendl; + finisher->wait_for_empty(); + dout(10) << "flush done" << dendl; +} + + +void FileJournal::write_thread_entry() +{ + dout(10) << "write_thread_entry start" << dendl; + while (1) { + { + Mutex::Locker locker(writeq_lock); + if (writeq.empty() && !must_write_header) { + if (write_stop) + break; + dout(20) << "write_thread_entry going to sleep" << dendl; + writeq_cond.Wait(writeq_lock); + dout(20) << "write_thread_entry woke up" << dendl; + continue; + } + } + +#ifdef HAVE_LIBAIO + if (aio) { + Mutex::Locker locker(aio_lock); + // should we back off to limit aios in flight? try to do this + // adaptively so that we submit larger aios once we have lots of + // them in flight. + // + // NOTE: our condition here is based on aio_num (protected by + // aio_lock) and throttle_bytes (part of the write queue). when + // we sleep, we *only* wait for aio_num to change, and do not + // wake when more data is queued. this is not strictly correct, + // but should be fine given that we will have plenty of aios in + // flight if we hit this limit to ensure we keep the device + // saturated. 
+ while (aio_num > 0) { + int exp = MIN(aio_num * 2, 24); + long unsigned min_new = 1ull << exp; + long unsigned cur = throttle_bytes.get_current(); + dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes + << " ... exp " << exp << " min_new " << min_new + << " ... pending " << cur << dendl; + if (cur >= min_new) + break; + dout(20) << "write_thread_entry deferring until more aios complete: " + << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new + << " bytes to start a new aio (currently " << cur << " pending)" << dendl; + aio_cond.Wait(aio_lock); + dout(20) << "write_thread_entry woke up" << dendl; + } + } +#endif + + Mutex::Locker locker(write_lock); + uint64_t orig_ops = 0; + uint64_t orig_bytes = 0; + + bufferlist bl; + int r = prepare_multi_write(bl, orig_ops, orig_bytes); + // Don't care about journal full if stoppping, so drop queue and + // possibly let header get written and loop above to notice stop + if (r == -ENOSPC) { + if (write_stop) { + dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl; + while (!writeq_empty()) { + put_throttle(1, peek_write().orig_len); + pop_write(); + } + print_header(header); + r = 0; + } else { + dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl; + commit_cond.Wait(write_lock); + dout(20) << "write_thread_entry woke up" << dendl; + continue; + } + } + assert(r == 0); + + if (logger) { + logger->inc(l_os_j_wr); + logger->inc(l_os_j_wr_bytes, bl.length()); + } + +#ifdef HAVE_LIBAIO + if (aio) + do_aio_write(bl); + else + do_write(bl); +#else + do_write(bl); +#endif + put_throttle(orig_ops, orig_bytes); + } + + dout(10) << "write_thread_entry finish" << dendl; +} + +#ifdef HAVE_LIBAIO +void FileJournal::do_aio_write(bufferlist& bl) +{ + + if (g_conf->journal_write_header_frequency && + (((++journaled_since_start) % + g_conf->journal_write_header_frequency) == 0)) { + must_write_header = true; + } + 
+ // nothing to do? + if (bl.length() == 0 && !must_write_header) + return; + + buffer::ptr hbp; + if (must_write_header) { + must_write_header = false; + hbp = prepare_header(); + } + + // entry + off64_t pos = write_pos; + + dout(15) << "do_aio_write writing " << pos << "~" << bl.length() + << (hbp.length() ? " + header":"") + << dendl; + + // split? + off64_t split = 0; + if (pos + bl.length() > header.max_size) { + bufferlist first, second; + split = header.max_size - pos; + first.substr_of(bl, 0, split); + second.substr_of(bl, split, bl.length() - split); + assert(first.length() + second.length() == bl.length()); + dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl; + + if (write_aio_bl(pos, first, 0)) { + derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos + << ") failed" << dendl; + ceph_abort(); + } + assert(pos == header.max_size); + if (hbp.length()) { + // be sneaky: include the header in the second fragment + second.push_front(hbp); + pos = 0; // we included the header + } else + pos = get_top(); // no header, start after that + if (write_aio_bl(pos, second, writing_seq)) { + derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos + << ") failed" << dendl; + ceph_abort(); + } + } else { + // header too? + if (hbp.length()) { + bufferlist hbl; + hbl.push_back(hbp); + loff_t pos = 0; + if (write_aio_bl(pos, hbl, 0)) { + derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl; + ceph_abort(); + } + } + + if (write_aio_bl(pos, bl, writing_seq)) { + derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos + << ") failed" << dendl; + ceph_abort(); + } + } + + write_pos = pos; + if (write_pos == header.max_size) + write_pos = get_top(); + assert(write_pos % header.alignment == 0); +} + +/** + * write a buffer using aio + * + * @param seq seq to trigger when this aio completes. if 0, do not update any state + * on completion. 
+ */ +int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq) +{ + align_bl(pos, bl); + + dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl; + + while (bl.length() > 0) { + int max = MIN(bl.buffers().size(), IOV_MAX-1); + iovec *iov = new iovec[max]; + int n = 0; + unsigned len = 0; + for (std::list::const_iterator p = bl.buffers().begin(); + n < max; + ++p, ++n) { + assert(p != bl.buffers().end()); + iov[n].iov_base = (void *)p->c_str(); + iov[n].iov_len = p->length(); + len += p->length(); + } + + bufferlist tbl; + bl.splice(0, len, &tbl); // move bytes from bl -> tbl + + // lock only aio_queue, current aio, aio_num, aio_bytes, which may be + // modified in check_aio_completion + aio_lock.Lock(); + aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq)); + aio_info& aio = aio_queue.back(); + aio.iov = iov; + + io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos); + + dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len + << " in " << n << dendl; + + aio_num++; + aio_bytes += aio.len; + + // need to save current aio len to update write_pos later because current + // aio could be ereased from aio_queue once it is done + uint64_t cur_len = aio.len; + // unlock aio_lock because following io_submit might take time to return + aio_lock.Unlock(); + + iocb *piocb = &aio.iocb; + int attempts = 10; + do { + int r = io_submit(aio_ctx, 1, &piocb); + dout(20) << "write_aio_bl io_submit return value: " << r << dendl; + if (r < 0) { + derr << "io_submit to " << aio.off << "~" << cur_len + << " got " << cpp_strerror(r) << dendl; + if (r == -EAGAIN && attempts-- > 0) { + usleep(500); + continue; + } + assert(0 == "io_submit got unexpected error"); + } else { + break; + } + } while (true); + pos += cur_len; + } + aio_lock.Lock(); + write_finish_cond.Signal(); + aio_lock.Unlock(); + return 0; +} +#endif + +void FileJournal::write_finish_thread_entry() +{ +#ifdef HAVE_LIBAIO + dout(10) << "write_finish_thread_entry 
enter" << dendl; + while (true) { + { + Mutex::Locker locker(aio_lock); + if (aio_queue.empty()) { + if (aio_stop) + break; + dout(20) << "write_finish_thread_entry sleeping" << dendl; + write_finish_cond.Wait(aio_lock); + continue; + } + } + + dout(20) << "write_finish_thread_entry waiting for aio(s)" << dendl; + io_event event[16]; + int r = io_getevents(aio_ctx, 1, 16, event, NULL); + if (r < 0) { + if (r == -EINTR) { + dout(0) << "io_getevents got " << cpp_strerror(r) << dendl; + continue; + } + derr << "io_getevents got " << cpp_strerror(r) << dendl; + assert(0 == "got unexpected error from io_getevents"); + } + + { + Mutex::Locker locker(aio_lock); + for (int i=0; ilen) { + derr << "aio to " << ai->off << "~" << ai->len + << " wrote " << event[i].res << dendl; + assert(0 == "unexpected aio error"); + } + dout(10) << "write_finish_thread_entry aio " << ai->off + << "~" << ai->len << " done" << dendl; + ai->done = true; + } + check_aio_completion(); + } + } + dout(10) << "write_finish_thread_entry exit" << dendl; +#endif +} + +#ifdef HAVE_LIBAIO +/** + * check aio_wait for completed aio, and update state appropriately. + */ +void FileJournal::check_aio_completion() +{ + assert(aio_lock.is_locked()); + dout(20) << "check_aio_completion" << dendl; + + bool completed_something = false, signal = false; + uint64_t new_journaled_seq = 0; + + list::iterator p = aio_queue.begin(); + while (p != aio_queue.end() && p->done) { + dout(20) << "check_aio_completion completed seq " << p->seq << " " + << p->off << "~" << p->len << dendl; + if (p->seq) { + new_journaled_seq = p->seq; + completed_something = true; + } + aio_num--; + aio_bytes -= p->len; + aio_queue.erase(p++); + signal = true; + } + + if (completed_something) { + // kick finisher? + // only if we haven't filled up recently! 
+ Mutex::Locker locker(finisher_lock); + journaled_seq = new_journaled_seq; + if (full_state != FULL_NOTFULL) { + dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq + << ", full_commit_seq|full_restart_seq" << dendl; + } else { + if (plug_journal_completions) { + dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq + << " due to completion plug" << dendl; + } else { + dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl; + queue_completions_thru(journaled_seq); + } + } + } + if (signal) { + // maybe write queue was waiting for aio count to drop? + aio_cond.Signal(); + } +} +#endif + +int FileJournal::prepare_entry(list& tls, bufferlist* tbl) { + dout(10) << "prepare_entry " << tls << dendl; + unsigned data_len = 0; + int data_align = -1; // -1 indicates that we don't care about the alignment + bufferlist bl; + for (list::iterator p = tls.begin(); + p != tls.end(); ++p) { + ObjectStore::Transaction *t = *p; + if (t->get_data_length() > data_len && + (int)t->get_data_length() >= g_conf->journal_align_min_size) { + data_len = t->get_data_length(); + data_align = (t->get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK; + } + ::encode(*t, bl); + } + if (tbl->length()) { + bl.claim_append(*tbl); + } + // add this entry + entry_header_t h; + unsigned head_size = sizeof(entry_header_t); + off64_t base_size = 2*head_size + bl.length(); + memset(&h, 0, sizeof(h)); + if (data_align >= 0) + h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK; + off64_t size = ROUND_UP_TO(base_size + h.pre_pad, header.alignment); + unsigned post_pad = size - base_size - h.pre_pad; + h.len = bl.length(); + h.post_pad = post_pad; + h.crc32c = bl.crc32c(0); + dout(10) << " len " << bl.length() << " -> " << size + << " (head " << head_size << " pre_pad " << h.pre_pad + << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")" + << " 
(bl alignment " << data_align << ")" + << dendl; + bufferlist ebl; + // header + ebl.append((const char*)&h, sizeof(h)); + if (h.pre_pad) { + ebl.push_back(buffer::create_static(h.pre_pad, zero_buf)); + } + // payload + ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy + if (h.post_pad) { + ebl.push_back(buffer::create_static(h.post_pad, zero_buf)); + } + // footer + ebl.append((const char*)&h, sizeof(h)); + ebl.rebuild_aligned(CEPH_MINIMUM_BLOCK_SIZE); + tbl->claim(ebl); + return h.len; +} + +void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len, + Context *oncommit, TrackedOpRef osd_op) +{ + // dump on queue + dout(5) << "submit_entry seq " << seq + << " len " << e.length() + << " (" << oncommit << ")" << dendl; + assert(e.length() > 0); + + throttle_ops.take(1); + throttle_bytes.take(orig_len); + if (osd_op) + osd_op->mark_event("commit_queued_for_journal_write"); + if (logger) { + logger->set(l_os_jq_max_ops, throttle_ops.get_max()); + logger->set(l_os_jq_max_bytes, throttle_bytes.get_max()); + logger->set(l_os_jq_ops, throttle_ops.get_current()); + logger->set(l_os_jq_bytes, throttle_bytes.get_current()); + } + + { + Mutex::Locker l1(writeq_lock); // ** lock ** + Mutex::Locker l2(completions_lock); // ** lock ** + completions.push_back( + completion_item( + seq, oncommit, ceph_clock_now(g_ceph_context), osd_op)); + if (writeq.empty()) + writeq_cond.Signal(); + writeq.push_back(write_item(seq, e, orig_len, osd_op)); + } +} + +bool FileJournal::writeq_empty() +{ + Mutex::Locker locker(writeq_lock); + return writeq.empty(); +} + +FileJournal::write_item &FileJournal::peek_write() +{ + assert(write_lock.is_locked()); + Mutex::Locker locker(writeq_lock); + return writeq.front(); +} + +void FileJournal::pop_write() +{ + assert(write_lock.is_locked()); + Mutex::Locker locker(writeq_lock); + writeq.pop_front(); +} + +void FileJournal::batch_pop_write(list &items) +{ + assert(write_lock.is_locked()); + 
Mutex::Locker locker(writeq_lock); + writeq.swap(items); +} + +void FileJournal::batch_unpop_write(list &items) +{ + assert(write_lock.is_locked()); + Mutex::Locker locker(writeq_lock); + writeq.splice(writeq.begin(), items); +} + +void FileJournal::commit_start(uint64_t seq) +{ + dout(10) << "commit_start" << dendl; + + // was full? + switch (full_state) { + case FULL_NOTFULL: + break; // all good + + case FULL_FULL: + if (seq >= journaled_seq) { + dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq " + << seq << " > journaled_seq " << journaled_seq + << ", moving to FULL_WAIT." + << dendl; + full_state = FULL_WAIT; + } else { + dout(1) << "FULL_FULL commit_start on seq " + << seq << " < journaled_seq " << journaled_seq + << ", remaining in FULL_FULL" + << dendl; + } + break; + + case FULL_WAIT: + dout(1) << " FULL_WAIT -> FULL_NOTFULL. journal now active, setting completion plug." << dendl; + full_state = FULL_NOTFULL; + plug_journal_completions = true; + break; + } +} + +/* + * send discard command to journal block device + */ +void FileJournal::do_discard(int64_t offset, int64_t end) +{ + dout(10) << __func__ << "trim(" << offset << ", " << end << dendl; + + offset = ROUND_UP_TO(offset, block_size); + if (offset >= end) + return; + end = ROUND_UP_TO(end - block_size, block_size); + assert(end >= offset); + if (offset < end) + if (block_device_discard(fd, offset, end - offset) < 0) + dout(1) << __func__ << "ioctl(BLKDISCARD) error:" << cpp_strerror(errno) << dendl; +} + +void FileJournal::committed_thru(uint64_t seq) +{ + Mutex::Locker locker(write_lock); + + if (seq < last_committed_seq) { + dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl; + assert(seq >= last_committed_seq); + return; + } + if (seq == last_committed_seq) { + dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl; + return; + } + + dout(5) << "committed_thru " << seq << " (last_committed_seq " << 
last_committed_seq << ")" << dendl; + last_committed_seq = seq; + + // completions! + { + Mutex::Locker locker(finisher_lock); + queue_completions_thru(seq); + if (plug_journal_completions && seq >= header.start_seq) { + dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl; + plug_journal_completions = false; + queue_completions_thru(journaled_seq); + } + } + + // adjust start pointer + while (!journalq.empty() && journalq.front().first <= seq) { + journalq.pop_front(); + } + + int64_t old_start = header.start; + if (!journalq.empty()) { + header.start = journalq.front().second; + header.start_seq = journalq.front().first; + } else { + header.start = write_pos; + header.start_seq = seq + 1; + } + + if (discard) { + dout(10) << __func__ << " will trim (" << old_start << ", " << header.start << ")" << dendl; + if (old_start < header.start) + do_discard(old_start, header.start - 1); + else { + do_discard(old_start, header.max_size - 1); + do_discard(get_top(), header.start - 1); + } + } + + must_write_header = true; + print_header(header); + + // committed but unjournaled items + while (!writeq_empty() && peek_write().seq <= seq) { + dout(15) << " dropping committed but unwritten seq " << peek_write().seq + << " len " << peek_write().bl.length() + << dendl; + put_throttle(1, peek_write().orig_len); + pop_write(); + } + + commit_cond.Signal(); + + dout(10) << "committed_thru done" << dendl; +} + + +void FileJournal::put_throttle(uint64_t ops, uint64_t bytes) +{ + uint64_t new_ops = throttle_ops.put(ops); + uint64_t new_bytes = throttle_bytes.put(bytes); + dout(5) << "put_throttle finished " << ops << " ops and " + << bytes << " bytes, now " + << new_ops << " ops and " << new_bytes << " bytes" + << dendl; + + if (logger) { + logger->inc(l_os_j_ops, ops); + logger->inc(l_os_j_bytes, bytes); + logger->set(l_os_jq_ops, new_ops); + logger->set(l_os_jq_bytes, new_bytes); + logger->set(l_os_jq_max_ops, 
throttle_ops.get_max()); + logger->set(l_os_jq_max_bytes, throttle_bytes.get_max()); + } +} + +int FileJournal::make_writeable() +{ + dout(10) << __func__ << dendl; + int r = _open(true); + if (r < 0) + return r; + + if (read_pos > 0) + write_pos = read_pos; + else + write_pos = get_top(); + read_pos = 0; + + must_write_header = true; + start_writer(); + return 0; +} + +void FileJournal::wrap_read_bl( + off64_t pos, + int64_t olen, + bufferlist* bl, + off64_t *out_pos + ) const +{ + while (olen > 0) { + while (pos >= header.max_size) + pos = pos + get_top() - header.max_size; + + int64_t len; + if (pos + olen > header.max_size) + len = header.max_size - pos; // partial + else + len = olen; // rest + + int64_t actual = ::lseek64(fd, pos, SEEK_SET); + assert(actual == pos); + + bufferptr bp = buffer::create(len); + int r = safe_read_exact(fd, bp.c_str(), len); + if (r) { + derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << len << " returned " + << r << dendl; + ceph_abort(); + } + bl->push_back(bp); + pos += len; + olen -= len; + } + if (pos >= header.max_size) + pos = pos + get_top() - header.max_size; + if (out_pos) + *out_pos = pos; +} + +bool FileJournal::read_entry( + bufferlist &bl, + uint64_t &next_seq, + bool *corrupt) +{ + if (corrupt) + *corrupt = false; + uint64_t seq = next_seq; + + if (!read_pos) { + dout(2) << "read_entry -- not readable" << dendl; + return false; + } + + off64_t pos = read_pos; + off64_t next_pos = pos; + stringstream ss; + read_entry_result result = do_read_entry( + pos, + &next_pos, + &bl, + &seq, + &ss); + if (result == SUCCESS) { + journalq.push_back( pair(seq, pos)); + if (next_seq > seq) { + return false; + } else { + read_pos = next_pos; + next_seq = seq; + if (seq > journaled_seq) + journaled_seq = seq; + return true; + } + } + + if (seq && seq < header.committed_up_to) { + derr << "Unable to read past sequence " << seq + << " but header indicates the journal has committed up through " + << 
header.committed_up_to << ", journal is corrupt" << dendl; + if (g_conf->journal_ignore_corruption) { + if (corrupt) + *corrupt = true; + return false; + } else { + assert(0); + } + } + + dout(25) << ss.str() << dendl; + dout(2) << "No further valid entries found, journal is most likely valid" + << dendl; + return false; +} + +FileJournal::read_entry_result FileJournal::do_read_entry( + off64_t init_pos, + off64_t *next_pos, + bufferlist *bl, + uint64_t *seq, + ostream *ss, + entry_header_t *_h) const +{ + off64_t cur_pos = init_pos; + bufferlist _bl; + if (!bl) + bl = &_bl; + + // header + entry_header_t *h; + bufferlist hbl; + off64_t _next_pos; + wrap_read_bl(cur_pos, sizeof(*h), &hbl, &_next_pos); + h = reinterpret_cast(hbl.c_str()); + + if (!h->check_magic(cur_pos, header.get_fsid64())) { + dout(25) << "read_entry " << init_pos + << " : bad header magic, end of journal" << dendl; + if (ss) + *ss << "bad header magic"; + if (next_pos) + *next_pos = init_pos + (4<<10); // check 4k ahead + return MAYBE_CORRUPT; + } + cur_pos = _next_pos; + + // pad + body + pad + if (h->pre_pad) + cur_pos += h->pre_pad; + + bl->clear(); + wrap_read_bl(cur_pos, h->len, bl, &cur_pos); + + if (h->post_pad) + cur_pos += h->post_pad; + + // footer + entry_header_t *f; + bufferlist fbl; + wrap_read_bl(cur_pos, sizeof(*f), &fbl, &cur_pos); + f = reinterpret_cast(fbl.c_str()); + if (memcmp(f, h, sizeof(*f))) { + if (ss) + *ss << "bad footer magic, partial entry"; + if (next_pos) + *next_pos = cur_pos; + return MAYBE_CORRUPT; + } + + if ((header.flags & header_t::FLAG_CRC) || // if explicitly enabled (new journal) + h->crc32c != 0) { // newer entry in old journal + uint32_t actual_crc = bl->crc32c(0); + if (actual_crc != h->crc32c) { + if (ss) + *ss << "header crc (" << h->crc32c + << ") doesn't match body crc (" << actual_crc << ")"; + if (next_pos) + *next_pos = cur_pos; + return MAYBE_CORRUPT; + } + } + + // yay! 
+ dout(2) << "read_entry " << init_pos << " : seq " << h->seq + << " " << h->len << " bytes" + << dendl; + + // ok! + if (seq) + *seq = h->seq; + + + if (next_pos) + *next_pos = cur_pos; + + if (_h) + *_h = *h; + + assert(cur_pos % header.alignment == 0); + return SUCCESS; +} + +void FileJournal::throttle() +{ + if (throttle_ops.wait(g_conf->journal_queue_max_ops)) + dout(2) << "throttle: waited for ops" << dendl; + if (throttle_bytes.wait(g_conf->journal_queue_max_bytes)) + dout(2) << "throttle: waited for bytes" << dendl; +} + +void FileJournal::get_header( + uint64_t wanted_seq, + off64_t *_pos, + entry_header_t *h) +{ + off64_t pos = header.start; + off64_t next_pos = pos; + bufferlist bl; + uint64_t seq = 0; + dout(2) << __func__ << dendl; + while (1) { + bl.clear(); + pos = next_pos; + read_entry_result result = do_read_entry( + pos, + &next_pos, + &bl, + &seq, + 0, + h); + if (result == FAILURE || result == MAYBE_CORRUPT) + assert(0); + if (seq == wanted_seq) { + if (_pos) + *_pos = pos; + return; + } + } + assert(0); // not reachable +} + +void FileJournal::corrupt( + int wfd, + off64_t corrupt_at) +{ + dout(2) << __func__ << dendl; + if (corrupt_at >= header.max_size) + corrupt_at = corrupt_at + get_top() - header.max_size; + + int64_t actual = ::lseek64(fd, corrupt_at, SEEK_SET); + assert(actual == corrupt_at); + + char buf[10]; + int r = safe_read_exact(fd, buf, 1); + assert(r == 0); + + actual = ::lseek64(wfd, corrupt_at, SEEK_SET); + assert(actual == corrupt_at); + + buf[0]++; + r = safe_write(wfd, buf, 1); + assert(r == 0); +} + +void FileJournal::corrupt_payload( + int wfd, + uint64_t seq) +{ + dout(2) << __func__ << dendl; + off64_t pos = 0; + entry_header_t h; + get_header(seq, &pos, &h); + off64_t corrupt_at = + pos + sizeof(entry_header_t) + h.pre_pad; + corrupt(wfd, corrupt_at); +} + + +void FileJournal::corrupt_footer_magic( + int wfd, + uint64_t seq) +{ + dout(2) << __func__ << dendl; + off64_t pos = 0; + entry_header_t h; + get_header(seq, 
&pos, &h); + off64_t corrupt_at = + pos + sizeof(entry_header_t) + h.pre_pad + + h.len + h.post_pad + + (reinterpret_cast(&h.magic2) - reinterpret_cast(&h)); + corrupt(wfd, corrupt_at); +} + + +void FileJournal::corrupt_header_magic( + int wfd, + uint64_t seq) +{ + dout(2) << __func__ << dendl; + off64_t pos = 0; + entry_header_t h; + get_header(seq, &pos, &h); + off64_t corrupt_at = + pos + + (reinterpret_cast(&h.magic2) - reinterpret_cast(&h)); + corrupt(wfd, corrupt_at); +} diff --git a/src/os/filestore/FileJournal.h b/src/os/filestore/FileJournal.h new file mode 100644 index 000000000000..69935a61849f --- /dev/null +++ b/src/os/filestore/FileJournal.h @@ -0,0 +1,516 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_FILEJOURNAL_H +#define CEPH_FILEJOURNAL_H + +#include +using std::deque; + +#include "Journal.h" +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/Thread.h" +#include "common/Throttle.h" + +#ifdef HAVE_LIBAIO +# include +#endif + +/** + * Implements journaling on top of block device or file. 
+ * + * Lock ordering is write_lock > aio_lock > finisher_lock + */ +class FileJournal : public Journal { +public: + /// Protected by finisher_lock + struct completion_item { + uint64_t seq; + Context *finish; + utime_t start; + TrackedOpRef tracked_op; + completion_item(uint64_t o, Context *c, utime_t s, + TrackedOpRef opref) + : seq(o), finish(c), start(s), tracked_op(opref) {} + completion_item() : seq(0), finish(0), start(0) {} + }; + struct write_item { + uint64_t seq; + bufferlist bl; + uint32_t orig_len; + TrackedOpRef tracked_op; + write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) : + seq(s), orig_len(ol), tracked_op(opref) { + bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy + } + write_item() : seq(0), orig_len(0) {} + }; + + Mutex finisher_lock; + Cond finisher_cond; + uint64_t journaled_seq; + bool plug_journal_completions; + + Mutex writeq_lock; + Cond writeq_cond; + list writeq; + bool writeq_empty(); + write_item &peek_write(); + void pop_write(); + void batch_pop_write(list &items); + void batch_unpop_write(list &items); + + Mutex completions_lock; + list completions; + bool completions_empty() { + Mutex::Locker l(completions_lock); + return completions.empty(); + } + void batch_pop_completions(list &items) { + Mutex::Locker l(completions_lock); + completions.swap(items); + } + void batch_unpop_completions(list &items) { + Mutex::Locker l(completions_lock); + completions.splice(completions.begin(), items); + } + completion_item completion_peek_front() { + Mutex::Locker l(completions_lock); + assert(!completions.empty()); + return completions.front(); + } + void completion_pop_front() { + Mutex::Locker l(completions_lock); + assert(!completions.empty()); + completions.pop_front(); + } + + int prepare_entry(list& tls, bufferlist* tbl); + + void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len, + Context *oncommit, + TrackedOpRef osd_op = TrackedOpRef()); + /// End protected by finisher_lock + + /* 
+ * journal header + */ + struct header_t { + enum { + FLAG_CRC = (1<<0), + // NOTE: remove kludgey weirdness in read_header() next time a flag is added. + }; + + uint64_t flags; + uuid_d fsid; + __u32 block_size; + __u32 alignment; + int64_t max_size; // max size of journal ring buffer + int64_t start; // offset of first entry + uint64_t committed_up_to; // committed up to + + /** + * start_seq + * + * entry at header.start has sequence >= start_seq + * + * Generally, the entry at header.start will have sequence + * start_seq if it exists. The only exception is immediately + * after journal creation since the first sequence number is + * not known. + * + * If the first read on open fails, we can assume corruption + * if start_seq > committed_up_thru because the entry would have + * a sequence >= start_seq and therefore > committed_up_thru. + */ + uint64_t start_seq; + + header_t() : + flags(0), block_size(0), alignment(0), max_size(0), start(0), + committed_up_to(0), start_seq(0) {} + + void clear() { + start = block_size; + } + + uint64_t get_fsid64() const { + return *(uint64_t*)fsid.bytes(); + } + + void encode(bufferlist& bl) const { + __u32 v = 4; + ::encode(v, bl); + bufferlist em; + { + ::encode(flags, em); + ::encode(fsid, em); + ::encode(block_size, em); + ::encode(alignment, em); + ::encode(max_size, em); + ::encode(start, em); + ::encode(committed_up_to, em); + ::encode(start_seq, em); + } + ::encode(em, bl); + } + void decode(bufferlist::iterator& bl) { + __u32 v; + ::decode(v, bl); + if (v < 2) { // normally 0, but conceivably 1 + // decode old header_t struct (pre v0.40). 
+ bl.advance(4); // skip __u32 flags (it was unused by any old code) + flags = 0; + uint64_t tfsid; + ::decode(tfsid, bl); + *(uint64_t*)&fsid.bytes()[0] = tfsid; + *(uint64_t*)&fsid.bytes()[8] = tfsid; + ::decode(block_size, bl); + ::decode(alignment, bl); + ::decode(max_size, bl); + ::decode(start, bl); + committed_up_to = 0; + start_seq = 0; + return; + } + bufferlist em; + ::decode(em, bl); + bufferlist::iterator t = em.begin(); + ::decode(flags, t); + ::decode(fsid, t); + ::decode(block_size, t); + ::decode(alignment, t); + ::decode(max_size, t); + ::decode(start, t); + + if (v > 2) + ::decode(committed_up_to, t); + else + committed_up_to = 0; + + if (v > 3) + ::decode(start_seq, t); + else + start_seq = 0; + } + } header; + + struct entry_header_t { + uint64_t seq; // fs op seq # + uint32_t crc32c; // payload only. not header, pre_pad, post_pad, or footer. + uint32_t len; + uint32_t pre_pad, post_pad; + uint64_t magic1; + uint64_t magic2; + + static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) { + return (fsid ^ seq ^ len); + } + bool check_magic(off64_t pos, uint64_t fsid) { + return + magic1 == (uint64_t)pos && + magic2 == (fsid ^ seq ^ len); + } + } __attribute__((__packed__, aligned(4))); + + bool journalq_empty() { return journalq.empty(); } + +private: + string fn; + + char *zero_buf; + off64_t max_size; + size_t block_size; + bool directio, aio, force_aio; + bool must_write_header; + off64_t write_pos; // byte where the next entry to be written will go + off64_t read_pos; // + bool discard; //for block journal whether support discard + +#ifdef HAVE_LIBAIO + /// state associated with an in-flight aio request + /// Protected by aio_lock + struct aio_info { + struct iocb iocb; + bufferlist bl; + struct iovec *iov; + bool done; + uint64_t off, len; ///< these are for debug only + uint64_t seq; ///< seq number to complete on aio completion, if non-zero + + aio_info(bufferlist& b, uint64_t o, uint64_t s) + : iov(NULL), done(false), off(o), 
len(b.length()), seq(s) { + bl.claim(b); + memset((void*)&iocb, 0, sizeof(iocb)); + } + ~aio_info() { + delete[] iov; + } + }; + Mutex aio_lock; + Cond aio_cond; + Cond write_finish_cond; + io_context_t aio_ctx; + list aio_queue; + int aio_num, aio_bytes; + /// End protected by aio_lock +#endif + + uint64_t last_committed_seq; + uint64_t journaled_since_start; + + /* + * full states cycle at the beginning of each commit epoch, when commit_start() + * is called. + * FULL - we just filled up during this epoch. + * WAIT - we filled up last epoch; now we have to wait until everything during + * that epoch commits to the fs before we can start writing over it. + * NOTFULL - all good, journal away. + */ + enum { + FULL_NOTFULL = 0, + FULL_FULL = 1, + FULL_WAIT = 2, + } full_state; + + int fd; + + // in journal + deque > journalq; // track seq offsets, so we can trim later. + uint64_t writing_seq; + + + // throttle + Throttle throttle_ops, throttle_bytes; + + void put_throttle(uint64_t ops, uint64_t bytes); + + // write thread + Mutex write_lock; + bool write_stop; + bool aio_stop; + + Cond commit_cond; + + int _open(bool wr, bool create=false); + int _open_block_device(); + void _close(int fd) const; + void _check_disk_write_cache() const; + int _open_file(int64_t oldsize, blksize_t blksize, bool create); + int _dump(ostream& out, bool simple); + void print_header(const header_t &hdr) const; + int read_header(header_t *hdr) const; + bufferptr prepare_header(); + void start_writer(); + void stop_writer(); + void write_thread_entry(); + + void queue_completions_thru(uint64_t seq); + + int check_for_full(uint64_t seq, off64_t pos, off64_t size); + int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytee); + int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, + uint64_t& orig_ops, uint64_t& orig_bytes); + void do_write(bufferlist& bl); + + void write_finish_thread_entry(); + void check_aio_completion(); + void 
do_aio_write(bufferlist& bl); + int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq); + + + void align_bl(off64_t pos, bufferlist& bl); + int write_bl(off64_t& pos, bufferlist& bl); + + /// read len from journal starting at in_pos and wrapping up to len + void wrap_read_bl( + off64_t in_pos, ///< [in] start position + int64_t len, ///< [in] length to read + bufferlist* bl, ///< [out] result + off64_t *out_pos ///< [out] next position to read, will be wrapped + ) const; + + void do_discard(int64_t offset, int64_t end); + + class Writer : public Thread { + FileJournal *journal; + public: + Writer(FileJournal *fj) : journal(fj) {} + void *entry() { + journal->write_thread_entry(); + return 0; + } + } write_thread; + + class WriteFinisher : public Thread { + FileJournal *journal; + public: + WriteFinisher(FileJournal *fj) : journal(fj) {} + void *entry() { + journal->write_finish_thread_entry(); + return 0; + } + } write_finish_thread; + + off64_t get_top() const { + return ROUND_UP_TO(sizeof(header), block_size); + } + + public: + FileJournal(uuid_d fsid, Finisher *fin, Cond *sync_cond, const char *f, bool dio=false, bool ai=true, bool faio=false) : + Journal(fsid, fin, sync_cond), + finisher_lock("FileJournal::finisher_lock", false, true, false, g_ceph_context), + journaled_seq(0), + plug_journal_completions(false), + writeq_lock("FileJournal::writeq_lock", false, true, false, g_ceph_context), + completions_lock( + "FileJournal::completions_lock", false, true, false, g_ceph_context), + fn(f), + zero_buf(NULL), + max_size(0), block_size(0), + directio(dio), aio(ai), force_aio(faio), + must_write_header(false), + write_pos(0), read_pos(0), + discard(false), +#ifdef HAVE_LIBAIO + aio_lock("FileJournal::aio_lock"), + aio_ctx(0), + aio_num(0), aio_bytes(0), +#endif + last_committed_seq(0), + journaled_since_start(0), + full_state(FULL_NOTFULL), + fd(-1), + writing_seq(0), + throttle_ops(g_ceph_context, "journal_ops", g_conf->journal_queue_max_ops), + 
throttle_bytes(g_ceph_context, "journal_bytes", g_conf->journal_queue_max_bytes), + write_lock("FileJournal::write_lock", false, true, false, g_ceph_context), + write_stop(true), + aio_stop(true), + write_thread(this), + write_finish_thread(this) { + + if (aio && !directio) { + derr << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl; + aio = false; + } +#ifndef HAVE_LIBAIO + if (aio) { + derr << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl; + aio = false; + } +#endif + } + ~FileJournal() { + assert(fd == -1); + delete[] zero_buf; + } + + int check(); + int create(); + int open(uint64_t fs_op_seq); + void close(); + int peek_fsid(uuid_d& fsid); + + int dump(ostream& out); + int simple_dump(ostream& out); + int _fdump(Formatter &f, bool simple); + + void flush(); + + void throttle(); + + bool is_writeable() { + return read_pos == 0; + } + int make_writeable(); + + // writes + void commit_start(uint64_t seq); + void committed_thru(uint64_t seq); + bool should_commit_now() { + return full_state != FULL_NOTFULL && !write_stop; + } + + void write_header_sync(); + + void set_wait_on_full(bool b) { wait_on_full = b; } + + // reads + + /// Result code for read_entry + enum read_entry_result { + SUCCESS, + FAILURE, + MAYBE_CORRUPT + }; + + /** + * read_entry + * + * Reads next entry starting at pos. If the entry appears + * clean, *bl will contain the payload, *seq will contain + * the sequence number, and *out_pos will reflect the next + * read position. If the entry is invalid *ss will contain + * debug text, while *seq, *out_pos, and *bl will be unchanged. + * + * If the entry suggests a corrupt log, *ss will contain debug + * text, *out_pos will contain the next index to check. If + * we find an entry in this way that returns SUCCESS, the journal + * is most likely corrupt. 
+ */ + read_entry_result do_read_entry( + off64_t pos, ///< [in] position to read + off64_t *next_pos, ///< [out] next position to read + bufferlist* bl, ///< [out] payload for successful read + uint64_t *seq, ///< [out] seq of successful read + ostream *ss, ///< [out] error output + entry_header_t *h = 0 ///< [out] header + ) const; ///< @return result code + + bool read_entry( + bufferlist &bl, + uint64_t &last_seq, + bool *corrupt + ); + + bool read_entry( + bufferlist &bl, + uint64_t &last_seq) { + return read_entry(bl, last_seq, 0); + } + + // Debug/Testing + void get_header( + uint64_t wanted_seq, + off64_t *_pos, + entry_header_t *h); + void corrupt( + int wfd, + off64_t corrupt_at); + void corrupt_payload( + int wfd, + uint64_t seq); + void corrupt_footer_magic( + int wfd, + uint64_t seq); + void corrupt_header_magic( + int wfd, + uint64_t seq); +}; + +WRITE_CLASS_ENCODER(FileJournal::header_t) + +#endif diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc new file mode 100644 index 000000000000..de2df61a0c4c --- /dev/null +++ b/src/os/filestore/FileStore.cc @@ -0,0 +1,5615 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (c) 2015 Hewlett-Packard Development Company, L.P. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include "include/int_types.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#endif + +#include +#include + +#include "include/compat.h" +#include "include/linux_fiemap.h" + +#include "common/xattr.h" +#include "chain_xattr.h" + +#if defined(DARWIN) || defined(__FreeBSD__) +#include +#include +#endif // DARWIN + + +#include +#include + +#include "FileStore.h" +#include "GenericFileStoreBackend.h" +#include "BtrfsFileStoreBackend.h" +#include "XfsFileStoreBackend.h" +#include "ZFSFileStoreBackend.h" +#include "common/BackTrace.h" +#include "include/types.h" +#include "FileJournal.h" + +#include "osd/osd_types.h" +#include "include/color.h" +#include "include/buffer.h" + +#include "common/Timer.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/run_cmd.h" +#include "common/safe_io.h" +#include "common/perf_counters.h" +#include "common/sync_filesystem.h" +#include "common/fd.h" +#include "HashIndex.h" +#include "DBObjectMap.h" +#include "kv/KeyValueDB.h" + +#include "common/ceph_crypto.h" +using ceph::crypto::SHA1; + +#include "include/assert.h" + +#include "common/config.h" +#include "common/blkdev.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/objectstore.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) 
+#endif + +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "filestore(" << basedir << ") " + +#define COMMIT_SNAP_ITEM "snap_%llu" +#define CLUSTER_SNAP_ITEM "clustersnap_%s" + +#define REPLAY_GUARD_XATTR "user.cephos.seq" +#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq" + +// XATTR_SPILL_OUT_NAME as a xattr is used to maintain that indicates whether +// xattrs spill over into DBObjectMap, if XATTR_SPILL_OUT_NAME exists in file +// xattrs and the value is "no", it indicates no xattrs in DBObjectMap +#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out" +#define XATTR_NO_SPILL_OUT "0" +#define XATTR_SPILL_OUT "1" + +//Initial features in new superblock. +static CompatSet get_fs_initial_compat_set() { + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} + +//Features are added here that this FileStore supports. 
+static CompatSet get_fs_supported_compat_set() { + CompatSet compat = get_fs_initial_compat_set(); + //Any features here can be set in code, but not in initial superblock + compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); + return compat; +} + +int FileStore::get_block_device_fsid(const string& path, uuid_d *fsid) +{ + // make sure we don't try to use aio or direct_io (and get annoying + // error messages from failing to do so); performance implications + // should be irrelevant for this use + FileJournal j(*fsid, 0, 0, path.c_str(), false, false); + return j.peek_fsid(*fsid); +} + +void FileStore::FSPerfTracker::update_from_perfcounters( + PerfCounters &logger) +{ + os_commit_latency.consume_next( + logger.get_tavg_ms( + l_os_j_lat)); + os_apply_latency.consume_next( + logger.get_tavg_ms( + l_os_apply_lat)); +} + + +ostream& operator<<(ostream& out, const FileStore::OpSequencer& s) +{ + assert(&out); + return out << *s.parent; +} + +int FileStore::get_cdir(coll_t cid, char *s, int len) +{ + const string &cid_str(cid.to_str()); + return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_str.c_str()); +} + +int FileStore::get_index(coll_t cid, Index *index) +{ + int r = index_manager.get_index(cid, basedir, index); + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +int FileStore::init_index(coll_t cid) +{ + char path[PATH_MAX]; + get_cdir(cid, path, sizeof(path)); + int r = index_manager.init_index(cid, path, target_version); + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path) +{ + IndexedPath path2; + if (!path) + path = &path2; + int r, exist; + assert(NULL != index.index); + r = (index.index)->lookup(oid, path, &exist); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (!exist) + return -ENOENT; + return 0; +} + +int FileStore::lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length) +{ + FDRef fd; + int 
r = lfn_open(cid, oid, false, &fd); + if (r < 0) + return r; + r = ::ftruncate(**fd, length); + if (r < 0) + r = -errno; + if (r >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_truncate(**fd, length); + assert(rc >= 0); + } + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +int FileStore::lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf) +{ + IndexedPath path; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + + r = lfn_find(oid, index, &path); + if (r < 0) + return r; + r = ::stat(path->path(), buf); + if (r < 0) + r = -errno; + return r; +} + +int FileStore::lfn_open(coll_t cid, + const ghobject_t& oid, + bool create, + FDRef *outfd, + Index *index) +{ + assert(outfd); + int r = 0; + bool need_lock = true; + int flags = O_RDWR; + + if (create) + flags |= O_CREAT; + + Index index2; + if (!index) { + index = &index2; + } + if (!((*index).index)) { + r = get_index(cid, index); + if (r < 0) { + dout(10) << __func__ << " could not get index r = " << r << dendl; + return r; + } + } else { + need_lock = false; + } + + int fd, exist; + assert(NULL != (*index).index); + if (need_lock) { + ((*index).index)->access_lock.get_write(); + } + if (!replaying) { + *outfd = fdcache.lookup(oid); + if (*outfd) { + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + return 0; + } + } + + + IndexedPath path2; + IndexedPath *path = &path2; + + r = (*index)->lookup(oid, path, &exist); + if (r < 0) { + derr << "could not find " << oid << " in index: " + << cpp_strerror(-r) << dendl; + goto fail; + } + + r = ::open((*path)->path(), flags, 0644); + if (r < 0) { + r = -errno; + dout(10) << "error opening file " << (*path)->path() << " with flags=" + << flags << ": " << cpp_strerror(-r) << dendl; + goto fail; + } + fd = r; + if (create && (!exist)) { + r = (*index)->created(oid, (*path)->path()); + if (r < 0) { + 
VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "error creating " << oid << " (" << (*path)->path() + << ") in index: " << cpp_strerror(-r) << dendl; + goto fail; + } + r = chain_fsetxattr(fd, XATTR_SPILL_OUT_NAME, + XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT), true); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path() + << "):" << cpp_strerror(-r) << dendl; + goto fail; + } + } + + if (!replaying) { + bool existed; + *outfd = fdcache.add(oid, fd, &existed); + if (existed) { + TEMP_FAILURE_RETRY(::close(fd)); + } + } else { + *outfd = FDRef(new FDCache::FD(fd)); + } + + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + + return 0; + + fail: + + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +void FileStore::lfn_close(FDRef fd) +{ +} + +int FileStore::lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid) +{ + Index index_new, index_old; + IndexedPath path_new, path_old; + int exist; + int r; + bool index_same = false; + if (c < newcid) { + r = get_index(newcid, &index_new); + if (r < 0) + return r; + r = get_index(c, &index_old); + if (r < 0) + return r; + } else if (c == newcid) { + r = get_index(c, &index_old); + if (r < 0) + return r; + index_new = index_old; + index_same = true; + } else { + r = get_index(c, &index_old); + if (r < 0) + return r; + r = get_index(newcid, &index_new); + if (r < 0) + return r; + } + + assert(NULL != index_old.index); + assert(NULL != index_new.index); + + if (!index_same) { + + RWLock::RLocker l1((index_old.index)->access_lock); + + r = index_old->lookup(o, &path_old, &exist); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (!exist) + return -ENOENT; + + RWLock::WLocker l2((index_new.index)->access_lock); + + r = index_new->lookup(newoid, &path_new, &exist); + if (r < 0) { + 
assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (exist) + return -EEXIST; + + dout(25) << "lfn_link path_old: " << path_old << dendl; + dout(25) << "lfn_link path_new: " << path_new << dendl; + r = ::link(path_old->path(), path_new->path()); + if (r < 0) + return -errno; + + r = index_new->created(newoid, path_new->path()); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + } else { + RWLock::WLocker l1((index_old.index)->access_lock); + + r = index_old->lookup(o, &path_old, &exist); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (!exist) + return -ENOENT; + + r = index_new->lookup(newoid, &path_new, &exist); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (exist) + return -EEXIST; + + dout(25) << "lfn_link path_old: " << path_old << dendl; + dout(25) << "lfn_link path_new: " << path_new << dendl; + r = ::link(path_old->path(), path_new->path()); + if (r < 0) + return -errno; + + // make sure old fd for unlinked/overwritten file is gone + fdcache.clear(newoid); + + r = index_new->created(newoid, path_new->path()); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + } + return 0; +} + +int FileStore::lfn_unlink(coll_t cid, const ghobject_t& o, + const SequencerPosition &spos, + bool force_clear_omap) +{ + Index index; + int r = get_index(cid, &index); + if (r < 0) { + dout(25) << __func__ << " get_index failed " << cpp_strerror(r) << dendl; + return r; + } + + assert(NULL != index.index); + RWLock::WLocker l((index.index)->access_lock); + + { + IndexedPath path; + int hardlink; + r = index->lookup(o, &path, &hardlink); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + + if (!force_clear_omap) { + if (hardlink == 0) { + wbthrottle.clear_object(o); // should be only non-cache ref + fdcache.clear(o); + return 0; + } else if (hardlink == 1) { + force_clear_omap = true; + } + } + if (force_clear_omap) { 
+ dout(20) << __func__ << ": clearing omap on " << o + << " in cid " << cid << dendl; + r = object_map->clear(o, &spos); + if (r < 0 && r != -ENOENT) { + dout(25) << __func__ << " omap clear failed " << cpp_strerror(r) << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (g_conf->filestore_debug_inject_read_err) { + debug_obj_on_delete(o); + } + wbthrottle.clear_object(o); // should be only non-cache ref + fdcache.clear(o); + } else { + /* Ensure that replay of this op doesn't result in the object_map + * going away. + */ + if (!backend->can_checkpoint()) + object_map->sync(&o, &spos); + } + } + r = index->unlink(o); + if (r < 0) { + dout(25) << __func__ << " index unlink failed " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbits_t flags, const char *name, bool do_update) : + JournalingObjectStore(base), + internal_name(name), + basedir(base), journalpath(jdev), + generic_flags(flags), + blk_size(0), + fsid_fd(-1), op_fd(-1), + basedir_fd(-1), current_fd(-1), + backend(NULL), + index_manager(do_update), + lock("FileStore::lock"), + force_sync(false), + sync_entry_timeo_lock("sync_entry_timeo_lock"), + timer(g_ceph_context, sync_entry_timeo_lock), + stop(false), sync_thread(this), + fdcache(g_ceph_context), + wbthrottle(g_ceph_context), + next_osr_id(0), + throttle_ops(g_ceph_context, "filestore_ops",g_conf->filestore_queue_max_ops), + throttle_bytes(g_ceph_context, "filestore_bytes",g_conf->filestore_queue_max_bytes), + m_ondisk_finisher_num(g_conf->filestore_ondisk_finisher_threads), + m_apply_finisher_num(g_conf->filestore_apply_finisher_threads), + op_tp(g_ceph_context, "FileStore::op_tp", g_conf->filestore_op_threads, "filestore_op_threads"), + op_wq(this, g_conf->filestore_op_thread_timeout, + g_conf->filestore_op_thread_suicide_timeout, &op_tp), + logger(NULL), + read_error_lock("FileStore::read_error_lock"), + 
m_filestore_commit_timeout(g_conf->filestore_commit_timeout), + m_filestore_journal_parallel(g_conf->filestore_journal_parallel ), + m_filestore_journal_trailing(g_conf->filestore_journal_trailing), + m_filestore_journal_writeahead(g_conf->filestore_journal_writeahead), + m_filestore_fiemap_threshold(g_conf->filestore_fiemap_threshold), + m_filestore_max_sync_interval(g_conf->filestore_max_sync_interval), + m_filestore_min_sync_interval(g_conf->filestore_min_sync_interval), + m_filestore_fail_eio(g_conf->filestore_fail_eio), + m_filestore_fadvise(g_conf->filestore_fadvise), + do_update(do_update), + m_journal_dio(g_conf->journal_dio), + m_journal_aio(g_conf->journal_aio), + m_journal_force_aio(g_conf->journal_force_aio), + m_osd_rollback_to_cluster_snap(g_conf->osd_rollback_to_cluster_snap), + m_osd_use_stale_snap(g_conf->osd_use_stale_snap), + m_filestore_queue_max_ops(g_conf->filestore_queue_max_ops), + m_filestore_queue_max_bytes(g_conf->filestore_queue_max_bytes), + m_filestore_queue_committing_max_ops(g_conf->filestore_queue_committing_max_ops), + m_filestore_queue_committing_max_bytes(g_conf->filestore_queue_committing_max_bytes), + m_filestore_do_dump(false), + m_filestore_dump_fmt(true), + m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc), + m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size), + m_filestore_max_alloc_hint_size(g_conf->filestore_max_alloc_hint_size), + m_fs_type(0), + m_filestore_max_inline_xattr_size(0), + m_filestore_max_inline_xattrs(0) +{ + m_filestore_kill_at.set(g_conf->filestore_kill_at); + for (int i = 0; i < m_ondisk_finisher_num; ++i) { + ostringstream oss; + oss << "filestore-ondisk-" << i; + Finisher *f = new Finisher(g_ceph_context, oss.str()); + ondisk_finishers.push_back(f); + } + for (int i = 0; i < m_apply_finisher_num; ++i) { + ostringstream oss; + oss << "filestore-apply-" << i; + Finisher *f = new Finisher(g_ceph_context, oss.str()); + apply_finishers.push_back(f); + } + + ostringstream oss; + 
oss << basedir << "/current"; + current_fn = oss.str(); + + ostringstream sss; + sss << basedir << "/current/commit_op_seq"; + current_op_seq_fn = sss.str(); + + ostringstream omss; + omss << basedir << "/current/omap"; + omap_dir = omss.str(); + + // initialize logger + PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_first, l_os_last); + + plb.add_u64(l_os_jq_max_ops, "journal_queue_max_ops", "Max operations in journal queue"); + plb.add_u64(l_os_jq_ops, "journal_queue_ops", "Operations in journal queue"); + plb.add_u64_counter(l_os_j_ops, "journal_ops", "Total journal entries written"); + plb.add_u64(l_os_jq_max_bytes, "journal_queue_max_bytes", "Max data in journal queue"); + plb.add_u64(l_os_jq_bytes, "journal_queue_bytes", "Size of journal queue"); + plb.add_u64_counter(l_os_j_bytes, "journal_bytes", "Total operations size in journal"); + plb.add_time_avg(l_os_j_lat, "journal_latency", "Average journal queue completing latency"); + plb.add_u64_counter(l_os_j_wr, "journal_wr", "Journal write IOs"); + plb.add_u64_avg(l_os_j_wr_bytes, "journal_wr_bytes", "Journal data written"); + plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue"); + plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations in writing to FS queue"); + plb.add_u64_counter(l_os_ops, "ops", "Operations written to store"); + plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue"); + plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of writing to FS queue"); + plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store"); + plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency"); + plb.add_u64(l_os_committing, "committing", "Is currently committing"); + + plb.add_u64_counter(l_os_commit, "commitcycle", "Commit cycles"); + plb.add_time_avg(l_os_commit_len, "commitcycle_interval", "Average interval between commits"); + plb.add_time_avg(l_os_commit_lat, "commitcycle_latency", "Average latency of commit"); + 
plb.add_u64_counter(l_os_j_full, "journal_full", "Journal writes while full"); + plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency"); + + logger = plb.create_perf_counters(); + + g_ceph_context->get_perfcounters_collection()->add(logger); + g_ceph_context->_conf->add_observer(this); + + superblock.compat_features = get_fs_initial_compat_set(); +} + +FileStore::~FileStore() +{ + for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + delete *it; + *it = NULL; + } + for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + delete *it; + *it = NULL; + } + g_ceph_context->_conf->remove_observer(this); + g_ceph_context->get_perfcounters_collection()->remove(logger); + + if (journal) + journal->logger = NULL; + delete logger; + + if (m_filestore_do_dump) { + dump_stop(); + } +} + +static void get_attrname(const char *name, char *buf, int len) +{ + snprintf(buf, len, "user.ceph.%s", name); +} + +bool parse_attrname(char **name) +{ + if (strncmp(*name, "user.ceph.", 10) == 0) { + *name += 10; + return true; + } + return false; +} + +void FileStore::collect_metadata(map *pm) +{ + char partition_path[PATH_MAX]; + char dev_node[PATH_MAX]; + int rc = 0; + + (*pm)["filestore_backend"] = backend->get_name(); + ostringstream ss; + ss << "0x" << std::hex << m_fs_type << std::dec; + (*pm)["filestore_f_type"] = ss.str(); + + if (g_conf->filestore_collect_device_partition_information) { + rc = get_device_by_uuid(get_fsid(), "PARTUUID", partition_path, + dev_node); + } else { + rc = -EINVAL; + } + + switch (rc) { + case -EOPNOTSUPP: + case -EINVAL: + (*pm)["backend_filestore_partition_path"] = "unknown"; + (*pm)["backend_filestore_dev_node"] = "unknown"; + break; + case -ENODEV: + (*pm)["backend_filestore_partition_path"] = string(partition_path); + (*pm)["backend_filestore_dev_node"] = "unknown"; + break; + default: + (*pm)["backend_filestore_partition_path"] = 
string(partition_path); + (*pm)["backend_filestore_dev_node"] = string(dev_node); + } +} + +int FileStore::statfs(struct statfs *buf) +{ + if (::statfs(basedir.c_str(), buf) < 0) { + int r = -errno; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + return 0; +} + + +void FileStore::new_journal() +{ + if (journalpath.length()) { + dout(10) << "open_journal at " << journalpath << dendl; + journal = new FileJournal(fsid, &finisher, &sync_cond, journalpath.c_str(), + m_journal_dio, m_journal_aio, m_journal_force_aio); + if (journal) + journal->logger = logger; + } + return; +} + +int FileStore::dump_journal(ostream& out) +{ + int r; + + if (!journalpath.length()) + return -EINVAL; + + FileJournal *journal = new FileJournal(fsid, &finisher, &sync_cond, journalpath.c_str(), m_journal_dio); + r = journal->dump(out); + delete journal; + return r; +} + +FileStoreBackend *FileStoreBackend::create(long f_type, FileStore *fs) +{ + switch (f_type) { +#if defined(__linux__) + case BTRFS_SUPER_MAGIC: + return new BtrfsFileStoreBackend(fs); +# ifdef HAVE_LIBXFS + case XFS_SUPER_MAGIC: + return new XfsFileStoreBackend(fs); +# endif +#endif +#ifdef HAVE_LIBZFS + case ZFS_SUPER_MAGIC: + return new ZFSFileStoreBackend(fs); +#endif + default: + return new GenericFileStoreBackend(fs); + } +} + +void FileStore::create_backend(long f_type) +{ + m_fs_type = f_type; + + assert(backend == NULL); + backend = FileStoreBackend::create(f_type, this); + + dout(0) << "backend " << backend->get_name() + << " (magic 0x" << std::hex << f_type << std::dec << ")" + << dendl; + + switch (f_type) { +#if defined(__linux__) + case BTRFS_SUPER_MAGIC: + wbthrottle.set_fs(WBThrottle::BTRFS); + break; + + case XFS_SUPER_MAGIC: + // wbthrottle is constructed with fs(WBThrottle::XFS) + break; +#endif + } + + set_xattr_limits_via_conf(); +} + +int FileStore::mkfs() +{ + int ret = 0; + char fsid_fn[PATH_MAX]; + uuid_d old_fsid; + + dout(1) << "mkfs in " << basedir << dendl; + basedir_fd = 
::open(basedir.c_str(), O_RDONLY); + if (basedir_fd < 0) { + ret = -errno; + derr << "mkfs failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl; + return ret; + } + + // open+lock fsid + snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str()); + fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT, 0644); + if (fsid_fd < 0) { + ret = -errno; + derr << "mkfs: failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + if (lock_fsid() < 0) { + ret = -EBUSY; + goto close_fsid_fd; + } + + if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << "mkfs generated fsid " << fsid << dendl; + } else { + dout(1) << "mkfs using provided fsid " << fsid << dendl; + } + + char fsid_str[40]; + fsid.print(fsid_str); + strcat(fsid_str, "\n"); + ret = ::ftruncate(fsid_fd, 0); + if (ret < 0) { + ret = -errno; + derr << "mkfs: failed to truncate fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str)); + if (ret < 0) { + derr << "mkfs: failed to write fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + if (::fsync(fsid_fd) < 0) { + ret = errno; + derr << "mkfs: close failed: can't write fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + dout(10) << "mkfs fsid is " << fsid << dendl; + } else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << "mkfs on-disk fsid " << old_fsid << " != provided " << fsid << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } + fsid = old_fsid; + dout(1) << "mkfs fsid is already set to " << fsid << dendl; + } + + // version stamp + ret = write_version_stamp(); + if (ret < 0) { + derr << "mkfs: write_version_stamp() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + // superblock + superblock.omap_backend = g_conf->filestore_omap_backend; + ret = write_superblock(); + if (ret < 0) { + derr << "mkfs: 
write_superblock() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + struct statfs basefs; + ret = ::fstatfs(basedir_fd, &basefs); + if (ret < 0) { + ret = -errno; + derr << "mkfs cannot fstatfs basedir " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + create_backend(basefs.f_type); + + ret = backend->create_current(); + if (ret < 0) { + derr << "mkfs: failed to create current/ " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + // write initial op_seq + { + uint64_t initial_seq = 0; + int fd = read_op_seq(&initial_seq); + if (fd < 0) { + derr << "mkfs: failed to create " << current_op_seq_fn << ": " + << cpp_strerror(fd) << dendl; + goto close_fsid_fd; + } + if (initial_seq == 0) { + int err = write_op_seq(fd, 1); + if (err < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "mkfs: failed to write to " << current_op_seq_fn << ": " + << cpp_strerror(err) << dendl; + goto close_fsid_fd; + } + + if (backend->can_checkpoint()) { + // create snap_1 too + current_fd = ::open(current_fn.c_str(), O_RDONLY); + assert(current_fd >= 0); + char s[NAME_MAX]; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull); + ret = backend->create_checkpoint(s, NULL); + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + if (ret < 0 && ret != -EEXIST) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "mkfs: failed to create snap_1: " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + } + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir); + if (ret < 0) { + derr << "mkfs failed to create " << g_conf->filestore_omap_backend << dendl; + ret = -1; + goto close_fsid_fd; + } + dout(1) << g_conf->filestore_omap_backend << " db exists/created" << dendl; + + // journal? 
+ ret = mkjournal(); + if (ret) + goto close_fsid_fd; + + ret = write_meta("type", "filestore"); + if (ret) + goto close_fsid_fd; + + dout(1) << "mkfs done in " << basedir << dendl; + ret = 0; + + close_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + close_basedir_fd: + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + delete backend; + backend = NULL; + return ret; +} + +int FileStore::mkjournal() +{ + // read fsid + int ret; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); + int fd = ::open(fn, O_RDONLY, 0644); + if (fd < 0) { + int err = errno; + derr << "FileStore::mkjournal: open error: " << cpp_strerror(err) << dendl; + return -err; + } + ret = read_fsid(fd, &fsid); + if (ret < 0) { + derr << "FileStore::mkjournal: read error: " << cpp_strerror(ret) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return ret; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + + ret = 0; + + new_journal(); + if (journal) { + ret = journal->check(); + if (ret < 0) { + ret = journal->create(); + if (ret) + derr << "mkjournal error creating journal on " << journalpath + << ": " << cpp_strerror(ret) << dendl; + else + dout(0) << "mkjournal created journal on " << journalpath << dendl; + } + delete journal; + journal = 0; + } + return ret; +} + +int FileStore::read_fsid(int fd, uuid_d *uuid) +{ + char fsid_str[40]; + int ret = safe_read(fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) + return ret; + if (ret == 8) { + // old 64-bit fsid... mirror it. 
+ *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str; + *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str; + return 0; + } + + if (ret > 36) + fsid_str[36] = 0; + if (!uuid->parse(fsid_str)) + return -EINVAL; + return 0; +} + +int FileStore::lock_fsid() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + dout(0) << "lock_fsid failed to lock " << basedir << "/fsid, is another ceph-osd still running? " + << cpp_strerror(err) << dendl; + return -err; + } + return 0; +} + +bool FileStore::test_mount_in_use() +{ + dout(5) << "test_mount basedir " << basedir << " journal " << journalpath << dendl; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); + + // verify fs isn't in use + + fsid_fd = ::open(fn, O_RDWR, 0644); + if (fsid_fd < 0) + return 0; // no fsid, ok. + bool inuse = lock_fsid() < 0; + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + return inuse; +} + +int FileStore::_detect_fs() +{ + struct statfs st; + int r = ::fstatfs(basedir_fd, &st); + if (r < 0) + return -errno; + + blk_size = st.f_bsize; + + create_backend(st.f_type); + + r = backend->detect_features(); + if (r < 0) { + derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl; + return r; + } + + // test xattrs + char fn[PATH_MAX]; + int x = rand(); + int y = x+1; + snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str()); + int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC, 0700); + if (tmpfd < 0) { + int ret = -errno; + derr << "_detect_fs unable to create " << fn << ": " << cpp_strerror(ret) << dendl; + return ret; + } + + int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x)); + if (ret >= 0) + ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y)); + if ((ret < 0) || (x != y)) { + derr << "Extended attributes don't appear to work. 
"; + if (ret) + *_dout << "Got error " + cpp_strerror(ret) + ". "; + *_dout << "If you are using ext3 or ext4, be sure to mount the underlying " + << "file system with the 'user_xattr' option." << dendl; + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); + return -ENOTSUP; + } + + char buf[1000]; + memset(buf, 0, sizeof(buf)); // shut up valgrind + chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf)); + ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf)); + if (ret == -ENOSPC) { + dout(0) << "limited size xattrs" << dendl; + } + chain_fremovexattr(tmpfd, "user.test"); + chain_fremovexattr(tmpfd, "user.test2"); + chain_fremovexattr(tmpfd, "user.test3"); + chain_fremovexattr(tmpfd, "user.test4"); + chain_fremovexattr(tmpfd, "user.test5"); + + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); + + return 0; +} + +int FileStore::_sanity_check_fs() +{ + // sanity check(s) + + if (((int)m_filestore_journal_writeahead + + (int)m_filestore_journal_parallel + + (int)m_filestore_journal_trailing) > 1) { + dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl; + cerr << TEXT_RED + << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n" + << " is enabled in ceph.conf. You must choose a single journal mode." + << TEXT_NORMAL << std::endl; + return -EINVAL; + } + + if (!backend->can_checkpoint()) { + if (!journal || !m_filestore_journal_writeahead) { + dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl; + cerr << TEXT_RED + << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n" + << " For non-btrfs volumes, a writeahead journal is required to\n" + << " maintain on-disk consistency in the event of a crash. 
Your conf\n" + << " should include something like:\n" + << " osd journal = /path/to/journal_device_or_file\n" + << " filestore journal writeahead = true\n" + << TEXT_NORMAL; + } + } + + if (!journal) { + dout(0) << "mount WARNING: no journal" << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: No osd journal is configured: write latency may be high.\n" + << " If you will not be using an osd journal, write latency may be\n" + << " relatively high. It can be reduced somewhat by lowering\n" + << " filestore_max_sync_interval, but lower values mean lower write\n" + << " throughput, especially with spinning disks.\n" + << TEXT_NORMAL; + } + + return 0; +} + +int FileStore::write_superblock() +{ + bufferlist bl; + ::encode(superblock, bl); + return safe_write_file(basedir.c_str(), "superblock", + bl.c_str(), bl.length()); +} + +int FileStore::read_superblock() +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "superblock", + bp.c_str(), bp.length()); + if (ret < 0) { + if (ret == -ENOENT) { + // If the file doesn't exist write initial CompatSet + return write_superblock(); + } + return ret; + } + + bufferlist bl; + bl.push_back(bp); + bufferlist::iterator i = bl.begin(); + ::decode(superblock, i); + return 0; +} + +int FileStore::update_version_stamp() +{ + return write_version_stamp(); +} + +int FileStore::version_stamp_is_valid(uint32_t *version) +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "store_version", + bp.c_str(), bp.length()); + if (ret < 0) { + if (ret == -ENOENT) + return 0; + return ret; + } + bufferlist bl; + bl.push_back(bp); + bufferlist::iterator i = bl.begin(); + ::decode(*version, i); + dout(10) << __func__ << " was " << *version << " vs target " + << target_version << dendl; + if (*version == target_version) + return 1; + else + return 0; +} + +int FileStore::write_version_stamp() +{ + dout(1) << __func__ << " " << target_version << dendl; + bufferlist bl; + ::encode(target_version, bl); + + return 
safe_write_file(basedir.c_str(), "store_version", + bl.c_str(), bl.length()); +} + +int FileStore::upgrade() +{ + dout(1) << "upgrade" << dendl; + uint32_t version; + int r = version_stamp_is_valid(&version); + if (r < 0) + return r; + if (r == 1) + return 0; + + if (version < 3) { + derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl; + return -EINVAL; + } + + // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to + // open up DBObjectMap with the do_upgrade flag, which we already did. + update_version_stamp(); + return 0; +} + +int FileStore::read_op_seq(uint64_t *seq) +{ + int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR, 0644); + if (op_fd < 0) { + int r = -errno; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + char s[40]; + memset(s, 0, sizeof(s)); + int ret = safe_read(op_fd, s, sizeof(s) - 1); + if (ret < 0) { + derr << "error reading " << current_op_seq_fn << ": " << cpp_strerror(ret) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(op_fd)); + assert(!m_filestore_fail_eio || ret != -EIO); + return ret; + } + *seq = atoll(s); + return op_fd; +} + +int FileStore::write_op_seq(int fd, uint64_t seq) +{ + char s[30]; + snprintf(s, sizeof(s), "%" PRId64 "\n", seq); + int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0)); + if (ret < 0) { + ret = -errno; + assert(!m_filestore_fail_eio || ret != -EIO); + } + return ret; +} + +int FileStore::mount() +{ + int ret; + char buf[PATH_MAX]; + uint64_t initial_op_seq; + set cluster_snaps; + CompatSet supported_compat_set = get_fs_supported_compat_set(); + + dout(5) << "basedir " << basedir << " journal " << journalpath << dendl; + + // make sure global base dir exists + if (::access(basedir.c_str(), R_OK | W_OK)) { + ret = -errno; + derr << "FileStore::mount: unable to access basedir '" << basedir << "': " + << cpp_strerror(ret) << dendl; + goto done; + } + + // get fsid + snprintf(buf, 
sizeof(buf), "%s/fsid", basedir.c_str()); + fsid_fd = ::open(buf, O_RDWR, 0644); + if (fsid_fd < 0) { + ret = -errno; + derr << "FileStore::mount: error opening '" << buf << "': " + << cpp_strerror(ret) << dendl; + goto done; + } + + ret = read_fsid(fsid_fd, &fsid); + if (ret < 0) { + derr << "FileStore::mount: error reading fsid_fd: " << cpp_strerror(ret) + << dendl; + goto close_fsid_fd; + } + + if (lock_fsid() < 0) { + derr << "FileStore::mount: lock_fsid failed" << dendl; + ret = -EBUSY; + goto close_fsid_fd; + } + + dout(10) << "mount fsid is " << fsid << dendl; + + + uint32_t version_stamp; + ret = version_stamp_is_valid(&version_stamp); + if (ret < 0) { + derr << "FileStore::mount : error in version_stamp_is_valid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } else if (ret == 0) { + if (do_update || (int)version_stamp < g_conf->filestore_update_to) { + derr << "FileStore::mount : stale version stamp detected: " + << version_stamp + << ". Proceeding, do_update " + << "is set, performing disk format upgrade." + << dendl; + do_update = true; + } else { + ret = -EINVAL; + derr << "FileStore::mount : stale version stamp " << version_stamp + << ". 
Please run the FileStore update script before starting the " + << "OSD, or set filestore_update_to to " << target_version + << " (currently " << g_conf->filestore_update_to << ")" + << dendl; + goto close_fsid_fd; + } + } + + ret = read_superblock(); + if (ret < 0) { + ret = -EINVAL; + goto close_fsid_fd; + } + + // Check if this FileStore supports all the necessary features to mount + if (supported_compat_set.compare(superblock.compat_features) == -1) { + derr << "FileStore::mount : Incompatible features set " + << superblock.compat_features << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } + + // open some dir handles + basedir_fd = ::open(basedir.c_str(), O_RDONLY); + if (basedir_fd < 0) { + ret = -errno; + derr << "FileStore::mount: failed to open " << basedir << ": " + << cpp_strerror(ret) << dendl; + basedir_fd = -1; + goto close_fsid_fd; + } + + // test for btrfs, xattrs, etc. + ret = _detect_fs(); + if (ret < 0) { + derr << "FileStore::mount : error in _detect_fs: " + << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + { + list ls; + ret = backend->list_checkpoints(ls); + if (ret < 0) { + derr << "FileStore::mount : error in _list_snaps: "<< cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + long long unsigned c, prev = 0; + char clustersnap[NAME_MAX]; + for (list::iterator it = ls.begin(); it != ls.end(); ++it) { + if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) { + assert(c > prev); + prev = c; + snaps.push_back(c); + } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1) + cluster_snaps.insert(*it); + } + } + + if (m_osd_rollback_to_cluster_snap.length() && + cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) { + derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl; + ret = -ENOENT; + goto close_basedir_fd; + } + + char nosnapfn[200]; + snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str()); + + if (backend->can_checkpoint()) { + if 
(snaps.empty()) { + dout(0) << "mount WARNING: no consistent snaps found, store may be in inconsistent state" << dendl; + } else { + char s[NAME_MAX]; + uint64_t curr_seq = 0; + + if (m_osd_rollback_to_cluster_snap.length()) { + derr << TEXT_RED + << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **" + << TEXT_NORMAL + << dendl; + assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap)); + snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str()); + } else { + { + int fd = read_op_seq(&curr_seq); + if (fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + } + if (curr_seq) + dout(10) << " current/ seq was " << curr_seq << dendl; + else + dout(10) << " current/ missing entirely (unusual, but okay)" << dendl; + + uint64_t cp = snaps.back(); + dout(10) << " most recent snap from " << snaps << " is " << cp << dendl; + + // if current/ is marked as non-snapshotted, refuse to roll + // back (without clear direction) to avoid throwing out new + // data. + struct stat st; + if (::stat(nosnapfn, &st) == 0) { + if (!m_osd_use_stale_snap) { + derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl; + derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl; + derr << "config option for --osd-use-stale-snap startup argument." << dendl; + ret = -ENOTSUP; + goto close_basedir_fd; + } + derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq + << ", newest snap is " << cp << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: forcing the use of stale snapshot data **" + << TEXT_NORMAL << std::endl; + } + + dout(10) << "mount rolling back to consistent snap " << cp << dendl; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp); + } + + // drop current? 
+ ret = backend->rollback_to(s); + if (ret) { + derr << "FileStore::mount: error rolling back to " << s << ": " + << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + } + } + initial_op_seq = 0; + + current_fd = ::open(current_fn.c_str(), O_RDONLY); + if (current_fd < 0) { + ret = -errno; + derr << "FileStore::mount: error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + assert(current_fd >= 0); + + op_fd = read_op_seq(&initial_op_seq); + if (op_fd < 0) { + derr << "FileStore::mount: read_op_seq failed" << dendl; + goto close_current_fd; + } + + dout(5) << "mount op_seq is " << initial_op_seq << dendl; + if (initial_op_seq == 0) { + derr << "mount initial op seq is 0; something is wrong" << dendl; + ret = -EINVAL; + goto close_current_fd; + } + + if (!backend->can_checkpoint()) { + // mark current/ as non-snapshotted so that we don't rollback away + // from it. + int r = ::creat(nosnapfn, 0644); + if (r < 0) { + derr << "FileStore::mount: failed to create current/nosnap" << dendl; + goto close_current_fd; + } + VOID_TEMP_FAILURE_RETRY(::close(r)); + } else { + // clear nosnap marker, if present. 
+ ::unlink(nosnapfn); + } + + if (!(generic_flags & SKIP_MOUNT_OMAP)) { + KeyValueDB * omap_store = KeyValueDB::create(g_ceph_context, + superblock.omap_backend, + omap_dir); + if (omap_store == NULL) + { + derr << "Error creating " << superblock.omap_backend << dendl; + ret = -1; + goto close_current_fd; + } + + if (superblock.omap_backend == "rocksdb") + omap_store->init(g_conf->filestore_rocksdb_options); + else + omap_store->init(); + + stringstream err; + if (omap_store->create_and_open(err)) { + delete omap_store; + derr << "Error initializing " << superblock.omap_backend + << " : " << err.str() << dendl; + ret = -1; + goto close_current_fd; + } + + DBObjectMap *dbomap = new DBObjectMap(omap_store); + ret = dbomap->init(do_update); + if (ret < 0) { + delete dbomap; + derr << "Error initializing DBObjectMap: " << ret << dendl; + goto close_current_fd; + } + stringstream err2; + + if (g_conf->filestore_debug_omap_check && !dbomap->check(err2)) { + derr << err2.str() << dendl; + delete dbomap; + ret = -EINVAL; + goto close_current_fd; + } + object_map.reset(dbomap); + } + + // journal + new_journal(); + + // select journal mode? 
+ if (journal) { + if (!m_filestore_journal_writeahead && + !m_filestore_journal_parallel && + !m_filestore_journal_trailing) { + if (!backend->can_checkpoint()) { + m_filestore_journal_writeahead = true; + dout(0) << "mount: enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl; + } else { + m_filestore_journal_parallel = true; + dout(0) << "mount: enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl; + } + } else { + if (m_filestore_journal_writeahead) + dout(0) << "mount: WRITEAHEAD journal mode explicitly enabled in conf" << dendl; + if (m_filestore_journal_parallel) + dout(0) << "mount: PARALLEL journal mode explicitly enabled in conf" << dendl; + if (m_filestore_journal_trailing) + dout(0) << "mount: TRAILING journal mode explicitly enabled in conf" << dendl; + } + if (m_filestore_journal_writeahead) + journal->set_wait_on_full(true); + } else { + dout(0) << "mount: no journal" << dendl; + } + + ret = _sanity_check_fs(); + if (ret) { + derr << "FileStore::mount: _sanity_check_fs failed with error " + << ret << dendl; + goto close_current_fd; + } + + // Cleanup possibly invalid collections + { + vector collections; + ret = list_collections(collections, true); + if (ret < 0) { + derr << "Error " << ret << " while listing collections" << dendl; + goto close_current_fd; + } + for (vector::iterator i = collections.begin(); + i != collections.end(); + ++i) { + Index index; + ret = get_index(*i, &index); + if (ret < 0) { + derr << "Unable to mount index " << *i + << " with error: " << ret << dendl; + goto close_current_fd; + } + assert(NULL != index.index); + RWLock::WLocker l((index.index)->access_lock); + + index->cleanup(); + } + } + + wbthrottle.start(); + sync_thread.create(); + + if (!(generic_flags & SKIP_JOURNAL_REPLAY)) { + ret = journal_replay(initial_op_seq); + if (ret < 0) { + derr << "mount failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl; + if (ret == -ENOTTY) { + derr << "maybe journal is not 
pointing to a block device and its size " + << "wasn't configured?" << dendl; + } + + // stop sync thread + lock.Lock(); + stop = true; + sync_cond.Signal(); + lock.Unlock(); + sync_thread.join(); + + wbthrottle.stop(); + + goto close_current_fd; + } + } + + { + stringstream err2; + if (g_conf->filestore_debug_omap_check && !object_map->check(err2)) { + derr << err2.str() << dendl; + ret = -EINVAL; + goto close_current_fd; + } + } + + init_temp_collections(); + + journal_start(); + + op_tp.start(); + for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->start(); + } + for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->start(); + } + + timer.init(); + + // upgrade? + if (g_conf->filestore_update_to >= (int)get_target_version()) { + int err = upgrade(); + if (err < 0) { + derr << "error converting store" << dendl; + umount(); + return err; + } + } + + // all okay. + return 0; + +close_current_fd: + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + current_fd = -1; +close_basedir_fd: + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + basedir_fd = -1; +close_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; +done: + assert(!m_filestore_fail_eio || ret != -EIO); + return ret; +} + +void FileStore::init_temp_collections() +{ + dout(10) << __func__ << dendl; + vector ls; + int r = list_collections(ls, true); + assert(r >= 0); + + dout(20) << " ls " << ls << dendl; + + SequencerPosition spos; + + set temps; + for (vector::iterator p = ls.begin(); p != ls.end(); ++p) + if (p->is_temp()) + temps.insert(*p); + dout(20) << " temps " << temps << dendl; + + for (vector::iterator p = ls.begin(); p != ls.end(); ++p) { + if (p->is_temp()) + continue; + if (p->is_meta()) + continue; + coll_t temp = p->get_temp(); + if (temps.count(temp)) { + temps.erase(temp); + } else { + dout(10) << __func__ << " creating " << temp << dendl; + r = _create_collection(temp, spos); + assert(r 
== 0); + } + } + + for (set::iterator p = temps.begin(); p != temps.end(); ++p) { + dout(10) << __func__ << " removing stray " << *p << dendl; + r = _collection_remove_recursive(*p, spos); + assert(r == 0); + } +} + +int FileStore::umount() +{ + dout(5) << "umount " << basedir << dendl; + + flush(); + sync(); + do_force_sync(); + + lock.Lock(); + stop = true; + sync_cond.Signal(); + lock.Unlock(); + sync_thread.join(); + wbthrottle.stop(); + op_tp.stop(); + + journal_stop(); + if (!(generic_flags & SKIP_JOURNAL_REPLAY)) + journal_write_close(); + + for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->stop(); + } + for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->stop(); + } + + if (fsid_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + } + if (op_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(op_fd)); + op_fd = -1; + } + if (current_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + current_fd = -1; + } + if (basedir_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + basedir_fd = -1; + } + + force_sync = false; + + delete backend; + backend = NULL; + + object_map.reset(); + + { + Mutex::Locker l(sync_entry_timeo_lock); + timer.shutdown(); + } + + // nothing + return 0; +} + + + + +/// ----------------------------- + +FileStore::Op *FileStore::build_op(list& tls, + Context *onreadable, + Context *onreadable_sync, + TrackedOpRef osd_op) +{ + uint64_t bytes = 0, ops = 0; + for (list::iterator p = tls.begin(); + p != tls.end(); + ++p) { + bytes += (*p)->get_num_bytes(); + ops += (*p)->get_num_ops(); + } + + Op *o = new Op; + o->start = ceph_clock_now(g_ceph_context); + o->tls.swap(tls); + o->onreadable = onreadable; + o->onreadable_sync = onreadable_sync; + o->ops = ops; + o->bytes = bytes; + o->osd_op = osd_op; + return o; +} + + + +void FileStore::queue_op(OpSequencer *osr, Op *o) +{ + // queue op on sequencer, then queue sequencer 
for the threadpool, + // so that regardless of which order the threads pick up the + // sequencer, the op order will be preserved. + + osr->queue(o); + + logger->inc(l_os_ops); + logger->inc(l_os_bytes, o->bytes); + + dout(5) << "queue_op " << o << " seq " << o->op + << " " << *osr + << " " << o->bytes << " bytes" + << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)" + << dendl; + op_wq.queue(osr); +} + +void FileStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle) +{ + // Do not call while holding the journal lock! + uint64_t max_ops = m_filestore_queue_max_ops; + uint64_t max_bytes = m_filestore_queue_max_bytes; + + if (backend->can_checkpoint() && is_committing()) { + max_ops += m_filestore_queue_committing_max_ops; + max_bytes += m_filestore_queue_committing_max_bytes; + } + + logger->set(l_os_oq_max_ops, max_ops); + logger->set(l_os_oq_max_bytes, max_bytes); + + if (handle) + handle->suspend_tp_timeout(); + if (throttle_ops.should_wait(1) || + (throttle_bytes.get_current() // let single large ops through! + && throttle_bytes.should_wait(o->bytes))) { + dout(2) << "waiting " << throttle_ops.get_current() + 1 << " > " << max_ops << " ops || " + << throttle_bytes.get_current() + o->bytes << " > " << max_bytes << dendl; + } + throttle_ops.get(); + throttle_bytes.get(o->bytes); + if (handle) + handle->reset_tp_timeout(); + + logger->set(l_os_oq_ops, throttle_ops.get_current()); + logger->set(l_os_oq_bytes, throttle_bytes.get_current()); +} + +void FileStore::op_queue_release_throttle(Op *o) +{ + throttle_ops.put(); + throttle_bytes.put(o->bytes); + logger->set(l_os_oq_ops, throttle_ops.get_current()); + logger->set(l_os_oq_bytes, throttle_bytes.get_current()); +} + +void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle) +{ + wbthrottle.throttle(); + // inject a stall? 
+ if (g_conf->filestore_inject_stall) { + int orig = g_conf->filestore_inject_stall; + dout(5) << "_do_op filestore_inject_stall " << orig << ", sleeping" << dendl; + for (int n = 0; n < g_conf->filestore_inject_stall; n++) + sleep(1); + g_conf->set_val("filestore_inject_stall", "0"); + dout(5) << "_do_op done stalling" << dendl; + } + + osr->apply_lock.Lock(); + Op *o = osr->peek_queue(); + apply_manager.op_apply_start(o->op); + dout(5) << "_do_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " start" << dendl; + int r = _do_transactions(o->tls, o->op, &handle); + apply_manager.op_apply_finish(o->op); + dout(10) << "_do_op " << o << " seq " << o->op << " r = " << r + << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl; +} + +void FileStore::_finish_op(OpSequencer *osr) +{ + list to_queue; + Op *o = osr->dequeue(&to_queue); + + utime_t lat = ceph_clock_now(g_ceph_context); + lat -= o->start; + + dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl; + osr->apply_lock.Unlock(); // locked in _do_op + + // called with tp lock held + op_queue_release_throttle(o); + + logger->tinc(l_os_apply_lat, lat); + + if (o->onreadable_sync) { + o->onreadable_sync->complete(0); + } + if (o->onreadable) { + apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable); + } + if (!to_queue.empty()) { + apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue); + } + delete o; +} + + +struct C_JournaledAhead : public Context { + FileStore *fs; + FileStore::OpSequencer *osr; + FileStore::Op *o; + Context *ondisk; + + C_JournaledAhead(FileStore *f, FileStore::OpSequencer *os, FileStore::Op *o, Context *ondisk): + fs(f), osr(os), o(o), ondisk(ondisk) { } + void finish(int r) { + fs->_journaled_ahead(osr, o, ondisk); + } +}; + +int FileStore::queue_transactions(Sequencer *posr, list &tls, + TrackedOpRef osd_op, + ThreadPool::TPHandle *handle) +{ + Context *onreadable; + 
Context *ondisk; + Context *onreadable_sync; + ObjectStore::Transaction::collect_contexts( + tls, &onreadable, &ondisk, &onreadable_sync); + if (g_conf->filestore_blackhole) { + dout(0) << "queue_transactions filestore_blackhole = TRUE, dropping transaction" << dendl; + delete ondisk; + delete onreadable; + delete onreadable_sync; + return 0; + } + + utime_t start = ceph_clock_now(g_ceph_context); + // set up the sequencer + OpSequencer *osr; + assert(posr); + if (posr->p) { + osr = static_cast(posr->p.get()); + dout(5) << "queue_transactions existing " << osr << " " << *osr << dendl; + } else { + osr = new OpSequencer(next_osr_id.inc()); + osr->set_cct(g_ceph_context); + osr->parent = posr; + posr->p = osr; + dout(5) << "queue_transactions new " << osr << " " << *osr << dendl; + } + + // used to include osr information in tracepoints during transaction apply + for (list::iterator i = tls.begin(); i != tls.end(); ++i) { + (*i)->set_osr(osr); + } + + if (journal && journal->is_writeable() && !m_filestore_journal_trailing) { + Op *o = build_op(tls, onreadable, onreadable_sync, osd_op); + op_queue_reserve_throttle(o, handle); + journal->throttle(); + //prepare and encode transactions data out of lock + bufferlist tbl; + int orig_len = journal->prepare_entry(o->tls, &tbl); + uint64_t op_num = submit_manager.op_submit_start(); + o->op = op_num; + + if (m_filestore_do_dump) + dump_transactions(o->tls, o->op, osr); + + if (m_filestore_journal_parallel) { + dout(5) << "queue_transactions (parallel) " << o->op << " " << o->tls << dendl; + + _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op); + + // queue inside submit_manager op submission lock + queue_op(osr, o); + } else if (m_filestore_journal_writeahead) { + dout(5) << "queue_transactions (writeahead) " << o->op << " " << o->tls << dendl; + + osr->queue_journal(o->op); + + _op_journal_transactions(tbl, orig_len, o->op, + new C_JournaledAhead(this, osr, o, ondisk), + osd_op); + } else { + assert(0); + } + 
submit_manager.op_submit_finish(op_num); + utime_t end = ceph_clock_now(g_ceph_context); + logger->tinc(l_os_queue_lat, end - start); + return 0; + } + + if (!journal) { + Op *o = build_op(tls, onreadable, onreadable_sync, osd_op); + dout(5) << __func__ << " (no journal) " << o << " " << tls << dendl; + + op_queue_reserve_throttle(o, handle); + + uint64_t op_num = submit_manager.op_submit_start(); + o->op = op_num; + + if (m_filestore_do_dump) + dump_transactions(o->tls, o->op, osr); + + queue_op(osr, o); + + if (ondisk) + apply_manager.add_waiter(op_num, ondisk); + submit_manager.op_submit_finish(op_num); + utime_t end = ceph_clock_now(g_ceph_context); + logger->tinc(l_os_queue_lat, end - start); + return 0; + } + + assert(journal); + //prepare and encode transactions data out of lock + bufferlist tbl; + int orig_len = -1; + if (journal->is_writeable()) { + orig_len = journal->prepare_entry(tls, &tbl); + } + uint64_t op = submit_manager.op_submit_start(); + dout(5) << "queue_transactions (trailing journal) " << op << " " << tls << dendl; + + if (m_filestore_do_dump) + dump_transactions(tls, op, osr); + + apply_manager.op_apply_start(op); + int r = do_transactions(tls, op); + + if (r >= 0) { + _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op); + } else { + delete ondisk; + } + + // start on_readable finisher after we queue journal item, as on_readable callback + // is allowed to delete the Transaction + if (onreadable_sync) { + onreadable_sync->complete(r); + } + apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r); + + submit_manager.op_submit_finish(op); + apply_manager.op_apply_finish(op); + + utime_t end = ceph_clock_now(g_ceph_context); + logger->tinc(l_os_queue_lat, end - start); + return r; +} + +void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk) +{ + dout(5) << "_journaled_ahead " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl; + + // this should queue in order because the journal does 
it's completions in order. + queue_op(osr, o); + + list to_queue; + osr->dequeue_journal(&to_queue); + + // do ondisk completions async, to prevent any onreadable_sync completions + // getting blocked behind an ondisk completion. + if (ondisk) { + dout(10) << " queueing ondisk " << ondisk << dendl; + ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk); + } + if (!to_queue.empty()) { + ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue); + } +} + +int FileStore::_do_transactions( + list &tls, + uint64_t op_seq, + ThreadPool::TPHandle *handle) +{ + int r = 0; + int trans_num = 0; + + for (list::iterator p = tls.begin(); + p != tls.end(); + ++p, trans_num++) { + r = _do_transaction(**p, op_seq, trans_num, handle); + if (r < 0) + break; + if (handle) + handle->reset_tp_timeout(); + } + + return r; +} + +void FileStore::_set_global_replay_guard(coll_t cid, + const SequencerPosition &spos) +{ + if (backend->can_checkpoint()) + return; + + // sync all previous operations on this sequencer + int ret = object_map->sync(); + if (ret < 0) { + derr << __func__ << " : omap sync error " << cpp_strerror(ret) << dendl; + assert(0 == "_set_global_replay_guard failed"); + } + ret = sync_filesystem(basedir_fd); + if (ret < 0) { + derr << __func__ << " :sync_filesytem error " << cpp_strerror(ret) << dendl; + assert(0 == "_set_global_replay_guard failed"); + } + + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + int err = errno; + derr << __func__ << ": " << cid << " error " << cpp_strerror(err) << dendl; + assert(0 == "_set_global_replay_guard failed"); + } + + _inject_failure(); + + // then record that we did it + bufferlist v; + ::encode(spos, v); + int r = chain_fsetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length(), true); + if (r < 0) { + derr << __func__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR + << " got " << cpp_strerror(r) << dendl; + assert(0 == "fsetxattr failed"); + } + + // and 
make sure our xattr is durable. + ::fsync(fd); + + _inject_failure(); + + VOID_TEMP_FAILURE_RETRY(::close(fd)); + dout(10) << __func__ << ": " << spos << " done" << dendl; +} + +int FileStore::_check_global_replay_guard(coll_t cid, + const SequencerPosition& spos) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + dout(10) << __func__ << ": " << cid << " dne" << dendl; + return 1; // if collection does not exist, there is no guard, and we can replay. + } + + char buf[100]; + int r = chain_fgetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, buf, sizeof(buf)); + if (r < 0) { + dout(20) << __func__ << " no xattr" << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return 1; // no xattr + } + bufferlist bl; + bl.append(buf, r); + + SequencerPosition opos; + bufferlist::iterator p = bl.begin(); + ::decode(opos, p); + + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return spos >= opos ? 1 : -1; +} + + +void FileStore::_set_replay_guard(coll_t cid, + const SequencerPosition &spos, + bool in_progress=false) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + int err = errno; + derr << "_set_replay_guard " << cid << " error " << cpp_strerror(err) << dendl; + assert(0 == "_set_replay_guard failed"); + } + _set_replay_guard(fd, spos, 0, in_progress); + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + + +void FileStore::_set_replay_guard(int fd, + const SequencerPosition& spos, + const ghobject_t *hoid, + bool in_progress) +{ + if (backend->can_checkpoint()) + return; + + dout(10) << "_set_replay_guard " << spos << (in_progress ? " START" : "") << dendl; + + _inject_failure(); + + // first make sure the previous operation commits + ::fsync(fd); + + // sync object_map too. even if this object has a header or keys, + // it have had them in the past and then removed them, so always + // sync. 
+ object_map->sync(hoid, &spos); + + _inject_failure(); + + // then record that we did it + bufferlist v(40); + ::encode(spos, v); + ::encode(in_progress, v); + int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true); + if (r < 0) { + derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl; + assert(0 == "fsetxattr failed"); + } + + // and make sure our xattr is durable. + ::fsync(fd); + + _inject_failure(); + + dout(10) << "_set_replay_guard " << spos << " done" << dendl; +} + +void FileStore::_close_replay_guard(coll_t cid, + const SequencerPosition &spos) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + int err = errno; + derr << "_close_replay_guard " << cid << " error " << cpp_strerror(err) << dendl; + assert(0 == "_close_replay_guard failed"); + } + _close_replay_guard(fd, spos); + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + +void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos) +{ + if (backend->can_checkpoint()) + return; + + dout(10) << "_close_replay_guard " << spos << dendl; + + _inject_failure(); + + // then record that we are done with this operation + bufferlist v(40); + ::encode(spos, v); + bool in_progress = false; + ::encode(in_progress, v); + int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true); + if (r < 0) { + derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl; + assert(0 == "fsetxattr failed"); + } + + // and make sure our xattr is durable. 
+ ::fsync(fd); + + _inject_failure(); + + dout(10) << "_close_replay_guard " << spos << " done" << dendl; +} + +int FileStore::_check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& spos) +{ + if (!replaying || backend->can_checkpoint()) + return 1; + + int r = _check_global_replay_guard(cid, spos); + if (r < 0) + return r; + + FDRef fd; + r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << "_check_replay_guard " << cid << " " << oid << " dne" << dendl; + return 1; // if file does not exist, there is no guard, and we can replay. + } + int ret = _check_replay_guard(**fd, spos); + lfn_close(fd); + return ret; +} + +int FileStore::_check_replay_guard(coll_t cid, const SequencerPosition& spos) +{ + if (!replaying || backend->can_checkpoint()) + return 1; + + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + dout(10) << "_check_replay_guard " << cid << " dne" << dendl; + return 1; // if collection does not exist, there is no guard, and we can replay. 
+ } + int ret = _check_replay_guard(fd, spos); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return ret; +} + +int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos) +{ + if (!replaying || backend->can_checkpoint()) + return 1; + + char buf[100]; + int r = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, buf, sizeof(buf)); + if (r < 0) { + dout(20) << "_check_replay_guard no xattr" << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return 1; // no xattr + } + bufferlist bl; + bl.append(buf, r); + + SequencerPosition opos; + bufferlist::iterator p = bl.begin(); + ::decode(opos, p); + bool in_progress = false; + if (!p.end()) // older journals don't have this + ::decode(in_progress, p); + if (opos > spos) { + dout(10) << "_check_replay_guard object has " << opos << " > current pos " << spos + << ", now or in future, SKIPPING REPLAY" << dendl; + return -1; + } else if (opos == spos) { + if (in_progress) { + dout(10) << "_check_replay_guard object has " << opos << " == current pos " << spos + << ", in_progress=true, CONDITIONAL REPLAY" << dendl; + return 0; + } else { + dout(10) << "_check_replay_guard object has " << opos << " == current pos " << spos + << ", in_progress=false, SKIPPING REPLAY" << dendl; + return -1; + } + } else { + dout(10) << "_check_replay_guard object has " << opos << " < current pos " << spos + << ", in past, will replay" << dendl; + return 1; + } +} + +unsigned FileStore::_do_transaction( + Transaction& t, uint64_t op_seq, int trans_num, + ThreadPool::TPHandle *handle) +{ + dout(10) << "_do_transaction on " << &t << dendl; + +#ifdef WITH_LTTNG + const char *osr_name = t.get_osr() ? 
static_cast(t.get_osr())->get_name().c_str() : ""; +#endif + + Transaction::iterator i = t.begin(); + + SequencerPosition spos(op_seq, trans_num, 0); + while (i.have_op()) { + if (handle) + handle->reset_tp_timeout(); + + Transaction::Op *op = i.decode_op(); + int r = 0; + + _inject_failure(); + + switch (op->op) { + case Transaction::OP_NOP: + break; + case Transaction::OP_TOUCH: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + tracepoint(objectstore, touch_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _touch(cid, oid); + tracepoint(objectstore, touch_exit, r); + } + break; + + case Transaction::OP_WRITE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, write_enter, osr_name, off, len); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _write(cid, oid, off, len, bl, fadvise_flags); + tracepoint(objectstore, write_exit, r); + } + break; + + case Transaction::OP_ZERO: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + uint64_t off = op->off; + uint64_t len = op->len; + tracepoint(objectstore, zero_enter, osr_name, off, len); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _zero(cid, oid, off, len); + tracepoint(objectstore, zero_exit, r); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + uint64_t off = op->off; + tracepoint(objectstore, truncate_enter, osr_name, off); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _truncate(cid, oid, off); + 
tracepoint(objectstore, truncate_exit, r); + } + break; + + case Transaction::OP_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + tracepoint(objectstore, remove_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _remove(cid, oid, spos); + tracepoint(objectstore, remove_exit, r); + } + break; + + case Transaction::OP_SETATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, setattr_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) { + map to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(cid, oid, to_set, spos); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid + << " name " << name << " size " << bl.length() << dendl; + } + tracepoint(objectstore, setattr_exit, r); + } + break; + + case Transaction::OP_SETATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + map aset; + i.decode_attrset(aset); + tracepoint(objectstore, setattrs_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _setattrs(cid, oid, aset, spos); + tracepoint(objectstore, setattrs_exit, r); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl; + } + break; + + case Transaction::OP_RMATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + string name = i.decode_string(); + tracepoint(objectstore, rmattr_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _rmattr(cid, oid, name.c_str(), spos); + tracepoint(objectstore, rmattr_exit, r); + } + break; + + case Transaction::OP_RMATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = 
i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + tracepoint(objectstore, rmattrs_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _rmattrs(cid, oid, spos); + tracepoint(objectstore, rmattrs_exit, r); + } + break; + + case Transaction::OP_CLONE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + ghobject_t noid = i.get_oid(op->dest_oid); + tracepoint(objectstore, clone_enter, osr_name); + r = _clone(cid, oid, noid, spos); + tracepoint(objectstore, clone_exit, r); + } + break; + + case Transaction::OP_CLONERANGE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + ghobject_t noid = i.get_oid(op->dest_oid); + _kludge_temp_object_collection(cid, noid); + uint64_t off = op->off; + uint64_t len = op->len; + tracepoint(objectstore, clone_range_enter, osr_name, len); + r = _clone_range(cid, oid, noid, off, len, off, spos); + tracepoint(objectstore, clone_range_exit, r); + } + break; + + case Transaction::OP_CLONERANGE2: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + ghobject_t noid = i.get_oid(op->dest_oid); + _kludge_temp_object_collection(cid, noid); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + tracepoint(objectstore, clone_range2_enter, osr_name, len); + r = _clone_range(cid, oid, noid, srcoff, len, dstoff, spos); + tracepoint(objectstore, clone_range2_exit, r); + } + break; + + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + tracepoint(objectstore, mkcoll_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _create_collection(cid, spos); + tracepoint(objectstore, mkcoll_exit, r); + } + break; + + case Transaction::OP_COLL_HINT: + { + coll_t cid = i.get_cid(op->cid); + uint32_t type = op->hint_type; + bufferlist hint; + 
i.decode_bl(hint); + bufferlist::iterator hiter = hint.begin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + ::decode(pg_num, hiter); + ::decode(num_objs, hiter); + if (_check_replay_guard(cid, spos) > 0) { + r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos); + } + } else { + // Ignore the hint + dout(10) << "Unrecognized collection hint type: " << type << dendl; + } + } + break; + + case Transaction::OP_RMCOLL: + { + coll_t cid = i.get_cid(op->cid); + tracepoint(objectstore, rmcoll_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _destroy_collection(cid); + tracepoint(objectstore, rmcoll_exit, r); + } + break; + + case Transaction::OP_COLL_ADD: + { + coll_t ocid = i.get_cid(op->cid); + coll_t ncid = i.get_cid(op->dest_cid); + ghobject_t oid = i.get_oid(op->oid); + + assert(oid.hobj.pool >= -1); + + // always followed by OP_COLL_REMOVE + Transaction::Op *op2 = i.decode_op(); + coll_t ocid2 = i.get_cid(op2->cid); + ghobject_t oid2 = i.get_oid(op2->oid); + assert(op2->op == Transaction::OP_COLL_REMOVE); + assert(ocid2 == ocid); + assert(oid2 == oid); + + tracepoint(objectstore, coll_add_enter); + r = _collection_add(ncid, ocid, oid, spos); + tracepoint(objectstore, coll_add_exit, r); + spos.op++; + if (r < 0) + break; + tracepoint(objectstore, coll_remove_enter, osr_name); + if (_check_replay_guard(ocid, oid, spos) > 0) + r = _remove(ocid, oid, spos); + tracepoint(objectstore, coll_remove_exit, r); + } + break; + + case Transaction::OP_COLL_MOVE: + { + // WARNING: this is deprecated and buggy; only here to replay old journals. 
+ coll_t ocid = i.get_cid(op->cid); + coll_t ncid = i.get_cid(op->dest_cid); + ghobject_t oid = i.get_oid(op->oid); + tracepoint(objectstore, coll_move_enter); + r = _collection_add(ocid, ncid, oid, spos); + if (r == 0 && + (_check_replay_guard(ocid, oid, spos) > 0)) + r = _remove(ocid, oid, spos); + tracepoint(objectstore, coll_move_exit, r); + } + break; + + case Transaction::OP_COLL_MOVE_RENAME: + { + coll_t oldcid = i.get_cid(op->cid); + ghobject_t oldoid = i.get_oid(op->oid); + coll_t newcid = i.get_cid(op->dest_cid); + ghobject_t newoid = i.get_oid(op->dest_oid); + _kludge_temp_object_collection(oldcid, oldoid); + _kludge_temp_object_collection(newcid, newoid); + tracepoint(objectstore, coll_move_rename_enter); + r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos); + tracepoint(objectstore, coll_move_rename_exit, r); + } + break; + + case Transaction::OP_COLL_SETATTR: + { + coll_t cid = i.get_cid(op->cid); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, coll_setattr_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _collection_setattr(cid, name.c_str(), bl.c_str(), bl.length()); + tracepoint(objectstore, coll_setattr_exit, r); + } + break; + + case Transaction::OP_COLL_RMATTR: + { + coll_t cid = i.get_cid(op->cid); + string name = i.decode_string(); + tracepoint(objectstore, coll_rmattr_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _collection_rmattr(cid, name.c_str()); + tracepoint(objectstore, coll_rmattr_exit, r); + } + break; + + case Transaction::OP_STARTSYNC: + tracepoint(objectstore, startsync_enter, osr_name); + _start_sync(); + tracepoint(objectstore, startsync_exit); + break; + + case Transaction::OP_COLL_RENAME: + { + r = -EOPNOTSUPP; + } + break; + + case Transaction::OP_OMAP_CLEAR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + tracepoint(objectstore, omap_clear_enter, 
osr_name); + r = _omap_clear(cid, oid, spos); + tracepoint(objectstore, omap_clear_exit, r); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + map aset; + i.decode_attrset(aset); + tracepoint(objectstore, omap_setkeys_enter, osr_name); + r = _omap_setkeys(cid, oid, aset, spos); + tracepoint(objectstore, omap_setkeys_exit, r); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + set keys; + i.decode_keyset(keys); + tracepoint(objectstore, omap_rmkeys_enter, osr_name); + r = _omap_rmkeys(cid, oid, keys, spos); + tracepoint(objectstore, omap_rmkeys_exit, r); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + tracepoint(objectstore, omap_rmkeyrange_enter, osr_name); + r = _omap_rmkeyrange(cid, oid, first, last, spos); + tracepoint(objectstore, omap_rmkeyrange_exit, r); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, omap_setheader_enter, osr_name); + r = _omap_setheader(cid, oid, bl, spos); + tracepoint(objectstore, omap_setheader_exit, r); + } + break; + case Transaction::OP_SPLIT_COLLECTION: + { + assert(0 == "not legacy journal; upgrade to firefly first"); + } + break; + case Transaction::OP_SPLIT_COLLECTION2: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + coll_t dest = i.get_cid(op->dest_cid); + tracepoint(objectstore, split_coll2_enter, osr_name); + r = _split_collection(cid, 
bits, rem, dest, spos); + tracepoint(objectstore, split_coll2_exit, r); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + _kludge_temp_object_collection(cid, oid); + uint64_t expected_object_size = op->expected_object_size; + uint64_t expected_write_size = op->expected_write_size; + tracepoint(objectstore, setallochint_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _set_alloc_hint(cid, oid, expected_object_size, + expected_write_size); + tracepoint(objectstore, setallochint_exit, r); + } + break; + + default: + derr << "bad op " << op->op << dendl; + assert(0); + } + + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD)) + // -ENOENT is normally okay + // ...including on a replayed OP_RMCOLL with checkpoint mode + ok = true; + if (r == -ENODATA) + ok = true; + + if (op->op == Transaction::OP_SETALLOCHINT) + // Either EOPNOTSUPP or EINVAL most probably. EINVAL in most + // cases means invalid hint size (e.g. too big, not a multiple + // of block size, etc) or, at least on xfs, an attempt to set + // or change it when the file is not empty. However, + // OP_SETALLOCHINT is advisory, so ignore all errors. 
+ ok = true; + + if (replaying && !backend->can_checkpoint()) { + if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -ERANGE) { + dout(10) << "tolerating ERANGE on replay" << dendl; + ok = true; + } + if (r == -ENOENT) { + dout(10) << "tolerating ENOENT on replay" << dendl; + ok = true; + } + } + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) + msg = "ENOENT on clone suggests osd bug"; + + if (r == -ENOSPC) + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. 
+ msg = "ENOSPC handling not implemented"; + + if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } + + dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op + << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl; + dout(0) << msg << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + + if (r == -EMFILE) { + dump_open_fds(g_ceph_context); + } + + assert(0 == "unexpected error"); + } + } + + spos.op++; + } + + _inject_failure(); + + return 0; // FIXME count errors +} + + /*********************************************/ + + + +// -------------------- +// objects + +bool FileStore::exists(coll_t cid, const ghobject_t& oid) +{ + tracepoint(objectstore, exists_enter, cid.c_str()); + _kludge_temp_object_collection(cid, oid); + struct stat st; + bool retval = stat(cid, oid, &st) == 0; + tracepoint(objectstore, exists_exit, retval); + return retval; +} + +int FileStore::stat( + coll_t cid, const ghobject_t& oid, struct stat *st, bool allow_eio) +{ + tracepoint(objectstore, stat_enter, cid.c_str()); + _kludge_temp_object_collection(cid, oid); + int r = lfn_stat(cid, oid, st); + assert(allow_eio || !m_filestore_fail_eio || r != -EIO); + if (r < 0) { + dout(10) << "stat " << cid << "/" << oid + << " = " << r << dendl; + } else { + dout(10) << "stat " << cid << "/" << oid + << " = " << r + << " (size " << st->st_size << ")" << dendl; + } + if (g_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, stat_exit, r); + return r; + } +} + +int FileStore::read( + coll_t cid, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags, + bool allow_eio) +{ + int got; + tracepoint(objectstore, read_enter, cid.c_str(), offset, len); + _kludge_temp_object_collection(cid, oid); + + 
dout(15) << "read " << cid << "/" << oid << " " << offset << "~" << len << dendl; + + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << "FileStore::read(" << cid << "/" << oid << ") open error: " + << cpp_strerror(r) << dendl; + return r; + } + + if (len == 0) { + struct stat st; + memset(&st, 0, sizeof(struct stat)); + int r = ::fstat(**fd, &st); + assert(r == 0); + len = st.st_size; + } + +#ifdef HAVE_POSIX_FADVISE + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM) + posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM); + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) + posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL); +#endif + + bufferptr bptr(len); // prealloc space for entire read + got = safe_pread(**fd, bptr.c_str(), len, offset); + if (got < 0) { + dout(10) << "FileStore::read(" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl; + lfn_close(fd); + assert(allow_eio || !m_filestore_fail_eio || got != -EIO); + return got; + } + bptr.set_length(got); // properly size the buffer + bl.push_back(bptr); // put it in the target bufferlist + +#ifdef HAVE_POSIX_FADVISE + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) + posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED); + if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)) + posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL); +#endif + + if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) { + ostringstream ss; + int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss); + if (errors > 0) { + dout(0) << "FileStore::read " << cid << "/" << oid << " " << offset << "~" + << got << " ... 
BAD CRC:\n" << ss.str() << dendl; + assert(0 == "bad crc on read"); + } + } + + lfn_close(fd); + + dout(10) << "FileStore::read " << cid << "/" << oid << " " << offset << "~" + << got << "/" << len << dendl; + if (g_conf->filestore_debug_inject_read_err && + debug_data_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, read_exit, got); + return got; + } +} + +int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len, + map *m) +{ + struct fiemap *fiemap = NULL; + uint64_t i; + struct fiemap_extent *extent = NULL; + int r = 0; + + r = backend->do_fiemap(fd, offset, len, &fiemap); + if (r < 0) + return r; + + if (fiemap->fm_mapped_extents == 0) { + free(fiemap); + return r; + } + + extent = &fiemap->fm_extents[0]; + + /* start where we were asked to start */ + if (extent->fe_logical < offset) { + extent->fe_length -= offset - extent->fe_logical; + extent->fe_logical = offset; + } + + i = 0; + + while (i < fiemap->fm_mapped_extents) { + struct fiemap_extent *next = extent + 1; + + dout(10) << "FileStore::fiemap() fm_mapped_extents=" << fiemap->fm_mapped_extents + << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl; + + /* try to merge extents */ + while ((i < fiemap->fm_mapped_extents - 1) && + (extent->fe_logical + extent->fe_length == next->fe_logical)) { + next->fe_length += extent->fe_length; + next->fe_logical = extent->fe_logical; + extent = next; + next = extent + 1; + i++; + } + + if (extent->fe_logical + extent->fe_length > offset + len) + extent->fe_length = offset + len - extent->fe_logical; + (*m)[extent->fe_logical] = extent->fe_length; + i++; + extent++; + } + free(fiemap); + + return r; +} + +int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len, + map *m) +{ +#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) + off_t hole_pos, data_pos; + int r = 0; + + // If lseek fails with errno setting to be ENXIO, this means the current + // file offset is beyond the end of the 
file. + off_t start = offset; + while(start < (off_t)(offset + len)) { + data_pos = lseek(fd, start, SEEK_DATA); + if (data_pos < 0) { + if (errno == ENXIO) + break; + else { + r = -errno; + dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl; + return r; + } + } else if (data_pos > (off_t)(offset + len)) { + break; + } + + hole_pos = lseek(fd, data_pos, SEEK_HOLE); + if (hole_pos < 0) { + if (errno == ENXIO) { + break; + } else { + r = -errno; + dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl; + return r; + } + } + + if (hole_pos >= (off_t)(offset + len)) { + (*m)[data_pos] = offset + len - data_pos; + break; + } + (*m)[data_pos] = hole_pos - data_pos; + start = hole_pos; + } + + return r; +#else + (*m)[offset] = len; + return 0; +#endif +} + +int FileStore::fiemap(coll_t cid, const ghobject_t& oid, + uint64_t offset, size_t len, + bufferlist& bl) +{ + tracepoint(objectstore, fiemap_enter, cid.c_str(), offset, len); + _kludge_temp_object_collection(cid, oid); + + if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) || + len <= (size_t)m_filestore_fiemap_threshold) { + map m; + m[offset] = len; + ::encode(m, bl); + return 0; + } + + dout(15) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << dendl; + + map exomap; + FDRef fd; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl; + goto done; + } + + if (backend->has_seek_data_hole()) { + dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl; + r = _do_seek_hole_data(**fd, offset, len, &exomap); + } else if (backend->has_fiemap()) { + dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl; + r = _do_fiemap(**fd, offset, len, &exomap); + } + +done: + if (r >= 0) { + lfn_close(fd); + ::encode(exomap, bl); + } + + dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << " = " << r 
<< " num_extents=" << exomap.size() << " " << exomap << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + tracepoint(objectstore, fiemap_exit, r); + return r; +} + + +int FileStore::_remove(coll_t cid, const ghobject_t& oid, + const SequencerPosition &spos) +{ + dout(15) << "remove " << cid << "/" << oid << dendl; + int r = lfn_unlink(cid, oid, spos); + dout(10) << "remove " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int FileStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size) +{ + dout(15) << "truncate " << cid << "/" << oid << " size " << size << dendl; + int r = lfn_truncate(cid, oid, size); + dout(10) << "truncate " << cid << "/" << oid << " size " << size << " = " << r << dendl; + return r; +} + + +int FileStore::_touch(coll_t cid, const ghobject_t& oid) +{ + dout(15) << "touch " << cid << "/" << oid << dendl; + + FDRef fd; + int r = lfn_open(cid, oid, true, &fd); + if (r < 0) { + return r; + } else { + lfn_close(fd); + } + dout(10) << "touch " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int FileStore::_write(coll_t cid, const ghobject_t& oid, + uint64_t offset, size_t len, + const bufferlist& bl, uint32_t fadvise_flags) +{ + dout(15) << "write " << cid << "/" << oid << " " << offset << "~" << len << dendl; + int r; + + int64_t actual; + + FDRef fd; + r = lfn_open(cid, oid, true, &fd); + if (r < 0) { + dout(0) << "write couldn't open " << cid << "/" + << oid << ": " + << cpp_strerror(r) << dendl; + goto out; + } + + // seek + actual = ::lseek64(**fd, offset, SEEK_SET); + if (actual < 0) { + r = -errno; + dout(0) << "write lseek64 to " << offset << " failed: " << cpp_strerror(r) << dendl; + lfn_close(fd); + goto out; + } + if (actual != (int64_t)offset) { + dout(0) << "write lseek64 to " << offset << " gave bad offset " << actual << dendl; + r = -EIO; + lfn_close(fd); + goto out; + } + + // write + r = bl.write_fd(**fd); + if (r == 0) + r = bl.length(); + + if (r >= 0 && m_filestore_sloppy_crc) { + int 
rc = backend->_crc_update_write(**fd, offset, len, bl); + assert(rc >= 0); + } + + // flush? + if (!replaying && + g_conf->filestore_wbthrottle_enable) + wbthrottle.queue_wb(fd, oid, offset, len, + fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + lfn_close(fd); + + out: + dout(10) << "write " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl; + return r; +} + +int FileStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len) +{ + dout(15) << "zero " << cid << "/" << oid << " " << offset << "~" << len << dendl; + int ret = 0; + +#ifdef CEPH_HAVE_FALLOCATE +# if !defined(DARWIN) && !defined(__FreeBSD__) + // first try to punch a hole. + FDRef fd; + ret = lfn_open(cid, oid, false, &fd); + if (ret < 0) { + goto out; + } + + // first try fallocate + ret = fallocate(**fd, FALLOC_FL_PUNCH_HOLE, offset, len); + if (ret < 0) + ret = -errno; + lfn_close(fd); + + if (ret >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_zero(**fd, offset, len); + assert(rc >= 0); + } + + if (ret == 0) + goto out; // yay! + if (ret != -EOPNOTSUPP) + goto out; // some other error +# endif +#endif + + // lame, kernel is old and doesn't support it. + // write zeros.. yuck! 
+ dout(20) << "zero FALLOC_FL_PUNCH_HOLE not supported, falling back to writing zeros" << dendl; + { + bufferptr bp(len); + bp.zero(); + bufferlist bl; + bl.push_back(bp); + ret = _write(cid, oid, offset, len, bl); + } + + out: + dout(20) << "zero " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl; + return ret; +} + +int FileStore::_clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, + const SequencerPosition& spos) +{ + dout(15) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl; + + if (_check_replay_guard(cid, newoid, spos) < 0) + return 0; + + int r; + FDRef o, n; + { + Index index; + r = lfn_open(cid, oldoid, false, &o, &index); + if (r < 0) { + goto out2; + } + assert(NULL != (index.index)); + RWLock::WLocker l((index.index)->access_lock); + + r = lfn_open(cid, newoid, true, &n, &index); + if (r < 0) { + goto out; + } + r = ::ftruncate(**n, 0); + if (r < 0) { + goto out3; + } + struct stat st; + ::fstat(**o, &st); + r = _do_clone_range(**o, **n, 0, st.st_size, 0); + if (r < 0) { + r = -errno; + goto out3; + } + + dout(20) << "objectmap clone" << dendl; + r = object_map->clone(oldoid, newoid, &spos); + if (r < 0 && r != -ENOENT) + goto out3; + } + + { + char buf[2]; + map aset; + r = _fgetattrs(**o, aset); + if (r < 0) + goto out3; + + r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, + sizeof(XATTR_NO_SPILL_OUT), true); + } else { + r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, + sizeof(XATTR_SPILL_OUT), true); + } + if (r < 0) + goto out3; + + r = _fsetattrs(**n, aset); + if (r < 0) + goto out3; + } + + // clone is non-idempotent; record our work. 
+ _set_replay_guard(**n, spos, &newoid); + + out3: + lfn_close(n); + out: + lfn_close(o); + out2: + dout(10) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(20) << "_do_clone_range copy " << srcoff << "~" << len << " to " << dstoff << dendl; + return backend->clone_range(from, to, srcoff, len, dstoff); +} + +int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(20) << __func__ << " " << srcoff << "~" << len << " to " << dstoff << dendl; + int r = 0; + map exomap; + // fiemap doesn't allow zero length + if (len == 0) + return 0; + + if (backend->has_seek_data_hole()) { + dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl; + r = _do_seek_hole_data(from, srcoff, len, &exomap); + } else if (backend->has_fiemap()) { + dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl; + r = _do_fiemap(from, srcoff, len, &exomap); + } + + int64_t written = 0; + for (map::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) { + uint64_t it_off = miter->first - srcoff + dstoff; + r = _do_copy_range(from, to, miter->first, miter->second, it_off, true); + if (r < 0) { + r = -errno; + derr << "FileStore::_do_copy_range: copy error at " << miter->first << "~" << miter->second + << " to " << it_off << ", " << cpp_strerror(r) << dendl; + break; + } + written += miter->second; + } + + if (r >= 0) { + if (m_filestore_sloppy_crc) { + int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); + assert(rc >= 0); + } + struct stat st; + r = ::fstat(to, &st); + if (r < 0) { + r = -errno; + derr << __func__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl; + goto out; + } + if (st.st_size < (int)(dstoff + len)) { + r = ::ftruncate(to, 
dstoff + len); + if (r < 0) { + r = -errno; + derr << __func__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl; + goto out; + } + } + r = written; + } + + out: + dout(20) << __func__ << " " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc) +{ + dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << dendl; + int r = 0; + loff_t pos = srcoff; + loff_t end = srcoff + len; + int buflen = 4096 * 16; //limit by pipe max size.see fcntl + +#ifdef CEPH_HAVE_SPLICE + if (backend->has_splice()) { + int pipefd[2]; + if (pipe(pipefd) < 0) { + r = errno; + derr << " pipe " << " got " << cpp_strerror(r) << dendl; + return r; + } + + loff_t dstpos = dstoff; + while (pos < end) { + int l = MIN(end-pos, buflen); + r = safe_splice(from, &pos, pipefd[1], NULL, l, SPLICE_F_NONBLOCK); + dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl; + if (r < 0) { + derr << "FileStore::_do_copy_range: safe_splice read error at " << pos << "~" << len + << ", " << cpp_strerror(r) << dendl; + break; + } + if (r == 0) { + // hrm, bad source range, wtf. 
+ r = -ERANGE; + derr << "FileStore::_do_copy_range got short read result at " << pos + << " of fd " << from << " len " << len << dendl; + break; + } + + r = safe_splice(pipefd[0], NULL, to, &dstpos, r, 0); + dout(10) << " safe_splice write to " << to << " len " << r + << " got " << r << dendl; + if (r < 0) { + derr << "FileStore::_do_copy_range: write error at " << pos << "~" + << r << ", " << cpp_strerror(r) << dendl; + break; + } + } + close(pipefd[0]); + close(pipefd[1]); + } else +#endif + { + int64_t actual; + + actual = ::lseek64(from, srcoff, SEEK_SET); + if (actual != (int64_t)srcoff) { + r = errno; + derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl; + return r; + } + actual = ::lseek64(to, dstoff, SEEK_SET); + if (actual != (int64_t)dstoff) { + r = errno; + derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl; + return r; + } + + char buf[buflen]; + while (pos < end) { + int l = MIN(end-pos, buflen); + r = ::read(from, buf, l); + dout(25) << " read from " << pos << "~" << l << " got " << r << dendl; + if (r < 0) { + if (errno == EINTR) { + continue; + } else { + r = -errno; + derr << "FileStore::_do_copy_range: read error at " << pos << "~" << len + << ", " << cpp_strerror(r) << dendl; + break; + } + } + if (r == 0) { + // hrm, bad source range, wtf. 
+ r = -ERANGE; + derr << "FileStore::_do_copy_range got short read result at " << pos + << " of fd " << from << " len " << len << dendl; + break; + } + int op = 0; + while (op < r) { + int r2 = safe_write(to, buf+op, r-op); + dout(25) << " write to " << to << " len " << (r-op) + << " got " << r2 << dendl; + if (r2 < 0) { + r = r2; + derr << "FileStore::_do_copy_range: write error at " << pos << "~" + << r-op << ", " << cpp_strerror(r) << dendl; + + break; + } + op += (r-op); + } + if (r < 0) + break; + pos += r; + } + } + + assert(pos == end); + if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); + assert(rc >= 0); + } + dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +int FileStore::_clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, + uint64_t srcoff, uint64_t len, uint64_t dstoff, + const SequencerPosition& spos) +{ + dout(15) << "clone_range " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl; + + if (_check_replay_guard(cid, newoid, spos) < 0) + return 0; + + int r; + FDRef o, n; + r = lfn_open(cid, oldoid, false, &o); + if (r < 0) { + goto out2; + } + r = lfn_open(cid, newoid, true, &n); + if (r < 0) { + goto out; + } + r = _do_clone_range(**o, **n, srcoff, len, dstoff); + if (r < 0) { + r = -errno; + goto out3; + } + + // clone is non-idempotent; record our work. 
+ _set_replay_guard(**n, spos, &newoid); + + out3: + lfn_close(n); + out: + lfn_close(o); + out2: + dout(10) << "clone_range " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " " + << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +class SyncEntryTimeout : public Context { +public: + SyncEntryTimeout(int commit_timeo) + : m_commit_timeo(commit_timeo) + { + } + + void finish(int r) { + BackTrace *bt = new BackTrace(1); + generic_dout(-1) << "FileStore: sync_entry timed out after " + << m_commit_timeo << " seconds.\n"; + bt->print(*_dout); + *_dout << dendl; + delete bt; + ceph_abort(); + } +private: + int m_commit_timeo; +}; + +void FileStore::sync_entry() +{ + lock.Lock(); + while (!stop) { + utime_t max_interval; + max_interval.set_from_double(m_filestore_max_sync_interval); + utime_t min_interval; + min_interval.set_from_double(m_filestore_min_sync_interval); + + utime_t startwait = ceph_clock_now(g_ceph_context); + if (!force_sync) { + dout(20) << "sync_entry waiting for max_interval " << max_interval << dendl; + sync_cond.WaitInterval(g_ceph_context, lock, max_interval); + } else { + dout(20) << "sync_entry not waiting, force_sync set" << dendl; + } + + if (force_sync) { + dout(20) << "sync_entry force_sync set" << dendl; + force_sync = false; + } else { + // wait for at least the min interval + utime_t woke = ceph_clock_now(g_ceph_context); + woke -= startwait; + dout(20) << "sync_entry woke after " << woke << dendl; + if (woke < min_interval) { + utime_t t = min_interval; + t -= woke; + dout(20) << "sync_entry waiting for another " << t + << " to reach min interval " << min_interval << dendl; + sync_cond.WaitInterval(g_ceph_context, lock, t); + } + } + + list fin; + again: + fin.swap(sync_waiters); + lock.Unlock(); + + op_tp.pause(); + if (apply_manager.commit_start()) { + utime_t start = ceph_clock_now(g_ceph_context); + uint64_t cp = apply_manager.get_committing_seq(); + + sync_entry_timeo_lock.Lock(); + 
SyncEntryTimeout *sync_entry_timeo = + new SyncEntryTimeout(m_filestore_commit_timeout); + timer.add_event_after(m_filestore_commit_timeout, sync_entry_timeo); + sync_entry_timeo_lock.Unlock(); + + logger->set(l_os_committing, 1); + + dout(15) << "sync_entry committing " << cp << dendl; + stringstream errstream; + if (g_conf->filestore_debug_omap_check && !object_map->check(errstream)) { + derr << errstream.str() << dendl; + assert(0); + } + + if (backend->can_checkpoint()) { + int err = write_op_seq(op_fd, cp); + if (err < 0) { + derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl; + assert(0 == "error during write_op_seq"); + } + + char s[NAME_MAX]; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp); + uint64_t cid = 0; + err = backend->create_checkpoint(s, &cid); + if (err < 0) { + int err = errno; + derr << "snap create '" << s << "' got error " << err << dendl; + assert(err == 0); + } + + snaps.push_back(cp); + apply_manager.commit_started(); + op_tp.unpause(); + + if (cid > 0) { + dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl; + err = backend->sync_checkpoint(cid); + if (err < 0) { + derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl; + assert(0 == "wait_sync got error"); + } + dout(20) << " done waiting for checkpoint" << cid << " to complete" << dendl; + } + } else + { + apply_manager.commit_started(); + op_tp.unpause(); + + object_map->sync(); + int err = backend->syncfs(); + if (err < 0) { + derr << "syncfs got " << cpp_strerror(err) << dendl; + assert(0 == "syncfs returned error"); + } + + err = write_op_seq(op_fd, cp); + if (err < 0) { + derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl; + assert(0 == "error during write_op_seq"); + } + err = ::fsync(op_fd); + if (err < 0) { + derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl; + assert(0 == "error during fsync of op_seq"); + } + } + + utime_t done = ceph_clock_now(g_ceph_context); + utime_t lat = 
done - start; + utime_t dur = done - startwait; + dout(10) << "sync_entry commit took " << lat << ", interval was " << dur << dendl; + + logger->inc(l_os_commit); + logger->tinc(l_os_commit_lat, lat); + logger->tinc(l_os_commit_len, dur); + + apply_manager.commit_finish(); + wbthrottle.clear(); + + logger->set(l_os_committing, 0); + + // remove old snaps? + if (backend->can_checkpoint()) { + char s[NAME_MAX]; + while (snaps.size() > 2) { + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front()); + snaps.pop_front(); + dout(10) << "removing snap '" << s << "'" << dendl; + int r = backend->destroy_checkpoint(s); + if (r) { + int err = errno; + derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl; + } + } + } + + dout(15) << "sync_entry committed to op_seq " << cp << dendl; + + sync_entry_timeo_lock.Lock(); + timer.cancel_event(sync_entry_timeo); + sync_entry_timeo_lock.Unlock(); + } else { + op_tp.unpause(); + } + + lock.Lock(); + finish_contexts(g_ceph_context, fin, 0); + fin.clear(); + if (!sync_waiters.empty()) { + dout(10) << "sync_entry more waiters, committing again" << dendl; + goto again; + } + if (!stop && journal && journal->should_commit_now()) { + dout(10) << "sync_entry journal says we should commit again (probably is/was full)" << dendl; + goto again; + } + } + stop = false; + lock.Unlock(); +} + +void FileStore::_start_sync() +{ + if (!journal) { // don't do a big sync if the journal is on + dout(10) << "start_sync" << dendl; + sync_cond.Signal(); + } else { + dout(10) << "start_sync - NOOP (journal is on)" << dendl; + } +} + +void FileStore::do_force_sync() +{ + dout(10) << __func__ << dendl; + Mutex::Locker l(lock); + force_sync = true; + sync_cond.Signal(); +} + +void FileStore::start_sync(Context *onsafe) +{ + Mutex::Locker l(lock); + sync_waiters.push_back(onsafe); + sync_cond.Signal(); + force_sync = true; + dout(10) << "start_sync" << dendl; +} + +void FileStore::sync() +{ + Mutex 
l("FileStore::sync"); + Cond c; + bool done; + C_SafeCond *fin = new C_SafeCond(&l, &c, &done); + + start_sync(fin); + + l.Lock(); + while (!done) { + dout(10) << "sync waiting" << dendl; + c.Wait(l); + } + l.Unlock(); + dout(10) << "sync done" << dendl; +} + +void FileStore::_flush_op_queue() +{ + dout(10) << "_flush_op_queue draining op tp" << dendl; + op_wq.drain(); + dout(10) << "_flush_op_queue waiting for apply finisher" << dendl; + for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->wait_for_empty(); + } +} + +/* + * flush - make every queued write readable + */ +void FileStore::flush() +{ + dout(10) << "flush" << dendl; + + if (g_conf->filestore_blackhole) { + // wait forever + Mutex lock("FileStore::flush::lock"); + Cond cond; + lock.Lock(); + while (true) + cond.Wait(lock); + assert(0); + } + + if (m_filestore_journal_writeahead) { + if (journal) + journal->flush(); + dout(10) << "flush draining ondisk finisher" << dendl; + for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->wait_for_empty(); + } + } + + _flush_op_queue(); + dout(10) << "flush complete" << dendl; +} + +/* + * sync_and_flush - make every queued write readable AND committed to disk + */ +void FileStore::sync_and_flush() +{ + dout(10) << "sync_and_flush" << dendl; + + if (m_filestore_journal_writeahead) { + if (journal) + journal->flush(); + _flush_op_queue(); + } else { + // includes m_filestore_journal_parallel + _flush_op_queue(); + sync(); + } + dout(10) << "sync_and_flush done" << dendl; +} + +int FileStore::flush_journal() +{ + dout(10) << __func__ << dendl; + sync_and_flush(); + sync(); + return 0; +} + +int FileStore::snapshot(const string& name) +{ + dout(10) << "snapshot " << name << dendl; + sync_and_flush(); + + if (!backend->can_checkpoint()) { + dout(0) << "snapshot " << name << " failed, not supported" << dendl; + return -EOPNOTSUPP; + } + + char s[NAME_MAX]; + snprintf(s, sizeof(s), 
CLUSTER_SNAP_ITEM, name.c_str()); + + int r = backend->create_checkpoint(s, NULL); + if (r) { + r = -errno; + derr << "snapshot " << name << " failed: " << cpp_strerror(r) << dendl; + } + + return r; +} + +// ------------------------------- +// attributes + +int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp) +{ + char val[CHAIN_XATTR_MAX_BLOCK_LEN]; + int l = chain_fgetxattr(fd, name, val, sizeof(val)); + if (l >= 0) { + bp = buffer::create(l); + memcpy(bp.c_str(), val, l); + } else if (l == -ERANGE) { + l = chain_fgetxattr(fd, name, 0, 0); + if (l > 0) { + bp = buffer::create(l); + l = chain_fgetxattr(fd, name, bp.c_str(), l); + } + } + assert(!m_filestore_fail_eio || l != -EIO); + return l; +} + +int FileStore::_fgetattrs(int fd, map& aset) +{ + // get attr list + char names1[100]; + int len = chain_flistxattr(fd, names1, sizeof(names1)-1); + char *names2 = 0; + char *name = 0; + if (len == -ERANGE) { + len = chain_flistxattr(fd, 0, 0); + if (len < 0) { + assert(!m_filestore_fail_eio || len != -EIO); + return len; + } + dout(10) << " -ERANGE, len is " << len << dendl; + names2 = new char[len+1]; + len = chain_flistxattr(fd, names2, len); + dout(10) << " -ERANGE, got " << len << dendl; + if (len < 0) { + assert(!m_filestore_fail_eio || len != -EIO); + delete[] names2; + return len; + } + name = names2; + } else if (len < 0) { + assert(!m_filestore_fail_eio || len != -EIO); + return len; + } else { + name = names1; + } + name[len] = 0; + + char *end = name + len; + while (name < end) { + char *attrname = name; + if (parse_attrname(&name)) { + if (*name) { + dout(20) << "fgetattrs " << fd << " getting '" << name << "'" << dendl; + int r = _fgetattr(fd, attrname, aset[name]); + if (r < 0) { + delete[] names2; + return r; + } + } + } + name += strlen(name) + 1; + } + + delete[] names2; + return 0; +} + +int FileStore::_fsetattrs(int fd, map &aset) +{ + for (map::iterator p = aset.begin(); + p != aset.end(); + ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + 
get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + const char *val; + if (p->second.length()) + val = p->second.c_str(); + else + val = ""; + // ??? Why do we skip setting all the other attrs if one fails? + int r = chain_fsetxattr(fd, n, val, p->second.length()); + if (r < 0) { + derr << "FileStore::_setattrs: chain_setxattr returned " << r << dendl; + return r; + } + } + return 0; +} + +// debug EIO injection +void FileStore::inject_data_error(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __func__ << ": init error on " << oid << dendl; + data_error_set.insert(oid); +} +void FileStore::inject_mdata_error(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __func__ << ": init error on " << oid << dendl; + mdata_error_set.insert(oid); +} +void FileStore::debug_obj_on_delete(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __func__ << ": clear error on " << oid << dendl; + data_error_set.erase(oid); + mdata_error_set.erase(oid); +} +bool FileStore::debug_data_eio(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + if (data_error_set.count(oid)) { + dout(10) << __func__ << ": inject error on " << oid << dendl; + return true; + } else { + return false; + } +} +bool FileStore::debug_mdata_eio(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + if (mdata_error_set.count(oid)) { + dout(10) << __func__ << ": inject error on " << oid << dendl; + return true; + } else { + return false; + } +} + + +// objects + +int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp) +{ + tracepoint(objectstore, getattr_enter, cid.c_str()); + _kludge_temp_object_collection(cid, oid); + dout(15) << "getattr " << cid << "/" << oid << " '" << name << "'" << dendl; + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); + r = 
_fgetattr(**fd, n, bp); + lfn_close(fd); + if (r == -ENODATA) { + map got; + set to_get; + to_get.insert(string(name)); + Index index; + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __func__ << " could not get index r = " << r << dendl; + goto out; + } + r = object_map->get_xattrs(oid, to_get, &got); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " get_xattrs err r =" << r << dendl; + goto out; + } + if (got.empty()) { + dout(10) << __func__ << " got.size() is 0" << dendl; + return -ENODATA; + } + bp = bufferptr(got.begin()->second.c_str(), + got.begin()->second.length()); + r = bp.length(); + } + out: + dout(10) << "getattr " << cid << "/" << oid << " '" << name << "' = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + if (g_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, getattr_exit, r); + return r < 0 ? r : 0; + } +} + +int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map& aset) +{ + tracepoint(objectstore, getattrs_enter, cid.c_str()); + _kludge_temp_object_collection(cid, oid); + set omap_attrs; + map omap_aset; + Index index; + dout(15) << "getattrs " << cid << "/" << oid << dendl; + FDRef fd; + bool spill_out = true; + char buf[2]; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) + spill_out = false; + + r = _fgetattrs(**fd, aset); + if (r < 0) { + goto out; + } + lfn_close(fd); + + if (!spill_out) { + dout(10) << __func__ << " no xattr exists in object_map r = " << r << dendl; + goto out; + } + + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __func__ << " could not get index r = " << r << dendl; + goto out; + } + { + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not get omap_attrs r = " << 
r << dendl; + goto out; + } + + r = object_map->get_xattrs(oid, omap_attrs, &omap_aset); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; + goto out; + } + if (r == -ENOENT) + r = 0; + } + assert(omap_attrs.size() == omap_aset.size()); + for (map::iterator i = omap_aset.begin(); + i != omap_aset.end(); + ++i) { + string key(i->first); + aset.insert(make_pair(key, + bufferptr(i->second.c_str(), i->second.length()))); + } + out: + dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + + if (g_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, getattrs_exit, r); + return r; + } +} + +int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map& aset, + const SequencerPosition &spos) +{ + map omap_set; + set omap_remove; + map inline_set; + map inline_to_set; + FDRef fd; + int spill_out = -1; + bool incomplete_inline = false; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) + spill_out = 0; + else + spill_out = 1; + + r = _fgetattrs(**fd, inline_set); + incomplete_inline = (r == -E2BIG); + assert(!m_filestore_fail_eio || r != -EIO); + dout(15) << "setattrs " << cid << "/" << oid + << (incomplete_inline ? 
" (incomplete_inline, forcing omap)" : "") + << dendl; + + for (map::iterator p = aset.begin(); + p != aset.end(); + ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + + if (incomplete_inline) { + chain_fremovexattr(**fd, n); // ignore any error + omap_set[p->first].push_back(p->second); + continue; + } + + if (p->second.length() > m_filestore_max_inline_xattr_size) { + if (inline_set.count(p->first)) { + inline_set.erase(p->first); + r = chain_fremovexattr(**fd, n); + if (r < 0) + goto out_close; + } + omap_set[p->first].push_back(p->second); + continue; + } + + if (!inline_set.count(p->first) && + inline_set.size() >= m_filestore_max_inline_xattrs) { + omap_set[p->first].push_back(p->second); + continue; + } + omap_remove.insert(p->first); + inline_set.insert(*p); + + inline_to_set.insert(*p); + } + + if (spill_out != 1 && !omap_set.empty()) { + chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, + sizeof(XATTR_SPILL_OUT)); + } + + r = _fsetattrs(**fd, inline_to_set); + if (r < 0) + goto out_close; + + if (spill_out && !omap_remove.empty()) { + r = object_map->remove_xattrs(oid, omap_remove, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + goto out_close; + } else { + r = 0; // don't confuse the debug output + } + } + + if (!omap_set.empty()) { + r = object_map->set_xattrs(oid, omap_set, &spos); + if (r < 0) { + dout(10) << __func__ << " could not set_xattrs r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + goto out_close; + } + } + out_close: + lfn_close(fd); + out: + dout(10) << "setattrs " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + +int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name, + const SequencerPosition &spos) +{ + dout(15) << "rmattr " << cid << "/" << oid << " '" << name << "'" << dendl; + FDRef fd; + bool spill_out 
= true; + bufferptr bp; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + spill_out = false; + } + + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); + r = chain_fremovexattr(**fd, n); + if (r == -ENODATA && spill_out) { + Index index; + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __func__ << " could not get index r = " << r << dendl; + goto out_close; + } + set to_remove; + to_remove.insert(string(name)); + r = object_map->remove_xattrs(oid, to_remove, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not remove_xattrs index r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + goto out_close; + } + } + out_close: + lfn_close(fd); + out: + dout(10) << "rmattr " << cid << "/" << oid << " '" << name << "' = " << r << dendl; + return r; +} + +int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid, + const SequencerPosition &spos) +{ + dout(15) << "rmattrs " << cid << "/" << oid << dendl; + + map aset; + FDRef fd; + set omap_attrs; + Index index; + bool spill_out = true; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + spill_out = false; + } + + r = _fgetattrs(**fd, aset); + if (r >= 0) { + for (map::iterator p = aset.begin(); p != aset.end(); ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + r = chain_fremovexattr(**fd, n); + if (r < 0) + break; + } + } + + if (!spill_out) { + dout(10) << __func__ << " no xattr exists in object_map r = " << r << dendl; + goto out_close; + } + + r = get_index(cid, &index); + if (r < 0) { + dout(10) << 
__func__ << " could not get index r = " << r << dendl; + goto out_close; + } + { + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + goto out_close; + } + r = object_map->remove_xattrs(oid, omap_attrs, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl; + goto out_close; + } + if (r == -ENOENT) + r = 0; + chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, + sizeof(XATTR_NO_SPILL_OUT)); + } + + out_close: + lfn_close(fd); + out: + dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + + +// collections + +int FileStore::collection_getattr(coll_t c, const char *name, + void *value, size_t size) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << "collection_getattr " << fn << " '" << name << "' len " << size << dendl; + int r; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + r = -errno; + goto out; + } + char n[PATH_MAX]; + get_attrname(name, n, PATH_MAX); + r = chain_fgetxattr(fd, n, value, size); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + out: + dout(10) << "collection_getattr " << fn << " '" << name << "' len " << size << " = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +int FileStore::collection_getattr(coll_t c, const char *name, bufferlist& bl) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << "collection_getattr " << fn << " '" << name << "'" << dendl; + char n[PATH_MAX]; + get_attrname(name, n, PATH_MAX); + buffer::ptr bp; + int r; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + r = -errno; + goto out; + } + r = _fgetattr(fd, n, bp); + bl.push_back(bp); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + out: + dout(10) << "collection_getattr " << fn << " '" << name << "' = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + 
return r; +} + +int FileStore::collection_getattrs(coll_t cid, map& aset) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + dout(10) << "collection_getattrs " << fn << dendl; + int r = 0; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + r = -errno; + goto out; + } + r = _fgetattrs(fd, aset); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + out: + dout(10) << "collection_getattrs " << fn << " = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + + +int FileStore::_collection_setattr(coll_t c, const char *name, + const void *value, size_t size) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << dendl; + char n[PATH_MAX]; + int r; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + r = -errno; + goto out; + } + get_attrname(name, n, PATH_MAX); + r = chain_fsetxattr(fd, n, value, size); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + out: + dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << " = " << r << dendl; + return r; +} + +int FileStore::_collection_rmattr(coll_t c, const char *name) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << "collection_rmattr " << fn << dendl; + char n[PATH_MAX]; + get_attrname(name, n, PATH_MAX); + int r; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + r = -errno; + goto out; + } + r = chain_fremovexattr(fd, n); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + out: + dout(10) << "collection_rmattr " << fn << " = " << r << dendl; + return r; +} + + +int FileStore::_collection_setattrs(coll_t cid, map& aset) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + dout(15) << "collection_setattrs " << fn << dendl; + int r = 0; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + r = -errno; + goto out; + } + for (map::iterator p = aset.begin(); + p != aset.end(); + ++p) { + char n[PATH_MAX]; + get_attrname(p->first.c_str(), n, PATH_MAX); + r = chain_fsetxattr(fd, n, p->second.c_str(), 
p->second.length()); + if (r < 0) + break; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + out: + dout(10) << "collection_setattrs " << fn << " = " << r << dendl; + return r; +} + +int FileStore::_collection_remove_recursive(const coll_t &cid, + const SequencerPosition &spos) +{ + struct stat st; + int r = collection_stat(cid, &st); + if (r < 0) { + if (r == -ENOENT) + return 0; + return r; + } + + vector objects; + ghobject_t max; + while (!max.is_max()) { + r = collection_list(cid, max, ghobject_t::get_max(), true, + 300, &objects, &max); + if (r < 0) + return r; + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + assert(_check_replay_guard(cid, *i, spos)); + r = _remove(cid, *i, spos); + if (r < 0) + return r; + } + } + return _destroy_collection(cid); +} + +// -------------------------- +// collections + +int FileStore::collection_version_current(coll_t c, uint32_t *version) +{ + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + + *version = index->collection_version(); + if (*version == target_version) + return 1; + else + return 0; +} + +int FileStore::list_collections(vector& ls) +{ + return list_collections(ls, false); +} + +int FileStore::list_collections(vector& ls, bool include_temp) +{ + tracepoint(objectstore, list_collections_enter); + dout(10) << "list_collections" << dendl; + + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/current", basedir.c_str()); + + int r = 0; + DIR *dir = ::opendir(fn); + if (!dir) { + r = -errno; + derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + + char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1]; + struct dirent *de; + while ((r = ::readdir_r(dir, (struct dirent *)&buf, &de)) == 0) { + if (!de) + break; + if (de->d_type == DT_UNKNOWN) { + // d_type not supported (non-ext[234], btrfs), must stat 
+ struct stat sb; + char filename[PATH_MAX]; + snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name); + + r = ::stat(filename, &sb); + if (r < 0) { + r = -errno; + derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + break; + } + if (!S_ISDIR(sb.st_mode)) { + continue; + } + } else if (de->d_type != DT_DIR) { + continue; + } + if (strcmp(de->d_name, "omap") == 0) { + continue; + } + if (de->d_name[0] == '.' && + (de->d_name[1] == '\0' || + (de->d_name[1] == '.' && + de->d_name[2] == '\0'))) + continue; + coll_t cid; + if (!cid.parse(de->d_name)) { + derr << "ignoring invalid collection '" << de->d_name << "'" << dendl; + continue; + } + if (!cid.is_temp() || include_temp) + ls.push_back(cid); + } + + if (r > 0) { + derr << "trying readdir_r " << fn << ": " << cpp_strerror(r) << dendl; + r = -r; + } + + ::closedir(dir); + assert(!m_filestore_fail_eio || r != -EIO); + tracepoint(objectstore, list_collections_exit, r); + return r; +} + +int FileStore::collection_stat(coll_t c, struct stat *st) +{ + tracepoint(objectstore, collection_stat_enter, c.c_str()); + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << "collection_stat " << fn << dendl; + int r = ::stat(fn, st); + if (r < 0) + r = -errno; + dout(10) << "collection_stat " << fn << " = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + tracepoint(objectstore, collection_stat_exit, r); + return r; +} + +bool FileStore::collection_exists(coll_t c) +{ + tracepoint(objectstore, collection_exists_enter, c.c_str()); + struct stat st; + bool ret = collection_stat(c, &st) == 0; + tracepoint(objectstore, collection_exists_exit, ret); + return ret; +} + +bool FileStore::collection_empty(coll_t c) +{ + tracepoint(objectstore, collection_empty_enter, c.c_str()); + dout(15) << "collection_empty " << c << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) + return false; + + assert(NULL != index.index); + 
RWLock::RLocker l((index.index)->access_lock); + + vector ls; + r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(), true, + 1, &ls, NULL); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return false; + } + bool ret = ls.empty(); + tracepoint(objectstore, collection_empty_exit, ret); + return ret; +} +int FileStore::collection_list(coll_t c, ghobject_t start, ghobject_t end, + bool sort_bitwise, int max, + vector *ls, ghobject_t *next) +{ + if (start.is_max()) + return 0; + + ghobject_t temp_next; + if (!next) + next = &temp_next; + // figure out the pool id. we need this in order to generate a + // meaningful 'next' value. + int64_t pool = -1; + shard_id_t shard; + { + spg_t pgid; + if (c.is_temp(&pgid)) { + pool = -2 - pgid.pool(); + shard = pgid.shard; + } else if (c.is_pg(&pgid)) { + pool = pgid.pool(); + shard = pgid.shard; + } else if (c.is_meta()) { + pool = -1; + shard = shard_id_t::NO_SHARD; + } else { + // hrm, the caller is test code! we should kill it off. for now, + // tolerate it. 
+ pool = 0; + shard = shard_id_t::NO_SHARD; + } + dout(20) << __func__ << " pool is " << pool << " shard is " << shard + << " pgid " << pgid << dendl; + } + ghobject_t sep; + sep.hobj.pool = -1; + sep.set_shard(shard); + if (!c.is_temp() && !c.is_meta()) { + if (cmp_bitwise(start, sep) < 0) { // bitwise vs nibble doesn't matter here + dout(10) << __func__ << " first checking temp pool" << dendl; + coll_t temp = c.get_temp(); + int r = collection_list(temp, start, end, sort_bitwise, max, ls, next); + if (r < 0) + return r; + if (*next != ghobject_t::get_max()) + return r; + start = sep; + dout(10) << __func__ << " fall through to non-temp collection, start " + << start << dendl; + } else { + dout(10) << __func__ << " start " << start << " >= sep " << sep << dendl; + } + } + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + + r = index->collection_list_partial(start, end, sort_bitwise, max, ls, next); + + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + dout(20) << "objects: " << ls << dendl; + + // HashIndex doesn't know the pool when constructing a 'next' value + if (next && !next->is_max()) { + next->hobj.pool = pool; + next->set_shard(shard); + dout(20) << " next " << *next << dendl; + } + + return 0; +} + +int FileStore::omap_get(coll_t c, const ghobject_t &hoid, + bufferlist *header, + map *out) +{ + tracepoint(objectstore, omap_get_enter, c.c_str()); + _kludge_temp_object_collection(c, hoid); + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get(hoid, header, out); + if (r < 0 && r != -ENOENT) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, 
omap_get_exit, 0); + return 0; +} + +int FileStore::omap_get_header( + coll_t c, + const ghobject_t &hoid, + bufferlist *bl, + bool allow_eio) +{ + tracepoint(objectstore, omap_get_header_enter, c.c_str()); + _kludge_temp_object_collection(c, hoid); + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get_header(hoid, bl); + if (r < 0 && r != -ENOENT) { + assert(allow_eio || !m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, omap_get_header_exit, 0); + return 0; +} + +int FileStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set *keys) +{ + tracepoint(objectstore, omap_get_keys_enter, c.c_str()); + _kludge_temp_object_collection(c, hoid); + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get_keys(hoid, keys); + if (r < 0 && r != -ENOENT) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, omap_get_keys_exit, 0); + return 0; +} + +int FileStore::omap_get_values(coll_t c, const ghobject_t &hoid, + const set &keys, + map *out) +{ + tracepoint(objectstore, omap_get_values_enter, c.c_str()); + _kludge_temp_object_collection(c, hoid); + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + Index index; + const char *where = 0; + int r = get_index(c, &index); + if (r < 0) { + where = " (get_index)"; + goto out; + } + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + where = " (lfn_find)"; + goto out; + } + } + r = object_map->get_values(hoid, keys, 
out); + if (r < 0 && r != -ENOENT) { + assert(!m_filestore_fail_eio || r != -EIO); + goto out; + } + r = 0; + out: + tracepoint(objectstore, omap_get_values_exit, r); + dout(15) << __func__ << " " << c << "/" << hoid << " = " << r + << where << dendl; + return r; +} + +int FileStore::omap_check_keys(coll_t c, const ghobject_t &hoid, + const set &keys, + set *out) +{ + tracepoint(objectstore, omap_check_keys_enter, c.c_str()); + _kludge_temp_object_collection(c, hoid); + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->check_keys(hoid, keys, out); + if (r < 0 && r != -ENOENT) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, omap_check_keys_exit, 0); + return 0; +} + +ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c, + const ghobject_t &hoid) +{ + tracepoint(objectstore, get_omap_iterator, c.c_str()); + _kludge_temp_object_collection(c, hoid); + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) { + dout(10) << __func__ << " " << c << "/" << hoid << " = 0 " + << "(get_index failed with " << cpp_strerror(r) << ")" << dendl; + return ObjectMap::ObjectMapIterator(); + } + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + dout(10) << __func__ << " " << c << "/" << hoid << " = 0 " + << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl; + return ObjectMap::ObjectMapIterator(); + } + } + return object_map->get_iterator(hoid); +} + +int FileStore::_collection_hint_expected_num_objs(coll_t c, uint32_t pg_num, + uint64_t expected_num_objs, + const SequencerPosition &spos) +{ + dout(15) << __func__ << " collection: " << c << " pg 
number: " + << pg_num << " expected number of objects: " << expected_num_objs << dendl; + + if (!collection_empty(c) && !replaying) { + dout(0) << "Failed to give an expected number of objects hint to collection : " + << c << ", only empty collection can take such type of hint. " << dendl; + return 0; + } + + int ret; + Index index; + ret = get_index(c, &index); + if (ret < 0) + return ret; + // Pre-hash the collection + ret = index->pre_hash_collection(pg_num, expected_num_objs); + dout(10) << "pre_hash_collection " << c << " = " << ret << dendl; + if (ret < 0) + return ret; + _set_replay_guard(c, spos); + + return 0; +} + +int FileStore::_create_collection( + coll_t c, + const SequencerPosition &spos) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << "create_collection " << fn << dendl; + int r = ::mkdir(fn, 0755); + if (r < 0) + r = -errno; + if (r == -EEXIST && replaying) + r = 0; + dout(10) << "create_collection " << fn << " = " << r << dendl; + + if (r < 0) + return r; + r = init_index(c); + if (r < 0) + return r; + + // create parallel temp collection, too + if (!c.is_meta() && !c.is_temp()) { + coll_t temp = c.get_temp(); + r = _create_collection(temp, spos); + if (r < 0) + return r; + } + + _set_replay_guard(c, spos); + return 0; +} + +int FileStore::_destroy_collection(coll_t c) +{ + int r = 0; + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << "_destroy_collection " << fn << dendl; + { + Index from; + r = get_index(c, &from); + if (r < 0) + goto out; + assert(NULL != from.index); + RWLock::WLocker l((from.index)->access_lock); + + r = from->prep_delete(); + if (r < 0) + goto out; + } + r = ::rmdir(fn); + if (r < 0) { + r = -errno; + goto out; + } + + out: + // destroy parallel temp collection, too + if (!c.is_meta() && !c.is_temp()) { + coll_t temp = c.get_temp(); + int r2 = _destroy_collection(temp); + if (r2 < 0) { + r = r2; + goto out_final; + } + } + + out_final: + dout(10) << "_destroy_collection " << fn << " = " << 
r << dendl; + return r; +} + + +int FileStore::_collection_add(coll_t c, coll_t oldcid, const ghobject_t& o, + const SequencerPosition& spos) +{ + dout(15) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << dendl; + + int dstcmp = _check_replay_guard(c, o, spos); + if (dstcmp < 0) + return 0; + + // check the src name too; it might have a newer guard, and we don't + // want to clobber it + int srccmp = _check_replay_guard(oldcid, o, spos); + if (srccmp < 0) + return 0; + + // open guard on object so we don't any previous operations on the + // new name that will modify the source inode. + FDRef fd; + int r = lfn_open(oldcid, o, 0, &fd); + if (r < 0) { + // the source collection/object does not exist. If we are replaying, we + // should be safe, so just return 0 and move on. + assert(replaying); + dout(10) << "collection_add " << c << "/" << o << " from " + << oldcid << "/" << o << " (dne, continue replay) " << dendl; + return 0; + } + if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" + _set_replay_guard(**fd, spos, &o, true); + } + + r = lfn_link(oldcid, c, o, o); + if (replaying && !backend->can_checkpoint() && + r == -EEXIST) // crashed between link() and set_replay_guard() + r = 0; + + _inject_failure(); + + // close guard on object so we don't do this again + if (r == 0) { + _close_replay_guard(**fd, spos); + } + lfn_close(fd); + + dout(10) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl; + return r; +} + +int FileStore::_collection_move_rename(coll_t oldcid, const ghobject_t& oldoid, + coll_t c, const ghobject_t& o, + const SequencerPosition& spos) +{ + dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl; + int r = 0; + int dstcmp, srccmp; + + if (replaying) { + /* If the destination collection doesn't exist during replay, + * we need to delete the src object and continue on + */ + if (!collection_exists(c)) + goto out_rm_src; 
+ } + + dstcmp = _check_replay_guard(c, o, spos); + if (dstcmp < 0) + goto out_rm_src; + + // check the src name too; it might have a newer guard, and we don't + // want to clobber it + srccmp = _check_replay_guard(oldcid, oldoid, spos); + if (srccmp < 0) + return 0; + + { + // open guard on object so we don't any previous operations on the + // new name that will modify the source inode. + FDRef fd; + r = lfn_open(oldcid, oldoid, 0, &fd); + if (r < 0) { + // the source collection/object does not exist. If we are replaying, we + // should be safe, so just return 0 and move on. + assert(replaying); + dout(10) << __func__ << " " << c << "/" << o << " from " + << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl; + return 0; + } + if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" + _set_replay_guard(**fd, spos, &o, true); + } + + r = lfn_link(oldcid, c, oldoid, o); + if (replaying && !backend->can_checkpoint() && + r == -EEXIST) // crashed between link() and set_replay_guard() + r = 0; + + _inject_failure(); + + if (r == 0) { + // the name changed; link the omap content + r = object_map->clone(oldoid, o, &spos); + if (r == -ENOENT) + r = 0; + } + + _inject_failure(); + + lfn_close(fd); + fd = FDRef(); + + if (r == 0) + r = lfn_unlink(oldcid, oldoid, spos, true); + + if (r == 0) + r = lfn_open(c, o, 0, &fd); + + // close guard on object so we don't do this again + if (r == 0) + _close_replay_guard(**fd, spos); + + lfn_close(fd); + } + + dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid + << " = " << r << dendl; + return r; + + out_rm_src: + // remove source + if (_check_replay_guard(oldcid, oldoid, spos) > 0) { + r = lfn_unlink(oldcid, oldoid, spos, true); + } + + dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid + << " = " << r << dendl; + return r; +} + +void FileStore::_inject_failure() +{ + if (m_filestore_kill_at.read()) { + int final = 
m_filestore_kill_at.dec(); + dout(5) << "_inject_failure " << (final+1) << " -> " << final << dendl; + if (final == 0) { + derr << "_inject_failure KILLING" << dendl; + g_ceph_context->_log->flush(); + _exit(1); + } + } +} + +int FileStore::_omap_clear(coll_t cid, const ghobject_t &hoid, + const SequencerPosition &spos) { + dout(15) << __func__ << " " << cid << "/" << hoid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->clear_keys_header(hoid, &spos); + if (r < 0 && r != -ENOENT) + return r; + return 0; +} + +int FileStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid, + const map &aset, + const SequencerPosition &spos) { + dout(15) << __func__ << " " << cid << "/" << hoid << dendl; + Index index; + int r; + //treat pgmeta as a logical object, skip to check exist + if (hoid.is_pgmeta()) + goto skip; + + r = get_index(cid, &index); + if (r < 0) { + dout(20) << __func__ << " get_index got " << cpp_strerror(r) << dendl; + return r; + } + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + dout(20) << __func__ << " lfn_find got " << cpp_strerror(r) << dendl; + return r; + } + } +skip: + r = object_map->set_keys(hoid, aset, &spos); + dout(20) << __func__ << " " << cid << "/" << hoid << " = " << r << dendl; + return r; +} + +int FileStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid, + const set &keys, + const SequencerPosition &spos) { + dout(15) << __func__ << " " << cid << "/" << hoid << dendl; + Index index; + int r; + //treat pgmeta as a logical object, skip to check exist + if (hoid.is_pgmeta()) + goto skip; + + r = get_index(cid, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + 
return r; + } +skip: + r = object_map->rm_keys(hoid, keys, &spos); + if (r < 0 && r != -ENOENT) + return r; + return 0; +} + +int FileStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid, + const string& first, const string& last, + const SequencerPosition &spos) { + dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl; + set keys; + { + ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid); + if (!iter) + return -ENOENT; + for (iter->lower_bound(first); iter->valid() && iter->key() < last; + iter->next()) { + keys.insert(iter->key()); + } + } + return _omap_rmkeys(cid, hoid, keys, spos); +} + +int FileStore::_omap_setheader(coll_t cid, const ghobject_t &hoid, + const bufferlist &bl, + const SequencerPosition &spos) +{ + dout(15) << __func__ << " " << cid << "/" << hoid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + return object_map->set_header(hoid, bl, &spos); +} + +int FileStore::_split_collection(coll_t cid, + uint32_t bits, + uint32_t rem, + coll_t dest, + const SequencerPosition &spos) +{ + int r; + { + dout(15) << __func__ << " " << cid << " bits: " << bits << dendl; + if (!collection_exists(cid)) { + dout(2) << __func__ << ": " << cid << " DNE" << dendl; + assert(replaying); + return 0; + } + if (!collection_exists(dest)) { + dout(2) << __func__ << ": " << dest << " DNE" << dendl; + assert(replaying); + return 0; + } + + int dstcmp = _check_replay_guard(dest, spos); + if (dstcmp < 0) + return 0; + + int srccmp = _check_replay_guard(cid, spos); + if (srccmp < 0) + return 0; + + _set_global_replay_guard(cid, spos); + _set_replay_guard(cid, spos, true); + _set_replay_guard(dest, spos, true); + + Index from; + r = get_index(cid, &from); + + Index to; + if (!r) + r = get_index(dest, &to); + + if (!r) { + assert(NULL != 
from.index); + RWLock::WLocker l1((from.index)->access_lock); + + assert(NULL != to.index); + RWLock::WLocker l2((to.index)->access_lock); + + r = from->split(rem, bits, to.index); + } + + _close_replay_guard(cid, spos); + _close_replay_guard(dest, spos); + } + if (g_conf->filestore_debug_verify_split) { + vector objects; + ghobject_t next; + while (1) { + collection_list( + cid, + next, ghobject_t::get_max(), + true, + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + dout(20) << __func__ << ": " << *i << " still in source " + << cid << dendl; + assert(!i->match(bits, rem)); + } + objects.clear(); + } + next = ghobject_t(); + while (1) { + collection_list( + dest, + next, ghobject_t::get_max(), + true, + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + dout(20) << __func__ << ": " << *i << " now in dest " + << *i << dendl; + assert(i->match(bits, rem)); + } + objects.clear(); + } + } + return r; +} + +int FileStore::_set_alloc_hint(coll_t cid, const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size) +{ + dout(15) << "set_alloc_hint " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl; + + FDRef fd; + int ret; + + ret = lfn_open(cid, oid, false, &fd); + if (ret < 0) + goto out; + + { + // TODO: a more elaborate hint calculation + uint64_t hint = MIN(expected_write_size, m_filestore_max_alloc_hint_size); + + ret = backend->set_alloc_hint(**fd, hint); + dout(20) << "set_alloc_hint hint " << hint << " ret " << ret << dendl; + } + + lfn_close(fd); +out: + dout(10) << "set_alloc_hint " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl; + assert(!m_filestore_fail_eio || ret != -EIO); + return 
ret; +} + +const char** FileStore::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "filestore_min_sync_interval", + "filestore_max_sync_interval", + "filestore_queue_max_ops", + "filestore_queue_max_bytes", + "filestore_queue_committing_max_ops", + "filestore_queue_committing_max_bytes", + "filestore_commit_timeout", + "filestore_dump_file", + "filestore_kill_at", + "filestore_fail_eio", + "filestore_fadvise", + "filestore_sloppy_crc", + "filestore_sloppy_crc_block_size", + "filestore_max_alloc_hint_size", + NULL + }; + return KEYS; +} + +void FileStore::handle_conf_change(const struct md_config_t *conf, + const std::set &changed) +{ + if (changed.count("filestore_max_inline_xattr_size") || + changed.count("filestore_max_inline_xattr_size_xfs") || + changed.count("filestore_max_inline_xattr_size_btrfs") || + changed.count("filestore_max_inline_xattr_size_other") || + changed.count("filestore_max_inline_xattrs") || + changed.count("filestore_max_inline_xattrs_xfs") || + changed.count("filestore_max_inline_xattrs_btrfs") || + changed.count("filestore_max_inline_xattrs_other")) { + Mutex::Locker l(lock); + set_xattr_limits_via_conf(); + } + if (changed.count("filestore_min_sync_interval") || + changed.count("filestore_max_sync_interval") || + changed.count("filestore_queue_max_ops") || + changed.count("filestore_queue_max_bytes") || + changed.count("filestore_queue_committing_max_ops") || + changed.count("filestore_queue_committing_max_bytes") || + changed.count("filestore_kill_at") || + changed.count("filestore_fail_eio") || + changed.count("filestore_sloppy_crc") || + changed.count("filestore_sloppy_crc_block_size") || + changed.count("filestore_max_alloc_hint_size") || + changed.count("filestore_fadvise")) { + Mutex::Locker l(lock); + m_filestore_min_sync_interval = conf->filestore_min_sync_interval; + m_filestore_max_sync_interval = conf->filestore_max_sync_interval; + m_filestore_queue_max_ops = conf->filestore_queue_max_ops; + 
m_filestore_queue_max_bytes = conf->filestore_queue_max_bytes; + m_filestore_queue_committing_max_ops = conf->filestore_queue_committing_max_ops; + m_filestore_queue_committing_max_bytes = conf->filestore_queue_committing_max_bytes; + m_filestore_kill_at.set(conf->filestore_kill_at); + m_filestore_fail_eio = conf->filestore_fail_eio; + m_filestore_fadvise = conf->filestore_fadvise; + m_filestore_sloppy_crc = conf->filestore_sloppy_crc; + m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size; + m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size; + throttle_ops.reset_max(conf->filestore_queue_max_ops); + throttle_bytes.reset_max(conf->filestore_queue_max_bytes); + } + if (changed.count("filestore_commit_timeout")) { + Mutex::Locker l(sync_entry_timeo_lock); + m_filestore_commit_timeout = conf->filestore_commit_timeout; + } + if (changed.count("filestore_dump_file")) { + if (conf->filestore_dump_file.length() && + conf->filestore_dump_file != "-") { + dump_start(conf->filestore_dump_file); + } else { + dump_stop(); + } + } +} + +void FileStore::dump_start(const std::string& file) +{ + dout(10) << "dump_start " << file << dendl; + if (m_filestore_do_dump) { + dump_stop(); + } + m_filestore_dump_fmt.reset(); + m_filestore_dump_fmt.open_array_section("dump"); + m_filestore_dump.open(file.c_str()); + m_filestore_do_dump = true; +} + +void FileStore::dump_stop() +{ + dout(10) << "dump_stop" << dendl; + m_filestore_do_dump = false; + if (m_filestore_dump.is_open()) { + m_filestore_dump_fmt.close_section(); + m_filestore_dump_fmt.flush(m_filestore_dump); + m_filestore_dump.flush(); + m_filestore_dump.close(); + } +} + +void FileStore::dump_transactions(list& ls, uint64_t seq, OpSequencer *osr) +{ + m_filestore_dump_fmt.open_array_section("transactions"); + unsigned trans_num = 0; + for (list::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) { + m_filestore_dump_fmt.open_object_section("transaction"); + 
m_filestore_dump_fmt.dump_string("osr", osr->get_name()); + m_filestore_dump_fmt.dump_unsigned("seq", seq); + m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num); + (*i)->dump(&m_filestore_dump_fmt); + m_filestore_dump_fmt.close_section(); + } + m_filestore_dump_fmt.close_section(); + m_filestore_dump_fmt.flush(m_filestore_dump); + m_filestore_dump.flush(); +} + +void FileStore::set_xattr_limits_via_conf() +{ + uint32_t fs_xattr_size; + uint32_t fs_xattrs; + + switch (m_fs_type) { +#if defined(__linux__) + case XFS_SUPER_MAGIC: + fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs; + fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs; + break; + case BTRFS_SUPER_MAGIC: + fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs; + fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs; + break; +#endif + default: + fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other; + fs_xattrs = g_conf->filestore_max_inline_xattrs_other; + break; + } + + // Use override value if set + if (g_conf->filestore_max_inline_xattr_size) + m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size; + else + m_filestore_max_inline_xattr_size = fs_xattr_size; + + // Use override value if set + if (g_conf->filestore_max_inline_xattrs) + m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs; + else + m_filestore_max_inline_xattrs = fs_xattrs; +} + +// -- FSSuperblock -- + +void FSSuperblock::encode(bufferlist &bl) const +{ + ENCODE_START(2, 1, bl); + compat_features.encode(bl); + ::encode(omap_backend, bl); + ENCODE_FINISH(bl); +} + +void FSSuperblock::decode(bufferlist::iterator &bl) +{ + DECODE_START(2, bl); + compat_features.decode(bl); + if (struct_v >= 2) + ::decode(omap_backend, bl); + else + omap_backend = "leveldb"; + DECODE_FINISH(bl); +} + +void FSSuperblock::dump(Formatter *f) const +{ + f->open_object_section("compat"); + compat_features.dump(f); + f->dump_string("omap_backend", omap_backend); + f->close_section(); +} + 
+void FSSuperblock::generate_test_instances(list& o) +{ + FSSuperblock z; + o.push_back(new FSSuperblock(z)); + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); + z.compat_features = CompatSet(feature_compat, feature_ro_compat, + feature_incompat); + o.push_back(new FSSuperblock(z)); + z.omap_backend = "rocksdb"; + o.push_back(new FSSuperblock(z)); +} diff --git a/src/os/filestore/FileStore.h b/src/os/filestore/FileStore.h new file mode 100644 index 000000000000..cb77287f4c6f --- /dev/null +++ b/src/os/filestore/FileStore.h @@ -0,0 +1,816 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_FILESTORE_H +#define CEPH_FILESTORE_H + +#include "include/types.h" + +#include +#include +#include +#include +using namespace std; + +#include "include/unordered_map.h" + +#include "include/assert.h" + +#include "os/ObjectStore.h" +#include "JournalingObjectStore.h" + +#include "common/Timer.h" +#include "common/WorkQueue.h" + +#include "common/Mutex.h" +#include "HashIndex.h" +#include "IndexManager.h" +#include "os/ObjectMap.h" +#include "SequencerPosition.h" +#include "FDCache.h" +#include "WBThrottle.h" + +#include "include/uuid.h" + + +// from include/linux/falloc.h: +#ifndef FALLOC_FL_PUNCH_HOLE +# define FALLOC_FL_PUNCH_HOLE 0x2 +#endif + +#if defined(__linux__) +# ifndef BTRFS_SUPER_MAGIC +#define BTRFS_SUPER_MAGIC 0x9123683E +# endif +# ifndef XFS_SUPER_MAGIC +#define XFS_SUPER_MAGIC 0x58465342 +# endif +# ifndef ZFS_SUPER_MAGIC +#define ZFS_SUPER_MAGIC 0x2fc12fc1 +# endif +#endif + + +class FileStoreBackend; + +#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects") + +class FSSuperblock { +public: + CompatSet compat_features; + string omap_backend; + + FSSuperblock() { } + + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(FSSuperblock) + +inline ostream& operator<<(ostream& out, const FSSuperblock& sb) +{ + return out << "sb(" << sb.compat_features << "): " + << sb.omap_backend; +} + +class FileStore : public JournalingObjectStore, + public md_config_obs_t +{ + static const uint32_t target_version = 4; +public: + uint32_t get_target_version() { + return target_version; + } + + static int get_block_device_fsid(const string& path, uuid_d *fsid); + + struct FSPerfTracker { + PerfCounters::avg_tracker os_commit_latency; + PerfCounters::avg_tracker os_apply_latency; + + objectstore_perf_stat_t get_cur_stats() const { + objectstore_perf_stat_t ret; + 
ret.filestore_commit_latency = os_commit_latency.avg(); + ret.filestore_apply_latency = os_apply_latency.avg(); + return ret; + } + + void update_from_perfcounters(PerfCounters &logger); + } perf_tracker; + objectstore_perf_stat_t get_cur_stats() { + perf_tracker.update_from_perfcounters(*logger); + return perf_tracker.get_cur_stats(); + } + +private: + string internal_name; ///< internal name, used to name the perfcounter instance + string basedir, journalpath; + osflagbits_t generic_flags; + std::string current_fn; + std::string current_op_seq_fn; + std::string omap_dir; + uuid_d fsid; + + size_t blk_size; ///< fs block size + + int fsid_fd, op_fd, basedir_fd, current_fd; + + FileStoreBackend *backend; + + void create_backend(long f_type); + + deque snaps; + + // Indexed Collections + IndexManager index_manager; + int get_index(coll_t c, Index *index); + int init_index(coll_t c); + + void _kludge_temp_object_collection(coll_t& cid, const ghobject_t& oid) { + // - normal temp case: cid is pg, object is temp (pool < -1) + // - hammer temp case: cid is pg (or already temp), object pool is -1 + if (cid.is_pg() && (oid.hobj.pool < -1 || + oid.hobj.pool == -1)) + cid = cid.get_temp(); + } + void init_temp_collections(); + + // ObjectMap + boost::scoped_ptr object_map; + + // helper fns + int get_cdir(coll_t cid, char *s, int len); + + /// read a uuid from fd + int read_fsid(int fd, uuid_d *uuid); + + /// lock fsid_fd + int lock_fsid(); + + // sync thread + Mutex lock; + bool force_sync; + Cond sync_cond; + + Mutex sync_entry_timeo_lock; + SafeTimer timer; + + list sync_waiters; + bool stop; + void sync_entry(); + struct SyncThread : public Thread { + FileStore *fs; + SyncThread(FileStore *f) : fs(f) {} + void *entry() { + fs->sync_entry(); + return 0; + } + } sync_thread; + + // -- op workqueue -- + struct Op { + utime_t start; + uint64_t op; + list tls; + Context *onreadable, *onreadable_sync; + uint64_t ops, bytes; + TrackedOpRef osd_op; + }; + class OpSequencer : 
public Sequencer_impl { + Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock) + list q; + list jq; + list > flush_commit_waiters; + Cond cond; + public: + Sequencer *parent; + Mutex apply_lock; // for apply mutual exclusion + int id; + + /// get_max_uncompleted + bool _get_max_uncompleted( + uint64_t *seq ///< [out] max uncompleted seq + ) { + assert(qlock.is_locked()); + assert(seq); + *seq = 0; + if (q.empty() && jq.empty()) + return true; + + if (!q.empty()) + *seq = q.back()->op; + if (!jq.empty() && jq.back() > *seq) + *seq = jq.back(); + + return false; + } /// @returns true if both queues are empty + + /// get_min_uncompleted + bool _get_min_uncompleted( + uint64_t *seq ///< [out] min uncompleted seq + ) { + assert(qlock.is_locked()); + assert(seq); + *seq = 0; + if (q.empty() && jq.empty()) + return true; + + if (!q.empty()) + *seq = q.front()->op; + if (!jq.empty() && jq.front() < *seq) + *seq = jq.front(); + + return false; + } /// @returns true if both queues are empty + + void _wake_flush_waiters(list *to_queue) { + uint64_t seq; + if (_get_min_uncompleted(&seq)) + seq = -1; + + for (list >::iterator i = + flush_commit_waiters.begin(); + i != flush_commit_waiters.end() && i->first < seq; + flush_commit_waiters.erase(i++)) { + to_queue->push_back(i->second); + } + } + + void queue_journal(uint64_t s) { + Mutex::Locker l(qlock); + jq.push_back(s); + } + void dequeue_journal(list *to_queue) { + Mutex::Locker l(qlock); + jq.pop_front(); + cond.Signal(); + _wake_flush_waiters(to_queue); + } + void queue(Op *o) { + Mutex::Locker l(qlock); + q.push_back(o); + } + Op *peek_queue() { + Mutex::Locker l(qlock); + assert(apply_lock.is_locked()); + return q.front(); + } + + Op *dequeue(list *to_queue) { + assert(to_queue); + assert(apply_lock.is_locked()); + Mutex::Locker l(qlock); + Op *o = q.front(); + q.pop_front(); + cond.Signal(); + + _wake_flush_waiters(to_queue); + return o; + } + + void flush() { + Mutex::Locker l(qlock); 
+ + while (g_conf->filestore_blackhole) + cond.Wait(qlock); // wait forever + + + // get max for journal _or_ op queues + uint64_t seq = 0; + if (!q.empty()) + seq = q.back()->op; + if (!jq.empty() && jq.back() > seq) + seq = jq.back(); + + if (seq) { + // everything prior to our watermark to drain through either/both queues + while ((!q.empty() && q.front()->op <= seq) || + (!jq.empty() && jq.front() <= seq)) + cond.Wait(qlock); + } + } + bool flush_commit(Context *c) { + Mutex::Locker l(qlock); + uint64_t seq = 0; + if (_get_max_uncompleted(&seq)) { + return true; + } else { + flush_commit_waiters.push_back(make_pair(seq, c)); + return false; + } + } + + OpSequencer(int i) + : qlock("FileStore::OpSequencer::qlock", false, false), + parent(0), + apply_lock("FileStore::OpSequencer::apply_lock", false, false), + id(i) {} + ~OpSequencer() { + assert(q.empty()); + } + + const string& get_name() const { + return parent->get_name(); + } + }; + + friend ostream& operator<<(ostream& out, const OpSequencer& s); + + FDCache fdcache; + WBThrottle wbthrottle; + + atomic_t next_osr_id; + deque op_queue; + Throttle throttle_ops, throttle_bytes; + const int m_ondisk_finisher_num; + const int m_apply_finisher_num; + vector ondisk_finishers; + vector apply_finishers; + + ThreadPool op_tp; + struct OpWQ : public ThreadPool::WorkQueue { + FileStore *store; + OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp) + : ThreadPool::WorkQueue("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {} + + bool _enqueue(OpSequencer *osr) { + store->op_queue.push_back(osr); + return true; + } + void _dequeue(OpSequencer *o) { + assert(0); + } + bool _empty() { + return store->op_queue.empty(); + } + OpSequencer *_dequeue() { + if (store->op_queue.empty()) + return NULL; + OpSequencer *osr = store->op_queue.front(); + store->op_queue.pop_front(); + return osr; + } + void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) { + store->_do_op(osr, handle); + } + 
using ThreadPool::WorkQueue::_process; + void _process_finish(OpSequencer *osr) { + store->_finish_op(osr); + } + void _clear() { + assert(store->op_queue.empty()); + } + } op_wq; + + void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle); + void _finish_op(OpSequencer *o); + Op *build_op(list& tls, + Context *onreadable, Context *onreadable_sync, + TrackedOpRef osd_op); + void queue_op(OpSequencer *osr, Op *o); + void op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle = NULL); + void op_queue_release_throttle(Op *o); + void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk); + friend struct C_JournaledAhead; + + void new_journal(); + + PerfCounters *logger; + +public: + int lfn_find(const ghobject_t& oid, const Index& index, + IndexedPath *path = NULL); + int lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length); + int lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf); + int lfn_open( + coll_t cid, + const ghobject_t& oid, + bool create, + FDRef *outfd, + Index *index = 0); + + void lfn_close(FDRef fd); + int lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid) ; + int lfn_unlink(coll_t cid, const ghobject_t& o, const SequencerPosition &spos, + bool force_clear_omap=false); + +public: + FileStore(const std::string &base, const std::string &jdev, + osflagbits_t flags = 0, + const char *internal_name = "filestore", bool update_to=false); + ~FileStore(); + + int _detect_fs(); + int _sanity_check_fs(); + + bool test_mount_in_use(); + int read_op_seq(uint64_t *seq); + int write_op_seq(int, uint64_t seq); + int mount(); + int umount(); + unsigned get_max_object_name_length() { + // not safe for all file systems, btw! use the tunable to limit this. 
+ return 4096; + } + unsigned get_max_attr_name_length() { + // xattr limit is 128; leave room for our prefixes (user.ceph._), + // some margin, and cap at 100 + return 100; + } + int mkfs(); + int mkjournal(); + bool wants_journal() { + return true; + } + bool allows_journal() { + return true; + } + bool needs_journal() { + return false; + } + + int write_version_stamp(); + int version_stamp_is_valid(uint32_t *version); + int update_version_stamp(); + int upgrade(); + + bool can_sort_nibblewise() { + return true; // i support legacy sort order + } + + void collect_metadata(map *pm); + + int statfs(struct statfs *buf); + + int _do_transactions( + list &tls, uint64_t op_seq, + ThreadPool::TPHandle *handle); + int do_transactions(list &tls, uint64_t op_seq) { + return _do_transactions(tls, op_seq, 0); + } + unsigned _do_transaction( + Transaction& t, uint64_t op_seq, int trans_num, + ThreadPool::TPHandle *handle); + + int queue_transactions(Sequencer *osr, list& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL); + + /** + * set replay guard xattr on given file + * + * This will ensure that we will not replay this (or any previous) operation + * against this particular inode/object. 
+ * + * @param fd open file descriptor for the file/object + * @param spos sequencer position of the last operation we should not replay + */ + void _set_replay_guard(int fd, + const SequencerPosition& spos, + const ghobject_t *oid=0, + bool in_progress=false); + void _set_replay_guard(coll_t cid, + const SequencerPosition& spos, + bool in_progress); + void _set_global_replay_guard(coll_t cid, + const SequencerPosition &spos); + + /// close a replay guard opened with in_progress=true + void _close_replay_guard(int fd, const SequencerPosition& spos); + void _close_replay_guard(coll_t cid, const SequencerPosition& spos); + + /** + * check replay guard xattr on given file + * + * Check the current position against any marker on the file that + * indicates which operations have already been applied. If the + * current or a newer operation has been marked as applied, we + * should not replay the current operation again. + * + * If we are not replaying the journal, we already return true. It + * is only on replay that we might return false, indicated that the + * operation should not be performed (again). 
+ * + * @param fd open fd on the file/object in question + * @param spos sequencerposition for an operation we could apply/replay + * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress + */ + int _check_replay_guard(int fd, const SequencerPosition& spos); + int _check_replay_guard(coll_t cid, const SequencerPosition& spos); + int _check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& pos); + int _check_global_replay_guard(coll_t cid, const SequencerPosition& spos); + + // ------------------ + // objects + int pick_object_revision_lt(ghobject_t& oid) { + return 0; + } + bool exists(coll_t cid, const ghobject_t& oid); + int stat( + coll_t cid, + const ghobject_t& oid, + struct stat *st, + bool allow_eio = false); + int read( + coll_t cid, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags = 0, + bool allow_eio = false); + int _do_fiemap(int fd, uint64_t offset, size_t len, + map *m); + int _do_seek_hole_data(int fd, uint64_t offset, size_t len, + map *m); + int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl); + + int _touch(coll_t cid, const ghobject_t& oid); + int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, + const bufferlist& bl, uint32_t fadvise_flags = 0); + int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len); + int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size); + int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, + const SequencerPosition& spos); + int _clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, + uint64_t srcoff, uint64_t len, uint64_t dstoff, + const SequencerPosition& spos); + int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); + int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff); + int 
_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false); + int _remove(coll_t cid, const ghobject_t& oid, const SequencerPosition &spos); + + int _fgetattr(int fd, const char *name, bufferptr& bp); + int _fgetattrs(int fd, map& aset); + int _fsetattrs(int fd, map &aset); + + void _start_sync(); + + void do_force_sync(); + void start_sync(Context *onsafe); + void sync(); + void _flush_op_queue(); + void flush(); + void sync_and_flush(); + + int flush_journal(); + int dump_journal(ostream& out); + + void set_fsid(uuid_d u) { + fsid = u; + } + uuid_d get_fsid() { return fsid; } + + // DEBUG read error injection, an object is removed from both on delete() + Mutex read_error_lock; + set data_error_set; // read() will return -EIO + set mdata_error_set; // getattr(),stat() will return -EIO + void inject_data_error(const ghobject_t &oid); + void inject_mdata_error(const ghobject_t &oid); + void debug_obj_on_delete(const ghobject_t &oid); + bool debug_data_eio(const ghobject_t &oid); + bool debug_mdata_eio(const ghobject_t &oid); + + int snapshot(const string& name); + + // attrs + int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp); + int getattrs(coll_t cid, const ghobject_t& oid, map& aset); + + int _setattrs(coll_t cid, const ghobject_t& oid, map& aset, + const SequencerPosition &spos); + int _rmattr(coll_t cid, const ghobject_t& oid, const char *name, + const SequencerPosition &spos); + int _rmattrs(coll_t cid, const ghobject_t& oid, + const SequencerPosition &spos); + + int collection_getattr(coll_t c, const char *name, void *value, size_t size); + int collection_getattr(coll_t c, const char *name, bufferlist& bl); + int collection_getattrs(coll_t cid, map &aset); + + int _collection_setattr(coll_t c, const char *name, const void *value, size_t size); + int _collection_rmattr(coll_t c, const char *name); + int _collection_setattrs(coll_t cid, map &aset); + int 
_collection_remove_recursive(const coll_t &cid, + const SequencerPosition &spos); + + // collections + int collection_list(coll_t c, ghobject_t start, ghobject_t end, + bool sort_bitwise, int max, + vector *ls, ghobject_t *next); + int list_collections(vector& ls); + int list_collections(vector& ls, bool include_temp); + int collection_version_current(coll_t c, uint32_t *version); + int collection_stat(coll_t c, struct stat *st); + bool collection_exists(coll_t c); + bool collection_empty(coll_t c); + + // omap (see ObjectStore.h for documentation) + int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header, + map *out); + int omap_get_header( + coll_t c, + const ghobject_t &oid, + bufferlist *out, + bool allow_eio = false); + int omap_get_keys(coll_t c, const ghobject_t &oid, set *keys); + int omap_get_values(coll_t c, const ghobject_t &oid, const set &keys, + map *out); + int omap_check_keys(coll_t c, const ghobject_t &oid, const set &keys, + set *out); + ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const ghobject_t &oid); + + int _create_collection(coll_t c, const SequencerPosition &spos); + int _destroy_collection(coll_t c); + /** + * Give an expected number of objects hint to the collection. + * + * @param c - collection id. 
+ * @param pg_num - pg number of the pool this collection belongs to + * @param expected_num_objs - expected number of objects in this collection + * @param spos - sequence position + * + * @return 0 on success, an error code otherwise + */ + int _collection_hint_expected_num_objs(coll_t c, uint32_t pg_num, + uint64_t expected_num_objs, + const SequencerPosition &spos); + int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid, + const SequencerPosition& spos); + int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid, + coll_t c, const ghobject_t& o, + const SequencerPosition& spos); + + int _set_alloc_hint(coll_t cid, const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size); + + void dump_start(const std::string& file); + void dump_stop(); + void dump_transactions(list& ls, uint64_t seq, OpSequencer *osr); + +private: + void _inject_failure(); + + // omap + int _omap_clear(coll_t cid, const ghobject_t &oid, + const SequencerPosition &spos); + int _omap_setkeys(coll_t cid, const ghobject_t &oid, + const map &aset, + const SequencerPosition &spos); + int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set &keys, + const SequencerPosition &spos); + int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid, + const string& first, const string& last, + const SequencerPosition &spos); + int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl, + const SequencerPosition &spos); + int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest, + const SequencerPosition &spos); + int _split_collection_create(coll_t cid, uint32_t bits, uint32_t rem, + coll_t dest, + const SequencerPosition &spos); + + virtual const char** get_tracked_conf_keys() const; + virtual void handle_conf_change(const struct md_config_t *conf, + const std::set &changed); + float m_filestore_commit_timeout; + bool m_filestore_journal_parallel; + bool m_filestore_journal_trailing; + bool 
m_filestore_journal_writeahead; + int m_filestore_fiemap_threshold; + double m_filestore_max_sync_interval; + double m_filestore_min_sync_interval; + bool m_filestore_fail_eio; + bool m_filestore_fadvise; + int do_update; + bool m_journal_dio, m_journal_aio, m_journal_force_aio; + std::string m_osd_rollback_to_cluster_snap; + bool m_osd_use_stale_snap; + int m_filestore_queue_max_ops; + int m_filestore_queue_max_bytes; + int m_filestore_queue_committing_max_ops; + int m_filestore_queue_committing_max_bytes; + bool m_filestore_do_dump; + std::ofstream m_filestore_dump; + JSONFormatter m_filestore_dump_fmt; + atomic_t m_filestore_kill_at; + bool m_filestore_sloppy_crc; + int m_filestore_sloppy_crc_block_size; + uint64_t m_filestore_max_alloc_hint_size; + long m_fs_type; + + //Determined xattr handling based on fs type + void set_xattr_limits_via_conf(); + uint32_t m_filestore_max_inline_xattr_size; + uint32_t m_filestore_max_inline_xattrs; + + FSSuperblock superblock; + + /** + * write_superblock() + * + * Write superblock to persisent storage + * + * return value: 0 on success, otherwise negative errno + */ + int write_superblock(); + + /** + * read_superblock() + * + * Fill in FileStore::superblock by reading persistent storage + * + * return value: 0 on success, otherwise negative errno + */ + int read_superblock(); + + friend class FileStoreBackend; + friend class TestFileStore; +}; + +ostream& operator<<(ostream& out, const FileStore::OpSequencer& s); + +struct fiemap; + +class FileStoreBackend { +private: + FileStore *filestore; +protected: + int get_basedir_fd() { + return filestore->basedir_fd; + } + int get_current_fd() { + return filestore->current_fd; + } + int get_op_fd() { + return filestore->op_fd; + } + size_t get_blksize() { + return filestore->blk_size; + } + const string& get_basedir_path() { + return filestore->basedir; + } + const string& get_current_path() { + return filestore->current_fn; + } + int _copy_range(int from, int to, uint64_t srcoff, 
uint64_t len, uint64_t dstoff) { + if (has_fiemap() || has_seek_data_hole()) { + return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff); + } else { + return filestore->_do_copy_range(from, to, srcoff, len, dstoff); + } + } + int get_crc_block_size() { + return filestore->m_filestore_sloppy_crc_block_size; + } + +public: + FileStoreBackend(FileStore *fs) : filestore(fs) {} + virtual ~FileStoreBackend() {} + + static FileStoreBackend *create(long f_type, FileStore *fs); + + virtual const char *get_name() = 0; + virtual int detect_features() = 0; + virtual int create_current() = 0; + virtual bool can_checkpoint() = 0; + virtual int list_checkpoints(list& ls) = 0; + virtual int create_checkpoint(const string& name, uint64_t *cid) = 0; + virtual int sync_checkpoint(uint64_t id) = 0; + virtual int rollback_to(const string& name) = 0; + virtual int destroy_checkpoint(const string& name) = 0; + virtual int syncfs() = 0; + virtual bool has_fiemap() = 0; + virtual bool has_seek_data_hole() = 0; + virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0; + virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0; + virtual int set_alloc_hint(int fd, uint64_t hint) = 0; + virtual bool has_splice() const = 0; + + // hooks for (sloppy) crc tracking + virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0; + virtual int _crc_update_truncate(int fd, loff_t off) = 0; + virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0; + virtual int _crc_update_clone_range(int srcfd, int destfd, + loff_t srcoff, size_t len, loff_t dstoff) = 0; + virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, + ostream *out) = 0; +}; + +#endif diff --git a/src/os/filestore/GenericFileStoreBackend.cc b/src/os/filestore/GenericFileStoreBackend.cc new file mode 100644 index 000000000000..d62d6221b3f0 --- /dev/null +++ 
b/src/os/filestore/GenericFileStoreBackend.cc @@ -0,0 +1,431 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/int_types.h" +#include "include/types.h" + +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#endif + +#include "include/compat.h" +#include "include/linux_fiemap.h" + +#include +#include +#include + +#include "GenericFileStoreBackend.h" + +#include "common/errno.h" +#include "common/config.h" +#include "common/sync_filesystem.h" + +#include "common/SloppyCRCMap.h" +#include "os/filestore/chain_xattr.h" + +#define SLOPPY_CRC_XATTR "user.cephos.scrc" + + +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") " + +#define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) +#define ALIGNED(x, by) (!((x) % (by))) +#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? 
(x) : (ALIGN_DOWN((x), (by)) + (by))) + +GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs): + FileStoreBackend(fs), + ioctl_fiemap(false), + seek_data_hole(false), + m_filestore_fiemap(g_conf->filestore_fiemap), + m_filestore_seek_data_hole(g_conf->filestore_seek_data_hole), + m_filestore_fsync_flushes_journal_data(g_conf->filestore_fsync_flushes_journal_data), + m_filestore_splice(false) {} + +int GenericFileStoreBackend::detect_features() +{ + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str()); + + int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644); + if (fd < 0) { + fd = -errno; + derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl; + return fd; + } + + // ext4 has a bug in older kernels where fiemap will return an empty + // result in some cases. this is a file layout that triggers the bug + // on 2.6.34-rc5. + int v[] = { + 0x0000000000016000, 0x0000000000007000, + 0x000000000004a000, 0x0000000000007000, + 0x0000000000060000, 0x0000000000001000, + 0x0000000000061000, 0x0000000000008000, + 0x0000000000069000, 0x0000000000007000, + 0x00000000000a3000, 0x000000000000c000, + 0x000000000024e000, 0x000000000000c000, + 0x000000000028b000, 0x0000000000009000, + 0x00000000002b1000, 0x0000000000003000, + 0, 0 + }; + for (int i=0; v[i]; i++) { + int off = v[i++]; + int len = v[i]; + + // write a large extent + char buf[len]; + memset(buf, 1, sizeof(buf)); + int r = ::lseek(fd, off, SEEK_SET); + if (r < 0) { + r = -errno; + derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + r = write(fd, buf, sizeof(buf)); + if (r < 0) { + derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + } + + // fiemap an extent inside that + if (!m_filestore_fiemap) { + dout(0) << "detect_features: FIEMAP ioctl is disabled 
via 'filestore fiemap' config option" << dendl; + ioctl_fiemap = false; + } else { + struct fiemap *fiemap; + int r = do_fiemap(fd, 2430421, 59284, &fiemap); + if (r < 0) { + dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl; + ioctl_fiemap = false; + } else { + if (fiemap->fm_mapped_extents == 0) { + dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl; + ioctl_fiemap = false; + } else { + dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl; + ioctl_fiemap = true; + } + free(fiemap); + } + } + + // SEEK_DATA/SEEK_HOLE detection + if (!m_filestore_seek_data_hole) { + dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl; + seek_data_hole = false; + } else { +#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) + // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running + // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned. + // Fall back to use fiemap. 
+ off_t hole_pos; + + hole_pos = lseek(fd, 0, SEEK_HOLE); + if (hole_pos < 0) { + if (errno == EINVAL) { + dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl; + seek_data_hole = false; + } else { + derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return -errno; + } + } else { + dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl; + seek_data_hole = true; + } +#endif + } + + //splice detection +#ifdef CEPH_HAVE_SPLICE + if (!m_filestore_splice) { + int pipefd[2]; + loff_t off_in = 0; + int r; + if ((r = pipe(pipefd)) < 0) + dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno) << dendl; + else { + lseek(fd, 0, SEEK_SET); + r = splice(fd, &off_in, pipefd[1], NULL, 10, 0); + if (!(r < 0 && errno == EINVAL)) { + m_filestore_splice = true; + dout(0) << "detect_features: splice is supported" << dendl; + } else + dout(0) << "detect_features: splice is NOT supported" << dendl; + close(pipefd[0]); + close(pipefd[1]); + } + } +#endif + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + + + bool have_syncfs = false; +#ifdef HAVE_SYS_SYNCFS + if (::syncfs(get_basedir_fd()) == 0) { + dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl; + have_syncfs = true; + } else { + dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl; + } +#elif defined(SYS_syncfs) + if (syscall(SYS_syncfs, get_basedir_fd()) == 0) { + dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl; + have_syncfs = true; + } else { + dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; + } +#elif defined(__NR_syncfs) + if (syscall(__NR_syncfs, get_basedir_fd()) == 0) { + dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl; + have_syncfs = true; + } else { + dout(0) << 
"detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; + } +#endif + if (!have_syncfs) { + dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl; + if (m_filestore_fsync_flushes_journal_data) { + dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl; + } else { + dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl; + dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl; + } + } + + return 0; +} + +int GenericFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << "_create_current: current/ exists but is not a directory" << dendl; + ret = -EINVAL; + } + } else { + ret = ::mkdir(get_current_path().c_str(), 0755); + if (ret < 0) { + ret = -errno; + dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl; + } + } + return ret; +} + +int GenericFileStoreBackend::syncfs() +{ + int ret; + if (m_filestore_fsync_flushes_journal_data) { + dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl; + // make the file system's journal commit. + // this works with ext3, but NOT ext4 + ret = ::fsync(get_op_fd()); + if (ret < 0) + ret = -errno; + } else { + dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl; + ret = sync_filesystem(get_current_fd()); + } + return ret; +} + +int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) +{ + struct fiemap *fiemap = NULL; + struct fiemap *_realloc_fiemap = NULL; + int size; + int ret; + + fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1); + if (!fiemap) + return -ENOMEM; + /* + * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096), + * the result is (logical=4096, len=4096). 
It leak the [3990, 4096). + * Commit:"xfs: fix rounding error of fiemap length parameter + * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug. + * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug. + */ + fiemap->fm_start = start - start % CEPH_PAGE_SIZE; + fiemap->fm_length = len + start % CEPH_PAGE_SIZE; + fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */ + +#if defined(DARWIN) || defined(__FreeBSD__) + ret = -ENOTSUP; + goto done_err; +#else + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { + ret = -errno; + goto done_err; + } +#endif + size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); + + _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size); + if (!_realloc_fiemap) { + ret = -ENOMEM; + goto done_err; + } else { + fiemap = _realloc_fiemap; + } + + memset(fiemap->fm_extents, 0, size); + + fiemap->fm_extent_count = fiemap->fm_mapped_extents; + fiemap->fm_mapped_extents = 0; + +#if defined(DARWIN) || defined(__FreeBSD__) + ret = -ENOTSUP; + goto done_err; +#else + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { + ret = -errno; + goto done_err; + } + *pfiemap = fiemap; +#endif + return 0; + +done_err: + *pfiemap = NULL; + free(fiemap); + return ret; +} + + +int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm) +{ + char buf[100]; + bufferptr bp; + int r = 0; + int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf)); + if (l == -ENODATA) { + return 0; + } + if (l >= 0) { + bp = buffer::create(l); + memcpy(bp.c_str(), buf, l); + } else if (l == -ERANGE) { + l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0); + if (l > 0) { + bp = buffer::create(l); + l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l); + } + } + bufferlist bl; + bl.append(bp); + bufferlist::iterator p = bl.begin(); + try { + ::decode(*cm, p); + } + catch (buffer::error &e) { + r = -EIO; + } + if (r < 0) + derr << __func__ << " got " << cpp_strerror(r) << dendl; + return r; +} + +int 
GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm) +{ + bufferlist bl; + ::encode(*cm, bl); + int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length()); + if (r < 0) + derr << __func__ << " got " << cpp_strerror(r) << dendl; + return r; +} + +int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + ostringstream ss; + scm.write(off, len, bl, &ss); + dout(30) << __func__ << "\n" << ss.str() << dendl; + r = _crc_save(fd, &scm); + return r; +} + +int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + scm.truncate(off); + r = _crc_save(fd, &scm); + return r; +} + +int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + scm.zero(off, len); + r = _crc_save(fd, &scm); + return r; +} + +int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd, + loff_t srcoff, size_t len, loff_t dstoff) +{ + SloppyCRCMap scm_src(get_crc_block_size()); + SloppyCRCMap scm_dst(get_crc_block_size()); + int r = _crc_load_or_init(srcfd, &scm_src); + if (r < 0) + return r; + r = _crc_load_or_init(destfd, &scm_dst); + if (r < 0) + return r; + ostringstream ss; + scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss); + dout(30) << __func__ << "\n" << ss.str() << dendl; + r = _crc_save(destfd, &scm_dst); + return r; +} + +int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, + ostream *out) +{ + SloppyCRCMap scm(get_crc_block_size()); + int r = _crc_load_or_init(fd, &scm); + if (r < 0) + return r; + return scm.read(off, len, bl, out); +} diff --git a/src/os/filestore/GenericFileStoreBackend.h 
b/src/os/filestore/GenericFileStoreBackend.h new file mode 100644 index 000000000000..f31e2029a652 --- /dev/null +++ b/src/os/filestore/GenericFileStoreBackend.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_GENERICFILESTOREBACKEDN_H +#define CEPH_GENERICFILESTOREBACKEDN_H + +#include "FileStore.h" + +class SloppyCRCMap; + +class GenericFileStoreBackend : public FileStoreBackend { +private: + bool ioctl_fiemap; + bool seek_data_hole; + bool m_filestore_fiemap; + bool m_filestore_seek_data_hole; + bool m_filestore_fsync_flushes_journal_data; + bool m_filestore_splice; +public: + GenericFileStoreBackend(FileStore *fs); + virtual ~GenericFileStoreBackend() {} + + virtual const char *get_name() { + return "generic"; + } + virtual int detect_features(); + virtual int create_current(); + virtual bool can_checkpoint() { return false; } + virtual int list_checkpoints(list& ls) { return 0; } + virtual int create_checkpoint(const string& name, uint64_t *cid) { return -EOPNOTSUPP; } + virtual int sync_checkpoint(uint64_t id) { return -EOPNOTSUPP; } + virtual int rollback_to(const string& name) { return -EOPNOTSUPP; } + virtual int destroy_checkpoint(const string& name) { return -EOPNOTSUPP; } + virtual int syncfs(); + virtual bool has_fiemap() { return ioctl_fiemap; } + virtual bool has_seek_data_hole() { return seek_data_hole; } + virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap); + virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) { + return _copy_range(from, to, srcoff, len, dstoff); + } + virtual int 
set_alloc_hint(int fd, uint64_t hint) { return -EOPNOTSUPP; } + virtual bool has_splice() const { return m_filestore_splice; } +private: + int _crc_load_or_init(int fd, SloppyCRCMap *cm); + int _crc_save(int fd, SloppyCRCMap *cm); +public: + virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl); + virtual int _crc_update_truncate(int fd, loff_t off); + virtual int _crc_update_zero(int fd, loff_t off, size_t len); + virtual int _crc_update_clone_range(int srcfd, int destfd, + loff_t srcoff, size_t len, loff_t dstoff); + virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, + ostream *out); +}; +#endif diff --git a/src/os/filestore/HashIndex.cc b/src/os/filestore/HashIndex.cc new file mode 100644 index 000000000000..a358ef3a8146 --- /dev/null +++ b/src/os/filestore/HashIndex.cc @@ -0,0 +1,1085 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/types.h" +#include "include/buffer.h" +#include "osd/osd_types.h" +#include + +#include "HashIndex.h" + +#include "common/debug.h" +#define dout_subsys ceph_subsys_filestore + +const string HashIndex::SUBDIR_ATTR = "contents"; +const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op"; + +/// hex digit to integer value +int hex_to_int(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + assert(0); +} + +/// int value to hex digit +char int_to_hex(int v) +{ + assert(v < 16); + if (v < 10) + return '0' + v; + return 'A' + v - 10; +} + +/// reverse bits in a nibble (0..15) +int reverse_nibble_bits(int in) +{ + assert(in < 16); + return + ((in & 8) >> 3) | + ((in & 4) >> 1) | + ((in & 2) << 1) | + ((in & 1) << 3); +} + +/// reverse nibble bits in a hex digit +char reverse_hexdigit_bits(char c) +{ + return int_to_hex(reverse_nibble_bits(hex_to_int(c))); +} + +/// reverse nibble bits in a hex string +string reverse_hexdigit_bits_string(string s) +{ + for (unsigned i=0; i(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) { + // No in progress operations! 
+ return 0; + } + bufferlist::iterator i = bl.begin(); + InProgressOp in_progress(i); + subdir_info_s info; + r = get_info(in_progress.path, &info); + if (r == -ENOENT) { + return end_split_or_merge(in_progress.path); + } else if (r < 0) { + return r; + } + + if (in_progress.is_split()) + return complete_split(in_progress.path, info); + else if (in_progress.is_merge()) + return complete_merge(in_progress.path, info); + else if (in_progress.is_col_split()) { + for (vector::iterator i = in_progress.path.begin(); + i != in_progress.path.end(); + ++i) { + vector path(in_progress.path.begin(), i); + int r = reset_attr(path); + if (r < 0) + return r; + } + return 0; + } + else + return -EINVAL; +} + +int HashIndex::reset_attr( + const vector &path) +{ + int exists = 0; + int r = path_exists(path, &exists); + if (r < 0) + return r; + if (!exists) + return 0; + map objects; + vector subdirs; + r = list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + + subdir_info_s info; + info.hash_level = path.size(); + info.objs = objects.size(); + info.subdirs = subdirs.size(); + return set_info(path, info); +} + +int HashIndex::col_split_level( + HashIndex &from, + HashIndex &to, + const vector &path, + uint32_t inbits, + uint32_t match, + unsigned *mkdirred) +{ + /* For each subdir, move, recurse, or ignore based on comparing the low order + * bits of the hash represented by the subdir path with inbits, match passed + * in. 
+ */ + vector subdirs; + int r = from.list_subdirs(path, &subdirs); + if (r < 0) + return r; + map objects; + r = from.list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + + set to_move; + for (vector::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + uint32_t bits = 0; + uint32_t hash = 0; + vector sub_path(path.begin(), path.end()); + sub_path.push_back(*i); + path_to_hobject_hash_prefix(sub_path, &bits, &hash); + if (bits < inbits) { + if (hobject_t::match_hash(hash, bits, match)) { + r = col_split_level( + from, + to, + sub_path, + inbits, + match, + mkdirred); + if (r < 0) + return r; + if (*mkdirred > path.size()) + *mkdirred = path.size(); + } // else, skip, doesn't need to be moved or recursed into + } else { + if (hobject_t::match_hash(hash, inbits, match)) { + to_move.insert(*i); + } + } // else, skip, doesn't need to be moved or recursed into + } + + /* Then, do the same for each object */ + map objs_to_move; + for (map::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (i->second.match(inbits, match)) { + objs_to_move.insert(*i); + } + } + + if (objs_to_move.empty() && to_move.empty()) + return 0; + + // Make parent directories as needed + while (*mkdirred < path.size()) { + ++*mkdirred; + int exists = 0; + vector creating_path(path.begin(), path.begin()+*mkdirred); + r = to.path_exists(creating_path, &exists); + if (r < 0) + return r; + if (exists) + continue; + subdir_info_s info; + info.objs = 0; + info.subdirs = 0; + info.hash_level = creating_path.size(); + if (*mkdirred < path.size() - 1) + info.subdirs = 1; + r = to.start_col_split(creating_path); + if (r < 0) + return r; + r = to.create_path(creating_path); + if (r < 0) + return r; + r = to.set_info(creating_path, info); + if (r < 0) + return r; + r = to.end_split_or_merge(creating_path); + if (r < 0) + return r; + } + + subdir_info_s from_info; + subdir_info_s to_info; + r = from.get_info(path, &from_info); + if (r < 0) + return r; + r = to.get_info(path, 
&to_info); + if (r < 0) + return r; + + from.start_col_split(path); + to.start_col_split(path); + + // Do subdir moves + for (set::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + from_info.subdirs--; + to_info.subdirs++; + r = move_subdir(from, to, path, *i); + if (r < 0) + return r; + } + + for (map::iterator i = objs_to_move.begin(); + i != objs_to_move.end(); + ++i) { + from_info.objs--; + to_info.objs++; + r = move_object(from, to, path, *i); + if (r < 0) + return r; + } + + + r = to.set_info(path, to_info); + if (r < 0) + return r; + r = from.set_info(path, from_info); + if (r < 0) + return r; + from.end_split_or_merge(path); + to.end_split_or_merge(path); + return 0; +} + +int HashIndex::_split( + uint32_t match, + uint32_t bits, + CollectionIndex* dest) { + assert(collection_version() == dest->collection_version()); + unsigned mkdirred = 0; + return col_split_level( + *this, + *static_cast(dest), + vector(), + bits, + match, + &mkdirred); +} + +int HashIndex::_init() { + subdir_info_s info; + vector path; + return set_info(path, info); +} + +/* LFNIndex virtual method implementations */ +int HashIndex::_created(const vector &path, + const ghobject_t &oid, + const string &mangled_name) { + subdir_info_s info; + int r; + r = get_info(path, &info); + if (r < 0) + return r; + info.objs++; + r = set_info(path, info); + if (r < 0) + return r; + + if (must_split(info)) { + int r = initiate_split(path, info); + if (r < 0) + return r; + return complete_split(path, info); + } else { + return 0; + } +} + +int HashIndex::_remove(const vector &path, + const ghobject_t &oid, + const string &mangled_name) { + int r; + r = remove_object(path, oid); + if (r < 0) + return r; + subdir_info_s info; + r = get_info(path, &info); + if (r < 0) + return r; + info.objs--; + r = set_info(path, info); + if (r < 0) + return r; + if (must_merge(info)) { + r = initiate_merge(path, info); + if (r < 0) + return r; + return complete_merge(path, info); + } else { + return 0; + 
} +} + +int HashIndex::_lookup(const ghobject_t &oid, + vector *path, + string *mangled_name, + int *hardlink) { + vector path_comp; + get_path_components(oid, &path_comp); + vector::iterator next = path_comp.begin(); + int exists; + while (1) { + int r = path_exists(*path, &exists); + if (r < 0) + return r; + if (!exists) { + if (path->empty()) + return -ENOENT; + path->pop_back(); + break; + } + if (next == path_comp.end()) + break; + path->push_back(*(next++)); + } + return get_mangled_name(*path, oid, mangled_name, hardlink); +} + +int HashIndex::_collection_list_partial(const ghobject_t &start, + const ghobject_t &end, + bool sort_bitwise, + int max_count, + vector *ls, + ghobject_t *next) { + vector path; + ghobject_t _next; + if (!next) + next = &_next; + *next = start; + dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl; + return list_by_hash(path, end, sort_bitwise, max_count, next, ls); +} + +int HashIndex::prep_delete() { + return recursive_remove(vector()); +} + +int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) { + int ret; + vector path; + subdir_info_s root_info; + // Make sure there is neither objects nor sub-folders + // in this collection + ret = get_info(path, &root_info); + if (ret < 0) + return ret; + + // Do the folder splitting first + ret = pre_split_folder(pg_num, expected_num_objs); + if (ret < 0) + return ret; + // Initialize the folder info starting from root + return init_split_folder(path, 0); +} + +int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs) +{ + // If folder merging is enabled (by setting the threshold positive), + // no need to split + if (merge_threshold > 0) + return 0; + const coll_t c = coll(); + // Do not split if the expected number of objects in this collection is zero (by default) + if (expected_num_objs == 0) + return 0; + + // Calculate the number of leaf folders (which actually store files) 
+ // need to be created + const uint64_t objs_per_folder = (uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier * 16; + uint64_t leavies = expected_num_objs / objs_per_folder ; + // No need to split + if (leavies == 0 || expected_num_objs == objs_per_folder) + return 0; + + spg_t spgid; + if (!c.is_pg_prefix(&spgid)) + return -EINVAL; + const ps_t ps = spgid.pgid.ps(); + + // the most significant bits of pg_num + const int pg_num_bits = calc_num_bits(pg_num - 1); + ps_t tmp_id = ps; + // calculate the number of levels we only create one sub folder + int num = pg_num_bits / 4; + // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111, + // so that splitting starts at level 3 + if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) { + --num; + } + + int ret; + // Start with creation that only has one subfolder + vector paths; + int dump_num = num; + while (num-- > 0) { + ps_t v = tmp_id & 0x0000000f; + paths.push_back(to_hex(v)); + ret = create_path(paths); + if (ret < 0 && ret != -EEXIST) + return ret; + tmp_id = tmp_id >> 4; + } + + // Starting from here, we can split by creating multiple subfolders + const int left_bits = pg_num_bits - dump_num * 4; + // this variable denotes how many bits (for this level) that can be + // used for sub folder splitting + int split_bits = 4 - left_bits; + // the below logic is inspired by rados.h#ceph_stable_mod, + // it basically determines how many sub-folders should we + // create for splitting + assert(pg_num_bits > 0); // otherwise BAD_SHIFT + if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) { + ++split_bits; + } + const uint32_t subs = (1 << split_bits); + // Calculate how many levels we create starting from here + int level = 0; + leavies /= subs; + while (leavies > 1) { + ++level; + leavies = leavies >> 4; + } + for (uint32_t i = 0; i < subs; ++i) { + assert(split_bits <= 4); // otherwise BAD_SHIFT + int v = tmp_id | (i << ((4 - split_bits) % 4)); + paths.push_back(to_hex(v)); + ret = 
create_path(paths); + if (ret < 0 && ret != -EEXIST) + return ret; + ret = recursive_create_path(paths, level); + if (ret < 0) + return ret; + paths.pop_back(); + } + return 0; +} + +int HashIndex::init_split_folder(vector &path, uint32_t hash_level) +{ + // Get the number of sub directories for the current path + vector subdirs; + int ret = list_subdirs(path, &subdirs); + if (ret < 0) + return ret; + subdir_info_s info; + info.subdirs = subdirs.size(); + info.hash_level = hash_level; + ret = set_info(path, info); + if (ret < 0) + return ret; + ret = fsync_dir(path); + if (ret < 0) + return ret; + + // Do the same for subdirs + vector::const_iterator iter; + for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) { + path.push_back(*iter); + ret = init_split_folder(path, hash_level + 1); + if (ret < 0) + return ret; + path.pop_back(); + } + return 0; +} + +int HashIndex::recursive_create_path(vector& path, int level) +{ + if (level == 0) + return 0; + for (int i = 0; i < 16; ++i) { + path.push_back(to_hex(i)); + int ret = create_path(path); + if (ret < 0 && ret != -EEXIST) + return ret; + ret = recursive_create_path(path, level - 1); + if (ret < 0) + return ret; + path.pop_back(); + } + return 0; +} + +int HashIndex::recursive_remove(const vector &path) { + vector subdirs; + int r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + map objects; + r = list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + if (!objects.empty()) + return -ENOTEMPTY; + vector subdir(path); + for (vector::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + subdir.push_back(*i); + r = recursive_remove(subdir); + if (r < 0) + return r; + subdir.pop_back(); + } + return remove_path(path); +} + +int HashIndex::start_col_split(const vector &path) { + bufferlist bl; + InProgressOp op_tag(InProgressOp::COL_SPLIT, path); + op_tag.encode(bl); + int r = add_attr_path(vector(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector()); +} + +int 
HashIndex::start_split(const vector &path) { + bufferlist bl; + InProgressOp op_tag(InProgressOp::SPLIT, path); + op_tag.encode(bl); + int r = add_attr_path(vector(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector()); +} + +int HashIndex::start_merge(const vector &path) { + bufferlist bl; + InProgressOp op_tag(InProgressOp::MERGE, path); + op_tag.encode(bl); + int r = add_attr_path(vector(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector()); +} + +int HashIndex::end_split_or_merge(const vector &path) { + return remove_attr_path(vector(), IN_PROGRESS_OP_TAG); +} + +int HashIndex::get_info(const vector &path, subdir_info_s *info) { + bufferlist buf; + int r = get_attr_path(path, SUBDIR_ATTR, buf); + if (r < 0) + return r; + bufferlist::iterator bufiter = buf.begin(); + info->decode(bufiter); + assert(path.size() == (unsigned)info->hash_level); + return 0; +} + +int HashIndex::set_info(const vector &path, const subdir_info_s &info) { + bufferlist buf; + assert(path.size() == (unsigned)info.hash_level); + info.encode(buf); + return add_attr_path(path, SUBDIR_ATTR, buf); +} + +bool HashIndex::must_merge(const subdir_info_s &info) { + return (info.hash_level > 0 && + merge_threshold > 0 && + info.objs < (unsigned)merge_threshold && + info.subdirs == 0); +} + +bool HashIndex::must_split(const subdir_info_s &info) { + return (info.hash_level < (unsigned)MAX_HASH_LEVEL && + info.objs > ((unsigned)(abs(merge_threshold)) * 16 * split_multiplier)); + +} + +int HashIndex::initiate_merge(const vector &path, subdir_info_s info) { + return start_merge(path); +} + +int HashIndex::complete_merge(const vector &path, subdir_info_s info) { + vector dst = path; + dst.pop_back(); + subdir_info_s dstinfo; + int r, exists; + r = path_exists(path, &exists); + if (r < 0) + return r; + r = get_info(dst, &dstinfo); + if (r < 0) + return r; + if (exists) { + r = move_objects(path, dst); + if (r < 0) + return r; + r = reset_attr(dst); + 
if (r < 0) + return r; + r = remove_path(path); + if (r < 0) + return r; + } + if (must_merge(dstinfo)) { + r = initiate_merge(dst, dstinfo); + if (r < 0) + return r; + r = fsync_dir(dst); + if (r < 0) + return r; + return complete_merge(dst, dstinfo); + } + r = fsync_dir(dst); + if (r < 0) + return r; + return end_split_or_merge(path); +} + +int HashIndex::initiate_split(const vector &path, subdir_info_s info) { + return start_split(path); +} + +int HashIndex::complete_split(const vector &path, subdir_info_s info) { + int level = info.hash_level; + map objects; + vector dst = path; + int r; + dst.push_back(""); + r = list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + vector subdirs_vec; + r = list_subdirs(path, &subdirs_vec); + if (r < 0) + return r; + set subdirs; + subdirs.insert(subdirs_vec.begin(), subdirs_vec.end()); + map > mapped; + map moved; + int num_moved = 0; + for (map::iterator i = objects.begin(); + i != objects.end(); + ++i) { + vector new_path; + get_path_components(i->second, &new_path); + mapped[new_path[level]][i->first] = i->second; + } + for (map >::iterator i = mapped.begin(); + i != mapped.end(); + ) { + dst[level] = i->first; + /* If the info already exists, it must be correct, + * we may be picking up a partially finished split */ + subdir_info_s temp; + // subdir has already been fully copied + if (subdirs.count(i->first) && !get_info(dst, &temp)) { + for (map::iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + moved[j->first] = j->second; + num_moved++; + objects.erase(j->first); + } + ++i; + continue; + } + + subdir_info_s info_new; + info_new.objs = i->second.size(); + info_new.subdirs = 0; + info_new.hash_level = level + 1; + if (must_merge(info_new) && !subdirs.count(i->first)) { + mapped.erase(i++); + continue; + } + + // Subdir doesn't yet exist + if (!subdirs.count(i->first)) { + info.subdirs += 1; + r = create_path(dst); + if (r < 0) + return r; + } // else subdir has been created but only partially 
copied + + for (map::iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + moved[j->first] = j->second; + num_moved++; + objects.erase(j->first); + r = link_object(path, dst, j->second, j->first); + // May be a partially finished split + if (r < 0 && r != -EEXIST) { + return r; + } + } + + r = fsync_dir(dst); + if (r < 0) + return r; + + // Presence of info must imply that all objects have been copied + r = set_info(dst, info_new); + if (r < 0) + return r; + + r = fsync_dir(dst); + if (r < 0) + return r; + + ++i; + } + r = remove_objects(path, moved, &objects); + if (r < 0) + return r; + info.objs = objects.size(); + r = reset_attr(path); + if (r < 0) + return r; + r = fsync_dir(path); + if (r < 0) + return r; + return end_split_or_merge(path); +} + +void HashIndex::get_path_components(const ghobject_t &oid, + vector *path) { + char buf[MAX_HASH_LEVEL + 1]; + snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key()); + + // Path components are the hex characters of oid.hobj.hash, least + // significant first + for (int i = 0; i < MAX_HASH_LEVEL; ++i) { + path->push_back(string(&buf[i], 1)); + } +} + +string HashIndex::get_hash_str(uint32_t hash) { + char buf[MAX_HASH_LEVEL + 1]; + snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash); + string retval; + for (int i = 0; i < MAX_HASH_LEVEL; ++i) { + retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]); + } + return retval; +} + +string HashIndex::get_path_str(const ghobject_t &oid) { + assert(!oid.is_max()); + return get_hash_str(oid.hobj.get_hash()); +} + +uint32_t HashIndex::hash_prefix_to_hash(string prefix) { + while (prefix.size() < sizeof(uint32_t) * 2) { + prefix.push_back('0'); + } + uint32_t hash; + sscanf(prefix.c_str(), "%x", &hash); + // nibble reverse + hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4); + hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8); + hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16); + return hash; 
+} + +int HashIndex::get_path_contents_by_hash_bitwise( + const vector &path, + const ghobject_t *next_object, + set *hash_prefixes, + set, CmpPairBitwise> *objects) +{ + map rev_objects; + int r; + r = list_objects(path, 0, 0, &rev_objects); + if (r < 0) + return r; + // bitwise sort + for (map::iterator i = rev_objects.begin(); + i != rev_objects.end(); + ++i) { + if (next_object && cmp_bitwise(i->second, *next_object) < 0) + continue; + string hash_prefix = get_path_str(i->second); + hash_prefixes->insert(hash_prefix); + objects->insert(pair(hash_prefix, i->second)); + } + vector subdirs; + r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + + // sort subdirs bitwise (by reversing hex digit nibbles) + std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise); + + // Local to this function, we will convert the prefix strings + // (previously simply the reversed hex digits) to also have each + // digit's nibbles reversed. This will make the strings sort + // bitwise. + string cur_prefix; + for (vector::const_iterator i = path.begin(); + i != path.end(); + ++i) { + cur_prefix.append(reverse_hexdigit_bits_string(*i)); + } + string next_object_string; + if (next_object) + next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object)); + for (vector::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + string candidate = cur_prefix + reverse_hexdigit_bits_string(*i); + if (next_object) { + if (next_object->is_max()) + continue; + if (candidate < next_object_string.substr(0, candidate.size())) + continue; + } + // re-reverse the hex digit nibbles for the caller + hash_prefixes->insert(reverse_hexdigit_bits_string(candidate)); + } + return 0; +} + +int HashIndex::get_path_contents_by_hash_nibblewise( + const vector &path, + const ghobject_t *next_object, + set *hash_prefixes, + set, CmpPairNibblewise > *objects) +{ + map rev_objects; + int r; + r = list_objects(path, 0, 0, &rev_objects); + if (r < 0) + return r; + + for 
(map::iterator i = rev_objects.begin(); + i != rev_objects.end(); + ++i) { + string hash_prefix = get_path_str(i->second); + if (next_object && cmp_nibblewise(i->second, *next_object) < 0) + continue; + hash_prefixes->insert(hash_prefix); + objects->insert(pair(hash_prefix, i->second)); + } + + vector subdirs; + r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + + // sort nibblewise (string sort of (reversed) hex digits) + std::sort(subdirs.begin(), subdirs.end()); + + string cur_prefix; + for (vector::const_iterator i = path.begin(); + i != path.end(); + ++i) { + cur_prefix.append(*i); + } + string next_object_string; + if (next_object) + next_object_string = get_path_str(*next_object); + + for (vector::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + string candidate = cur_prefix + *i; + if (next_object) { + if (next_object->is_max()) + continue; + if (candidate < next_object_string.substr(0, candidate.size())) + continue; + } + hash_prefixes->insert(cur_prefix + *i); + } + return 0; +} + +int HashIndex::list_by_hash(const vector &path, + const ghobject_t &end, + bool sort_bitwise, + int max_count, + ghobject_t *next, + vector *out) +{ + assert(out); + if (sort_bitwise) + return list_by_hash_bitwise(path, end, max_count, next, out); + else + return list_by_hash_nibblewise(path, end, max_count, next, out); +} + +int HashIndex::list_by_hash_bitwise( + const vector &path, + const ghobject_t& end, + int max_count, + ghobject_t *next, + vector *out) +{ + vector next_path = path; + next_path.push_back(""); + set hash_prefixes; + set, CmpPairBitwise> objects; + int r = get_path_contents_by_hash_bitwise(path, + next, + &hash_prefixes, + &objects); + if (r < 0) + return r; + for (set::iterator i = hash_prefixes.begin(); + i != hash_prefixes.end(); + ++i) { + dout(20) << __func__ << " prefix " << *i << dendl; + set, CmpPairBitwise>::iterator j = objects.lower_bound( + make_pair(*i, ghobject_t())); + if (j == objects.end() || j->first != *i) { + 
*(next_path.rbegin()) = *(i->rbegin()); + ghobject_t next_recurse; + if (next) + next_recurse = *next; + r = list_by_hash_bitwise(next_path, + end, + max_count, + &next_recurse, + out); + + if (r < 0) + return r; + if (!next_recurse.is_max()) { + if (next) + *next = next_recurse; + return 0; + } + } else { + while (j != objects.end() && j->first == *i) { + if (max_count > 0 && out->size() == (unsigned)max_count) { + if (next) + *next = j->second; + return 0; + } + if (cmp_bitwise(j->second, end) >= 0) { + if (next) + *next = ghobject_t::get_max(); + return 0; + } + if (!next || cmp_bitwise(j->second, *next) >= 0) { + dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl; + out->push_back(j->second); + } + ++j; + } + } + } + if (next) + *next = ghobject_t::get_max(); + return 0; +} + +int HashIndex::list_by_hash_nibblewise( + const vector &path, + const ghobject_t& end, + int max_count, + ghobject_t *next, + vector *out) +{ + vector next_path = path; + next_path.push_back(""); + set hash_prefixes; + set, CmpPairNibblewise> objects; + int r = get_path_contents_by_hash_nibblewise(path, + next, + &hash_prefixes, + &objects); + if (r < 0) + return r; + for (set::iterator i = hash_prefixes.begin(); + i != hash_prefixes.end(); + ++i) { + dout(20) << __func__ << " prefix " << *i << dendl; + set, CmpPairNibblewise >::iterator j = + objects.lower_bound(make_pair(*i, ghobject_t())); + if (j == objects.end() || j->first != *i) { + *(next_path.rbegin()) = *(i->rbegin()); + ghobject_t next_recurse; + if (next) + next_recurse = *next; + r = list_by_hash_nibblewise(next_path, + end, + max_count, + &next_recurse, + out); + + if (r < 0) + return r; + if (!next_recurse.is_max()) { + if (next) + *next = next_recurse; + return 0; + } + } else { + while (j != objects.end() && j->first == *i) { + if (max_count > 0 && out->size() == (unsigned)max_count) { + if (next) + *next = j->second; + return 0; + } + if (cmp_nibblewise(j->second, end) >= 0) { + if (next) + *next = 
ghobject_t::get_max(); + return 0; + } + if (!next || cmp_nibblewise(j->second, *next) >= 0) { + out->push_back(j->second); + } + ++j; + } + } + } + if (next) + *next = ghobject_t::get_max(); + return 0; +} diff --git a/src/os/filestore/HashIndex.h b/src/os/filestore/HashIndex.h new file mode 100644 index 000000000000..2ed21860e624 --- /dev/null +++ b/src/os/filestore/HashIndex.h @@ -0,0 +1,432 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_HASHINDEX_H +#define CEPH_HASHINDEX_H + +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include "LFNIndex.h" + +extern string reverse_hexdigit_bits_string(string l); + +/** + * Implements collection prehashing. + * + * @verbatim + * (root) - 0 - 0 + * - 1 + * - E + * - 1 + * - 2 - D - 0 + * . + * . + * . + * - F - 0 + * @endverbatim + * + * A file is located at the longest existing directory from the root + * given by the hex characters in the hash beginning with the least + * significant. + * + * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2) + * would be located in (root)/2/D/0/ + * + * Subdirectories are created when the number of objects in a directory + * exceed (abs(merge_threshhold)) * 16 * split_multiplier. The number of objects in a directory + * is encoded as subdir_info_s in an xattr on the directory. 
+ */ +class HashIndex : public LFNIndex { +private: + /// Attribute name for storing subdir info @see subdir_info_s + static const string SUBDIR_ATTR; + /// Attribute name for storing in progress op tag + static const string IN_PROGRESS_OP_TAG; + /// Size (bits) in object hash + static const int PATH_HASH_LEN = 32; + /// Max length of hashed path + static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4); + + /** + * Merges occur when the number of object drops below + * merge_threshold and splits occur when the number of objects + * exceeds 16 * abs(merge_threshold) * split_multiplier. + * Please note if merge_threshold is less than zero, it will never do merging + */ + int merge_threshold; + int split_multiplier; + + /// Encodes current subdir state for determining when to split/merge. + struct subdir_info_s { + uint64_t objs; ///< Objects in subdir. + uint32_t subdirs; ///< Subdirs in subdir. + uint32_t hash_level; ///< Hashlevel of subdir. + + subdir_info_s() : objs(0), subdirs(0), hash_level(0) {} + + void encode(bufferlist &bl) const + { + __u8 v = 1; + ::encode(v, bl); + ::encode(objs, bl); + ::encode(subdirs, bl); + ::encode(hash_level, bl); + } + + void decode(bufferlist::iterator &bl) + { + __u8 v; + ::decode(v, bl); + assert(v == 1); + ::decode(objs, bl); + ::decode(subdirs, bl); + ::decode(hash_level, bl); + } + }; + + /// Encodes in progress split or merge + struct InProgressOp { + static const int SPLIT = 0; + static const int MERGE = 1; + static const int COL_SPLIT = 2; + int op; + vector path; + + InProgressOp(int op, const vector &path) + : op(op), path(path) {} + + InProgressOp(bufferlist::iterator &bl) { + decode(bl); + } + + bool is_split() const { return op == SPLIT; } + bool is_col_split() const { return op == COL_SPLIT; } + bool is_merge() const { return op == MERGE; } + + void encode(bufferlist &bl) const { + __u8 v = 1; + ::encode(v, bl); + ::encode(op, bl); + ::encode(path, bl); + } + + void decode(bufferlist::iterator &bl) { + __u8 v; + 
::decode(v, bl); + assert(v == 1); + ::decode(op, bl); + ::decode(path, bl); + } + }; + + +public: + /// Constructor. + HashIndex( + coll_t collection, ///< [in] Collection + const char *base_path, ///< [in] Path to the index root. + int merge_at, ///< [in] Merge threshhold. + int split_multiple, ///< [in] Split threshhold. + uint32_t index_version,///< [in] Index version + double retry_probability=0) ///< [in] retry probability + : LFNIndex(collection, base_path, index_version, retry_probability), + merge_threshold(merge_at), + split_multiplier(split_multiple) {} + + /// @see CollectionIndex + uint32_t collection_version() { return index_version; } + + /// @see CollectionIndex + int cleanup(); + + /// @see CollectionIndex + int prep_delete(); + + /// @see CollectionIndex + int _split( + uint32_t match, + uint32_t bits, + CollectionIndex* dest + ); + +protected: + int _init(); + + int _created( + const vector &path, + const ghobject_t &oid, + const string &mangled_name + ); + int _remove( + const vector &path, + const ghobject_t &oid, + const string &mangled_name + ); + int _lookup( + const ghobject_t &oid, + vector *path, + string *mangled_name, + int *hardlink + ); + + /** + * Pre-hash the collection to create folders according to the expected number + * of objects in this collection. 
+ */ + int _pre_hash_collection( + uint32_t pg_num, + uint64_t expected_num_objs + ); + + int _collection_list_partial( + const ghobject_t &start, + const ghobject_t &end, + bool sort_bitwise, + int max_count, + vector *ls, + ghobject_t *next + ); +private: + /// Recursively remove path and its subdirs + int recursive_remove( + const vector &path ///< [in] path to remove + ); /// @return Error Code, 0 on success + /// Tag root directory at beginning of col_split + int start_col_split( + const vector &path ///< [in] path to split + ); ///< @return Error Code, 0 on success + /// Tag root directory at beginning of split + int start_split( + const vector &path ///< [in] path to split + ); ///< @return Error Code, 0 on success + /// Tag root directory at beginning of split + int start_merge( + const vector &path ///< [in] path to merge + ); ///< @return Error Code, 0 on success + /// Remove tag at end of split or merge + int end_split_or_merge( + const vector &path ///< [in] path to split or merged + ); ///< @return Error Code, 0 on success + /// Gets info from the xattr on the subdir represented by path + int get_info( + const vector &path, ///< [in] Path from which to read attribute. + subdir_info_s *info ///< [out] Attribute value + ); /// @return Error Code, 0 on success + + /// Sets info to the xattr on the subdir represented by path + int set_info( + const vector &path, ///< [in] Path on which to set attribute. + const subdir_info_s &info ///< [in] Value to set + ); /// @return Error Code, 0 on success + + /// Encapsulates logic for when to split. + bool must_merge( + const subdir_info_s &info ///< [in] Info to check + ); /// @return True if info must be merged, False otherwise + + /// Encapsulates logic for when to merge. 
+ bool must_split( + const subdir_info_s &info ///< [in] Info to check + ); /// @return True if info must be split, False otherwise + + /// Initiates merge + int initiate_merge( + const vector &path, ///< [in] Subdir to merge + subdir_info_s info ///< [in] Info attached to path + ); /// @return Error Code, 0 on success + + /// Completes merge + int complete_merge( + const vector &path, ///< [in] Subdir to merge + subdir_info_s info ///< [in] Info attached to path + ); /// @return Error Code, 0 on success + + /// Resets attr to match actual subdir contents + int reset_attr( + const vector &path ///< [in] path to cleanup + ); + + /// Initiate Split + int initiate_split( + const vector &path, ///< [in] Subdir to split + subdir_info_s info ///< [in] Info attached to path + ); /// @return Error Code, 0 on success + + /// Completes Split + int complete_split( + const vector &path, ///< [in] Subdir to split + subdir_info_s info ///< [in] Info attached to path + ); /// @return Error Code, 0 on success + + /// Determine path components from hoid hash + void get_path_components( + const ghobject_t &oid, ///< [in] Object for which to get path components + vector *path ///< [out] Path components for hoid. + ); + + /// Pre-hash and split folders to avoid runtime splitting + /// according to the given expected object number. + int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs); + + /// Initialize the folder (dir info) with the given hash + /// level and number of its subdirs. 
+ int init_split_folder(vector &path, uint32_t hash_level); + + /// do collection split for path + static int col_split_level( + HashIndex &from, ///< [in] from index + HashIndex &dest, ///< [in] to index + const vector &path, ///< [in] path to split + uint32_t bits, ///< [in] num bits to match + uint32_t match, ///< [in] bits to match + unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred + ); + + + /** + * Get string representation of ghobject_t/hash + * + * e.g: 0x01234567 -> "76543210" + */ + static string get_path_str( + const ghobject_t &oid ///< [in] Object to get hash string for + ); ///< @return Hash string for hoid. + + /// Get string from hash, @see get_path_str + static string get_hash_str( + uint32_t hash ///< [in] Hash to convert to a string. + ); ///< @return String representation of hash + + /// Get hash from hash prefix string e.g. "FFFFAB" -> 0xFFFFAB00 + static uint32_t hash_prefix_to_hash( + string prefix ///< [in] string to convert + ); ///< @return Hash + + /// Get hash mod from path + static void path_to_hobject_hash_prefix( + const vector &path,///< [in] path to convert + uint32_t *bits, ///< [out] bits + uint32_t *hash ///< [out] hash + ) { + string hash_str; + for (vector::const_iterator i = path.begin(); + i != path.end(); + ++i) { + hash_str.push_back(*i->begin()); + } + uint32_t rev_hash = hash_prefix_to_hash(hash_str); + if (hash) + *hash = rev_hash; + if (bits) + *bits = path.size() * 4; + } + + /// Calculate the number of bits. + static int calc_num_bits(uint64_t n) { + int ret = 0; + while (n > 0) { + n = n >> 1; + ret++; + } + return ret; + } + + /// Convert a number to hex string (upper case). + static string to_hex(int n) { + assert(n >= 0 && n < 16); + char c = (n <= 9 ? 
('0' + n) : ('A' + n - 10)); + string str; + str.append(1, c); + return str; + } + + struct CmpPairNibblewise { + bool operator()(const pair& l, + const pair& r) + { + if (l.first < r.first) + return true; + if (l.first > r.first) + return false; + if (cmp_nibblewise(l.second, r.second) < 0) + return true; + return false; + } + }; + + struct CmpPairBitwise { + bool operator()(const pair& l, + const pair& r) + { + if (l.first < r.first) + return true; + if (l.first > r.first) + return false; + if (cmp_bitwise(l.second, r.second) < 0) + return true; + return false; + } + }; + + struct CmpHexdigitStringBitwise { + bool operator()(const string& l, const string& r) { + return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r); + } + }; + + /// Get path contents by hash + int get_path_contents_by_hash_bitwise( + const vector &path, /// [in] Path to list + const ghobject_t *next_object, /// [in] list > *next_object + set *hash_prefixes, /// [out] prefixes in dir + set, CmpPairBitwise> *objects /// [out] objects + ); + int get_path_contents_by_hash_nibblewise( + const vector &path, /// [in] Path to list + const ghobject_t *next_object, /// [in] list > *next_object + set *hash_prefixes, /// [out] prefixes in dir + set, CmpPairNibblewise> *objects /// [out] objects + ); + + /// List objects in collection in ghobject_t order + int list_by_hash( + const vector &path, /// [in] Path to list + const ghobject_t &end, /// [in] List only objects < end + bool sort_bitwise, /// [in] sort bitwise + int max_count, /// [in] List at most max_count + ghobject_t *next, /// [in,out] List objects >= *next + vector *out /// [out] Listed objects + ); ///< @return Error Code, 0 on success + /// List objects in collection in ghobject_t order + int list_by_hash_bitwise( + const vector &path, /// [in] Path to list + const ghobject_t &end, /// [in] List only objects < end + int max_count, /// [in] List at most max_count + ghobject_t *next, /// [in,out] List objects >= *next + vector 
*out /// [out] Listed objects + ); ///< @return Error Code, 0 on success + int list_by_hash_nibblewise( + const vector &path, /// [in] Path to list + const ghobject_t &end, /// [in] List only objects < end + int max_count, /// [in] List at most max_count + ghobject_t *next, /// [in,out] List objects >= *next + vector *out /// [out] Listed objects + ); ///< @return Error Code, 0 on success + + /// Create the given levels of sub directories from the given root. + /// The contents of *path* is not changed after calling this function. + int recursive_create_path(vector& path, int level); +}; + +#endif diff --git a/src/os/filestore/IndexManager.cc b/src/os/filestore/IndexManager.cc new file mode 100644 index 000000000000..3a3e5c99a7ba --- /dev/null +++ b/src/os/filestore/IndexManager.cc @@ -0,0 +1,136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/memory.h" +#include "include/unordered_map.h" + +#if defined(__FreeBSD__) +#include +#endif + +#include + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/config.h" +#include "common/debug.h" +#include "include/buffer.h" + +#include "IndexManager.h" +#include "HashIndex.h" +#include "CollectionIndex.h" + +#include "chain_xattr.h" + +static int set_version(const char *path, uint32_t version) { + bufferlist bl; + ::encode(version, bl); + return chain_setxattr(path, "user.cephos.collection_version", bl.c_str(), + bl.length(), true); +} + +static int get_version(const char *path, uint32_t *version) { + bufferptr bp(PATH_MAX); + int r = chain_getxattr(path, "user.cephos.collection_version", + bp.c_str(), bp.length()); + if (r < 0) { + if (r != -ENOENT) { + *version = 0; + return 0; + } else { + return r; + } + } + bp.set_length(r); + bufferlist bl; + bl.push_back(bp); + bufferlist::iterator i = bl.begin(); + ::decode(*version, i); + return 0; +} + +IndexManager::~IndexManager() { + + for (ceph::unordered_map ::iterator it = col_indices.begin(); + it != col_indices.end(); ++it) { + + delete it->second; + it->second = NULL; + } + col_indices.clear(); +} + + +int IndexManager::init_index(coll_t c, const char *path, uint32_t version) { + Mutex::Locker l(lock); + int r = set_version(path, version); + if (r < 0) + return r; + HashIndex index(c, path, g_conf->filestore_merge_threshold, + g_conf->filestore_split_multiple, + version, + g_conf->filestore_index_retry_probability); + return index.init(); +} + +int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) { + if (upgrade) { + // Need to check the collection generation + int r; + uint32_t version = 0; + r = get_version(path, &version); + if (r < 0) + return r; + + switch (version) { + case CollectionIndex::FLAT_INDEX_TAG: + case CollectionIndex::HASH_INDEX_TAG: // fall through + case CollectionIndex::HASH_INDEX_TAG_2: // fall through + case 
CollectionIndex::HOBJECT_WITH_POOL: { + // Must be a HashIndex + *index = new HashIndex(c, path, g_conf->filestore_merge_threshold, + g_conf->filestore_split_multiple, version); + return 0; + } + default: assert(0); + } + + } else { + // No need to check + *index = new HashIndex(c, path, g_conf->filestore_merge_threshold, + g_conf->filestore_split_multiple, + CollectionIndex::HOBJECT_WITH_POOL, + g_conf->filestore_index_retry_probability); + return 0; + } +} + +int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) { + + Mutex::Locker l(lock); + ceph::unordered_map ::iterator it = col_indices.find(c); + if (it == col_indices.end()) { + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current/%s", baseDir.c_str(), c.to_str().c_str()); + CollectionIndex* colIndex = NULL; + int r = build_index(c, path, &colIndex); + if (r < 0) + return r; + col_indices[c] = colIndex; + index->index = colIndex; + } else { + index->index = it->second; + } + return 0; +} diff --git a/src/os/filestore/IndexManager.h b/src/os/filestore/IndexManager.h new file mode 100644 index 000000000000..da71807dad1e --- /dev/null +++ b/src/os/filestore/IndexManager.h @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#ifndef OS_INDEXMANAGER_H +#define OS_INDEXMANAGER_H + +#include "include/memory.h" +#include "include/unordered_map.h" + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/config.h" +#include "common/debug.h" + +#include "CollectionIndex.h" +#include "HashIndex.h" + + +/// Public type for Index +struct Index { + CollectionIndex *index; + + Index() : index(NULL) {} + Index(CollectionIndex* index) : index(index) {} + + CollectionIndex *operator->() { return index; } + CollectionIndex &operator*() { return *index; } +}; + + +/** + * Encapsulates mutual exclusion for CollectionIndexes. + * + * Allowing a modification (removal or addition of an object) to occur + * while a read is occuring (lookup of an object's path and use of + * that path) may result in the path becoming invalid. Thus, during + * the lifetime of a CollectionIndex object and any paths returned + * by it, no other concurrent accesses may be allowed. + * This is enforced by using CollectionIndex::access_lock + */ +class IndexManager { + Mutex lock; ///< Lock for Index Manager + bool upgrade; + ceph::unordered_map col_indices; + + /** + * Index factory + * + * Encapsulates logic for handling legacy FileStore + * layouts + * + * @param [in] c Collection for which to get index + * @param [in] path Path to collection + * @param [out] index Index for c + * @return error code + */ + int build_index(coll_t c, const char *path, CollectionIndex **index); +public: + /// Constructor + IndexManager(bool upgrade) : lock("IndexManager lock"), + upgrade(upgrade) {} + + ~IndexManager(); + + /** + * Reserve and return index for c + * + * @param [in] c Collection for which to get index + * @param [in] baseDir base directory of collections + * @param [out] index Index for c + * @return error code + */ + int get_index(coll_t c, const string& baseDir, Index *index); + + /** + * Initialize index for collection c at path + * + * @param [in] c Collection for which to init Index + * @param [in] 
path Path to collection + * @param [in] filestore_version version of containing FileStore + * @return error code + */ + int init_index(coll_t c, const char *path, uint32_t filestore_version); +}; + +#endif diff --git a/src/os/filestore/Journal.h b/src/os/filestore/Journal.h new file mode 100644 index 000000000000..602e8ea7c36b --- /dev/null +++ b/src/os/filestore/Journal.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_JOURNAL_H +#define CEPH_JOURNAL_H + +#include + +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "common/Finisher.h" +#include "common/TrackedOp.h" +#include "os/ObjectStore.h" + +class PerfCounters; + +class Journal { +protected: + uuid_d fsid; + Finisher *finisher; +public: + PerfCounters *logger; +protected: + Cond *do_sync_cond; + bool wait_on_full; + +public: + Journal(uuid_d f, Finisher *fin, Cond *c=0) : + fsid(f), finisher(fin), logger(NULL), + do_sync_cond(c), + wait_on_full(false) { } + virtual ~Journal() { } + + virtual int check() = 0; ///< check if journal appears valid + virtual int create() = 0; ///< create a fresh journal + virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal + virtual void close() = 0; ///< close an open journal + + virtual void flush() = 0; + virtual void throttle() = 0; + + virtual int dump(ostream& out) { return -EOPNOTSUPP; } + + void set_wait_on_full(bool b) { wait_on_full = b; } + + // writes + virtual bool is_writeable() = 0; + virtual int make_writeable() = 0; + virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len, + Context *oncommit, + TrackedOpRef 
osd_op = TrackedOpRef()) = 0; + virtual void commit_start(uint64_t seq) = 0; + virtual void committed_thru(uint64_t seq) = 0; + + /// Read next journal entry - asserts on invalid journal + virtual bool read_entry( + bufferlist &bl, ///< [out] payload on successful read + uint64_t &seq ///< [in,out] sequence number on last successful read + ) = 0; ///< @return true on successful read, false on journal end + + virtual bool should_commit_now() = 0; + + virtual int prepare_entry(list& tls, bufferlist* tbl) = 0; + + // reads/recovery + +}; + +#endif diff --git a/src/os/filestore/JournalingObjectStore.cc b/src/os/filestore/JournalingObjectStore.cc new file mode 100644 index 000000000000..e47b2e6e8b33 --- /dev/null +++ b/src/os/filestore/JournalingObjectStore.cc @@ -0,0 +1,268 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include "JournalingObjectStore.h" + +#include "common/errno.h" +#include "common/debug.h" + +#define dout_subsys ceph_subsys_journal +#undef dout_prefix +#define dout_prefix *_dout << "journal " + + + +void JournalingObjectStore::journal_start() +{ + dout(10) << "journal_start" << dendl; + finisher.start(); +} + +void JournalingObjectStore::journal_stop() +{ + dout(10) << "journal_stop" << dendl; + finisher.stop(); +} + +// A journal_replay() makes journal writeable, this closes that out. 
+void JournalingObjectStore::journal_write_close() +{ + if (journal) { + journal->close(); + delete journal; + journal = 0; + } + apply_manager.reset(); +} + +int JournalingObjectStore::journal_replay(uint64_t fs_op_seq) +{ + dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl; + + if (g_conf->journal_replay_from) { + dout(0) << "journal_replay forcing replay from " << g_conf->journal_replay_from + << " instead of " << fs_op_seq << dendl; + // the previous op is the last one committed + fs_op_seq = g_conf->journal_replay_from - 1; + } + + uint64_t op_seq = fs_op_seq; + apply_manager.init_seq(fs_op_seq); + + if (!journal) { + submit_manager.set_op_seq(op_seq); + return 0; + } + + int err = journal->open(op_seq); + if (err < 0) { + dout(3) << "journal_replay open failed with " + << cpp_strerror(err) << dendl; + delete journal; + journal = 0; + return err; + } + + replaying = true; + + int count = 0; + while (1) { + bufferlist bl; + uint64_t seq = op_seq + 1; + if (!journal->read_entry(bl, seq)) { + dout(3) << "journal_replay: end of journal, done." << dendl; + break; + } + + if (seq <= op_seq) { + dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << op_seq << dendl; + continue; + } + assert(op_seq == seq-1); + + dout(3) << "journal_replay: applying op seq " << seq << dendl; + bufferlist::iterator p = bl.begin(); + list tls; + while (!p.end()) { + Transaction *t = new Transaction(p); + tls.push_back(t); + } + + apply_manager.op_apply_start(seq); + int r = do_transactions(tls, seq); + apply_manager.op_apply_finish(seq); + + op_seq = seq; + + while (!tls.empty()) { + delete tls.front(); + tls.pop_front(); + } + + dout(3) << "journal_replay: r = " << r << ", op_seq now " << op_seq << dendl; + } + + replaying = false; + + submit_manager.set_op_seq(op_seq); + + // done reading, make writeable. 
+ err = journal->make_writeable(); + if (err < 0) + return err; + + return count; +} + + +// ------------------------------------ + +uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op) +{ + Mutex::Locker l(apply_lock); + while (blocked) { + // note: this only happens during journal replay + dout(10) << "op_apply_start blocked, waiting" << dendl; + blocked_cond.Wait(apply_lock); + } + dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> " << (open_ops+1) << dendl; + assert(!blocked); + assert(op > committed_seq); + open_ops++; + return op; +} + +void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op) +{ + Mutex::Locker l(apply_lock); + dout(10) << "op_apply_finish " << op << " open_ops " << open_ops + << " -> " << (open_ops-1) + << ", max_applied_seq " << max_applied_seq << " -> " << MAX(op, max_applied_seq) + << dendl; + --open_ops; + assert(open_ops >= 0); + + // signal a blocked commit_start (only needed during journal replay) + if (blocked) { + blocked_cond.Signal(); + } + + // there can be multiple applies in flight; track the max value we + // note. note that we can't _read_ this value and learn anything + // meaningful unless/until we've quiesced all in-flight applies. 
+ if (op > max_applied_seq) + max_applied_seq = op; +} + +uint64_t JournalingObjectStore::SubmitManager::op_submit_start() +{ + lock.Lock(); + uint64_t op = ++op_seq; + dout(10) << "op_submit_start " << op << dendl; + return op; +} + +void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op) +{ + dout(10) << "op_submit_finish " << op << dendl; + if (op != op_submitted + 1) { + dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1) + << ", OUT OF ORDER" << dendl; + assert(0 == "out of order op_submit_finish"); + } + op_submitted = op; + lock.Unlock(); +} + + +// ------------------------------------------ + +void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c) +{ + Mutex::Locker l(com_lock); + assert(c); + commit_waiters[op].push_back(c); +} + +bool JournalingObjectStore::ApplyManager::commit_start() +{ + bool ret = false; + + uint64_t _committing_seq = 0; + { + Mutex::Locker l(apply_lock); + dout(10) << "commit_start max_applied_seq " << max_applied_seq + << ", open_ops " << open_ops + << dendl; + blocked = true; + while (open_ops > 0) { + dout(10) << "commit_start waiting for " << open_ops << " open ops to drain" << dendl; + blocked_cond.Wait(apply_lock); + } + assert(open_ops == 0); + dout(10) << "commit_start blocked, all open_ops have completed" << dendl; + { + Mutex::Locker l(com_lock); + if (max_applied_seq == committed_seq) { + dout(10) << "commit_start nothing to do" << dendl; + blocked = false; + assert(commit_waiters.empty()); + goto out; + } + + _committing_seq = committing_seq = max_applied_seq; + + dout(10) << "commit_start committing " << committing_seq + << ", still blocked" << dendl; + } + } + ret = true; + + out: + if (journal) + journal->commit_start(_committing_seq); // tell the journal too + return ret; +} + +void JournalingObjectStore::ApplyManager::commit_started() +{ + Mutex::Locker l(apply_lock); + // allow new ops. 
(underlying fs should now be committing all prior ops) + dout(10) << "commit_started committing " << committing_seq << ", unblocking" << dendl; + blocked = false; + blocked_cond.Signal(); +} + +void JournalingObjectStore::ApplyManager::commit_finish() +{ + Mutex::Locker l(com_lock); + dout(10) << "commit_finish thru " << committing_seq << dendl; + + if (journal) + journal->committed_thru(committing_seq); + + committed_seq = committing_seq; + + map >::iterator p = commit_waiters.begin(); + while (p != commit_waiters.end() && + p->first <= committing_seq) { + finisher.queue(p->second); + commit_waiters.erase(p++); + } +} + +void JournalingObjectStore::_op_journal_transactions( + bufferlist& tbl, uint32_t orig_len, uint64_t op, + Context *onjournal, TrackedOpRef osd_op) +{ + if (osd_op.get()) + dout(10) << "op_journal_transactions " << op << " reqid_t " + << (static_cast(osd_op.get()))->get_reqid() << dendl; + else + dout(10) << "op_journal_transactions " << op << dendl; + + if (journal && journal->is_writeable()) { + journal->submit_entry(op, tbl, orig_len, onjournal, osd_op); + } else if (onjournal) { + apply_manager.add_waiter(op, onjournal); + } +} diff --git a/src/os/filestore/JournalingObjectStore.h b/src/os/filestore/JournalingObjectStore.h new file mode 100644 index 000000000000..e757526b749d --- /dev/null +++ b/src/os/filestore/JournalingObjectStore.h @@ -0,0 +1,143 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_JOURNALINGOBJECTSTORE_H +#define CEPH_JOURNALINGOBJECTSTORE_H + +#include "os/ObjectStore.h" +#include "Journal.h" +#include "FileJournal.h" +#include "common/RWLock.h" + +class JournalingObjectStore : public ObjectStore { +protected: + Journal *journal; + Finisher finisher; + + + class SubmitManager { + Mutex lock; + uint64_t op_seq; + uint64_t op_submitted; + public: + SubmitManager() : + lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context), + op_seq(0), op_submitted(0) + {} + uint64_t op_submit_start(); + void op_submit_finish(uint64_t op); + void set_op_seq(uint64_t seq) { + Mutex::Locker l(lock); + op_submitted = op_seq = seq; + } + uint64_t get_op_seq() { + return op_seq; + } + } submit_manager; + + class ApplyManager { + Journal *&journal; + Finisher &finisher; + + Mutex apply_lock; + bool blocked; + Cond blocked_cond; + int open_ops; + uint64_t max_applied_seq; + + Mutex com_lock; + map > commit_waiters; + uint64_t committing_seq, committed_seq; + + public: + ApplyManager(Journal *&j, Finisher &f) : + journal(j), finisher(f), + apply_lock("JOS::ApplyManager::apply_lock", false, true, false, g_ceph_context), + blocked(false), + open_ops(0), + max_applied_seq(0), + com_lock("JOS::ApplyManager::com_lock", false, true, false, g_ceph_context), + committing_seq(0), committed_seq(0) {} + void reset() { + assert(open_ops == 0); + assert(blocked == false); + max_applied_seq = 0; + committing_seq = 0; + committed_seq = 0; + } + void add_waiter(uint64_t, Context*); + uint64_t op_apply_start(uint64_t op); + void op_apply_finish(uint64_t op); + bool commit_start(); + void commit_started(); + void commit_finish(); + bool is_committing() { + Mutex::Locker l(com_lock); + return committing_seq != committed_seq; + } + uint64_t get_committed_seq() { + Mutex::Locker l(com_lock); + return committed_seq; + } + uint64_t get_committing_seq() { + Mutex::Locker l(com_lock); + return committing_seq; + } + void init_seq(uint64_t fs_op_seq) { + 
{ + Mutex::Locker l(com_lock); + committed_seq = fs_op_seq; + committing_seq = fs_op_seq; + } + { + Mutex::Locker l(apply_lock); + max_applied_seq = fs_op_seq; + } + } + } apply_manager; + + bool replaying; + +protected: + void journal_start(); + void journal_stop(); + void journal_write_close(); + int journal_replay(uint64_t fs_op_seq); + + void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op, + Context *onjournal, TrackedOpRef osd_op); + + virtual int do_transactions(list& tls, uint64_t op_seq) = 0; + +public: + bool is_committing() { + return apply_manager.is_committing(); + } + uint64_t get_committed_seq() { + return apply_manager.get_committed_seq(); + } + +public: + JournalingObjectStore(const std::string& path) + : ObjectStore(path), + journal(NULL), + finisher(g_ceph_context, "JournalObjectStore"), + apply_manager(journal, finisher), + replaying(false) {} + + ~JournalingObjectStore() { + } +}; + +#endif diff --git a/src/os/filestore/LFNIndex.cc b/src/os/filestore/LFNIndex.cc new file mode 100644 index 000000000000..7f8e6d00ffe5 --- /dev/null +++ b/src/os/filestore/LFNIndex.cc @@ -0,0 +1,1356 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +#if defined(__FreeBSD__) +#include +#endif + +#include "osd/osd_types.h" +#include "include/object.h" +#include "common/config.h" +#include "common/debug.h" +#include "include/buffer.h" +#include "common/ceph_crypto.h" +#include "include/compat.h" +#include "chain_xattr.h" + +#include "LFNIndex.h" +using ceph::crypto::SHA1; + +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") " + + +const string LFNIndex::LFN_ATTR = "user.cephos.lfn"; +const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash."; +const string LFNIndex::SUBDIR_PREFIX = "DIR_"; +const string LFNIndex::FILENAME_COOKIE = "long"; +const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN - + FILENAME_COOKIE.size() - + FILENAME_EXTRA; +void LFNIndex::maybe_inject_failure() +{ + if (error_injection_enabled) { + if (current_failure > last_failure && + (((double)(rand() % 10000))/((double)(10000)) + < error_injection_probability)) { + last_failure = current_failure; + current_failure = 0; + throw RetryException(); + } + ++current_failure; + } +} + +// Helper to close fd's when we leave scope. This is useful when used +// in combination with RetryException, thrown by the above. 
+struct FDCloser { + int fd; + FDCloser(int f) : fd(f) {} + ~FDCloser() { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } +}; + + +/* Public methods */ + + +int LFNIndex::init() +{ + return _init(); +} + +int LFNIndex::created(const ghobject_t &oid, const char *path) +{ + WRAP_RETRY( + vector path_comp; + string short_name; + r = decompose_full_path(path, &path_comp, 0, &short_name); + if (r < 0) + goto out; + r = lfn_created(path_comp, oid, short_name); + if (r < 0) + goto out; + r = _created(path_comp, oid, short_name); + if (r < 0) + goto out; + ); +} + +int LFNIndex::unlink(const ghobject_t &oid) +{ + WRAP_RETRY( + vector path; + string short_name; + r = _lookup(oid, &path, &short_name, NULL); + if (r < 0) { + goto out; + } + r = _remove(path, oid, short_name); + if (r < 0) { + goto out; + } + ); +} + +int LFNIndex::lookup(const ghobject_t &oid, + IndexedPath *out_path, + int *hardlink) +{ + WRAP_RETRY( + vector path; + string short_name; + r = _lookup(oid, &path, &short_name, hardlink); + if (r < 0) + goto out; + string full_path = get_full_path(path, short_name); + *out_path = IndexedPath(new Path(full_path, this)); + r = 0; + ); +} + +int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) +{ + return _pre_hash_collection(pg_num, expected_num_objs); +} + + +int LFNIndex::collection_list_partial(const ghobject_t &start, + const ghobject_t &end, + bool sort_bitwise, + int max_count, + vector *ls, + ghobject_t *next) +{ + return _collection_list_partial(start, end, sort_bitwise, max_count, ls, next); +} + +/* Derived class utility methods */ + +int LFNIndex::fsync_dir(const vector &path) +{ + maybe_inject_failure(); + int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY); + if (fd < 0) + return -errno; + FDCloser f(fd); + maybe_inject_failure(); + int r = ::fsync(fd); + maybe_inject_failure(); + if (r < 0) + return -errno; + else + return 0; +} + +int LFNIndex::link_object(const vector &from, + const vector &to, + const ghobject_t 
&oid, + const string &from_short_name) +{ + int r; + string from_path = get_full_path(from, from_short_name); + string to_path; + maybe_inject_failure(); + r = lfn_get_name(to, oid, 0, &to_path, 0); + if (r < 0) + return r; + maybe_inject_failure(); + r = ::link(from_path.c_str(), to_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + else + return 0; +} + +int LFNIndex::remove_objects(const vector &dir, + const map &to_remove, + map *remaining) +{ + set clean_chains; + for (map::const_iterator to_clean = to_remove.begin(); + to_clean != to_remove.end(); + ++to_clean) { + if (!lfn_is_hashed_filename(to_clean->first)) { + maybe_inject_failure(); + int r = ::unlink(get_full_path(dir, to_clean->first).c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + continue; + } + if (clean_chains.count(lfn_get_short_name(to_clean->second, 0))) + continue; + set holes; + map > chain; + for (int i = 0; ; ++i) { + string short_name = lfn_get_short_name(to_clean->second, i); + if (remaining->count(short_name)) { + chain[i] = *(remaining->find(short_name)); + } else if (to_remove.count(short_name)) { + holes.insert(i); + } else { + break; + } + } + + map >::reverse_iterator candidate = chain.rbegin(); + for (set::iterator i = holes.begin(); + i != holes.end(); + ++i) { + if (candidate == chain.rend() || *i > candidate->first) { + string remove_path_name = + get_full_path(dir, lfn_get_short_name(to_clean->second, *i)); + maybe_inject_failure(); + int r = ::unlink(remove_path_name.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + continue; + } + string from = get_full_path(dir, candidate->second.first); + string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i)); + maybe_inject_failure(); + int r = ::rename(from.c_str(), to.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + remaining->erase(candidate->second.first); + remaining->insert(pair( + lfn_get_short_name(candidate->second.second, *i), 
+ candidate->second.second)); + ++candidate; + } + if (!holes.empty()) + clean_chains.insert(lfn_get_short_name(to_clean->second, 0)); + } + return 0; +} + +int LFNIndex::move_objects(const vector &from, + const vector &to) +{ + map to_move; + int r; + r = list_objects(from, 0, NULL, &to_move); + if (r < 0) + return r; + for (map::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + string from_path = get_full_path(from, i->first); + string to_path, to_name; + r = lfn_get_name(to, i->second, &to_name, &to_path, 0); + if (r < 0) + return r; + maybe_inject_failure(); + r = ::link(from_path.c_str(), to_path.c_str()); + if (r < 0 && errno != EEXIST) + return -errno; + maybe_inject_failure(); + r = lfn_created(to, i->second, to_name); + maybe_inject_failure(); + if (r < 0) + return r; + } + r = fsync_dir(to); + if (r < 0) + return r; + for (map::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + maybe_inject_failure(); + r = ::unlink(get_full_path(from, i->first).c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } + return fsync_dir(from); +} + +int LFNIndex::remove_object(const vector &from, + const ghobject_t &oid) +{ + string short_name; + int r, exist; + maybe_inject_failure(); + r = get_mangled_name(from, oid, &short_name, &exist); + maybe_inject_failure(); + if (r < 0) + return r; + if (exist == 0) + return -ENOENT; + return lfn_unlink(from, oid, short_name); +} + +int LFNIndex::get_mangled_name(const vector &from, + const ghobject_t &oid, + string *mangled_name, int *hardlink) +{ + return lfn_get_name(from, oid, mangled_name, 0, hardlink); +} + +int LFNIndex::move_subdir( + LFNIndex &from, + LFNIndex &dest, + const vector &path, + string dir + ) +{ + vector sub_path(path.begin(), path.end()); + sub_path.push_back(dir); + string from_path(from.get_full_path_subdir(sub_path)); + string to_path(dest.get_full_path_subdir(sub_path)); + int r = ::rename(from_path.c_str(), to_path.c_str()); + if (r < 0) + return -errno; + return 
0; +} + +int LFNIndex::move_object( + LFNIndex &from, + LFNIndex &dest, + const vector &path, + const pair &obj + ) +{ + string from_path(from.get_full_path(path, obj.first)); + string to_path; + string to_name; + int exists; + int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists); + if (r < 0) + return r; + if (!exists) { + r = ::link(from_path.c_str(), to_path.c_str()); + if (r < 0) + return r; + } + r = dest.lfn_created(path, obj.second, to_name); + if (r < 0) + return r; + r = dest.fsync_dir(path); + if (r < 0) + return r; + r = from.remove_object(path, obj.second); + if (r < 0) + return r; + return from.fsync_dir(path); +} + + +static int get_hobject_from_oinfo(const char *dir, const char *file, + ghobject_t *o) +{ + char path[PATH_MAX]; + bufferptr bp(PATH_MAX); + snprintf(path, sizeof(path), "%s/%s", dir, file); + // Hack, user.ceph._ is the attribute used to store the object info + int r = chain_getxattr(path, "user.ceph._", bp.c_str(), bp.length()); + if (r < 0) + return r; + bufferlist bl; + bl.push_back(bp); + object_info_t oi(bl); + *o = ghobject_t(oi.soid); + return 0; +} + + +int LFNIndex::list_objects(const vector &to_list, int max_objs, + long *handle, map *out) +{ + string to_list_path = get_full_path_subdir(to_list); + DIR *dir = ::opendir(to_list_path.c_str()); + char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1]; + int r; + if (!dir) { + return -errno; + } + + if (handle && *handle) { + seekdir(dir, *handle); + } + + struct dirent *de; + int listed = 0; + bool end = false; + while (!::readdir_r(dir, reinterpret_cast(buf), &de)) { + if (!de) { + end = true; + break; + } + if (max_objs > 0 && listed >= max_objs) { + break; + } + if (de->d_name[0] == '.') + continue; + string short_name(de->d_name); + ghobject_t obj; + if (lfn_is_object(short_name)) { + r = lfn_translate(to_list, short_name, &obj); + if (r < 0) { + r = -errno; + goto cleanup; + } else if (r > 0) { + string long_name = lfn_generate_object_name(obj); + if 
(!lfn_must_hash(long_name)) { + assert(long_name == short_name); + } + if (index_version == HASH_INDEX_TAG) + get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj); + + out->insert(pair(short_name, obj)); + ++listed; + } else { + continue; + } + } + } + + if (handle && !end) { + *handle = telldir(dir); + } + + r = 0; + cleanup: + ::closedir(dir); + return r; +} + +int LFNIndex::list_subdirs(const vector &to_list, + vector *out) +{ + string to_list_path = get_full_path_subdir(to_list); + DIR *dir = ::opendir(to_list_path.c_str()); + char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1]; + if (!dir) + return -errno; + + struct dirent *de; + while (!::readdir_r(dir, reinterpret_cast(buf), &de)) { + if (!de) { + break; + } + string short_name(de->d_name); + string demangled_name; + if (lfn_is_subdir(short_name, &demangled_name)) { + out->push_back(demangled_name); + } + } + + ::closedir(dir); + return 0; +} + +int LFNIndex::create_path(const vector &to_create) +{ + maybe_inject_failure(); + int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777); + maybe_inject_failure(); + if (r < 0) + return -errno; + else + return 0; +} + +int LFNIndex::remove_path(const vector &to_remove) +{ + maybe_inject_failure(); + int r = ::rmdir(get_full_path_subdir(to_remove).c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + else + return 0; +} + +int LFNIndex::path_exists(const vector &to_check, int *exists) +{ + string full_path = get_full_path_subdir(to_check); + struct stat buf; + if (::stat(full_path.c_str(), &buf)) { + int r = -errno; + if (r == -ENOENT) { + *exists = 0; + return 0; + } else { + return r; + } + } else { + *exists = 1; + return 0; + } +} + +int LFNIndex::add_attr_path(const vector &path, + const string &attr_name, + bufferlist &attr_value) +{ + string full_path = get_full_path_subdir(path); + maybe_inject_failure(); + return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(), + 
reinterpret_cast(attr_value.c_str()), + attr_value.length()); +} + +int LFNIndex::get_attr_path(const vector &path, + const string &attr_name, + bufferlist &attr_value) +{ + string full_path = get_full_path_subdir(path); + size_t size = 1024; // Initial + while (1) { + bufferptr buf(size); + int r = chain_getxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(), + reinterpret_cast(buf.c_str()), + size); + if (r > 0) { + buf.set_length(r); + attr_value.push_back(buf); + break; + } else { + r = -errno; + if (r == -ERANGE) { + size *= 2; + } else { + return r; + } + } + } + return 0; +} + +int LFNIndex::remove_attr_path(const vector &path, + const string &attr_name) +{ + string full_path = get_full_path_subdir(path); + string mangled_attr_name = mangle_attr_name(attr_name); + maybe_inject_failure(); + return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str()); +} + +string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid) +{ + char s[FILENAME_MAX_LEN]; + char *end = s + sizeof(s); + char *t = s; + + assert(oid.generation == ghobject_t::NO_GEN); + const char *i = oid.hobj.oid.name.c_str(); + // Escape subdir prefix + if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { + *t++ = '\\'; + *t++ = 'd'; + i += 4; + } + while (*i && t < end) { + if (*i == '\\') { + *t++ = '\\'; + *t++ = '\\'; + } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading . 
+ *t++ = '\\'; + *t++ = '.'; + } else if (*i == '/') { + *t++ = '\\'; + *t++ = 's'; + } else + *t++ = *i; + i++; + } + + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "_head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "_snapdir"); + else + t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap); + snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); + + return string(s); +} + +static void append_escaped(string::const_iterator begin, + string::const_iterator end, + string *out) +{ + for (string::const_iterator i = begin; i != end; ++i) { + if (*i == '\\') { + out->append("\\\\"); + } else if (*i == '/') { + out->append("\\s"); + } else if (*i == '_') { + out->append("\\u"); + } else if (*i == '\0') { + out->append("\\n"); + } else { + out->append(i, i+1); + } + } +} + +string LFNIndex::lfn_generate_object_name(const ghobject_t &oid) +{ + if (index_version == HASH_INDEX_TAG) + return lfn_generate_object_name_keyless(oid); + if (index_version == HASH_INDEX_TAG_2) + return lfn_generate_object_name_poolless(oid); + + string full_name; + string::const_iterator i = oid.hobj.oid.name.begin(); + if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { + full_name.append("\\d"); + i += 4; + } else if (oid.hobj.oid.name[0] == '.') { + full_name.append("\\."); + ++i; + } + append_escaped(i, oid.hobj.oid.name.end(), &full_name); + full_name.append("_"); + append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name); + full_name.append("_"); + + char buf[PATH_MAX]; + char *t = buf; + char *end = t + sizeof(buf); + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); + snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); + full_name += string(buf); + 
full_name.append("_"); + + append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name); + full_name.append("_"); + + t = buf; + end = t + sizeof(buf); + if (oid.hobj.pool == -1) + t += snprintf(t, end - t, "none"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool); + full_name += string(buf); + + if (oid.generation != ghobject_t::NO_GEN || + oid.shard_id != shard_id_t::NO_SHARD) { + full_name.append("_"); + + t = buf; + end = t + sizeof(buf); + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.generation); + full_name += string(buf); + + full_name.append("_"); + + t = buf; + end = t + sizeof(buf); + t += snprintf(t, end - t, "%x", (int)oid.shard_id); + full_name += string(buf); + } + + return full_name; +} + +string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid) +{ + if (index_version == HASH_INDEX_TAG) + return lfn_generate_object_name_keyless(oid); + + assert(oid.generation == ghobject_t::NO_GEN); + string full_name; + string::const_iterator i = oid.hobj.oid.name.begin(); + if (oid.hobj.oid.name.substr(0, 4) == "DIR_") { + full_name.append("\\d"); + i += 4; + } else if (oid.hobj.oid.name[0] == '.') { + full_name.append("\\."); + ++i; + } + append_escaped(i, oid.hobj.oid.name.end(), &full_name); + full_name.append("_"); + append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name); + full_name.append("_"); + + char snap_with_hash[PATH_MAX]; + char *t = snap_with_hash; + char *end = t + sizeof(snap_with_hash); + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap); + snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash()); + full_name += string(snap_with_hash); + return full_name; +} + +int LFNIndex::lfn_get_name(const vector &path, + const ghobject_t &oid, + string 
*mangled_name, string *out_path, + int *hardlink) +{ + string subdir_path = get_full_path_subdir(path); + string full_name = lfn_generate_object_name(oid); + int r; + + if (!lfn_must_hash(full_name)) { + if (mangled_name) + *mangled_name = full_name; + if (out_path) + *out_path = get_full_path(path, full_name); + if (hardlink) { + struct stat buf; + string full_path = get_full_path(path, full_name); + maybe_inject_failure(); + r = ::stat(full_path.c_str(), &buf); + if (r < 0) { + if (errno == ENOENT) + *hardlink = 0; + else + return -errno; + } else { + *hardlink = buf.st_nlink; + } + } + return 0; + } + + int i = 0; + string candidate; + string candidate_path; + char buf[FILENAME_MAX_LEN + 1]; + for ( ; ; ++i) { + candidate = lfn_get_short_name(oid, i); + candidate_path = get_full_path(path, candidate); + r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), + buf, sizeof(buf)); + if (r < 0) { + if (errno != ENODATA && errno != ENOENT) + return -errno; + if (errno == ENODATA) { + // Left over from incomplete transaction, it'll be replayed + maybe_inject_failure(); + r = ::unlink(candidate_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } + if (mangled_name) + *mangled_name = candidate; + if (out_path) + *out_path = candidate_path; + if (hardlink) + *hardlink = 0; + return 0; + } + assert(r > 0); + buf[MIN((int)sizeof(buf) - 1, r)] = '\0'; + if (!strcmp(buf, full_name.c_str())) { + if (mangled_name) + *mangled_name = candidate; + if (out_path) + *out_path = candidate_path; + if (hardlink) { + struct stat st; + r = ::stat(candidate_path.c_str(), &st); + *hardlink = st.st_nlink; + } + return 0; + } + r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(), + buf, sizeof(buf)); + if (r > 0) { + // only consider alt name if nlink > 1 + struct stat st; + int rc = ::stat(candidate_path.c_str(), &st); + if (rc < 0) + return -errno; + if (st.st_nlink <= 1) { + // left over from incomplete unlink, remove + 
maybe_inject_failure(); + dout(20) << __func__ << " found extra alt attr for " << candidate_path + << ", long name " << string(buf, r) << dendl; + rc = chain_removexattr(candidate_path.c_str(), + get_alt_lfn_attr().c_str()); + maybe_inject_failure(); + if (rc < 0) + return rc; + continue; + } + buf[MIN((int)sizeof(buf) - 1, r)] = '\0'; + if (!strcmp(buf, full_name.c_str())) { + dout(20) << __func__ << " used alt attr for " << full_name << dendl; + if (mangled_name) + *mangled_name = candidate; + if (out_path) + *out_path = candidate_path; + if (hardlink) + *hardlink = st.st_nlink; + return 0; + } + } + } + assert(0); // Unreachable + return 0; +} + +int LFNIndex::lfn_created(const vector &path, + const ghobject_t &oid, + const string &mangled_name) +{ + if (!lfn_is_hashed_filename(mangled_name)) + return 0; + string full_path = get_full_path(path, mangled_name); + string full_name = lfn_generate_object_name(oid); + maybe_inject_failure(); + + // if the main attr exists and is different, move it to the alt attr. 
+ char buf[FILENAME_MAX_LEN + 1]; + int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), + buf, sizeof(buf)); + if (r >= 0 && (r != (int)full_name.length() || + memcmp(buf, full_name.c_str(), full_name.length()))) { + dout(20) << __func__ << " " << mangled_name + << " moving old name to alt attr " + << string(buf, r) + << ", new name is " << full_name << dendl; + r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(), + buf, r); + if (r < 0) + return r; + } + + return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(), + full_name.c_str(), full_name.size()); +} + +int LFNIndex::lfn_unlink(const vector &path, + const ghobject_t &oid, + const string &mangled_name) +{ + if (!lfn_is_hashed_filename(mangled_name)) { + string full_path = get_full_path(path, mangled_name); + maybe_inject_failure(); + int r = ::unlink(full_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + return 0; + } + string subdir_path = get_full_path_subdir(path); + + + int i = 0; + for ( ; ; ++i) { + string candidate = lfn_get_short_name(oid, i); + if (candidate == mangled_name) + break; + } + int removed_index = i; + ++i; + for ( ; ; ++i) { + struct stat buf; + string to_check = lfn_get_short_name(oid, i); + string to_check_path = get_full_path(path, to_check); + int r = ::stat(to_check_path.c_str(), &buf); + if (r < 0) { + if (errno == ENOENT) { + break; + } else { + return -errno; + } + } + } + string full_path = get_full_path(path, mangled_name); + int fd = ::open(full_path.c_str(), O_RDONLY); + if (fd < 0) + return -errno; + FDCloser f(fd); + if (i == removed_index + 1) { + maybe_inject_failure(); + int r = ::unlink(full_path.c_str()); + maybe_inject_failure(); + if (r < 0) + return -errno; + } else { + string& rename_to = full_path; + string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1)); + maybe_inject_failure(); + int r = ::rename(rename_from.c_str(), rename_to.c_str()); + maybe_inject_failure(); + if (r < 0) + return 
-errno; + } + struct stat st; + int r = ::fstat(fd, &st); + if (r == 0 && st.st_nlink > 0) { + // remove alt attr + dout(20) << __func__ << " removing alt attr from " << full_path << dendl; + fsync_dir(path); + chain_fremovexattr(fd, get_alt_lfn_attr().c_str()); + } + return r; +} + +int LFNIndex::lfn_translate(const vector &path, + const string &short_name, + ghobject_t *out) +{ + if (!lfn_is_hashed_filename(short_name)) { + return lfn_parse_object_name(short_name, out); + } + // Get lfn_attr + string full_path = get_full_path(path, short_name); + char attr[PATH_MAX]; + int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1); + if (r < 0) + return -errno; + if (r < (int)sizeof(attr)) + attr[r] = '\0'; + + string long_name(attr); + return lfn_parse_object_name(long_name, out); +} + +bool LFNIndex::lfn_is_object(const string &short_name) +{ + return lfn_is_hashed_filename(short_name) || !lfn_is_subdir(short_name, 0); +} + +bool LFNIndex::lfn_is_subdir(const string &name, string *demangled) +{ + if (name.substr(0, SUBDIR_PREFIX.size()) == SUBDIR_PREFIX) { + if (demangled) + *demangled = demangle_path_component(name); + return 1; + } + return 0; +} + +static int parse_object(const char *s, ghobject_t& o) +{ + const char *hash = s + strlen(s) - 1; + while (*hash != '_' && + hash > s) + hash--; + const char *bar = hash - 1; + while (*bar != '_' && + bar > s) + bar--; + if (*bar == '_') { + char buf[bar-s + 1]; + char *t = buf; + const char *i = s; + while (i < bar) { + if (*i == '\\') { + i++; + switch (*i) { + case '\\': *t++ = '\\'; break; + case '.': *t++ = '.'; break; + case 's': *t++ = '/'; break; + case 'd': { + *t++ = 'D'; + *t++ = 'I'; + *t++ = 'R'; + *t++ = '_'; + break; + } + default: assert(0); + } + } else { + *t++ = *i; + } + i++; + } + *t = 0; + o.hobj.oid.name = string(buf, t-buf); + if (strncmp(bar+1, "head", 4) == 0) + o.hobj.snap = CEPH_NOSNAP; + else if (strncmp(bar+1, "snapdir", 7) == 0) + o.hobj.snap = CEPH_SNAPDIR; 
+ else + o.hobj.snap = strtoull(bar+1, NULL, 16); + + uint32_t hobject_hash_input; + sscanf(hash, "_%X", &hobject_hash_input); + o.hobj.set_hash(hobject_hash_input); + + return 1; + } + return 0; +} + +bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out) +{ + bool r = parse_object(long_name.c_str(), *out); + int64_t pool = -1; + spg_t pg; + if (coll().is_pg_prefix(&pg)) + pool = (int64_t)pg.pgid.pool(); + out->hobj.pool = pool; + if (!r) return r; + string temp = lfn_generate_object_name(*out); + return r; +} + +static bool append_unescaped(string::const_iterator begin, + string::const_iterator end, + string *out) +{ + for (string::const_iterator i = begin; i != end; ++i) { + if (*i == '\\') { + ++i; + if (*i == '\\') + out->append("\\"); + else if (*i == 's') + out->append("/"); + else if (*i == 'n') + (*out) += '\0'; + else if (*i == 'u') + out->append("_"); + else + return false; + } else { + out->append(i, i+1); + } + } + return true; +} + +bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name, + ghobject_t *out) +{ + string name; + string key; + uint32_t hash; + snapid_t snap; + + string::const_iterator current = long_name.begin(); + if (*current == '\\') { + ++current; + if (current == long_name.end()) { + return false; + } else if (*current == 'd') { + name.append("DIR_"); + ++current; + } else if (*current == '.') { + name.append("."); + ++current; + } else { + --current; + } + } + + string::const_iterator end = current; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &name)) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &key)) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + string 
snap_str(current, end); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end != long_name.end()) + return false; + string hash_str(current, end); + + if (snap_str == "head") + snap = CEPH_NOSNAP; + else if (snap_str == "snapdir") + snap = CEPH_SNAPDIR; + else + snap = strtoull(snap_str.c_str(), NULL, 16); + sscanf(hash_str.c_str(), "%X", &hash); + + + int64_t pool = -1; + spg_t pg; + if (coll().is_pg_prefix(&pg)) + pool = (int64_t)pg.pgid.pool(); + (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, "")); + return true; +} + + +bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out) +{ + string name; + string key; + string ns; + uint32_t hash; + snapid_t snap; + uint64_t pool; + gen_t generation = ghobject_t::NO_GEN; + shard_id_t shard_id = shard_id_t::NO_SHARD; + + if (index_version == HASH_INDEX_TAG) + return lfn_parse_object_name_keyless(long_name, out); + if (index_version == HASH_INDEX_TAG_2) + return lfn_parse_object_name_poolless(long_name, out); + + string::const_iterator current = long_name.begin(); + if (*current == '\\') { + ++current; + if (current == long_name.end()) { + return false; + } else if (*current == 'd') { + name.append("DIR_"); + ++current; + } else if (*current == '.') { + name.append("."); + ++current; + } else { + --current; + } + } + + string::const_iterator end = current; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &name)) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &key)) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + string snap_str(current, end); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == 
long_name.end()) + return false; + string hash_str(current, end); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &ns)) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + string pstring(current, end); + + // Optional generation/shard_id + string genstring, shardstring; + if (end != long_name.end()) { + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + genstring = string(current, end); + + generation = (gen_t)strtoull(genstring.c_str(), NULL, 16); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end != long_name.end()) + return false; + shardstring = string(current, end); + + shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16); + } + + if (snap_str == "head") + snap = CEPH_NOSNAP; + else if (snap_str == "snapdir") + snap = CEPH_SNAPDIR; + else + snap = strtoull(snap_str.c_str(), NULL, 16); + sscanf(hash_str.c_str(), "%X", &hash); + + if (pstring == "none") + pool = (uint64_t)-1; + else + pool = strtoull(pstring.c_str(), NULL, 16); + + (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id); + return true; +} + +bool LFNIndex::lfn_is_hashed_filename(const string &name) +{ + if (name.size() < (unsigned)FILENAME_SHORT_LEN) { + return 0; + } + if (name.substr(name.size() - FILENAME_COOKIE.size(), FILENAME_COOKIE.size()) + == FILENAME_COOKIE) { + return 1; + } else { + return 0; + } +} + +bool LFNIndex::lfn_must_hash(const string &long_name) +{ + return (int)long_name.size() >= FILENAME_SHORT_LEN; +} + +static inline void buf_to_hex(const unsigned char *buf, int len, char *str) +{ + int i; + str[0] = '\0'; + for (i = 0; i < len; i++) { + sprintf(&str[i*2], "%02x", (int)buf[i]); + } +} + +int LFNIndex::hash_filename(const char *filename, char *hash, int 
buf_len) +{ + if (buf_len < FILENAME_HASH_LEN + 1) + return -EINVAL; + + char buf[FILENAME_LFN_DIGEST_SIZE]; + char hex[FILENAME_LFN_DIGEST_SIZE * 2]; + + SHA1 h; + h.Update((const byte *)filename, strlen(filename)); + h.Final((byte *)buf); + + buf_to_hex((byte *)buf, (FILENAME_HASH_LEN + 1) / 2, hex); + strncpy(hash, hex, FILENAME_HASH_LEN); + hash[FILENAME_HASH_LEN] = '\0'; + return 0; +} + +void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len) +{ + char hash[FILENAME_HASH_LEN + 1]; + + assert(len >= FILENAME_SHORT_LEN + 4); + + strncpy(filename, old_filename, FILENAME_PREFIX_LEN); + filename[FILENAME_PREFIX_LEN] = '\0'; + if ((int)strlen(filename) < FILENAME_PREFIX_LEN) + return; + if (old_filename[FILENAME_PREFIX_LEN] == '\0') + return; + + hash_filename(old_filename, hash, sizeof(hash)); + int ofs = FILENAME_PREFIX_LEN; + while (1) { + int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str()); + if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs) + break; + ofs--; + } +} + +string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i) +{ + string long_name = lfn_generate_object_name(oid); + assert(lfn_must_hash(long_name)); + char buf[FILENAME_SHORT_LEN + 4]; + build_filename(long_name.c_str(), i, buf, sizeof(buf)); + return string(buf); +} + +const string &LFNIndex::get_base_path() +{ + return base_path; +} + +string LFNIndex::get_full_path_subdir(const vector &rel) +{ + string retval = get_base_path(); + for (vector::const_iterator i = rel.begin(); + i != rel.end(); + ++i) { + retval += "/"; + retval += mangle_path_component(*i); + } + return retval; +} + +string LFNIndex::get_full_path(const vector &rel, const string &name) +{ + return get_full_path_subdir(rel) + "/" + name; +} + +string LFNIndex::mangle_path_component(const string &component) +{ + return SUBDIR_PREFIX + component; +} + +string LFNIndex::demangle_path_component(const string &component) +{ + return 
component.substr(SUBDIR_PREFIX.size(), component.size() - SUBDIR_PREFIX.size()); +} + +int LFNIndex::decompose_full_path(const char *in, vector *out, + ghobject_t *oid, string *shortname) +{ + const char *beginning = in + get_base_path().size(); + const char *end = beginning; + while (1) { + end++; + beginning = end++; + for ( ; *end != '\0' && *end != '/'; ++end) ; + if (*end != '\0') { + out->push_back(demangle_path_component(string(beginning, end - beginning))); + continue; + } else { + break; + } + } + *shortname = string(beginning, end - beginning); + if (oid) { + int r = lfn_translate(*out, *shortname, oid); + if (r < 0) + return r; + } + return 0; +} + +string LFNIndex::mangle_attr_name(const string &attr) +{ + return PHASH_ATTR_PREFIX + attr; +} diff --git a/src/os/filestore/LFNIndex.h b/src/os/filestore/LFNIndex.h new file mode 100644 index 000000000000..8f04407fc855 --- /dev/null +++ b/src/os/filestore/LFNIndex.h @@ -0,0 +1,578 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef OS_LFNINDEX_H +#define OS_LFNINDEX_H + +#include +#include +#include +#include +#include "include/memory.h" +#include + +#include "osd/osd_types.h" +#include "include/object.h" +#include "common/ceph_crypto.h" + +#include "CollectionIndex.h" + +/** + * LFNIndex also encapsulates logic for manipulating + * subdirectories of of a collection as well as the long filename + * logic. + * + * The protected methods provide machinery for derived classes to + * manipulate subdirectories and objects. + * + * The virtual methods are to be overridden to provide the actual + * hashed layout. 
+ * + * User must call created when an object is created. + * + * Syncronization: Calling code must ensure that there are no object + * creations or deletions during the lifetime of a Path object (except + * of an object at that path). + * + * Unless otherwise noted, methods which return an int return 0 on sucess + * and a negative error code on failure. + */ +#define WRAP_RETRY(x) { \ + bool failed = false; \ + int r = 0; \ + init_inject_failure(); \ + while (1) { \ + try { \ + if (failed) { \ + r = cleanup(); \ + assert(r == 0); \ + } \ + { x } \ + out: \ + complete_inject_failure(); \ + return r; \ + } catch (RetryException) { \ + failed = true; \ + } catch (...) { \ + assert(0); \ + } \ + } \ + return -1; \ + } \ + + + +class LFNIndex : public CollectionIndex { + /// Hash digest output size. + static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE; + /// Length of filename hash. + static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE; + /// Max filename size. + static const int FILENAME_MAX_LEN = 4096; + /// Length of hashed filename. + static const int FILENAME_SHORT_LEN = 255; + /// Length of hashed filename prefix. + static const int FILENAME_PREFIX_LEN; + /// Length of hashed filename cookie. + static const int FILENAME_EXTRA = 4; + /// Lfn cookie value. + static const string FILENAME_COOKIE; + /// Name of LFN attribute for storing full name. + static const string LFN_ATTR; + /// Prefix for subdir index attributes. + static const string PHASH_ATTR_PREFIX; + /// Prefix for index subdirectories. + static const string SUBDIR_PREFIX; + + /// Path to Index base. 
+ const string base_path; + +protected: + const uint32_t index_version; + + /// true if retry injection is enabled + struct RetryException : public exception {}; + bool error_injection_enabled; + bool error_injection_on; + double error_injection_probability; + uint64_t last_failure; + uint64_t current_failure; + void init_inject_failure() { + if (error_injection_on) { + error_injection_enabled = true; + last_failure = current_failure = 0; + } + } + void maybe_inject_failure(); + void complete_inject_failure() { + error_injection_enabled = false; + } + +private: + string lfn_attribute, lfn_alt_attribute; + coll_t collection; + +public: + /// Constructor + LFNIndex( + coll_t collection, + const char *base_path, ///< [in] path to Index root + uint32_t index_version, + double _error_injection_probability=0) + : CollectionIndex(collection), + base_path(base_path), + index_version(index_version), + error_injection_enabled(false), + error_injection_on(_error_injection_probability != 0), + error_injection_probability(_error_injection_probability), + last_failure(0), current_failure(0), + collection(collection) { + if (index_version == HASH_INDEX_TAG) { + lfn_attribute = LFN_ATTR; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", index_version); + lfn_attribute = LFN_ATTR + string(buf); + lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt"; + } + } + + coll_t coll() const { return collection; } + + /// Virtual destructor + virtual ~LFNIndex() {} + + /// @see CollectionIndex + int init(); + + /// @see CollectionIndex + int cleanup() = 0; + + /// @see CollectionIndex + int created( + const ghobject_t &oid, + const char *path + ); + + /// @see CollectionIndex + int unlink( + const ghobject_t &oid + ); + + /// @see CollectionIndex + int lookup( + const ghobject_t &oid, + IndexedPath *path, + int *hardlink + ); + + /// @see CollectionIndex; + int pre_hash_collection( + uint32_t pg_num, + uint64_t expected_num_objs + ); + + /// @see CollectionIndex + int 
collection_list_partial( + const ghobject_t &start, + const ghobject_t &end, + bool sort_bitwise, + int max_count, + vector *ls, + ghobject_t *next + ); + + virtual int _split( + uint32_t match, //< [in] value to match + uint32_t bits, //< [in] bits to check + CollectionIndex* dest //< [in] destination index + ) = 0; + + /// @see CollectionIndex + int split( + uint32_t match, + uint32_t bits, + CollectionIndex* dest + ) { + WRAP_RETRY( + r = _split(match, bits, dest); + goto out; + ); + } + + +protected: + virtual int _init() = 0; + + /// Will be called upon object creation + virtual int _created( + const vector &path, ///< [in] Path to subdir. + const ghobject_t &oid, ///< [in] Object created. + const string &mangled_name ///< [in] Mangled filename. + ) = 0; + + /// Will be called to remove an object + virtual int _remove( + const vector &path, ///< [in] Path to subdir. + const ghobject_t &oid, ///< [in] Object to remove. + const string &mangled_name ///< [in] Mangled filename. + ) = 0; + + /// Return the path and mangled_name for oid. + virtual int _lookup( + const ghobject_t &oid,///< [in] Object for lookup. + vector *path, ///< [out] Path to the object. + string *mangled_name, ///< [out] Mangled filename. + int *exists ///< [out] True if the object exists. + ) = 0; + + /// Pre-hash the collection with the given pg number and + /// expected number of objects in the collection. + virtual int _pre_hash_collection( + uint32_t pg_num, + uint64_t expected_num_objs + ) = 0; + + /// @see CollectionIndex + virtual int _collection_list_partial( + const ghobject_t &start, + const ghobject_t &end, + bool sort_bitwise, + int max_count, + vector *ls, + ghobject_t *next + ) = 0; + +protected: + + /* Non-virtual utility methods */ + + /// Sync a subdirectory + int fsync_dir( + const vector &path ///< [in] Path to sync + ); ///< @return Error Code, 0 on success + + /// Link an object from from into to + int link_object( + const vector &from, ///< [in] Source subdirectory. 
+ const vector &to, ///< [in] Dest subdirectory. + const ghobject_t &oid, ///< [in] Object to move. + const string &from_short_name ///< [in] Mangled filename of oid. + ); ///< @return Error Code, 0 on success + + /** + * Efficiently remove objects from a subdirectory + * + * remove_object invalidates mangled names in the directory requiring + * the mangled name of each additional object to be looked up a second + * time. remove_objects removes the need for additional lookups + * + * @param [in] dir Directory from which to remove. + * @param [in] map of objects to remove to mangle names + * @param [in,out] map of filenames to objects + * @return Error Code, 0 on success. + */ + int remove_objects( + const vector &dir, + const map &to_remove, + map *remaining + ); + + + /** + * Moves contents of from into to. + * + * Invalidates mangled names in to. If interrupted, all objects will be + * present in to before objects are removed from from. Ignores EEXIST + * while linking into to. + * @return Error Code, 0 on success + */ + int move_objects( + const vector &from, ///< [in] Source subdirectory. + const vector &to ///< [in] Dest subdirectory. + ); + + /** + * Remove an object from from. + * + * Invalidates mangled names in from. + * @return Error Code, 0 on success + */ + int remove_object( + const vector &from, ///< [in] Directory from which to remove. + const ghobject_t &to_remove ///< [in] Object to remove. + ); + + /** + * Gets the filename corresponding to oid in from. + * + * The filename may differ between subdirectories. Furthermore, + * file creations or removals in from may invalidate the name. 
+ * @return Error code on failure, 0 on success + */ + int get_mangled_name( + const vector &from, ///< [in] Subdirectory + const ghobject_t &oid, ///< [in] Object + string *mangled_name, ///< [out] Filename + int *hardlink ///< [out] hardlink for this file, hardlink=0 mean no-exist + ); + + /// do move subdir from from to dest + static int move_subdir( + LFNIndex &from, ///< [in] from index + LFNIndex &dest, ///< [in] to index + const vector &path, ///< [in] path containing dir + string dir ///< [in] dir to move + ); + + /// do move object from from to dest + static int move_object( + LFNIndex &from, ///< [in] from index + LFNIndex &dest, ///< [in] to index + const vector &path, ///< [in] path to split + const pair &obj ///< [in] obj to move + ); + + /** + * Lists objects in to_list. + * + * @param [in] to_list Directory to list. + * @param [in] max_objects Max number to list. + * @param [in,out] handle Cookie for continuing the listing. + * Initialize to zero to start at the beginning of the directory. + * @param [out] out Mapping of listed object filenames to objects. + * @return Error code on failure, 0 on success + */ + int list_objects( + const vector &to_list, + int max_objects, + long *handle, + map *out + ); + + /// Lists subdirectories. + int list_subdirs( + const vector &to_list, ///< [in] Directory to list. + vector *out ///< [out] Subdirectories listed. + ); + + /// Create subdirectory. + int create_path( + const vector &to_create ///< [in] Subdirectory to create. + ); + + /// Remove subdirectory. + int remove_path( + const vector &to_remove ///< [in] Subdirectory to remove. + ); + + /// Check whether to_check exists. + int path_exists( + const vector &to_check, ///< [in] Subdirectory to check. + int *exists ///< [out] 1 if it exists, 0 else + ); + + /// Save attr_value to attr_name attribute on path. + int add_attr_path( + const vector &path, ///< [in] Path to modify. + const string &attr_name, ///< [in] Name of attribute. 
+ bufferlist &attr_value ///< [in] Value to save. + ); + + /// Read into attr_value attribute attr_name on path. + int get_attr_path( + const vector &path, ///< [in] Path to read. + const string &attr_name, ///< [in] Attribute to read. + bufferlist &attr_value ///< [out] Attribute value read. + ); + + /// Remove attr from path + int remove_attr_path( + const vector &path, ///< [in] path from which to remove attr + const string &attr_name ///< [in] attr to remove + ); ///< @return Error code, 0 on success + +private: + /* lfn translation functions */ + + /** + * Gets the version specific lfn attribute tag + */ + const string &get_lfn_attr() const { + return lfn_attribute; + } + const string &get_alt_lfn_attr() const { + return lfn_alt_attribute; + } + + /** + * Gets the filename corresponding to oid in path. + * + * @param [in] path Path in which to get filename for oid. + * @param [in] oid Object for which to get filename. + * @param [out] mangled_name Filename for oid, pass NULL if not needed. + * @param [out] full_path Fullpath for oid, pass NULL if not needed. + * @param [out] hardlink of this file, 0 mean no-exist, pass NULL if + * not needed + * @return Error Code, 0 on success. + */ + int lfn_get_name( + const vector &path, + const ghobject_t &oid, + string *mangled_name, + string *full_path, + int *hardlink + ); + + /// Adjusts path contents when oid is created at name mangled_name. + int lfn_created( + const vector &path, ///< [in] Path to adjust. + const ghobject_t &oid, ///< [in] Object created. + const string &mangled_name ///< [in] Filename of created object. + ); + + /// Removes oid from path while adjusting path contents + int lfn_unlink( + const vector &path, ///< [in] Path containing oid. + const ghobject_t &oid, ///< [in] Object to remove. + const string &mangled_name ///< [in] Filename of object to remove. + ); + + /// Translate a file into a ghobject_t. + int lfn_translate( + const vector &path, ///< [in] Path containing the file. 
+ const string &short_name, ///< [in] Filename to translate. + ghobject_t *out ///< [out] Object found. + ); ///< @return Negative error code on error, 0 if not an object, 1 else + + /* manglers/demanglers */ + /// Filters object filenames + bool lfn_is_object( + const string &short_name ///< [in] Filename to check + ); ///< True if short_name is an object, false otherwise + + /// Filters subdir filenames + bool lfn_is_subdir( + const string &short_name, ///< [in] Filename to check. + string *demangled_name ///< [out] Demangled subdir name. + ); ///< @return True if short_name is a subdir, false otherwise + + /// Generate object name + string lfn_generate_object_name_keyless( + const ghobject_t &oid ///< [in] Object for which to generate. + ); ///< @return Generated object name. + + /// Generate object name + string lfn_generate_object_name_poolless( + const ghobject_t &oid ///< [in] Object for which to generate. + ); ///< @return Generated object name. + + /// Generate object name + string lfn_generate_object_name( + const ghobject_t &oid ///< [in] Object for which to generate. + ); ///< @return Generated object name. + + /// Parse object name + bool lfn_parse_object_name_keyless( + const string &long_name, ///< [in] Name to parse + ghobject_t *out ///< [out] Resulting Object + ); ///< @return True if successful, False otherwise. + + /// Parse object name + bool lfn_parse_object_name_poolless( + const string &long_name, ///< [in] Name to parse + ghobject_t *out ///< [out] Resulting Object + ); ///< @return True if successful, False otherwise. + + /// Parse object name + bool lfn_parse_object_name( + const string &long_name, ///< [in] Name to parse + ghobject_t *out ///< [out] Resulting Object + ); ///< @return True if successful, False otherwise. + + /// Checks whether short_name is a hashed filename. + bool lfn_is_hashed_filename( + const string &short_name ///< [in] Name to check. + ); ///< @return True if short_name is hashed, False otherwise. 
+ + /// Checks whether long_name must be hashed. + bool lfn_must_hash( + const string &long_name ///< [in] Name to check. + ); ///< @return True if long_name must be hashed, False otherwise. + + /// Generate hashed name. + string lfn_get_short_name( + const ghobject_t &oid, ///< [in] Object for which to generate. + int i ///< [in] Index of hashed name to generate. + ); ///< @return Hashed filename. + + /* other common methods */ + /// Gets the base path + const string &get_base_path(); ///< @return Index base_path + + /// Get full path the subdir + string get_full_path_subdir( + const vector &rel ///< [in] The subdir. + ); ///< @return Full path to rel. + + /// Get full path to object + string get_full_path( + const vector &rel, ///< [in] Path to object. + const string &name ///< [in] Filename of object. + ); ///< @return Fullpath to object at name in rel. + + /// Get mangled path component + string mangle_path_component( + const string &component ///< [in] Component to mangle + ); /// @return Mangled component + + /// Demangle component + string demangle_path_component( + const string &component ///< [in] Subdir name to demangle + ); ///< @return Demangled path component. + + /// Decompose full path into object name and filename. + int decompose_full_path( + const char *in, ///< [in] Full path to object. + vector *out, ///< [out] Path to object at in. + ghobject_t *oid, ///< [out] Object at in. + string *shortname ///< [out] Filename of object at in. + ); ///< @return Error Code, 0 on success. + + /// Mangle attribute name + string mangle_attr_name( + const string &attr ///< [in] Attribute to mangle. + ); ///< @return Mangled attribute name. + + /// Builds hashed filename + void build_filename( + const char *old_filename, ///< [in] Filename to convert. + int i, ///< [in] Index of hash. + char *filename, ///< [out] Resulting filename. 
+ int len ///< [in] Size of buffer for filename + ); ///< @return Error Code, 0 on success + + /// Get hash of filename + int hash_filename( + const char *filename, ///< [in] Filename to hash. + char *hash, ///< [out] Hash of filename. + int len ///< [in] Size of hash buffer. + ); ///< @return Error Code, 0 on success. + + friend class TestWrapLFNIndex; +}; +typedef LFNIndex::IndexedPath IndexedPath; + +#endif diff --git a/src/os/filestore/SequencerPosition.h b/src/os/filestore/SequencerPosition.h new file mode 100644 index 000000000000..c2ecf853bcf3 --- /dev/null +++ b/src/os/filestore/SequencerPosition.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef __CEPH_OS_SEQUENCERPOSITION_H +#define __CEPH_OS_SEQUENCERPOSITION_H + +#include "include/types.h" +#include "include/cmp.h" +#include "include/encoding.h" +#include "common/Formatter.h" + +#include + +/** + * transaction and op offset + */ +struct SequencerPosition { + uint64_t seq; ///< seq + uint32_t trans; ///< transaction in that seq (0-based) + uint32_t op; ///< op in that transaction (0-based) + + SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(seq, bl); + ::encode(trans, bl); + ::encode(op, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator& p) { + DECODE_START(1, p); + ::decode(seq, p); + ::decode(trans, p); + ::decode(op, p); + DECODE_FINISH(p); + } + void dump(Formatter *f) const { + f->dump_unsigned("seq", seq); + f->dump_unsigned("trans", trans); + f->dump_unsigned("op", op); + } + static void generate_test_instances(list& o) { + o.push_back(new SequencerPosition); + o.push_back(new SequencerPosition(1, 2, 3)); + o.push_back(new SequencerPosition(4, 5, 6)); + } +}; +WRITE_CLASS_ENCODER(SequencerPosition) + +inline ostream& operator<<(ostream& out, const SequencerPosition& t) { + return out << 
t.seq << "." << t.trans << "." << t.op; +} + +WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op) +WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op) + + +#endif diff --git a/src/os/filestore/WBThrottle.cc b/src/os/filestore/WBThrottle.cc new file mode 100644 index 000000000000..6eb559dc0164 --- /dev/null +++ b/src/os/filestore/WBThrottle.cc @@ -0,0 +1,267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "acconfig.h" + +#include "os/filestore/WBThrottle.h" +#include "common/perf_counters.h" + +WBThrottle::WBThrottle(CephContext *cct) : + cur_ios(0), cur_size(0), + cct(cct), + logger(NULL), + stopping(true), + lock("WBThrottle::lock", false, true, false, cct), + fs(XFS) +{ + { + Mutex::Locker l(lock); + set_from_conf(); + } + assert(cct); + PerfCountersBuilder b( + cct, string("WBThrottle"), + l_wbthrottle_first, l_wbthrottle_last); + b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data"); + b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data"); + b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations"); + b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations"); + b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write"); + b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries"); + logger = b.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); + for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i) + logger->set(i, 0); + + cct->_conf->add_observer(this); +} + +WBThrottle::~WBThrottle() { + assert(cct); + cct->get_perfcounters_collection()->remove(logger); + delete logger; + cct->_conf->remove_observer(this); +} + +void WBThrottle::start() +{ + { + Mutex::Locker l(lock); + stopping = false; + } + create(); +} + +void WBThrottle::stop() +{ + { + Mutex::Locker l(lock); + stopping = true; + cond.Signal(); + } + + join(); +} + +const char** WBThrottle::get_tracked_conf_keys() 
const +{ + static const char* KEYS[] = { + "filestore_wbthrottle_btrfs_bytes_start_flusher", + "filestore_wbthrottle_btrfs_bytes_hard_limit", + "filestore_wbthrottle_btrfs_ios_start_flusher", + "filestore_wbthrottle_btrfs_ios_hard_limit", + "filestore_wbthrottle_btrfs_inodes_start_flusher", + "filestore_wbthrottle_btrfs_inodes_hard_limit", + "filestore_wbthrottle_xfs_bytes_start_flusher", + "filestore_wbthrottle_xfs_bytes_hard_limit", + "filestore_wbthrottle_xfs_ios_start_flusher", + "filestore_wbthrottle_xfs_ios_hard_limit", + "filestore_wbthrottle_xfs_inodes_start_flusher", + "filestore_wbthrottle_xfs_inodes_hard_limit", + NULL + }; + return KEYS; +} + +void WBThrottle::set_from_conf() +{ + assert(lock.is_locked()); + if (fs == BTRFS) { + size_limits.first = + cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher; + size_limits.second = + cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit; + io_limits.first = + cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher; + io_limits.second = + cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit; + fd_limits.first = + cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher; + fd_limits.second = + cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit; + } else if (fs == XFS) { + size_limits.first = + cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher; + size_limits.second = + cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit; + io_limits.first = + cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher; + io_limits.second = + cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit; + fd_limits.first = + cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher; + fd_limits.second = + cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit; + } else { + assert(0 == "invalid value for fs"); + } + cond.Signal(); +} + +void WBThrottle::handle_conf_change(const md_config_t *conf, + const std::set &changed) +{ + Mutex::Locker l(lock); + for (const char** i = get_tracked_conf_keys(); *i; ++i) { + if 
(changed.count(*i)) { + set_from_conf(); + return; + } + } +} + +bool WBThrottle::get_next_should_flush( + boost::tuple *next) +{ + assert(lock.is_locked()); + assert(next); + while (!stopping && !beyond_limit()) + cond.Wait(lock); + if (stopping) + return false; + assert(!pending_wbs.empty()); + ghobject_t obj(pop_object()); + + ceph::unordered_map >::iterator i = + pending_wbs.find(obj); + *next = boost::make_tuple(obj, i->second.second, i->second.first); + pending_wbs.erase(i); + return true; +} + + +void *WBThrottle::entry() +{ + Mutex::Locker l(lock); + boost::tuple wb; + while (get_next_should_flush(&wb)) { + clearing = wb.get<0>(); + cur_ios -= wb.get<2>().ios; + logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios); + logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios); + cur_size -= wb.get<2>().size; + logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size); + logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size); + logger->dec(l_wbthrottle_inodes_dirtied); + logger->inc(l_wbthrottle_inodes_wb); + lock.Unlock(); +#ifdef HAVE_FDATASYNC + ::fdatasync(**wb.get<1>()); +#else + ::fsync(**wb.get<1>()); +#endif +#ifdef HAVE_POSIX_FADVISE + if (g_conf->filestore_fadvise && wb.get<2>().nocache) { + int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED); + assert(fa_r == 0); + } +#endif + lock.Lock(); + clearing = ghobject_t(); + cond.Signal(); + wb = boost::tuple(); + } + return 0; +} + +void WBThrottle::queue_wb( + FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len, + bool nocache) +{ + Mutex::Locker l(lock); + ceph::unordered_map >::iterator wbiter = + pending_wbs.find(hoid); + if (wbiter == pending_wbs.end()) { + wbiter = pending_wbs.insert( + make_pair(hoid, + make_pair( + PendingWB(), + fd))).first; + logger->inc(l_wbthrottle_inodes_dirtied); + } else { + remove_object(hoid); + } + + cur_ios++; + logger->inc(l_wbthrottle_ios_dirtied); + cur_size += len; + logger->inc(l_wbthrottle_bytes_dirtied, len); + + 
wbiter->second.first.add(nocache, len, 1); + insert_object(hoid); + if (beyond_limit()) + cond.Signal(); +} + +void WBThrottle::clear() +{ + Mutex::Locker l(lock); + for (ceph::unordered_map >::iterator i = + pending_wbs.begin(); + i != pending_wbs.end(); + ++i) { +#ifdef HAVE_POSIX_FADVISE + if (g_conf->filestore_fadvise && i->second.first.nocache) { + int fa_r = posix_fadvise(**i->second.second, 0, 0, POSIX_FADV_DONTNEED); + assert(fa_r == 0); + } +#endif + + } + cur_ios = cur_size = 0; + logger->set(l_wbthrottle_ios_dirtied, 0); + logger->set(l_wbthrottle_bytes_dirtied, 0); + logger->set(l_wbthrottle_inodes_dirtied, 0); + pending_wbs.clear(); + lru.clear(); + rev_lru.clear(); + cond.Signal(); +} + +void WBThrottle::clear_object(const ghobject_t &hoid) +{ + Mutex::Locker l(lock); + while (clearing == hoid) + cond.Wait(lock); + ceph::unordered_map >::iterator i = + pending_wbs.find(hoid); + if (i == pending_wbs.end()) + return; + + cur_ios -= i->second.first.ios; + logger->dec(l_wbthrottle_ios_dirtied, i->second.first.ios); + cur_size -= i->second.first.size; + logger->dec(l_wbthrottle_bytes_dirtied, i->second.first.size); + logger->dec(l_wbthrottle_inodes_dirtied); + + pending_wbs.erase(i); + remove_object(hoid); + cond.Signal(); +} + +void WBThrottle::throttle() +{ + Mutex::Locker l(lock); + while (!stopping && need_flush()) + cond.Wait(lock); +} diff --git a/src/os/filestore/WBThrottle.h b/src/os/filestore/WBThrottle.h new file mode 100644 index 000000000000..f06ec877b2d4 --- /dev/null +++ b/src/os/filestore/WBThrottle.h @@ -0,0 +1,188 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef WBTHROTTLE_H +#define WBTHROTTLE_H + +#include "include/unordered_map.h" +#include +#include "include/memory.h" +#include "common/Formatter.h" +#include "common/hobject.h" +#include "include/interval_set.h" +#include "FDCache.h" +#include "common/Thread.h" +#include "common/ceph_context.h" + +class PerfCounters; +enum { + l_wbthrottle_first = 999090, + l_wbthrottle_bytes_dirtied, + l_wbthrottle_bytes_wb, + l_wbthrottle_ios_dirtied, + l_wbthrottle_ios_wb, + l_wbthrottle_inodes_dirtied, + l_wbthrottle_inodes_wb, + l_wbthrottle_last +}; + +/** + * WBThrottle + * + * Tracks, throttles, and flushes outstanding IO + */ +class WBThrottle : Thread, public md_config_obs_t { + ghobject_t clearing; + /* *_limits.first is the start_flusher limit and + * *_limits.second is the hard limit + */ + + /// Limits on unflushed bytes + pair size_limits; + + /// Limits on unflushed ios + pair io_limits; + + /// Limits on unflushed objects + pair fd_limits; + + uint64_t cur_ios; /// Currently unflushed IOs + uint64_t cur_size; /// Currently unflushed bytes + + /** + * PendingWB tracks the ios pending on an object. 
+ */ + class PendingWB { + public: + bool nocache; + uint64_t size; + uint64_t ios; + PendingWB() : nocache(true), size(0), ios(0) {} + void add(bool _nocache, uint64_t _size, uint64_t _ios) { + if (!_nocache) + nocache = false; // only nocache if all writes are nocache + size += _size; + ios += _ios; + } + }; + + CephContext *cct; + PerfCounters *logger; + bool stopping; + Mutex lock; + Cond cond; + + + /** + * Flush objects in lru order + */ + list lru; + ceph::unordered_map::iterator> rev_lru; + void remove_object(const ghobject_t &oid) { + assert(lock.is_locked()); + ceph::unordered_map::iterator>::iterator iter = + rev_lru.find(oid); + if (iter == rev_lru.end()) + return; + + lru.erase(iter->second); + rev_lru.erase(iter); + } + ghobject_t pop_object() { + assert(!lru.empty()); + ghobject_t oid(lru.front()); + lru.pop_front(); + rev_lru.erase(oid); + return oid; + } + void insert_object(const ghobject_t &oid) { + assert(rev_lru.find(oid) == rev_lru.end()); + lru.push_back(oid); + rev_lru.insert(make_pair(oid, --lru.end())); + } + + ceph::unordered_map > pending_wbs; + + /// get next flush to perform + bool get_next_should_flush( + boost::tuple *next ///< [out] next to flush + ); ///< @return false if we are shutting down +public: + enum FS { + BTRFS, + XFS + }; + +private: + FS fs; + + void set_from_conf(); + bool beyond_limit() const { + if (cur_ios < io_limits.first && + pending_wbs.size() < fd_limits.first && + cur_size < size_limits.first) + return false; + else + return true; + } + bool need_flush() const { + if (cur_ios < io_limits.second && + pending_wbs.size() < fd_limits.second && + cur_size < size_limits.second) + return false; + else + return true; + } + +public: + WBThrottle(CephContext *cct); + ~WBThrottle(); + + void start(); + void stop(); + /// Set fs as XFS or BTRFS + void set_fs(FS new_fs) { + Mutex::Locker l(lock); + fs = new_fs; + set_from_conf(); + } + + /// Queue wb on oid, fd taking throttle (does not block) + void queue_wb( + FDRef fd, 
///< [in] FDRef to oid + const ghobject_t &oid, ///< [in] object + uint64_t offset, ///< [in] offset written + uint64_t len, ///< [in] length written + bool nocache ///< [in] try to clear out of cache after write + ); + + /// Clear all wb (probably due to sync) + void clear(); + + /// Clear object + void clear_object(const ghobject_t &oid); + + /// Block until there is throttle available + void throttle(); + + /// md_config_obs_t + const char** get_tracked_conf_keys() const; + void handle_conf_change(const md_config_t *conf, + const std::set &changed); + + /// Thread + void *entry(); +}; + +#endif diff --git a/src/os/filestore/XfsFileStoreBackend.cc b/src/os/filestore/XfsFileStoreBackend.cc new file mode 100644 index 000000000000..365692ca8671 --- /dev/null +++ b/src/os/filestore/XfsFileStoreBackend.cc @@ -0,0 +1,148 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "XfsFileStoreBackend.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include "common/errno.h" +#include "common/linux_version.h" +#include "include/assert.h" +#include "include/compat.h" + +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") " + +XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs): + GenericFileStoreBackend(fs), m_has_extsize(false) { } + +/* + * Set extsize attr on a file to val. Should be a free-standing + * function, but dout_prefix expanding to a call to get_basedir_path() + * protected member function won't let it. 
+ */ +int XfsFileStoreBackend::set_extsize(int fd, unsigned int val) +{ + struct fsxattr fsx; + struct stat sb; + int ret; + + if (fstat(fd, &sb) < 0) { + ret = -errno; + dout(0) << "set_extsize: fstat: " << cpp_strerror(ret) << dendl; + return ret; + } + if (!S_ISREG(sb.st_mode)) { + dout(0) << "set_extsize: invalid target file type" << dendl; + return -EINVAL; + } + + if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) { + ret = -errno; + dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(ret) << dendl; + return ret; + } + + // already set? + if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val) + return 0; + + // xfs won't change extent size if any extents are allocated + if (fsx.fsx_nextents != 0) + return 0; + + fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE; + fsx.fsx_extsize = val; + + if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) { + ret = -errno; + dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(ret) << dendl; + return ret; + } + + return 0; +} + +int XfsFileStoreBackend::detect_features() +{ + int ret; + + ret = GenericFileStoreBackend::detect_features(); + if (ret < 0) + return ret; + + // extsize? 
+ int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600); + if (fd < 0) { + ret = -errno; + dout(0) << "detect_feature: failed to create test file for extsize attr: " + << cpp_strerror(ret) << dendl; + goto out; + } + if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) { + ret = -errno; + dout(0) << "detect_feature: failed to unlink test file for extsize attr: " + << cpp_strerror(ret) << dendl; + goto out_close; + } + + if (g_conf->filestore_xfs_extsize) { + ret = set_extsize(fd, 1U << 15); // a few pages + if (ret) { + ret = 0; + dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl; + goto out_close; + } + + // make sure we have 3.5 or newer, which includes this fix + // aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d + // for this set_extsize bug + // http://oss.sgi.com/bugzilla/show_bug.cgi?id=874 + int ver = get_linux_version(); + if (ver == 0) { + dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl; + m_has_extsize = false; + } else if (ver < KERNEL_VERSION(3, 5, 0)) { + dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl; + m_has_extsize = false; + } else { + dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl; + m_has_extsize = true; + } + } else { + dout(0) << "detect_feature: extsize is disabled by conf" << dendl; + } + +out_close: + TEMP_FAILURE_RETRY(::close(fd)); +out: + return ret; +} + +int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint) +{ + if (!m_has_extsize) + return -EOPNOTSUPP; + + assert(hint < UINT_MAX); + return set_extsize(fd, hint); +} diff --git a/src/os/filestore/XfsFileStoreBackend.h b/src/os/filestore/XfsFileStoreBackend.h new file mode 100644 index 000000000000..84d4694368d4 --- /dev/null +++ b/src/os/filestore/XfsFileStoreBackend.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 
smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_XFSFILESTOREBACKEND_H +#define CEPH_XFSFILESTOREBACKEND_H + +#include "GenericFileStoreBackend.h" + +#include "include/int_types.h" + +class XfsFileStoreBackend : public GenericFileStoreBackend { +private: + bool m_has_extsize; + int set_extsize(int fd, unsigned int val); +public: + XfsFileStoreBackend(FileStore *fs); + ~XfsFileStoreBackend() {} + const char *get_name() { + return "xfs"; + } + int detect_features(); + int set_alloc_hint(int fd, uint64_t hint); +}; + +#endif /* CEPH_XFSFILESTOREBACKEND_H */ diff --git a/src/os/filestore/ZFSFileStoreBackend.cc b/src/os/filestore/ZFSFileStoreBackend.cc new file mode 100644 index 000000000000..aa52b8d29339 --- /dev/null +++ b/src/os/filestore/ZFSFileStoreBackend.cc @@ -0,0 +1,260 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/int_types.h" +#include "include/types.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "include/compat.h" +#include "include/linux_fiemap.h" +#include "include/color.h" +#include "include/buffer.h" +#include "include/assert.h" + +#include +#include +#include + +#include "common/errno.h" +#include "common/config.h" +#include "common/sync_filesystem.h" + +#ifdef HAVE_LIBZFS + +#include "ZFSFileStoreBackend.h" + +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") " + +ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs) : + GenericFileStoreBackend(fs), base_zh(NULL), current_zh(NULL), + m_filestore_zfs_snap(g_conf->filestore_zfs_snap) +{ + int ret = 
zfs.init(); + if (ret < 0) { + dout(0) << "ZFSFileStoreBackend: failed to init libzfs" << dendl; + return; + } + + base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM); + if (!base_zh) { + dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl; + return; + } + + update_current_zh(); +} + +ZFSFileStoreBackend::~ZFSFileStoreBackend() +{ + if (base_zh) + zfs.close(base_zh); + if (current_zh) + zfs.close(current_zh); +} + +int ZFSFileStoreBackend::update_current_zh() +{ + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); + ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM); + if (zh) { + char *mnt; + if (zfs.is_mounted(zh, &mnt)) { + int ret = get_current_path() == mnt; + free(mnt); + if (ret) { + current_zh = zh; + return 0; + } + } else { + int ret = zfs.mount(zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh) + << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + } + zfs.close(zh); + } else { + dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl; + return -ENOENT; + } + + zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM); + if (zh) { + if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) { + current_zh = zh; + return 0; + } + zfs.close(zh); + dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl; + } else { + dout(0) << "update_current_zh: current/ not exist" << dendl; + } + return -ENOENT; +} + +int ZFSFileStoreBackend::detect_features() +{ + if (!current_zh) + dout(0) << "detect_features: null zfs handle for current/" << dendl; + return 0; +} + +bool ZFSFileStoreBackend::can_checkpoint() +{ + return m_filestore_zfs_snap && current_zh != NULL; +} + +int ZFSFileStoreBackend::create_current() +{ + struct stat st; + int ret = ::stat(get_current_path().c_str(), &st); + if (ret == 0) { + // current/ exists + if 
(!S_ISDIR(st.st_mode)) { + dout(0) << "create_current: current/ exists but is not a directory" << dendl; + return -ENOTDIR; + } + return 0; + } else if (errno != ENOENT) { + ret = -errno; + dout(0) << "create_current: cannot stat current/ " << cpp_strerror(ret) << dendl; + return ret; + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh)); + ret = zfs.create(path, ZFS::TYPE_FILESYSTEM); + if (ret < 0 && errno != EEXIST) { + ret = -errno; + dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + + ret = update_current_zh(); + return ret; +} + +static int list_checkpoints_callback(ZFS::Handle *zh, void *data) +{ + list *ls = static_cast *>(data); + string str = ZFS::get_name(zh); + size_t pos = str.find('@'); + assert(pos != string::npos && pos + 1 != str.length()); + ls->push_back(str.substr(pos + 1)); + return 0; +} + +int ZFSFileStoreBackend::list_checkpoints(list& ls) +{ + dout(10) << "list_checkpoints:" << dendl; + if (!current_zh) + return -EINVAL; + + list snaps; + int ret = zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &snaps); + if (ret < 0) { + ret = -errno; + dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(ret) << dendl; + return ret; + } + ls.swap(snaps); + return 0; +} + +int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid) +{ + dout(10) << "create_checkpoint: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + // looks like zfsonlinux doesn't flush dirty data when taking snapshot + int ret = sync_filesystem(get_current_fd()); + if (ret < 0) { + ret = -errno; + dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(ret) << dendl; + return ret; + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); + ret = zfs.snapshot(path, false); + if (ret < 0) { + ret = -errno; + dout(0) << "create_checkpoint: 
zfs_snapshot '" << path << "' got" << cpp_strerror(ret) << dendl; + return ret; + } + if (cid) + *cid = 0; + return 0; +} + +int ZFSFileStoreBackend::rollback_to(const string& name) +{ + dout(10) << "rollback_to: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + // umount current to avoid triggering online rollback deadlock + int ret; + if (zfs.is_mounted(current_zh, NULL)) { + ret = zfs.umount(current_zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl; + } + } + + char path[PATH_MAX]; + snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str()); + + ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT); + if (!snap_zh) { + dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl; + return -ENOENT; + } + + ret = zfs.rollback(current_zh, snap_zh, false); + if (ret < 0) { + ret = -errno; + dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl; + } + + if (!zfs.is_mounted(current_zh, NULL)) { + int ret = zfs.mount(current_zh, NULL, 0); + if (ret < 0) { + ret = -errno; + dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl; + return ret; + } + } + + zfs.close(snap_zh); + return ret; +} + +int ZFSFileStoreBackend::destroy_checkpoint(const string& name) +{ + dout(10) << "destroy_checkpoint: '" << name << "'" << dendl; + if (!current_zh) + return -EINVAL; + + int ret = zfs.destroy_snaps(current_zh, name.c_str(), true); + if (ret < 0) { + ret = -errno; + dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl; + } + return ret; +} +#endif diff --git a/src/os/filestore/ZFSFileStoreBackend.h b/src/os/filestore/ZFSFileStoreBackend.h new file mode 100644 index 000000000000..8186d9ca957d --- /dev/null +++ b/src/os/filestore/ZFSFileStoreBackend.h @@ -0,0 +1,30 @@ 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_ZFSFILESTOREBACKEND_H +#define CEPH_ZFSFILESTOREBACKEND_H + +#ifdef HAVE_LIBZFS +#include "GenericFileStoreBackend.h" +#include "ZFS.h" + +class ZFSFileStoreBackend : public GenericFileStoreBackend { +private: + ZFS zfs; + ZFS::Handle *base_zh; + ZFS::Handle *current_zh; + bool m_filestore_zfs_snap; + int update_current_zh(); +public: + ZFSFileStoreBackend(FileStore *fs); + ~ZFSFileStoreBackend(); + int detect_features(); + bool can_checkpoint(); + int create_current(); + int list_checkpoints(list& ls); + int create_checkpoint(const string& name, uint64_t *cid); + int rollback_to(const string& name); + int destroy_checkpoint(const string& name); +}; +#endif +#endif diff --git a/src/os/filestore/chain_xattr.cc b/src/os/filestore/chain_xattr.cc new file mode 100644 index 000000000000..28bb87bed1e0 --- /dev/null +++ b/src/os/filestore/chain_xattr.cc @@ -0,0 +1,467 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "chain_xattr.h" + +#include "include/int_types.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "include/assert.h" + +#if defined(__linux__) +#include +#endif + +#include "common/xattr.h" +#include "include/compat.h" + +/* + * chaining xattrs + * + * In order to support xattrs that are larger than the xattr size limit that some file systems + * impose, we use multiple xattrs to store the value of a single xattr. The xattrs keys + * are set as follows: + * The first xattr in the chain, has a key that holds the original xattr name, with any '@' char + * being escaped ("@@"). + * The chained keys will have the first xattr's key (with the escaping), and a suffix: "@<id>" + * where <id> marks the num of xattr in the chain. 
+ */ + +static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len) +{ + int pos = 0; + + while (*name) { + switch (*name) { + case '@': /* escape it */ + pos += 2; + assert (pos < raw_len - 1); + *raw_name = '@'; + raw_name++; + *raw_name = '@'; + break; + default: + pos++; + assert(pos < raw_len - 1); + *raw_name = *name; + break; + } + name++; + raw_name++; + } + + if (!i) { + *raw_name = '\0'; + } else { + int r = snprintf(raw_name, raw_len - pos, "@%d", i); + assert(r < raw_len - pos); + } +} + +static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first) +{ + int pos = 0; + + *is_first = true; + while (*raw_name) { + switch (*raw_name) { + case '@': /* escape it */ + raw_name++; + if (!*raw_name) + break; + if (*raw_name != '@') { + *is_first = false; + goto done; + } + + /* fall through */ + default: + *name = *raw_name; + break; + } + pos++; + assert(pos < name_len); + name++; + raw_name++; + } +done: + *name = '\0'; + return pos; +} + + +// setxattr + +static int getxattr_len(const char *fn, const char *name) +{ + int i = 0, total = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_getxattr(fn, raw_name, 0, 0); + if (!i && r < 0) + return r; + if (r < 0) + break; + total += r; + i++; + } while (r == CHAIN_XATTR_MAX_BLOCK_LEN || + r == CHAIN_XATTR_SHORT_BLOCK_LEN); + + return total; +} + +int chain_getxattr(const char *fn, const char *name, void *val, size_t size) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + int r; + size_t chunk_size; + + if (!size) + return getxattr_len(fn, name); + + do { + chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? 
size : CHAIN_XATTR_MAX_BLOCK_LEN); + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + + r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size); + if (i && r == -ENODATA) { + ret = pos; + break; + } + if (r < 0) { + ret = r; + break; + } + + if (r > 0) { + pos += r; + size -= r; + } + + i++; + } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN || + r == CHAIN_XATTR_SHORT_BLOCK_LEN)); + + if (r >= 0) { + ret = pos; + /* is there another chunk? that can happen if the last read size span over + exactly one block */ + if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN || + chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_getxattr(fn, raw_name, 0, 0); + if (r > 0) { // there's another chunk.. the original buffer was too small + ret = -ERANGE; + } + } + } + return ret; +} + +static int chain_fgetxattr_len(int fd, const char *name) +{ + int i = 0, total = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fgetxattr(fd, raw_name, 0, 0); + if (!i && r < 0) + return r; + if (r < 0) + break; + total += r; + i++; + } while (r == CHAIN_XATTR_MAX_BLOCK_LEN || + r == CHAIN_XATTR_SHORT_BLOCK_LEN); + + return total; +} + +int chain_fgetxattr(int fd, const char *name, void *val, size_t size) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + int r; + size_t chunk_size; + + if (!size) + return chain_fgetxattr_len(fd, name); + + do { + chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? 
size : CHAIN_XATTR_MAX_BLOCK_LEN); + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + + r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size); + if (i && r == -ENODATA) { + ret = pos; + break; + } + if (r < 0) { + ret = r; + break; + } + + if (r > 0) { + pos += r; + size -= r; + } + + i++; + } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN || + r == CHAIN_XATTR_SHORT_BLOCK_LEN)); + + if (r >= 0) { + ret = pos; + /* is there another chunk? that can happen if the last read size span over + exactly one block */ + if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN || + chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fgetxattr(fd, raw_name, 0, 0); + if (r > 0) { // there's another chunk.. the original buffer was too small + ret = -ERANGE; + } + } + } + return ret; +} + + +// setxattr + +static int get_xattr_block_size(size_t size) +{ + if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD) + // this may fit in the inode; stripe over short attrs so that XFS + // won't kick it out. + return CHAIN_XATTR_SHORT_BLOCK_LEN; + return CHAIN_XATTR_MAX_BLOCK_LEN; +} + +int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + size_t max_chunk_size = get_xattr_block_size(size); + + do { + size_t chunk_size = (size < max_chunk_size ? 
size : max_chunk_size); + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + size -= chunk_size; + + int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size); + if (r < 0) { + ret = r; + break; + } + pos += chunk_size; + ret = pos; + i++; + } while (size); + + if (ret >= 0 && !onechunk) { + int r; + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_removexattr(fn, raw_name); + if (r < 0 && r != -ENODATA) + ret = r; + i++; + } while (r != -ENODATA); + } + + return ret; +} + +int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk) +{ + int i = 0, pos = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int ret = 0; + size_t max_chunk_size = get_xattr_block_size(size); + + do { + size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size); + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + size -= chunk_size; + + int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size); + if (r < 0) { + ret = r; + break; + } + pos += chunk_size; + ret = pos; + i++; + } while (size); + + if (ret >= 0 && !onechunk) { + int r; + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fremovexattr(fd, raw_name); + if (r < 0 && r != -ENODATA) + ret = r; + i++; + } while (r != -ENODATA); + } + + return ret; +} + + +// removexattr + +int chain_removexattr(const char *fn, const char *name) +{ + int i = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_removexattr(fn, raw_name); + if (!i && r < 0) { + return r; + } + i++; + } while (r >= 0); + return 0; +} + +int chain_fremovexattr(int fd, const char *name) +{ + int i = 0; + char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int r; + + do { + get_raw_xattr_name(name, i, raw_name, sizeof(raw_name)); + r = sys_fremovexattr(fd, raw_name); + if (!i && r < 0) { + return r; + } + i++; + } while (r >= 0); + return 0; +} + + +// 
listxattr + +int chain_listxattr(const char *fn, char *names, size_t len) { + int r; + + if (!len) + return sys_listxattr(fn, names, len) * 2; + + r = sys_listxattr(fn, 0, 0); + if (r < 0) + return r; + + size_t total_len = r * 2; // should be enough + char *full_buf = (char *)malloc(total_len); + if (!full_buf) + return -ENOMEM; + + r = sys_listxattr(fn, full_buf, total_len); + if (r < 0) { + free(full_buf); + return r; + } + + char *p = full_buf; + const char *end = full_buf + r; + char *dest = names; + char *dest_end = names + len; + + while (p < end) { + char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int attr_len = strlen(p); + bool is_first; + int name_len = translate_raw_name(p, name, sizeof(name), &is_first); + if (is_first) { + if (dest + name_len > dest_end) { + r = -ERANGE; + goto done; + } + strcpy(dest, name); + dest += name_len + 1; + } + p += attr_len + 1; + } + r = dest - names; + +done: + free(full_buf); + return r; +} + +int chain_flistxattr(int fd, char *names, size_t len) { + int r; + char *p; + const char * end; + char *dest; + char *dest_end; + + if (!len) + return sys_flistxattr(fd, names, len) * 2; + + r = sys_flistxattr(fd, 0, 0); + if (r < 0) + return r; + + size_t total_len = r * 2; // should be enough + char *full_buf = (char *)malloc(total_len); + if (!full_buf) + return -ENOMEM; + + r = sys_flistxattr(fd, full_buf, total_len); + if (r < 0) + goto done; + + p = full_buf; + end = full_buf + r; + dest = names; + dest_end = names + len; + + while (p < end) { + char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16]; + int attr_len = strlen(p); + bool is_first; + int name_len = translate_raw_name(p, name, sizeof(name), &is_first); + if (is_first) { + if (dest + name_len > dest_end) { + r = -ERANGE; + goto done; + } + strcpy(dest, name); + dest += name_len + 1; + } + p += attr_len + 1; + } + r = dest - names; + +done: + free(full_buf); + return r; +} diff --git a/src/os/filestore/chain_xattr.h b/src/os/filestore/chain_xattr.h new file mode 100644 index 
000000000000..6ee80508d094 --- /dev/null +++ b/src/os/filestore/chain_xattr.h @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef __CEPH_OSD_CHAIN_XATTR_H +#define __CEPH_OSD_CHAIN_XATTR_H + +#include "common/xattr.h" + +#include + +#if defined(__linux__) +#include +#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2) +#elif defined(__APPLE__) +#include +#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2) +#else +#define CHAIN_XATTR_MAX_NAME_LEN 128 +#endif + +#define CHAIN_XATTR_MAX_BLOCK_LEN 2048 + +/* + * XFS will only inline xattrs < 255 bytes, so for xattrs that are + * likely to fit in the inode, stripe over short xattrs. + */ +#define CHAIN_XATTR_SHORT_BLOCK_LEN 250 +#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000 + +// wrappers to hide annoying errno handling. + +static inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size) +{ + int r = ::ceph_os_fgetxattr(fd, name, val, size); + return (r < 0 ? -errno : r); +} +static inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size) +{ + int r = ::ceph_os_getxattr(fn, name, val, size); + return (r < 0 ? -errno : r); +} + +static inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size) +{ + int r = ::ceph_os_setxattr(fn, name, val, size); + return (r < 0 ? -errno : r); +} +static inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size) +{ + int r = ::ceph_os_fsetxattr(fd, name, val, size); + return (r < 0 ? -errno : r); +} + +static inline int sys_listxattr(const char *fn, char *names, size_t len) +{ + int r = ::ceph_os_listxattr(fn, names, len); + return (r < 0 ? -errno : r); +} +static inline int sys_flistxattr(int fd, char *names, size_t len) +{ + int r = ::ceph_os_flistxattr(fd, names, len); + return (r < 0 ? 
-errno : r); +} + +static inline int sys_removexattr(const char *fn, const char *name) +{ + int r = ::ceph_os_removexattr(fn, name); + return (r < 0 ? -errno : r); +} +static inline int sys_fremovexattr(int fd, const char *name) +{ + int r = ::ceph_os_fremovexattr(fd, name); + return (r < 0 ? -errno : r); +} + + +// wrappers to chain large values across multiple xattrs + +int chain_getxattr(const char *fn, const char *name, void *val, size_t size); +int chain_fgetxattr(int fd, const char *name, void *val, size_t size); +int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk=false); +int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk=false); +int chain_listxattr(const char *fn, char *names, size_t len); +int chain_flistxattr(int fd, char *names, size_t len); +int chain_removexattr(const char *fn, const char *name); +int chain_fremovexattr(int fd, const char *name); + +#endif diff --git a/src/test/ObjectMap/test_object_map.cc b/src/test/ObjectMap/test_object_map.cc index 6af60cf50333..0d504df576a4 100644 --- a/src/test/ObjectMap/test_object_map.cc +++ b/src/test/ObjectMap/test_object_map.cc @@ -7,8 +7,8 @@ #include "include/buffer.h" #include "test/ObjectMap/KeyValueDBMemory.h" #include "kv/KeyValueDB.h" -#include "os/DBObjectMap.h" -#include "os/HashIndex.h" +#include "os/filestore/DBObjectMap.h" +#include "os/filestore/HashIndex.h" #include #include "global/global_init.h" #include "common/ceph_argparse.h" diff --git a/src/test/bench/small_io_bench_fs.cc b/src/test/bench/small_io_bench_fs.cc index 4b273e410a89..55b2aaeba7b8 100644 --- a/src/test/bench/small_io_bench_fs.cc +++ b/src/test/bench/small_io_bench_fs.cc @@ -20,7 +20,7 @@ #include "detailed_stat_collector.h" #include "distribution.h" #include "global/global_init.h" -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include "testfilestore_backend.h" #include "common/perf_counters.h" diff --git 
a/src/test/encoding/types.h b/src/test/encoding/types.h index 3ea3a01f198d..add2a0d680d4 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -108,7 +108,7 @@ TYPE(HitSet::Params) #include "os/ObjectStore.h" TYPE(ObjectStore::Transaction) -#include "os/SequencerPosition.h" +#include "os/filestore/SequencerPosition.h" TYPE(SequencerPosition) #include "os/bluestore/bluestore_types.h" @@ -144,7 +144,7 @@ TYPE(MonCap) #include "mon/mon_types.h" TYPE(LevelDBStoreStats) -#include "os/DBObjectMap.h" +#include "os/filestore/DBObjectMap.h" TYPE(DBObjectMap::_Header) TYPE(DBObjectMap::State) diff --git a/src/test/filestore/TestFileStore.cc b/src/test/filestore/TestFileStore.cc index a9b58a1468fe..a100a5f3a69b 100644 --- a/src/test/filestore/TestFileStore.cc +++ b/src/test/filestore/TestFileStore.cc @@ -16,7 +16,7 @@ #include "common/ceph_argparse.h" #include "global/global_init.h" -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include class TestFileStore { diff --git a/src/test/objectstore/FileStoreDiff.cc b/src/test/objectstore/FileStoreDiff.cc index a49e4af37437..6a08e9c37c8d 100644 --- a/src/test/objectstore/FileStoreDiff.cc +++ b/src/test/objectstore/FileStoreDiff.cc @@ -15,7 +15,7 @@ #include #include #include "common/debug.h" -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include "common/config.h" #include "FileStoreDiff.h" diff --git a/src/test/objectstore/FileStoreDiff.h b/src/test/objectstore/FileStoreDiff.h index f7aedeee2e68..ba9cfb9837bc 100644 --- a/src/test/objectstore/FileStoreDiff.h +++ b/src/test/objectstore/FileStoreDiff.h @@ -18,7 +18,7 @@ #include #include #include "common/debug.h" -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include "common/config.h" class FileStoreDiff { diff --git a/src/test/objectstore/FileStoreTracker.h b/src/test/objectstore/FileStoreTracker.h index e350c80fe1f5..a324fa25e32b 100644 --- a/src/test/objectstore/FileStoreTracker.h +++ 
b/src/test/objectstore/FileStoreTracker.h @@ -3,7 +3,7 @@ #ifndef FILESTORE_TRACKER_H #define FILESTORE_TRACKER_H #include "test/common/ObjectContents.h" -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include "kv/KeyValueDB.h" #include #include diff --git a/src/test/objectstore/chain_xattr.cc b/src/test/objectstore/chain_xattr.cc index 9243150d66f8..5080321cfc65 100644 --- a/src/test/objectstore/chain_xattr.cc +++ b/src/test/objectstore/chain_xattr.cc @@ -21,7 +21,7 @@ #include #include -#include "os/chain_xattr.h" +#include "os/filestore/chain_xattr.h" #include "include/Context.h" #include "common/errno.h" #include "common/ceph_argparse.h" diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 7b1a7609d538..dbc2e2cdf676 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -18,7 +18,7 @@ #include #include #include "os/ObjectStore.h" -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include "os/KeyValueStore.h" #include "include/Context.h" #include "common/ceph_argparse.h" diff --git a/src/test/objectstore/test_idempotent.cc b/src/test/objectstore/test_idempotent.cc index d52f7dbf03b9..8c663a6bf244 100644 --- a/src/test/objectstore/test_idempotent.cc +++ b/src/test/objectstore/test_idempotent.cc @@ -15,7 +15,7 @@ #include #include #include -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include "global/global_init.h" #include "common/ceph_argparse.h" #include "common/debug.h" diff --git a/src/test/objectstore/test_idempotent_sequence.cc b/src/test/objectstore/test_idempotent_sequence.cc index 95bf196975ea..75ebc3377d3c 100644 --- a/src/test/objectstore/test_idempotent_sequence.cc +++ b/src/test/objectstore/test_idempotent_sequence.cc @@ -19,7 +19,7 @@ #include "common/ceph_argparse.h" #include "global/global_init.h" #include "common/debug.h" -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include "DeterministicOpSequence.h" 
#include "FileStoreDiff.h" diff --git a/src/test/os/TestLFNIndex.cc b/src/test/os/TestLFNIndex.cc index 5e44355c1e7a..ad4cb75e0119 100644 --- a/src/test/os/TestLFNIndex.cc +++ b/src/test/os/TestLFNIndex.cc @@ -21,8 +21,8 @@ #include #include -#include "os/LFNIndex.h" -#include "os/chain_xattr.h" +#include "os/filestore/LFNIndex.h" +#include "os/filestore/chain_xattr.h" #include "common/ceph_argparse.h" #include "global/global_init.h" #include diff --git a/src/test/test_filejournal.cc b/src/test/test_filejournal.cc index bd0ff5293384..d9dfb30d58dd 100644 --- a/src/test/test_filejournal.cc +++ b/src/test/test_filejournal.cc @@ -8,11 +8,11 @@ #include "global/global_init.h" #include "common/config.h" #include "common/Finisher.h" -#include "os/FileJournal.h" +#include "os/filestore/FileJournal.h" #include "include/Context.h" #include "common/Mutex.h" #include "common/safe_io.h" -#include "os/JournalingObjectStore.h" +#include "os/filestore/JournalingObjectStore.h" Finisher *finisher; Cond sync_cond; diff --git a/src/test/test_trans.cc b/src/test/test_trans.cc index c374ed440a76..b55de59ba839 100644 --- a/src/test/test_trans.cc +++ b/src/test/test_trans.cc @@ -15,7 +15,7 @@ #include #include "common/ceph_argparse.h" #include "common/debug.h" -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include "global/global_init.h" #include "include/assert.h" diff --git a/src/test/xattr_bench.cc b/src/test/xattr_bench.cc index e26e32f9ee70..544bb348689c 100644 --- a/src/test/xattr_bench.cc +++ b/src/test/xattr_bench.cc @@ -17,7 +17,7 @@ #include #include #include -#include "os/FileStore.h" +#include "os/filestore/FileStore.h" #include "include/Context.h" #include "common/ceph_argparse.h" #include "global/global_init.h" diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index 6c79be6eab91..23affbfc7ba7 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -25,7 +25,7 @@ #include 
"global/global_init.h" #include "os/ObjectStore.h" -#include "os/FileJournal.h" +#include "os/filestore/FileJournal.h" #include "osd/PGLog.h" #include "osd/OSD.h" diff --git a/src/tools/ceph_osdomap_tool.cc b/src/tools/ceph_osdomap_tool.cc index 465ffda9195e..ffb452883938 100644 --- a/src/tools/ceph_osdomap_tool.cc +++ b/src/tools/ceph_osdomap_tool.cc @@ -19,7 +19,7 @@ #include "common/errno.h" #include "global/global_init.h" -#include "os/DBObjectMap.h" +#include "os/filestore/DBObjectMap.h" #include "kv/KeyValueDB.h" namespace po = boost::program_options;