+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "include/int_types.h"
-#include "include/types.h"
-
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include "include/compat.h"
-#include "include/linux_fiemap.h"
-#include "include/color.h"
-#include "include/buffer.h"
-#include "include/assert.h"
-
-#ifndef __CYGWIN__
-#include "btrfs_ioctl.h"
-#endif
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-
-#include "BtrfsFileStoreBackend.h"
-
-#include "common/errno.h"
-#include "common/config.h"
-
-#if defined(__linux__)
-
-#define dout_subsys ceph_subsys_filestore
-#undef dout_prefix
-#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
-
-#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
-#define ALIGNED(x, by) (!((x) % (by)))
-#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
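-
-// Illustrative note (added, not from the original source): with a 4096-byte
-// block size, ALIGN_DOWN(10240, 4096) == 8192, ALIGNED(8192, 4096) is true,
-// and ALIGN_UP(10240, 4096) == 12288; clone_range() below relies on these to
-// round offsets to block boundaries before issuing BTRFS_IOC_CLONE_RANGE.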
-
-BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs):
- GenericFileStoreBackend(fs), has_clone_range(false),
- has_snap_create(false), has_snap_destroy(false),
- has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
- m_filestore_btrfs_clone_range(g_conf->filestore_btrfs_clone_range),
- m_filestore_btrfs_snap (g_conf->filestore_btrfs_snap) { }
-
-int BtrfsFileStoreBackend::detect_features()
-{
- int r;
-
- r = GenericFileStoreBackend::detect_features();
- if (r < 0)
- return r;
-
- // clone_range?
- if (m_filestore_btrfs_clone_range) {
- int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY, 0600);
- if (fd >= 0) {
- if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) {
- r = -errno;
- dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
- << cpp_strerror(r) << dendl;
- }
- btrfs_ioctl_clone_range_args clone_args;
- memset(&clone_args, 0, sizeof(clone_args));
- clone_args.src_fd = -1;
- r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
- if (r < 0 && errno == EBADF) {
- dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl;
- has_clone_range = true;
- } else {
- r = -errno;
- dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
- }
- TEMP_FAILURE_RETRY(::close(fd));
- } else {
- r = -errno;
- dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
- << cpp_strerror(r) << dendl;
- }
- } else {
- dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
- }
-
- struct btrfs_ioctl_vol_args vol_args;
- memset(&vol_args, 0, sizeof(vol_args));
-
- // create test source volume
- vol_args.fd = 0;
- strcpy(vol_args.name, "test_subvol");
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args);
- if (r != 0) {
- r = -errno;
- dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
- }
- int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY);
- if (srcfd < 0) {
- r = -errno;
- dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
- }
-
- // snap_create and snap_destroy?
- vol_args.fd = srcfd;
- strcpy(vol_args.name, "sync_snap_test");
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
- int err = errno;
- if (r == 0 || errno == EEXIST) {
- dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl;
- has_snap_create = true;
-
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (r == 0) {
- dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl;
- has_snap_destroy = true;
- } else {
- err = -errno;
- dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
-
- if (err == -EPERM && getuid() != 0) {
- dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
- cerr << TEXT_YELLOW
- << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
- << TEXT_NORMAL << std::endl;
- } else if (err == -EOPNOTSUPP) {
- derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
- }
- }
- } else {
- dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
- }
-
- if (m_filestore_btrfs_snap) {
- if (has_snap_destroy)
- stable_commits = true;
- else
- dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
- }
-
- // start_sync?
- __u64 transid = 0;
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid);
- if (r < 0) {
- int err = errno;
- dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl;
- }
- if (r == 0 && transid > 0) {
- dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl;
-
- // do we have wait_sync too?
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
- if (r == 0 || errno == ERANGE) {
- dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl;
- has_wait_sync = true;
- } else {
- int err = errno;
- dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
- }
- } else {
- int err = errno;
- dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
- }
-
- if (has_wait_sync) {
- // async snap creation?
- struct btrfs_ioctl_vol_args_v2 async_args;
- memset(&async_args, 0, sizeof(async_args));
- async_args.fd = srcfd;
- async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
- strcpy(async_args.name, "async_snap_test");
-
- // remove old one, first
- struct stat st;
- strcpy(vol_args.name, async_args.name);
- if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) {
- dout(0) << "detect_feature: removing old async_snap_test" << dendl;
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (r != 0) {
- int err = errno;
- dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
- }
- }
-
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
- if (r == 0 || errno == EEXIST) {
- dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl;
- has_snap_create_v2 = true;
-
- // clean up
- strcpy(vol_args.name, "async_snap_test");
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (r != 0) {
- int err = errno;
- dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
- }
- } else {
- int err = errno;
- dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl;
- }
- }
-
- // clean up test subvol
- if (srcfd >= 0)
- TEMP_FAILURE_RETRY(::close(srcfd));
-
- strcpy(vol_args.name, "test_subvol");
- r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (r < 0) {
- r = -errno;
- dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
- }
-
- if (m_filestore_btrfs_snap && !has_snap_create_v2) {
- dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
- cerr << TEXT_YELLOW
- << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
- << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
- << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
- << " performance.\n"
- << TEXT_NORMAL;
- }
-
- return 0;
-}
-
-bool BtrfsFileStoreBackend::can_checkpoint()
-{
- return stable_commits;
-}
-
-int BtrfsFileStoreBackend::create_current()
-{
- struct stat st;
- int ret = ::stat(get_current_path().c_str(), &st);
- if (ret == 0) {
- // current/ exists
- if (!S_ISDIR(st.st_mode)) {
- dout(0) << "create_current: current/ exists but is not a directory" << dendl;
- return -EINVAL;
- }
-
- struct stat basest;
- struct statfs currentfs;
- ret = ::fstat(get_basedir_fd(), &basest);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl;
- return ret;
- }
- ret = ::statfs(get_current_path().c_str(), &currentfs);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl;
- return ret;
- }
- if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) {
- dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl;
- stable_commits = true;
- }
- return 0;
- }
-
- struct btrfs_ioctl_vol_args volargs;
- memset(&volargs, 0, sizeof(volargs));
-
- volargs.fd = 0;
- strcpy(volargs.name, "current");
- if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) {
- ret = -errno;
- dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl;
- if (::chmod(get_current_path().c_str(), 0755) < 0) {
- ret = -errno;
- dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- stable_commits = true;
- return 0;
-}
-
-int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
-{
- int ret, err = 0;
-
- struct stat basest;
- ret = ::fstat(get_basedir_fd(), &basest);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- // get snap list
- DIR *dir = ::opendir(get_basedir_path().c_str());
- if (!dir) {
- ret = -errno;
- dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- list<string> snaps;
- char path[PATH_MAX];
- char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
- struct dirent *de;
- while (::readdir_r(dir, (struct dirent *)&buf, &de) == 0) {
- if (!de)
- break;
-
- snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name);
-
- struct stat st;
- ret = ::stat(path, &st);
- if (ret < 0) {
- err = -errno;
- dout(0) << "list_checkpoints: stat '" << path << "' failed: "
- << cpp_strerror(err) << dendl;
- break;
- }
-
- if (!S_ISDIR(st.st_mode))
- continue;
-
- struct statfs fs;
- ret = ::statfs(path, &fs);
- if (ret < 0) {
- err = -errno;
- dout(0) << "list_checkpoints: statfs '" << path << "' failed: "
- << cpp_strerror(err) << dendl;
- break;
- }
-
- if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev)
- snaps.push_back(string(de->d_name));
- }
-
- if (::closedir(dir) < 0) {
- ret = -errno;
- dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl;
- if (!err)
- err = ret;
- }
-
- if (err)
- return err;
-
- ls.swap(snaps);
- return 0;
-}
-
-int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid)
-{
- dout(10) << "create_checkpoint: '" << name << "'" << dendl;
- if (has_snap_create_v2 && transid) {
- struct btrfs_ioctl_vol_args_v2 async_args;
- memset(&async_args, 0, sizeof(async_args));
- async_args.fd = get_current_fd();
- async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
-
- size_t name_size = sizeof(async_args.name);
- strncpy(async_args.name, name.c_str(), name_size);
- async_args.name[name_size-1] = '\0';
-
- int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
- if (r < 0) {
- r = -errno;
- dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl;
- return r;
- }
- dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl;
- *transid = async_args.transid;
- } else {
- struct btrfs_ioctl_vol_args vol_args;
- memset(&vol_args, 0, sizeof(vol_args));
- vol_args.fd = get_current_fd();
-
- size_t name_size = sizeof(vol_args.name);
- strncpy(vol_args.name, name.c_str(), name_size);
- vol_args.name[name_size-1] = '\0';
-
- int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
- if (r < 0) {
- r = -errno;
- dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl;
- return r;
- }
- if (transid)
- *transid = 0;
- }
- return 0;
-}
-
-int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid)
-{
- // wait for commit
- dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl;
- int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl;
- return ret;
- }
- dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl;
- return 0;
-}
-
-int BtrfsFileStoreBackend::rollback_to(const string& name)
-{
- dout(10) << "rollback_to: to '" << name << "'" << dendl;
- char s[PATH_MAX];
- btrfs_ioctl_vol_args vol_args;
-
- memset(&vol_args, 0, sizeof(vol_args));
- vol_args.fd = 0;
- strcpy(vol_args.name, "current");
-
- int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (ret && errno != ENOENT) {
- dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl;
- snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
- if (::rename(get_current_path().c_str(), s)) {
- ret = -errno;
- dout(0) << "rollback_to: error renaming old current subvol: "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
- }
-
- snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str());
-
- // roll back
- vol_args.fd = ::open(s, O_RDONLY);
- if (vol_args.fd < 0) {
- ret = -errno;
- dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
- return ret;
- }
- ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
- if (ret < 0 ) {
- ret = -errno;
- dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl;
- }
- TEMP_FAILURE_RETRY(::close(vol_args.fd));
- return ret;
-}
-
-int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
-{
- dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
- btrfs_ioctl_vol_args vol_args;
- memset(&vol_args, 0, sizeof(vol_args));
- vol_args.fd = 0;
- strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name));
- vol_args.name[sizeof(vol_args.name)-1] = '\0';
-
- int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
- if (ret) {
- ret = -errno;
- dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl;
- return ret;
- }
- return 0;
-}
-
-int BtrfsFileStoreBackend::syncfs()
-{
- dout(15) << "syncfs" << dendl;
- // do a full btrfs commit
- int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl;
- }
- return ret;
-}
-
-int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
-{
- dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl;
- size_t blk_size = get_blksize();
- if (!has_clone_range ||
- srcoff % blk_size != dstoff % blk_size) {
- dout(20) << "clone_range: using copy" << dendl;
- return _copy_range(from, to, srcoff, len, dstoff);
- }
-
- int err = 0;
- int r = 0;
-
- uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size);
- uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size);
- if (srcoffclone >= srcoff + len) {
- dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl;
- return _copy_range(from, to, srcoff, len, dstoff);
- }
-
- uint64_t lenclone = len - (srcoffclone - srcoff);
- if (!ALIGNED(lenclone, blk_size)) {
- struct stat from_stat, to_stat;
- err = ::fstat(from, &from_stat);
- if (err) return -errno;
- err = ::fstat(to , &to_stat);
- if (err) return -errno;
-
- if (srcoff + len != (uint64_t)from_stat.st_size ||
- dstoff + len < (uint64_t)to_stat.st_size) {
- // Not to the end of the file, need to align length as well
- lenclone = ALIGN_DOWN(lenclone, blk_size);
- }
- }
- if (lenclone == 0) {
- // too short
- return _copy_range(from, to, srcoff, len, dstoff);
- }
-
- dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone
- << " to " << dstoffclone << " = " << r << dendl;
- btrfs_ioctl_clone_range_args a;
- a.src_fd = from;
- a.src_offset = srcoffclone;
- a.src_length = lenclone;
- a.dest_offset = dstoffclone;
- err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
- if (err >= 0) {
- r += err;
- } else if (errno == EINVAL) {
- // Still failed, might be compressed
- dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
- return _copy_range(from, to, srcoff, len, dstoff);
- } else {
- return -errno;
- }
-
- // Take care of any unaligned portion trimmed from the front
- if (srcoffclone != srcoff) {
- err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
- if (err >= 0) {
- r += err;
- } else {
- return err;
- }
- }
-
- // Copy end
- if (srcoffclone + lenclone != srcoff + len) {
- err = _copy_range(from, to,
- srcoffclone + lenclone,
- (srcoff + len) - (srcoffclone + lenclone),
- dstoffclone + lenclone);
- if (err >= 0) {
- r += err;
- } else {
- return err;
- }
- }
- dout(20) << "clone_range: finished " << srcoff << "~" << len
- << " to " << dstoff << " = " << r << dendl;
- return r;
-}
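-
-// Worked example (illustrative, not from the original source): with a 4096-byte
-// block size, clone_range(from, to, srcoff=1000, len=10000, dstoff=5096) takes
-// the clone path because 1000 % 4096 == 5096 % 4096; it rounds the start up to
-// srcoffclone=4096 / dstoffclone=8192, clones the block-aligned middle with
-// BTRFS_IOC_CLONE_RANGE, and falls back to _copy_range() for the unaligned
-// head [1000,4096) and any unaligned tail.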
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef CEPH_BTRFSFILESTOREBACKEND_H
-#define CEPH_BTRFSFILESTOREBACKEND_H
-
-#if defined(__linux__)
-#include "GenericFileStoreBackend.h"
-
-class BtrfsFileStoreBackend : public GenericFileStoreBackend {
-private:
- bool has_clone_range; ///< clone range ioctl is supported
- bool has_snap_create; ///< snap create ioctl is supported
- bool has_snap_destroy; ///< snap destroy ioctl is supported
- bool has_snap_create_v2; ///< snap create v2 ioctl (async!) is supported
- bool has_wait_sync; ///< wait sync ioctl is supported
- bool stable_commits;
- bool m_filestore_btrfs_clone_range;
- bool m_filestore_btrfs_snap;
-public:
- BtrfsFileStoreBackend(FileStore *fs);
- ~BtrfsFileStoreBackend() {}
- const char *get_name() {
- return "btrfs";
- }
- int detect_features();
- bool can_checkpoint();
- int create_current();
- int list_checkpoints(list<string>& ls);
- int create_checkpoint(const string& name, uint64_t *cid);
- int sync_checkpoint(uint64_t cid);
- int rollback_to(const string& name);
- int destroy_checkpoint(const string& name);
- int syncfs();
- int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
-};
-#endif
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef OS_COLLECTIONINDEX_H
-#define OS_COLLECTIONINDEX_H
-
-#include <string>
-#include <vector>
-#include "include/memory.h"
-
-#include "osd/osd_types.h"
-#include "include/object.h"
-#include "common/RWLock.h"
-
-/**
- * CollectionIndex provides an interface for manipulating indexed collections
- */
-class CollectionIndex {
-protected:
- /**
- * Object encapsulating a returned path.
- *
- * A path to an object (existent or non-existent) becomes invalid
- * when a different object is created in the index. Path stores
- * a shared_ptr to the CollectionIndex to keep the index alive
- * during its lifetime.
- * @see IndexManager
- * @see self_ref
- * @see set_ref
- */
- class Path {
- public:
- /// Returned path
- string full_path;
- /// Ref to parent Index
- CollectionIndex* parent_ref;
- /// coll_t for parent Index
- coll_t parent_coll;
-
- /// Normal Constructor
- Path(
- string path, ///< [in] Path to return.
- CollectionIndex* ref)
- : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {}
-
- /// Debugging Constructor
- Path(
- string path, ///< [in] Path to return.
- coll_t coll) ///< [in] collection
- : full_path(path), parent_coll(coll) {}
-
- /// Getter for the stored path.
- const char *path() const { return full_path.c_str(); }
-
- /// Getter for collection
- coll_t coll() const { return parent_coll; }
-
- /// Getter for parent
- CollectionIndex* get_index() const {
- return parent_ref;
- }
- };
- public:
-
- string access_lock_name;
- RWLock access_lock;
- /// Type of returned paths
- typedef ceph::shared_ptr<Path> IndexedPath;
-
- static IndexedPath get_testing_path(string path, coll_t collection) {
- return IndexedPath(new Path(path, collection));
- }
-
- static const uint32_t FLAT_INDEX_TAG = 0;
- static const uint32_t HASH_INDEX_TAG = 1;
- static const uint32_t HASH_INDEX_TAG_2 = 2;
- static const uint32_t HOBJECT_WITH_POOL = 3;
- /**
- * For tracking Filestore collection versions.
- *
- * @return Collection version represented by the Index implementation
- */
- virtual uint32_t collection_version() = 0;
-
- /**
- * Returns the collection managed by this CollectionIndex
- */
- virtual coll_t coll() const = 0;
-
-
- /**
- * Initializes the index.
- *
- * @return Error Code, 0 for success
- */
- virtual int init() = 0;
-
- /**
- * Cleanup before replaying journal
- *
- * Index implementations may need to perform compound operations
- * which may leave the collection unstable if interrupted. cleanup
- * is called on mount to allow the CollectionIndex implementation
- * to stabilize.
- *
- * @see HashIndex
- * @return Error Code, 0 for success
- */
- virtual int cleanup() = 0;
-
- /**
- * Call when a file is created using a path returned from lookup.
- *
- * @return Error Code, 0 for success
- */
- virtual int created(
- const ghobject_t &oid, ///< [in] Created object.
- const char *path ///< [in] Path to created object.
- ) = 0;
-
- /**
- * Removes oid from the collection
- *
- * @return Error Code, 0 for success
- */
- virtual int unlink(
- const ghobject_t &oid ///< [in] Object to remove
- ) = 0;
-
- /**
- * Gets the IndexedPath for oid.
- *
- * @return Error Code, 0 for success
- */
- virtual int lookup(
- const ghobject_t &oid, ///< [in] Object to lookup
- IndexedPath *path, ///< [out] Path to object
- int *hardlink ///< [out] number of hard links of this object; *hardlink == 0 means the object does not exist.
- ) = 0;
-
- /**
- * Moves objects whose hash matches @e match in the least-significant @e bits
- *
- * dest and this must be the same subclass
- *
- * @return Error Code, 0 for success
- */
- virtual int split(
- uint32_t match, ///< [in] value to match
- uint32_t bits, ///< [in] bits to check
- CollectionIndex* dest ///< [in] destination index
- ) { assert(0); return 0; }
-
-
- /// List contents of collection by hash
- virtual int collection_list_partial(
- const ghobject_t &start, ///< [in] object at which to start
- const ghobject_t &end, ///< [in] list only objects < end
- bool sort_bitwise, ///< [in] use bitwise sort
- int max_count, ///< [in] return at most max_count objects
- vector<ghobject_t> *ls, ///< [out] Listed objects
- ghobject_t *next ///< [out] Next object to list
- ) = 0;
-
- /// Call prior to removing directory
- virtual int prep_delete() { return 0; }
-
- CollectionIndex(coll_t collection):
- access_lock_name ("CollectionIndex::access_lock::" + collection.to_str()),
- access_lock(access_lock_name.c_str()) {}
-
- /**
- * Pre-hash the collection; this collection should map to a PG folder.
- *
- * @param pg_num - pg number of the pool this collection belongs to.
- * @param expected_num_objs - expected number of objects in this collection.
- * @return 0 on success, an error code otherwise.
- */
- virtual int pre_hash_collection(
- uint32_t pg_num, ///< [in] pg number of the pool this collection belongs to
- uint64_t expected_num_objs ///< [in] expected number of objects this collection has
- ) { assert(0); return 0; }
-
- /// Virtual destructor
- virtual ~CollectionIndex() {}
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-
-#include "include/int_types.h"
-#include "include/buffer.h"
-
-#include <iostream>
-#include <set>
-#include <map>
-#include <string>
-#include "include/memory.h"
-#include <vector>
-
-#include "ObjectMap.h"
-#include "kv/KeyValueDB.h"
-#include "DBObjectMap.h"
-#include <errno.h>
-
-#include "common/debug.h"
-#include "common/config.h"
-#include "include/assert.h"
-
-#define dout_subsys ceph_subsys_filestore
-#undef dout_prefix
-#define dout_prefix *_dout << "filestore "
-
-const string DBObjectMap::USER_PREFIX = "_USER_";
-const string DBObjectMap::XATTR_PREFIX = "_AXATTR_";
-const string DBObjectMap::SYS_PREFIX = "_SYS_";
-const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_";
-const string DBObjectMap::HEADER_KEY = "HEADER";
-const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER";
-const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER";
-const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_";
-
-// Legacy
-const string DBObjectMap::LEAF_PREFIX = "_LEAF_";
-const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_";
-
-static void append_escaped(const string &in, string *out)
-{
- for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
- if (*i == '%') {
- out->push_back('%');
- out->push_back('p');
- } else if (*i == '.') {
- out->push_back('%');
- out->push_back('e');
- } else if (*i == '_') {
- out->push_back('%');
- out->push_back('u');
- } else {
- out->push_back(*i);
- }
- }
-}
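-
-// Example (illustrative, not part of the original code): append_escaped() turns
-// "a.b_c%" into "a%eb%uc%p" ('.' -> "%e", '_' -> "%u", '%' -> "%p"), which keeps
-// '.' free to act as the field separator in ghobject_key() below.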
-
-bool DBObjectMap::check(std::ostream &out)
-{
- bool retval = true;
- map<uint64_t, uint64_t> parent_to_num_children;
- map<uint64_t, uint64_t> parent_to_actual_num_children;
- KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
- for (iter->seek_to_first(); iter->valid(); iter->next()) {
- _Header header;
- assert(header.num_children == 1);
- header.num_children = 0; // Hack for leaf node
- bufferlist bl = iter->value();
- while (true) {
- bufferlist::iterator bliter = bl.begin();
- header.decode(bliter);
- if (header.seq != 0)
- parent_to_actual_num_children[header.seq] = header.num_children;
- if (header.parent == 0)
- break;
-
- if (!parent_to_num_children.count(header.parent))
- parent_to_num_children[header.parent] = 0;
- parent_to_num_children[header.parent]++;
- if (parent_to_actual_num_children.count(header.parent))
- break;
-
- set<string> to_get;
- map<string, bufferlist> got;
- to_get.insert(HEADER_KEY);
- db->get(sys_parent_prefix(header), to_get, &got);
- if (got.empty()) {
- out << "Missing: seq " << header.parent << std::endl;
- retval = false;
- break;
- } else {
- bl = got.begin()->second;
- }
- }
- }
-
- for (map<uint64_t, uint64_t>::iterator i = parent_to_num_children.begin();
- i != parent_to_num_children.end();
- parent_to_num_children.erase(i++)) {
- if (!parent_to_actual_num_children.count(i->first))
- continue;
- if (parent_to_actual_num_children[i->first] != i->second) {
- out << "Invalid: seq " << i->first << " recorded children: "
- << parent_to_actual_num_children[i->first] << " found: "
- << i->second << std::endl;
- retval = false;
- }
- parent_to_actual_num_children.erase(i->first);
- }
- return retval;
-}
-
-string DBObjectMap::ghobject_key(const ghobject_t &oid)
-{
- string out;
- append_escaped(oid.hobj.oid.name, &out);
- out.push_back('.');
- append_escaped(oid.hobj.get_key(), &out);
- out.push_back('.');
- append_escaped(oid.hobj.nspace, &out);
- out.push_back('.');
-
- char snap_with_hash[1000];
- char *t = snap_with_hash;
- char *end = t + sizeof(snap_with_hash);
- if (oid.hobj.snap == CEPH_NOSNAP)
- t += snprintf(t, end - t, "head");
- else if (oid.hobj.snap == CEPH_SNAPDIR)
- t += snprintf(t, end - t, "snapdir");
- else
- t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
-
- if (oid.hobj.pool == -1)
- t += snprintf(t, end - t, ".none");
- else
- t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool);
- t += snprintf(t, end - t, ".%.*X", (int)(sizeof(uint32_t)*2), oid.hobj.get_hash());
-
- if (oid.generation != ghobject_t::NO_GEN ||
- oid.shard_id != shard_id_t::NO_SHARD) {
- t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation);
- t += snprintf(t, end - t, ".%x", (int)oid.shard_id);
- }
- out += string(snap_with_hash);
- return out;
-}
-
-// ok: pglog%u3%efs1...0.none.0017B237
-// bad: plana8923501-10...4c.3.ffffffffffffffff.2
-// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2
-// returns 0 for false, 1 for true, negative for error
-int DBObjectMap::is_buggy_ghobject_key_v1(const string &in)
-{
- int dots = 5; // skip 5 .'s
- const char *s = in.c_str();
- do {
- while (*s && *s != '.')
- ++s;
- if (!*s) {
- derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
- return -EINVAL;
- }
- ++s;
- } while (*s && --dots);
- if (!*s) {
- derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
- return -EINVAL;
- }
- // we are now either at a hash value (32 bits, 8 chars) or a generation
- // value (64 bits) '.' and shard id. count the dots!
- int len = 0;
- while (*s && *s != '.') {
- ++s;
- ++len;
- }
- if (*s == '\0') {
- if (len != 8) {
- derr << "hash value is not 8 chars" << dendl;
- return -EINVAL; // the hash value is always 8 chars.
- }
- return 0;
- }
- if (*s != '.') { // the shard follows.
- derr << "missing final . and shard id at " << (int)(s-in.c_str()) << dendl;
- return -EINVAL;
- }
- return 1;
-}
-
-
-string DBObjectMap::map_header_key(const ghobject_t &oid)
-{
- return ghobject_key(oid);
-}
-
-string DBObjectMap::header_key(uint64_t seq)
-{
- char buf[100];
- snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq);
- return string(buf);
-}
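-
-// Example (illustrative, not from the original source): header_key() zero-pads
-// the sequence number to 2*sizeof(uint64_t) = 16 digits, so seq 5 becomes
-// "0000000000000005"; combined with the prefixes below, that header's user keys
-// live under "_USER_0000000000000005_USER_" and its system keys under
-// "_USER_0000000000000005_SYS_".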
-
-string DBObjectMap::complete_prefix(Header header)
-{
- return USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX;
-}
-
-string DBObjectMap::user_prefix(Header header)
-{
- return USER_PREFIX + header_key(header->seq) + USER_PREFIX;
-}
-
-string DBObjectMap::sys_prefix(Header header)
-{
- return USER_PREFIX + header_key(header->seq) + SYS_PREFIX;
-}
-
-string DBObjectMap::xattr_prefix(Header header)
-{
- return USER_PREFIX + header_key(header->seq) + XATTR_PREFIX;
-}
-
-string DBObjectMap::sys_parent_prefix(_Header header)
-{
- return USER_PREFIX + header_key(header.parent) + SYS_PREFIX;
-}
-
-int DBObjectMap::DBObjectMapIteratorImpl::init()
-{
- invalid = false;
- if (ready) {
- return 0;
- }
- assert(!parent_iter);
- if (header->parent) {
- Header parent = map->lookup_parent(header);
- if (!parent) {
- assert(0);
- return -EINVAL;
- }
- parent_iter.reset(new DBObjectMapIteratorImpl(map, parent));
- }
- key_iter = map->db->get_iterator(map->user_prefix(header));
- assert(key_iter);
- complete_iter = map->db->get_iterator(map->complete_prefix(header));
- assert(complete_iter);
- cur_iter = key_iter;
- assert(cur_iter);
- ready = true;
- return 0;
-}
-
-ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(
- const ghobject_t &oid)
-{
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return ObjectMapIterator(new EmptyIteratorImpl());
- DBObjectMapIterator iter = _get_iterator(header);
- iter->hlock.swap(hl);
- return iter;
-}
-
-int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first()
-{
- init();
- r = 0;
- if (parent_iter) {
- r = parent_iter->seek_to_first();
- if (r < 0)
- return r;
- }
- r = key_iter->seek_to_first();
- if (r < 0)
- return r;
- return adjust();
-}
-
-int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last()
-{
- init();
- r = 0;
- if (parent_iter) {
- r = parent_iter->seek_to_last();
- if (r < 0)
- return r;
- if (parent_iter->valid())
- r = parent_iter->next();
- if (r < 0)
- return r;
- }
- r = key_iter->seek_to_last();
- if (r < 0)
- return r;
- if (key_iter->valid())
- r = key_iter->next();
- if (r < 0)
- return r;
- return adjust();
-}
-
-int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to)
-{
- init();
- r = 0;
- if (parent_iter) {
- r = parent_iter->lower_bound(to);
- if (r < 0)
- return r;
- }
- r = key_iter->lower_bound(to);
- if (r < 0)
- return r;
- return adjust();
-}
-
-int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after)
-{
- init();
- r = 0;
- if (parent_iter) {
- r = parent_iter->upper_bound(after);
- if (r < 0)
- return r;
- }
- r = key_iter->upper_bound(after);
- if (r < 0)
- return r;
- return adjust();
-}
-
-bool DBObjectMap::DBObjectMapIteratorImpl::valid()
-{
- bool valid = !invalid && ready;
- assert(!valid || cur_iter->valid());
- return valid;
-}
-
-bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent()
-{
- if (parent_iter && parent_iter->valid() &&
- (!key_iter->valid() || key_iter->key() > parent_iter->key()))
- return true;
- return false;
-}
-
-int DBObjectMap::DBObjectMapIteratorImpl::next(bool validate)
-{
- assert(cur_iter->valid());
- assert(valid());
- cur_iter->next();
- return adjust();
-}
-
-int DBObjectMap::DBObjectMapIteratorImpl::next_parent()
-{
- if (!parent_iter || !parent_iter->valid()) {
- invalid = true;
- return 0;
- }
- r = next();
- if (r < 0)
- return r;
- if (!valid() || on_parent() || !parent_iter->valid())
- return 0;
-
- return lower_bound(parent_iter->key());
-}
-
-int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test,
- string *begin,
- string *end)
-{
- complete_iter->upper_bound(to_test);
- if (complete_iter->valid())
- complete_iter->prev();
- else
- complete_iter->seek_to_last();
-
- if (!complete_iter->valid())
- return false;
-
- string _end;
- if (begin)
- *begin = complete_iter->key();
- _end = string(complete_iter->value().c_str());
- if (end)
- *end = _end;
- return (to_test >= complete_iter->key()) && (!_end.size() || _end > to_test);
-}
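-
-// Illustrative reading (not from the original source): with a single complete
-// entry "bar"->"foo", in_complete_region("baz") returns 1 and sets
-// *begin="bar", *end="foo"; an entry whose value is the empty string means
-// "up to the maximum key", so any key >= its begin is in the region.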
-
-/**
- * Moves parent_iter to the next position both out of the complete_region and
- * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and
- * less than key_iter and key_iter otherwise.
- */
-int DBObjectMap::DBObjectMapIteratorImpl::adjust()
-{
- string begin, end;
- while (parent_iter && parent_iter->valid()) {
- if (in_complete_region(parent_iter->key(), &begin, &end)) {
- if (end.size() == 0) {
- parent_iter->seek_to_last();
- if (parent_iter->valid())
- parent_iter->next();
- } else
- parent_iter->lower_bound(end);
- } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) {
- parent_iter->next();
- } else {
- break;
- }
- }
- if (valid_parent()) {
- cur_iter = parent_iter;
- } else if (key_iter->valid()) {
- cur_iter = key_iter;
- } else {
- invalid = true;
- }
- assert(invalid || cur_iter->valid());
- return 0;
-}
-
-
-string DBObjectMap::DBObjectMapIteratorImpl::key()
-{
- return cur_iter->key();
-}
-
-bufferlist DBObjectMap::DBObjectMapIteratorImpl::value()
-{
- return cur_iter->value();
-}
-
-int DBObjectMap::DBObjectMapIteratorImpl::status()
-{
- return r;
-}
-
-int DBObjectMap::set_keys(const ghobject_t &oid,
- const map<string, bufferlist> &set,
- const SequencerPosition *spos)
-{
- KeyValueDB::Transaction t = db->get_transaction();
- MapHeaderLock hl(this, oid);
- Header header = lookup_create_map_header(hl, oid, t);
- if (!header)
- return -EINVAL;
- if (check_spos(oid, header, spos))
- return 0;
-
- t->set(user_prefix(header), set);
-
- return db->submit_transaction(t);
-}
-
-int DBObjectMap::set_header(const ghobject_t &oid,
- const bufferlist &bl,
- const SequencerPosition *spos)
-{
- KeyValueDB::Transaction t = db->get_transaction();
- MapHeaderLock hl(this, oid);
- Header header = lookup_create_map_header(hl, oid, t);
- if (!header)
- return -EINVAL;
- if (check_spos(oid, header, spos))
- return 0;
- _set_header(header, bl, t);
- return db->submit_transaction(t);
-}
-
-void DBObjectMap::_set_header(Header header, const bufferlist &bl,
- KeyValueDB::Transaction t)
-{
- map<string, bufferlist> to_set;
- to_set[USER_HEADER_KEY] = bl;
- t->set(sys_prefix(header), to_set);
-}
-
-int DBObjectMap::get_header(const ghobject_t &oid,
- bufferlist *bl)
-{
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header) {
- return 0;
- }
- return _get_header(header, bl);
-}
-
-int DBObjectMap::_get_header(Header header,
- bufferlist *bl)
-{
- map<string, bufferlist> out;
- while (true) {
- out.clear();
- set<string> to_get;
- to_get.insert(USER_HEADER_KEY);
- int r = db->get(sys_prefix(header), to_get, &out);
- if (r == 0 && !out.empty())
- break;
- if (r < 0)
- return r;
- Header current(header);
- if (!current->parent)
- break;
- header = lookup_parent(current);
- }
-
- if (!out.empty())
- bl->swap(out.begin()->second);
- return 0;
-}
-
-int DBObjectMap::clear(const ghobject_t &oid,
- const SequencerPosition *spos)
-{
- KeyValueDB::Transaction t = db->get_transaction();
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- if (check_spos(oid, header, spos))
- return 0;
- remove_map_header(hl, oid, header, t);
- assert(header->num_children > 0);
- header->num_children--;
- int r = _clear(header, t);
- if (r < 0)
- return r;
- return db->submit_transaction(t);
-}
-
-int DBObjectMap::_clear(Header header,
- KeyValueDB::Transaction t)
-{
- while (1) {
- if (header->num_children) {
- set_header(header, t);
- break;
- }
- clear_header(header, t);
- if (!header->parent)
- break;
- Header parent = lookup_parent(header);
- if (!parent) {
- return -EINVAL;
- }
- assert(parent->num_children > 0);
- parent->num_children--;
- header.swap(parent);
- }
- return 0;
-}
-
-int DBObjectMap::merge_new_complete(Header header,
- const map<string, string> &new_complete,
- DBObjectMapIterator iter,
- KeyValueDB::Transaction t)
-{
- KeyValueDB::Iterator complete_iter = db->get_iterator(
- complete_prefix(header)
- );
- map<string, string>::const_iterator i = new_complete.begin();
- set<string> to_remove;
- map<string, bufferlist> to_add;
-
- string begin, end;
- while (i != new_complete.end()) {
- string new_begin = i->first;
- string new_end = i->second;
- int r = iter->in_complete_region(new_begin, &begin, &end);
- if (r < 0)
- return r;
- if (r) {
- to_remove.insert(begin);
- new_begin = begin;
- }
- ++i;
- while (i != new_complete.end()) {
- if (!new_end.size() || i->first <= new_end) {
- if (!new_end.size() && i->second > new_end) {
- new_end = i->second;
- }
- ++i;
- continue;
- }
-
- r = iter->in_complete_region(new_end, &begin, &end);
- if (r < 0)
- return r;
- if (r) {
- to_remove.insert(begin);
- new_end = end;
- continue;
- }
- break;
- }
- bufferlist bl;
- bl.append(bufferptr(new_end.c_str(), new_end.size() + 1));
- to_add.insert(make_pair(new_begin, bl));
- }
- t->rmkeys(complete_prefix(header), to_remove);
- t->set(complete_prefix(header), to_add);
- return 0;
-}
-
-int DBObjectMap::copy_up_header(Header header,
- KeyValueDB::Transaction t)
-{
- bufferlist bl;
- int r = _get_header(header, &bl);
- if (r < 0)
- return r;
-
- _set_header(header, bl, t);
- return 0;
-}
-
-int DBObjectMap::need_parent(DBObjectMapIterator iter)
-{
- int r = iter->seek_to_first();
- if (r < 0)
- return r;
-
- if (!iter->valid())
- return 0;
-
- string begin, end;
- if (iter->in_complete_region(iter->key(), &begin, &end) && end == "") {
- return 0;
- }
- return 1;
-}
-
-int DBObjectMap::rm_keys(const ghobject_t &oid,
- const set<string> &to_clear,
- const SequencerPosition *spos)
-{
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- KeyValueDB::Transaction t = db->get_transaction();
- if (check_spos(oid, header, spos))
- return 0;
- t->rmkeys(user_prefix(header), to_clear);
- if (!header->parent) {
- return db->submit_transaction(t);
- }
-
- // Copy up keys from parent around to_clear
- int keep_parent;
- {
- DBObjectMapIterator iter = _get_iterator(header);
- iter->seek_to_first();
- map<string, string> new_complete;
- map<string, bufferlist> to_write;
- for(set<string>::const_iterator i = to_clear.begin();
- i != to_clear.end();
- ) {
- unsigned copied = 0;
- iter->lower_bound(*i);
- ++i;
- if (!iter->valid())
- break;
- string begin = iter->key();
- if (!iter->on_parent())
- iter->next_parent();
- if (new_complete.size() && new_complete.rbegin()->second == begin) {
- begin = new_complete.rbegin()->first;
- }
- while (iter->valid() && copied < 20) {
- if (!to_clear.count(iter->key()))
- to_write[iter->key()].append(iter->value());
- if (i != to_clear.end() && *i <= iter->key()) {
- ++i;
- copied = 0;
- }
-
- iter->next_parent();
- copied++;
- }
- if (iter->valid()) {
- new_complete[begin] = iter->key();
- } else {
- new_complete[begin] = "";
- break;
- }
- }
- t->set(user_prefix(header), to_write);
- merge_new_complete(header, new_complete, iter, t);
- keep_parent = need_parent(iter);
- if (keep_parent < 0)
- return keep_parent;
- }
- if (!keep_parent) {
- copy_up_header(header, t);
- Header parent = lookup_parent(header);
- if (!parent)
- return -EINVAL;
- parent->num_children--;
- _clear(parent, t);
- header->parent = 0;
- set_map_header(hl, oid, *header, t);
- t->rmkeys_by_prefix(complete_prefix(header));
- }
- return db->submit_transaction(t);
-}
-
-int DBObjectMap::clear_keys_header(const ghobject_t &oid,
- const SequencerPosition *spos)
-{
- KeyValueDB::Transaction t = db->get_transaction();
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- if (check_spos(oid, header, spos))
- return 0;
-
- // save old attrs
- KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
- if (!iter)
- return -EINVAL;
- map<string, bufferlist> attrs;
- for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next())
- attrs.insert(make_pair(iter->key(), iter->value()));
- if (iter->status())
- return iter->status();
-
- // remove current header
- remove_map_header(hl, oid, header, t);
- assert(header->num_children > 0);
- header->num_children--;
- int r = _clear(header, t);
- if (r < 0)
- return r;
-
- // create new header
- Header newheader = generate_new_header(oid, Header());
- set_map_header(hl, oid, *newheader, t);
- if (!attrs.empty())
- t->set(xattr_prefix(newheader), attrs);
- return db->submit_transaction(t);
-}
-
-int DBObjectMap::get(const ghobject_t &oid,
- bufferlist *_header,
- map<string, bufferlist> *out)
-{
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- _get_header(header, _header);
- ObjectMapIterator iter = _get_iterator(header);
- for (iter->seek_to_first(); iter->valid(); iter->next()) {
- if (iter->status())
- return iter->status();
- out->insert(make_pair(iter->key(), iter->value()));
- }
- return 0;
-}
-
-int DBObjectMap::get_keys(const ghobject_t &oid,
- set<string> *keys)
-{
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- ObjectMapIterator iter = _get_iterator(header);
- for (iter->seek_to_first(); iter->valid(); iter->next()) {
- if (iter->status())
- return iter->status();
- keys->insert(iter->key());
- }
- return 0;
-}
-
-int DBObjectMap::scan(Header header,
- const set<string> &in_keys,
- set<string> *out_keys,
- map<string, bufferlist> *out_values)
-{
- ObjectMapIterator db_iter = _get_iterator(header);
- for (set<string>::const_iterator key_iter = in_keys.begin();
- key_iter != in_keys.end();
- ++key_iter) {
- db_iter->lower_bound(*key_iter);
- if (db_iter->status())
- return db_iter->status();
- if (db_iter->valid() && db_iter->key() == *key_iter) {
- if (out_keys)
- out_keys->insert(*key_iter);
- if (out_values)
- out_values->insert(make_pair(db_iter->key(), db_iter->value()));
- }
- }
- return 0;
-}
-
-int DBObjectMap::get_values(const ghobject_t &oid,
- const set<string> &keys,
- map<string, bufferlist> *out)
-{
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- return scan(header, keys, 0, out);
-}
-
-int DBObjectMap::check_keys(const ghobject_t &oid,
- const set<string> &keys,
- set<string> *out)
-{
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- return scan(header, keys, out, 0);
-}
-
-int DBObjectMap::get_xattrs(const ghobject_t &oid,
- const set<string> &to_get,
- map<string, bufferlist> *out)
-{
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- return db->get(xattr_prefix(header), to_get, out);
-}
-
-int DBObjectMap::get_all_xattrs(const ghobject_t &oid,
- set<string> *out)
-{
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
- if (!iter)
- return -EINVAL;
- for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next())
- out->insert(iter->key());
- return iter->status();
-}
-
-int DBObjectMap::set_xattrs(const ghobject_t &oid,
- const map<string, bufferlist> &to_set,
- const SequencerPosition *spos)
-{
- KeyValueDB::Transaction t = db->get_transaction();
- MapHeaderLock hl(this, oid);
- Header header = lookup_create_map_header(hl, oid, t);
- if (!header)
- return -EINVAL;
- if (check_spos(oid, header, spos))
- return 0;
- t->set(xattr_prefix(header), to_set);
- return db->submit_transaction(t);
-}
-
-int DBObjectMap::remove_xattrs(const ghobject_t &oid,
- const set<string> &to_remove,
- const SequencerPosition *spos)
-{
- KeyValueDB::Transaction t = db->get_transaction();
- MapHeaderLock hl(this, oid);
- Header header = lookup_map_header(hl, oid);
- if (!header)
- return -ENOENT;
- if (check_spos(oid, header, spos))
- return 0;
- t->rmkeys(xattr_prefix(header), to_remove);
- return db->submit_transaction(t);
-}
-
-int DBObjectMap::clone(const ghobject_t &oid,
- const ghobject_t &target,
- const SequencerPosition *spos)
-{
- if (oid == target)
- return 0;
-
- MapHeaderLock _l1(this, MIN_GHOBJ(oid, target, true));
- MapHeaderLock _l2(this, MAX_GHOBJ(oid, target, true));
- MapHeaderLock *lsource, *ltarget;
- if (cmp_bitwise(oid, target) > 0) {
- lsource = &_l2;
- ltarget= &_l1;
- } else {
- lsource = &_l1;
- ltarget= &_l2;
- }
-
- KeyValueDB::Transaction t = db->get_transaction();
- {
- Header destination = lookup_map_header(*ltarget, target);
- if (destination) {
- remove_map_header(*ltarget, target, destination, t);
- if (check_spos(target, destination, spos))
- return 0;
- destination->num_children--;
- _clear(destination, t);
- }
- }
-
- Header parent = lookup_map_header(*lsource, oid);
- if (!parent)
- return db->submit_transaction(t);
-
- Header source = generate_new_header(oid, parent);
- Header destination = generate_new_header(target, parent);
- if (spos)
- destination->spos = *spos;
-
- parent->num_children = 2;
- set_header(parent, t);
- set_map_header(*lsource, oid, *source, t);
- set_map_header(*ltarget, target, *destination, t);
-
- map<string, bufferlist> to_set;
- KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent));
- for (xattr_iter->seek_to_first();
- xattr_iter->valid();
- xattr_iter->next())
- to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
- t->set(xattr_prefix(source), to_set);
- t->set(xattr_prefix(destination), to_set);
- t->rmkeys_by_prefix(xattr_prefix(parent));
- return db->submit_transaction(t);
-}
-
-int DBObjectMap::upgrade_to_v2()
-{
- dout(1) << __func__ << " start" << dendl;
- KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
- iter->seek_to_first();
- while (iter->valid()) {
- unsigned count = 0;
- KeyValueDB::Transaction t = db->get_transaction();
- set<string> remove;
- map<string, bufferlist> add;
- for (;
- iter->valid() && count < 300;
- iter->next()) {
- dout(20) << __func__ << " key is " << iter->key() << dendl;
- int r = is_buggy_ghobject_key_v1(iter->key());
- if (r < 0) {
- derr << __func__ << " bad key '" << iter->key() << "'" << dendl;
- return r;
- }
- if (!r) {
- dout(20) << __func__ << " " << iter->key() << " ok" << dendl;
- continue;
- }
-
- // decode header to get oid
- _Header hdr;
- bufferlist bl = iter->value();
- bufferlist::iterator bliter = bl.begin();
- hdr.decode(bliter);
-
- string newkey(ghobject_key(hdr.oid));
- dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl;
- add[newkey] = iter->value();
- remove.insert(iter->key());
- ++count;
- }
-
- if (!remove.empty()) {
- dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl;
- t->rmkeys(HOBJECT_TO_SEQ, remove);
- t->set(HOBJECT_TO_SEQ, add);
- int r = db->submit_transaction(t);
- if (r < 0)
- return r;
- }
- }
-
- state.v = 2;
-
- Mutex::Locker l(header_lock);
- KeyValueDB::Transaction t = db->get_transaction();
- write_state(t);
- db->submit_transaction_sync(t);
- dout(1) << __func__ << " done" << dendl;
- return 0;
-}
-
-int DBObjectMap::init(bool do_upgrade)
-{
- map<string, bufferlist> result;
- set<string> to_get;
- to_get.insert(GLOBAL_STATE_KEY);
- int r = db->get(SYS_PREFIX, to_get, &result);
- if (r < 0)
- return r;
- if (!result.empty()) {
- bufferlist::iterator bliter = result.begin()->second.begin();
- state.decode(bliter);
- if (state.v < 1) {
- dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
- << dendl;
- return -ENOTSUP;
- }
- if (state.v < 2) { // Needs upgrade
- if (!do_upgrade) {
- dout(1) << "DOBjbectMap requires an upgrade,"
- << " set filestore_update_to"
- << dendl;
- return -ENOTSUP;
- } else {
- r = upgrade_to_v2();
- if (r < 0)
- return r;
- }
- }
- } else {
- // New store
- state.v = 2;
- state.seq = 1;
- }
- dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl;
- return 0;
-}
-
-int DBObjectMap::sync(const ghobject_t *oid,
- const SequencerPosition *spos) {
- KeyValueDB::Transaction t = db->get_transaction();
- if (oid) {
- assert(spos);
- MapHeaderLock hl(this, *oid);
- Header header = lookup_map_header(hl, *oid);
- if (header) {
- dout(10) << "oid: " << *oid << " setting spos to "
- << *spos << dendl;
- header->spos = *spos;
- set_map_header(hl, *oid, *header, t);
- }
- /* It may appear that this and the identical portion of the else
- * block can be combined below, but in this block, the transaction
- * must be submitted under *both* the MapHeaderLock and the full
- * header_lock.
- *
- * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891.
- */
- Mutex::Locker l(header_lock);
- write_state(t);
- return db->submit_transaction_sync(t);
- } else {
- Mutex::Locker l(header_lock);
- write_state(t);
- return db->submit_transaction_sync(t);
- }
-}
-
-int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
- assert(header_lock.is_locked_by_me());
- dout(20) << "dbobjectmap: seq is " << state.seq << dendl;
- KeyValueDB::Transaction t = _t ? _t : db->get_transaction();
- bufferlist bl;
- state.encode(bl);
- map<string, bufferlist> to_write;
- to_write[GLOBAL_STATE_KEY] = bl;
- t->set(SYS_PREFIX, to_write);
- return _t ? 0 : db->submit_transaction(t);
-}
-
-
-DBObjectMap::Header DBObjectMap::_lookup_map_header(
- const MapHeaderLock &l,
- const ghobject_t &oid)
-{
- assert(l.get_locked() == oid);
-
- _Header *header = new _Header();
- {
- Mutex::Locker l(cache_lock);
- if (caches.lookup(oid, header)) {
- assert(!in_use.count(header->seq));
- in_use.insert(header->seq);
- return Header(header, RemoveOnDelete(this));
- }
- }
-
- bufferlist out;
- int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out);
- if (r < 0 || out.length()==0) {
- delete header;
- return Header();
- }
-
- Header ret(header, RemoveOnDelete(this));
- bufferlist::iterator iter = out.begin();
-
- ret->decode(iter);
- {
- Mutex::Locker l(cache_lock);
- caches.add(oid, *ret);
- }
-
- assert(!in_use.count(header->seq));
- in_use.insert(header->seq);
- return ret;
-}
-
-DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
- Header parent)
-{
- Header header = Header(new _Header(), RemoveOnDelete(this));
- header->seq = state.seq++;
- if (parent) {
- header->parent = parent->seq;
- header->spos = parent->spos;
- }
- header->num_children = 1;
- header->oid = oid;
- assert(!in_use.count(header->seq));
- in_use.insert(header->seq);
-
- write_state();
- return header;
-}
-
-DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
-{
- Mutex::Locker l(header_lock);
- while (in_use.count(input->parent))
- header_cond.Wait(header_lock);
- map<string, bufferlist> out;
- set<string> keys;
- keys.insert(HEADER_KEY);
-
- dout(20) << "lookup_parent: parent " << input->parent
- << " for seq " << input->seq << dendl;
- int r = db->get(sys_parent_prefix(input), keys, &out);
- if (r < 0) {
- assert(0);
- return Header();
- }
- if (out.empty()) {
- assert(0);
- return Header();
- }
-
- Header header = Header(new _Header(), RemoveOnDelete(this));
- header->seq = input->parent;
- bufferlist::iterator iter = out.begin()->second.begin();
- header->decode(iter);
- dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
- << header->parent << dendl;
- in_use.insert(header->seq);
- return header;
-}
-
-DBObjectMap::Header DBObjectMap::lookup_create_map_header(
- const MapHeaderLock &hl,
- const ghobject_t &oid,
- KeyValueDB::Transaction t)
-{
- Mutex::Locker l(header_lock);
- Header header = _lookup_map_header(hl, oid);
- if (!header) {
- header = _generate_new_header(oid, Header());
- set_map_header(hl, oid, *header, t);
- }
- return header;
-}
-
-void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t)
-{
- dout(20) << "clear_header: clearing seq " << header->seq << dendl;
- t->rmkeys_by_prefix(user_prefix(header));
- t->rmkeys_by_prefix(sys_prefix(header));
- t->rmkeys_by_prefix(complete_prefix(header));
- t->rmkeys_by_prefix(xattr_prefix(header));
- set<string> keys;
- keys.insert(header_key(header->seq));
- t->rmkeys(USER_PREFIX, keys);
-}
-
-void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t)
-{
- dout(20) << "set_header: setting seq " << header->seq << dendl;
- map<string, bufferlist> to_write;
- header->encode(to_write[HEADER_KEY]);
- t->set(sys_prefix(header), to_write);
-}
-
-void DBObjectMap::remove_map_header(
- const MapHeaderLock &l,
- const ghobject_t &oid,
- Header header,
- KeyValueDB::Transaction t)
-{
- assert(l.get_locked() == oid);
- dout(20) << "remove_map_header: removing " << header->seq
- << " oid " << oid << dendl;
- set<string> to_remove;
- to_remove.insert(map_header_key(oid));
- t->rmkeys(HOBJECT_TO_SEQ, to_remove);
- {
- Mutex::Locker l(cache_lock);
- caches.clear(oid);
- }
-}
-
-void DBObjectMap::set_map_header(
- const MapHeaderLock &l,
- const ghobject_t &oid, _Header header,
- KeyValueDB::Transaction t)
-{
- assert(l.get_locked() == oid);
- dout(20) << "set_map_header: setting " << header.seq
- << " oid " << oid << " parent seq "
- << header.parent << dendl;
- map<string, bufferlist> to_set;
- header.encode(to_set[map_header_key(oid)]);
- t->set(HOBJECT_TO_SEQ, to_set);
- {
- Mutex::Locker l(cache_lock);
- caches.add(oid, header);
- }
-}
-
-bool DBObjectMap::check_spos(const ghobject_t &oid,
- Header header,
- const SequencerPosition *spos)
-{
- if (!spos || *spos > header->spos) {
- stringstream out;
- if (spos)
- dout(10) << "oid: " << oid << " not skipping op, *spos "
- << *spos << dendl;
- else
- dout(10) << "oid: " << oid << " not skipping op, *spos "
- << "empty" << dendl;
- dout(10) << " > header.spos " << header->spos << dendl;
- return false;
- } else {
- dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
- << " <= header.spos " << header->spos << dendl;
- return true;
- }
-}
-
-int DBObjectMap::list_objects(vector<ghobject_t> *out)
-{
- KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
- for (iter->seek_to_first(); iter->valid(); iter->next()) {
- bufferlist bl = iter->value();
- bufferlist::iterator bliter = bl.begin();
- _Header header;
- header.decode(bliter);
- out->push_back(header.oid);
- }
- return 0;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-#ifndef DBOBJECTMAP_DB_H
-#define DBOBJECTMAP_DB_H
-
-#include "include/buffer_fwd.h"
-#include <set>
-#include <map>
-#include <string>
-
-#include <vector>
-#include "include/memory.h"
-#include <boost/scoped_ptr.hpp>
-
-#include "ObjectMap.h"
-#include "kv/KeyValueDB.h"
-#include "osd/osd_types.h"
-#include "common/Mutex.h"
-#include "common/Cond.h"
-#include "common/simple_cache.hpp"
-#include <boost/optional/optional_io.hpp>
-
-/**
- * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
- *
- * Prefix space structure:
- *
- * @see complete_prefix
- * @see user_prefix
- * @see sys_prefix
- *
- * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->hobj.seq and
- * corresponding omap header
- * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
- * @see State
- * @see write_state
- * @see init
- * @see generate_new_header
- * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
- * : key->value for header->seq
- * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
- * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
- * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
- * : USER_HEADER_KEY - omap header for header->seq
- * : HEADER_KEY - encoding of header for header->seq
- *
- * For each node (represented by a header), we
- * store three mappings: the key mapping, the complete mapping, and the parent.
- * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in
- * this mapping indicates that the key mapping contains all entries on [x,y).
- * Note, max string is represented by "", so ""->"" indicates that the parent
- * is unnecessary (@see rm_keys). When looking up a key not contained in
- * the complete set, we have to check the parent if we don't find it in the
- * key set. During rm_keys, we copy keys from the parent and update the
- * complete set to reflect the change @see rm_keys.
- */
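-
-/*
- * Illustrative sketch only (hypothetical complete-set contents, not part of
- * the original header): suppose a header's COMPLETE_PREFIX space holds the
- * single entry "b"->"f". A lookup then behaves roughly like this:
- *
- *   get_values(oid, {"c"}, &out);  // "c" lies in ["b","f"): only the local
- *                                  // key set is consulted, parent skipped
- *   get_values(oid, {"g"}, &out);  // "g" is outside every complete range:
- *                                  // if absent locally, the parent header
- *                                  // is consulted as well
- *
- * get_values() is the public API declared below; the range contents above
- * are made up for illustration.
- */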
-class DBObjectMap : public ObjectMap {
-public:
- boost::scoped_ptr<KeyValueDB> db;
-
- /**
- * Serializes access to next_seq as well as the in_use set
- */
- Mutex header_lock;
- Cond header_cond;
- Cond map_header_cond;
-
- /**
- * Set of headers currently in use
- */
- set<uint64_t> in_use;
- set<ghobject_t, ghobject_t::BitwiseComparator> map_header_in_use;
-
- /**
- * Takes the map_header_in_use entry in constructor, releases in
- * destructor
- */
- class MapHeaderLock {
- DBObjectMap *db;
- boost::optional<ghobject_t> locked;
-
- MapHeaderLock(const MapHeaderLock &);
- MapHeaderLock &operator=(const MapHeaderLock &);
- public:
- MapHeaderLock(DBObjectMap *db) : db(db) {}
- MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
- Mutex::Locker l(db->header_lock);
- while (db->map_header_in_use.count(*locked))
- db->map_header_cond.Wait(db->header_lock);
- db->map_header_in_use.insert(*locked);
- }
-
- const ghobject_t &get_locked() const {
- assert(locked);
- return *locked;
- }
-
- void swap(MapHeaderLock &o) {
- assert(db == o.db);
-
- // centos6's boost optional doesn't seem to have swap :(
- boost::optional<ghobject_t> _locked = o.locked;
- o.locked = locked;
- locked = _locked;
- }
-
- ~MapHeaderLock() {
- if (locked) {
- Mutex::Locker l(db->header_lock);
- assert(db->map_header_in_use.count(*locked));
- db->map_header_cond.Signal();
- db->map_header_in_use.erase(*locked);
- }
- }
- };
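-
-  /*
-   * Rough usage sketch of the RAII lock above (illustrative only, not code
-   * from the original file):
-   *
-   *   MapHeaderLock hl(this, oid);       // blocks while another thread
-   *                                      // holds oid's map header
-   *   Header h = lookup_map_header(hl, oid);
-   *   ...                                // work on oid's header
-   *                                      // released when hl goes out of scope
-   */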
-
-  DBObjectMap(KeyValueDB *db) : db(db), header_lock("DBObjectMap"),
- cache_lock("DBObjectMap::CacheLock"),
- caches(g_conf->filestore_omap_header_cache_size)
- {}
-
- int set_keys(
- const ghobject_t &oid,
- const map<string, bufferlist> &set,
- const SequencerPosition *spos=0
- );
-
- int set_header(
- const ghobject_t &oid,
- const bufferlist &bl,
- const SequencerPosition *spos=0
- );
-
- int get_header(
- const ghobject_t &oid,
- bufferlist *bl
- );
-
- int clear(
- const ghobject_t &oid,
- const SequencerPosition *spos=0
- );
-
- int clear_keys_header(
- const ghobject_t &oid,
- const SequencerPosition *spos=0
- );
-
- int rm_keys(
- const ghobject_t &oid,
- const set<string> &to_clear,
- const SequencerPosition *spos=0
- );
-
- int get(
- const ghobject_t &oid,
- bufferlist *header,
- map<string, bufferlist> *out
- );
-
- int get_keys(
- const ghobject_t &oid,
- set<string> *keys
- );
-
- int get_values(
- const ghobject_t &oid,
- const set<string> &keys,
- map<string, bufferlist> *out
- );
-
- int check_keys(
- const ghobject_t &oid,
- const set<string> &keys,
- set<string> *out
- );
-
- int get_xattrs(
- const ghobject_t &oid,
- const set<string> &to_get,
- map<string, bufferlist> *out
- );
-
- int get_all_xattrs(
- const ghobject_t &oid,
- set<string> *out
- );
-
- int set_xattrs(
- const ghobject_t &oid,
- const map<string, bufferlist> &to_set,
- const SequencerPosition *spos=0
- );
-
- int remove_xattrs(
- const ghobject_t &oid,
- const set<string> &to_remove,
- const SequencerPosition *spos=0
- );
-
- int clone(
- const ghobject_t &oid,
- const ghobject_t &target,
- const SequencerPosition *spos=0
- );
-
- /// Read initial state from backing store
- int init(bool upgrade = false);
-
- /// Upgrade store to current version
- int upgrade_to_v2();
-
- /// Consistency check, debug, there must be no parallel writes
- bool check(std::ostream &out);
-
- /// Ensure that all previous operations are durable
- int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0);
-
- /// Util, list all objects, there must be no other concurrent access
- int list_objects(vector<ghobject_t> *objs ///< [out] objects
- );
-
- ObjectMapIterator get_iterator(const ghobject_t &oid);
-
- static const string USER_PREFIX;
- static const string XATTR_PREFIX;
- static const string SYS_PREFIX;
- static const string COMPLETE_PREFIX;
- static const string HEADER_KEY;
- static const string USER_HEADER_KEY;
- static const string GLOBAL_STATE_KEY;
- static const string HOBJECT_TO_SEQ;
-
- /// Legacy
- static const string LEAF_PREFIX;
- static const string REVERSE_LEAF_PREFIX;
-
-  /// persistent state for store @see generate_new_header
- struct State {
- __u8 v;
- uint64_t seq;
- State() : v(0), seq(1) {}
- State(uint64_t seq) : v(0), seq(seq) {}
-
- void encode(bufferlist &bl) const {
- ENCODE_START(2, 1, bl);
- ::encode(v, bl);
- ::encode(seq, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::iterator &bl) {
- DECODE_START(2, bl);
- if (struct_v >= 2)
- ::decode(v, bl);
- else
- v = 0;
- ::decode(seq, bl);
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const {
- f->dump_unsigned("seq", seq);
- }
-
- static void generate_test_instances(list<State*> &o) {
- o.push_back(new State(0));
- o.push_back(new State(20));
- }
- } state;
-
- struct _Header {
- uint64_t seq;
- uint64_t parent;
- uint64_t num_children;
-
- coll_t c;
- ghobject_t oid;
-
- SequencerPosition spos;
-
- void encode(bufferlist &bl) const {
- ENCODE_START(2, 1, bl);
- ::encode(seq, bl);
- ::encode(parent, bl);
- ::encode(num_children, bl);
- ::encode(c, bl);
- ::encode(oid, bl);
- ::encode(spos, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::iterator &bl) {
- DECODE_START(2, bl);
- ::decode(seq, bl);
- ::decode(parent, bl);
- ::decode(num_children, bl);
- ::decode(c, bl);
- ::decode(oid, bl);
- if (struct_v >= 2)
- ::decode(spos, bl);
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const {
- f->dump_unsigned("seq", seq);
- f->dump_unsigned("parent", parent);
- f->dump_unsigned("num_children", num_children);
- f->dump_stream("coll") << c;
- f->dump_stream("oid") << oid;
- }
-
- static void generate_test_instances(list<_Header*> &o) {
- o.push_back(new _Header);
- o.push_back(new _Header);
- o.back()->parent = 20;
- o.back()->seq = 30;
- }
-
- _Header() : seq(0), parent(0), num_children(1) {}
- };
-
- /// String munging (public for testing)
- static string ghobject_key(const ghobject_t &oid);
- static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
- static int is_buggy_ghobject_key_v1(const string &in);
-private:
- /// Implicit lock on Header->seq
- typedef ceph::shared_ptr<_Header> Header;
- Mutex cache_lock;
- SimpleLRU<ghobject_t, _Header, ghobject_t::BitwiseComparator> caches;
-
- string map_header_key(const ghobject_t &oid);
- string header_key(uint64_t seq);
- string complete_prefix(Header header);
- string user_prefix(Header header);
- string sys_prefix(Header header);
- string xattr_prefix(Header header);
- string sys_parent_prefix(_Header header);
- string sys_parent_prefix(Header header) {
- return sys_parent_prefix(*header);
- }
-
- class EmptyIteratorImpl : public ObjectMapIteratorImpl {
- public:
- int seek_to_first() { return 0; }
- int seek_to_last() { return 0; }
- int upper_bound(const string &after) { return 0; }
- int lower_bound(const string &to) { return 0; }
- bool valid() { return false; }
- int next(bool validate=true) { assert(0); return 0; }
- string key() { assert(0); return ""; }
- bufferlist value() { assert(0); return bufferlist(); }
- int status() { return 0; }
- };
-
-
- /// Iterator
- class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
- public:
- DBObjectMap *map;
-
-    /// NOTE: keeps the map header for hlock->get_locked() implicitly locked while the iterator is in use
- MapHeaderLock hlock;
- /// NOTE: implicit lock on header->seq AND for all ancestors
- Header header;
-
- /// parent_iter == NULL iff no parent
- ceph::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
- KeyValueDB::Iterator key_iter;
- KeyValueDB::Iterator complete_iter;
-
- /// cur_iter points to currently valid iterator
- ceph::shared_ptr<ObjectMapIteratorImpl> cur_iter;
- int r;
-
- /// init() called, key_iter, complete_iter, parent_iter filled in
- bool ready;
- /// past end
- bool invalid;
-
- DBObjectMapIteratorImpl(DBObjectMap *map, Header header) :
- map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}
- int seek_to_first();
- int seek_to_last();
- int upper_bound(const string &after);
- int lower_bound(const string &to);
- bool valid();
- int next(bool validate=true);
- string key();
- bufferlist value();
- int status();
-
- bool on_parent() {
- return cur_iter == parent_iter;
- }
-
- /// skips to next valid parent entry
- int next_parent();
-
- /// Tests whether to_test is in complete region
- int in_complete_region(const string &to_test, ///< [in] key to test
- string *begin, ///< [out] beginning of region
- string *end ///< [out] end of region
- ); ///< @returns true if to_test is in the complete region, else false
-
- private:
- int init();
- bool valid_parent();
- int adjust();
- };
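-
-  /*
-   * Rough summary (illustrative, see the .cc for the authoritative logic):
-   * iteration merges key_iter with parent_iter. adjust() points cur_iter at
-   * whichever of the two currently holds the smaller key, and next_parent()
-   * skips parent entries that fall inside a complete region [x,y), since
-   * those ranges are already fully represented by the local key set.
-   */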
-
- typedef ceph::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator;
- DBObjectMapIterator _get_iterator(Header header) {
- return DBObjectMapIterator(new DBObjectMapIteratorImpl(this, header));
- }
-
- /// sys
-
- /// Removes node corresponding to header
- void clear_header(Header header, KeyValueDB::Transaction t);
-
- /// Set node containing input to new contents
- void set_header(Header input, KeyValueDB::Transaction t);
-
- /// Remove leaf node corresponding to oid in c
- void remove_map_header(
- const MapHeaderLock &l,
- const ghobject_t &oid,
- Header header,
- KeyValueDB::Transaction t);
-
- /// Set leaf node for c and oid to the value of header
- void set_map_header(
- const MapHeaderLock &l,
- const ghobject_t &oid, _Header header,
- KeyValueDB::Transaction t);
-
-  /// Return true if the op at spos has already been applied to header (and should be skipped)
- bool check_spos(const ghobject_t &oid,
- Header header,
- const SequencerPosition *spos);
-
- /// Lookup or create header for c oid
- Header lookup_create_map_header(
- const MapHeaderLock &l,
- const ghobject_t &oid,
- KeyValueDB::Transaction t);
-
- /**
- * Generate new header for c oid with new seq number
- *
-   * Has the side effect of synchronously saving the new DBObjectMap state
- */
- Header _generate_new_header(const ghobject_t &oid, Header parent);
- Header generate_new_header(const ghobject_t &oid, Header parent) {
- Mutex::Locker l(header_lock);
- return _generate_new_header(oid, parent);
- }
-
- /// Lookup leaf header for c oid
- Header _lookup_map_header(
- const MapHeaderLock &l,
- const ghobject_t &oid);
- Header lookup_map_header(
- const MapHeaderLock &l2,
- const ghobject_t &oid) {
- Mutex::Locker l(header_lock);
- return _lookup_map_header(l2, oid);
- }
-
- /// Lookup header node for input
- Header lookup_parent(Header input);
-
-
- /// Helpers
- int _get_header(Header header, bufferlist *bl);
-
- /// Scan keys in header into out_keys and out_values (if nonnull)
- int scan(Header header,
- const set<string> &in_keys,
- set<string> *out_keys,
- map<string, bufferlist> *out_values);
-
- /// Remove header and all related prefixes
- int _clear(Header header,
- KeyValueDB::Transaction t);
- /// Adds to t operations necessary to add new_complete to the complete set
- int merge_new_complete(Header header,
- const map<string, string> &new_complete,
- DBObjectMapIterator iter,
- KeyValueDB::Transaction t);
-
- /// Writes out State (mainly next_seq)
- int write_state(KeyValueDB::Transaction _t =
- KeyValueDB::Transaction());
-
-  /// 0 if the complete set now contains all of the key space, < 0 on error, 1 otherwise
- int need_parent(DBObjectMapIterator iter);
-
- /// Copies header entry from parent @see rm_keys
- int copy_up_header(Header header,
- KeyValueDB::Transaction t);
-
- /// Sets header @see set_header
- void _set_header(Header header, const bufferlist &bl,
- KeyValueDB::Transaction t);
-
- /**
- * Removes header seq lock and possibly object lock
- * once Header is out of scope
- * @see lookup_parent
- * @see generate_new_header
- */
- class RemoveOnDelete {
- public:
- DBObjectMap *db;
- RemoveOnDelete(DBObjectMap *db) :
- db(db) {}
- void operator() (_Header *header) {
- Mutex::Locker l(db->header_lock);
- assert(db->in_use.count(header->seq));
- db->in_use.erase(header->seq);
- db->header_cond.Signal();
- delete header;
- }
- };
- friend class RemoveOnDelete;
-};
-WRITE_CLASS_ENCODER(DBObjectMap::_Header)
-WRITE_CLASS_ENCODER(DBObjectMap::State)
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2013 Inktank Storage, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef CEPH_FDCACHE_H
-#define CEPH_FDCACHE_H
-
-#include <memory>
-#include <errno.h>
-#include <cstdio>
-#include "common/hobject.h"
-#include "common/Mutex.h"
-#include "common/Cond.h"
-#include "common/shared_cache.hpp"
-#include "include/compat.h"
-#include "include/intarith.h"
-
-/**
- * FD Cache
- */
-class FDCache : public md_config_obs_t {
-public:
- /**
- * FD
- *
- * Wrapper for an fd. Destructor closes the fd.
- */
- class FD {
- public:
- const int fd;
- FD(int _fd) : fd(_fd) {
- assert(_fd >= 0);
- }
- int operator*() const {
- return fd;
- }
- ~FD() {
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- }
- };
-
-private:
- CephContext *cct;
- const int registry_shards;
- SharedLRU<ghobject_t, FD, ghobject_t::BitwiseComparator> *registry;
-
-public:
- FDCache(CephContext *cct) : cct(cct),
- registry_shards(cct->_conf->filestore_fd_cache_shards) {
- assert(cct);
- cct->_conf->add_observer(this);
- registry = new SharedLRU<ghobject_t, FD, ghobject_t::BitwiseComparator>[registry_shards];
- for (int i = 0; i < registry_shards; ++i) {
- registry[i].set_cct(cct);
- registry[i].set_size(
- MAX((cct->_conf->filestore_fd_cache_size / registry_shards), 1));
- }
- }
- ~FDCache() {
- cct->_conf->remove_observer(this);
- delete[] registry;
- }
- typedef ceph::shared_ptr<FD> FDRef;
-
- FDRef lookup(const ghobject_t &hoid) {
- int registry_id = hoid.hobj.get_hash() % registry_shards;
- return registry[registry_id].lookup(hoid);
- }
-
- FDRef add(const ghobject_t &hoid, int fd, bool *existed) {
- int registry_id = hoid.hobj.get_hash() % registry_shards;
- return registry[registry_id].add(hoid, new FD(fd), existed);
- }
-
- /// clear cached fd for hoid, subsequent lookups will get an empty FD
- void clear(const ghobject_t &hoid) {
- int registry_id = hoid.hobj.get_hash() % registry_shards;
- registry[registry_id].purge(hoid);
- }
-
- /// md_config_obs_t
- const char** get_tracked_conf_keys() const {
- static const char* KEYS[] = {
- "filestore_fd_cache_size",
- NULL
- };
- return KEYS;
- }
- void handle_conf_change(const md_config_t *conf,
- const std::set<std::string> &changed) {
- if (changed.count("filestore_fd_cache_size")) {
- for (int i = 0; i < registry_shards; ++i)
- registry[i].set_size(
- MAX((conf->filestore_fd_cache_size / registry_shards), 1));
- }
- }
-
-};
-typedef FDCache::FDRef FDRef;
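-
-/*
- * Illustrative usage only ('fdc', 'oid' and the open() call below are
- * hypothetical, not part of this header):
- *
- *   FDRef fd = fdc->lookup(oid);            // shard picked by hash(oid)
- *   if (!fd) {
- *     int raw = ::open("/path/to/object", O_RDWR);
- *     bool existed;
- *     fd = fdc->add(oid, raw, &existed);    // cache owns raw now; the FD
- *   }                                       // dtor closes it once dropped
- *   ::pread(**fd, buf, len, off);           // operator* yields the raw fd
- */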
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-#include "acconfig.h"
-
-#include "common/debug.h"
-#include "common/errno.h"
-#include "common/safe_io.h"
-#include "FileJournal.h"
-#include "include/color.h"
-#include "common/perf_counters.h"
-#include "os/FileStore.h"
-
-#include "include/compat.h"
-
-#include <fcntl.h>
-#include <limits.h>
-#include <sstream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mount.h>
-
-#include "common/blkdev.h"
-#include "common/linux_version.h"
-
-#if defined(__FreeBSD__)
-#define O_DSYNC O_SYNC
-#endif
-
-#define dout_subsys ceph_subsys_journal
-#undef dout_prefix
-#define dout_prefix *_dout << "journal "
-
-const static int64_t ONE_MEG(1 << 20);
-const static int CEPH_MINIMUM_BLOCK_SIZE(4096);
-
-int FileJournal::_open(bool forwrite, bool create)
-{
- int flags, ret;
-
- if (forwrite) {
- flags = O_RDWR;
- if (directio)
- flags |= O_DIRECT | O_DSYNC;
- } else {
- flags = O_RDONLY;
- }
- if (create)
- flags |= O_CREAT;
-
- if (fd >= 0) {
- if (TEMP_FAILURE_RETRY(::close(fd))) {
- int err = errno;
- derr << "FileJournal::_open: error closing old fd: "
- << cpp_strerror(err) << dendl;
- }
- }
- fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags, 0644));
- if (fd < 0) {
- int err = errno;
- dout(2) << "FileJournal::_open unable to open journal "
- << fn << ": " << cpp_strerror(err) << dendl;
- return -err;
- }
-
- struct stat st;
- ret = ::fstat(fd, &st);
- if (ret) {
- ret = errno;
- derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
- ret = -ret;
- goto out_fd;
- }
-
- if (S_ISBLK(st.st_mode)) {
- ret = _open_block_device();
- } else {
- if (aio && !force_aio) {
- derr << "FileJournal::_open: disabling aio for non-block journal. Use "
- << "journal_force_aio to force use of aio anyway" << dendl;
- aio = false;
- }
- ret = _open_file(st.st_size, st.st_blksize, create);
- }
-
- if (ret)
- goto out_fd;
-
-#ifdef HAVE_LIBAIO
- if (aio) {
- aio_ctx = 0;
- ret = io_setup(128, &aio_ctx);
- if (ret < 0) {
- ret = errno;
- derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(ret) << dendl;
- ret = -ret;
- goto out_fd;
- }
- }
-#endif
-
- /* We really want max_size to be a multiple of block_size. */
- max_size -= max_size % block_size;
-
- dout(1) << "_open " << fn << " fd " << fd
- << ": " << max_size
- << " bytes, block size " << block_size
- << " bytes, directio = " << directio
- << ", aio = " << aio
- << dendl;
- return 0;
-
- out_fd:
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return ret;
-}
-
-int FileJournal::_open_block_device()
-{
- int64_t bdev_sz = 0;
- int ret = get_block_device_size(fd, &bdev_sz);
- if (ret) {
- dout(0) << __func__ << ": failed to read block device size." << dendl;
- return -EIO;
- }
-
- /* Check for bdev_sz too small */
- if (bdev_sz < ONE_MEG) {
- dout(0) << __func__ << ": your block device must be at least "
- << ONE_MEG << " bytes to be used for a Ceph journal." << dendl;
- return -EINVAL;
- }
-
- dout(10) << __func__ << ": ignoring osd journal size. "
- << "We'll use the entire block device (size: " << bdev_sz << ")"
- << dendl;
- max_size = bdev_sz;
-
- block_size = CEPH_MINIMUM_BLOCK_SIZE;
-
- if (g_conf->journal_discard) {
- discard = block_device_support_discard(fn.c_str());
- dout(10) << fn << " support discard: " << (int)discard << dendl;
- }
- _check_disk_write_cache();
- return 0;
-}
-
-void FileJournal::_check_disk_write_cache() const
-{
- ostringstream hdparm_cmd;
- FILE *fp = NULL;
-
- if (geteuid() != 0) {
- dout(10) << "_check_disk_write_cache: not root, NOT checking disk write "
- << "cache on raw block device " << fn << dendl;
- goto done;
- }
-
- hdparm_cmd << "/sbin/hdparm -W " << fn;
- fp = popen(hdparm_cmd.str().c_str(), "r");
- if (!fp) {
- dout(10) << "_check_disk_write_cache: failed to run /sbin/hdparm: NOT "
- << "checking disk write cache on raw block device " << fn << dendl;
- goto done;
- }
-
- while (true) {
- char buf[256];
- memset(buf, 0, sizeof(buf));
- char *line = fgets(buf, sizeof(buf) - 1, fp);
- if (!line) {
- if (ferror(fp)) {
- int ret = -errno;
- derr << "_check_disk_write_cache: fgets error: " << cpp_strerror(ret)
- << dendl;
- goto close_f;
- }
- else {
- // EOF.
- break;
- }
- }
-
- int on;
- if (sscanf(line, " write-caching = %d", &on) != 1)
- continue;
- if (!on) {
- dout(10) << "_check_disk_write_cache: disk write cache is off (good) on "
- << fn << dendl;
- break;
- }
-
- // is our kernel new enough?
- int ver = get_linux_version();
- if (ver == 0) {
- dout(10) << "_check_disk_write_cache: get_linux_version failed" << dendl;
- } else if (ver >= KERNEL_VERSION(2, 6, 33)) {
- dout(20) << "_check_disk_write_cache: disk write cache is on, but your "
- << "kernel is new enough to handle it correctly. (fn:"
- << fn << ")" << dendl;
- break;
- }
- derr << TEXT_RED
- << " ** WARNING: disk write cache is ON on " << fn << ".\n"
- << " Journaling will not be reliable on kernels prior to 2.6.33\n"
- << " (recent kernels are safe). You can disable the write cache with\n"
- << " 'hdparm -W 0 " << fn << "'"
- << TEXT_NORMAL
- << dendl;
- break;
- }
-
-close_f:
- if (pclose(fp)) {
- int ret = -errno;
- derr << "_check_disk_write_cache: pclose failed: " << cpp_strerror(ret)
- << dendl;
- }
-done:
- ;
-}
-
-int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
- bool create)
-{
- int ret;
- int64_t conf_journal_sz(g_conf->osd_journal_size);
- conf_journal_sz <<= 20;
-
- if ((g_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) {
-    derr << "I'm sorry, I don't know how large a journal to create. "
- << "Please specify a block device to use as the journal OR "
- << "set osd_journal_size in your ceph.conf" << dendl;
- return -EINVAL;
- }
-
- if (create && (oldsize < conf_journal_sz)) {
- uint64_t newsize(g_conf->osd_journal_size);
- newsize <<= 20;
- dout(10) << "_open extending to " << newsize << " bytes" << dendl;
- ret = ::ftruncate(fd, newsize);
- if (ret < 0) {
- int err = errno;
- derr << "FileJournal::_open_file : unable to extend journal to "
- << newsize << " bytes: " << cpp_strerror(err) << dendl;
- return -err;
- }
-#ifdef HAVE_POSIX_FALLOCATE
- ret = ::posix_fallocate(fd, 0, newsize);
- if (ret) {
-      derr << "FileJournal::_open_file : unable to preallocate journal to"
- << newsize << " bytes: " << cpp_strerror(ret) << dendl;
- return -ret;
- }
- max_size = newsize;
-#elif defined(__APPLE__)
- fstore_t store;
- store.fst_flags = F_ALLOCATECONTIG;
- store.fst_posmode = F_PEOFPOSMODE;
- store.fst_offset = 0;
- store.fst_length = newsize;
-
- ret = ::fcntl(fd, F_PREALLOCATE, &store);
- if (ret == -1) {
- ret = -errno;
-      derr << "FileJournal::_open_file : unable to preallocate journal to"
- << newsize << " bytes: " << cpp_strerror(ret) << dendl;
- return ret;
- }
- max_size = newsize;
-#else
-# error "Journal pre-allocation not supported on platform."
-#endif
- }
- else {
- max_size = oldsize;
- }
- block_size = MAX(blksize, (blksize_t)CEPH_MINIMUM_BLOCK_SIZE);
-
- if (create && g_conf->journal_zero_on_create) {
- derr << "FileJournal::_open_file : zeroing journal" << dendl;
- uint64_t write_size = 1 << 20;
- char *buf;
- ret = ::posix_memalign((void **)&buf, block_size, write_size);
- if (ret != 0) {
- return -ret;
- }
- memset(static_cast<void*>(buf), 0, write_size);
- uint64_t i = 0;
- for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {
- ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
- if (ret < 0) {
- free(buf);
- return -errno;
- }
- }
- if (i < (uint64_t)max_size) {
- ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
- if (ret < 0) {
- free(buf);
- return -errno;
- }
- }
- free(buf);
- }
-
-
- dout(10) << "_open journal is not a block device, NOT checking disk "
- << "write cache on '" << fn << "'" << dendl;
-
- return 0;
-}
-
-// This cannot be used on an active journal
-int FileJournal::check()
-{
- int ret;
-
- assert(fd == -1);
- ret = _open(false, false);
- if (ret)
- return ret;
-
- ret = read_header(&header);
- if (ret < 0)
- goto done;
-
- if (header.fsid != fsid) {
- derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
- << ", invalid (someone else's?) journal" << dendl;
- ret = -EINVAL;
- goto done;
- }
-
- dout(1) << "check: header looks ok" << dendl;
- ret = 0;
-
- done:
- close();
- return ret;
-}
-
-
-int FileJournal::create()
-{
- void *buf = 0;
- int64_t needed_space;
- int ret;
- buffer::ptr bp;
- dout(2) << "create " << fn << " fsid " << fsid << dendl;
-
- ret = _open(true, true);
- if (ret)
- goto done;
-
- // write empty header
- header = header_t();
- header.flags = header_t::FLAG_CRC; // enable crcs on any new journal.
- header.fsid = fsid;
- header.max_size = max_size;
- header.block_size = block_size;
- if (g_conf->journal_block_align || directio)
- header.alignment = block_size;
- else
- header.alignment = 16; // at least stay word aligned on 64bit machines...
-
- header.start = get_top();
- header.start_seq = 0;
-
- print_header(header);
-
- // static zeroed buffer for alignment padding
- delete [] zero_buf;
- zero_buf = new char[header.alignment];
- memset(zero_buf, 0, header.alignment);
-
- bp = prepare_header();
- if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) {
- ret = -errno;
- derr << "FileJournal::create : create write header error "
- << cpp_strerror(ret) << dendl;
- goto close_fd;
- }
-
- // zero first little bit, too.
- ret = posix_memalign(&buf, block_size, block_size);
- if (ret) {
- ret = -ret;
- derr << "FileJournal::create: failed to allocate " << block_size
- << " bytes of memory: " << cpp_strerror(ret) << dendl;
- goto close_fd;
- }
- memset(buf, 0, block_size);
- if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) {
- ret = -errno;
- derr << "FileJournal::create: error zeroing first " << block_size
- << " bytes " << cpp_strerror(ret) << dendl;
- goto free_buf;
- }
-
- needed_space = ((int64_t)g_conf->osd_max_write_size) << 20;
- needed_space += (2 * sizeof(entry_header_t)) + get_top();
- if (header.max_size - header.start < needed_space) {
- derr << "FileJournal::create: OSD journal is not large enough to hold "
- << "osd_max_write_size bytes!" << dendl;
- ret = -ENOSPC;
- goto free_buf;
- }
-
- dout(2) << "create done" << dendl;
- ret = 0;
-
-free_buf:
- free(buf);
- buf = 0;
-close_fd:
- if (TEMP_FAILURE_RETRY(::close(fd)) < 0) {
- ret = -errno;
- derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret)
- << dendl;
- }
-done:
- fd = -1;
- return ret;
-}
-
-// This cannot be used on an active journal
-int FileJournal::peek_fsid(uuid_d& fsid)
-{
- assert(fd == -1);
- int r = _open(false, false);
- if (r)
- return r;
- r = read_header(&header);
- if (r < 0)
- goto out;
- fsid = header.fsid;
-out:
- close();
- return r;
-}
-
-int FileJournal::open(uint64_t fs_op_seq)
-{
- dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl;
-
- uint64_t next_seq = fs_op_seq + 1;
-
- int err = _open(false);
- if (err)
- return err;
-
- // assume writeable, unless...
- read_pos = 0;
- write_pos = get_top();
-
- // read header?
- err = read_header(&header);
- if (err < 0)
- return err;
-
- // static zeroed buffer for alignment padding
- delete [] zero_buf;
- zero_buf = new char[header.alignment];
- memset(zero_buf, 0, header.alignment);
-
- dout(10) << "open header.fsid = " << header.fsid
- //<< " vs expected fsid = " << fsid
- << dendl;
- if (header.fsid != fsid) {
- derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
- << ", invalid (someone else's?) journal" << dendl;
- return -EINVAL;
- }
- if (header.max_size > max_size) {
- dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl;
- return -EINVAL;
- }
- if (header.block_size != block_size) {
- dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl;
- return -EINVAL;
- }
- if (header.max_size % header.block_size) {
- dout(2) << "open journal max size " << header.max_size
- << " not a multiple of block size " << header.block_size << dendl;
- return -EINVAL;
- }
- if (header.alignment != block_size && directio) {
- dout(0) << "open journal alignment " << header.alignment << " does not match block size "
- << block_size << " (required for direct_io journal mode)" << dendl;
- return -EINVAL;
- }
- if ((header.alignment % CEPH_MINIMUM_BLOCK_SIZE) && directio) {
- dout(0) << "open journal alignment " << header.alignment << " is not multiple of minimum block size "
- << CEPH_MINIMUM_BLOCK_SIZE << " (required for direct_io journal mode)" << dendl;
- return -EINVAL;
- }
-
- // looks like a valid header.
- write_pos = 0; // not writeable yet
-
- journaled_seq = header.committed_up_to;
-
- // find next entry
- read_pos = header.start;
- uint64_t seq = header.start_seq;
-
- // last_committed_seq is 1 before the start of the journal or
- // 0 if the start is 0
- last_committed_seq = seq > 0 ? seq - 1 : seq;
- if (last_committed_seq < fs_op_seq) {
- dout(2) << "open advancing committed_seq " << last_committed_seq
- << " to fs op_seq " << fs_op_seq << dendl;
- last_committed_seq = fs_op_seq;
- }
-
- while (1) {
- bufferlist bl;
- off64_t old_pos = read_pos;
- if (!read_entry(bl, seq)) {
- dout(10) << "open reached end of journal." << dendl;
- break;
- }
- if (seq > next_seq) {
- dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq
- << ", ignoring journal contents"
- << dendl;
- read_pos = -1;
- last_committed_seq = 0;
- seq = 0;
- return 0;
- }
- if (seq == next_seq) {
- dout(10) << "open reached seq " << seq << dendl;
- read_pos = old_pos;
- break;
- }
- seq++; // next event should follow.
- }
-
- return 0;
-}
-
-void FileJournal::_close(int fd) const
-{
- VOID_TEMP_FAILURE_RETRY(::close(fd));
-}
-
-void FileJournal::close()
-{
- dout(1) << "close " << fn << dendl;
-
- // stop writer thread
- stop_writer();
-
- // close
- assert(writeq_empty());
- assert(!must_write_header);
- assert(fd >= 0);
- _close(fd);
- fd = -1;
-}
-
-
-int FileJournal::dump(ostream& out)
-{
- return _dump(out, false);
-}
-
-int FileJournal::simple_dump(ostream& out)
-{
- return _dump(out, true);
-}
-
-int FileJournal::_dump(ostream& out, bool simple)
-{
- JSONFormatter f(true);
- int ret = _fdump(f, simple);
- f.flush(out);
- return ret;
-}
-
-int FileJournal::_fdump(Formatter &f, bool simple)
-{
- dout(10) << "_fdump" << dendl;
-
- assert(fd == -1);
- int err = _open(false, false);
- if (err)
- return err;
-
- err = read_header(&header);
- if (err < 0) {
- close();
- return err;
- }
-
- off64_t next_pos = header.start;
-
- f.open_object_section("journal");
-
- f.open_object_section("header");
- f.dump_unsigned("flags", header.flags);
- ostringstream os;
- os << header.fsid;
- f.dump_string("fsid", os.str());
- f.dump_unsigned("block_size", header.block_size);
- f.dump_unsigned("alignment", header.alignment);
- f.dump_int("max_size", header.max_size);
- f.dump_int("start", header.start);
- f.dump_unsigned("committed_up_to", header.committed_up_to);
- f.dump_unsigned("start_seq", header.start_seq);
- f.close_section();
-
- f.open_array_section("entries");
- uint64_t seq = header.start_seq;
- while (1) {
- bufferlist bl;
- off64_t pos = next_pos;
-
- if (!pos) {
- dout(2) << "_dump -- not readable" << dendl;
- err = -EINVAL;
- break;
- }
- stringstream ss;
- read_entry_result result = do_read_entry(
- pos,
- &next_pos,
- &bl,
- &seq,
- &ss);
- if (result != SUCCESS) {
- if (seq < header.committed_up_to) {
- dout(2) << "Unable to read past sequence " << seq
- << " but header indicates the journal has committed up through "
- << header.committed_up_to << ", journal is corrupt" << dendl;
- err = -EINVAL;
- }
- dout(25) << ss.str() << dendl;
- dout(25) << "No further valid entries found, journal is most likely valid"
- << dendl;
- break;
- }
-
- f.open_object_section("entry");
- f.dump_unsigned("offset", pos);
- f.dump_unsigned("seq", seq);
- if (simple) {
- f.dump_unsigned("bl.length", bl.length());
- } else {
- f.open_array_section("transactions");
- bufferlist::iterator p = bl.begin();
- int trans_num = 0;
- while (!p.end()) {
- ObjectStore::Transaction t(p);
- f.open_object_section("transaction");
- f.dump_unsigned("trans_num", trans_num);
- t.dump(&f);
- f.close_section();
- trans_num++;
- }
- f.close_section();
- }
- f.close_section();
- }
-
- f.close_section();
- f.close_section();
- dout(10) << "dump finish" << dendl;
-
- close();
- return err;
-}
-
-
-void FileJournal::start_writer()
-{
- write_stop = false;
- aio_stop = false;
- write_thread.create();
-#ifdef HAVE_LIBAIO
- if (aio)
- write_finish_thread.create();
-#endif
-}
-
-void FileJournal::stop_writer()
-{
- // Do nothing if writer already stopped or never started
- if (!write_stop)
- {
- {
- Mutex::Locker l(write_lock);
- Mutex::Locker p(writeq_lock);
- write_stop = true;
- writeq_cond.Signal();
- // Doesn't hurt to signal commit_cond in case thread is waiting there
- // and caller didn't use committed_thru() first.
- commit_cond.Signal();
- }
- write_thread.join();
-
- // write journal header now so that we have less to replay on remount
- write_header_sync();
- }
-
-#ifdef HAVE_LIBAIO
-  // stop aio completion thread *after* writer thread has stopped
- // and has submitted all of its io
- if (aio && !aio_stop) {
- aio_lock.Lock();
- aio_stop = true;
- aio_cond.Signal();
- write_finish_cond.Signal();
- aio_lock.Unlock();
- write_finish_thread.join();
- }
-#endif
-}
-
-
-
-void FileJournal::print_header(const header_t &header) const
-{
- dout(10) << "header: block_size " << header.block_size
- << " alignment " << header.alignment
- << " max_size " << header.max_size
- << dendl;
- dout(10) << "header: start " << header.start << dendl;
- dout(10) << " write_pos " << write_pos << dendl;
-}
-
-int FileJournal::read_header(header_t *hdr) const
-{
- dout(10) << "read_header" << dendl;
- bufferlist bl;
-
- buffer::ptr bp = buffer::create_page_aligned(block_size);
- char* bpdata = bp.c_str();
- int r = ::pread(fd, bpdata, bp.length(), 0);
-
- if (r < 0) {
- int err = errno;
- dout(0) << "read_header got " << cpp_strerror(err) << dendl;
- return -err;
- }
-
- // don't use bp.zero() here, because it also invalidates
- // crc cache (which is not yet populated anyway)
- if (bp.length() != (size_t)r) {
-    // r will always be less than or equal to bp.length()
- bpdata += r;
- memset(bpdata, 0, bp.length() - r);
- }
-
- bl.push_back(bp);
-
- try {
- bufferlist::iterator p = bl.begin();
- ::decode(*hdr, p);
- }
- catch (buffer::error& e) {
- derr << "read_header error decoding journal header" << dendl;
- return -EINVAL;
- }
-
-
- /*
- * Unfortunately we weren't initializing the flags field for new
- * journals! Aie. This is safe(ish) now that we have only one
-   * flag. We will probably need to remove this check around the time we
-   * add the next flag, or else this (by then old) code will clobber the
-   * newer code's flags.
- */
- if (hdr->flags > 3) {
- derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
- hdr->flags = 0;
- }
-
- print_header(*hdr);
-
- return 0;
-}
-
-bufferptr FileJournal::prepare_header()
-{
- bufferlist bl;
- {
- Mutex::Locker l(finisher_lock);
- header.committed_up_to = journaled_seq;
- }
- ::encode(header, bl);
- bufferptr bp = buffer::create_page_aligned(get_top());
- // don't use bp.zero() here, because it also invalidates
- // crc cache (which is not yet populated anyway)
- char* data = bp.c_str();
- memcpy(data, bl.c_str(), bl.length());
- data += bl.length();
- memset(data, 0, bp.length()-bl.length());
- return bp;
-}
-
-void FileJournal::write_header_sync()
-{
- Mutex::Locker locker(write_lock);
- must_write_header = true;
- bufferlist bl;
- do_write(bl);
- dout(20) << __func__ << " finish" << dendl;
-}
-
-int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size)
-{
- // already full?
- if (full_state != FULL_NOTFULL)
- return -ENOSPC;
-
- // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL.
- off64_t room;
- if (pos >= header.start)
- room = (header.max_size - pos) + (header.start - get_top()) - 1;
- else
- room = header.start - pos - 1;
- dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start
- << " top " << get_top() << dendl;
-
- if (do_sync_cond) {
- if (room >= (header.max_size >> 1) &&
- room - size < (header.max_size >> 1)) {
- dout(10) << " passing half full mark, triggering commit" << dendl;
- do_sync_cond->SloppySignal(); // initiate a real commit so we can trim
- }
- }
-
- if (room >= size) {
- dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl;
- if (pos + size > header.max_size)
- must_write_header = true;
- return 0;
- }
-
- // full
- dout(1) << "check_for_full at " << pos << " : JOURNAL FULL "
- << pos << " >= " << room
- << " (max_size " << header.max_size << " start " << header.start << ")"
- << dendl;
-
- off64_t max = header.max_size - get_top();
- if (size > max)
- dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl;
-
- return -ENOSPC;
-}
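-
-/*
- * Worked example of the free-space computation in check_for_full(), with
- * made-up numbers (get_top() = 4096, header.max_size = 1048576,
- * header.start = 524288):
- *
- *   pos = 700000 (>= start): room = (1048576 - 700000) + (524288 - 4096) - 1
- *                                 = 868767
- *   pos = 100000 (<  start): room = 524288 - 100000 - 1 = 424287
- *
- * The trailing "- 1" keeps a completely full journal from advancing pos all
- * the way back to header.start, so pos == header.start always means empty.
- */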
-
-int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes)
-{
- // gather queued writes
- off64_t queue_pos = write_pos;
-
- int eleft = g_conf->journal_max_write_entries;
- unsigned bmax = g_conf->journal_max_write_bytes;
-
- if (full_state != FULL_NOTFULL)
- return -ENOSPC;
-
- while (!writeq_empty()) {
- list<write_item> items;
- batch_pop_write(items);
- list<write_item>::iterator it = items.begin();
- while (it != items.end()) {
- int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes);
- if (r == 0) { // prepare ok, delete it
- items.erase(it++);
- }
- if (r == -ENOSPC) {
-        // the journal may be full; put the remaining items back on the writeq
- batch_unpop_write(items);
- if (orig_ops)
- goto out; // commit what we have
-
- if (logger)
- logger->inc(l_os_j_full);
-
- if (wait_on_full) {
- dout(20) << "prepare_multi_write full on first entry, need to wait" << dendl;
- } else {
- dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl;
-
- // throw out what we have so far
- full_state = FULL_FULL;
- while (!writeq_empty()) {
- put_throttle(1, peek_write().orig_len);
- pop_write();
- }
- print_header(header);
- }
-
- return -ENOSPC; // hrm, full on first op
- }
- if (eleft) {
- if (--eleft == 0) {
- dout(20) << "prepare_multi_write hit max events per write " << g_conf->journal_max_write_entries << dendl;
- batch_unpop_write(items);
- goto out;
- }
- }
- if (bmax) {
- if (bl.length() >= bmax) {
- dout(20) << "prepare_multi_write hit max write size " << g_conf->journal_max_write_bytes << dendl;
- batch_unpop_write(items);
- goto out;
- }
- }
- }
- }
-
-out:
- dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl;
- assert((write_pos + bl.length() == queue_pos) ||
- (write_pos + bl.length() - header.max_size + get_top() == queue_pos));
- return 0;
-}
-
-/*
-void FileJournal::queue_write_fin(uint64_t seq, Context *fin)
-{
- writing_seq.push_back(seq);
- if (!waiting_for_notfull.empty()) {
- // make sure previously unjournaled stuff waiting for UNFULL triggers
- // _before_ newly journaled stuff does
- dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin
- << " until after UNFULL" << dendl;
- C_Gather *g = new C_Gather(writeq.front().fin);
- writing_fin.push_back(g->new_sub());
- waiting_for_notfull.push_back(g->new_sub());
- } else {
- writing_fin.push_back(writeq.front().fin);
- dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl;
- }
-}
-*/
-
-void FileJournal::queue_completions_thru(uint64_t seq)
-{
- assert(finisher_lock.is_locked());
- utime_t now = ceph_clock_now(g_ceph_context);
- list<completion_item> items;
- batch_pop_completions(items);
- list<completion_item>::iterator it = items.begin();
- while (it != items.end()) {
- completion_item& next = *it;
- if (next.seq > seq)
- break;
- utime_t lat = now;
- lat -= next.start;
- dout(10) << "queue_completions_thru seq " << seq
- << " queueing seq " << next.seq
- << " " << next.finish
- << " lat " << lat << dendl;
- if (logger) {
- logger->tinc(l_os_j_lat, lat);
- }
- if (next.finish)
- finisher->queue(next.finish);
- if (next.tracked_op)
- next.tracked_op->mark_event("journaled_completion_queued");
- items.erase(it++);
- }
- batch_unpop_completions(items);
- finisher_cond.Signal();
-}
-
-
-int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes)
-{
- uint64_t seq = next_write.seq;
- bufferlist &ebl = next_write.bl;
- off64_t size = ebl.length();
-
- int r = check_for_full(seq, queue_pos, size);
- if (r < 0)
- return r; // ENOSPC or EAGAIN
-
- uint32_t orig_len = next_write.orig_len;
- orig_bytes += orig_len;
- orig_ops++;
-
- // add to write buffer
- dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq
- << " len " << orig_len << " -> " << size << dendl;
-
- unsigned seq_offset = offsetof(entry_header_t, seq);
- unsigned magic1_offset = offsetof(entry_header_t, magic1);
- unsigned magic2_offset = offsetof(entry_header_t, magic2);
-
- bufferptr headerptr = ebl.buffers().front();
- uint64_t _seq = seq;
- uint64_t _queue_pos = queue_pos;
- uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64());
- headerptr.copy_in(seq_offset, sizeof(uint64_t), (char *)&_seq);
- headerptr.copy_in(magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
- headerptr.copy_in(magic2_offset, sizeof(uint64_t), (char *)&magic2);
-
- bufferptr footerptr = ebl.buffers().back();
- unsigned post_offset = footerptr.length() - sizeof(entry_header_t);
- footerptr.copy_in(post_offset + seq_offset, sizeof(uint64_t), (char *)&_seq);
- footerptr.copy_in(post_offset + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
- footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2);
-
- bl.claim_append(ebl);
- if (next_write.tracked_op)
- next_write.tracked_op->mark_event("write_thread_in_journal_buffer");
-
- journalq.push_back(pair<uint64_t,off64_t>(seq, queue_pos));
- writing_seq = seq;
-
- queue_pos += size;
- if (queue_pos >= header.max_size)
- queue_pos = queue_pos + get_top() - header.max_size;
-
- return 0;
-}
-
-void FileJournal::align_bl(off64_t pos, bufferlist& bl)
-{
- // make sure list segments are page aligned
- if (directio && (!bl.is_aligned(block_size) ||
- !bl.is_n_align_sized(CEPH_MINIMUM_BLOCK_SIZE))) {
-    assert(0 == "bl should be aligned");
- if ((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0 ||
- (pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0)
- dout(0) << "rebuild_page_aligned failed, " << bl << dendl;
- assert((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0);
- assert((pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0);
- }
-}
-
-int FileJournal::write_bl(off64_t& pos, bufferlist& bl)
-{
- int ret;
-
- off64_t spos = ::lseek64(fd, pos, SEEK_SET);
- if (spos < 0) {
- ret = -errno;
- derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(ret) << dendl;
- return ret;
- }
- ret = bl.write_fd(fd);
- if (ret) {
- derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(ret) << dendl;
- return ret;
- }
- pos += bl.length();
- if (pos == header.max_size)
- pos = get_top();
- return 0;
-}
-
-void FileJournal::do_write(bufferlist& bl)
-{
- // nothing to do?
- if (bl.length() == 0 && !must_write_header)
- return;
-
- buffer::ptr hbp;
- if (g_conf->journal_write_header_frequency &&
- (((++journaled_since_start) %
- g_conf->journal_write_header_frequency) == 0)) {
- must_write_header = true;
- }
-
- if (must_write_header) {
- must_write_header = false;
- hbp = prepare_header();
- }
-
- dout(15) << "do_write writing " << write_pos << "~" << bl.length()
- << (hbp.length() ? " + header":"")
- << dendl;
-
- utime_t from = ceph_clock_now(g_ceph_context);
-
- // entry
- off64_t pos = write_pos;
-
- // Adjust write_pos
- align_bl(pos, bl);
- write_pos += bl.length();
- if (write_pos >= header.max_size)
- write_pos = write_pos - header.max_size + get_top();
-
- write_lock.Unlock();
-
- // split?
- off64_t split = 0;
- if (pos + bl.length() > header.max_size) {
- bufferlist first, second;
- split = header.max_size - pos;
- first.substr_of(bl, 0, split);
- second.substr_of(bl, split, bl.length() - split);
- assert(first.length() + second.length() == bl.length());
- dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length()
- << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl;
-
-    // Save pos so we can write the first piece second
- off64_t first_pos = pos;
- off64_t orig_pos;
- pos = get_top();
- // header too?
- if (hbp.length()) {
- // be sneaky: include the header in the second fragment
- second.push_front(hbp);
- pos = 0; // we included the header
- }
-    // Write the second portion first, possibly including the header, so
- // do_read_entry() won't even get a valid entry_header_t if there
- // is a crash between the two writes.
- orig_pos = pos;
- if (write_bl(pos, second)) {
- derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
- << ") failed" << dendl;
- ceph_abort();
- }
- orig_pos = first_pos;
- if (write_bl(first_pos, first)) {
- derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
- << ") failed" << dendl;
- ceph_abort();
- }
- assert(first_pos == get_top());
- } else {
- // header too?
- if (hbp.length()) {
- if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) {
- int err = errno;
- derr << "FileJournal::do_write: pwrite(fd=" << fd
- << ", hbp.length=" << hbp.length() << ") failed :"
- << cpp_strerror(err) << dendl;
- ceph_abort();
- }
- }
-
- if (write_bl(pos, bl)) {
- derr << "FileJournal::do_write: write_bl(pos=" << pos
- << ") failed" << dendl;
- ceph_abort();
- }
- }
-
- if (!directio) {
- dout(20) << "do_write fsync" << dendl;
-
- /*
- * We'd really love to have a fsync_range or fdatasync_range and do a:
- *
- * if (split) {
- * ::fsync_range(fd, header.max_size - split, split)l
- * ::fsync_range(fd, get_top(), bl.length() - split);
- * else
- * ::fsync_range(fd, write_pos, bl.length())
- *
- * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be
-     * too hard given all the underlying infrastructure already exists.
- *
- * NOTE: using sync_file_range here would not be safe as it does not
-     * flush disk caches or commit any sort of metadata.
- */
- int ret = 0;
-#if defined(DARWIN) || defined(__FreeBSD__)
- ret = ::fsync(fd);
-#else
- ret = ::fdatasync(fd);
-#endif
- if (ret < 0) {
- derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl;
- ceph_abort();
- }
-#ifdef HAVE_POSIX_FADVISE
- if (g_conf->filestore_fadvise)
- posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
-#endif
- }
-
- utime_t lat = ceph_clock_now(g_ceph_context) - from;
- dout(20) << "do_write latency " << lat << dendl;
-
- write_lock.Lock();
-
- assert(write_pos == pos);
- assert(write_pos % header.alignment == 0);
-
- {
- Mutex::Locker locker(finisher_lock);
- journaled_seq = writing_seq;
-
- // kick finisher?
- // only if we haven't filled up recently!
- if (full_state != FULL_NOTFULL) {
- dout(10) << "do_write NOT queueing finisher seq " << journaled_seq
- << ", full_commit_seq|full_restart_seq" << dendl;
- } else {
- if (plug_journal_completions) {
- dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq
- << " due to completion plug" << dendl;
- } else {
- dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl;
- queue_completions_thru(journaled_seq);
- }
- }
- }
-}
-
-void FileJournal::flush()
-{
- dout(10) << "waiting for completions to empty" << dendl;
- {
- Mutex::Locker l(finisher_lock);
- while (!completions_empty())
- finisher_cond.Wait(finisher_lock);
- }
- dout(10) << "flush waiting for finisher" << dendl;
- finisher->wait_for_empty();
- dout(10) << "flush done" << dendl;
-}
-
-
-void FileJournal::write_thread_entry()
-{
- dout(10) << "write_thread_entry start" << dendl;
- while (1) {
- {
- Mutex::Locker locker(writeq_lock);
- if (writeq.empty() && !must_write_header) {
- if (write_stop)
- break;
- dout(20) << "write_thread_entry going to sleep" << dendl;
- writeq_cond.Wait(writeq_lock);
- dout(20) << "write_thread_entry woke up" << dendl;
- continue;
- }
- }
-
-#ifdef HAVE_LIBAIO
- if (aio) {
- Mutex::Locker locker(aio_lock);
- // should we back off to limit aios in flight? try to do this
- // adaptively so that we submit larger aios once we have lots of
- // them in flight.
- //
- // NOTE: our condition here is based on aio_num (protected by
- // aio_lock) and throttle_bytes (part of the write queue). when
- // we sleep, we *only* wait for aio_num to change, and do not
- // wake when more data is queued. this is not strictly correct,
- // but should be fine given that we will have plenty of aios in
- // flight if we hit this limit to ensure we keep the device
- // saturated.
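-      //
-      // illustrative numbers: with 4 aios in flight, exp = 8 and
-      // min_new = 256 bytes, so new writes are submitted almost
-      // immediately; at 12 or more in flight, exp caps at 24 and
-      // min_new = 16 MiB, so small trailing writes wait for completions
-      // rather than issuing many tiny aios.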
- while (aio_num > 0) {
- int exp = MIN(aio_num * 2, 24);
- long unsigned min_new = 1ull << exp;
- long unsigned cur = throttle_bytes.get_current();
- dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes
- << " ... exp " << exp << " min_new " << min_new
- << " ... pending " << cur << dendl;
- if (cur >= min_new)
- break;
- dout(20) << "write_thread_entry deferring until more aios complete: "
- << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new
- << " bytes to start a new aio (currently " << cur << " pending)" << dendl;
- aio_cond.Wait(aio_lock);
- dout(20) << "write_thread_entry woke up" << dendl;
- }
- }
-#endif
-
- Mutex::Locker locker(write_lock);
- uint64_t orig_ops = 0;
- uint64_t orig_bytes = 0;
-
- bufferlist bl;
- int r = prepare_multi_write(bl, orig_ops, orig_bytes);
-    // Don't care about journal full if stopping, so drop queue and
- // possibly let header get written and loop above to notice stop
- if (r == -ENOSPC) {
- if (write_stop) {
- dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl;
- while (!writeq_empty()) {
- put_throttle(1, peek_write().orig_len);
- pop_write();
- }
- print_header(header);
- r = 0;
- } else {
- dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl;
- commit_cond.Wait(write_lock);
- dout(20) << "write_thread_entry woke up" << dendl;
- continue;
- }
- }
- assert(r == 0);
-
- if (logger) {
- logger->inc(l_os_j_wr);
- logger->inc(l_os_j_wr_bytes, bl.length());
- }
-
-#ifdef HAVE_LIBAIO
- if (aio)
- do_aio_write(bl);
- else
- do_write(bl);
-#else
- do_write(bl);
-#endif
- put_throttle(orig_ops, orig_bytes);
- }
-
- dout(10) << "write_thread_entry finish" << dendl;
-}
-
-#ifdef HAVE_LIBAIO
-void FileJournal::do_aio_write(bufferlist& bl)
-{
-
- if (g_conf->journal_write_header_frequency &&
- (((++journaled_since_start) %
- g_conf->journal_write_header_frequency) == 0)) {
- must_write_header = true;
- }
-
- // nothing to do?
- if (bl.length() == 0 && !must_write_header)
- return;
-
- buffer::ptr hbp;
- if (must_write_header) {
- must_write_header = false;
- hbp = prepare_header();
- }
-
- // entry
- off64_t pos = write_pos;
-
- dout(15) << "do_aio_write writing " << pos << "~" << bl.length()
- << (hbp.length() ? " + header":"")
- << dendl;
-
- // split?
- off64_t split = 0;
- if (pos + bl.length() > header.max_size) {
- bufferlist first, second;
- split = header.max_size - pos;
- first.substr_of(bl, 0, split);
- second.substr_of(bl, split, bl.length() - split);
- assert(first.length() + second.length() == bl.length());
- dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl;
-
- if (write_aio_bl(pos, first, 0)) {
- derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
- << ") failed" << dendl;
- ceph_abort();
- }
- assert(pos == header.max_size);
- if (hbp.length()) {
- // be sneaky: include the header in the second fragment
- second.push_front(hbp);
- pos = 0; // we included the header
- } else
- pos = get_top(); // no header, start after that
- if (write_aio_bl(pos, second, writing_seq)) {
- derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
- << ") failed" << dendl;
- ceph_abort();
- }
- } else {
- // header too?
- if (hbp.length()) {
- bufferlist hbl;
- hbl.push_back(hbp);
- loff_t pos = 0;
- if (write_aio_bl(pos, hbl, 0)) {
- derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl;
- ceph_abort();
- }
- }
-
- if (write_aio_bl(pos, bl, writing_seq)) {
- derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
- << ") failed" << dendl;
- ceph_abort();
- }
- }
-
- write_pos = pos;
- if (write_pos == header.max_size)
- write_pos = get_top();
- assert(write_pos % header.alignment == 0);
-}
-
-/**
- * write a buffer using aio
- *
- * @param seq seq to trigger when this aio completes. if 0, do not update any state
- * on completion.
- */
-int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
-{
- align_bl(pos, bl);
-
- dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl;
-
- while (bl.length() > 0) {
- int max = MIN(bl.buffers().size(), IOV_MAX-1);
- iovec *iov = new iovec[max];
- int n = 0;
- unsigned len = 0;
- for (std::list<buffer::ptr>::const_iterator p = bl.buffers().begin();
- n < max;
- ++p, ++n) {
- assert(p != bl.buffers().end());
- iov[n].iov_base = (void *)p->c_str();
- iov[n].iov_len = p->length();
- len += p->length();
- }
-
- bufferlist tbl;
- bl.splice(0, len, &tbl); // move bytes from bl -> tbl
-
- // lock only aio_queue, current aio, aio_num, aio_bytes, which may be
- // modified in check_aio_completion
- aio_lock.Lock();
- aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq));
- aio_info& aio = aio_queue.back();
- aio.iov = iov;
-
- io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos);
-
- dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len
- << " in " << n << dendl;
-
- aio_num++;
- aio_bytes += aio.len;
-
- // need to save current aio len to update write_pos later because current
-    // aio could be erased from aio_queue once it is done
- uint64_t cur_len = aio.len;
- // unlock aio_lock because following io_submit might take time to return
- aio_lock.Unlock();
-
- iocb *piocb = &aio.iocb;
- int attempts = 10;
- do {
- int r = io_submit(aio_ctx, 1, &piocb);
- dout(20) << "write_aio_bl io_submit return value: " << r << dendl;
- if (r < 0) {
- derr << "io_submit to " << aio.off << "~" << cur_len
- << " got " << cpp_strerror(r) << dendl;
- if (r == -EAGAIN && attempts-- > 0) {
- usleep(500);
- continue;
- }
- assert(0 == "io_submit got unexpected error");
- } else {
- break;
- }
- } while (true);
- pos += cur_len;
- }
- aio_lock.Lock();
- write_finish_cond.Signal();
- aio_lock.Unlock();
- return 0;
-}
-#endif
-
-void FileJournal::write_finish_thread_entry()
-{
-#ifdef HAVE_LIBAIO
- dout(10) << "write_finish_thread_entry enter" << dendl;
- while (true) {
- {
- Mutex::Locker locker(aio_lock);
- if (aio_queue.empty()) {
- if (aio_stop)
- break;
- dout(20) << "write_finish_thread_entry sleeping" << dendl;
- write_finish_cond.Wait(aio_lock);
- continue;
- }
- }
-
- dout(20) << "write_finish_thread_entry waiting for aio(s)" << dendl;
- io_event event[16];
- int r = io_getevents(aio_ctx, 1, 16, event, NULL);
- if (r < 0) {
- if (r == -EINTR) {
- dout(0) << "io_getevents got " << cpp_strerror(r) << dendl;
- continue;
- }
- derr << "io_getevents got " << cpp_strerror(r) << dendl;
- assert(0 == "got unexpected error from io_getevents");
- }
-
- {
- Mutex::Locker locker(aio_lock);
- for (int i=0; i<r; i++) {
- aio_info *ai = (aio_info *)event[i].obj;
- if (event[i].res != ai->len) {
- derr << "aio to " << ai->off << "~" << ai->len
- << " wrote " << event[i].res << dendl;
- assert(0 == "unexpected aio error");
- }
- dout(10) << "write_finish_thread_entry aio " << ai->off
- << "~" << ai->len << " done" << dendl;
- ai->done = true;
- }
- check_aio_completion();
- }
- }
- dout(10) << "write_finish_thread_entry exit" << dendl;
-#endif
-}
-
-#ifdef HAVE_LIBAIO
-/**
- * check aio_wait for completed aio, and update state appropriately.
- */
-void FileJournal::check_aio_completion()
-{
- assert(aio_lock.is_locked());
- dout(20) << "check_aio_completion" << dendl;
-
- bool completed_something = false, signal = false;
- uint64_t new_journaled_seq = 0;
-
- list<aio_info>::iterator p = aio_queue.begin();
- while (p != aio_queue.end() && p->done) {
- dout(20) << "check_aio_completion completed seq " << p->seq << " "
- << p->off << "~" << p->len << dendl;
- if (p->seq) {
- new_journaled_seq = p->seq;
- completed_something = true;
- }
- aio_num--;
- aio_bytes -= p->len;
- aio_queue.erase(p++);
- signal = true;
- }
-
- if (completed_something) {
- // kick finisher?
- // only if we haven't filled up recently!
- Mutex::Locker locker(finisher_lock);
- journaled_seq = new_journaled_seq;
- if (full_state != FULL_NOTFULL) {
- dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq
- << ", full_commit_seq|full_restart_seq" << dendl;
- } else {
- if (plug_journal_completions) {
- dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq
- << " due to completion plug" << dendl;
- } else {
- dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl;
- queue_completions_thru(journaled_seq);
- }
- }
- }
- if (signal) {
- // maybe write queue was waiting for aio count to drop?
- aio_cond.Signal();
- }
-}
-#endif
-
-int FileJournal::prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl) {
- dout(10) << "prepare_entry " << tls << dendl;
- unsigned data_len = 0;
- int data_align = -1; // -1 indicates that we don't care about the alignment
- bufferlist bl;
- for (list<ObjectStore::Transaction*>::iterator p = tls.begin();
- p != tls.end(); ++p) {
- ObjectStore::Transaction *t = *p;
- if (t->get_data_length() > data_len &&
- (int)t->get_data_length() >= g_conf->journal_align_min_size) {
- data_len = t->get_data_length();
- data_align = (t->get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK;
- }
- ::encode(*t, bl);
- }
- if (tbl->length()) {
- bl.claim_append(*tbl);
- }
-  // wrap this entry with a header and footer
- entry_header_t h;
- unsigned head_size = sizeof(entry_header_t);
- off64_t base_size = 2*head_size + bl.length();
- memset(&h, 0, sizeof(h));
- if (data_align >= 0)
- h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK;
- off64_t size = ROUND_UP_TO(base_size + h.pre_pad, header.alignment);
- unsigned post_pad = size - base_size - h.pre_pad;
- h.len = bl.length();
- h.post_pad = post_pad;
- h.crc32c = bl.crc32c(0);
- dout(10) << " len " << bl.length() << " -> " << size
- << " (head " << head_size << " pre_pad " << h.pre_pad
- << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")"
- << " (bl alignment " << data_align << ")"
- << dendl;
- bufferlist ebl;
- // header
- ebl.append((const char*)&h, sizeof(h));
- if (h.pre_pad) {
- ebl.push_back(buffer::create_static(h.pre_pad, zero_buf));
- }
- // payload
- ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
- if (h.post_pad) {
- ebl.push_back(buffer::create_static(h.post_pad, zero_buf));
- }
- // footer
- ebl.append((const char*)&h, sizeof(h));
- ebl.rebuild_aligned(CEPH_MINIMUM_BLOCK_SIZE);
- tbl->claim(ebl);
- return h.len;
-}
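-
-// Worked example (illustrative; the numbers are hypothetical and assume
-// CEPH_PAGE_SIZE == 4096, so ~CEPH_PAGE_MASK keeps the low 12 bits): with
-// head_size = 40, a 10000-byte payload, data_align = 512 and
-// header.alignment = 4096, the arithmetic above gives
-//
-//   h.pre_pad  = (512 - 40) & 4095               = 472
-//   base_size  = 2*40 + 10000                    = 10080
-//   size       = ROUND_UP_TO(10080 + 472, 4096)  = 12288
-//   h.post_pad = 12288 - 10080 - 472             = 1736
-//
-// so the payload starts head_size + pre_pad = 512 bytes into the entry
-// (i.e. at the requested alignment modulo the page size) and the entry as a
-// whole -- header | pre_pad | payload | post_pad | footer -- is padded out
-// to a multiple of header.alignment.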
-
-void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
- Context *oncommit, TrackedOpRef osd_op)
-{
- // dump on queue
- dout(5) << "submit_entry seq " << seq
- << " len " << e.length()
- << " (" << oncommit << ")" << dendl;
- assert(e.length() > 0);
-
- throttle_ops.take(1);
- throttle_bytes.take(orig_len);
- if (osd_op)
- osd_op->mark_event("commit_queued_for_journal_write");
- if (logger) {
- logger->set(l_os_jq_max_ops, throttle_ops.get_max());
- logger->set(l_os_jq_max_bytes, throttle_bytes.get_max());
- logger->set(l_os_jq_ops, throttle_ops.get_current());
- logger->set(l_os_jq_bytes, throttle_bytes.get_current());
- }
-
- {
- Mutex::Locker l1(writeq_lock); // ** lock **
- Mutex::Locker l2(completions_lock); // ** lock **
- completions.push_back(
- completion_item(
- seq, oncommit, ceph_clock_now(g_ceph_context), osd_op));
- if (writeq.empty())
- writeq_cond.Signal();
- writeq.push_back(write_item(seq, e, orig_len, osd_op));
- }
-}
-
-bool FileJournal::writeq_empty()
-{
- Mutex::Locker locker(writeq_lock);
- return writeq.empty();
-}
-
-FileJournal::write_item &FileJournal::peek_write()
-{
- assert(write_lock.is_locked());
- Mutex::Locker locker(writeq_lock);
- return writeq.front();
-}
-
-void FileJournal::pop_write()
-{
- assert(write_lock.is_locked());
- Mutex::Locker locker(writeq_lock);
- writeq.pop_front();
-}
-
-void FileJournal::batch_pop_write(list<write_item> &items)
-{
- assert(write_lock.is_locked());
- Mutex::Locker locker(writeq_lock);
- writeq.swap(items);
-}
-
-void FileJournal::batch_unpop_write(list<write_item> &items)
-{
- assert(write_lock.is_locked());
- Mutex::Locker locker(writeq_lock);
- writeq.splice(writeq.begin(), items);
-}
-
-void FileJournal::commit_start(uint64_t seq)
-{
- dout(10) << "commit_start" << dendl;
-
- // was full?
- switch (full_state) {
- case FULL_NOTFULL:
- break; // all good
-
- case FULL_FULL:
- if (seq >= journaled_seq) {
- dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq "
-	      << seq << " >= journaled_seq " << journaled_seq
- << ", moving to FULL_WAIT."
- << dendl;
- full_state = FULL_WAIT;
- } else {
- dout(1) << "FULL_FULL commit_start on seq "
- << seq << " < journaled_seq " << journaled_seq
- << ", remaining in FULL_FULL"
- << dendl;
- }
- break;
-
- case FULL_WAIT:
- dout(1) << " FULL_WAIT -> FULL_NOTFULL. journal now active, setting completion plug." << dendl;
- full_state = FULL_NOTFULL;
- plug_journal_completions = true;
- break;
- }
-}
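-
-// Illustrative state walk (a sketch, not from the original source): a journal
-// that fills up moves FULL_NOTFULL -> FULL_FULL when a write no longer fits
-// in the ring; the next commit_start() with seq >= journaled_seq moves it to
-// FULL_WAIT; and the commit_start() after that moves it back to FULL_NOTFULL
-// with plug_journal_completions set, so completions are held back until
-// committed_thru() sees seq >= header.start_seq and unplugs them.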
-
-/*
- * send a discard command to the journal block device
- */
-void FileJournal::do_discard(int64_t offset, int64_t end)
-{
-  dout(10) << __func__ << " trim(" << offset << ", " << end << ")" << dendl;
-
- offset = ROUND_UP_TO(offset, block_size);
- if (offset >= end)
- return;
- end = ROUND_UP_TO(end - block_size, block_size);
- assert(end >= offset);
- if (offset < end)
- if (block_device_discard(fd, offset, end - offset) < 0)
-      dout(1) << __func__ << " ioctl(BLKDISCARD) error: " << cpp_strerror(errno) << dendl;
-}
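-
-// Worked example (illustrative, assuming block_size == 4096): a request to
-// discard the byte range [100, 10000) becomes
-//
-//   offset = ROUND_UP_TO(100, 4096)          = 4096
-//   end    = ROUND_UP_TO(10000 - 4096, 4096) = 8192
-//
-// so only the fully covered block [4096, 8192) is discarded; the partially
-// covered blocks at either end are left untouched.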
-
-void FileJournal::committed_thru(uint64_t seq)
-{
- Mutex::Locker locker(write_lock);
-
- if (seq < last_committed_seq) {
- dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl;
- assert(seq >= last_committed_seq);
- return;
- }
- if (seq == last_committed_seq) {
- dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl;
- return;
- }
-
- dout(5) << "committed_thru " << seq << " (last_committed_seq " << last_committed_seq << ")" << dendl;
- last_committed_seq = seq;
-
- // completions!
- {
- Mutex::Locker locker(finisher_lock);
- queue_completions_thru(seq);
- if (plug_journal_completions && seq >= header.start_seq) {
- dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
- plug_journal_completions = false;
- queue_completions_thru(journaled_seq);
- }
- }
-
- // adjust start pointer
- while (!journalq.empty() && journalq.front().first <= seq) {
- journalq.pop_front();
- }
-
- int64_t old_start = header.start;
- if (!journalq.empty()) {
- header.start = journalq.front().second;
- header.start_seq = journalq.front().first;
- } else {
- header.start = write_pos;
- header.start_seq = seq + 1;
- }
-
- if (discard) {
- dout(10) << __func__ << " will trim (" << old_start << ", " << header.start << ")" << dendl;
- if (old_start < header.start)
- do_discard(old_start, header.start - 1);
- else {
- do_discard(old_start, header.max_size - 1);
- do_discard(get_top(), header.start - 1);
- }
- }
-
- must_write_header = true;
- print_header(header);
-
- // committed but unjournaled items
- while (!writeq_empty() && peek_write().seq <= seq) {
- dout(15) << " dropping committed but unwritten seq " << peek_write().seq
- << " len " << peek_write().bl.length()
- << dendl;
- put_throttle(1, peek_write().orig_len);
- pop_write();
- }
-
- commit_cond.Signal();
-
- dout(10) << "committed_thru done" << dendl;
-}
-
-
-void FileJournal::put_throttle(uint64_t ops, uint64_t bytes)
-{
- uint64_t new_ops = throttle_ops.put(ops);
- uint64_t new_bytes = throttle_bytes.put(bytes);
- dout(5) << "put_throttle finished " << ops << " ops and "
- << bytes << " bytes, now "
- << new_ops << " ops and " << new_bytes << " bytes"
- << dendl;
-
- if (logger) {
- logger->inc(l_os_j_ops, ops);
- logger->inc(l_os_j_bytes, bytes);
- logger->set(l_os_jq_ops, new_ops);
- logger->set(l_os_jq_bytes, new_bytes);
- logger->set(l_os_jq_max_ops, throttle_ops.get_max());
- logger->set(l_os_jq_max_bytes, throttle_bytes.get_max());
- }
-}
-
-int FileJournal::make_writeable()
-{
- dout(10) << __func__ << dendl;
- int r = _open(true);
- if (r < 0)
- return r;
-
- if (read_pos > 0)
- write_pos = read_pos;
- else
- write_pos = get_top();
- read_pos = 0;
-
- must_write_header = true;
- start_writer();
- return 0;
-}
-
-void FileJournal::wrap_read_bl(
- off64_t pos,
- int64_t olen,
- bufferlist* bl,
- off64_t *out_pos
- ) const
-{
- while (olen > 0) {
- while (pos >= header.max_size)
- pos = pos + get_top() - header.max_size;
-
- int64_t len;
- if (pos + olen > header.max_size)
- len = header.max_size - pos; // partial
- else
- len = olen; // rest
-
- int64_t actual = ::lseek64(fd, pos, SEEK_SET);
- assert(actual == pos);
-
- bufferptr bp = buffer::create(len);
- int r = safe_read_exact(fd, bp.c_str(), len);
- if (r) {
- derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << len << " returned "
- << r << dendl;
- ceph_abort();
- }
- bl->push_back(bp);
- pos += len;
- olen -= len;
- }
- if (pos >= header.max_size)
- pos = pos + get_top() - header.max_size;
- if (out_pos)
- *out_pos = pos;
-}
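-
-// Illustrative walk-through (hypothetical numbers, not from the original
-// source): journal entries live in the ring [get_top(), header.max_size).
-// With get_top() == 4096 and header.max_size == 1048576, reading 8192 bytes
-// from pos == 1044480 is split by the loop above into
-//
-//   read [1044480, 1048576)  -> 4096 bytes (tail of the ring)
-//   pos  = 1048576 + 4096 - 1048576 = 4096  (wrap back to get_top())
-//   read [4096, 8192)        -> the remaining 4096 bytes
-//
-// and *out_pos is left at 8192, the position of the next byte to read.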
-
-bool FileJournal::read_entry(
- bufferlist &bl,
- uint64_t &next_seq,
- bool *corrupt)
-{
- if (corrupt)
- *corrupt = false;
- uint64_t seq = next_seq;
-
- if (!read_pos) {
- dout(2) << "read_entry -- not readable" << dendl;
- return false;
- }
-
- off64_t pos = read_pos;
- off64_t next_pos = pos;
- stringstream ss;
- read_entry_result result = do_read_entry(
- pos,
- &next_pos,
- &bl,
- &seq,
- &ss);
- if (result == SUCCESS) {
- journalq.push_back( pair<uint64_t,off64_t>(seq, pos));
- if (next_seq > seq) {
- return false;
- } else {
- read_pos = next_pos;
- next_seq = seq;
- if (seq > journaled_seq)
- journaled_seq = seq;
- return true;
- }
- }
-
- if (seq && seq < header.committed_up_to) {
- derr << "Unable to read past sequence " << seq
- << " but header indicates the journal has committed up through "
- << header.committed_up_to << ", journal is corrupt" << dendl;
- if (g_conf->journal_ignore_corruption) {
- if (corrupt)
- *corrupt = true;
- return false;
- } else {
- assert(0);
- }
- }
-
- dout(25) << ss.str() << dendl;
- dout(2) << "No further valid entries found, journal is most likely valid"
- << dendl;
- return false;
-}
-
-FileJournal::read_entry_result FileJournal::do_read_entry(
- off64_t init_pos,
- off64_t *next_pos,
- bufferlist *bl,
- uint64_t *seq,
- ostream *ss,
- entry_header_t *_h) const
-{
- off64_t cur_pos = init_pos;
- bufferlist _bl;
- if (!bl)
- bl = &_bl;
-
- // header
- entry_header_t *h;
- bufferlist hbl;
- off64_t _next_pos;
- wrap_read_bl(cur_pos, sizeof(*h), &hbl, &_next_pos);
- h = reinterpret_cast<entry_header_t *>(hbl.c_str());
-
- if (!h->check_magic(cur_pos, header.get_fsid64())) {
- dout(25) << "read_entry " << init_pos
- << " : bad header magic, end of journal" << dendl;
- if (ss)
- *ss << "bad header magic";
- if (next_pos)
- *next_pos = init_pos + (4<<10); // check 4k ahead
- return MAYBE_CORRUPT;
- }
- cur_pos = _next_pos;
-
- // pad + body + pad
- if (h->pre_pad)
- cur_pos += h->pre_pad;
-
- bl->clear();
- wrap_read_bl(cur_pos, h->len, bl, &cur_pos);
-
- if (h->post_pad)
- cur_pos += h->post_pad;
-
- // footer
- entry_header_t *f;
- bufferlist fbl;
- wrap_read_bl(cur_pos, sizeof(*f), &fbl, &cur_pos);
- f = reinterpret_cast<entry_header_t *>(fbl.c_str());
- if (memcmp(f, h, sizeof(*f))) {
- if (ss)
- *ss << "bad footer magic, partial entry";
- if (next_pos)
- *next_pos = cur_pos;
- return MAYBE_CORRUPT;
- }
-
- if ((header.flags & header_t::FLAG_CRC) || // if explicitly enabled (new journal)
- h->crc32c != 0) { // newer entry in old journal
- uint32_t actual_crc = bl->crc32c(0);
- if (actual_crc != h->crc32c) {
- if (ss)
- *ss << "header crc (" << h->crc32c
- << ") doesn't match body crc (" << actual_crc << ")";
- if (next_pos)
- *next_pos = cur_pos;
- return MAYBE_CORRUPT;
- }
- }
-
- // yay!
- dout(2) << "read_entry " << init_pos << " : seq " << h->seq
- << " " << h->len << " bytes"
- << dendl;
-
- // ok!
- if (seq)
- *seq = h->seq;
-
-
- if (next_pos)
- *next_pos = cur_pos;
-
- if (_h)
- *_h = *h;
-
- assert(cur_pos % header.alignment == 0);
- return SUCCESS;
-}
-
-void FileJournal::throttle()
-{
- if (throttle_ops.wait(g_conf->journal_queue_max_ops))
- dout(2) << "throttle: waited for ops" << dendl;
- if (throttle_bytes.wait(g_conf->journal_queue_max_bytes))
- dout(2) << "throttle: waited for bytes" << dendl;
-}
-
-void FileJournal::get_header(
- uint64_t wanted_seq,
- off64_t *_pos,
- entry_header_t *h)
-{
- off64_t pos = header.start;
- off64_t next_pos = pos;
- bufferlist bl;
- uint64_t seq = 0;
- dout(2) << __func__ << dendl;
- while (1) {
- bl.clear();
- pos = next_pos;
- read_entry_result result = do_read_entry(
- pos,
- &next_pos,
- &bl,
- &seq,
- 0,
- h);
- if (result == FAILURE || result == MAYBE_CORRUPT)
- assert(0);
- if (seq == wanted_seq) {
- if (_pos)
- *_pos = pos;
- return;
- }
- }
- assert(0); // not reachable
-}
-
-void FileJournal::corrupt(
- int wfd,
- off64_t corrupt_at)
-{
- dout(2) << __func__ << dendl;
- if (corrupt_at >= header.max_size)
- corrupt_at = corrupt_at + get_top() - header.max_size;
-
- int64_t actual = ::lseek64(fd, corrupt_at, SEEK_SET);
- assert(actual == corrupt_at);
-
- char buf[10];
- int r = safe_read_exact(fd, buf, 1);
- assert(r == 0);
-
- actual = ::lseek64(wfd, corrupt_at, SEEK_SET);
- assert(actual == corrupt_at);
-
- buf[0]++;
- r = safe_write(wfd, buf, 1);
- assert(r == 0);
-}
-
-void FileJournal::corrupt_payload(
- int wfd,
- uint64_t seq)
-{
- dout(2) << __func__ << dendl;
- off64_t pos = 0;
- entry_header_t h;
- get_header(seq, &pos, &h);
- off64_t corrupt_at =
- pos + sizeof(entry_header_t) + h.pre_pad;
- corrupt(wfd, corrupt_at);
-}
-
-
-void FileJournal::corrupt_footer_magic(
- int wfd,
- uint64_t seq)
-{
- dout(2) << __func__ << dendl;
- off64_t pos = 0;
- entry_header_t h;
- get_header(seq, &pos, &h);
- off64_t corrupt_at =
- pos + sizeof(entry_header_t) + h.pre_pad +
- h.len + h.post_pad +
- (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h));
- corrupt(wfd, corrupt_at);
-}
-
-
-void FileJournal::corrupt_header_magic(
- int wfd,
- uint64_t seq)
-{
- dout(2) << __func__ << dendl;
- off64_t pos = 0;
- entry_header_t h;
- get_header(seq, &pos, &h);
- off64_t corrupt_at =
- pos +
- (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h));
- corrupt(wfd, corrupt_at);
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_FILEJOURNAL_H
-#define CEPH_FILEJOURNAL_H
-
-#include <deque>
-using std::deque;
-
-#include "Journal.h"
-#include "common/Cond.h"
-#include "common/Mutex.h"
-#include "common/Thread.h"
-#include "common/Throttle.h"
-
-#ifdef HAVE_LIBAIO
-# include <libaio.h>
-#endif
-
-/**
- * Implements journaling on top of a block device or file.
- *
- * Lock ordering is write_lock > aio_lock > finisher_lock
- */
-class FileJournal : public Journal {
-public:
- /// Protected by finisher_lock
- struct completion_item {
- uint64_t seq;
- Context *finish;
- utime_t start;
- TrackedOpRef tracked_op;
- completion_item(uint64_t o, Context *c, utime_t s,
- TrackedOpRef opref)
- : seq(o), finish(c), start(s), tracked_op(opref) {}
- completion_item() : seq(0), finish(0), start(0) {}
- };
- struct write_item {
- uint64_t seq;
- bufferlist bl;
- uint32_t orig_len;
- TrackedOpRef tracked_op;
- write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) :
- seq(s), orig_len(ol), tracked_op(opref) {
- bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
- }
- write_item() : seq(0), orig_len(0) {}
- };
-
- Mutex finisher_lock;
- Cond finisher_cond;
- uint64_t journaled_seq;
- bool plug_journal_completions;
-
- Mutex writeq_lock;
- Cond writeq_cond;
- list<write_item> writeq;
- bool writeq_empty();
- write_item &peek_write();
- void pop_write();
- void batch_pop_write(list<write_item> &items);
- void batch_unpop_write(list<write_item> &items);
-
- Mutex completions_lock;
- list<completion_item> completions;
- bool completions_empty() {
- Mutex::Locker l(completions_lock);
- return completions.empty();
- }
- void batch_pop_completions(list<completion_item> &items) {
- Mutex::Locker l(completions_lock);
- completions.swap(items);
- }
- void batch_unpop_completions(list<completion_item> &items) {
- Mutex::Locker l(completions_lock);
- completions.splice(completions.begin(), items);
- }
- completion_item completion_peek_front() {
- Mutex::Locker l(completions_lock);
- assert(!completions.empty());
- return completions.front();
- }
- void completion_pop_front() {
- Mutex::Locker l(completions_lock);
- assert(!completions.empty());
- completions.pop_front();
- }
-
- int prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl);
-
- void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len,
- Context *oncommit,
- TrackedOpRef osd_op = TrackedOpRef());
- /// End protected by finisher_lock
-
- /*
- * journal header
- */
- struct header_t {
- enum {
- FLAG_CRC = (1<<0),
- // NOTE: remove kludgey weirdness in read_header() next time a flag is added.
- };
-
- uint64_t flags;
- uuid_d fsid;
- __u32 block_size;
- __u32 alignment;
- int64_t max_size; // max size of journal ring buffer
- int64_t start; // offset of first entry
- uint64_t committed_up_to; // committed up to
-
- /**
- * start_seq
- *
- * entry at header.start has sequence >= start_seq
- *
- * Generally, the entry at header.start will have sequence
- * start_seq if it exists. The only exception is immediately
- * after journal creation since the first sequence number is
- * not known.
- *
- * If the first read on open fails, we can assume corruption
-     * if start_seq > committed_up_to because the entry would have
-     * a sequence >= start_seq and therefore > committed_up_to.
- */
- uint64_t start_seq;
-
- header_t() :
- flags(0), block_size(0), alignment(0), max_size(0), start(0),
- committed_up_to(0), start_seq(0) {}
-
- void clear() {
- start = block_size;
- }
-
- uint64_t get_fsid64() const {
- return *(uint64_t*)fsid.bytes();
- }
-
- void encode(bufferlist& bl) const {
- __u32 v = 4;
- ::encode(v, bl);
- bufferlist em;
- {
- ::encode(flags, em);
- ::encode(fsid, em);
- ::encode(block_size, em);
- ::encode(alignment, em);
- ::encode(max_size, em);
- ::encode(start, em);
- ::encode(committed_up_to, em);
- ::encode(start_seq, em);
- }
- ::encode(em, bl);
- }
- void decode(bufferlist::iterator& bl) {
- __u32 v;
- ::decode(v, bl);
-      if (v < 2) { // normally 0, but conceivably 1
- // decode old header_t struct (pre v0.40).
- bl.advance(4); // skip __u32 flags (it was unused by any old code)
- flags = 0;
- uint64_t tfsid;
- ::decode(tfsid, bl);
- *(uint64_t*)&fsid.bytes()[0] = tfsid;
- *(uint64_t*)&fsid.bytes()[8] = tfsid;
- ::decode(block_size, bl);
- ::decode(alignment, bl);
- ::decode(max_size, bl);
- ::decode(start, bl);
- committed_up_to = 0;
- start_seq = 0;
- return;
- }
- bufferlist em;
- ::decode(em, bl);
- bufferlist::iterator t = em.begin();
- ::decode(flags, t);
- ::decode(fsid, t);
- ::decode(block_size, t);
- ::decode(alignment, t);
- ::decode(max_size, t);
- ::decode(start, t);
-
- if (v > 2)
- ::decode(committed_up_to, t);
- else
- committed_up_to = 0;
-
- if (v > 3)
- ::decode(start_seq, t);
- else
- start_seq = 0;
- }
- } header;
-
- struct entry_header_t {
- uint64_t seq; // fs op seq #
- uint32_t crc32c; // payload only. not header, pre_pad, post_pad, or footer.
- uint32_t len;
- uint32_t pre_pad, post_pad;
- uint64_t magic1;
- uint64_t magic2;
-
- static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) {
- return (fsid ^ seq ^ len);
- }
- bool check_magic(off64_t pos, uint64_t fsid) {
- return
- magic1 == (uint64_t)pos &&
- magic2 == (fsid ^ seq ^ len);
- }
- } __attribute__((__packed__, aligned(4)));
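-
-  // Illustrative sketch (not part of the original source): how the two magics
-  // tie an entry to its journal and to its position in the ring.  When an
-  // entry is written at offset pos, the writer conceptually fills in
-  //
-  //   h.magic1 = pos;
-  //   h.magic2 = entry_header_t::make_magic(h.seq, h.len, header.get_fsid64());
-  //
-  // and do_read_entry() calls check_magic(pos, fsid) on the header (the
-  // footer is then compared byte-for-byte against it), so a stale entry left
-  // over from a previous pass around the ring, or an entry from a different
-  // journal, fails the check and is treated as the end of the journal.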
-
- bool journalq_empty() { return journalq.empty(); }
-
-private:
- string fn;
-
- char *zero_buf;
- off64_t max_size;
- size_t block_size;
- bool directio, aio, force_aio;
- bool must_write_header;
- off64_t write_pos; // byte where the next entry to be written will go
-  off64_t read_pos;    // byte where the next entry will be read during replay (0 once writeable)
-  bool discard;        // for a block-device journal: whether the device supports discard
-
-#ifdef HAVE_LIBAIO
- /// state associated with an in-flight aio request
- /// Protected by aio_lock
- struct aio_info {
- struct iocb iocb;
- bufferlist bl;
- struct iovec *iov;
- bool done;
- uint64_t off, len; ///< these are for debug only
- uint64_t seq; ///< seq number to complete on aio completion, if non-zero
-
- aio_info(bufferlist& b, uint64_t o, uint64_t s)
- : iov(NULL), done(false), off(o), len(b.length()), seq(s) {
- bl.claim(b);
- memset((void*)&iocb, 0, sizeof(iocb));
- }
- ~aio_info() {
- delete[] iov;
- }
- };
- Mutex aio_lock;
- Cond aio_cond;
- Cond write_finish_cond;
- io_context_t aio_ctx;
- list<aio_info> aio_queue;
- int aio_num, aio_bytes;
- /// End protected by aio_lock
-#endif
-
- uint64_t last_committed_seq;
- uint64_t journaled_since_start;
-
- /*
-   * full states cycle at the beginning of each commit epoch, when commit_start()
- * is called.
- * FULL - we just filled up during this epoch.
- * WAIT - we filled up last epoch; now we have to wait until everything during
- * that epoch commits to the fs before we can start writing over it.
- * NOTFULL - all good, journal away.
- */
- enum {
- FULL_NOTFULL = 0,
- FULL_FULL = 1,
- FULL_WAIT = 2,
- } full_state;
-
- int fd;
-
- // in journal
- deque<pair<uint64_t, off64_t> > journalq; // track seq offsets, so we can trim later.
- uint64_t writing_seq;
-
-
- // throttle
- Throttle throttle_ops, throttle_bytes;
-
- void put_throttle(uint64_t ops, uint64_t bytes);
-
- // write thread
- Mutex write_lock;
- bool write_stop;
- bool aio_stop;
-
- Cond commit_cond;
-
- int _open(bool wr, bool create=false);
- int _open_block_device();
- void _close(int fd) const;
- void _check_disk_write_cache() const;
- int _open_file(int64_t oldsize, blksize_t blksize, bool create);
- int _dump(ostream& out, bool simple);
- void print_header(const header_t &hdr) const;
- int read_header(header_t *hdr) const;
- bufferptr prepare_header();
- void start_writer();
- void stop_writer();
- void write_thread_entry();
-
- void queue_completions_thru(uint64_t seq);
-
- int check_for_full(uint64_t seq, off64_t pos, off64_t size);
-  int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes);
- int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos,
- uint64_t& orig_ops, uint64_t& orig_bytes);
- void do_write(bufferlist& bl);
-
- void write_finish_thread_entry();
- void check_aio_completion();
- void do_aio_write(bufferlist& bl);
- int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq);
-
-
- void align_bl(off64_t pos, bufferlist& bl);
- int write_bl(off64_t& pos, bufferlist& bl);
-
-  /// read len bytes from the journal starting at in_pos, wrapping past the end of the ring as needed
- void wrap_read_bl(
- off64_t in_pos, ///< [in] start position
- int64_t len, ///< [in] length to read
- bufferlist* bl, ///< [out] result
- off64_t *out_pos ///< [out] next position to read, will be wrapped
- ) const;
-
- void do_discard(int64_t offset, int64_t end);
-
- class Writer : public Thread {
- FileJournal *journal;
- public:
- Writer(FileJournal *fj) : journal(fj) {}
- void *entry() {
- journal->write_thread_entry();
- return 0;
- }
- } write_thread;
-
- class WriteFinisher : public Thread {
- FileJournal *journal;
- public:
- WriteFinisher(FileJournal *fj) : journal(fj) {}
- void *entry() {
- journal->write_finish_thread_entry();
- return 0;
- }
- } write_finish_thread;
-
- off64_t get_top() const {
- return ROUND_UP_TO(sizeof(header), block_size);
- }
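-
-  // Illustrative on-disk layout (hypothetical sizes, assuming
-  // block_size == 4096):
-  //
-  //   [0, get_top())                journal header block
-  //   [get_top(), header.max_size)  ring buffer of entries
-  //
-  // write_pos and header.start always point into the ring and wrap back to
-  // get_top() whenever they reach header.max_size.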
-
- public:
- FileJournal(uuid_d fsid, Finisher *fin, Cond *sync_cond, const char *f, bool dio=false, bool ai=true, bool faio=false) :
- Journal(fsid, fin, sync_cond),
- finisher_lock("FileJournal::finisher_lock", false, true, false, g_ceph_context),
- journaled_seq(0),
- plug_journal_completions(false),
- writeq_lock("FileJournal::writeq_lock", false, true, false, g_ceph_context),
- completions_lock(
- "FileJournal::completions_lock", false, true, false, g_ceph_context),
- fn(f),
- zero_buf(NULL),
- max_size(0), block_size(0),
- directio(dio), aio(ai), force_aio(faio),
- must_write_header(false),
- write_pos(0), read_pos(0),
- discard(false),
-#ifdef HAVE_LIBAIO
- aio_lock("FileJournal::aio_lock"),
- aio_ctx(0),
- aio_num(0), aio_bytes(0),
-#endif
- last_committed_seq(0),
- journaled_since_start(0),
- full_state(FULL_NOTFULL),
- fd(-1),
- writing_seq(0),
- throttle_ops(g_ceph_context, "journal_ops", g_conf->journal_queue_max_ops),
- throttle_bytes(g_ceph_context, "journal_bytes", g_conf->journal_queue_max_bytes),
- write_lock("FileJournal::write_lock", false, true, false, g_ceph_context),
- write_stop(true),
- aio_stop(true),
- write_thread(this),
- write_finish_thread(this) {
-
- if (aio && !directio) {
- derr << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl;
- aio = false;
- }
-#ifndef HAVE_LIBAIO
- if (aio) {
- derr << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl;
- aio = false;
- }
-#endif
- }
- ~FileJournal() {
- assert(fd == -1);
- delete[] zero_buf;
- }
-
- int check();
- int create();
- int open(uint64_t fs_op_seq);
- void close();
- int peek_fsid(uuid_d& fsid);
-
- int dump(ostream& out);
- int simple_dump(ostream& out);
- int _fdump(Formatter &f, bool simple);
-
- void flush();
-
- void throttle();
-
- bool is_writeable() {
- return read_pos == 0;
- }
- int make_writeable();
-
- // writes
- void commit_start(uint64_t seq);
- void committed_thru(uint64_t seq);
- bool should_commit_now() {
- return full_state != FULL_NOTFULL && !write_stop;
- }
-
- void write_header_sync();
-
- void set_wait_on_full(bool b) { wait_on_full = b; }
-
- // reads
-
- /// Result code for read_entry
- enum read_entry_result {
- SUCCESS,
- FAILURE,
- MAYBE_CORRUPT
- };
-
- /**
- * read_entry
- *
- * Reads next entry starting at pos. If the entry appears
- * clean, *bl will contain the payload, *seq will contain
-   * the sequence number, and *next_pos will reflect the next
- * read position. If the entry is invalid *ss will contain
- * debug text, while *seq, *out_pos, and *bl will be unchanged.
- *
- * If the entry suggests a corrupt log, *ss will contain debug
-   * text and *next_pos will contain the next position to check. If
- * we find an entry in this way that returns SUCCESS, the journal
- * is most likely corrupt.
- */
- read_entry_result do_read_entry(
- off64_t pos, ///< [in] position to read
- off64_t *next_pos, ///< [out] next position to read
- bufferlist* bl, ///< [out] payload for successful read
- uint64_t *seq, ///< [out] seq of successful read
- ostream *ss, ///< [out] error output
- entry_header_t *h = 0 ///< [out] header
- ) const; ///< @return result code
-
- bool read_entry(
- bufferlist &bl,
- uint64_t &last_seq,
- bool *corrupt
- );
-
- bool read_entry(
- bufferlist &bl,
- uint64_t &last_seq) {
- return read_entry(bl, last_seq, 0);
- }
-
- // Debug/Testing
- void get_header(
- uint64_t wanted_seq,
- off64_t *_pos,
- entry_header_t *h);
- void corrupt(
- int wfd,
- off64_t corrupt_at);
- void corrupt_payload(
- int wfd,
- uint64_t seq);
- void corrupt_footer_magic(
- int wfd,
- uint64_t seq);
- void corrupt_header_magic(
- int wfd,
- uint64_t seq);
-};
-
-WRITE_CLASS_ENCODER(FileJournal::header_t)
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-#include "include/int_types.h"
-
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/file.h>
-#include <errno.h>
-#include <dirent.h>
-#include <sys/ioctl.h>
-
-#if defined(__linux__)
-#include <linux/fs.h>
-#endif
-
-#include <iostream>
-#include <map>
-
-#include "include/compat.h"
-#include "include/linux_fiemap.h"
-
-#include "common/xattr.h"
-#include "chain_xattr.h"
-
-#if defined(DARWIN) || defined(__FreeBSD__)
-#include <sys/param.h>
-#include <sys/mount.h>
-#endif // DARWIN
-
-
-#include <fstream>
-#include <sstream>
-
-#include "FileStore.h"
-#include "GenericFileStoreBackend.h"
-#include "BtrfsFileStoreBackend.h"
-#include "XfsFileStoreBackend.h"
-#include "ZFSFileStoreBackend.h"
-#include "common/BackTrace.h"
-#include "include/types.h"
-#include "FileJournal.h"
-
-#include "osd/osd_types.h"
-#include "include/color.h"
-#include "include/buffer.h"
-
-#include "common/Timer.h"
-#include "common/debug.h"
-#include "common/errno.h"
-#include "common/run_cmd.h"
-#include "common/safe_io.h"
-#include "common/perf_counters.h"
-#include "common/sync_filesystem.h"
-#include "common/fd.h"
-#include "HashIndex.h"
-#include "DBObjectMap.h"
-#include "kv/KeyValueDB.h"
-
-#include "common/ceph_crypto.h"
-using ceph::crypto::SHA1;
-
-#include "include/assert.h"
-
-#include "common/config.h"
-#include "common/blkdev.h"
-
-#ifdef WITH_LTTNG
-#define TRACEPOINT_DEFINE
-#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
-#include "tracing/objectstore.h"
-#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
-#undef TRACEPOINT_DEFINE
-#else
-#define tracepoint(...)
-#endif
-
-#define dout_subsys ceph_subsys_filestore
-#undef dout_prefix
-#define dout_prefix *_dout << "filestore(" << basedir << ") "
-
-#define COMMIT_SNAP_ITEM "snap_%llu"
-#define CLUSTER_SNAP_ITEM "clustersnap_%s"
-
-#define REPLAY_GUARD_XATTR "user.cephos.seq"
-#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
-
-// The XATTR_SPILL_OUT_NAME xattr indicates whether an object's xattrs spill
-// over into DBObjectMap: if it exists in the file's xattrs with the value
-// XATTR_NO_SPILL_OUT, no xattrs are stored in DBObjectMap.
-#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out"
-#define XATTR_NO_SPILL_OUT "0"
-#define XATTR_SPILL_OUT "1"
-
-//Initial features in new superblock.
-static CompatSet get_fs_initial_compat_set() {
- CompatSet::FeatureSet ceph_osd_feature_compat;
- CompatSet::FeatureSet ceph_osd_feature_ro_compat;
- CompatSet::FeatureSet ceph_osd_feature_incompat;
- return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
- ceph_osd_feature_incompat);
-}
-
-// Features that this FileStore supports are added here.
-static CompatSet get_fs_supported_compat_set() {
- CompatSet compat = get_fs_initial_compat_set();
- //Any features here can be set in code, but not in initial superblock
- compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
- return compat;
-}
-
-int FileStore::get_block_device_fsid(const string& path, uuid_d *fsid)
-{
- // make sure we don't try to use aio or direct_io (and get annoying
- // error messages from failing to do so); performance implications
- // should be irrelevant for this use
- FileJournal j(*fsid, 0, 0, path.c_str(), false, false);
- return j.peek_fsid(*fsid);
-}
-
-void FileStore::FSPerfTracker::update_from_perfcounters(
- PerfCounters &logger)
-{
- os_commit_latency.consume_next(
- logger.get_tavg_ms(
- l_os_j_lat));
- os_apply_latency.consume_next(
- logger.get_tavg_ms(
- l_os_apply_lat));
-}
-
-
-ostream& operator<<(ostream& out, const FileStore::OpSequencer& s)
-{
- assert(&out);
- return out << *s.parent;
-}
-
-int FileStore::get_cdir(coll_t cid, char *s, int len)
-{
- const string &cid_str(cid.to_str());
- return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_str.c_str());
-}
-
-int FileStore::get_index(coll_t cid, Index *index)
-{
- int r = index_manager.get_index(cid, basedir, index);
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
-}
-
-int FileStore::init_index(coll_t cid)
-{
- char path[PATH_MAX];
- get_cdir(cid, path, sizeof(path));
- int r = index_manager.init_index(cid, path, target_version);
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
-}
-
-int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path)
-{
- IndexedPath path2;
- if (!path)
- path = &path2;
- int r, exist;
- assert(NULL != index.index);
- r = (index.index)->lookup(oid, path, &exist);
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- if (!exist)
- return -ENOENT;
- return 0;
-}
-
-int FileStore::lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length)
-{
- FDRef fd;
- int r = lfn_open(cid, oid, false, &fd);
- if (r < 0)
- return r;
- r = ::ftruncate(**fd, length);
- if (r < 0)
- r = -errno;
- if (r >= 0 && m_filestore_sloppy_crc) {
- int rc = backend->_crc_update_truncate(**fd, length);
- assert(rc >= 0);
- }
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
-}
-
-int FileStore::lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf)
-{
- IndexedPath path;
- Index index;
- int r = get_index(cid, &index);
- if (r < 0)
- return r;
-
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
-
- r = lfn_find(oid, index, &path);
- if (r < 0)
- return r;
- r = ::stat(path->path(), buf);
- if (r < 0)
- r = -errno;
- return r;
-}
-
-int FileStore::lfn_open(coll_t cid,
- const ghobject_t& oid,
- bool create,
- FDRef *outfd,
- Index *index)
-{
- assert(outfd);
- int r = 0;
- bool need_lock = true;
- int flags = O_RDWR;
-
- if (create)
- flags |= O_CREAT;
-
- Index index2;
- if (!index) {
- index = &index2;
- }
- if (!((*index).index)) {
- r = get_index(cid, index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- return r;
- }
- } else {
- need_lock = false;
- }
-
- int fd, exist;
- assert(NULL != (*index).index);
- if (need_lock) {
- ((*index).index)->access_lock.get_write();
- }
- if (!replaying) {
- *outfd = fdcache.lookup(oid);
- if (*outfd) {
- if (need_lock) {
- ((*index).index)->access_lock.put_write();
- }
- return 0;
- }
- }
-
-
- IndexedPath path2;
- IndexedPath *path = &path2;
-
- r = (*index)->lookup(oid, path, &exist);
- if (r < 0) {
- derr << "could not find " << oid << " in index: "
- << cpp_strerror(-r) << dendl;
- goto fail;
- }
-
- r = ::open((*path)->path(), flags, 0644);
- if (r < 0) {
- r = -errno;
- dout(10) << "error opening file " << (*path)->path() << " with flags="
- << flags << ": " << cpp_strerror(-r) << dendl;
- goto fail;
- }
- fd = r;
- if (create && (!exist)) {
- r = (*index)->created(oid, (*path)->path());
- if (r < 0) {
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- derr << "error creating " << oid << " (" << (*path)->path()
- << ") in index: " << cpp_strerror(-r) << dendl;
- goto fail;
- }
- r = chain_fsetxattr(fd, XATTR_SPILL_OUT_NAME,
- XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT), true);
- if (r < 0) {
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path()
- << "):" << cpp_strerror(-r) << dendl;
- goto fail;
- }
- }
-
- if (!replaying) {
- bool existed;
- *outfd = fdcache.add(oid, fd, &existed);
- if (existed) {
- TEMP_FAILURE_RETRY(::close(fd));
- }
- } else {
- *outfd = FDRef(new FDCache::FD(fd));
- }
-
- if (need_lock) {
- ((*index).index)->access_lock.put_write();
- }
-
- return 0;
-
- fail:
-
- if (need_lock) {
- ((*index).index)->access_lock.put_write();
- }
-
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
-}
-
-void FileStore::lfn_close(FDRef fd)
-{
-}
-
-int FileStore::lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid)
-{
- Index index_new, index_old;
- IndexedPath path_new, path_old;
- int exist;
- int r;
- bool index_same = false;
- if (c < newcid) {
- r = get_index(newcid, &index_new);
- if (r < 0)
- return r;
- r = get_index(c, &index_old);
- if (r < 0)
- return r;
- } else if (c == newcid) {
- r = get_index(c, &index_old);
- if (r < 0)
- return r;
- index_new = index_old;
- index_same = true;
- } else {
- r = get_index(c, &index_old);
- if (r < 0)
- return r;
- r = get_index(newcid, &index_new);
- if (r < 0)
- return r;
- }
-
- assert(NULL != index_old.index);
- assert(NULL != index_new.index);
-
- if (!index_same) {
-
- RWLock::RLocker l1((index_old.index)->access_lock);
-
- r = index_old->lookup(o, &path_old, &exist);
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- if (!exist)
- return -ENOENT;
-
- RWLock::WLocker l2((index_new.index)->access_lock);
-
- r = index_new->lookup(newoid, &path_new, &exist);
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- if (exist)
- return -EEXIST;
-
- dout(25) << "lfn_link path_old: " << path_old << dendl;
- dout(25) << "lfn_link path_new: " << path_new << dendl;
- r = ::link(path_old->path(), path_new->path());
- if (r < 0)
- return -errno;
-
- r = index_new->created(newoid, path_new->path());
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- } else {
- RWLock::WLocker l1((index_old.index)->access_lock);
-
- r = index_old->lookup(o, &path_old, &exist);
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- if (!exist)
- return -ENOENT;
-
- r = index_new->lookup(newoid, &path_new, &exist);
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- if (exist)
- return -EEXIST;
-
- dout(25) << "lfn_link path_old: " << path_old << dendl;
- dout(25) << "lfn_link path_new: " << path_new << dendl;
- r = ::link(path_old->path(), path_new->path());
- if (r < 0)
- return -errno;
-
- // make sure old fd for unlinked/overwritten file is gone
- fdcache.clear(newoid);
-
- r = index_new->created(newoid, path_new->path());
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- }
- return 0;
-}
-
-int FileStore::lfn_unlink(coll_t cid, const ghobject_t& o,
- const SequencerPosition &spos,
- bool force_clear_omap)
-{
- Index index;
- int r = get_index(cid, &index);
- if (r < 0) {
- dout(25) << __func__ << " get_index failed " << cpp_strerror(r) << dendl;
- return r;
- }
-
- assert(NULL != index.index);
- RWLock::WLocker l((index.index)->access_lock);
-
- {
- IndexedPath path;
- int hardlink;
- r = index->lookup(o, &path, &hardlink);
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
-
- if (!force_clear_omap) {
- if (hardlink == 0) {
- wbthrottle.clear_object(o); // should be only non-cache ref
- fdcache.clear(o);
- return 0;
- } else if (hardlink == 1) {
- force_clear_omap = true;
- }
- }
- if (force_clear_omap) {
- dout(20) << __func__ << ": clearing omap on " << o
- << " in cid " << cid << dendl;
- r = object_map->clear(o, &spos);
- if (r < 0 && r != -ENOENT) {
- dout(25) << __func__ << " omap clear failed " << cpp_strerror(r) << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- if (g_conf->filestore_debug_inject_read_err) {
- debug_obj_on_delete(o);
- }
- wbthrottle.clear_object(o); // should be only non-cache ref
- fdcache.clear(o);
- } else {
- /* Ensure that replay of this op doesn't result in the object_map
- * going away.
- */
- if (!backend->can_checkpoint())
- object_map->sync(&o, &spos);
- }
- }
- r = index->unlink(o);
- if (r < 0) {
- dout(25) << __func__ << " index unlink failed " << cpp_strerror(r) << dendl;
- return r;
- }
- return 0;
-}
-
-FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbits_t flags, const char *name, bool do_update) :
- JournalingObjectStore(base),
- internal_name(name),
- basedir(base), journalpath(jdev),
- generic_flags(flags),
- blk_size(0),
- fsid_fd(-1), op_fd(-1),
- basedir_fd(-1), current_fd(-1),
- backend(NULL),
- index_manager(do_update),
- lock("FileStore::lock"),
- force_sync(false),
- sync_entry_timeo_lock("sync_entry_timeo_lock"),
- timer(g_ceph_context, sync_entry_timeo_lock),
- stop(false), sync_thread(this),
- fdcache(g_ceph_context),
- wbthrottle(g_ceph_context),
- next_osr_id(0),
-  throttle_ops(g_ceph_context, "filestore_ops", g_conf->filestore_queue_max_ops),
-  throttle_bytes(g_ceph_context, "filestore_bytes", g_conf->filestore_queue_max_bytes),
- m_ondisk_finisher_num(g_conf->filestore_ondisk_finisher_threads),
- m_apply_finisher_num(g_conf->filestore_apply_finisher_threads),
- op_tp(g_ceph_context, "FileStore::op_tp", g_conf->filestore_op_threads, "filestore_op_threads"),
- op_wq(this, g_conf->filestore_op_thread_timeout,
- g_conf->filestore_op_thread_suicide_timeout, &op_tp),
- logger(NULL),
- read_error_lock("FileStore::read_error_lock"),
- m_filestore_commit_timeout(g_conf->filestore_commit_timeout),
- m_filestore_journal_parallel(g_conf->filestore_journal_parallel ),
- m_filestore_journal_trailing(g_conf->filestore_journal_trailing),
- m_filestore_journal_writeahead(g_conf->filestore_journal_writeahead),
- m_filestore_fiemap_threshold(g_conf->filestore_fiemap_threshold),
- m_filestore_max_sync_interval(g_conf->filestore_max_sync_interval),
- m_filestore_min_sync_interval(g_conf->filestore_min_sync_interval),
- m_filestore_fail_eio(g_conf->filestore_fail_eio),
- m_filestore_fadvise(g_conf->filestore_fadvise),
- do_update(do_update),
- m_journal_dio(g_conf->journal_dio),
- m_journal_aio(g_conf->journal_aio),
- m_journal_force_aio(g_conf->journal_force_aio),
- m_osd_rollback_to_cluster_snap(g_conf->osd_rollback_to_cluster_snap),
- m_osd_use_stale_snap(g_conf->osd_use_stale_snap),
- m_filestore_queue_max_ops(g_conf->filestore_queue_max_ops),
- m_filestore_queue_max_bytes(g_conf->filestore_queue_max_bytes),
- m_filestore_queue_committing_max_ops(g_conf->filestore_queue_committing_max_ops),
- m_filestore_queue_committing_max_bytes(g_conf->filestore_queue_committing_max_bytes),
- m_filestore_do_dump(false),
- m_filestore_dump_fmt(true),
- m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc),
- m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size),
- m_filestore_max_alloc_hint_size(g_conf->filestore_max_alloc_hint_size),
- m_fs_type(0),
- m_filestore_max_inline_xattr_size(0),
- m_filestore_max_inline_xattrs(0)
-{
- m_filestore_kill_at.set(g_conf->filestore_kill_at);
- for (int i = 0; i < m_ondisk_finisher_num; ++i) {
- ostringstream oss;
- oss << "filestore-ondisk-" << i;
- Finisher *f = new Finisher(g_ceph_context, oss.str());
- ondisk_finishers.push_back(f);
- }
- for (int i = 0; i < m_apply_finisher_num; ++i) {
- ostringstream oss;
- oss << "filestore-apply-" << i;
- Finisher *f = new Finisher(g_ceph_context, oss.str());
- apply_finishers.push_back(f);
- }
-
- ostringstream oss;
- oss << basedir << "/current";
- current_fn = oss.str();
-
- ostringstream sss;
- sss << basedir << "/current/commit_op_seq";
- current_op_seq_fn = sss.str();
-
- ostringstream omss;
- omss << basedir << "/current/omap";
- omap_dir = omss.str();
-
- // initialize logger
- PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_first, l_os_last);
-
- plb.add_u64(l_os_jq_max_ops, "journal_queue_max_ops", "Max operations in journal queue");
- plb.add_u64(l_os_jq_ops, "journal_queue_ops", "Operations in journal queue");
- plb.add_u64_counter(l_os_j_ops, "journal_ops", "Total journal entries written");
- plb.add_u64(l_os_jq_max_bytes, "journal_queue_max_bytes", "Max data in journal queue");
- plb.add_u64(l_os_jq_bytes, "journal_queue_bytes", "Size of journal queue");
- plb.add_u64_counter(l_os_j_bytes, "journal_bytes", "Total operations size in journal");
- plb.add_time_avg(l_os_j_lat, "journal_latency", "Average journal queue completing latency");
- plb.add_u64_counter(l_os_j_wr, "journal_wr", "Journal write IOs");
- plb.add_u64_avg(l_os_j_wr_bytes, "journal_wr_bytes", "Journal data written");
- plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue");
- plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations in writing to FS queue");
- plb.add_u64_counter(l_os_ops, "ops", "Operations written to store");
- plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue");
- plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of writing to FS queue");
- plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store");
- plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency");
- plb.add_u64(l_os_committing, "committing", "Is currently committing");
-
- plb.add_u64_counter(l_os_commit, "commitcycle", "Commit cycles");
- plb.add_time_avg(l_os_commit_len, "commitcycle_interval", "Average interval between commits");
- plb.add_time_avg(l_os_commit_lat, "commitcycle_latency", "Average latency of commit");
- plb.add_u64_counter(l_os_j_full, "journal_full", "Journal writes while full");
- plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency");
-
- logger = plb.create_perf_counters();
-
- g_ceph_context->get_perfcounters_collection()->add(logger);
- g_ceph_context->_conf->add_observer(this);
-
- superblock.compat_features = get_fs_initial_compat_set();
-}
-
-FileStore::~FileStore()
-{
- for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
- delete *it;
- *it = NULL;
- }
- for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
- delete *it;
- *it = NULL;
- }
- g_ceph_context->_conf->remove_observer(this);
- g_ceph_context->get_perfcounters_collection()->remove(logger);
-
- if (journal)
- journal->logger = NULL;
- delete logger;
-
- if (m_filestore_do_dump) {
- dump_stop();
- }
-}
-
-static void get_attrname(const char *name, char *buf, int len)
-{
- snprintf(buf, len, "user.ceph.%s", name);
-}
-
-bool parse_attrname(char **name)
-{
- if (strncmp(*name, "user.ceph.", 10) == 0) {
- *name += 10;
- return true;
- }
- return false;
-}
-
-void FileStore::collect_metadata(map<string,string> *pm)
-{
- char partition_path[PATH_MAX];
- char dev_node[PATH_MAX];
- int rc = 0;
-
- (*pm)["filestore_backend"] = backend->get_name();
- ostringstream ss;
- ss << "0x" << std::hex << m_fs_type << std::dec;
- (*pm)["filestore_f_type"] = ss.str();
-
- if (g_conf->filestore_collect_device_partition_information) {
- rc = get_device_by_uuid(get_fsid(), "PARTUUID", partition_path,
- dev_node);
- } else {
- rc = -EINVAL;
- }
-
- switch (rc) {
- case -EOPNOTSUPP:
- case -EINVAL:
- (*pm)["backend_filestore_partition_path"] = "unknown";
- (*pm)["backend_filestore_dev_node"] = "unknown";
- break;
- case -ENODEV:
- (*pm)["backend_filestore_partition_path"] = string(partition_path);
- (*pm)["backend_filestore_dev_node"] = "unknown";
- break;
- default:
- (*pm)["backend_filestore_partition_path"] = string(partition_path);
- (*pm)["backend_filestore_dev_node"] = string(dev_node);
- }
-}
-
-int FileStore::statfs(struct statfs *buf)
-{
- if (::statfs(basedir.c_str(), buf) < 0) {
- int r = -errno;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- return 0;
-}
-
-
-void FileStore::new_journal()
-{
- if (journalpath.length()) {
- dout(10) << "open_journal at " << journalpath << dendl;
- journal = new FileJournal(fsid, &finisher, &sync_cond, journalpath.c_str(),
- m_journal_dio, m_journal_aio, m_journal_force_aio);
- if (journal)
- journal->logger = logger;
- }
- return;
-}
-
-int FileStore::dump_journal(ostream& out)
-{
- int r;
-
- if (!journalpath.length())
- return -EINVAL;
-
- FileJournal *journal = new FileJournal(fsid, &finisher, &sync_cond, journalpath.c_str(), m_journal_dio);
- r = journal->dump(out);
- delete journal;
- return r;
-}
-
-FileStoreBackend *FileStoreBackend::create(long f_type, FileStore *fs)
-{
- switch (f_type) {
-#if defined(__linux__)
- case BTRFS_SUPER_MAGIC:
- return new BtrfsFileStoreBackend(fs);
-# ifdef HAVE_LIBXFS
- case XFS_SUPER_MAGIC:
- return new XfsFileStoreBackend(fs);
-# endif
-#endif
-#ifdef HAVE_LIBZFS
- case ZFS_SUPER_MAGIC:
- return new ZFSFileStoreBackend(fs);
-#endif
- default:
- return new GenericFileStoreBackend(fs);
- }
-}
-
-void FileStore::create_backend(long f_type)
-{
- m_fs_type = f_type;
-
- assert(backend == NULL);
- backend = FileStoreBackend::create(f_type, this);
-
- dout(0) << "backend " << backend->get_name()
- << " (magic 0x" << std::hex << f_type << std::dec << ")"
- << dendl;
-
- switch (f_type) {
-#if defined(__linux__)
- case BTRFS_SUPER_MAGIC:
- wbthrottle.set_fs(WBThrottle::BTRFS);
- break;
-
- case XFS_SUPER_MAGIC:
- // wbthrottle is constructed with fs(WBThrottle::XFS)
- break;
-#endif
- }
-
- set_xattr_limits_via_conf();
-}
-
-int FileStore::mkfs()
-{
- int ret = 0;
- char fsid_fn[PATH_MAX];
- uuid_d old_fsid;
-
- dout(1) << "mkfs in " << basedir << dendl;
- basedir_fd = ::open(basedir.c_str(), O_RDONLY);
- if (basedir_fd < 0) {
- ret = -errno;
- derr << "mkfs failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- // open+lock fsid
- snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str());
- fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT, 0644);
- if (fsid_fd < 0) {
- ret = -errno;
- derr << "mkfs: failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl;
- goto close_basedir_fd;
- }
-
- if (lock_fsid() < 0) {
- ret = -EBUSY;
- goto close_fsid_fd;
- }
-
- if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) {
- if (fsid.is_zero()) {
- fsid.generate_random();
- dout(1) << "mkfs generated fsid " << fsid << dendl;
- } else {
- dout(1) << "mkfs using provided fsid " << fsid << dendl;
- }
-
- char fsid_str[40];
- fsid.print(fsid_str);
- strcat(fsid_str, "\n");
- ret = ::ftruncate(fsid_fd, 0);
- if (ret < 0) {
- ret = -errno;
- derr << "mkfs: failed to truncate fsid: "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
- ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str));
- if (ret < 0) {
- derr << "mkfs: failed to write fsid: "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
- if (::fsync(fsid_fd) < 0) {
-      ret = -errno;
-      derr << "mkfs: fsync failed: can't write fsid: "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
- dout(10) << "mkfs fsid is " << fsid << dendl;
- } else {
- if (!fsid.is_zero() && fsid != old_fsid) {
- derr << "mkfs on-disk fsid " << old_fsid << " != provided " << fsid << dendl;
- ret = -EINVAL;
- goto close_fsid_fd;
- }
- fsid = old_fsid;
- dout(1) << "mkfs fsid is already set to " << fsid << dendl;
- }
-
- // version stamp
- ret = write_version_stamp();
- if (ret < 0) {
- derr << "mkfs: write_version_stamp() failed: "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
-
- // superblock
- superblock.omap_backend = g_conf->filestore_omap_backend;
- ret = write_superblock();
- if (ret < 0) {
- derr << "mkfs: write_superblock() failed: "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
-
- struct statfs basefs;
- ret = ::fstatfs(basedir_fd, &basefs);
- if (ret < 0) {
- ret = -errno;
- derr << "mkfs cannot fstatfs basedir "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
-
- create_backend(basefs.f_type);
-
- ret = backend->create_current();
- if (ret < 0) {
- derr << "mkfs: failed to create current/ " << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
-
- // write initial op_seq
- {
- uint64_t initial_seq = 0;
- int fd = read_op_seq(&initial_seq);
- if (fd < 0) {
- derr << "mkfs: failed to create " << current_op_seq_fn << ": "
- << cpp_strerror(fd) << dendl;
- goto close_fsid_fd;
- }
- if (initial_seq == 0) {
- int err = write_op_seq(fd, 1);
- if (err < 0) {
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- derr << "mkfs: failed to write to " << current_op_seq_fn << ": "
- << cpp_strerror(err) << dendl;
- goto close_fsid_fd;
- }
-
- if (backend->can_checkpoint()) {
- // create snap_1 too
- current_fd = ::open(current_fn.c_str(), O_RDONLY);
- assert(current_fd >= 0);
- char s[NAME_MAX];
- snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull);
- ret = backend->create_checkpoint(s, NULL);
- VOID_TEMP_FAILURE_RETRY(::close(current_fd));
- if (ret < 0 && ret != -EEXIST) {
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- derr << "mkfs: failed to create snap_1: " << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- }
- }
- }
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- }
- ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir);
- if (ret < 0) {
- derr << "mkfs failed to create " << g_conf->filestore_omap_backend << dendl;
- ret = -1;
- goto close_fsid_fd;
- }
- dout(1) << g_conf->filestore_omap_backend << " db exists/created" << dendl;
-
- // journal?
- ret = mkjournal();
- if (ret)
- goto close_fsid_fd;
-
- ret = write_meta("type", "filestore");
- if (ret)
- goto close_fsid_fd;
-
- dout(1) << "mkfs done in " << basedir << dendl;
- ret = 0;
-
- close_fsid_fd:
- VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
- fsid_fd = -1;
- close_basedir_fd:
- VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
- delete backend;
- backend = NULL;
- return ret;
-}
-
-int FileStore::mkjournal()
-{
- // read fsid
- int ret;
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
- int fd = ::open(fn, O_RDONLY, 0644);
- if (fd < 0) {
- int err = errno;
- derr << "FileStore::mkjournal: open error: " << cpp_strerror(err) << dendl;
- return -err;
- }
- ret = read_fsid(fd, &fsid);
- if (ret < 0) {
- derr << "FileStore::mkjournal: read error: " << cpp_strerror(ret) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return ret;
- }
- VOID_TEMP_FAILURE_RETRY(::close(fd));
-
- ret = 0;
-
- new_journal();
- if (journal) {
- ret = journal->check();
- if (ret < 0) {
- ret = journal->create();
- if (ret)
- derr << "mkjournal error creating journal on " << journalpath
- << ": " << cpp_strerror(ret) << dendl;
- else
- dout(0) << "mkjournal created journal on " << journalpath << dendl;
- }
- delete journal;
- journal = 0;
- }
- return ret;
-}
-
-int FileStore::read_fsid(int fd, uuid_d *uuid)
-{
- char fsid_str[40];
- int ret = safe_read(fd, fsid_str, sizeof(fsid_str));
- if (ret < 0)
- return ret;
- if (ret == 8) {
- // old 64-bit fsid... mirror it.
- *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str;
- *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str;
- return 0;
- }
-
- if (ret > 36)
- fsid_str[36] = 0;
- if (!uuid->parse(fsid_str))
- return -EINVAL;
- return 0;
-}
-
-int FileStore::lock_fsid()
-{
- struct flock l;
- memset(&l, 0, sizeof(l));
- l.l_type = F_WRLCK;
- l.l_whence = SEEK_SET;
- l.l_start = 0;
- l.l_len = 0;
- int r = ::fcntl(fsid_fd, F_SETLK, &l);
- if (r < 0) {
- int err = errno;
- dout(0) << "lock_fsid failed to lock " << basedir << "/fsid, is another ceph-osd still running? "
- << cpp_strerror(err) << dendl;
- return -err;
- }
- return 0;
-}
-
-bool FileStore::test_mount_in_use()
-{
- dout(5) << "test_mount basedir " << basedir << " journal " << journalpath << dendl;
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
-
- // verify fs isn't in use
-
- fsid_fd = ::open(fn, O_RDWR, 0644);
- if (fsid_fd < 0)
- return 0; // no fsid, ok.
- bool inuse = lock_fsid() < 0;
- VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
- fsid_fd = -1;
- return inuse;
-}
-
-int FileStore::_detect_fs()
-{
- struct statfs st;
- int r = ::fstatfs(basedir_fd, &st);
- if (r < 0)
- return -errno;
-
- blk_size = st.f_bsize;
-
- create_backend(st.f_type);
-
- r = backend->detect_features();
- if (r < 0) {
- derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl;
- return r;
- }
-
- // test xattrs
- char fn[PATH_MAX];
- int x = rand();
- int y = x+1;
- snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str());
- int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC, 0700);
- if (tmpfd < 0) {
- int ret = -errno;
- derr << "_detect_fs unable to create " << fn << ": " << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x));
- if (ret >= 0)
- ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y));
- if ((ret < 0) || (x != y)) {
- derr << "Extended attributes don't appear to work. ";
- if (ret)
- *_dout << "Got error " + cpp_strerror(ret) + ". ";
- *_dout << "If you are using ext3 or ext4, be sure to mount the underlying "
- << "file system with the 'user_xattr' option." << dendl;
- ::unlink(fn);
- VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
- return -ENOTSUP;
- }
-
- char buf[1000];
- memset(buf, 0, sizeof(buf)); // shut up valgrind
- chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf));
- chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf));
- chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf));
- chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
- ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
- if (ret == -ENOSPC) {
- dout(0) << "limited size xattrs" << dendl;
- }
- chain_fremovexattr(tmpfd, "user.test");
- chain_fremovexattr(tmpfd, "user.test2");
- chain_fremovexattr(tmpfd, "user.test3");
- chain_fremovexattr(tmpfd, "user.test4");
- chain_fremovexattr(tmpfd, "user.test5");
-
- ::unlink(fn);
- VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
-
- return 0;
-}
-
-int FileStore::_sanity_check_fs()
-{
- // sanity check(s)
-
- if (((int)m_filestore_journal_writeahead +
- (int)m_filestore_journal_parallel +
- (int)m_filestore_journal_trailing) > 1) {
- dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl;
- cerr << TEXT_RED
- << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n"
- << " is enabled in ceph.conf. You must choose a single journal mode."
- << TEXT_NORMAL << std::endl;
- return -EINVAL;
- }
-
- if (!backend->can_checkpoint()) {
- if (!journal || !m_filestore_journal_writeahead) {
- dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl;
- cerr << TEXT_RED
- << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n"
- << " For non-btrfs volumes, a writeahead journal is required to\n"
- << " maintain on-disk consistency in the event of a crash. Your conf\n"
- << " should include something like:\n"
- << " osd journal = /path/to/journal_device_or_file\n"
- << " filestore journal writeahead = true\n"
- << TEXT_NORMAL;
- }
- }
-
- if (!journal) {
- dout(0) << "mount WARNING: no journal" << dendl;
- cerr << TEXT_YELLOW
-	 << " ** WARNING: No osd journal is configured: write latency may be high.\n"
-	 << "            Latency can be reduced somewhat by lowering\n"
-	 << "            filestore_max_sync_interval, but lower values mean lower write\n"
-	 << "            throughput, especially with spinning disks.\n"
- << TEXT_NORMAL;
- }
-
- return 0;
-}
-
-int FileStore::write_superblock()
-{
- bufferlist bl;
- ::encode(superblock, bl);
- return safe_write_file(basedir.c_str(), "superblock",
- bl.c_str(), bl.length());
-}
-
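-// read_superblock: if the superblock file does not exist yet, write out the
-// default superblock (initial CompatSet) instead of failing.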
-int FileStore::read_superblock()
-{
- bufferptr bp(PATH_MAX);
- int ret = safe_read_file(basedir.c_str(), "superblock",
- bp.c_str(), bp.length());
- if (ret < 0) {
- if (ret == -ENOENT) {
- // If the file doesn't exist write initial CompatSet
- return write_superblock();
- }
- return ret;
- }
-
- bufferlist bl;
- bl.push_back(bp);
- bufferlist::iterator i = bl.begin();
- ::decode(superblock, i);
- return 0;
-}
-
-int FileStore::update_version_stamp()
-{
- return write_version_stamp();
-}
-
-int FileStore::version_stamp_is_valid(uint32_t *version)
-{
- bufferptr bp(PATH_MAX);
- int ret = safe_read_file(basedir.c_str(), "store_version",
- bp.c_str(), bp.length());
- if (ret < 0) {
- if (ret == -ENOENT)
- return 0;
- return ret;
- }
- bufferlist bl;
- bl.push_back(bp);
- bufferlist::iterator i = bl.begin();
- ::decode(*version, i);
- dout(10) << __func__ << " was " << *version << " vs target "
- << target_version << dendl;
- if (*version == target_version)
- return 1;
- else
- return 0;
-}
-
-int FileStore::write_version_stamp()
-{
- dout(1) << __func__ << " " << target_version << dendl;
- bufferlist bl;
- ::encode(target_version, bl);
-
- return safe_write_file(basedir.c_str(), "store_version",
- bl.c_str(), bl.length());
-}
-
-int FileStore::upgrade()
-{
- dout(1) << "upgrade" << dendl;
- uint32_t version;
- int r = version_stamp_is_valid(&version);
- if (r < 0)
- return r;
- if (r == 1)
- return 0;
-
- if (version < 3) {
- derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl;
- return -EINVAL;
- }
-
- // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to
- // open up DBObjectMap with the do_upgrade flag, which we already did.
- update_version_stamp();
- return 0;
-}
-
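-// read_op_seq: open current_op_seq_fn, parse the decimal op sequence number,
-// and return the open fd to the caller (who is responsible for closing it).
-// write_op_seq rewrites the same file in place via pwrite.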
-int FileStore::read_op_seq(uint64_t *seq)
-{
- int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR, 0644);
- if (op_fd < 0) {
- int r = -errno;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- char s[40];
- memset(s, 0, sizeof(s));
- int ret = safe_read(op_fd, s, sizeof(s) - 1);
- if (ret < 0) {
- derr << "error reading " << current_op_seq_fn << ": " << cpp_strerror(ret) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(op_fd));
- assert(!m_filestore_fail_eio || ret != -EIO);
- return ret;
- }
- *seq = atoll(s);
- return op_fd;
-}
-
-int FileStore::write_op_seq(int fd, uint64_t seq)
-{
- char s[30];
- snprintf(s, sizeof(s), "%" PRId64 "\n", seq);
- int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0));
- if (ret < 0) {
- ret = -errno;
- assert(!m_filestore_fail_eio || ret != -EIO);
- }
- return ret;
-}
-
-int FileStore::mount()
-{
- int ret;
- char buf[PATH_MAX];
- uint64_t initial_op_seq;
- set<string> cluster_snaps;
- CompatSet supported_compat_set = get_fs_supported_compat_set();
-
- dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;
-
- // make sure global base dir exists
- if (::access(basedir.c_str(), R_OK | W_OK)) {
- ret = -errno;
- derr << "FileStore::mount: unable to access basedir '" << basedir << "': "
- << cpp_strerror(ret) << dendl;
- goto done;
- }
-
- // get fsid
- snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str());
- fsid_fd = ::open(buf, O_RDWR, 0644);
- if (fsid_fd < 0) {
- ret = -errno;
- derr << "FileStore::mount: error opening '" << buf << "': "
- << cpp_strerror(ret) << dendl;
- goto done;
- }
-
- ret = read_fsid(fsid_fd, &fsid);
- if (ret < 0) {
- derr << "FileStore::mount: error reading fsid_fd: " << cpp_strerror(ret)
- << dendl;
- goto close_fsid_fd;
- }
-
- if (lock_fsid() < 0) {
- derr << "FileStore::mount: lock_fsid failed" << dendl;
- ret = -EBUSY;
- goto close_fsid_fd;
- }
-
- dout(10) << "mount fsid is " << fsid << dendl;
-
-
- uint32_t version_stamp;
- ret = version_stamp_is_valid(&version_stamp);
- if (ret < 0) {
- derr << "FileStore::mount : error in version_stamp_is_valid: "
- << cpp_strerror(ret) << dendl;
- goto close_fsid_fd;
- } else if (ret == 0) {
- if (do_update || (int)version_stamp < g_conf->filestore_update_to) {
- derr << "FileStore::mount : stale version stamp detected: "
- << version_stamp
-	   << ". Proceeding with disk format upgrade: do_update or "
-	   << "filestore_update_to is set."
- << dendl;
- do_update = true;
- } else {
- ret = -EINVAL;
- derr << "FileStore::mount : stale version stamp " << version_stamp
- << ". Please run the FileStore update script before starting the "
- << "OSD, or set filestore_update_to to " << target_version
- << " (currently " << g_conf->filestore_update_to << ")"
- << dendl;
- goto close_fsid_fd;
- }
- }
-
- ret = read_superblock();
- if (ret < 0) {
- ret = -EINVAL;
- goto close_fsid_fd;
- }
-
- // Check if this FileStore supports all the necessary features to mount
- if (supported_compat_set.compare(superblock.compat_features) == -1) {
- derr << "FileStore::mount : Incompatible features set "
- << superblock.compat_features << dendl;
- ret = -EINVAL;
- goto close_fsid_fd;
- }
-
- // open some dir handles
- basedir_fd = ::open(basedir.c_str(), O_RDONLY);
- if (basedir_fd < 0) {
- ret = -errno;
- derr << "FileStore::mount: failed to open " << basedir << ": "
- << cpp_strerror(ret) << dendl;
- basedir_fd = -1;
- goto close_fsid_fd;
- }
-
- // test for btrfs, xattrs, etc.
- ret = _detect_fs();
- if (ret < 0) {
- derr << "FileStore::mount : error in _detect_fs: "
- << cpp_strerror(ret) << dendl;
- goto close_basedir_fd;
- }
-
- {
- list<string> ls;
- ret = backend->list_checkpoints(ls);
- if (ret < 0) {
-      derr << "FileStore::mount : error in list_checkpoints: " << cpp_strerror(ret) << dendl;
- goto close_basedir_fd;
- }
-
- long long unsigned c, prev = 0;
- char clustersnap[NAME_MAX];
- for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
- if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
- assert(c > prev);
- prev = c;
- snaps.push_back(c);
- } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
- cluster_snaps.insert(*it);
- }
- }
-
- if (m_osd_rollback_to_cluster_snap.length() &&
- cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) {
- derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl;
- ret = -ENOENT;
- goto close_basedir_fd;
- }
-
- char nosnapfn[200];
- snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str());
-
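- // On checkpoint-capable backends (e.g. btrfs) roll current/ back to the
- // requested cluster snapshot, or else to the newest consistent commit snap;
- // in the latter case a nosnap marker blocks the rollback unless stale-snap
- // use has been forced with the 'osd use stale snap' option.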
- if (backend->can_checkpoint()) {
- if (snaps.empty()) {
- dout(0) << "mount WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
- } else {
- char s[NAME_MAX];
- uint64_t curr_seq = 0;
-
- if (m_osd_rollback_to_cluster_snap.length()) {
- derr << TEXT_RED
- << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **"
- << TEXT_NORMAL
- << dendl;
- assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
- snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
- } else {
- {
- int fd = read_op_seq(&curr_seq);
- if (fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- }
- }
- if (curr_seq)
- dout(10) << " current/ seq was " << curr_seq << dendl;
- else
- dout(10) << " current/ missing entirely (unusual, but okay)" << dendl;
-
- uint64_t cp = snaps.back();
- dout(10) << " most recent snap from " << snaps << " is " << cp << dendl;
-
- // if current/ is marked as non-snapshotted, refuse to roll
- // back (without clear direction) to avoid throwing out new
- // data.
- struct stat st;
- if (::stat(nosnapfn, &st) == 0) {
- if (!m_osd_use_stale_snap) {
- derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl;
-          derr << "Force rollback to the old snapshotted version with the 'osd use stale snap = true'" << dendl;
-          derr << "config option, or with the --osd-use-stale-snap startup argument." << dendl;
- ret = -ENOTSUP;
- goto close_basedir_fd;
- }
- derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq
- << ", newest snap is " << cp << dendl;
- cerr << TEXT_YELLOW
- << " ** WARNING: forcing the use of stale snapshot data **"
- << TEXT_NORMAL << std::endl;
- }
-
- dout(10) << "mount rolling back to consistent snap " << cp << dendl;
- snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
- }
-
- // drop current?
- ret = backend->rollback_to(s);
- if (ret) {
- derr << "FileStore::mount: error rolling back to " << s << ": "
- << cpp_strerror(ret) << dendl;
- goto close_basedir_fd;
- }
- }
- }
- initial_op_seq = 0;
-
- current_fd = ::open(current_fn.c_str(), O_RDONLY);
- if (current_fd < 0) {
- ret = -errno;
- derr << "FileStore::mount: error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl;
- goto close_basedir_fd;
- }
-
- assert(current_fd >= 0);
-
- op_fd = read_op_seq(&initial_op_seq);
- if (op_fd < 0) {
- derr << "FileStore::mount: read_op_seq failed" << dendl;
- goto close_current_fd;
- }
-
- dout(5) << "mount op_seq is " << initial_op_seq << dendl;
- if (initial_op_seq == 0) {
- derr << "mount initial op seq is 0; something is wrong" << dendl;
- ret = -EINVAL;
- goto close_current_fd;
- }
-
- if (!backend->can_checkpoint()) {
- // mark current/ as non-snapshotted so that we don't rollback away
- // from it.
- int r = ::creat(nosnapfn, 0644);
- if (r < 0) {
- derr << "FileStore::mount: failed to create current/nosnap" << dendl;
- goto close_current_fd;
- }
- VOID_TEMP_FAILURE_RETRY(::close(r));
- } else {
- // clear nosnap marker, if present.
- ::unlink(nosnapfn);
- }
-
- if (!(generic_flags & SKIP_MOUNT_OMAP)) {
- KeyValueDB * omap_store = KeyValueDB::create(g_ceph_context,
- superblock.omap_backend,
- omap_dir);
- if (omap_store == NULL)
- {
- derr << "Error creating " << superblock.omap_backend << dendl;
- ret = -1;
- goto close_current_fd;
- }
-
- if (superblock.omap_backend == "rocksdb")
- omap_store->init(g_conf->filestore_rocksdb_options);
- else
- omap_store->init();
-
- stringstream err;
- if (omap_store->create_and_open(err)) {
- delete omap_store;
- derr << "Error initializing " << superblock.omap_backend
- << " : " << err.str() << dendl;
- ret = -1;
- goto close_current_fd;
- }
-
- DBObjectMap *dbomap = new DBObjectMap(omap_store);
- ret = dbomap->init(do_update);
- if (ret < 0) {
- delete dbomap;
- derr << "Error initializing DBObjectMap: " << ret << dendl;
- goto close_current_fd;
- }
- stringstream err2;
-
- if (g_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
- derr << err2.str() << dendl;
- delete dbomap;
- ret = -EINVAL;
- goto close_current_fd;
- }
- object_map.reset(dbomap);
- }
-
- // journal
- new_journal();
-
- // select journal mode?
- if (journal) {
- if (!m_filestore_journal_writeahead &&
- !m_filestore_journal_parallel &&
- !m_filestore_journal_trailing) {
- if (!backend->can_checkpoint()) {
- m_filestore_journal_writeahead = true;
- dout(0) << "mount: enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
- } else {
- m_filestore_journal_parallel = true;
-	dout(0) << "mount: enabling PARALLEL journal mode: fs checkpointing is enabled" << dendl;
- }
- } else {
- if (m_filestore_journal_writeahead)
- dout(0) << "mount: WRITEAHEAD journal mode explicitly enabled in conf" << dendl;
- if (m_filestore_journal_parallel)
- dout(0) << "mount: PARALLEL journal mode explicitly enabled in conf" << dendl;
- if (m_filestore_journal_trailing)
- dout(0) << "mount: TRAILING journal mode explicitly enabled in conf" << dendl;
- }
- if (m_filestore_journal_writeahead)
- journal->set_wait_on_full(true);
- } else {
- dout(0) << "mount: no journal" << dendl;
- }
-
- ret = _sanity_check_fs();
- if (ret) {
- derr << "FileStore::mount: _sanity_check_fs failed with error "
- << ret << dendl;
- goto close_current_fd;
- }
-
- // Cleanup possibly invalid collections
- {
- vector<coll_t> collections;
- ret = list_collections(collections, true);
- if (ret < 0) {
- derr << "Error " << ret << " while listing collections" << dendl;
- goto close_current_fd;
- }
- for (vector<coll_t>::iterator i = collections.begin();
- i != collections.end();
- ++i) {
- Index index;
- ret = get_index(*i, &index);
- if (ret < 0) {
- derr << "Unable to mount index " << *i
- << " with error: " << ret << dendl;
- goto close_current_fd;
- }
- assert(NULL != index.index);
- RWLock::WLocker l((index.index)->access_lock);
-
- index->cleanup();
- }
- }
-
- wbthrottle.start();
- sync_thread.create();
-
- if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
- ret = journal_replay(initial_op_seq);
- if (ret < 0) {
- derr << "mount failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl;
- if (ret == -ENOTTY) {
- derr << "maybe journal is not pointing to a block device and its size "
- << "wasn't configured?" << dendl;
- }
-
- // stop sync thread
- lock.Lock();
- stop = true;
- sync_cond.Signal();
- lock.Unlock();
- sync_thread.join();
-
- wbthrottle.stop();
-
- goto close_current_fd;
- }
- }
-
- {
- stringstream err2;
- if (g_conf->filestore_debug_omap_check && !object_map->check(err2)) {
- derr << err2.str() << dendl;
- ret = -EINVAL;
- goto close_current_fd;
- }
- }
-
- init_temp_collections();
-
- journal_start();
-
- op_tp.start();
- for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
- (*it)->start();
- }
- for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
- (*it)->start();
- }
-
- timer.init();
-
- // upgrade?
- if (g_conf->filestore_update_to >= (int)get_target_version()) {
- int err = upgrade();
- if (err < 0) {
- derr << "error converting store" << dendl;
- umount();
- return err;
- }
- }
-
- // all okay.
- return 0;
-
-close_current_fd:
- VOID_TEMP_FAILURE_RETRY(::close(current_fd));
- current_fd = -1;
-close_basedir_fd:
- VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
- basedir_fd = -1;
-close_fsid_fd:
- VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
- fsid_fd = -1;
-done:
- assert(!m_filestore_fail_eio || ret != -EIO);
- return ret;
-}
-
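-// init_temp_collections: make sure every non-temp, non-meta collection has a
-// matching temp collection, and remove any stray temp collections left behind
-// by an unclean shutdown.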
-void FileStore::init_temp_collections()
-{
- dout(10) << __func__ << dendl;
- vector<coll_t> ls;
- int r = list_collections(ls, true);
- assert(r >= 0);
-
- dout(20) << " ls " << ls << dendl;
-
- SequencerPosition spos;
-
- set<coll_t> temps;
- for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p)
- if (p->is_temp())
- temps.insert(*p);
- dout(20) << " temps " << temps << dendl;
-
- for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
- if (p->is_temp())
- continue;
- if (p->is_meta())
- continue;
- coll_t temp = p->get_temp();
- if (temps.count(temp)) {
- temps.erase(temp);
- } else {
- dout(10) << __func__ << " creating " << temp << dendl;
- r = _create_collection(temp, spos);
- assert(r == 0);
- }
- }
-
- for (set<coll_t>::iterator p = temps.begin(); p != temps.end(); ++p) {
- dout(10) << __func__ << " removing stray " << *p << dendl;
- r = _collection_remove_recursive(*p, spos);
- assert(r == 0);
- }
-}
-
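-// umount: flush and force a final sync, then tear things down roughly in the
-// reverse of mount() order: sync thread, wbthrottle, op threadpool, journal,
-// finishers, file descriptors, backend and object map, and finally the timer.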
-int FileStore::umount()
-{
- dout(5) << "umount " << basedir << dendl;
-
- flush();
- sync();
- do_force_sync();
-
- lock.Lock();
- stop = true;
- sync_cond.Signal();
- lock.Unlock();
- sync_thread.join();
- wbthrottle.stop();
- op_tp.stop();
-
- journal_stop();
- if (!(generic_flags & SKIP_JOURNAL_REPLAY))
- journal_write_close();
-
- for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
- (*it)->stop();
- }
- for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
- (*it)->stop();
- }
-
- if (fsid_fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
- fsid_fd = -1;
- }
- if (op_fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(op_fd));
- op_fd = -1;
- }
- if (current_fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(current_fd));
- current_fd = -1;
- }
- if (basedir_fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
- basedir_fd = -1;
- }
-
- force_sync = false;
-
- delete backend;
- backend = NULL;
-
- object_map.reset();
-
- {
- Mutex::Locker l(sync_entry_timeo_lock);
- timer.shutdown();
- }
-
- // nothing
- return 0;
-}
-
-
-
-
-/// -----------------------------
-
-FileStore::Op *FileStore::build_op(list<Transaction*>& tls,
- Context *onreadable,
- Context *onreadable_sync,
- TrackedOpRef osd_op)
-{
- uint64_t bytes = 0, ops = 0;
- for (list<Transaction*>::iterator p = tls.begin();
- p != tls.end();
- ++p) {
- bytes += (*p)->get_num_bytes();
- ops += (*p)->get_num_ops();
- }
-
- Op *o = new Op;
- o->start = ceph_clock_now(g_ceph_context);
- o->tls.swap(tls);
- o->onreadable = onreadable;
- o->onreadable_sync = onreadable_sync;
- o->ops = ops;
- o->bytes = bytes;
- o->osd_op = osd_op;
- return o;
-}
-
-
-
-void FileStore::queue_op(OpSequencer *osr, Op *o)
-{
- // queue op on sequencer, then queue sequencer for the threadpool,
- // so that regardless of which order the threads pick up the
- // sequencer, the op order will be preserved.
-
- osr->queue(o);
-
- logger->inc(l_os_ops);
- logger->inc(l_os_bytes, o->bytes);
-
- dout(5) << "queue_op " << o << " seq " << o->op
- << " " << *osr
- << " " << o->bytes << " bytes"
- << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)"
- << dendl;
- op_wq.queue(osr);
-}
-
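-// op_queue_reserve_throttle: block (via Throttle::get) until this op fits
-// under the configured op/byte limits; the limits are raised while a
-// checkpointing backend is committing.  op_queue_release_throttle returns the
-// credit from _finish_op.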
-void FileStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle)
-{
- // Do not call while holding the journal lock!
- uint64_t max_ops = m_filestore_queue_max_ops;
- uint64_t max_bytes = m_filestore_queue_max_bytes;
-
- if (backend->can_checkpoint() && is_committing()) {
- max_ops += m_filestore_queue_committing_max_ops;
- max_bytes += m_filestore_queue_committing_max_bytes;
- }
-
- logger->set(l_os_oq_max_ops, max_ops);
- logger->set(l_os_oq_max_bytes, max_bytes);
-
- if (handle)
- handle->suspend_tp_timeout();
- if (throttle_ops.should_wait(1) ||
- (throttle_bytes.get_current() // let single large ops through!
- && throttle_bytes.should_wait(o->bytes))) {
- dout(2) << "waiting " << throttle_ops.get_current() + 1 << " > " << max_ops << " ops || "
- << throttle_bytes.get_current() + o->bytes << " > " << max_bytes << dendl;
- }
- throttle_ops.get();
- throttle_bytes.get(o->bytes);
- if (handle)
- handle->reset_tp_timeout();
-
- logger->set(l_os_oq_ops, throttle_ops.get_current());
- logger->set(l_os_oq_bytes, throttle_bytes.get_current());
-}
-
-void FileStore::op_queue_release_throttle(Op *o)
-{
- throttle_ops.put();
- throttle_bytes.put(o->bytes);
- logger->set(l_os_oq_ops, throttle_ops.get_current());
- logger->set(l_os_oq_bytes, throttle_bytes.get_current());
-}
-
-void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
-{
- wbthrottle.throttle();
- // inject a stall?
- if (g_conf->filestore_inject_stall) {
- int orig = g_conf->filestore_inject_stall;
- dout(5) << "_do_op filestore_inject_stall " << orig << ", sleeping" << dendl;
- for (int n = 0; n < g_conf->filestore_inject_stall; n++)
- sleep(1);
- g_conf->set_val("filestore_inject_stall", "0");
- dout(5) << "_do_op done stalling" << dendl;
- }
-
- osr->apply_lock.Lock();
- Op *o = osr->peek_queue();
- apply_manager.op_apply_start(o->op);
- dout(5) << "_do_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " start" << dendl;
- int r = _do_transactions(o->tls, o->op, &handle);
- apply_manager.op_apply_finish(o->op);
- dout(10) << "_do_op " << o << " seq " << o->op << " r = " << r
- << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl;
-}
-
-void FileStore::_finish_op(OpSequencer *osr)
-{
- list<Context*> to_queue;
- Op *o = osr->dequeue(&to_queue);
-
- utime_t lat = ceph_clock_now(g_ceph_context);
- lat -= o->start;
-
- dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl;
- osr->apply_lock.Unlock(); // locked in _do_op
-
- // called with tp lock held
- op_queue_release_throttle(o);
-
- logger->tinc(l_os_apply_lat, lat);
-
- if (o->onreadable_sync) {
- o->onreadable_sync->complete(0);
- }
- if (o->onreadable) {
- apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable);
- }
- if (!to_queue.empty()) {
- apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue);
- }
- delete o;
-}
-
-
-struct C_JournaledAhead : public Context {
- FileStore *fs;
- FileStore::OpSequencer *osr;
- FileStore::Op *o;
- Context *ondisk;
-
- C_JournaledAhead(FileStore *f, FileStore::OpSequencer *os, FileStore::Op *o, Context *ondisk):
- fs(f), osr(os), o(o), ondisk(ondisk) { }
- void finish(int r) {
- fs->_journaled_ahead(osr, o, ondisk);
- }
-};
-
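-// queue_transactions: entry point for new work.  With a writeable journal the
-// op takes either the parallel path (journal and apply proceed concurrently)
-// or the writeahead path (apply is queued from _journaled_ahead only after
-// the journal entry commits).  Without a journal the op is queued directly;
-// otherwise (trailing journal) the transactions are applied first and
-// journaled afterwards.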
-int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
- TrackedOpRef osd_op,
- ThreadPool::TPHandle *handle)
-{
- Context *onreadable;
- Context *ondisk;
- Context *onreadable_sync;
- ObjectStore::Transaction::collect_contexts(
- tls, &onreadable, &ondisk, &onreadable_sync);
- if (g_conf->filestore_blackhole) {
- dout(0) << "queue_transactions filestore_blackhole = TRUE, dropping transaction" << dendl;
- delete ondisk;
- delete onreadable;
- delete onreadable_sync;
- return 0;
- }
-
- utime_t start = ceph_clock_now(g_ceph_context);
- // set up the sequencer
- OpSequencer *osr;
- assert(posr);
- if (posr->p) {
- osr = static_cast<OpSequencer *>(posr->p.get());
- dout(5) << "queue_transactions existing " << osr << " " << *osr << dendl;
- } else {
- osr = new OpSequencer(next_osr_id.inc());
- osr->set_cct(g_ceph_context);
- osr->parent = posr;
- posr->p = osr;
- dout(5) << "queue_transactions new " << osr << " " << *osr << dendl;
- }
-
- // used to include osr information in tracepoints during transaction apply
- for (list<ObjectStore::Transaction*>::iterator i = tls.begin(); i != tls.end(); ++i) {
- (*i)->set_osr(osr);
- }
-
- if (journal && journal->is_writeable() && !m_filestore_journal_trailing) {
- Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
- op_queue_reserve_throttle(o, handle);
- journal->throttle();
-    // prepare and encode transaction data outside the lock
- bufferlist tbl;
- int orig_len = journal->prepare_entry(o->tls, &tbl);
- uint64_t op_num = submit_manager.op_submit_start();
- o->op = op_num;
-
- if (m_filestore_do_dump)
- dump_transactions(o->tls, o->op, osr);
-
- if (m_filestore_journal_parallel) {
- dout(5) << "queue_transactions (parallel) " << o->op << " " << o->tls << dendl;
-
- _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);
-
- // queue inside submit_manager op submission lock
- queue_op(osr, o);
- } else if (m_filestore_journal_writeahead) {
- dout(5) << "queue_transactions (writeahead) " << o->op << " " << o->tls << dendl;
-
- osr->queue_journal(o->op);
-
- _op_journal_transactions(tbl, orig_len, o->op,
- new C_JournaledAhead(this, osr, o, ondisk),
- osd_op);
- } else {
- assert(0);
- }
- submit_manager.op_submit_finish(op_num);
- utime_t end = ceph_clock_now(g_ceph_context);
- logger->tinc(l_os_queue_lat, end - start);
- return 0;
- }
-
- if (!journal) {
- Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
- dout(5) << __func__ << " (no journal) " << o << " " << tls << dendl;
-
- op_queue_reserve_throttle(o, handle);
-
- uint64_t op_num = submit_manager.op_submit_start();
- o->op = op_num;
-
- if (m_filestore_do_dump)
- dump_transactions(o->tls, o->op, osr);
-
- queue_op(osr, o);
-
- if (ondisk)
- apply_manager.add_waiter(op_num, ondisk);
- submit_manager.op_submit_finish(op_num);
- utime_t end = ceph_clock_now(g_ceph_context);
- logger->tinc(l_os_queue_lat, end - start);
- return 0;
- }
-
- assert(journal);
-  // prepare and encode transaction data outside the lock
- bufferlist tbl;
- int orig_len = -1;
- if (journal->is_writeable()) {
- orig_len = journal->prepare_entry(tls, &tbl);
- }
- uint64_t op = submit_manager.op_submit_start();
- dout(5) << "queue_transactions (trailing journal) " << op << " " << tls << dendl;
-
- if (m_filestore_do_dump)
- dump_transactions(tls, op, osr);
-
- apply_manager.op_apply_start(op);
- int r = do_transactions(tls, op);
-
- if (r >= 0) {
- _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op);
- } else {
- delete ondisk;
- }
-
- // start on_readable finisher after we queue journal item, as on_readable callback
- // is allowed to delete the Transaction
- if (onreadable_sync) {
- onreadable_sync->complete(r);
- }
- apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r);
-
- submit_manager.op_submit_finish(op);
- apply_manager.op_apply_finish(op);
-
- utime_t end = ceph_clock_now(g_ceph_context);
- logger->tinc(l_os_queue_lat, end - start);
- return r;
-}
-
-void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
-{
- dout(5) << "_journaled_ahead " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl;
-
-  // this should queue in order because the journal does its completions in order.
- queue_op(osr, o);
-
- list<Context*> to_queue;
- osr->dequeue_journal(&to_queue);
-
- // do ondisk completions async, to prevent any onreadable_sync completions
- // getting blocked behind an ondisk completion.
- if (ondisk) {
- dout(10) << " queueing ondisk " << ondisk << dendl;
- ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk);
- }
- if (!to_queue.empty()) {
- ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue);
- }
-}
-
-int FileStore::_do_transactions(
- list<Transaction*> &tls,
- uint64_t op_seq,
- ThreadPool::TPHandle *handle)
-{
- int r = 0;
- int trans_num = 0;
-
- for (list<Transaction*>::iterator p = tls.begin();
- p != tls.end();
- ++p, trans_num++) {
- r = _do_transaction(**p, op_seq, trans_num, handle);
- if (r < 0)
- break;
- if (handle)
- handle->reset_tp_timeout();
- }
-
- return r;
-}
-
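-// Replay guards: an xattr recording the SequencerPosition of the last applied
-// operation, used on non-checkpointing backends so that journal replay can
-// skip work that already reached the filesystem.  _check_replay_guard returns
-// 1 (replay), 0 (conditional replay, guard marked in-progress) or -1 (skip).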
-void FileStore::_set_global_replay_guard(coll_t cid,
- const SequencerPosition &spos)
-{
- if (backend->can_checkpoint())
- return;
-
- // sync all previous operations on this sequencer
- int ret = object_map->sync();
- if (ret < 0) {
- derr << __func__ << " : omap sync error " << cpp_strerror(ret) << dendl;
- assert(0 == "_set_global_replay_guard failed");
- }
- ret = sync_filesystem(basedir_fd);
- if (ret < 0) {
-    derr << __func__ << " : sync_filesystem error " << cpp_strerror(ret) << dendl;
- assert(0 == "_set_global_replay_guard failed");
- }
-
- char fn[PATH_MAX];
- get_cdir(cid, fn, sizeof(fn));
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- int err = errno;
- derr << __func__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
- assert(0 == "_set_global_replay_guard failed");
- }
-
- _inject_failure();
-
- // then record that we did it
- bufferlist v;
- ::encode(spos, v);
- int r = chain_fsetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
- if (r < 0) {
- derr << __func__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR
- << " got " << cpp_strerror(r) << dendl;
- assert(0 == "fsetxattr failed");
- }
-
- // and make sure our xattr is durable.
- ::fsync(fd);
-
- _inject_failure();
-
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- dout(10) << __func__ << ": " << spos << " done" << dendl;
-}
-
-int FileStore::_check_global_replay_guard(coll_t cid,
- const SequencerPosition& spos)
-{
- char fn[PATH_MAX];
- get_cdir(cid, fn, sizeof(fn));
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- dout(10) << __func__ << ": " << cid << " dne" << dendl;
- return 1; // if collection does not exist, there is no guard, and we can replay.
- }
-
- char buf[100];
- int r = chain_fgetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, buf, sizeof(buf));
- if (r < 0) {
- dout(20) << __func__ << " no xattr" << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return 1; // no xattr
- }
- bufferlist bl;
- bl.append(buf, r);
-
- SequencerPosition opos;
- bufferlist::iterator p = bl.begin();
- ::decode(opos, p);
-
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return spos >= opos ? 1 : -1;
-}
-
-
-void FileStore::_set_replay_guard(coll_t cid,
- const SequencerPosition &spos,
- bool in_progress=false)
-{
- char fn[PATH_MAX];
- get_cdir(cid, fn, sizeof(fn));
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- int err = errno;
- derr << "_set_replay_guard " << cid << " error " << cpp_strerror(err) << dendl;
- assert(0 == "_set_replay_guard failed");
- }
- _set_replay_guard(fd, spos, 0, in_progress);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
-}
-
-
-void FileStore::_set_replay_guard(int fd,
- const SequencerPosition& spos,
- const ghobject_t *hoid,
- bool in_progress)
-{
- if (backend->can_checkpoint())
- return;
-
- dout(10) << "_set_replay_guard " << spos << (in_progress ? " START" : "") << dendl;
-
- _inject_failure();
-
- // first make sure the previous operation commits
- ::fsync(fd);
-
-  // sync object_map too.  even if this object has no header or keys now,
-  // it may have had them in the past and then removed them, so always
- // sync.
- object_map->sync(hoid, &spos);
-
- _inject_failure();
-
- // then record that we did it
- bufferlist v(40);
- ::encode(spos, v);
- ::encode(in_progress, v);
- int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
- if (r < 0) {
- derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
- assert(0 == "fsetxattr failed");
- }
-
- // and make sure our xattr is durable.
- ::fsync(fd);
-
- _inject_failure();
-
- dout(10) << "_set_replay_guard " << spos << " done" << dendl;
-}
-
-void FileStore::_close_replay_guard(coll_t cid,
- const SequencerPosition &spos)
-{
- char fn[PATH_MAX];
- get_cdir(cid, fn, sizeof(fn));
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- int err = errno;
- derr << "_close_replay_guard " << cid << " error " << cpp_strerror(err) << dendl;
- assert(0 == "_close_replay_guard failed");
- }
- _close_replay_guard(fd, spos);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
-}
-
-void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
-{
- if (backend->can_checkpoint())
- return;
-
- dout(10) << "_close_replay_guard " << spos << dendl;
-
- _inject_failure();
-
- // then record that we are done with this operation
- bufferlist v(40);
- ::encode(spos, v);
- bool in_progress = false;
- ::encode(in_progress, v);
- int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
- if (r < 0) {
- derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
- assert(0 == "fsetxattr failed");
- }
-
- // and make sure our xattr is durable.
- ::fsync(fd);
-
- _inject_failure();
-
- dout(10) << "_close_replay_guard " << spos << " done" << dendl;
-}
-
-int FileStore::_check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& spos)
-{
- if (!replaying || backend->can_checkpoint())
- return 1;
-
- int r = _check_global_replay_guard(cid, spos);
- if (r < 0)
- return r;
-
- FDRef fd;
- r = lfn_open(cid, oid, false, &fd);
- if (r < 0) {
- dout(10) << "_check_replay_guard " << cid << " " << oid << " dne" << dendl;
- return 1; // if file does not exist, there is no guard, and we can replay.
- }
- int ret = _check_replay_guard(**fd, spos);
- lfn_close(fd);
- return ret;
-}
-
-int FileStore::_check_replay_guard(coll_t cid, const SequencerPosition& spos)
-{
- if (!replaying || backend->can_checkpoint())
- return 1;
-
- char fn[PATH_MAX];
- get_cdir(cid, fn, sizeof(fn));
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- dout(10) << "_check_replay_guard " << cid << " dne" << dendl;
- return 1; // if collection does not exist, there is no guard, and we can replay.
- }
- int ret = _check_replay_guard(fd, spos);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return ret;
-}
-
-int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
-{
- if (!replaying || backend->can_checkpoint())
- return 1;
-
- char buf[100];
- int r = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, buf, sizeof(buf));
- if (r < 0) {
- dout(20) << "_check_replay_guard no xattr" << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return 1; // no xattr
- }
- bufferlist bl;
- bl.append(buf, r);
-
- SequencerPosition opos;
- bufferlist::iterator p = bl.begin();
- ::decode(opos, p);
- bool in_progress = false;
- if (!p.end()) // older journals don't have this
- ::decode(in_progress, p);
- if (opos > spos) {
- dout(10) << "_check_replay_guard object has " << opos << " > current pos " << spos
- << ", now or in future, SKIPPING REPLAY" << dendl;
- return -1;
- } else if (opos == spos) {
- if (in_progress) {
- dout(10) << "_check_replay_guard object has " << opos << " == current pos " << spos
- << ", in_progress=true, CONDITIONAL REPLAY" << dendl;
- return 0;
- } else {
- dout(10) << "_check_replay_guard object has " << opos << " == current pos " << spos
- << ", in_progress=false, SKIPPING REPLAY" << dendl;
- return -1;
- }
- } else {
- dout(10) << "_check_replay_guard object has " << opos << " < current pos " << spos
- << ", in past, will replay" << dendl;
- return 1;
- }
-}
-
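-// _do_transaction: decode and apply each op in the transaction, consulting
-// the replay guard where appropriate; the error handling at the bottom
-// tolerates error codes that are expected during journal replay and asserts
-// on anything else.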
-unsigned FileStore::_do_transaction(
- Transaction& t, uint64_t op_seq, int trans_num,
- ThreadPool::TPHandle *handle)
-{
- dout(10) << "_do_transaction on " << &t << dendl;
-
-#ifdef WITH_LTTNG
- const char *osr_name = t.get_osr() ? static_cast<OpSequencer*>(t.get_osr())->get_name().c_str() : "<NULL>";
-#endif
-
- Transaction::iterator i = t.begin();
-
- SequencerPosition spos(op_seq, trans_num, 0);
- while (i.have_op()) {
- if (handle)
- handle->reset_tp_timeout();
-
- Transaction::Op *op = i.decode_op();
- int r = 0;
-
- _inject_failure();
-
- switch (op->op) {
- case Transaction::OP_NOP:
- break;
- case Transaction::OP_TOUCH:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- tracepoint(objectstore, touch_enter, osr_name);
- if (_check_replay_guard(cid, oid, spos) > 0)
- r = _touch(cid, oid);
- tracepoint(objectstore, touch_exit, r);
- }
- break;
-
- case Transaction::OP_WRITE:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- uint64_t off = op->off;
- uint64_t len = op->len;
- uint32_t fadvise_flags = i.get_fadvise_flags();
- bufferlist bl;
- i.decode_bl(bl);
- tracepoint(objectstore, write_enter, osr_name, off, len);
- if (_check_replay_guard(cid, oid, spos) > 0)
- r = _write(cid, oid, off, len, bl, fadvise_flags);
- tracepoint(objectstore, write_exit, r);
- }
- break;
-
- case Transaction::OP_ZERO:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- uint64_t off = op->off;
- uint64_t len = op->len;
- tracepoint(objectstore, zero_enter, osr_name, off, len);
- if (_check_replay_guard(cid, oid, spos) > 0)
- r = _zero(cid, oid, off, len);
- tracepoint(objectstore, zero_exit, r);
- }
- break;
-
- case Transaction::OP_TRIMCACHE:
- {
- // deprecated, no-op
- }
- break;
-
- case Transaction::OP_TRUNCATE:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- uint64_t off = op->off;
- tracepoint(objectstore, truncate_enter, osr_name, off);
- if (_check_replay_guard(cid, oid, spos) > 0)
- r = _truncate(cid, oid, off);
- tracepoint(objectstore, truncate_exit, r);
- }
- break;
-
- case Transaction::OP_REMOVE:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- tracepoint(objectstore, remove_enter, osr_name);
- if (_check_replay_guard(cid, oid, spos) > 0)
- r = _remove(cid, oid, spos);
- tracepoint(objectstore, remove_exit, r);
- }
- break;
-
- case Transaction::OP_SETATTR:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- string name = i.decode_string();
- bufferlist bl;
- i.decode_bl(bl);
- tracepoint(objectstore, setattr_enter, osr_name);
- if (_check_replay_guard(cid, oid, spos) > 0) {
- map<string, bufferptr> to_set;
- to_set[name] = bufferptr(bl.c_str(), bl.length());
- r = _setattrs(cid, oid, to_set, spos);
- if (r == -ENOSPC)
- dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid
- << " name " << name << " size " << bl.length() << dendl;
- }
- tracepoint(objectstore, setattr_exit, r);
- }
- break;
-
- case Transaction::OP_SETATTRS:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- map<string, bufferptr> aset;
- i.decode_attrset(aset);
- tracepoint(objectstore, setattrs_enter, osr_name);
- if (_check_replay_guard(cid, oid, spos) > 0)
- r = _setattrs(cid, oid, aset, spos);
- tracepoint(objectstore, setattrs_exit, r);
- if (r == -ENOSPC)
- dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl;
- }
- break;
-
- case Transaction::OP_RMATTR:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- string name = i.decode_string();
- tracepoint(objectstore, rmattr_enter, osr_name);
- if (_check_replay_guard(cid, oid, spos) > 0)
- r = _rmattr(cid, oid, name.c_str(), spos);
- tracepoint(objectstore, rmattr_exit, r);
- }
- break;
-
- case Transaction::OP_RMATTRS:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- tracepoint(objectstore, rmattrs_enter, osr_name);
- if (_check_replay_guard(cid, oid, spos) > 0)
- r = _rmattrs(cid, oid, spos);
- tracepoint(objectstore, rmattrs_exit, r);
- }
- break;
-
- case Transaction::OP_CLONE:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- ghobject_t noid = i.get_oid(op->dest_oid);
- tracepoint(objectstore, clone_enter, osr_name);
- r = _clone(cid, oid, noid, spos);
- tracepoint(objectstore, clone_exit, r);
- }
- break;
-
- case Transaction::OP_CLONERANGE:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- ghobject_t noid = i.get_oid(op->dest_oid);
- _kludge_temp_object_collection(cid, noid);
- uint64_t off = op->off;
- uint64_t len = op->len;
- tracepoint(objectstore, clone_range_enter, osr_name, len);
- r = _clone_range(cid, oid, noid, off, len, off, spos);
- tracepoint(objectstore, clone_range_exit, r);
- }
- break;
-
- case Transaction::OP_CLONERANGE2:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- ghobject_t noid = i.get_oid(op->dest_oid);
- _kludge_temp_object_collection(cid, noid);
- uint64_t srcoff = op->off;
- uint64_t len = op->len;
- uint64_t dstoff = op->dest_off;
- tracepoint(objectstore, clone_range2_enter, osr_name, len);
- r = _clone_range(cid, oid, noid, srcoff, len, dstoff, spos);
- tracepoint(objectstore, clone_range2_exit, r);
- }
- break;
-
- case Transaction::OP_MKCOLL:
- {
- coll_t cid = i.get_cid(op->cid);
- tracepoint(objectstore, mkcoll_enter, osr_name);
- if (_check_replay_guard(cid, spos) > 0)
- r = _create_collection(cid, spos);
- tracepoint(objectstore, mkcoll_exit, r);
- }
- break;
-
- case Transaction::OP_COLL_HINT:
- {
- coll_t cid = i.get_cid(op->cid);
- uint32_t type = op->hint_type;
- bufferlist hint;
- i.decode_bl(hint);
- bufferlist::iterator hiter = hint.begin();
- if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
- uint32_t pg_num;
- uint64_t num_objs;
- ::decode(pg_num, hiter);
- ::decode(num_objs, hiter);
- if (_check_replay_guard(cid, spos) > 0) {
- r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos);
- }
- } else {
- // Ignore the hint
- dout(10) << "Unrecognized collection hint type: " << type << dendl;
- }
- }
- break;
-
- case Transaction::OP_RMCOLL:
- {
- coll_t cid = i.get_cid(op->cid);
- tracepoint(objectstore, rmcoll_enter, osr_name);
- if (_check_replay_guard(cid, spos) > 0)
- r = _destroy_collection(cid);
- tracepoint(objectstore, rmcoll_exit, r);
- }
- break;
-
- case Transaction::OP_COLL_ADD:
- {
- coll_t ocid = i.get_cid(op->cid);
- coll_t ncid = i.get_cid(op->dest_cid);
- ghobject_t oid = i.get_oid(op->oid);
-
- assert(oid.hobj.pool >= -1);
-
- // always followed by OP_COLL_REMOVE
- Transaction::Op *op2 = i.decode_op();
- coll_t ocid2 = i.get_cid(op2->cid);
- ghobject_t oid2 = i.get_oid(op2->oid);
- assert(op2->op == Transaction::OP_COLL_REMOVE);
- assert(ocid2 == ocid);
- assert(oid2 == oid);
-
- tracepoint(objectstore, coll_add_enter);
- r = _collection_add(ncid, ocid, oid, spos);
- tracepoint(objectstore, coll_add_exit, r);
- spos.op++;
- if (r < 0)
- break;
- tracepoint(objectstore, coll_remove_enter, osr_name);
- if (_check_replay_guard(ocid, oid, spos) > 0)
- r = _remove(ocid, oid, spos);
- tracepoint(objectstore, coll_remove_exit, r);
- }
- break;
-
- case Transaction::OP_COLL_MOVE:
- {
- // WARNING: this is deprecated and buggy; only here to replay old journals.
- coll_t ocid = i.get_cid(op->cid);
- coll_t ncid = i.get_cid(op->dest_cid);
- ghobject_t oid = i.get_oid(op->oid);
- tracepoint(objectstore, coll_move_enter);
- r = _collection_add(ocid, ncid, oid, spos);
- if (r == 0 &&
- (_check_replay_guard(ocid, oid, spos) > 0))
- r = _remove(ocid, oid, spos);
- tracepoint(objectstore, coll_move_exit, r);
- }
- break;
-
- case Transaction::OP_COLL_MOVE_RENAME:
- {
- coll_t oldcid = i.get_cid(op->cid);
- ghobject_t oldoid = i.get_oid(op->oid);
- coll_t newcid = i.get_cid(op->dest_cid);
- ghobject_t newoid = i.get_oid(op->dest_oid);
- _kludge_temp_object_collection(oldcid, oldoid);
- _kludge_temp_object_collection(newcid, newoid);
- tracepoint(objectstore, coll_move_rename_enter);
- r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
- tracepoint(objectstore, coll_move_rename_exit, r);
- }
- break;
-
- case Transaction::OP_COLL_SETATTR:
- {
- coll_t cid = i.get_cid(op->cid);
- string name = i.decode_string();
- bufferlist bl;
- i.decode_bl(bl);
- tracepoint(objectstore, coll_setattr_enter, osr_name);
- if (_check_replay_guard(cid, spos) > 0)
- r = _collection_setattr(cid, name.c_str(), bl.c_str(), bl.length());
- tracepoint(objectstore, coll_setattr_exit, r);
- }
- break;
-
- case Transaction::OP_COLL_RMATTR:
- {
- coll_t cid = i.get_cid(op->cid);
- string name = i.decode_string();
- tracepoint(objectstore, coll_rmattr_enter, osr_name);
- if (_check_replay_guard(cid, spos) > 0)
- r = _collection_rmattr(cid, name.c_str());
- tracepoint(objectstore, coll_rmattr_exit, r);
- }
- break;
-
- case Transaction::OP_STARTSYNC:
- tracepoint(objectstore, startsync_enter, osr_name);
- _start_sync();
- tracepoint(objectstore, startsync_exit);
- break;
-
- case Transaction::OP_COLL_RENAME:
- {
- r = -EOPNOTSUPP;
- }
- break;
-
- case Transaction::OP_OMAP_CLEAR:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- tracepoint(objectstore, omap_clear_enter, osr_name);
- r = _omap_clear(cid, oid, spos);
- tracepoint(objectstore, omap_clear_exit, r);
- }
- break;
- case Transaction::OP_OMAP_SETKEYS:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- map<string, bufferlist> aset;
- i.decode_attrset(aset);
- tracepoint(objectstore, omap_setkeys_enter, osr_name);
- r = _omap_setkeys(cid, oid, aset, spos);
- tracepoint(objectstore, omap_setkeys_exit, r);
- }
- break;
- case Transaction::OP_OMAP_RMKEYS:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- set<string> keys;
- i.decode_keyset(keys);
- tracepoint(objectstore, omap_rmkeys_enter, osr_name);
- r = _omap_rmkeys(cid, oid, keys, spos);
- tracepoint(objectstore, omap_rmkeys_exit, r);
- }
- break;
- case Transaction::OP_OMAP_RMKEYRANGE:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- string first, last;
- first = i.decode_string();
- last = i.decode_string();
- tracepoint(objectstore, omap_rmkeyrange_enter, osr_name);
- r = _omap_rmkeyrange(cid, oid, first, last, spos);
- tracepoint(objectstore, omap_rmkeyrange_exit, r);
- }
- break;
- case Transaction::OP_OMAP_SETHEADER:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- bufferlist bl;
- i.decode_bl(bl);
- tracepoint(objectstore, omap_setheader_enter, osr_name);
- r = _omap_setheader(cid, oid, bl, spos);
- tracepoint(objectstore, omap_setheader_exit, r);
- }
- break;
- case Transaction::OP_SPLIT_COLLECTION:
- {
- assert(0 == "not legacy journal; upgrade to firefly first");
- }
- break;
- case Transaction::OP_SPLIT_COLLECTION2:
- {
- coll_t cid = i.get_cid(op->cid);
- uint32_t bits = op->split_bits;
- uint32_t rem = op->split_rem;
- coll_t dest = i.get_cid(op->dest_cid);
- tracepoint(objectstore, split_coll2_enter, osr_name);
- r = _split_collection(cid, bits, rem, dest, spos);
- tracepoint(objectstore, split_coll2_exit, r);
- }
- break;
-
- case Transaction::OP_SETALLOCHINT:
- {
- coll_t cid = i.get_cid(op->cid);
- ghobject_t oid = i.get_oid(op->oid);
- _kludge_temp_object_collection(cid, oid);
- uint64_t expected_object_size = op->expected_object_size;
- uint64_t expected_write_size = op->expected_write_size;
- tracepoint(objectstore, setallochint_enter, osr_name);
- if (_check_replay_guard(cid, oid, spos) > 0)
- r = _set_alloc_hint(cid, oid, expected_object_size,
- expected_write_size);
- tracepoint(objectstore, setallochint_exit, r);
- }
- break;
-
- default:
- derr << "bad op " << op->op << dendl;
- assert(0);
- }
-
- if (r < 0) {
- bool ok = false;
-
- if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
- op->op == Transaction::OP_CLONE ||
- op->op == Transaction::OP_CLONERANGE2 ||
- op->op == Transaction::OP_COLL_ADD))
- // -ENOENT is normally okay
- // ...including on a replayed OP_RMCOLL with checkpoint mode
- ok = true;
- if (r == -ENODATA)
- ok = true;
-
- if (op->op == Transaction::OP_SETALLOCHINT)
- // Either EOPNOTSUPP or EINVAL most probably. EINVAL in most
- // cases means invalid hint size (e.g. too big, not a multiple
- // of block size, etc) or, at least on xfs, an attempt to set
- // or change it when the file is not empty. However,
- // OP_SETALLOCHINT is advisory, so ignore all errors.
- ok = true;
-
- if (replaying && !backend->can_checkpoint()) {
- if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) {
- dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
- ok = true;
- }
- if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) {
- dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
- ok = true;
- }
- if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) {
- dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
- ok = true;
- }
- if (r == -ERANGE) {
- dout(10) << "tolerating ERANGE on replay" << dendl;
- ok = true;
- }
- if (r == -ENOENT) {
- dout(10) << "tolerating ENOENT on replay" << dendl;
- ok = true;
- }
- }
-
- if (!ok) {
- const char *msg = "unexpected error code";
-
- if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
- op->op == Transaction::OP_CLONE ||
- op->op == Transaction::OP_CLONERANGE2))
- msg = "ENOENT on clone suggests osd bug";
-
- if (r == -ENOSPC)
- // For now, if we hit _any_ ENOSPC, crash, before we do any damage
- // by partially applying transactions.
- msg = "ENOSPC handling not implemented";
-
- if (r == -ENOTEMPTY) {
- msg = "ENOTEMPTY suggests garbage data in osd data dir";
- }
-
- dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op
- << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl;
- dout(0) << msg << dendl;
- dout(0) << " transaction dump:\n";
- JSONFormatter f(true);
- f.open_object_section("transaction");
- t.dump(&f);
- f.close_section();
- f.flush(*_dout);
- *_dout << dendl;
-
- if (r == -EMFILE) {
- dump_open_fds(g_ceph_context);
- }
-
- assert(0 == "unexpected error");
- }
- }
-
- spos.op++;
- }
-
- _inject_failure();
-
- return 0; // FIXME count errors
-}
-
- /*********************************************/
-
-
-
-// --------------------
-// objects
-
-bool FileStore::exists(coll_t cid, const ghobject_t& oid)
-{
- tracepoint(objectstore, exists_enter, cid.c_str());
- _kludge_temp_object_collection(cid, oid);
- struct stat st;
- bool retval = stat(cid, oid, &st) == 0;
- tracepoint(objectstore, exists_exit, retval);
- return retval;
-}
-
-int FileStore::stat(
- coll_t cid, const ghobject_t& oid, struct stat *st, bool allow_eio)
-{
- tracepoint(objectstore, stat_enter, cid.c_str());
- _kludge_temp_object_collection(cid, oid);
- int r = lfn_stat(cid, oid, st);
- assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
- if (r < 0) {
- dout(10) << "stat " << cid << "/" << oid
- << " = " << r << dendl;
- } else {
- dout(10) << "stat " << cid << "/" << oid
- << " = " << r
- << " (size " << st->st_size << ")" << dendl;
- }
- if (g_conf->filestore_debug_inject_read_err &&
- debug_mdata_eio(oid)) {
- return -EIO;
- } else {
- tracepoint(objectstore, stat_exit, r);
- return r;
- }
-}
-
-int FileStore::read(
- coll_t cid,
- const ghobject_t& oid,
- uint64_t offset,
- size_t len,
- bufferlist& bl,
- uint32_t op_flags,
- bool allow_eio)
-{
- int got;
- tracepoint(objectstore, read_enter, cid.c_str(), offset, len);
- _kludge_temp_object_collection(cid, oid);
-
- dout(15) << "read " << cid << "/" << oid << " " << offset << "~" << len << dendl;
-
- FDRef fd;
- int r = lfn_open(cid, oid, false, &fd);
- if (r < 0) {
- dout(10) << "FileStore::read(" << cid << "/" << oid << ") open error: "
- << cpp_strerror(r) << dendl;
- return r;
- }
-
- if (len == 0) {
- struct stat st;
- memset(&st, 0, sizeof(struct stat));
- int r = ::fstat(**fd, &st);
- assert(r == 0);
- len = st.st_size;
- }
-
-#ifdef HAVE_POSIX_FADVISE
- if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM)
- posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM);
- if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)
- posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL);
-#endif
-
- bufferptr bptr(len); // prealloc space for entire read
- got = safe_pread(**fd, bptr.c_str(), len, offset);
- if (got < 0) {
- dout(10) << "FileStore::read(" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl;
- lfn_close(fd);
- assert(allow_eio || !m_filestore_fail_eio || got != -EIO);
- return got;
- }
- bptr.set_length(got); // properly size the buffer
- bl.push_back(bptr); // put it in the target bufferlist
-
-#ifdef HAVE_POSIX_FADVISE
- if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
- posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED);
- if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL))
- posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL);
-#endif
-
- if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
- ostringstream ss;
- int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
- if (errors > 0) {
- dout(0) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
- << got << " ... BAD CRC:\n" << ss.str() << dendl;
- assert(0 == "bad crc on read");
- }
- }
-
- lfn_close(fd);
-
- dout(10) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
- << got << "/" << len << dendl;
- if (g_conf->filestore_debug_inject_read_err &&
- debug_data_eio(oid)) {
- return -EIO;
- } else {
- tracepoint(objectstore, read_exit, got);
- return got;
- }
-}
-
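-// _do_fiemap: ask the backend for the raw fiemap, clip the first extent to
-// the requested offset, merge adjacent extents, clip the final extent to
-// offset+len, and return the result as a logical-offset -> length map.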
-int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len,
- map<uint64_t, uint64_t> *m)
-{
- struct fiemap *fiemap = NULL;
- uint64_t i;
- struct fiemap_extent *extent = NULL;
- int r = 0;
-
- r = backend->do_fiemap(fd, offset, len, &fiemap);
- if (r < 0)
- return r;
-
- if (fiemap->fm_mapped_extents == 0) {
- free(fiemap);
- return r;
- }
-
- extent = &fiemap->fm_extents[0];
-
- /* start where we were asked to start */
- if (extent->fe_logical < offset) {
- extent->fe_length -= offset - extent->fe_logical;
- extent->fe_logical = offset;
- }
-
- i = 0;
-
- while (i < fiemap->fm_mapped_extents) {
- struct fiemap_extent *next = extent + 1;
-
- dout(10) << "FileStore::fiemap() fm_mapped_extents=" << fiemap->fm_mapped_extents
- << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl;
-
- /* try to merge extents */
- while ((i < fiemap->fm_mapped_extents - 1) &&
- (extent->fe_logical + extent->fe_length == next->fe_logical)) {
- next->fe_length += extent->fe_length;
- next->fe_logical = extent->fe_logical;
- extent = next;
- next = extent + 1;
- i++;
- }
-
- if (extent->fe_logical + extent->fe_length > offset + len)
- extent->fe_length = offset + len - extent->fe_logical;
- (*m)[extent->fe_logical] = extent->fe_length;
- i++;
- extent++;
- }
- free(fiemap);
-
- return r;
-}
-
-int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len,
- map<uint64_t, uint64_t> *m)
-{
-#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
- off_t hole_pos, data_pos;
- int r = 0;
-
-  // If lseek fails with errno set to ENXIO, the requested offset is beyond
-  // the end of the file.
- off_t start = offset;
- while(start < (off_t)(offset + len)) {
- data_pos = lseek(fd, start, SEEK_DATA);
- if (data_pos < 0) {
- if (errno == ENXIO)
- break;
- else {
- r = -errno;
- dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
- return r;
- }
- } else if (data_pos > (off_t)(offset + len)) {
- break;
- }
-
- hole_pos = lseek(fd, data_pos, SEEK_HOLE);
- if (hole_pos < 0) {
- if (errno == ENXIO) {
- break;
- } else {
- r = -errno;
- dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
- return r;
- }
- }
-
- if (hole_pos >= (off_t)(offset + len)) {
- (*m)[data_pos] = offset + len - data_pos;
- break;
- }
- (*m)[data_pos] = hole_pos - data_pos;
- start = hole_pos;
- }
-
- return r;
-#else
- (*m)[offset] = len;
- return 0;
-#endif
-}
-
-int FileStore::fiemap(coll_t cid, const ghobject_t& oid,
- uint64_t offset, size_t len,
- bufferlist& bl)
-{
- tracepoint(objectstore, fiemap_enter, cid.c_str(), offset, len);
- _kludge_temp_object_collection(cid, oid);
-
- if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) ||
- len <= (size_t)m_filestore_fiemap_threshold) {
- map<uint64_t, uint64_t> m;
- m[offset] = len;
- ::encode(m, bl);
- return 0;
- }
-
- dout(15) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << dendl;
-
- map<uint64_t, uint64_t> exomap;
- FDRef fd;
-
- int r = lfn_open(cid, oid, false, &fd);
- if (r < 0) {
- dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl;
- goto done;
- }
-
- if (backend->has_seek_data_hole()) {
- dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl;
- r = _do_seek_hole_data(**fd, offset, len, &exomap);
- } else if (backend->has_fiemap()) {
- dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl;
- r = _do_fiemap(**fd, offset, len, &exomap);
- }
-
-done:
- if (r >= 0) {
- lfn_close(fd);
- ::encode(exomap, bl);
- }
-
- dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << exomap.size() << " " << exomap << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- tracepoint(objectstore, fiemap_exit, r);
- return r;
-}
-
-
-int FileStore::_remove(coll_t cid, const ghobject_t& oid,
- const SequencerPosition &spos)
-{
- dout(15) << "remove " << cid << "/" << oid << dendl;
- int r = lfn_unlink(cid, oid, spos);
- dout(10) << "remove " << cid << "/" << oid << " = " << r << dendl;
- return r;
-}
-
-int FileStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size)
-{
- dout(15) << "truncate " << cid << "/" << oid << " size " << size << dendl;
- int r = lfn_truncate(cid, oid, size);
- dout(10) << "truncate " << cid << "/" << oid << " size " << size << " = " << r << dendl;
- return r;
-}
-
-
-int FileStore::_touch(coll_t cid, const ghobject_t& oid)
-{
- dout(15) << "touch " << cid << "/" << oid << dendl;
-
- FDRef fd;
- int r = lfn_open(cid, oid, true, &fd);
- if (r < 0) {
- return r;
- } else {
- lfn_close(fd);
- }
- dout(10) << "touch " << cid << "/" << oid << " = " << r << dendl;
- return r;
-}
-
-int FileStore::_write(coll_t cid, const ghobject_t& oid,
- uint64_t offset, size_t len,
- const bufferlist& bl, uint32_t fadvise_flags)
-{
- dout(15) << "write " << cid << "/" << oid << " " << offset << "~" << len << dendl;
- int r;
-
- int64_t actual;
-
- FDRef fd;
- r = lfn_open(cid, oid, true, &fd);
- if (r < 0) {
- dout(0) << "write couldn't open " << cid << "/"
- << oid << ": "
- << cpp_strerror(r) << dendl;
- goto out;
- }
-
- // seek
- actual = ::lseek64(**fd, offset, SEEK_SET);
- if (actual < 0) {
- r = -errno;
- dout(0) << "write lseek64 to " << offset << " failed: " << cpp_strerror(r) << dendl;
- lfn_close(fd);
- goto out;
- }
- if (actual != (int64_t)offset) {
- dout(0) << "write lseek64 to " << offset << " gave bad offset " << actual << dendl;
- r = -EIO;
- lfn_close(fd);
- goto out;
- }
-
- // write
- r = bl.write_fd(**fd);
- if (r == 0)
- r = bl.length();
-
- if (r >= 0 && m_filestore_sloppy_crc) {
- int rc = backend->_crc_update_write(**fd, offset, len, bl);
- assert(rc >= 0);
- }
-
- // flush?
- if (!replaying &&
- g_conf->filestore_wbthrottle_enable)
- wbthrottle.queue_wb(fd, oid, offset, len,
- fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
- lfn_close(fd);
-
- out:
- dout(10) << "write " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl;
- return r;
-}
-
-int FileStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len)
-{
- dout(15) << "zero " << cid << "/" << oid << " " << offset << "~" << len << dendl;
- int ret = 0;
-
-#ifdef CEPH_HAVE_FALLOCATE
-# if !defined(DARWIN) && !defined(__FreeBSD__)
- // first try to punch a hole.
- FDRef fd;
- ret = lfn_open(cid, oid, false, &fd);
- if (ret < 0) {
- goto out;
- }
-
- // first try fallocate
- ret = fallocate(**fd, FALLOC_FL_PUNCH_HOLE, offset, len);
- if (ret < 0)
- ret = -errno;
- lfn_close(fd);
-
- if (ret >= 0 && m_filestore_sloppy_crc) {
- int rc = backend->_crc_update_zero(**fd, offset, len);
- assert(rc >= 0);
- }
-
- if (ret == 0)
- goto out; // yay!
- if (ret != -EOPNOTSUPP)
- goto out; // some other error
-# endif
-#endif
-
- // lame, kernel is old and doesn't support it.
- // write zeros.. yuck!
- dout(20) << "zero FALLOC_FL_PUNCH_HOLE not supported, falling back to writing zeros" << dendl;
- {
- bufferptr bp(len);
- bp.zero();
- bufferlist bl;
- bl.push_back(bp);
- ret = _write(cid, oid, offset, len, bl);
- }
-
- out:
- dout(20) << "zero " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl;
- return ret;
-}
-
-int FileStore::_clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
- const SequencerPosition& spos)
-{
- dout(15) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
-
- if (_check_replay_guard(cid, newoid, spos) < 0)
- return 0;
-
- int r;
- FDRef o, n;
- {
- Index index;
- r = lfn_open(cid, oldoid, false, &o, &index);
- if (r < 0) {
- goto out2;
- }
- assert(NULL != (index.index));
- RWLock::WLocker l((index.index)->access_lock);
-
- r = lfn_open(cid, newoid, true, &n, &index);
- if (r < 0) {
- goto out;
- }
- r = ::ftruncate(**n, 0);
- if (r < 0) {
- r = -errno;
- goto out3;
- }
- struct stat st;
- ::fstat(**o, &st);
- r = _do_clone_range(**o, **n, 0, st.st_size, 0);
- if (r < 0) {
- r = -errno;
- goto out3;
- }
-
- dout(20) << "objectmap clone" << dendl;
- r = object_map->clone(oldoid, newoid, &spos);
- if (r < 0 && r != -ENOENT)
- goto out3;
- }
-
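- // copy the inline xattrs and carry over the spill-out marker; any xattrs
- // that spilled out into the omap were already cloned with the omap above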
- {
- char buf[2];
- map<string, bufferptr> aset;
- r = _fgetattrs(**o, aset);
- if (r < 0)
- goto out3;
-
- r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
- if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
- r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
- sizeof(XATTR_NO_SPILL_OUT), true);
- } else {
- r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
- sizeof(XATTR_SPILL_OUT), true);
- }
- if (r < 0)
- goto out3;
-
- r = _fsetattrs(**n, aset);
- if (r < 0)
- goto out3;
- }
-
- // clone is non-idempotent; record our work.
- _set_replay_guard(**n, spos, &newoid);
-
- out3:
- lfn_close(n);
- out:
- lfn_close(o);
- out2:
- dout(10) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
-}
-
-int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
-{
- dout(20) << "_do_clone_range copy " << srcoff << "~" << len << " to " << dstoff << dendl;
- return backend->clone_range(from, to, srcoff, len, dstoff);
-}
-
-int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
-{
- dout(20) << __func__ << " " << srcoff << "~" << len << " to " << dstoff << dendl;
- int r = 0;
- map<uint64_t, uint64_t> exomap;
- // fiemap doesn't allow zero length
- if (len == 0)
- return 0;
-
- if (backend->has_seek_data_hole()) {
- dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl;
- r = _do_seek_hole_data(from, srcoff, len, &exomap);
- } else if (backend->has_fiemap()) {
- dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl;
- r = _do_fiemap(from, srcoff, len, &exomap);
- }
-
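- // copy only the extents that actually contain data; the ftruncate below
- // makes sure the destination still covers the full range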
- int64_t written = 0;
- for (map<uint64_t, uint64_t>::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) {
- uint64_t it_off = miter->first - srcoff + dstoff;
- r = _do_copy_range(from, to, miter->first, miter->second, it_off, true);
- if (r < 0) {
- r = -errno;
- derr << "FileStore::_do_copy_range: copy error at " << miter->first << "~" << miter->second
- << " to " << it_off << ", " << cpp_strerror(r) << dendl;
- break;
- }
- written += miter->second;
- }
-
- if (r >= 0) {
- if (m_filestore_sloppy_crc) {
- int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
- assert(rc >= 0);
- }
- struct stat st;
- r = ::fstat(to, &st);
- if (r < 0) {
- r = -errno;
- derr << __func__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl;
- goto out;
- }
- if (st.st_size < (int)(dstoff + len)) {
- r = ::ftruncate(to, dstoff + len);
- if (r < 0) {
- r = -errno;
- derr << __func__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl;
- goto out;
- }
- }
- r = written;
- }
-
- out:
- dout(20) << __func__ << " " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
- return r;
-}
-
-int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc)
-{
- dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << dendl;
- int r = 0;
- loff_t pos = srcoff;
- loff_t end = srcoff + len;
- int buflen = 4096 * 16; // limit by pipe max size; see fcntl
-
-#ifdef CEPH_HAVE_SPLICE
- if (backend->has_splice()) {
- int pipefd[2];
- if (pipe(pipefd) < 0) {
- r = -errno;
- derr << "pipe failed: " << cpp_strerror(r) << dendl;
- return r;
- }
-
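- // shuttle the data through the pipe with splice(2) to avoid copying
- // through a userspace buffer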
- loff_t dstpos = dstoff;
- while (pos < end) {
- int l = MIN(end-pos, buflen);
- r = safe_splice(from, &pos, pipefd[1], NULL, l, SPLICE_F_NONBLOCK);
- dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl;
- if (r < 0) {
- derr << "FileStore::_do_copy_range: safe_splice read error at " << pos << "~" << len
- << ", " << cpp_strerror(r) << dendl;
- break;
- }
- if (r == 0) {
- // hrm, bad source range, wtf.
- r = -ERANGE;
- derr << "FileStore::_do_copy_range got short read result at " << pos
- << " of fd " << from << " len " << len << dendl;
- break;
- }
-
- r = safe_splice(pipefd[0], NULL, to, &dstpos, r, 0);
- dout(10) << " safe_splice write to " << to << " len " << r
- << " got " << r << dendl;
- if (r < 0) {
- derr << "FileStore::_do_copy_range: write error at " << pos << "~"
- << r << ", " << cpp_strerror(r) << dendl;
- break;
- }
- }
- close(pipefd[0]);
- close(pipefd[1]);
- } else
-#endif
- {
- int64_t actual;
-
- actual = ::lseek64(from, srcoff, SEEK_SET);
- if (actual != (int64_t)srcoff) {
- r = -errno;
- derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl;
- return r;
- }
- actual = ::lseek64(to, dstoff, SEEK_SET);
- if (actual != (int64_t)dstoff) {
- r = -errno;
- derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl;
- return r;
- }
-
- char buf[buflen];
- while (pos < end) {
- int l = MIN(end-pos, buflen);
- r = ::read(from, buf, l);
- dout(25) << " read from " << pos << "~" << l << " got " << r << dendl;
- if (r < 0) {
- if (errno == EINTR) {
- continue;
- } else {
- r = -errno;
- derr << "FileStore::_do_copy_range: read error at " << pos << "~" << len
- << ", " << cpp_strerror(r) << dendl;
- break;
- }
- }
- if (r == 0) {
- // hrm, bad source range, wtf.
- r = -ERANGE;
- derr << "FileStore::_do_copy_range got short read result at " << pos
- << " of fd " << from << " len " << len << dendl;
- break;
- }
- int op = 0;
- while (op < r) {
- int r2 = safe_write(to, buf+op, r-op);
- dout(25) << " write to " << to << " len " << (r-op)
- << " got " << r2 << dendl;
- if (r2 < 0) {
- r = r2;
- derr << "FileStore::_do_copy_range: write error at " << pos << "~"
- << r-op << ", " << cpp_strerror(r) << dendl;
-
- break;
- }
- op += (r-op);
- }
- if (r < 0)
- break;
- pos += r;
- }
- }
-
- assert(pos == end);
- if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) {
- int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
- assert(rc >= 0);
- }
- dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
- return r;
-}
-
-int FileStore::_clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
- uint64_t srcoff, uint64_t len, uint64_t dstoff,
- const SequencerPosition& spos)
-{
- dout(15) << "clone_range " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl;
-
- if (_check_replay_guard(cid, newoid, spos) < 0)
- return 0;
-
- int r;
- FDRef o, n;
- r = lfn_open(cid, oldoid, false, &o);
- if (r < 0) {
- goto out2;
- }
- r = lfn_open(cid, newoid, true, &n);
- if (r < 0) {
- goto out;
- }
- r = _do_clone_range(**o, **n, srcoff, len, dstoff);
- if (r < 0) {
- r = -errno;
- goto out3;
- }
-
- // clone is non-idempotent; record our work.
- _set_replay_guard(**n, spos, &newoid);
-
- out3:
- lfn_close(n);
- out:
- lfn_close(o);
- out2:
- dout(10) << "clone_range " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " "
- << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
- return r;
-}
-
-class SyncEntryTimeout : public Context {
-public:
- SyncEntryTimeout(int commit_timeo)
- : m_commit_timeo(commit_timeo)
- {
- }
-
- void finish(int r) {
- BackTrace *bt = new BackTrace(1);
- generic_dout(-1) << "FileStore: sync_entry timed out after "
- << m_commit_timeo << " seconds.\n";
- bt->print(*_dout);
- *_dout << dendl;
- delete bt;
- ceph_abort();
- }
-private:
- int m_commit_timeo;
-};
-
-void FileStore::sync_entry()
-{
- lock.Lock();
- while (!stop) {
- utime_t max_interval;
- max_interval.set_from_double(m_filestore_max_sync_interval);
- utime_t min_interval;
- min_interval.set_from_double(m_filestore_min_sync_interval);
-
- utime_t startwait = ceph_clock_now(g_ceph_context);
- if (!force_sync) {
- dout(20) << "sync_entry waiting for max_interval " << max_interval << dendl;
- sync_cond.WaitInterval(g_ceph_context, lock, max_interval);
- } else {
- dout(20) << "sync_entry not waiting, force_sync set" << dendl;
- }
-
- if (force_sync) {
- dout(20) << "sync_entry force_sync set" << dendl;
- force_sync = false;
- } else {
- // wait for at least the min interval
- utime_t woke = ceph_clock_now(g_ceph_context);
- woke -= startwait;
- dout(20) << "sync_entry woke after " << woke << dendl;
- if (woke < min_interval) {
- utime_t t = min_interval;
- t -= woke;
- dout(20) << "sync_entry waiting for another " << t
- << " to reach min interval " << min_interval << dendl;
- sync_cond.WaitInterval(g_ceph_context, lock, t);
- }
- }
-
- list<Context*> fin;
- again:
- fin.swap(sync_waiters);
- lock.Unlock();
-
- op_tp.pause();
- if (apply_manager.commit_start()) {
- utime_t start = ceph_clock_now(g_ceph_context);
- uint64_t cp = apply_manager.get_committing_seq();
-
- sync_entry_timeo_lock.Lock();
- SyncEntryTimeout *sync_entry_timeo =
- new SyncEntryTimeout(m_filestore_commit_timeout);
- timer.add_event_after(m_filestore_commit_timeout, sync_entry_timeo);
- sync_entry_timeo_lock.Unlock();
-
- logger->set(l_os_committing, 1);
-
- dout(15) << "sync_entry committing " << cp << dendl;
- stringstream errstream;
- if (g_conf->filestore_debug_omap_check && !object_map->check(errstream)) {
- derr << errstream.str() << dendl;
- assert(0);
- }
-
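- // checkpoint-capable backends (e.g. btrfs) commit by taking a snapshot;
- // everything else falls back to a full syncfs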
- if (backend->can_checkpoint()) {
- int err = write_op_seq(op_fd, cp);
- if (err < 0) {
- derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
- assert(0 == "error during write_op_seq");
- }
-
- char s[NAME_MAX];
- snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
- uint64_t cid = 0;
- err = backend->create_checkpoint(s, &cid);
- if (err < 0) {
- derr << "snap create '" << s << "' got error " << cpp_strerror(err) << dendl;
- assert(0 == "error creating checkpoint");
- }
-
- snaps.push_back(cp);
- apply_manager.commit_started();
- op_tp.unpause();
-
- if (cid > 0) {
- dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl;
- err = backend->sync_checkpoint(cid);
- if (err < 0) {
- derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl;
- assert(0 == "wait_sync got error");
- }
- dout(20) << " done waiting for checkpoint" << cid << " to complete" << dendl;
- }
- } else
- {
- apply_manager.commit_started();
- op_tp.unpause();
-
- object_map->sync();
- int err = backend->syncfs();
- if (err < 0) {
- derr << "syncfs got " << cpp_strerror(err) << dendl;
- assert(0 == "syncfs returned error");
- }
-
- err = write_op_seq(op_fd, cp);
- if (err < 0) {
- derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
- assert(0 == "error during write_op_seq");
- }
- err = ::fsync(op_fd);
- if (err < 0) {
- derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl;
- assert(0 == "error during fsync of op_seq");
- }
- }
-
- utime_t done = ceph_clock_now(g_ceph_context);
- utime_t lat = done - start;
- utime_t dur = done - startwait;
- dout(10) << "sync_entry commit took " << lat << ", interval was " << dur << dendl;
-
- logger->inc(l_os_commit);
- logger->tinc(l_os_commit_lat, lat);
- logger->tinc(l_os_commit_len, dur);
-
- apply_manager.commit_finish();
- wbthrottle.clear();
-
- logger->set(l_os_committing, 0);
-
- // remove old snaps?
- if (backend->can_checkpoint()) {
- char s[NAME_MAX];
- while (snaps.size() > 2) {
- snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front());
- snaps.pop_front();
- dout(10) << "removing snap '" << s << "'" << dendl;
- int r = backend->destroy_checkpoint(s);
- if (r) {
- int err = errno;
- derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl;
- }
- }
- }
-
- dout(15) << "sync_entry committed to op_seq " << cp << dendl;
-
- sync_entry_timeo_lock.Lock();
- timer.cancel_event(sync_entry_timeo);
- sync_entry_timeo_lock.Unlock();
- } else {
- op_tp.unpause();
- }
-
- lock.Lock();
- finish_contexts(g_ceph_context, fin, 0);
- fin.clear();
- if (!sync_waiters.empty()) {
- dout(10) << "sync_entry more waiters, committing again" << dendl;
- goto again;
- }
- if (!stop && journal && journal->should_commit_now()) {
- dout(10) << "sync_entry journal says we should commit again (probably is/was full)" << dendl;
- goto again;
- }
- }
- stop = false;
- lock.Unlock();
-}
-
-void FileStore::_start_sync()
-{
- if (!journal) { // don't do a big sync if the journal is on
- dout(10) << "start_sync" << dendl;
- sync_cond.Signal();
- } else {
- dout(10) << "start_sync - NOOP (journal is on)" << dendl;
- }
-}
-
-void FileStore::do_force_sync()
-{
- dout(10) << __func__ << dendl;
- Mutex::Locker l(lock);
- force_sync = true;
- sync_cond.Signal();
-}
-
-void FileStore::start_sync(Context *onsafe)
-{
- Mutex::Locker l(lock);
- sync_waiters.push_back(onsafe);
- sync_cond.Signal();
- force_sync = true;
- dout(10) << "start_sync" << dendl;
-}
-
-void FileStore::sync()
-{
- Mutex l("FileStore::sync");
- Cond c;
- bool done;
- C_SafeCond *fin = new C_SafeCond(&l, &c, &done);
-
- start_sync(fin);
-
- l.Lock();
- while (!done) {
- dout(10) << "sync waiting" << dendl;
- c.Wait(l);
- }
- l.Unlock();
- dout(10) << "sync done" << dendl;
-}
-
-void FileStore::_flush_op_queue()
-{
- dout(10) << "_flush_op_queue draining op tp" << dendl;
- op_wq.drain();
- dout(10) << "_flush_op_queue waiting for apply finisher" << dendl;
- for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
- (*it)->wait_for_empty();
- }
-}
-
-/*
- * flush - make every queued write readable
- */
-void FileStore::flush()
-{
- dout(10) << "flush" << dendl;
-
- if (g_conf->filestore_blackhole) {
- // wait forever
- Mutex lock("FileStore::flush::lock");
- Cond cond;
- lock.Lock();
- while (true)
- cond.Wait(lock);
- assert(0);
- }
-
- if (m_filestore_journal_writeahead) {
- if (journal)
- journal->flush();
- dout(10) << "flush draining ondisk finisher" << dendl;
- for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
- (*it)->wait_for_empty();
- }
- }
-
- _flush_op_queue();
- dout(10) << "flush complete" << dendl;
-}
-
-/*
- * sync_and_flush - make every queued write readable AND committed to disk
- */
-void FileStore::sync_and_flush()
-{
- dout(10) << "sync_and_flush" << dendl;
-
- if (m_filestore_journal_writeahead) {
- if (journal)
- journal->flush();
- _flush_op_queue();
- } else {
- // includes m_filestore_journal_parallel
- _flush_op_queue();
- sync();
- }
- dout(10) << "sync_and_flush done" << dendl;
-}
-
-int FileStore::flush_journal()
-{
- dout(10) << __func__ << dendl;
- sync_and_flush();
- sync();
- return 0;
-}
-
-int FileStore::snapshot(const string& name)
-{
- dout(10) << "snapshot " << name << dendl;
- sync_and_flush();
-
- if (!backend->can_checkpoint()) {
- dout(0) << "snapshot " << name << " failed, not supported" << dendl;
- return -EOPNOTSUPP;
- }
-
- char s[NAME_MAX];
- snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, name.c_str());
-
- int r = backend->create_checkpoint(s, NULL);
- if (r) {
- r = -errno;
- derr << "snapshot " << name << " failed: " << cpp_strerror(r) << dendl;
- }
-
- return r;
-}
-
-// -------------------------------
-// attributes
-
-int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp)
-{
- char val[CHAIN_XATTR_MAX_BLOCK_LEN];
- int l = chain_fgetxattr(fd, name, val, sizeof(val));
- if (l >= 0) {
- bp = buffer::create(l);
- memcpy(bp.c_str(), val, l);
- } else if (l == -ERANGE) {
- l = chain_fgetxattr(fd, name, 0, 0);
- if (l > 0) {
- bp = buffer::create(l);
- l = chain_fgetxattr(fd, name, bp.c_str(), l);
- }
- }
- assert(!m_filestore_fail_eio || l != -EIO);
- return l;
-}
-
-int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset)
-{
- // get attr list
- char names1[100];
- int len = chain_flistxattr(fd, names1, sizeof(names1)-1);
- char *names2 = 0;
- char *name = 0;
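- // -ERANGE means the stack buffer was too small; query the required size
- // and retry with a heap buffer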
- if (len == -ERANGE) {
- len = chain_flistxattr(fd, 0, 0);
- if (len < 0) {
- assert(!m_filestore_fail_eio || len != -EIO);
- return len;
- }
- dout(10) << " -ERANGE, len is " << len << dendl;
- names2 = new char[len+1];
- len = chain_flistxattr(fd, names2, len);
- dout(10) << " -ERANGE, got " << len << dendl;
- if (len < 0) {
- assert(!m_filestore_fail_eio || len != -EIO);
- delete[] names2;
- return len;
- }
- name = names2;
- } else if (len < 0) {
- assert(!m_filestore_fail_eio || len != -EIO);
- return len;
- } else {
- name = names1;
- }
- name[len] = 0;
-
- char *end = name + len;
- while (name < end) {
- char *attrname = name;
- if (parse_attrname(&name)) {
- if (*name) {
- dout(20) << "fgetattrs " << fd << " getting '" << name << "'" << dendl;
- int r = _fgetattr(fd, attrname, aset[name]);
- if (r < 0) {
- delete[] names2;
- return r;
- }
- }
- }
- name += strlen(name) + 1;
- }
-
- delete[] names2;
- return 0;
-}
-
-int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset)
-{
- for (map<string, bufferptr>::iterator p = aset.begin();
- p != aset.end();
- ++p) {
- char n[CHAIN_XATTR_MAX_NAME_LEN];
- get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
- const char *val;
- if (p->second.length())
- val = p->second.c_str();
- else
- val = "";
- // ??? Why do we skip setting all the other attrs if one fails?
- int r = chain_fsetxattr(fd, n, val, p->second.length());
- if (r < 0) {
- derr << "FileStore::_setattrs: chain_setxattr returned " << r << dendl;
- return r;
- }
- }
- return 0;
-}
-
-// debug EIO injection
-void FileStore::inject_data_error(const ghobject_t &oid) {
- Mutex::Locker l(read_error_lock);
- dout(10) << __func__ << ": init error on " << oid << dendl;
- data_error_set.insert(oid);
-}
-void FileStore::inject_mdata_error(const ghobject_t &oid) {
- Mutex::Locker l(read_error_lock);
- dout(10) << __func__ << ": init error on " << oid << dendl;
- mdata_error_set.insert(oid);
-}
-void FileStore::debug_obj_on_delete(const ghobject_t &oid) {
- Mutex::Locker l(read_error_lock);
- dout(10) << __func__ << ": clear error on " << oid << dendl;
- data_error_set.erase(oid);
- mdata_error_set.erase(oid);
-}
-bool FileStore::debug_data_eio(const ghobject_t &oid) {
- Mutex::Locker l(read_error_lock);
- if (data_error_set.count(oid)) {
- dout(10) << __func__ << ": inject error on " << oid << dendl;
- return true;
- } else {
- return false;
- }
-}
-bool FileStore::debug_mdata_eio(const ghobject_t &oid) {
- Mutex::Locker l(read_error_lock);
- if (mdata_error_set.count(oid)) {
- dout(10) << __func__ << ": inject error on " << oid << dendl;
- return true;
- } else {
- return false;
- }
-}
-
-
-// objects
-
-int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp)
-{
- tracepoint(objectstore, getattr_enter, cid.c_str());
- _kludge_temp_object_collection(cid, oid);
- dout(15) << "getattr " << cid << "/" << oid << " '" << name << "'" << dendl;
- FDRef fd;
- int r = lfn_open(cid, oid, false, &fd);
- if (r < 0) {
- goto out;
- }
- char n[CHAIN_XATTR_MAX_NAME_LEN];
- get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
- r = _fgetattr(**fd, n, bp);
- lfn_close(fd);
- if (r == -ENODATA) {
- map<string, bufferlist> got;
- set<string> to_get;
- to_get.insert(string(name));
- Index index;
- r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- goto out;
- }
- r = object_map->get_xattrs(oid, to_get, &got);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " get_xattrs err r =" << r << dendl;
- goto out;
- }
- if (got.empty()) {
- dout(10) << __func__ << " got.size() is 0" << dendl;
- return -ENODATA;
- }
- bp = bufferptr(got.begin()->second.c_str(),
- got.begin()->second.length());
- r = bp.length();
- }
- out:
- dout(10) << "getattr " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- if (g_conf->filestore_debug_inject_read_err &&
- debug_mdata_eio(oid)) {
- return -EIO;
- } else {
- tracepoint(objectstore, getattr_exit, r);
- return r < 0 ? r : 0;
- }
-}
-
-int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset)
-{
- tracepoint(objectstore, getattrs_enter, cid.c_str());
- _kludge_temp_object_collection(cid, oid);
- set<string> omap_attrs;
- map<string, bufferlist> omap_aset;
- Index index;
- dout(15) << "getattrs " << cid << "/" << oid << dendl;
- FDRef fd;
- bool spill_out = true;
- char buf[2];
-
- int r = lfn_open(cid, oid, false, &fd);
- if (r < 0) {
- goto out;
- }
-
- r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
- if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
- spill_out = false;
-
- r = _fgetattrs(**fd, aset);
- if (r < 0) {
- goto out;
- }
- lfn_close(fd);
-
- if (!spill_out) {
- dout(10) << __func__ << " no xattr exists in object_map r = " << r << dendl;
- goto out;
- }
-
- r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- goto out;
- }
- {
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
-
- r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
- if (r == -ENOENT)
- r = 0;
- }
- assert(omap_attrs.size() == omap_aset.size());
- for (map<string, bufferlist>::iterator i = omap_aset.begin();
- i != omap_aset.end();
- ++i) {
- string key(i->first);
- aset.insert(make_pair(key,
- bufferptr(i->second.c_str(), i->second.length())));
- }
- out:
- dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
-
- if (g_conf->filestore_debug_inject_read_err &&
- debug_mdata_eio(oid)) {
- return -EIO;
- } else {
- tracepoint(objectstore, getattrs_exit, r);
- return r;
- }
-}
-
-int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
- const SequencerPosition &spos)
-{
- map<string, bufferlist> omap_set;
- set<string> omap_remove;
- map<string, bufferptr> inline_set;
- map<string, bufferptr> inline_to_set;
- FDRef fd;
- int spill_out = -1;
- bool incomplete_inline = false;
-
- int r = lfn_open(cid, oid, false, &fd);
- if (r < 0) {
- goto out;
- }
-
- char buf[2];
- r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
- if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
- spill_out = 0;
- else
- spill_out = 1;
-
- r = _fgetattrs(**fd, inline_set);
- incomplete_inline = (r == -E2BIG);
- assert(!m_filestore_fail_eio || r != -EIO);
- dout(15) << "setattrs " << cid << "/" << oid
- << (incomplete_inline ? " (incomplete_inline, forcing omap)" : "")
- << dendl;
-
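- // decide, per attr, whether it fits inline as an fs xattr or has to spill
- // out into the omap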
- for (map<string,bufferptr>::iterator p = aset.begin();
- p != aset.end();
- ++p) {
- char n[CHAIN_XATTR_MAX_NAME_LEN];
- get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
-
- if (incomplete_inline) {
- chain_fremovexattr(**fd, n); // ignore any error
- omap_set[p->first].push_back(p->second);
- continue;
- }
-
- if (p->second.length() > m_filestore_max_inline_xattr_size) {
- if (inline_set.count(p->first)) {
- inline_set.erase(p->first);
- r = chain_fremovexattr(**fd, n);
- if (r < 0)
- goto out_close;
- }
- omap_set[p->first].push_back(p->second);
- continue;
- }
-
- if (!inline_set.count(p->first) &&
- inline_set.size() >= m_filestore_max_inline_xattrs) {
- omap_set[p->first].push_back(p->second);
- continue;
- }
- omap_remove.insert(p->first);
- inline_set.insert(*p);
-
- inline_to_set.insert(*p);
- }
-
- if (spill_out != 1 && !omap_set.empty()) {
- chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
- sizeof(XATTR_SPILL_OUT));
- }
-
- r = _fsetattrs(**fd, inline_to_set);
- if (r < 0)
- goto out_close;
-
- if (spill_out && !omap_remove.empty()) {
- r = object_map->remove_xattrs(oid, omap_remove, &spos);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- goto out_close;
- } else {
- r = 0; // don't confuse the debug output
- }
- }
-
- if (!omap_set.empty()) {
- r = object_map->set_xattrs(oid, omap_set, &spos);
- if (r < 0) {
- dout(10) << __func__ << " could not set_xattrs r = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- goto out_close;
- }
- }
- out_close:
- lfn_close(fd);
- out:
- dout(10) << "setattrs " << cid << "/" << oid << " = " << r << dendl;
- return r;
-}
-
-
-int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
- const SequencerPosition &spos)
-{
- dout(15) << "rmattr " << cid << "/" << oid << " '" << name << "'" << dendl;
- FDRef fd;
- bool spill_out = true;
- bufferptr bp;
-
- int r = lfn_open(cid, oid, false, &fd);
- if (r < 0) {
- goto out;
- }
-
- char buf[2];
- r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
- if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
- spill_out = false;
- }
-
- char n[CHAIN_XATTR_MAX_NAME_LEN];
- get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
- r = chain_fremovexattr(**fd, n);
- if (r == -ENODATA && spill_out) {
- Index index;
- r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- goto out_close;
- }
- set<string> to_remove;
- to_remove.insert(string(name));
- r = object_map->remove_xattrs(oid, to_remove, &spos);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not remove_xattrs index r = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- goto out_close;
- }
- }
- out_close:
- lfn_close(fd);
- out:
- dout(10) << "rmattr " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
- return r;
-}
-
-int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
- const SequencerPosition &spos)
-{
- dout(15) << "rmattrs " << cid << "/" << oid << dendl;
-
- map<string,bufferptr> aset;
- FDRef fd;
- set<string> omap_attrs;
- Index index;
- bool spill_out = true;
-
- int r = lfn_open(cid, oid, false, &fd);
- if (r < 0) {
- goto out;
- }
-
- char buf[2];
- r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
- if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
- spill_out = false;
- }
-
- r = _fgetattrs(**fd, aset);
- if (r >= 0) {
- for (map<string,bufferptr>::iterator p = aset.begin(); p != aset.end(); ++p) {
- char n[CHAIN_XATTR_MAX_NAME_LEN];
- get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
- r = chain_fremovexattr(**fd, n);
- if (r < 0)
- break;
- }
- }
-
- if (!spill_out) {
- dout(10) << __func__ << " no xattr exists in object_map r = " << r << dendl;
- goto out_close;
- }
-
- r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- goto out_close;
- }
- {
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- goto out_close;
- }
- r = object_map->remove_xattrs(oid, omap_attrs, &spos);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
- goto out_close;
- }
- if (r == -ENOENT)
- r = 0;
- chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
- sizeof(XATTR_NO_SPILL_OUT));
- }
-
- out_close:
- lfn_close(fd);
- out:
- dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl;
- return r;
-}
-
-
-
-// collections
-
-int FileStore::collection_getattr(coll_t c, const char *name,
- void *value, size_t size)
-{
- char fn[PATH_MAX];
- get_cdir(c, fn, sizeof(fn));
- dout(15) << "collection_getattr " << fn << " '" << name << "' len " << size << dendl;
- int r;
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- r = -errno;
- goto out;
- }
- char n[PATH_MAX];
- get_attrname(name, n, PATH_MAX);
- r = chain_fgetxattr(fd, n, value, size);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- out:
- dout(10) << "collection_getattr " << fn << " '" << name << "' len " << size << " = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
-}
-
-int FileStore::collection_getattr(coll_t c, const char *name, bufferlist& bl)
-{
- char fn[PATH_MAX];
- get_cdir(c, fn, sizeof(fn));
- dout(15) << "collection_getattr " << fn << " '" << name << "'" << dendl;
- char n[PATH_MAX];
- get_attrname(name, n, PATH_MAX);
- buffer::ptr bp;
- int r;
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- r = -errno;
- goto out;
- }
- r = _fgetattr(fd, n, bp);
- bl.push_back(bp);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- out:
- dout(10) << "collection_getattr " << fn << " '" << name << "' = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
-}
-
-int FileStore::collection_getattrs(coll_t cid, map<string,bufferptr>& aset)
-{
- char fn[PATH_MAX];
- get_cdir(cid, fn, sizeof(fn));
- dout(10) << "collection_getattrs " << fn << dendl;
- int r = 0;
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- r = -errno;
- goto out;
- }
- r = _fgetattrs(fd, aset);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- out:
- dout(10) << "collection_getattrs " << fn << " = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
-}
-
-
-int FileStore::_collection_setattr(coll_t c, const char *name,
- const void *value, size_t size)
-{
- char fn[PATH_MAX];
- get_cdir(c, fn, sizeof(fn));
- dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << dendl;
- char n[PATH_MAX];
- int r;
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- r = -errno;
- goto out;
- }
- get_attrname(name, n, PATH_MAX);
- r = chain_fsetxattr(fd, n, value, size);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- out:
- dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << " = " << r << dendl;
- return r;
-}
-
-int FileStore::_collection_rmattr(coll_t c, const char *name)
-{
- char fn[PATH_MAX];
- get_cdir(c, fn, sizeof(fn));
- dout(15) << "collection_rmattr " << fn << dendl;
- char n[PATH_MAX];
- get_attrname(name, n, PATH_MAX);
- int r;
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- r = -errno;
- goto out;
- }
- r = chain_fremovexattr(fd, n);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- out:
- dout(10) << "collection_rmattr " << fn << " = " << r << dendl;
- return r;
-}
-
-
-int FileStore::_collection_setattrs(coll_t cid, map<string,bufferptr>& aset)
-{
- char fn[PATH_MAX];
- get_cdir(cid, fn, sizeof(fn));
- dout(15) << "collection_setattrs " << fn << dendl;
- int r = 0;
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- r = -errno;
- goto out;
- }
- for (map<string,bufferptr>::iterator p = aset.begin();
- p != aset.end();
- ++p) {
- char n[PATH_MAX];
- get_attrname(p->first.c_str(), n, PATH_MAX);
- r = chain_fsetxattr(fd, n, p->second.c_str(), p->second.length());
- if (r < 0)
- break;
- }
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- out:
- dout(10) << "collection_setattrs " << fn << " = " << r << dendl;
- return r;
-}
-
-int FileStore::_collection_remove_recursive(const coll_t &cid,
- const SequencerPosition &spos)
-{
- struct stat st;
- int r = collection_stat(cid, &st);
- if (r < 0) {
- if (r == -ENOENT)
- return 0;
- return r;
- }
-
- vector<ghobject_t> objects;
- ghobject_t max;
- while (!max.is_max()) {
- r = collection_list(cid, max, ghobject_t::get_max(), true,
- 300, &objects, &max);
- if (r < 0)
- return r;
- for (vector<ghobject_t>::iterator i = objects.begin();
- i != objects.end();
- ++i) {
- assert(_check_replay_guard(cid, *i, spos));
- r = _remove(cid, *i, spos);
- if (r < 0)
- return r;
- }
- }
- return _destroy_collection(cid);
-}
-
-// --------------------------
-// collections
-
-int FileStore::collection_version_current(coll_t c, uint32_t *version)
-{
- Index index;
- int r = get_index(c, &index);
- if (r < 0)
- return r;
-
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
-
- *version = index->collection_version();
- if (*version == target_version)
- return 1;
- else
- return 0;
-}
-
-int FileStore::list_collections(vector<coll_t>& ls)
-{
- return list_collections(ls, false);
-}
-
-int FileStore::list_collections(vector<coll_t>& ls, bool include_temp)
-{
- tracepoint(objectstore, list_collections_enter);
- dout(10) << "list_collections" << dendl;
-
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/current", basedir.c_str());
-
- int r = 0;
- DIR *dir = ::opendir(fn);
- if (!dir) {
- r = -errno;
- derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
-
- char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
- struct dirent *de;
- while ((r = ::readdir_r(dir, (struct dirent *)&buf, &de)) == 0) {
- if (!de)
- break;
- if (de->d_type == DT_UNKNOWN) {
- // d_type not supported (non-ext[234], btrfs), must stat
- struct stat sb;
- char filename[PATH_MAX];
- snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name);
-
- r = ::stat(filename, &sb);
- if (r < 0) {
- r = -errno;
- derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- break;
- }
- if (!S_ISDIR(sb.st_mode)) {
- continue;
- }
- } else if (de->d_type != DT_DIR) {
- continue;
- }
- if (strcmp(de->d_name, "omap") == 0) {
- continue;
- }
- if (de->d_name[0] == '.' &&
- (de->d_name[1] == '\0' ||
- (de->d_name[1] == '.' &&
- de->d_name[2] == '\0')))
- continue;
- coll_t cid;
- if (!cid.parse(de->d_name)) {
- derr << "ignoging invalid collection '" << de->d_name << "'" << dendl;
- continue;
- }
- if (!cid.is_temp() || include_temp)
- ls.push_back(cid);
- }
-
- if (r > 0) {
- derr << "trying readdir_r " << fn << ": " << cpp_strerror(r) << dendl;
- r = -r;
- }
-
- ::closedir(dir);
- assert(!m_filestore_fail_eio || r != -EIO);
- tracepoint(objectstore, list_collections_exit, r);
- return r;
-}
-
-int FileStore::collection_stat(coll_t c, struct stat *st)
-{
- tracepoint(objectstore, collection_stat_enter, c.c_str());
- char fn[PATH_MAX];
- get_cdir(c, fn, sizeof(fn));
- dout(15) << "collection_stat " << fn << dendl;
- int r = ::stat(fn, st);
- if (r < 0)
- r = -errno;
- dout(10) << "collection_stat " << fn << " = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- tracepoint(objectstore, collection_stat_exit, r);
- return r;
-}
-
-bool FileStore::collection_exists(coll_t c)
-{
- tracepoint(objectstore, collection_exists_enter, c.c_str());
- struct stat st;
- bool ret = collection_stat(c, &st) == 0;
- tracepoint(objectstore, collection_exists_exit, ret);
- return ret;
-}
-
-bool FileStore::collection_empty(coll_t c)
-{
- tracepoint(objectstore, collection_empty_enter, c.c_str());
- dout(15) << "collection_empty " << c << dendl;
- Index index;
- int r = get_index(c, &index);
- if (r < 0)
- return false;
-
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
-
- vector<ghobject_t> ls;
- r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(), true,
- 1, &ls, NULL);
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return false;
- }
- bool ret = ls.empty();
- tracepoint(objectstore, collection_empty_exit, ret);
- return ret;
-}
-int FileStore::collection_list(coll_t c, ghobject_t start, ghobject_t end,
- bool sort_bitwise, int max,
- vector<ghobject_t> *ls, ghobject_t *next)
-{
- if (start.is_max())
- return 0;
-
- ghobject_t temp_next;
- if (!next)
- next = &temp_next;
- // figure out the pool id. we need this in order to generate a
- // meaningful 'next' value.
- int64_t pool = -1;
- shard_id_t shard;
- {
- spg_t pgid;
- if (c.is_temp(&pgid)) {
- pool = -2 - pgid.pool();
- shard = pgid.shard;
- } else if (c.is_pg(&pgid)) {
- pool = pgid.pool();
- shard = pgid.shard;
- } else if (c.is_meta()) {
- pool = -1;
- shard = shard_id_t::NO_SHARD;
- } else {
- // hrm, the caller is test code! we should kill it off. for now,
- // tolerate it.
- pool = 0;
- shard = shard_id_t::NO_SHARD;
- }
- dout(20) << __func__ << " pool is " << pool << " shard is " << shard
- << " pgid " << pgid << dendl;
- }
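- // objects in the parallel temp collection sort first (negative pool id),
- // so list the temp collection before the main one when the start bound
- // allows it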
- ghobject_t sep;
- sep.hobj.pool = -1;
- sep.set_shard(shard);
- if (!c.is_temp() && !c.is_meta()) {
- if (cmp_bitwise(start, sep) < 0) { // bitwise vs nibble doesn't matter here
- dout(10) << __func__ << " first checking temp pool" << dendl;
- coll_t temp = c.get_temp();
- int r = collection_list(temp, start, end, sort_bitwise, max, ls, next);
- if (r < 0)
- return r;
- if (*next != ghobject_t::get_max())
- return r;
- start = sep;
- dout(10) << __func__ << " fall through to non-temp collection, start "
- << start << dendl;
- } else {
- dout(10) << __func__ << " start " << start << " >= sep " << sep << dendl;
- }
- }
-
- Index index;
- int r = get_index(c, &index);
- if (r < 0)
- return r;
-
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
-
- r = index->collection_list_partial(start, end, sort_bitwise, max, ls, next);
-
- if (r < 0) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- dout(20) << "objects: " << ls << dendl;
-
- // HashIndex doesn't know the pool when constructing a 'next' value
- if (next && !next->is_max()) {
- next->hobj.pool = pool;
- next->set_shard(shard);
- dout(20) << " next " << *next << dendl;
- }
-
- return 0;
-}
-
-int FileStore::omap_get(coll_t c, const ghobject_t &hoid,
- bufferlist *header,
- map<string, bufferlist> *out)
-{
- tracepoint(objectstore, omap_get_enter, c.c_str());
- _kludge_temp_object_collection(c, hoid);
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- Index index;
- int r = get_index(c, &index);
- if (r < 0)
- return r;
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0)
- return r;
- }
- r = object_map->get(hoid, header, out);
- if (r < 0 && r != -ENOENT) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- tracepoint(objectstore, omap_get_exit, 0);
- return 0;
-}
-
-int FileStore::omap_get_header(
- coll_t c,
- const ghobject_t &hoid,
- bufferlist *bl,
- bool allow_eio)
-{
- tracepoint(objectstore, omap_get_header_enter, c.c_str());
- _kludge_temp_object_collection(c, hoid);
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- Index index;
- int r = get_index(c, &index);
- if (r < 0)
- return r;
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0)
- return r;
- }
- r = object_map->get_header(hoid, bl);
- if (r < 0 && r != -ENOENT) {
- assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
- return r;
- }
- tracepoint(objectstore, omap_get_header_exit, 0);
- return 0;
-}
-
-int FileStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
-{
- tracepoint(objectstore, omap_get_keys_enter, c.c_str());
- _kludge_temp_object_collection(c, hoid);
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- Index index;
- int r = get_index(c, &index);
- if (r < 0)
- return r;
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0)
- return r;
- }
- r = object_map->get_keys(hoid, keys);
- if (r < 0 && r != -ENOENT) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- tracepoint(objectstore, omap_get_keys_exit, 0);
- return 0;
-}
-
-int FileStore::omap_get_values(coll_t c, const ghobject_t &hoid,
- const set<string> &keys,
- map<string, bufferlist> *out)
-{
- tracepoint(objectstore, omap_get_values_enter, c.c_str());
- _kludge_temp_object_collection(c, hoid);
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- Index index;
- const char *where = 0;
- int r = get_index(c, &index);
- if (r < 0) {
- where = " (get_index)";
- goto out;
- }
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0) {
- where = " (lfn_find)";
- goto out;
- }
- }
- r = object_map->get_values(hoid, keys, out);
- if (r < 0 && r != -ENOENT) {
- assert(!m_filestore_fail_eio || r != -EIO);
- goto out;
- }
- r = 0;
- out:
- tracepoint(objectstore, omap_get_values_exit, r);
- dout(15) << __func__ << " " << c << "/" << hoid << " = " << r
- << where << dendl;
- return r;
-}
-
-int FileStore::omap_check_keys(coll_t c, const ghobject_t &hoid,
- const set<string> &keys,
- set<string> *out)
-{
- tracepoint(objectstore, omap_check_keys_enter, c.c_str());
- _kludge_temp_object_collection(c, hoid);
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
-
- Index index;
- int r = get_index(c, &index);
- if (r < 0)
- return r;
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0)
- return r;
- }
- r = object_map->check_keys(hoid, keys, out);
- if (r < 0 && r != -ENOENT) {
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- tracepoint(objectstore, omap_check_keys_exit, 0);
- return 0;
-}
-
-ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c,
- const ghobject_t &hoid)
-{
- tracepoint(objectstore, get_omap_iterator, c.c_str());
- _kludge_temp_object_collection(c, hoid);
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- Index index;
- int r = get_index(c, &index);
- if (r < 0) {
- dout(10) << __func__ << " " << c << "/" << hoid << " = 0 "
- << "(get_index failed with " << cpp_strerror(r) << ")" << dendl;
- return ObjectMap::ObjectMapIterator();
- }
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0) {
- dout(10) << __func__ << " " << c << "/" << hoid << " = 0 "
- << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl;
- return ObjectMap::ObjectMapIterator();
- }
- }
- return object_map->get_iterator(hoid);
-}
-
-int FileStore::_collection_hint_expected_num_objs(coll_t c, uint32_t pg_num,
- uint64_t expected_num_objs,
- const SequencerPosition &spos)
-{
- dout(15) << __func__ << " collection: " << c << " pg number: "
- << pg_num << " expected number of objects: " << expected_num_objs << dendl;
-
- if (!collection_empty(c) && !replaying) {
- dout(0) << "Failed to give an expected number of objects hint to collection : "
- << c << ", only empty collection can take such type of hint. " << dendl;
- return 0;
- }
-
- int ret;
- Index index;
- ret = get_index(c, &index);
- if (ret < 0)
- return ret;
- // Pre-hash the collection
- ret = index->pre_hash_collection(pg_num, expected_num_objs);
- dout(10) << "pre_hash_collection " << c << " = " << ret << dendl;
- if (ret < 0)
- return ret;
- _set_replay_guard(c, spos);
-
- return 0;
-}
-
-int FileStore::_create_collection(
- coll_t c,
- const SequencerPosition &spos)
-{
- char fn[PATH_MAX];
- get_cdir(c, fn, sizeof(fn));
- dout(15) << "create_collection " << fn << dendl;
- int r = ::mkdir(fn, 0755);
- if (r < 0)
- r = -errno;
- if (r == -EEXIST && replaying)
- r = 0;
- dout(10) << "create_collection " << fn << " = " << r << dendl;
-
- if (r < 0)
- return r;
- r = init_index(c);
- if (r < 0)
- return r;
-
- // create parallel temp collection, too
- if (!c.is_meta() && !c.is_temp()) {
- coll_t temp = c.get_temp();
- r = _create_collection(temp, spos);
- if (r < 0)
- return r;
- }
-
- _set_replay_guard(c, spos);
- return 0;
-}
-
-int FileStore::_destroy_collection(coll_t c)
-{
- int r = 0;
- char fn[PATH_MAX];
- get_cdir(c, fn, sizeof(fn));
- dout(15) << "_destroy_collection " << fn << dendl;
- {
- Index from;
- r = get_index(c, &from);
- if (r < 0)
- goto out;
- assert(NULL != from.index);
- RWLock::WLocker l((from.index)->access_lock);
-
- r = from->prep_delete();
- if (r < 0)
- goto out;
- }
- r = ::rmdir(fn);
- if (r < 0) {
- r = -errno;
- goto out;
- }
-
- out:
- // destroy parallel temp collection, too
- if (!c.is_meta() && !c.is_temp()) {
- coll_t temp = c.get_temp();
- int r2 = _destroy_collection(temp);
- if (r2 < 0) {
- r = r2;
- goto out_final;
- }
- }
-
- out_final:
- dout(10) << "_destroy_collection " << fn << " = " << r << dendl;
- return r;
-}
-
-
-int FileStore::_collection_add(coll_t c, coll_t oldcid, const ghobject_t& o,
- const SequencerPosition& spos)
-{
- dout(15) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << dendl;
-
- int dstcmp = _check_replay_guard(c, o, spos);
- if (dstcmp < 0)
- return 0;
-
- // check the src name too; it might have a newer guard, and we don't
- // want to clobber it
- int srccmp = _check_replay_guard(oldcid, o, spos);
- if (srccmp < 0)
- return 0;
-
- // open a guard on the object so we don't replay any previous operations on
- // the new name that would modify the source inode.
- FDRef fd;
- int r = lfn_open(oldcid, o, 0, &fd);
- if (r < 0) {
- // the source collection/object does not exist. If we are replaying, we
- // should be safe, so just return 0 and move on.
- assert(replaying);
- dout(10) << "collection_add " << c << "/" << o << " from "
- << oldcid << "/" << o << " (dne, continue replay) " << dendl;
- return 0;
- }
- if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
- _set_replay_guard(**fd, spos, &o, true);
- }
-
- r = lfn_link(oldcid, c, o, o);
- if (replaying && !backend->can_checkpoint() &&
- r == -EEXIST) // crashed between link() and set_replay_guard()
- r = 0;
-
- _inject_failure();
-
- // close guard on object so we don't do this again
- if (r == 0) {
- _close_replay_guard(**fd, spos);
- }
- lfn_close(fd);
-
- dout(10) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl;
- return r;
-}
-
-int FileStore::_collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
- coll_t c, const ghobject_t& o,
- const SequencerPosition& spos)
-{
- dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
- int r = 0;
- int dstcmp, srccmp;
-
- if (replaying) {
- /* If the destination collection doesn't exist during replay,
- * we need to delete the src object and continue on
- */
- if (!collection_exists(c))
- goto out_rm_src;
- }
-
- dstcmp = _check_replay_guard(c, o, spos);
- if (dstcmp < 0)
- goto out_rm_src;
-
- // check the src name too; it might have a newer guard, and we don't
- // want to clobber it
- srccmp = _check_replay_guard(oldcid, oldoid, spos);
- if (srccmp < 0)
- return 0;
-
- {
- // open a guard on the object so we don't replay any previous operations on
- // the new name that would modify the source inode.
- FDRef fd;
- r = lfn_open(oldcid, oldoid, 0, &fd);
- if (r < 0) {
- // the source collection/object does not exist. If we are replaying, we
- // should be safe, so just return 0 and move on.
- assert(replaying);
- dout(10) << __func__ << " " << c << "/" << o << " from "
- << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl;
- return 0;
- }
- if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
- _set_replay_guard(**fd, spos, &o, true);
- }
-
- r = lfn_link(oldcid, c, oldoid, o);
- if (replaying && !backend->can_checkpoint() &&
- r == -EEXIST) // crashed between link() and set_replay_guard()
- r = 0;
-
- _inject_failure();
-
- if (r == 0) {
- // the name changed; clone the omap content over to the new name
- r = object_map->clone(oldoid, o, &spos);
- if (r == -ENOENT)
- r = 0;
- }
-
- _inject_failure();
-
- lfn_close(fd);
- fd = FDRef();
-
- if (r == 0)
- r = lfn_unlink(oldcid, oldoid, spos, true);
-
- if (r == 0)
- r = lfn_open(c, o, 0, &fd);
-
- // close guard on object so we don't do this again
- if (r == 0)
- _close_replay_guard(**fd, spos);
-
- lfn_close(fd);
- }
-
- dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid
- << " = " << r << dendl;
- return r;
-
- out_rm_src:
- // remove source
- if (_check_replay_guard(oldcid, oldoid, spos) > 0) {
- r = lfn_unlink(oldcid, oldoid, spos, true);
- }
-
- dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid
- << " = " << r << dendl;
- return r;
-}
-
-void FileStore::_inject_failure()
-{
- if (m_filestore_kill_at.read()) {
- int final = m_filestore_kill_at.dec();
- dout(5) << "_inject_failure " << (final+1) << " -> " << final << dendl;
- if (final == 0) {
- derr << "_inject_failure KILLING" << dendl;
- g_ceph_context->_log->flush();
- _exit(1);
- }
- }
-}
-
-int FileStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
- const SequencerPosition &spos) {
- dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- Index index;
- int r = get_index(cid, &index);
- if (r < 0)
- return r;
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0)
- return r;
- }
- r = object_map->clear_keys_header(hoid, &spos);
- if (r < 0 && r != -ENOENT)
- return r;
- return 0;
-}
-
-int FileStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid,
- const map<string, bufferlist> &aset,
- const SequencerPosition &spos) {
- dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- Index index;
- int r;
- // treat pgmeta as a logical object; skip the existence check
- if (hoid.is_pgmeta())
- goto skip;
-
- r = get_index(cid, &index);
- if (r < 0) {
- dout(20) << __func__ << " get_index got " << cpp_strerror(r) << dendl;
- return r;
- }
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0) {
- dout(20) << __func__ << " lfn_find got " << cpp_strerror(r) << dendl;
- return r;
- }
- }
-skip:
- r = object_map->set_keys(hoid, aset, &spos);
- dout(20) << __func__ << " " << cid << "/" << hoid << " = " << r << dendl;
- return r;
-}
-
-int FileStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid,
- const set<string> &keys,
- const SequencerPosition &spos) {
- dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- Index index;
- int r;
- // treat pgmeta as a logical object; skip the existence check
- if (hoid.is_pgmeta())
- goto skip;
-
- r = get_index(cid, &index);
- if (r < 0)
- return r;
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0)
- return r;
- }
-skip:
- r = object_map->rm_keys(hoid, keys, &spos);
- if (r < 0 && r != -ENOENT)
- return r;
- return 0;
-}
-
-int FileStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid,
- const string& first, const string& last,
- const SequencerPosition &spos) {
- dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl;
- set<string> keys;
- {
- ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid);
- if (!iter)
- return -ENOENT;
- for (iter->lower_bound(first); iter->valid() && iter->key() < last;
- iter->next()) {
- keys.insert(iter->key());
- }
- }
- return _omap_rmkeys(cid, hoid, keys, spos);
-}
-
-int FileStore::_omap_setheader(coll_t cid, const ghobject_t &hoid,
- const bufferlist &bl,
- const SequencerPosition &spos)
-{
- dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- Index index;
- int r = get_index(cid, &index);
- if (r < 0)
- return r;
- {
- assert(NULL != index.index);
- RWLock::RLocker l((index.index)->access_lock);
- r = lfn_find(hoid, index);
- if (r < 0)
- return r;
- }
- return object_map->set_header(hoid, bl, &spos);
-}
-
-int FileStore::_split_collection(coll_t cid,
- uint32_t bits,
- uint32_t rem,
- coll_t dest,
- const SequencerPosition &spos)
-{
- int r;
- {
- dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
- if (!collection_exists(cid)) {
- dout(2) << __func__ << ": " << cid << " DNE" << dendl;
- assert(replaying);
- return 0;
- }
- if (!collection_exists(dest)) {
- dout(2) << __func__ << ": " << dest << " DNE" << dendl;
- assert(replaying);
- return 0;
- }
-
- int dstcmp = _check_replay_guard(dest, spos);
- if (dstcmp < 0)
- return 0;
-
- int srccmp = _check_replay_guard(cid, spos);
- if (srccmp < 0)
- return 0;
-
- _set_global_replay_guard(cid, spos);
- _set_replay_guard(cid, spos, true);
- _set_replay_guard(dest, spos, true);
-
- Index from;
- r = get_index(cid, &from);
-
- Index to;
- if (!r)
- r = get_index(dest, &to);
-
- if (!r) {
- assert(NULL != from.index);
- RWLock::WLocker l1((from.index)->access_lock);
-
- assert(NULL != to.index);
- RWLock::WLocker l2((to.index)->access_lock);
-
- r = from->split(rem, bits, to.index);
- }
-
- _close_replay_guard(cid, spos);
- _close_replay_guard(dest, spos);
- }
- if (g_conf->filestore_debug_verify_split) {
- vector<ghobject_t> objects;
- ghobject_t next;
- while (1) {
- collection_list(
- cid,
- next, ghobject_t::get_max(),
- true,
- get_ideal_list_max(),
- &objects,
- &next);
- if (objects.empty())
- break;
- for (vector<ghobject_t>::iterator i = objects.begin();
- i != objects.end();
- ++i) {
- dout(20) << __func__ << ": " << *i << " still in source "
- << cid << dendl;
- assert(!i->match(bits, rem));
- }
- objects.clear();
- }
- next = ghobject_t();
- while (1) {
- collection_list(
- dest,
- next, ghobject_t::get_max(),
- true,
- get_ideal_list_max(),
- &objects,
- &next);
- if (objects.empty())
- break;
- for (vector<ghobject_t>::iterator i = objects.begin();
- i != objects.end();
- ++i) {
- dout(20) << __func__ << ": " << *i << " now in dest "
- << dest << dendl;
- assert(i->match(bits, rem));
- }
- objects.clear();
- }
- }
- return r;
-}
-
-int FileStore::_set_alloc_hint(coll_t cid, const ghobject_t& oid,
- uint64_t expected_object_size,
- uint64_t expected_write_size)
-{
- dout(15) << "set_alloc_hint " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl;
-
- FDRef fd;
- int ret;
-
- ret = lfn_open(cid, oid, false, &fd);
- if (ret < 0)
- goto out;
-
- {
- // TODO: a more elaborate hint calculation
- uint64_t hint = MIN(expected_write_size, m_filestore_max_alloc_hint_size);
-
- ret = backend->set_alloc_hint(**fd, hint);
- dout(20) << "set_alloc_hint hint " << hint << " ret " << ret << dendl;
- }
-
- lfn_close(fd);
-out:
- dout(10) << "set_alloc_hint " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl;
- assert(!m_filestore_fail_eio || ret != -EIO);
- return ret;
-}
-
-const char** FileStore::get_tracked_conf_keys() const
-{
- static const char* KEYS[] = {
- "filestore_min_sync_interval",
- "filestore_max_sync_interval",
- "filestore_queue_max_ops",
- "filestore_queue_max_bytes",
- "filestore_queue_committing_max_ops",
- "filestore_queue_committing_max_bytes",
- "filestore_commit_timeout",
- "filestore_dump_file",
- "filestore_kill_at",
- "filestore_fail_eio",
- "filestore_fadvise",
- "filestore_sloppy_crc",
- "filestore_sloppy_crc_block_size",
- "filestore_max_alloc_hint_size",
- NULL
- };
- return KEYS;
-}
-
-void FileStore::handle_conf_change(const struct md_config_t *conf,
- const std::set <std::string> &changed)
-{
- if (changed.count("filestore_max_inline_xattr_size") ||
- changed.count("filestore_max_inline_xattr_size_xfs") ||
- changed.count("filestore_max_inline_xattr_size_btrfs") ||
- changed.count("filestore_max_inline_xattr_size_other") ||
- changed.count("filestore_max_inline_xattrs") ||
- changed.count("filestore_max_inline_xattrs_xfs") ||
- changed.count("filestore_max_inline_xattrs_btrfs") ||
- changed.count("filestore_max_inline_xattrs_other")) {
- Mutex::Locker l(lock);
- set_xattr_limits_via_conf();
- }
- if (changed.count("filestore_min_sync_interval") ||
- changed.count("filestore_max_sync_interval") ||
- changed.count("filestore_queue_max_ops") ||
- changed.count("filestore_queue_max_bytes") ||
- changed.count("filestore_queue_committing_max_ops") ||
- changed.count("filestore_queue_committing_max_bytes") ||
- changed.count("filestore_kill_at") ||
- changed.count("filestore_fail_eio") ||
- changed.count("filestore_sloppy_crc") ||
- changed.count("filestore_sloppy_crc_block_size") ||
- changed.count("filestore_max_alloc_hint_size") ||
- changed.count("filestore_fadvise")) {
- Mutex::Locker l(lock);
- m_filestore_min_sync_interval = conf->filestore_min_sync_interval;
- m_filestore_max_sync_interval = conf->filestore_max_sync_interval;
- m_filestore_queue_max_ops = conf->filestore_queue_max_ops;
- m_filestore_queue_max_bytes = conf->filestore_queue_max_bytes;
- m_filestore_queue_committing_max_ops = conf->filestore_queue_committing_max_ops;
- m_filestore_queue_committing_max_bytes = conf->filestore_queue_committing_max_bytes;
- m_filestore_kill_at.set(conf->filestore_kill_at);
- m_filestore_fail_eio = conf->filestore_fail_eio;
- m_filestore_fadvise = conf->filestore_fadvise;
- m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
- m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
- m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size;
- throttle_ops.reset_max(conf->filestore_queue_max_ops);
- throttle_bytes.reset_max(conf->filestore_queue_max_bytes);
- }
- if (changed.count("filestore_commit_timeout")) {
- Mutex::Locker l(sync_entry_timeo_lock);
- m_filestore_commit_timeout = conf->filestore_commit_timeout;
- }
- if (changed.count("filestore_dump_file")) {
- if (conf->filestore_dump_file.length() &&
- conf->filestore_dump_file != "-") {
- dump_start(conf->filestore_dump_file);
- } else {
- dump_stop();
- }
- }
-}
-
-void FileStore::dump_start(const std::string& file)
-{
- dout(10) << "dump_start " << file << dendl;
- if (m_filestore_do_dump) {
- dump_stop();
- }
- m_filestore_dump_fmt.reset();
- m_filestore_dump_fmt.open_array_section("dump");
- m_filestore_dump.open(file.c_str());
- m_filestore_do_dump = true;
-}
-
-void FileStore::dump_stop()
-{
- dout(10) << "dump_stop" << dendl;
- m_filestore_do_dump = false;
- if (m_filestore_dump.is_open()) {
- m_filestore_dump_fmt.close_section();
- m_filestore_dump_fmt.flush(m_filestore_dump);
- m_filestore_dump.flush();
- m_filestore_dump.close();
- }
-}
-
-void FileStore::dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t seq, OpSequencer *osr)
-{
- m_filestore_dump_fmt.open_array_section("transactions");
- unsigned trans_num = 0;
- for (list<ObjectStore::Transaction*>::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) {
- m_filestore_dump_fmt.open_object_section("transaction");
- m_filestore_dump_fmt.dump_string("osr", osr->get_name());
- m_filestore_dump_fmt.dump_unsigned("seq", seq);
- m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num);
- (*i)->dump(&m_filestore_dump_fmt);
- m_filestore_dump_fmt.close_section();
- }
- m_filestore_dump_fmt.close_section();
- m_filestore_dump_fmt.flush(m_filestore_dump);
- m_filestore_dump.flush();
-}
-
-void FileStore::set_xattr_limits_via_conf()
-{
- uint32_t fs_xattr_size;
- uint32_t fs_xattrs;
-
- switch (m_fs_type) {
-#if defined(__linux__)
- case XFS_SUPER_MAGIC:
- fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs;
- fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs;
- break;
- case BTRFS_SUPER_MAGIC:
- fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs;
- fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs;
- break;
-#endif
- default:
- fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other;
- fs_xattrs = g_conf->filestore_max_inline_xattrs_other;
- break;
- }
-
- // Use override value if set
- if (g_conf->filestore_max_inline_xattr_size)
- m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size;
- else
- m_filestore_max_inline_xattr_size = fs_xattr_size;
-
- // Use override value if set
- if (g_conf->filestore_max_inline_xattrs)
- m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs;
- else
- m_filestore_max_inline_xattrs = fs_xattrs;
-}
-
-// -- FSSuperblock --
-
-void FSSuperblock::encode(bufferlist &bl) const
-{
- ENCODE_START(2, 1, bl);
- compat_features.encode(bl);
- ::encode(omap_backend, bl);
- ENCODE_FINISH(bl);
-}
-
-void FSSuperblock::decode(bufferlist::iterator &bl)
-{
- DECODE_START(2, bl);
- compat_features.decode(bl);
- if (struct_v >= 2)
- ::decode(omap_backend, bl);
- else
- omap_backend = "leveldb";
- DECODE_FINISH(bl);
-}
-
-void FSSuperblock::dump(Formatter *f) const
-{
- f->open_object_section("compat");
- compat_features.dump(f);
- f->dump_string("omap_backend", omap_backend);
- f->close_section();
-}
-
-void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o)
-{
- FSSuperblock z;
- o.push_back(new FSSuperblock(z));
- CompatSet::FeatureSet feature_compat;
- CompatSet::FeatureSet feature_ro_compat;
- CompatSet::FeatureSet feature_incompat;
- feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
- z.compat_features = CompatSet(feature_compat, feature_ro_compat,
- feature_incompat);
- o.push_back(new FSSuperblock(z));
- z.omap_backend = "rocksdb";
- o.push_back(new FSSuperblock(z));
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_FILESTORE_H
-#define CEPH_FILESTORE_H
-
-#include "include/types.h"
-
-#include <map>
-#include <deque>
-#include <boost/scoped_ptr.hpp>
-#include <fstream>
-using namespace std;
-
-#include "include/unordered_map.h"
-
-#include "include/assert.h"
-
-#include "ObjectStore.h"
-#include "JournalingObjectStore.h"
-
-#include "common/Timer.h"
-#include "common/WorkQueue.h"
-
-#include "common/Mutex.h"
-#include "HashIndex.h"
-#include "IndexManager.h"
-#include "ObjectMap.h"
-#include "SequencerPosition.h"
-#include "FDCache.h"
-#include "WBThrottle.h"
-
-#include "include/uuid.h"
-
-
-// from include/linux/falloc.h:
-#ifndef FALLOC_FL_PUNCH_HOLE
-# define FALLOC_FL_PUNCH_HOLE 0x2
-#endif
-
-#if defined(__linux__)
-# ifndef BTRFS_SUPER_MAGIC
-#define BTRFS_SUPER_MAGIC 0x9123683E
-# endif
-# ifndef XFS_SUPER_MAGIC
-#define XFS_SUPER_MAGIC 0x58465342
-# endif
-# ifndef ZFS_SUPER_MAGIC
-#define ZFS_SUPER_MAGIC 0x2fc12fc1
-# endif
-#endif
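-// (these magic numbers match the f_type values reported by statfs(2) on Linux)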
-
-
-class FileStoreBackend;
-
-#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
-
-class FSSuperblock {
-public:
- CompatSet compat_features;
- string omap_backend;
-
- FSSuperblock() { }
-
- void encode(bufferlist &bl) const;
- void decode(bufferlist::iterator &bl);
- void dump(Formatter *f) const;
- static void generate_test_instances(list<FSSuperblock*>& o);
-};
-WRITE_CLASS_ENCODER(FSSuperblock)
-
-inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
-{
- return out << "sb(" << sb.compat_features << "): "
- << sb.omap_backend;
-}
-
-class FileStore : public JournalingObjectStore,
- public md_config_obs_t
-{
- static const uint32_t target_version = 4;
-public:
- uint32_t get_target_version() {
- return target_version;
- }
-
- static int get_block_device_fsid(const string& path, uuid_d *fsid);
-
- struct FSPerfTracker {
- PerfCounters::avg_tracker<uint64_t> os_commit_latency;
- PerfCounters::avg_tracker<uint64_t> os_apply_latency;
-
- objectstore_perf_stat_t get_cur_stats() const {
- objectstore_perf_stat_t ret;
- ret.filestore_commit_latency = os_commit_latency.avg();
- ret.filestore_apply_latency = os_apply_latency.avg();
- return ret;
- }
-
- void update_from_perfcounters(PerfCounters &logger);
- } perf_tracker;
- objectstore_perf_stat_t get_cur_stats() {
- perf_tracker.update_from_perfcounters(*logger);
- return perf_tracker.get_cur_stats();
- }
-
-private:
- string internal_name; ///< internal name, used to name the perfcounter instance
- string basedir, journalpath;
- osflagbits_t generic_flags;
- std::string current_fn;
- std::string current_op_seq_fn;
- std::string omap_dir;
- uuid_d fsid;
-
- size_t blk_size; ///< fs block size
-
- int fsid_fd, op_fd, basedir_fd, current_fd;
-
- FileStoreBackend *backend;
-
- void create_backend(long f_type);
-
- deque<uint64_t> snaps;
-
- // Indexed Collections
- IndexManager index_manager;
- int get_index(coll_t c, Index *index);
- int init_index(coll_t c);
-
- void _kludge_temp_object_collection(coll_t& cid, const ghobject_t& oid) {
- // - normal temp case: cid is pg, object is temp (pool < -1)
- // - hammer temp case: cid is pg (or already temp), object pool is -1
- if (cid.is_pg() && (oid.hobj.pool < -1 ||
- oid.hobj.pool == -1))
- cid = cid.get_temp();
- }
- void init_temp_collections();
-
- // ObjectMap
- boost::scoped_ptr<ObjectMap> object_map;
-
- // helper fns
- int get_cdir(coll_t cid, char *s, int len);
-
- /// read a uuid from fd
- int read_fsid(int fd, uuid_d *uuid);
-
- /// lock fsid_fd
- int lock_fsid();
-
- // sync thread
- Mutex lock;
- bool force_sync;
- Cond sync_cond;
-
- Mutex sync_entry_timeo_lock;
- SafeTimer timer;
-
- list<Context*> sync_waiters;
- bool stop;
- void sync_entry();
- struct SyncThread : public Thread {
- FileStore *fs;
- SyncThread(FileStore *f) : fs(f) {}
- void *entry() {
- fs->sync_entry();
- return 0;
- }
- } sync_thread;
-
- // -- op workqueue --
- struct Op {
- utime_t start;
- uint64_t op;
- list<Transaction*> tls;
- Context *onreadable, *onreadable_sync;
- uint64_t ops, bytes;
- TrackedOpRef osd_op;
- };
- class OpSequencer : public Sequencer_impl {
- Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
- list<Op*> q;
- list<uint64_t> jq;
- list<pair<uint64_t, Context*> > flush_commit_waiters;
- Cond cond;
- public:
- Sequencer *parent;
- Mutex apply_lock; // for apply mutual exclusion
- int id;
-
- /// get_max_uncompleted
- bool _get_max_uncompleted(
- uint64_t *seq ///< [out] max uncompleted seq
- ) {
- assert(qlock.is_locked());
- assert(seq);
- *seq = 0;
- if (q.empty() && jq.empty())
- return true;
-
- if (!q.empty())
- *seq = q.back()->op;
- if (!jq.empty() && jq.back() > *seq)
- *seq = jq.back();
-
- return false;
- } /// @returns true if both queues are empty
-
- /// get_min_uncompleted
- bool _get_min_uncompleted(
- uint64_t *seq ///< [out] min uncompleted seq
- ) {
- assert(qlock.is_locked());
- assert(seq);
- *seq = 0;
- if (q.empty() && jq.empty())
- return true;
-
- if (!q.empty())
- *seq = q.front()->op;
- if (!jq.empty() && jq.front() < *seq)
- *seq = jq.front();
-
- return false;
- } /// @returns true if both queues are empty
-
- void _wake_flush_waiters(list<Context*> *to_queue) {
- uint64_t seq;
- if (_get_min_uncompleted(&seq))
- seq = -1;
-
- for (list<pair<uint64_t, Context*> >::iterator i =
- flush_commit_waiters.begin();
- i != flush_commit_waiters.end() && i->first < seq;
- flush_commit_waiters.erase(i++)) {
- to_queue->push_back(i->second);
- }
- }
-
- void queue_journal(uint64_t s) {
- Mutex::Locker l(qlock);
- jq.push_back(s);
- }
- void dequeue_journal(list<Context*> *to_queue) {
- Mutex::Locker l(qlock);
- jq.pop_front();
- cond.Signal();
- _wake_flush_waiters(to_queue);
- }
- void queue(Op *o) {
- Mutex::Locker l(qlock);
- q.push_back(o);
- }
- Op *peek_queue() {
- Mutex::Locker l(qlock);
- assert(apply_lock.is_locked());
- return q.front();
- }
-
- Op *dequeue(list<Context*> *to_queue) {
- assert(to_queue);
- assert(apply_lock.is_locked());
- Mutex::Locker l(qlock);
- Op *o = q.front();
- q.pop_front();
- cond.Signal();
-
- _wake_flush_waiters(to_queue);
- return o;
- }
-
- void flush() {
- Mutex::Locker l(qlock);
-
- while (g_conf->filestore_blackhole)
- cond.Wait(qlock); // wait forever
-
-
- // get max for journal _or_ op queues
- uint64_t seq = 0;
- if (!q.empty())
- seq = q.back()->op;
- if (!jq.empty() && jq.back() > seq)
- seq = jq.back();
-
- if (seq) {
- // wait for everything prior to our watermark to drain through either/both queues
- while ((!q.empty() && q.front()->op <= seq) ||
- (!jq.empty() && jq.front() <= seq))
- cond.Wait(qlock);
- }
- }
- bool flush_commit(Context *c) {
- Mutex::Locker l(qlock);
- uint64_t seq = 0;
- if (_get_max_uncompleted(&seq)) {
- return true;
- } else {
- flush_commit_waiters.push_back(make_pair(seq, c));
- return false;
- }
- }
-
- OpSequencer(int i)
- : qlock("FileStore::OpSequencer::qlock", false, false),
- parent(0),
- apply_lock("FileStore::OpSequencer::apply_lock", false, false),
- id(i) {}
- ~OpSequencer() {
- assert(q.empty());
- }
-
- const string& get_name() const {
- return parent->get_name();
- }
- };
-
- friend ostream& operator<<(ostream& out, const OpSequencer& s);
-
- FDCache fdcache;
- WBThrottle wbthrottle;
-
- atomic_t next_osr_id;
- deque<OpSequencer*> op_queue;
- Throttle throttle_ops, throttle_bytes;
- const int m_ondisk_finisher_num;
- const int m_apply_finisher_num;
- vector<Finisher*> ondisk_finishers;
- vector<Finisher*> apply_finishers;
-
- ThreadPool op_tp;
- struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
- FileStore *store;
- OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
- : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {}
-
- bool _enqueue(OpSequencer *osr) {
- store->op_queue.push_back(osr);
- return true;
- }
- void _dequeue(OpSequencer *o) {
- assert(0);
- }
- bool _empty() {
- return store->op_queue.empty();
- }
- OpSequencer *_dequeue() {
- if (store->op_queue.empty())
- return NULL;
- OpSequencer *osr = store->op_queue.front();
- store->op_queue.pop_front();
- return osr;
- }
- void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) {
- store->_do_op(osr, handle);
- }
- using ThreadPool::WorkQueue<OpSequencer>::_process;
- void _process_finish(OpSequencer *osr) {
- store->_finish_op(osr);
- }
- void _clear() {
- assert(store->op_queue.empty());
- }
- } op_wq;
-
- void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
- void _finish_op(OpSequencer *o);
- Op *build_op(list<Transaction*>& tls,
- Context *onreadable, Context *onreadable_sync,
- TrackedOpRef osd_op);
- void queue_op(OpSequencer *osr, Op *o);
- void op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle = NULL);
- void op_queue_release_throttle(Op *o);
- void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
- friend struct C_JournaledAhead;
-
- void new_journal();
-
- PerfCounters *logger;
-
-public:
- int lfn_find(const ghobject_t& oid, const Index& index,
- IndexedPath *path = NULL);
- int lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length);
- int lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf);
- int lfn_open(
- coll_t cid,
- const ghobject_t& oid,
- bool create,
- FDRef *outfd,
- Index *index = 0);
-
- void lfn_close(FDRef fd);
- int lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid) ;
- int lfn_unlink(coll_t cid, const ghobject_t& o, const SequencerPosition &spos,
- bool force_clear_omap=false);
-
-public:
- FileStore(const std::string &base, const std::string &jdev,
- osflagbits_t flags = 0,
- const char *internal_name = "filestore", bool update_to=false);
- ~FileStore();
-
- int _detect_fs();
- int _sanity_check_fs();
-
- bool test_mount_in_use();
- int read_op_seq(uint64_t *seq);
- int write_op_seq(int, uint64_t seq);
- int mount();
- int umount();
- unsigned get_max_object_name_length() {
- // not safe for all file systems, btw! use the tunable to limit this.
- return 4096;
- }
- unsigned get_max_attr_name_length() {
- // xattr limit is 128; leave room for our prefixes (user.ceph._),
- // some margin, and cap at 100
- return 100;
- }
- int mkfs();
- int mkjournal();
- bool wants_journal() {
- return true;
- }
- bool allows_journal() {
- return true;
- }
- bool needs_journal() {
- return false;
- }
-
- int write_version_stamp();
- int version_stamp_is_valid(uint32_t *version);
- int update_version_stamp();
- int upgrade();
-
- bool can_sort_nibblewise() {
- return true; // i support legacy sort order
- }
-
- void collect_metadata(map<string,string> *pm);
-
- int statfs(struct statfs *buf);
-
- int _do_transactions(
- list<Transaction*> &tls, uint64_t op_seq,
- ThreadPool::TPHandle *handle);
- int do_transactions(list<Transaction*> &tls, uint64_t op_seq) {
- return _do_transactions(tls, op_seq, 0);
- }
- unsigned _do_transaction(
- Transaction& t, uint64_t op_seq, int trans_num,
- ThreadPool::TPHandle *handle);
-
- int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
- TrackedOpRef op = TrackedOpRef(),
- ThreadPool::TPHandle *handle = NULL);
-
- /**
- * set replay guard xattr on given file
- *
- * This will ensure that we will not replay this (or any previous) operation
- * against this particular inode/object.
- *
- * @param fd open file descriptor for the file/object
- * @param spos sequencer position of the last operation we should not replay
- */
- void _set_replay_guard(int fd,
- const SequencerPosition& spos,
- const ghobject_t *oid=0,
- bool in_progress=false);
- void _set_replay_guard(coll_t cid,
- const SequencerPosition& spos,
- bool in_progress);
- void _set_global_replay_guard(coll_t cid,
- const SequencerPosition &spos);
-
- /// close a replay guard opened with in_progress=true
- void _close_replay_guard(int fd, const SequencerPosition& spos);
- void _close_replay_guard(coll_t cid, const SequencerPosition& spos);
-
- /**
- * check replay guard xattr on given file
- *
- * Check the current position against any marker on the file that
- * indicates which operations have already been applied. If the
- * current or a newer operation has been marked as applied, we
- * should not replay the current operation again.
- *
- * If we are not replaying the journal, we always return true. It
- * is only on replay that we might return false, indicating that the
- * operation should not be performed (again).
- *
- * @param fd open fd on the file/object in question
- * @param spos sequencer position for an operation we could apply/replay
- * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
- */
- int _check_replay_guard(int fd, const SequencerPosition& spos);
- int _check_replay_guard(coll_t cid, const SequencerPosition& spos);
- int _check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& pos);
- int _check_global_replay_guard(coll_t cid, const SequencerPosition& spos);
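- // Typical replay-guard usage (see e.g. _split_collection in FileStore.cc): check
- // the guard and skip the op if it has already been applied; otherwise set the
- // guard, perform the work, then close the guard.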
-
- // ------------------
- // objects
- int pick_object_revision_lt(ghobject_t& oid) {
- return 0;
- }
- bool exists(coll_t cid, const ghobject_t& oid);
- int stat(
- coll_t cid,
- const ghobject_t& oid,
- struct stat *st,
- bool allow_eio = false);
- int read(
- coll_t cid,
- const ghobject_t& oid,
- uint64_t offset,
- size_t len,
- bufferlist& bl,
- uint32_t op_flags = 0,
- bool allow_eio = false);
- int _do_fiemap(int fd, uint64_t offset, size_t len,
- map<uint64_t, uint64_t> *m);
- int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
- map<uint64_t, uint64_t> *m);
- int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
-
- int _touch(coll_t cid, const ghobject_t& oid);
- int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len,
- const bufferlist& bl, uint32_t fadvise_flags = 0);
- int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len);
- int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size);
- int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
- const SequencerPosition& spos);
- int _clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
- uint64_t srcoff, uint64_t len, uint64_t dstoff,
- const SequencerPosition& spos);
- int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
- int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
- int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
- int _remove(coll_t cid, const ghobject_t& oid, const SequencerPosition &spos);
-
- int _fgetattr(int fd, const char *name, bufferptr& bp);
- int _fgetattrs(int fd, map<string,bufferptr>& aset);
- int _fsetattrs(int fd, map<string, bufferptr> &aset);
-
- void _start_sync();
-
- void do_force_sync();
- void start_sync(Context *onsafe);
- void sync();
- void _flush_op_queue();
- void flush();
- void sync_and_flush();
-
- int flush_journal();
- int dump_journal(ostream& out);
-
- void set_fsid(uuid_d u) {
- fsid = u;
- }
- uuid_d get_fsid() { return fsid; }
-
- // DEBUG read error injection; an object is removed from both error sets on delete()
- Mutex read_error_lock;
- set<ghobject_t, ghobject_t::BitwiseComparator> data_error_set; // read() will return -EIO
- set<ghobject_t, ghobject_t::BitwiseComparator> mdata_error_set; // getattr(),stat() will return -EIO
- void inject_data_error(const ghobject_t &oid);
- void inject_mdata_error(const ghobject_t &oid);
- void debug_obj_on_delete(const ghobject_t &oid);
- bool debug_data_eio(const ghobject_t &oid);
- bool debug_mdata_eio(const ghobject_t &oid);
-
- int snapshot(const string& name);
-
- // attrs
- int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp);
- int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset);
-
- int _setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
- const SequencerPosition &spos);
- int _rmattr(coll_t cid, const ghobject_t& oid, const char *name,
- const SequencerPosition &spos);
- int _rmattrs(coll_t cid, const ghobject_t& oid,
- const SequencerPosition &spos);
-
- int collection_getattr(coll_t c, const char *name, void *value, size_t size);
- int collection_getattr(coll_t c, const char *name, bufferlist& bl);
- int collection_getattrs(coll_t cid, map<string,bufferptr> &aset);
-
- int _collection_setattr(coll_t c, const char *name, const void *value, size_t size);
- int _collection_rmattr(coll_t c, const char *name);
- int _collection_setattrs(coll_t cid, map<string,bufferptr> &aset);
- int _collection_remove_recursive(const coll_t &cid,
- const SequencerPosition &spos);
-
- // collections
- int collection_list(coll_t c, ghobject_t start, ghobject_t end,
- bool sort_bitwise, int max,
- vector<ghobject_t> *ls, ghobject_t *next);
- int list_collections(vector<coll_t>& ls);
- int list_collections(vector<coll_t>& ls, bool include_temp);
- int collection_version_current(coll_t c, uint32_t *version);
- int collection_stat(coll_t c, struct stat *st);
- bool collection_exists(coll_t c);
- bool collection_empty(coll_t c);
-
- // omap (see ObjectStore.h for documentation)
- int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header,
- map<string, bufferlist> *out);
- int omap_get_header(
- coll_t c,
- const ghobject_t &oid,
- bufferlist *out,
- bool allow_eio = false);
- int omap_get_keys(coll_t c, const ghobject_t &oid, set<string> *keys);
- int omap_get_values(coll_t c, const ghobject_t &oid, const set<string> &keys,
- map<string, bufferlist> *out);
- int omap_check_keys(coll_t c, const ghobject_t &oid, const set<string> &keys,
- set<string> *out);
- ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const ghobject_t &oid);
-
- int _create_collection(coll_t c, const SequencerPosition &spos);
- int _destroy_collection(coll_t c);
- /**
- * Give an expected number of objects hint to the collection.
- *
- * @param c - collection id.
- * @param pg_num - pg number of the pool this collection belongs to
- * @param expected_num_objs - expected number of objects in this collection
- * @param spos - sequence position
- *
- * @return 0 on success, an error code otherwise
- */
- int _collection_hint_expected_num_objs(coll_t c, uint32_t pg_num,
- uint64_t expected_num_objs,
- const SequencerPosition &spos);
- int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid,
- const SequencerPosition& spos);
- int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
- coll_t c, const ghobject_t& o,
- const SequencerPosition& spos);
-
- int _set_alloc_hint(coll_t cid, const ghobject_t& oid,
- uint64_t expected_object_size,
- uint64_t expected_write_size);
-
- void dump_start(const std::string& file);
- void dump_stop();
- void dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t seq, OpSequencer *osr);
-
-private:
- void _inject_failure();
-
- // omap
- int _omap_clear(coll_t cid, const ghobject_t &oid,
- const SequencerPosition &spos);
- int _omap_setkeys(coll_t cid, const ghobject_t &oid,
- const map<string, bufferlist> &aset,
- const SequencerPosition &spos);
- int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set<string> &keys,
- const SequencerPosition &spos);
- int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid,
- const string& first, const string& last,
- const SequencerPosition &spos);
- int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl,
- const SequencerPosition &spos);
- int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest,
- const SequencerPosition &spos);
- int _split_collection_create(coll_t cid, uint32_t bits, uint32_t rem,
- coll_t dest,
- const SequencerPosition &spos);
-
- virtual const char** get_tracked_conf_keys() const;
- virtual void handle_conf_change(const struct md_config_t *conf,
- const std::set <std::string> &changed);
- float m_filestore_commit_timeout;
- bool m_filestore_journal_parallel;
- bool m_filestore_journal_trailing;
- bool m_filestore_journal_writeahead;
- int m_filestore_fiemap_threshold;
- double m_filestore_max_sync_interval;
- double m_filestore_min_sync_interval;
- bool m_filestore_fail_eio;
- bool m_filestore_fadvise;
- int do_update;
- bool m_journal_dio, m_journal_aio, m_journal_force_aio;
- std::string m_osd_rollback_to_cluster_snap;
- bool m_osd_use_stale_snap;
- int m_filestore_queue_max_ops;
- int m_filestore_queue_max_bytes;
- int m_filestore_queue_committing_max_ops;
- int m_filestore_queue_committing_max_bytes;
- bool m_filestore_do_dump;
- std::ofstream m_filestore_dump;
- JSONFormatter m_filestore_dump_fmt;
- atomic_t m_filestore_kill_at;
- bool m_filestore_sloppy_crc;
- int m_filestore_sloppy_crc_block_size;
- uint64_t m_filestore_max_alloc_hint_size;
- long m_fs_type;
-
- // Determine xattr handling based on fs type
- void set_xattr_limits_via_conf();
- uint32_t m_filestore_max_inline_xattr_size;
- uint32_t m_filestore_max_inline_xattrs;
-
- FSSuperblock superblock;
-
- /**
- * write_superblock()
- *
- * Write superblock to persistent storage
- *
- * return value: 0 on success, otherwise negative errno
- */
- int write_superblock();
-
- /**
- * read_superblock()
- *
- * Fill in FileStore::superblock by reading persistent storage
- *
- * return value: 0 on success, otherwise negative errno
- */
- int read_superblock();
-
- friend class FileStoreBackend;
- friend class TestFileStore;
-};
-
-ostream& operator<<(ostream& out, const FileStore::OpSequencer& s);
-
-struct fiemap;
-
-class FileStoreBackend {
-private:
- FileStore *filestore;
-protected:
- int get_basedir_fd() {
- return filestore->basedir_fd;
- }
- int get_current_fd() {
- return filestore->current_fd;
- }
- int get_op_fd() {
- return filestore->op_fd;
- }
- size_t get_blksize() {
- return filestore->blk_size;
- }
- const string& get_basedir_path() {
- return filestore->basedir;
- }
- const string& get_current_path() {
- return filestore->current_fn;
- }
- int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
- if (has_fiemap() || has_seek_data_hole()) {
- return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
- } else {
- return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
- }
- }
- int get_crc_block_size() {
- return filestore->m_filestore_sloppy_crc_block_size;
- }
-
-public:
- FileStoreBackend(FileStore *fs) : filestore(fs) {}
- virtual ~FileStoreBackend() {}
-
- static FileStoreBackend *create(long f_type, FileStore *fs);
-
- virtual const char *get_name() = 0;
- virtual int detect_features() = 0;
- virtual int create_current() = 0;
- virtual bool can_checkpoint() = 0;
- virtual int list_checkpoints(list<string>& ls) = 0;
- virtual int create_checkpoint(const string& name, uint64_t *cid) = 0;
- virtual int sync_checkpoint(uint64_t id) = 0;
- virtual int rollback_to(const string& name) = 0;
- virtual int destroy_checkpoint(const string& name) = 0;
- virtual int syncfs() = 0;
- virtual bool has_fiemap() = 0;
- virtual bool has_seek_data_hole() = 0;
- virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
- virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
- virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
- virtual bool has_splice() const = 0;
-
- // hooks for (sloppy) crc tracking
- virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
- virtual int _crc_update_truncate(int fd, loff_t off) = 0;
- virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
- virtual int _crc_update_clone_range(int srcfd, int destfd,
- loff_t srcoff, size_t len, loff_t dstoff) = 0;
- virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
- ostream *out) = 0;
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "include/int_types.h"
-#include "include/types.h"
-
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-
-#if defined(__linux__)
-#include <linux/fs.h>
-#endif
-
-#include "include/compat.h"
-#include "include/linux_fiemap.h"
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-
-#include "GenericFileStoreBackend.h"
-
-#include "common/errno.h"
-#include "common/config.h"
-#include "common/sync_filesystem.h"
-
-#include "common/SloppyCRCMap.h"
-#include "os/chain_xattr.h"
-
-#define SLOPPY_CRC_XATTR "user.cephos.scrc"
-
-
-#define dout_subsys ceph_subsys_filestore
-#undef dout_prefix
-#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
-
-#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
-#define ALIGNED(x, by) (!((x) % (by)))
-#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
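-// e.g. with by = 4096: ALIGN_DOWN(5000, 4096) == 4096, ALIGN_UP(5000, 4096) == 8192,
-// and ALIGNED(8192, 4096) is true (illustrative values, not from the original code).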
-
-GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
- FileStoreBackend(fs),
- ioctl_fiemap(false),
- seek_data_hole(false),
- m_filestore_fiemap(g_conf->filestore_fiemap),
- m_filestore_seek_data_hole(g_conf->filestore_seek_data_hole),
- m_filestore_fsync_flushes_journal_data(g_conf->filestore_fsync_flushes_journal_data),
- m_filestore_splice(false) {}
-
-int GenericFileStoreBackend::detect_features()
-{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
-
- int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644);
- if (fd < 0) {
- fd = -errno;
- derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
- return fd;
- }
-
- // ext4 has a bug in older kernels where fiemap will return an empty
- // result in some cases. this is a file layout that triggers the bug
- // on 2.6.34-rc5.
- int v[] = {
- 0x0000000000016000, 0x0000000000007000,
- 0x000000000004a000, 0x0000000000007000,
- 0x0000000000060000, 0x0000000000001000,
- 0x0000000000061000, 0x0000000000008000,
- 0x0000000000069000, 0x0000000000007000,
- 0x00000000000a3000, 0x000000000000c000,
- 0x000000000024e000, 0x000000000000c000,
- 0x000000000028b000, 0x0000000000009000,
- 0x00000000002b1000, 0x0000000000003000,
- 0, 0
- };
- for (int i=0; v[i]; i++) {
- int off = v[i++];
- int len = v[i];
-
- // write a large extent
- char buf[len];
- memset(buf, 1, sizeof(buf));
- int r = ::lseek(fd, off, SEEK_SET);
- if (r < 0) {
- r = -errno;
- derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return r;
- }
- r = write(fd, buf, sizeof(buf));
- if (r < 0) {
- r = -errno;
- derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return r;
- }
- }
-
- // fiemap an extent inside that
- if (!m_filestore_fiemap) {
- dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
- ioctl_fiemap = false;
- } else {
- struct fiemap *fiemap;
- int r = do_fiemap(fd, 2430421, 59284, &fiemap);
- if (r < 0) {
- dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
- ioctl_fiemap = false;
- } else {
- if (fiemap->fm_mapped_extents == 0) {
- dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
- ioctl_fiemap = false;
- } else {
- dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
- ioctl_fiemap = true;
- }
- free(fiemap);
- }
- }
-
- // SEEK_DATA/SEEK_HOLE detection
- if (!m_filestore_seek_data_hole) {
- dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
- seek_data_hole = false;
- } else {
-#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
- // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
- // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
- // Fall back to use fiemap.
- off_t hole_pos;
-
- hole_pos = lseek(fd, 0, SEEK_HOLE);
- if (hole_pos < 0) {
- if (errno == EINVAL) {
- dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
- seek_data_hole = false;
- } else {
- derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return -errno;
- }
- } else {
- dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
- seek_data_hole = true;
- }
-#endif
- }
-
- //splice detection
-#ifdef CEPH_HAVE_SPLICE
- if (!m_filestore_splice) {
- int pipefd[2];
- loff_t off_in = 0;
- int r;
- if ((r = pipe(pipefd)) < 0)
- dout(0) << "detect_features: failed to create pipe for splice detection: " << cpp_strerror(errno) << dendl;
- else {
- lseek(fd, 0, SEEK_SET);
- r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
- if (!(r < 0 && errno == EINVAL)) {
- m_filestore_splice = true;
- dout(0) << "detect_features: splice is supported" << dendl;
- } else
- dout(0) << "detect_features: splice is NOT supported" << dendl;
- close(pipefd[0]);
- close(pipefd[1]);
- }
- }
-#endif
- ::unlink(fn);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
-
-
- bool have_syncfs = false;
-#ifdef HAVE_SYS_SYNCFS
- if (::syncfs(get_basedir_fd()) == 0) {
- dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
- have_syncfs = true;
- } else {
- dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
- }
-#elif defined(SYS_syncfs)
- if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
- dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
- have_syncfs = true;
- } else {
- dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
- }
-#elif defined(__NR_syncfs)
- if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
- dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
- have_syncfs = true;
- } else {
- dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
- }
-#endif
- if (!have_syncfs) {
- dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
- if (m_filestore_fsync_flushes_journal_data) {
- dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
- } else {
- dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
- dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
- }
- }
-
- return 0;
-}
-
-int GenericFileStoreBackend::create_current()
-{
- struct stat st;
- int ret = ::stat(get_current_path().c_str(), &st);
- if (ret == 0) {
- // current/ exists
- if (!S_ISDIR(st.st_mode)) {
- dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
- ret = -EINVAL;
- }
- } else {
- ret = ::mkdir(get_current_path().c_str(), 0755);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl;
- }
- }
- return ret;
-}
-
-int GenericFileStoreBackend::syncfs()
-{
- int ret;
- if (m_filestore_fsync_flushes_journal_data) {
- dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
- // make the file system's journal commit.
- // this works with ext3, but NOT ext4
- ret = ::fsync(get_op_fd());
- if (ret < 0)
- ret = -errno;
- } else {
- dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
- ret = sync_filesystem(get_current_fd());
- }
- return ret;
-}
-
-int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
-{
- struct fiemap *fiemap = NULL;
- struct fiemap *_realloc_fiemap = NULL;
- int size;
- int ret;
-
- fiemap = (struct fiemap*)calloc(1, sizeof(struct fiemap));
- if (!fiemap)
- return -ENOMEM;
- /*
- * There is a bug in xfs fiemap: given (offset=3990, len=4096), the result
- * is (logical=4096, len=4096), which leaks the range [3990, 4096).
- * Kernel commit "xfs: fix rounding error of fiemap length parameter"
- * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7) fixes this bug.
- * Here we align the offset down to CEPH_PAGE_SIZE to avoid it.
- */
- fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
- fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
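- // e.g. assuming CEPH_PAGE_SIZE is 4096: start=3990, len=4096 yields
- // fm_start=0 and fm_length=8086, which still covers the requested range.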
- fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
-
-#if defined(DARWIN) || defined(__FreeBSD__)
- ret = -ENOTSUP;
- goto done_err;
-#else
- if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
- ret = -errno;
- goto done_err;
- }
-#endif
- size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
-
- _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
- if (!_realloc_fiemap) {
- ret = -ENOMEM;
- goto done_err;
- } else {
- fiemap = _realloc_fiemap;
- }
-
- memset(fiemap->fm_extents, 0, size);
-
- fiemap->fm_extent_count = fiemap->fm_mapped_extents;
- fiemap->fm_mapped_extents = 0;
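- // second FS_IOC_FIEMAP call: with fm_extent_count sized from the first pass,
- // the kernel now fills in the actual extent array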
-
-#if defined(DARWIN) || defined(__FreeBSD__)
- ret = -ENOTSUP;
- goto done_err;
-#else
- if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
- ret = -errno;
- goto done_err;
- }
- *pfiemap = fiemap;
-#endif
- return 0;
-
-done_err:
- *pfiemap = NULL;
- free(fiemap);
- return ret;
-}
-
-
-int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
-{
- char buf[100];
- bufferptr bp;
- int r = 0;
- int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
- if (l == -ENODATA) {
- return 0;
- }
- if (l >= 0) {
- bp = buffer::create(l);
- memcpy(bp.c_str(), buf, l);
- } else if (l == -ERANGE) {
- l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
- if (l > 0) {
- bp = buffer::create(l);
- l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
- }
- }
- bufferlist bl;
- bl.append(bp);
- bufferlist::iterator p = bl.begin();
- try {
- ::decode(*cm, p);
- }
- catch (buffer::error &e) {
- r = -EIO;
- }
- if (r < 0)
- derr << __func__ << " got " << cpp_strerror(r) << dendl;
- return r;
-}
-
-int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
-{
- bufferlist bl;
- ::encode(*cm, bl);
- int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
- if (r < 0)
- derr << __func__ << " got " << cpp_strerror(r) << dendl;
- return r;
-}
-
-int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
-{
- SloppyCRCMap scm(get_crc_block_size());
- int r = _crc_load_or_init(fd, &scm);
- if (r < 0)
- return r;
- ostringstream ss;
- scm.write(off, len, bl, &ss);
- dout(30) << __func__ << "\n" << ss.str() << dendl;
- r = _crc_save(fd, &scm);
- return r;
-}
-
-int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
-{
- SloppyCRCMap scm(get_crc_block_size());
- int r = _crc_load_or_init(fd, &scm);
- if (r < 0)
- return r;
- scm.truncate(off);
- r = _crc_save(fd, &scm);
- return r;
-}
-
-int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
-{
- SloppyCRCMap scm(get_crc_block_size());
- int r = _crc_load_or_init(fd, &scm);
- if (r < 0)
- return r;
- scm.zero(off, len);
- r = _crc_save(fd, &scm);
- return r;
-}
-
-int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
- loff_t srcoff, size_t len, loff_t dstoff)
-{
- SloppyCRCMap scm_src(get_crc_block_size());
- SloppyCRCMap scm_dst(get_crc_block_size());
- int r = _crc_load_or_init(srcfd, &scm_src);
- if (r < 0)
- return r;
- r = _crc_load_or_init(destfd, &scm_dst);
- if (r < 0)
- return r;
- ostringstream ss;
- scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
- dout(30) << __func__ << "\n" << ss.str() << dendl;
- r = _crc_save(destfd, &scm_dst);
- return r;
-}
-
-int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
- ostream *out)
-{
- SloppyCRCMap scm(get_crc_block_size());
- int r = _crc_load_or_init(fd, &scm);
- if (r < 0)
- return r;
- return scm.read(off, len, bl, out);
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef CEPH_GENERICFILESTOREBACKEND_H
-#define CEPH_GENERICFILESTOREBACKEND_H
-
-#include "FileStore.h"
-
-class SloppyCRCMap;
-
-class GenericFileStoreBackend : public FileStoreBackend {
-private:
- bool ioctl_fiemap;
- bool seek_data_hole;
- bool m_filestore_fiemap;
- bool m_filestore_seek_data_hole;
- bool m_filestore_fsync_flushes_journal_data;
- bool m_filestore_splice;
-public:
- GenericFileStoreBackend(FileStore *fs);
- virtual ~GenericFileStoreBackend() {}
-
- virtual const char *get_name() {
- return "generic";
- }
- virtual int detect_features();
- virtual int create_current();
- virtual bool can_checkpoint() { return false; }
- virtual int list_checkpoints(list<string>& ls) { return 0; }
- virtual int create_checkpoint(const string& name, uint64_t *cid) { return -EOPNOTSUPP; }
- virtual int sync_checkpoint(uint64_t id) { return -EOPNOTSUPP; }
- virtual int rollback_to(const string& name) { return -EOPNOTSUPP; }
- virtual int destroy_checkpoint(const string& name) { return -EOPNOTSUPP; }
- virtual int syncfs();
- virtual bool has_fiemap() { return ioctl_fiemap; }
- virtual bool has_seek_data_hole() { return seek_data_hole; }
- virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap);
- virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
- return _copy_range(from, to, srcoff, len, dstoff);
- }
- virtual int set_alloc_hint(int fd, uint64_t hint) { return -EOPNOTSUPP; }
- virtual bool has_splice() const { return m_filestore_splice; }
-private:
- int _crc_load_or_init(int fd, SloppyCRCMap *cm);
- int _crc_save(int fd, SloppyCRCMap *cm);
-public:
- virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl);
- virtual int _crc_update_truncate(int fd, loff_t off);
- virtual int _crc_update_zero(int fd, loff_t off, size_t len);
- virtual int _crc_update_clone_range(int srcfd, int destfd,
- loff_t srcoff, size_t len, loff_t dstoff);
- virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
- ostream *out);
-};
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "include/types.h"
-#include "include/buffer.h"
-#include "osd/osd_types.h"
-#include <errno.h>
-
-#include "HashIndex.h"
-
-#include "common/debug.h"
-#define dout_subsys ceph_subsys_filestore
-
-const string HashIndex::SUBDIR_ATTR = "contents";
-const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
-
-/// hex digit to integer value
-int hex_to_int(char c)
-{
- if (c >= '0' && c <= '9')
- return c - '0';
- if (c >= 'A' && c <= 'F')
- return c - 'A' + 10;
- assert(0);
-}
-
-/// int value to hex digit
-char int_to_hex(int v)
-{
- assert(v < 16);
- if (v < 10)
- return '0' + v;
- return 'A' + v - 10;
-}
-
-/// reverse bits in a nibble (0..15)
-int reverse_nibble_bits(int in)
-{
- assert(in < 16);
- return
- ((in & 8) >> 3) |
- ((in & 4) >> 1) |
- ((in & 2) << 1) |
- ((in & 1) << 3);
-}
-
-/// reverse nibble bits in a hex digit
-char reverse_hexdigit_bits(char c)
-{
- return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
-}
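-// e.g. reverse_nibble_bits(0x1) == 0x8 and reverse_nibble_bits(0x6) == 0x6,
-// so reverse_hexdigit_bits('1') == '8' and reverse_hexdigit_bits('6') == '6'.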
-
-/// reverse nibble bits in a hex string
-string reverse_hexdigit_bits_string(string s)
-{
- for (unsigned i=0; i<s.size(); ++i)
- s[i] = reverse_hexdigit_bits(s[i]);
- return s;
-}
-
-/// compare hex digit (as length 1 string) bitwise
-bool cmp_hexdigit_bitwise(const string& l, const string& r)
-{
- assert(l.length() == 1 && r.length() == 1);
- int lv = hex_to_int(l[0]);
- int rv = hex_to_int(r[0]);
- assert(lv < 16);
- assert(rv < 16);
- return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
-}
-
-/// compare hex digit string bitwise
-bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
-{
- string ll = reverse_hexdigit_bits_string(l);
- string rr = reverse_hexdigit_bits_string(r);
- return ll < rr;
-}
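-// e.g. under this ordering "8" sorts before "1", since reversing the nibble bits
-// maps '8' (1000) to 0001 and '1' (0001) to 1000.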
-
-int HashIndex::cleanup() {
- bufferlist bl;
- int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
- if (r < 0) {
- // No in progress operations!
- return 0;
- }
- bufferlist::iterator i = bl.begin();
- InProgressOp in_progress(i);
- subdir_info_s info;
- r = get_info(in_progress.path, &info);
- if (r == -ENOENT) {
- return end_split_or_merge(in_progress.path);
- } else if (r < 0) {
- return r;
- }
-
- if (in_progress.is_split())
- return complete_split(in_progress.path, info);
- else if (in_progress.is_merge())
- return complete_merge(in_progress.path, info);
- else if (in_progress.is_col_split()) {
- for (vector<string>::iterator i = in_progress.path.begin();
- i != in_progress.path.end();
- ++i) {
- vector<string> path(in_progress.path.begin(), i);
- int r = reset_attr(path);
- if (r < 0)
- return r;
- }
- return 0;
- }
- else
- return -EINVAL;
-}
-
-int HashIndex::reset_attr(
- const vector<string> &path)
-{
- int exists = 0;
- int r = path_exists(path, &exists);
- if (r < 0)
- return r;
- if (!exists)
- return 0;
- map<string, ghobject_t> objects;
- vector<string> subdirs;
- r = list_objects(path, 0, 0, &objects);
- if (r < 0)
- return r;
- r = list_subdirs(path, &subdirs);
- if (r < 0)
- return r;
-
- subdir_info_s info;
- info.hash_level = path.size();
- info.objs = objects.size();
- info.subdirs = subdirs.size();
- return set_info(path, info);
-}
-
-int HashIndex::col_split_level(
- HashIndex &from,
- HashIndex &to,
- const vector<string> &path,
- uint32_t inbits,
- uint32_t match,
- unsigned *mkdirred)
-{
- /* For each subdir, move it, recurse into it, or ignore it based on comparing
- * the low-order bits of the hash represented by the subdir path against the
- * inbits and match passed in.
- */
- vector<string> subdirs;
- int r = from.list_subdirs(path, &subdirs);
- if (r < 0)
- return r;
- map<string, ghobject_t> objects;
- r = from.list_objects(path, 0, 0, &objects);
- if (r < 0)
- return r;
-
- set<string> to_move;
- for (vector<string>::iterator i = subdirs.begin();
- i != subdirs.end();
- ++i) {
- uint32_t bits = 0;
- uint32_t hash = 0;
- vector<string> sub_path(path.begin(), path.end());
- sub_path.push_back(*i);
- path_to_hobject_hash_prefix(sub_path, &bits, &hash);
- if (bits < inbits) {
- if (hobject_t::match_hash(hash, bits, match)) {
- r = col_split_level(
- from,
- to,
- sub_path,
- inbits,
- match,
- mkdirred);
- if (r < 0)
- return r;
- if (*mkdirred > path.size())
- *mkdirred = path.size();
- } // else, skip, doesn't need to be moved or recursed into
- } else {
- if (hobject_t::match_hash(hash, inbits, match)) {
- to_move.insert(*i);
- }
- } // else, skip, doesn't need to be moved or recursed into
- }
-
- /* Then, do the same for each object */
- map<string, ghobject_t> objs_to_move;
- for (map<string, ghobject_t>::iterator i = objects.begin();
- i != objects.end();
- ++i) {
- if (i->second.match(inbits, match)) {
- objs_to_move.insert(*i);
- }
- }
-
- if (objs_to_move.empty() && to_move.empty())
- return 0;
-
- // Make parent directories as needed
- while (*mkdirred < path.size()) {
- ++*mkdirred;
- int exists = 0;
- vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
- r = to.path_exists(creating_path, &exists);
- if (r < 0)
- return r;
- if (exists)
- continue;
- subdir_info_s info;
- info.objs = 0;
- info.subdirs = 0;
- info.hash_level = creating_path.size();
- if (*mkdirred < path.size() - 1)
- info.subdirs = 1;
- r = to.start_col_split(creating_path);
- if (r < 0)
- return r;
- r = to.create_path(creating_path);
- if (r < 0)
- return r;
- r = to.set_info(creating_path, info);
- if (r < 0)
- return r;
- r = to.end_split_or_merge(creating_path);
- if (r < 0)
- return r;
- }
-
- subdir_info_s from_info;
- subdir_info_s to_info;
- r = from.get_info(path, &from_info);
- if (r < 0)
- return r;
- r = to.get_info(path, &to_info);
- if (r < 0)
- return r;
-
- from.start_col_split(path);
- to.start_col_split(path);
-
- // Do subdir moves
- for (set<string>::iterator i = to_move.begin();
- i != to_move.end();
- ++i) {
- from_info.subdirs--;
- to_info.subdirs++;
- r = move_subdir(from, to, path, *i);
- if (r < 0)
- return r;
- }
-
- for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
- i != objs_to_move.end();
- ++i) {
- from_info.objs--;
- to_info.objs++;
- r = move_object(from, to, path, *i);
- if (r < 0)
- return r;
- }
-
-
- r = to.set_info(path, to_info);
- if (r < 0)
- return r;
- r = from.set_info(path, from_info);
- if (r < 0)
- return r;
- from.end_split_or_merge(path);
- to.end_split_or_merge(path);
- return 0;
-}
-
-int HashIndex::_split(
- uint32_t match,
- uint32_t bits,
- CollectionIndex* dest) {
- assert(collection_version() == dest->collection_version());
- unsigned mkdirred = 0;
- return col_split_level(
- *this,
- *static_cast<HashIndex*>(dest),
- vector<string>(),
- bits,
- match,
- &mkdirred);
-}
-
-int HashIndex::_init() {
- subdir_info_s info;
- vector<string> path;
- return set_info(path, info);
-}
-
-/* LFNIndex virtual method implementations */
-int HashIndex::_created(const vector<string> &path,
- const ghobject_t &oid,
- const string &mangled_name) {
- subdir_info_s info;
- int r;
- r = get_info(path, &info);
- if (r < 0)
- return r;
- info.objs++;
- r = set_info(path, info);
- if (r < 0)
- return r;
-
- if (must_split(info)) {
- int r = initiate_split(path, info);
- if (r < 0)
- return r;
- return complete_split(path, info);
- } else {
- return 0;
- }
-}
-
-int HashIndex::_remove(const vector<string> &path,
- const ghobject_t &oid,
- const string &mangled_name) {
- int r;
- r = remove_object(path, oid);
- if (r < 0)
- return r;
- subdir_info_s info;
- r = get_info(path, &info);
- if (r < 0)
- return r;
- info.objs--;
- r = set_info(path, info);
- if (r < 0)
- return r;
- if (must_merge(info)) {
- r = initiate_merge(path, info);
- if (r < 0)
- return r;
- return complete_merge(path, info);
- } else {
- return 0;
- }
-}
-
-int HashIndex::_lookup(const ghobject_t &oid,
- vector<string> *path,
- string *mangled_name,
- int *hardlink) {
- vector<string> path_comp;
- get_path_components(oid, &path_comp);
- vector<string>::iterator next = path_comp.begin();
- int exists;
- while (1) {
- int r = path_exists(*path, &exists);
- if (r < 0)
- return r;
- if (!exists) {
- if (path->empty())
- return -ENOENT;
- path->pop_back();
- break;
- }
- if (next == path_comp.end())
- break;
- path->push_back(*(next++));
- }
- return get_mangled_name(*path, oid, mangled_name, hardlink);
-}
-
-int HashIndex::_collection_list_partial(const ghobject_t &start,
- const ghobject_t &end,
- bool sort_bitwise,
- int max_count,
- vector<ghobject_t> *ls,
- ghobject_t *next) {
- vector<string> path;
- ghobject_t _next;
- if (!next)
- next = &_next;
- *next = start;
- dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
- return list_by_hash(path, end, sort_bitwise, max_count, next, ls);
-}
-
-int HashIndex::prep_delete() {
- return recursive_remove(vector<string>());
-}
-
-int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
- int ret;
- vector<string> path;
- subdir_info_s root_info;
-  // Make sure there are neither objects nor sub-folders
-  // in this collection
- ret = get_info(path, &root_info);
- if (ret < 0)
- return ret;
-
- // Do the folder splitting first
- ret = pre_split_folder(pg_num, expected_num_objs);
- if (ret < 0)
- return ret;
- // Initialize the folder info starting from root
- return init_split_folder(path, 0);
-}
-
-int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
-{
- // If folder merging is enabled (by setting the threshold positive),
- // no need to split
- if (merge_threshold > 0)
- return 0;
- const coll_t c = coll();
- // Do not split if the expected number of objects in this collection is zero (by default)
- if (expected_num_objs == 0)
- return 0;
-
-  // Calculate the number of leaf folders (which actually store files)
-  // that need to be created
- const uint64_t objs_per_folder = (uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier * 16;
-  uint64_t leaves = expected_num_objs / objs_per_folder;
-  // No need to split
-  if (leaves == 0 || expected_num_objs == objs_per_folder)
- return 0;
-
- spg_t spgid;
- if (!c.is_pg_prefix(&spgid))
- return -EINVAL;
- const ps_t ps = spgid.pgid.ps();
-
- // the most significant bits of pg_num
- const int pg_num_bits = calc_num_bits(pg_num - 1);
- ps_t tmp_id = ps;
-  // calculate the number of levels at which we only create one sub-folder
- int num = pg_num_bits / 4;
-  // if pg_num_bits is a multiple of 4 but pg_num is not a full power of two
-  // (binary 1xxx,xxxx,xxxx rather than 1,0000,0000,0000), the top nibble is
-  // only partially determined by the PG, so splitting starts one level earlier
- if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
- --num;
- }
-
- int ret;
-  // Start by creating the levels that each contain only one sub-folder
- vector<string> paths;
- int dump_num = num;
- while (num-- > 0) {
- ps_t v = tmp_id & 0x0000000f;
- paths.push_back(to_hex(v));
- ret = create_path(paths);
- if (ret < 0 && ret != -EEXIST)
- return ret;
- tmp_id = tmp_id >> 4;
- }
-
- // Starting from here, we can split by creating multiple subfolders
- const int left_bits = pg_num_bits - dump_num * 4;
-  // this variable denotes how many bits (at this level) can be
-  // used for sub-folder splitting
- int split_bits = 4 - left_bits;
-  // the logic below is inspired by rados.h#ceph_stable_mod; it basically
-  // determines how many sub-folders we should create for splitting
- assert(pg_num_bits > 0); // otherwise BAD_SHIFT
- if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
- ++split_bits;
- }
- const uint32_t subs = (1 << split_bits);
- // Calculate how many levels we create starting from here
- int level = 0;
-  leaves /= subs;
-  while (leaves > 1) {
-    ++level;
-    leaves = leaves >> 4;
- }
- for (uint32_t i = 0; i < subs; ++i) {
- assert(split_bits <= 4); // otherwise BAD_SHIFT
- int v = tmp_id | (i << ((4 - split_bits) % 4));
- paths.push_back(to_hex(v));
- ret = create_path(paths);
- if (ret < 0 && ret != -EEXIST)
- return ret;
- ret = recursive_create_path(paths, level);
- if (ret < 0)
- return ret;
- paths.pop_back();
- }
- return 0;
-}
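-
-// A worked example of the sizing logic in pre_split_folder() above; the
-// numbers are illustrative only and assume filestore_merge_threshold = -10
-// and filestore_split_multiple = 2 (the config options IndexManager feeds
-// into this class):
-//   objs_per_folder = 10 * 2 * 16 = 320 objects per leaf folder
-//   expected_num_objs = 200000 -> leaves = 200000 / 320 = 625 leaf folders
-//   pg_num = 1024 -> pg_num_bits = 10 -> num = 2 single-sub-folder levels
-//   left_bits = 10 - 2*4 = 2 -> split_bits = 4 - 2 = 2 -> subs = 4
-//   (the ceph_stable_mod adjustment never fires for a power-of-two pg_num)
-//   level: 625/4 = 156 -> 9 -> 0, i.e. two further full 16-way levels,
-//   giving 4 * 16^2 = 1024 leaf folders, enough for the expected 625.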
-
-int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
-{
- // Get the number of sub directories for the current path
- vector<string> subdirs;
- int ret = list_subdirs(path, &subdirs);
- if (ret < 0)
- return ret;
- subdir_info_s info;
- info.subdirs = subdirs.size();
- info.hash_level = hash_level;
- ret = set_info(path, info);
- if (ret < 0)
- return ret;
- ret = fsync_dir(path);
- if (ret < 0)
- return ret;
-
- // Do the same for subdirs
- vector<string>::const_iterator iter;
- for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
- path.push_back(*iter);
- ret = init_split_folder(path, hash_level + 1);
- if (ret < 0)
- return ret;
- path.pop_back();
- }
- return 0;
-}
-
-int HashIndex::recursive_create_path(vector<string>& path, int level)
-{
- if (level == 0)
- return 0;
- for (int i = 0; i < 16; ++i) {
- path.push_back(to_hex(i));
- int ret = create_path(path);
- if (ret < 0 && ret != -EEXIST)
- return ret;
- ret = recursive_create_path(path, level - 1);
- if (ret < 0)
- return ret;
- path.pop_back();
- }
- return 0;
-}
-
-int HashIndex::recursive_remove(const vector<string> &path) {
- vector<string> subdirs;
- int r = list_subdirs(path, &subdirs);
- if (r < 0)
- return r;
- map<string, ghobject_t> objects;
- r = list_objects(path, 0, 0, &objects);
- if (r < 0)
- return r;
- if (!objects.empty())
- return -ENOTEMPTY;
- vector<string> subdir(path);
- for (vector<string>::iterator i = subdirs.begin();
- i != subdirs.end();
- ++i) {
- subdir.push_back(*i);
- r = recursive_remove(subdir);
- if (r < 0)
- return r;
- subdir.pop_back();
- }
- return remove_path(path);
-}
-
-int HashIndex::start_col_split(const vector<string> &path) {
- bufferlist bl;
- InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
- op_tag.encode(bl);
- int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
- if (r < 0)
- return r;
- return fsync_dir(vector<string>());
-}
-
-int HashIndex::start_split(const vector<string> &path) {
- bufferlist bl;
- InProgressOp op_tag(InProgressOp::SPLIT, path);
- op_tag.encode(bl);
- int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
- if (r < 0)
- return r;
- return fsync_dir(vector<string>());
-}
-
-int HashIndex::start_merge(const vector<string> &path) {
- bufferlist bl;
- InProgressOp op_tag(InProgressOp::MERGE, path);
- op_tag.encode(bl);
- int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
- if (r < 0)
- return r;
- return fsync_dir(vector<string>());
-}
-
-int HashIndex::end_split_or_merge(const vector<string> &path) {
- return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
-}
-
-int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
- bufferlist buf;
- int r = get_attr_path(path, SUBDIR_ATTR, buf);
- if (r < 0)
- return r;
- bufferlist::iterator bufiter = buf.begin();
- info->decode(bufiter);
- assert(path.size() == (unsigned)info->hash_level);
- return 0;
-}
-
-int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
- bufferlist buf;
- assert(path.size() == (unsigned)info.hash_level);
- info.encode(buf);
- return add_attr_path(path, SUBDIR_ATTR, buf);
-}
-
-bool HashIndex::must_merge(const subdir_info_s &info) {
- return (info.hash_level > 0 &&
- merge_threshold > 0 &&
- info.objs < (unsigned)merge_threshold &&
- info.subdirs == 0);
-}
-
-bool HashIndex::must_split(const subdir_info_s &info) {
- return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
- info.objs > ((unsigned)(abs(merge_threshold)) * 16 * split_multiplier));
-
-}
-
-int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
- return start_merge(path);
-}
-
-int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
- vector<string> dst = path;
- dst.pop_back();
- subdir_info_s dstinfo;
- int r, exists;
- r = path_exists(path, &exists);
- if (r < 0)
- return r;
- r = get_info(dst, &dstinfo);
- if (r < 0)
- return r;
- if (exists) {
- r = move_objects(path, dst);
- if (r < 0)
- return r;
- r = reset_attr(dst);
- if (r < 0)
- return r;
- r = remove_path(path);
- if (r < 0)
- return r;
- }
- if (must_merge(dstinfo)) {
- r = initiate_merge(dst, dstinfo);
- if (r < 0)
- return r;
- r = fsync_dir(dst);
- if (r < 0)
- return r;
- return complete_merge(dst, dstinfo);
- }
- r = fsync_dir(dst);
- if (r < 0)
- return r;
- return end_split_or_merge(path);
-}
-
-int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
- return start_split(path);
-}
-
-int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
- int level = info.hash_level;
- map<string, ghobject_t> objects;
- vector<string> dst = path;
- int r;
- dst.push_back("");
- r = list_objects(path, 0, 0, &objects);
- if (r < 0)
- return r;
- vector<string> subdirs_vec;
- r = list_subdirs(path, &subdirs_vec);
- if (r < 0)
- return r;
- set<string> subdirs;
- subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
- map<string, map<string, ghobject_t> > mapped;
- map<string, ghobject_t> moved;
- int num_moved = 0;
- for (map<string, ghobject_t>::iterator i = objects.begin();
- i != objects.end();
- ++i) {
- vector<string> new_path;
- get_path_components(i->second, &new_path);
- mapped[new_path[level]][i->first] = i->second;
- }
- for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
- i != mapped.end();
- ) {
- dst[level] = i->first;
-    /* If the info already exists, it must be correct;
-     * we may be picking up a partially finished split */
- subdir_info_s temp;
- // subdir has already been fully copied
- if (subdirs.count(i->first) && !get_info(dst, &temp)) {
- for (map<string, ghobject_t>::iterator j = i->second.begin();
- j != i->second.end();
- ++j) {
- moved[j->first] = j->second;
- num_moved++;
- objects.erase(j->first);
- }
- ++i;
- continue;
- }
-
- subdir_info_s info_new;
- info_new.objs = i->second.size();
- info_new.subdirs = 0;
- info_new.hash_level = level + 1;
- if (must_merge(info_new) && !subdirs.count(i->first)) {
- mapped.erase(i++);
- continue;
- }
-
- // Subdir doesn't yet exist
- if (!subdirs.count(i->first)) {
- info.subdirs += 1;
- r = create_path(dst);
- if (r < 0)
- return r;
- } // else subdir has been created but only partially copied
-
- for (map<string, ghobject_t>::iterator j = i->second.begin();
- j != i->second.end();
- ++j) {
- moved[j->first] = j->second;
- num_moved++;
- objects.erase(j->first);
- r = link_object(path, dst, j->second, j->first);
- // May be a partially finished split
- if (r < 0 && r != -EEXIST) {
- return r;
- }
- }
-
- r = fsync_dir(dst);
- if (r < 0)
- return r;
-
- // Presence of info must imply that all objects have been copied
- r = set_info(dst, info_new);
- if (r < 0)
- return r;
-
- r = fsync_dir(dst);
- if (r < 0)
- return r;
-
- ++i;
- }
- r = remove_objects(path, moved, &objects);
- if (r < 0)
- return r;
- info.objs = objects.size();
- r = reset_attr(path);
- if (r < 0)
- return r;
- r = fsync_dir(path);
- if (r < 0)
- return r;
- return end_split_or_merge(path);
-}
-
-void HashIndex::get_path_components(const ghobject_t &oid,
- vector<string> *path) {
- char buf[MAX_HASH_LEVEL + 1];
- snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
-
- // Path components are the hex characters of oid.hobj.hash, least
- // significant first
- for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
- path->push_back(string(&buf[i], 1));
- }
-}
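-
-// A minimal standalone sketch (illustrative, not part of HashIndex) of the
-// mapping implemented by get_path_components() above: the path components
-// are the hex digits of the 32-bit object hash, least significant first.
-// The hash value mirrors the example in HashIndex.h, where an object with
-// hash 0xA4CEE0D2 lives under the deepest existing prefix of 2/D/0/E/...
-static vector<string> demo_hash_to_path_components(uint32_t hash)
-{
-  char buf[9];
-  snprintf(buf, sizeof(buf), "%08X", hash);    // 0xA4CEE0D2 -> "A4CEE0D2"
-  vector<string> path;
-  for (int i = 7; i >= 0; --i)                 // least significant digit first
-    path.push_back(string(1, buf[i]));         // "2","D","0","E","E","C","4","A"
-  return path;
-}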
-
-string HashIndex::get_hash_str(uint32_t hash) {
- char buf[MAX_HASH_LEVEL + 1];
- snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
- string retval;
- for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
- retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
- }
- return retval;
-}
-
-string HashIndex::get_path_str(const ghobject_t &oid) {
- assert(!oid.is_max());
- return get_hash_str(oid.hobj.get_hash());
-}
-
-uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
- while (prefix.size() < sizeof(uint32_t) * 2) {
- prefix.push_back('0');
- }
- uint32_t hash;
- sscanf(prefix.c_str(), "%x", &hash);
- // nibble reverse
- hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
- hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
- hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
- return hash;
-}
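-
-// Worked round trip for the helpers above (illustrative): get_hash_str() and
-// get_path_str() turn hash 0x01234567 into "76543210" (least significant
-// digit first), and hash_prefix_to_hash("76543210") parses that back as
-// 0x76543210 and nibble-reverses it to recover 0x01234567. A shorter prefix
-// such as "7654" is zero-padded to "76540000" first and so maps to 0x00004567.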
-
-int HashIndex::get_path_contents_by_hash_bitwise(
- const vector<string> &path,
- const ghobject_t *next_object,
- set<string, CmpHexdigitStringBitwise> *hash_prefixes,
- set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
-{
- map<string, ghobject_t> rev_objects;
- int r;
- r = list_objects(path, 0, 0, &rev_objects);
- if (r < 0)
- return r;
- // bitwise sort
- for (map<string, ghobject_t>::iterator i = rev_objects.begin();
- i != rev_objects.end();
- ++i) {
- if (next_object && cmp_bitwise(i->second, *next_object) < 0)
- continue;
- string hash_prefix = get_path_str(i->second);
- hash_prefixes->insert(hash_prefix);
- objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
- }
- vector<string> subdirs;
- r = list_subdirs(path, &subdirs);
- if (r < 0)
- return r;
-
- // sort subdirs bitwise (by reversing hex digit nibbles)
- std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
-
- // Local to this function, we will convert the prefix strings
- // (previously simply the reversed hex digits) to also have each
- // digit's nibbles reversed. This will make the strings sort
- // bitwise.
- string cur_prefix;
- for (vector<string>::const_iterator i = path.begin();
- i != path.end();
- ++i) {
- cur_prefix.append(reverse_hexdigit_bits_string(*i));
- }
- string next_object_string;
- if (next_object)
- next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
- for (vector<string>::iterator i = subdirs.begin();
- i != subdirs.end();
- ++i) {
- string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
- if (next_object) {
- if (next_object->is_max())
- continue;
- if (candidate < next_object_string.substr(0, candidate.size()))
- continue;
- }
- // re-reverse the hex digit nibbles for the caller
- hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
- }
- return 0;
-}
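-
-// Illustrative sketch of the per-digit transform assumed above: per the
-// comments in this function, reverse_hexdigit_bits_string() (declared in
-// HashIndex.h) rewrites each hex digit with its four bits reversed, so that
-// a plain string compare of the transformed prefixes matches the bitwise
-// object sort. A hypothetical single-digit version might look like this:
-static char demo_reverse_hexdigit_bits(char c)
-{
-  int v = (c >= 'A') ? (c - 'A' + 10) : (c - '0');  // hex digit -> value
-  int rev = ((v & 0x1) << 3) | ((v & 0x2) << 1) |   // reverse the 4 bits
-            ((v & 0x4) >> 1) | ((v & 0x8) >> 3);
-  return (rev < 10) ? ('0' + rev) : ('A' + rev - 10);
-}
-// For example '1' (0001b) maps to '8' (1000b) and 'A' (1010b) to '5' (0101b),
-// so prefix "8" sorts before prefix "1" in bitwise order even though it
-// sorts after it nibblewise.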
-
-int HashIndex::get_path_contents_by_hash_nibblewise(
- const vector<string> &path,
- const ghobject_t *next_object,
- set<string> *hash_prefixes,
- set<pair<string, ghobject_t>, CmpPairNibblewise > *objects)
-{
- map<string, ghobject_t> rev_objects;
- int r;
- r = list_objects(path, 0, 0, &rev_objects);
- if (r < 0)
- return r;
-
- for (map<string, ghobject_t>::iterator i = rev_objects.begin();
- i != rev_objects.end();
- ++i) {
- string hash_prefix = get_path_str(i->second);
- if (next_object && cmp_nibblewise(i->second, *next_object) < 0)
- continue;
- hash_prefixes->insert(hash_prefix);
- objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
- }
-
- vector<string> subdirs;
- r = list_subdirs(path, &subdirs);
- if (r < 0)
- return r;
-
- // sort nibblewise (string sort of (reversed) hex digits)
- std::sort(subdirs.begin(), subdirs.end());
-
- string cur_prefix;
- for (vector<string>::const_iterator i = path.begin();
- i != path.end();
- ++i) {
- cur_prefix.append(*i);
- }
- string next_object_string;
- if (next_object)
- next_object_string = get_path_str(*next_object);
-
- for (vector<string>::iterator i = subdirs.begin();
- i != subdirs.end();
- ++i) {
- string candidate = cur_prefix + *i;
- if (next_object) {
- if (next_object->is_max())
- continue;
- if (candidate < next_object_string.substr(0, candidate.size()))
- continue;
- }
- hash_prefixes->insert(cur_prefix + *i);
- }
- return 0;
-}
-
-int HashIndex::list_by_hash(const vector<string> &path,
- const ghobject_t &end,
- bool sort_bitwise,
- int max_count,
- ghobject_t *next,
- vector<ghobject_t> *out)
-{
- assert(out);
- if (sort_bitwise)
- return list_by_hash_bitwise(path, end, max_count, next, out);
- else
- return list_by_hash_nibblewise(path, end, max_count, next, out);
-}
-
-int HashIndex::list_by_hash_bitwise(
- const vector<string> &path,
- const ghobject_t& end,
- int max_count,
- ghobject_t *next,
- vector<ghobject_t> *out)
-{
- vector<string> next_path = path;
- next_path.push_back("");
- set<string, CmpHexdigitStringBitwise> hash_prefixes;
- set<pair<string, ghobject_t>, CmpPairBitwise> objects;
- int r = get_path_contents_by_hash_bitwise(path,
- next,
- &hash_prefixes,
- &objects);
- if (r < 0)
- return r;
- for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
- i != hash_prefixes.end();
- ++i) {
- dout(20) << __func__ << " prefix " << *i << dendl;
- set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
- make_pair(*i, ghobject_t()));
- if (j == objects.end() || j->first != *i) {
- *(next_path.rbegin()) = *(i->rbegin());
- ghobject_t next_recurse;
- if (next)
- next_recurse = *next;
- r = list_by_hash_bitwise(next_path,
- end,
- max_count,
- &next_recurse,
- out);
-
- if (r < 0)
- return r;
- if (!next_recurse.is_max()) {
- if (next)
- *next = next_recurse;
- return 0;
- }
- } else {
- while (j != objects.end() && j->first == *i) {
- if (max_count > 0 && out->size() == (unsigned)max_count) {
- if (next)
- *next = j->second;
- return 0;
- }
- if (cmp_bitwise(j->second, end) >= 0) {
- if (next)
- *next = ghobject_t::get_max();
- return 0;
- }
- if (!next || cmp_bitwise(j->second, *next) >= 0) {
- dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
- out->push_back(j->second);
- }
- ++j;
- }
- }
- }
- if (next)
- *next = ghobject_t::get_max();
- return 0;
-}
-
-int HashIndex::list_by_hash_nibblewise(
- const vector<string> &path,
- const ghobject_t& end,
- int max_count,
- ghobject_t *next,
- vector<ghobject_t> *out)
-{
- vector<string> next_path = path;
- next_path.push_back("");
- set<string> hash_prefixes;
- set<pair<string, ghobject_t>, CmpPairNibblewise> objects;
- int r = get_path_contents_by_hash_nibblewise(path,
- next,
- &hash_prefixes,
- &objects);
- if (r < 0)
- return r;
- for (set<string>::iterator i = hash_prefixes.begin();
- i != hash_prefixes.end();
- ++i) {
- dout(20) << __func__ << " prefix " << *i << dendl;
- set<pair<string, ghobject_t>, CmpPairNibblewise >::iterator j =
- objects.lower_bound(make_pair(*i, ghobject_t()));
- if (j == objects.end() || j->first != *i) {
- *(next_path.rbegin()) = *(i->rbegin());
- ghobject_t next_recurse;
- if (next)
- next_recurse = *next;
- r = list_by_hash_nibblewise(next_path,
- end,
- max_count,
- &next_recurse,
- out);
-
- if (r < 0)
- return r;
- if (!next_recurse.is_max()) {
- if (next)
- *next = next_recurse;
- return 0;
- }
- } else {
- while (j != objects.end() && j->first == *i) {
- if (max_count > 0 && out->size() == (unsigned)max_count) {
- if (next)
- *next = j->second;
- return 0;
- }
- if (cmp_nibblewise(j->second, end) >= 0) {
- if (next)
- *next = ghobject_t::get_max();
- return 0;
- }
- if (!next || cmp_nibblewise(j->second, *next) >= 0) {
- out->push_back(j->second);
- }
- ++j;
- }
- }
- }
- if (next)
- *next = ghobject_t::get_max();
- return 0;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef CEPH_HASHINDEX_H
-#define CEPH_HASHINDEX_H
-
-#include "include/buffer_fwd.h"
-#include "include/encoding.h"
-#include "LFNIndex.h"
-
-extern string reverse_hexdigit_bits_string(string l);
-
-/**
- * Implements collection prehashing.
- *
- * @verbatim
- * (root) - 0 - 0
- * - 1
- * - E
- * - 1
- * - 2 - D - 0
- * .
- * .
- * .
- * - F - 0
- * @endverbatim
- *
- * A file is located at the longest existing directory from the root
- * given by the hex characters in the hash beginning with the least
- * significant.
- *
- * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
- * would be located in (root)/2/D/0/
- *
- * Subdirectories are created when the number of objects in a directory
- * exceeds abs(merge_threshold) * 16 * split_multiplier. The number of objects in a directory
- * is encoded as subdir_info_s in an xattr on the directory.
- */
-class HashIndex : public LFNIndex {
-private:
- /// Attribute name for storing subdir info @see subdir_info_s
- static const string SUBDIR_ATTR;
- /// Attribute name for storing in progress op tag
- static const string IN_PROGRESS_OP_TAG;
- /// Size (bits) in object hash
- static const int PATH_HASH_LEN = 32;
- /// Max length of hashed path
- static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4);
-
- /**
-   * Merges occur when the number of objects in a directory drops below
-   * merge_threshold and splits occur when the number of objects
-   * exceeds 16 * abs(merge_threshold) * split_multiplier.
-   * Note that merging never occurs if merge_threshold is not positive.
- */
- int merge_threshold;
- int split_multiplier;
-
- /// Encodes current subdir state for determining when to split/merge.
- struct subdir_info_s {
- uint64_t objs; ///< Objects in subdir.
- uint32_t subdirs; ///< Subdirs in subdir.
- uint32_t hash_level; ///< Hashlevel of subdir.
-
- subdir_info_s() : objs(0), subdirs(0), hash_level(0) {}
-
- void encode(bufferlist &bl) const
- {
- __u8 v = 1;
- ::encode(v, bl);
- ::encode(objs, bl);
- ::encode(subdirs, bl);
- ::encode(hash_level, bl);
- }
-
- void decode(bufferlist::iterator &bl)
- {
- __u8 v;
- ::decode(v, bl);
- assert(v == 1);
- ::decode(objs, bl);
- ::decode(subdirs, bl);
- ::decode(hash_level, bl);
- }
- };
-
- /// Encodes in progress split or merge
- struct InProgressOp {
- static const int SPLIT = 0;
- static const int MERGE = 1;
- static const int COL_SPLIT = 2;
- int op;
- vector<string> path;
-
- InProgressOp(int op, const vector<string> &path)
- : op(op), path(path) {}
-
- InProgressOp(bufferlist::iterator &bl) {
- decode(bl);
- }
-
- bool is_split() const { return op == SPLIT; }
- bool is_col_split() const { return op == COL_SPLIT; }
- bool is_merge() const { return op == MERGE; }
-
- void encode(bufferlist &bl) const {
- __u8 v = 1;
- ::encode(v, bl);
- ::encode(op, bl);
- ::encode(path, bl);
- }
-
- void decode(bufferlist::iterator &bl) {
- __u8 v;
- ::decode(v, bl);
- assert(v == 1);
- ::decode(op, bl);
- ::decode(path, bl);
- }
- };
-
-
-public:
- /// Constructor.
- HashIndex(
- coll_t collection, ///< [in] Collection
- const char *base_path, ///< [in] Path to the index root.
-    int merge_at, ///< [in] Merge threshold.
-    int split_multiple, ///< [in] Split multiplier.
- uint32_t index_version,///< [in] Index version
- double retry_probability=0) ///< [in] retry probability
- : LFNIndex(collection, base_path, index_version, retry_probability),
- merge_threshold(merge_at),
- split_multiplier(split_multiple) {}
-
- /// @see CollectionIndex
- uint32_t collection_version() { return index_version; }
-
- /// @see CollectionIndex
- int cleanup();
-
- /// @see CollectionIndex
- int prep_delete();
-
- /// @see CollectionIndex
- int _split(
- uint32_t match,
- uint32_t bits,
- CollectionIndex* dest
- );
-
-protected:
- int _init();
-
- int _created(
- const vector<string> &path,
- const ghobject_t &oid,
- const string &mangled_name
- );
- int _remove(
- const vector<string> &path,
- const ghobject_t &oid,
- const string &mangled_name
- );
- int _lookup(
- const ghobject_t &oid,
- vector<string> *path,
- string *mangled_name,
- int *hardlink
- );
-
- /**
- * Pre-hash the collection to create folders according to the expected number
- * of objects in this collection.
- */
- int _pre_hash_collection(
- uint32_t pg_num,
- uint64_t expected_num_objs
- );
-
- int _collection_list_partial(
- const ghobject_t &start,
- const ghobject_t &end,
- bool sort_bitwise,
- int max_count,
- vector<ghobject_t> *ls,
- ghobject_t *next
- );
-private:
- /// Recursively remove path and its subdirs
- int recursive_remove(
- const vector<string> &path ///< [in] path to remove
- ); /// @return Error Code, 0 on success
- /// Tag root directory at beginning of col_split
- int start_col_split(
- const vector<string> &path ///< [in] path to split
- ); ///< @return Error Code, 0 on success
- /// Tag root directory at beginning of split
- int start_split(
- const vector<string> &path ///< [in] path to split
- ); ///< @return Error Code, 0 on success
-  /// Tag root directory at beginning of merge
- int start_merge(
- const vector<string> &path ///< [in] path to merge
- ); ///< @return Error Code, 0 on success
- /// Remove tag at end of split or merge
- int end_split_or_merge(
- const vector<string> &path ///< [in] path to split or merged
- ); ///< @return Error Code, 0 on success
- /// Gets info from the xattr on the subdir represented by path
- int get_info(
- const vector<string> &path, ///< [in] Path from which to read attribute.
- subdir_info_s *info ///< [out] Attribute value
- ); /// @return Error Code, 0 on success
-
- /// Sets info to the xattr on the subdir represented by path
- int set_info(
- const vector<string> &path, ///< [in] Path on which to set attribute.
- const subdir_info_s &info ///< [in] Value to set
- ); /// @return Error Code, 0 on success
-
-  /// Encapsulates logic for when to merge.
- bool must_merge(
- const subdir_info_s &info ///< [in] Info to check
- ); /// @return True if info must be merged, False otherwise
-
-  /// Encapsulates logic for when to split.
- bool must_split(
- const subdir_info_s &info ///< [in] Info to check
- ); /// @return True if info must be split, False otherwise
-
- /// Initiates merge
- int initiate_merge(
- const vector<string> &path, ///< [in] Subdir to merge
- subdir_info_s info ///< [in] Info attached to path
- ); /// @return Error Code, 0 on success
-
- /// Completes merge
- int complete_merge(
- const vector<string> &path, ///< [in] Subdir to merge
- subdir_info_s info ///< [in] Info attached to path
- ); /// @return Error Code, 0 on success
-
- /// Resets attr to match actual subdir contents
- int reset_attr(
- const vector<string> &path ///< [in] path to cleanup
- );
-
- /// Initiate Split
- int initiate_split(
- const vector<string> &path, ///< [in] Subdir to split
- subdir_info_s info ///< [in] Info attached to path
- ); /// @return Error Code, 0 on success
-
- /// Completes Split
- int complete_split(
- const vector<string> &path, ///< [in] Subdir to split
- subdir_info_s info ///< [in] Info attached to path
- ); /// @return Error Code, 0 on success
-
- /// Determine path components from hoid hash
- void get_path_components(
- const ghobject_t &oid, ///< [in] Object for which to get path components
- vector<string> *path ///< [out] Path components for hoid.
- );
-
- /// Pre-hash and split folders to avoid runtime splitting
- /// according to the given expected object number.
- int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs);
-
- /// Initialize the folder (dir info) with the given hash
- /// level and number of its subdirs.
- int init_split_folder(vector<string> &path, uint32_t hash_level);
-
- /// do collection split for path
- static int col_split_level(
- HashIndex &from, ///< [in] from index
- HashIndex &dest, ///< [in] to index
- const vector<string> &path, ///< [in] path to split
- uint32_t bits, ///< [in] num bits to match
- uint32_t match, ///< [in] bits to match
- unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred
- );
-
-
- /**
- * Get string representation of ghobject_t/hash
- *
- * e.g: 0x01234567 -> "76543210"
- */
- static string get_path_str(
- const ghobject_t &oid ///< [in] Object to get hash string for
- ); ///< @return Hash string for hoid.
-
- /// Get string from hash, @see get_path_str
- static string get_hash_str(
- uint32_t hash ///< [in] Hash to convert to a string.
- ); ///< @return String representation of hash
-
-  /// Get hash from hash prefix string (inverse of get_hash_str), e.g. "7654" -> 0x00004567
- static uint32_t hash_prefix_to_hash(
- string prefix ///< [in] string to convert
- ); ///< @return Hash
-
- /// Get hash mod from path
- static void path_to_hobject_hash_prefix(
- const vector<string> &path,///< [in] path to convert
- uint32_t *bits, ///< [out] bits
- uint32_t *hash ///< [out] hash
- ) {
- string hash_str;
- for (vector<string>::const_iterator i = path.begin();
- i != path.end();
- ++i) {
- hash_str.push_back(*i->begin());
- }
- uint32_t rev_hash = hash_prefix_to_hash(hash_str);
- if (hash)
- *hash = rev_hash;
- if (bits)
- *bits = path.size() * 4;
- }
-
-  /// Calculate the number of significant bits in n.
- static int calc_num_bits(uint64_t n) {
- int ret = 0;
- while (n > 0) {
- n = n >> 1;
- ret++;
- }
- return ret;
- }
-
- /// Convert a number to hex string (upper case).
- static string to_hex(int n) {
- assert(n >= 0 && n < 16);
- char c = (n <= 9 ? ('0' + n) : ('A' + n - 10));
- string str;
- str.append(1, c);
- return str;
- }
-
- struct CmpPairNibblewise {
- bool operator()(const pair<string, ghobject_t>& l,
- const pair<string, ghobject_t>& r)
- {
- if (l.first < r.first)
- return true;
- if (l.first > r.first)
- return false;
- if (cmp_nibblewise(l.second, r.second) < 0)
- return true;
- return false;
- }
- };
-
- struct CmpPairBitwise {
- bool operator()(const pair<string, ghobject_t>& l,
- const pair<string, ghobject_t>& r)
- {
- if (l.first < r.first)
- return true;
- if (l.first > r.first)
- return false;
- if (cmp_bitwise(l.second, r.second) < 0)
- return true;
- return false;
- }
- };
-
- struct CmpHexdigitStringBitwise {
- bool operator()(const string& l, const string& r) {
- return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r);
- }
- };
-
- /// Get path contents by hash
- int get_path_contents_by_hash_bitwise(
- const vector<string> &path, /// [in] Path to list
- const ghobject_t *next_object, /// [in] list > *next_object
- set<string, CmpHexdigitStringBitwise> *hash_prefixes, /// [out] prefixes in dir
- set<pair<string, ghobject_t>, CmpPairBitwise> *objects /// [out] objects
- );
- int get_path_contents_by_hash_nibblewise(
- const vector<string> &path, /// [in] Path to list
- const ghobject_t *next_object, /// [in] list > *next_object
- set<string> *hash_prefixes, /// [out] prefixes in dir
- set<pair<string, ghobject_t>, CmpPairNibblewise> *objects /// [out] objects
- );
-
- /// List objects in collection in ghobject_t order
- int list_by_hash(
- const vector<string> &path, /// [in] Path to list
- const ghobject_t &end, /// [in] List only objects < end
- bool sort_bitwise, /// [in] sort bitwise
- int max_count, /// [in] List at most max_count
- ghobject_t *next, /// [in,out] List objects >= *next
- vector<ghobject_t> *out /// [out] Listed objects
- ); ///< @return Error Code, 0 on success
- /// List objects in collection in ghobject_t order
- int list_by_hash_bitwise(
- const vector<string> &path, /// [in] Path to list
- const ghobject_t &end, /// [in] List only objects < end
- int max_count, /// [in] List at most max_count
- ghobject_t *next, /// [in,out] List objects >= *next
- vector<ghobject_t> *out /// [out] Listed objects
- ); ///< @return Error Code, 0 on success
- int list_by_hash_nibblewise(
- const vector<string> &path, /// [in] Path to list
- const ghobject_t &end, /// [in] List only objects < end
- int max_count, /// [in] List at most max_count
- ghobject_t *next, /// [in,out] List objects >= *next
- vector<ghobject_t> *out /// [out] Listed objects
- ); ///< @return Error Code, 0 on success
-
-  /// Create the given levels of sub directories from the given root.
-  /// The contents of *path* are left unchanged when this function returns.
- int recursive_create_path(vector<string>& path, int level);
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "include/memory.h"
-#include "include/unordered_map.h"
-
-#if defined(__FreeBSD__)
-#include <sys/param.h>
-#endif
-
-#include <errno.h>
-
-#include "common/Mutex.h"
-#include "common/Cond.h"
-#include "common/config.h"
-#include "common/debug.h"
-#include "include/buffer.h"
-
-#include "IndexManager.h"
-#include "HashIndex.h"
-#include "CollectionIndex.h"
-
-#include "chain_xattr.h"
-
-static int set_version(const char *path, uint32_t version) {
- bufferlist bl;
- ::encode(version, bl);
- return chain_setxattr(path, "user.cephos.collection_version", bl.c_str(),
- bl.length(), true);
-}
-
-static int get_version(const char *path, uint32_t *version) {
- bufferptr bp(PATH_MAX);
- int r = chain_getxattr(path, "user.cephos.collection_version",
- bp.c_str(), bp.length());
-  if (r < 0) {
-    if (r == -ENOENT) {
-      // no version attr yet; treat as a legacy (version 0) collection
-      *version = 0;
-      return 0;
-    } else {
-      return r;
-    }
-  }
- bp.set_length(r);
- bufferlist bl;
- bl.push_back(bp);
- bufferlist::iterator i = bl.begin();
- ::decode(*version, i);
- return 0;
-}
-
-IndexManager::~IndexManager() {
-
- for (ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.begin();
- it != col_indices.end(); ++it) {
-
- delete it->second;
- it->second = NULL;
- }
- col_indices.clear();
-}
-
-
-int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
- Mutex::Locker l(lock);
- int r = set_version(path, version);
- if (r < 0)
- return r;
- HashIndex index(c, path, g_conf->filestore_merge_threshold,
- g_conf->filestore_split_multiple,
- version,
- g_conf->filestore_index_retry_probability);
- return index.init();
-}
-
-int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) {
- if (upgrade) {
- // Need to check the collection generation
- int r;
- uint32_t version = 0;
- r = get_version(path, &version);
- if (r < 0)
- return r;
-
- switch (version) {
- case CollectionIndex::FLAT_INDEX_TAG:
- case CollectionIndex::HASH_INDEX_TAG: // fall through
- case CollectionIndex::HASH_INDEX_TAG_2: // fall through
- case CollectionIndex::HOBJECT_WITH_POOL: {
- // Must be a HashIndex
- *index = new HashIndex(c, path, g_conf->filestore_merge_threshold,
- g_conf->filestore_split_multiple, version);
- return 0;
- }
- default: assert(0);
- }
-
- } else {
- // No need to check
- *index = new HashIndex(c, path, g_conf->filestore_merge_threshold,
- g_conf->filestore_split_multiple,
- CollectionIndex::HOBJECT_WITH_POOL,
- g_conf->filestore_index_retry_probability);
- return 0;
- }
-}
-
-int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) {
-
- Mutex::Locker l(lock);
- ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c);
- if (it == col_indices.end()) {
- char path[PATH_MAX];
- snprintf(path, sizeof(path), "%s/current/%s", baseDir.c_str(), c.to_str().c_str());
- CollectionIndex* colIndex = NULL;
- int r = build_index(c, path, &colIndex);
- if (r < 0)
- return r;
- col_indices[c] = colIndex;
- index->index = colIndex;
- } else {
- index->index = it->second;
- }
- return 0;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-#ifndef OS_INDEXMANAGER_H
-#define OS_INDEXMANAGER_H
-
-#include "include/memory.h"
-#include "include/unordered_map.h"
-
-#include "common/Mutex.h"
-#include "common/Cond.h"
-#include "common/config.h"
-#include "common/debug.h"
-
-#include "CollectionIndex.h"
-#include "HashIndex.h"
-
-
-/// Public type for Index
-struct Index {
- CollectionIndex *index;
-
- Index() : index(NULL) {}
- Index(CollectionIndex* index) : index(index) {}
-
- CollectionIndex *operator->() { return index; }
- CollectionIndex &operator*() { return *index; }
-};
-
-
-/**
- * Encapsulates mutual exclusion for CollectionIndexes.
- *
- * Allowing a modification (removal or addition of an object) to occur
- * while a read is occurring (lookup of an object's path and use of
- * that path) may result in the path becoming invalid. Thus, during
- * the lifetime of a CollectionIndex object and any paths returned
- * by it, no other concurrent accesses may be allowed.
- * This is enforced by using CollectionIndex::access_lock
- */
-class IndexManager {
- Mutex lock; ///< Lock for Index Manager
- bool upgrade;
- ceph::unordered_map<coll_t, CollectionIndex* > col_indices;
-
- /**
- * Index factory
- *
- * Encapsulates logic for handling legacy FileStore
- * layouts
- *
- * @param [in] c Collection for which to get index
- * @param [in] path Path to collection
- * @param [out] index Index for c
- * @return error code
- */
- int build_index(coll_t c, const char *path, CollectionIndex **index);
-public:
- /// Constructor
- IndexManager(bool upgrade) : lock("IndexManager lock"),
- upgrade(upgrade) {}
-
- ~IndexManager();
-
- /**
- * Reserve and return index for c
- *
- * @param [in] c Collection for which to get index
- * @param [in] baseDir base directory of collections
- * @param [out] index Index for c
- * @return error code
- */
- int get_index(coll_t c, const string& baseDir, Index *index);
-
- /**
- * Initialize index for collection c at path
- *
- * @param [in] c Collection for which to init Index
- * @param [in] path Path to collection
- * @param [in] filestore_version version of containing FileStore
- * @return error code
- */
- int init_index(coll_t c, const char *path, uint32_t filestore_version);
-};
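-
-/**
- * Hypothetical usage sketch (the names, collection and base directory are
- * illustrative only, not taken from this file, and it assumes CollectionIndex
- * exposes the lookup()/IndexedPath interface that LFNIndex implements):
- * fetch the cached index for a collection and resolve an object through it.
- * Per the comment above, the caller is responsible for serializing access
- * via CollectionIndex::access_lock for as long as any returned path is used.
- */
-inline int demo_index_lookup(IndexManager &mgr, coll_t cid,
-                             const string &basedir, const ghobject_t &oid)
-{
-  Index idx;
-  int r = mgr.get_index(cid, basedir, &idx);   // builds or reuses a HashIndex
-  if (r < 0)
-    return r;
-  CollectionIndex::IndexedPath path;
-  int hardlink = 0;
-  return idx->lookup(oid, &path, &hardlink);   // maps oid to its on-disk path
-}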
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_JOURNAL_H
-#define CEPH_JOURNAL_H
-
-#include <errno.h>
-
-#include "include/buffer_fwd.h"
-#include "include/Context.h"
-#include "common/Finisher.h"
-#include "common/TrackedOp.h"
-#include "os/ObjectStore.h"
-
-class PerfCounters;
-
-class Journal {
-protected:
- uuid_d fsid;
- Finisher *finisher;
-public:
- PerfCounters *logger;
-protected:
- Cond *do_sync_cond;
- bool wait_on_full;
-
-public:
- Journal(uuid_d f, Finisher *fin, Cond *c=0) :
- fsid(f), finisher(fin), logger(NULL),
- do_sync_cond(c),
- wait_on_full(false) { }
- virtual ~Journal() { }
-
- virtual int check() = 0; ///< check if journal appears valid
- virtual int create() = 0; ///< create a fresh journal
- virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal
- virtual void close() = 0; ///< close an open journal
-
- virtual void flush() = 0;
- virtual void throttle() = 0;
-
- virtual int dump(ostream& out) { return -EOPNOTSUPP; }
-
- void set_wait_on_full(bool b) { wait_on_full = b; }
-
- // writes
- virtual bool is_writeable() = 0;
- virtual int make_writeable() = 0;
- virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
- Context *oncommit,
- TrackedOpRef osd_op = TrackedOpRef()) = 0;
- virtual void commit_start(uint64_t seq) = 0;
- virtual void committed_thru(uint64_t seq) = 0;
-
- /// Read next journal entry - asserts on invalid journal
- virtual bool read_entry(
- bufferlist &bl, ///< [out] payload on successful read
- uint64_t &seq ///< [in,out] sequence number on last successful read
- ) = 0; ///< @return true on successful read, false on journal end
-
- virtual bool should_commit_now() = 0;
-
- virtual int prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl) = 0;
-
- // reads/recovery
-
-};
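-
-/**
- * A minimal no-op Journal sketch, hypothetical and for illustration only
- * (the real backend in this tree is FileJournal): it shows which pure
- * virtuals a concrete journal must provide and what a "journal-less" stub
- * would do for each of them.
- */
-class NullJournal : public Journal {
-public:
-  NullJournal(uuid_d f, Finisher *fin) : Journal(f, fin) {}
-  int check() { return 0; }                    // nothing on disk to validate
-  int create() { return 0; }                   // nothing to create
-  int open(uint64_t fs_op_seq) { return 0; }
-  void close() {}
-  void flush() {}
-  void throttle() {}
-  bool is_writeable() { return true; }
-  int make_writeable() { return 0; }
-  void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
-                    Context *oncommit, TrackedOpRef osd_op) {
-    // "commit" immediately; a real journal queues the entry and fsyncs first
-    if (oncommit)
-      finisher->queue(oncommit, 0);
-  }
-  void commit_start(uint64_t seq) {}
-  void committed_thru(uint64_t seq) {}
-  bool read_entry(bufferlist &bl, uint64_t &seq) { return false; }  // empty journal
-  bool should_commit_now() { return true; }
-  int prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl) { return 0; }
-};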
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-
-#include "JournalingObjectStore.h"
-
-#include "common/errno.h"
-#include "common/debug.h"
-
-#define dout_subsys ceph_subsys_journal
-#undef dout_prefix
-#define dout_prefix *_dout << "journal "
-
-
-
-void JournalingObjectStore::journal_start()
-{
- dout(10) << "journal_start" << dendl;
- finisher.start();
-}
-
-void JournalingObjectStore::journal_stop()
-{
- dout(10) << "journal_stop" << dendl;
- finisher.stop();
-}
-
-// A journal_replay() makes the journal writeable; this closes that out.
-void JournalingObjectStore::journal_write_close()
-{
- if (journal) {
- journal->close();
- delete journal;
- journal = 0;
- }
- apply_manager.reset();
-}
-
-int JournalingObjectStore::journal_replay(uint64_t fs_op_seq)
-{
- dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl;
-
- if (g_conf->journal_replay_from) {
- dout(0) << "journal_replay forcing replay from " << g_conf->journal_replay_from
- << " instead of " << fs_op_seq << dendl;
- // the previous op is the last one committed
- fs_op_seq = g_conf->journal_replay_from - 1;
- }
-
- uint64_t op_seq = fs_op_seq;
- apply_manager.init_seq(fs_op_seq);
-
- if (!journal) {
- submit_manager.set_op_seq(op_seq);
- return 0;
- }
-
- int err = journal->open(op_seq);
- if (err < 0) {
- dout(3) << "journal_replay open failed with "
- << cpp_strerror(err) << dendl;
- delete journal;
- journal = 0;
- return err;
- }
-
- replaying = true;
-
- int count = 0;
- while (1) {
- bufferlist bl;
- uint64_t seq = op_seq + 1;
- if (!journal->read_entry(bl, seq)) {
- dout(3) << "journal_replay: end of journal, done." << dendl;
- break;
- }
-
- if (seq <= op_seq) {
- dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << op_seq << dendl;
- continue;
- }
- assert(op_seq == seq-1);
-
- dout(3) << "journal_replay: applying op seq " << seq << dendl;
- bufferlist::iterator p = bl.begin();
- list<Transaction*> tls;
- while (!p.end()) {
- Transaction *t = new Transaction(p);
- tls.push_back(t);
- }
-
- apply_manager.op_apply_start(seq);
- int r = do_transactions(tls, seq);
- apply_manager.op_apply_finish(seq);
-
-    op_seq = seq;
-    count++;
-
- while (!tls.empty()) {
- delete tls.front();
- tls.pop_front();
- }
-
- dout(3) << "journal_replay: r = " << r << ", op_seq now " << op_seq << dendl;
- }
-
- replaying = false;
-
- submit_manager.set_op_seq(op_seq);
-
- // done reading, make writeable.
- err = journal->make_writeable();
- if (err < 0)
- return err;
-
- return count;
-}
-
-
-// ------------------------------------
-
-uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op)
-{
- Mutex::Locker l(apply_lock);
- while (blocked) {
- // note: this only happens during journal replay
- dout(10) << "op_apply_start blocked, waiting" << dendl;
- blocked_cond.Wait(apply_lock);
- }
- dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> " << (open_ops+1) << dendl;
- assert(!blocked);
- assert(op > committed_seq);
- open_ops++;
- return op;
-}
-
-void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op)
-{
- Mutex::Locker l(apply_lock);
- dout(10) << "op_apply_finish " << op << " open_ops " << open_ops
- << " -> " << (open_ops-1)
- << ", max_applied_seq " << max_applied_seq << " -> " << MAX(op, max_applied_seq)
- << dendl;
- --open_ops;
- assert(open_ops >= 0);
-
- // signal a blocked commit_start (only needed during journal replay)
- if (blocked) {
- blocked_cond.Signal();
- }
-
-  // there can be multiple applies in flight; track the max seq we see.
-  // note that we can't _read_ this value and learn anything meaningful
-  // unless/until we've quiesced all in-flight applies.
- if (op > max_applied_seq)
- max_applied_seq = op;
-}
-
-uint64_t JournalingObjectStore::SubmitManager::op_submit_start()
-{
- lock.Lock();
- uint64_t op = ++op_seq;
- dout(10) << "op_submit_start " << op << dendl;
- return op;
-}
-
-void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op)
-{
- dout(10) << "op_submit_finish " << op << dendl;
- if (op != op_submitted + 1) {
- dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
- << ", OUT OF ORDER" << dendl;
- assert(0 == "out of order op_submit_finish");
- }
- op_submitted = op;
- lock.Unlock();
-}
-
-
-// ------------------------------------------
-
-void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c)
-{
- Mutex::Locker l(com_lock);
- assert(c);
- commit_waiters[op].push_back(c);
-}
-
-bool JournalingObjectStore::ApplyManager::commit_start()
-{
- bool ret = false;
-
- uint64_t _committing_seq = 0;
- {
- Mutex::Locker l(apply_lock);
- dout(10) << "commit_start max_applied_seq " << max_applied_seq
- << ", open_ops " << open_ops
- << dendl;
- blocked = true;
- while (open_ops > 0) {
- dout(10) << "commit_start waiting for " << open_ops << " open ops to drain" << dendl;
- blocked_cond.Wait(apply_lock);
- }
- assert(open_ops == 0);
- dout(10) << "commit_start blocked, all open_ops have completed" << dendl;
- {
- Mutex::Locker l(com_lock);
- if (max_applied_seq == committed_seq) {
- dout(10) << "commit_start nothing to do" << dendl;
- blocked = false;
- assert(commit_waiters.empty());
- goto out;
- }
-
- _committing_seq = committing_seq = max_applied_seq;
-
- dout(10) << "commit_start committing " << committing_seq
- << ", still blocked" << dendl;
- }
- }
- ret = true;
-
- out:
- if (journal)
- journal->commit_start(_committing_seq); // tell the journal too
- return ret;
-}
-
-void JournalingObjectStore::ApplyManager::commit_started()
-{
- Mutex::Locker l(apply_lock);
- // allow new ops. (underlying fs should now be committing all prior ops)
- dout(10) << "commit_started committing " << committing_seq << ", unblocking" << dendl;
- blocked = false;
- blocked_cond.Signal();
-}
-
-void JournalingObjectStore::ApplyManager::commit_finish()
-{
- Mutex::Locker l(com_lock);
- dout(10) << "commit_finish thru " << committing_seq << dendl;
-
- if (journal)
- journal->committed_thru(committing_seq);
-
- committed_seq = committing_seq;
-
- map<version_t, vector<Context*> >::iterator p = commit_waiters.begin();
- while (p != commit_waiters.end() &&
- p->first <= committing_seq) {
- finisher.queue(p->second);
- commit_waiters.erase(p++);
- }
-}
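-
-// The expected calling sequence for the three methods above, as driven by an
-// ObjectStore sync thread (e.g. FileStore's); this is an illustrative summary,
-// not code from this file:
-//   1. commit_start()   - block new applies and wait for open_ops to drain,
-//                         then snapshot max_applied_seq as committing_seq;
-//                         returns false when there is nothing new to commit.
-//   2. commit_started() - called once the backing filesystem has begun
-//                         committing everything up to committing_seq; this
-//                         unblocks appliers again.
-//   3. (the caller syncs the filesystem or finalizes its checkpoint)
-//   4. commit_finish()  - records committing_seq as committed_seq, tells the
-//                         journal it may trim through that seq, and queues
-//                         any commit waiters registered via add_waiter().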
-
-void JournalingObjectStore::_op_journal_transactions(
- bufferlist& tbl, uint32_t orig_len, uint64_t op,
- Context *onjournal, TrackedOpRef osd_op)
-{
- if (osd_op.get())
- dout(10) << "op_journal_transactions " << op << " reqid_t "
- << (static_cast<OpRequest *>(osd_op.get()))->get_reqid() << dendl;
- else
- dout(10) << "op_journal_transactions " << op << dendl;
-
- if (journal && journal->is_writeable()) {
- journal->submit_entry(op, tbl, orig_len, onjournal, osd_op);
- } else if (onjournal) {
- apply_manager.add_waiter(op, onjournal);
- }
-}
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef CEPH_JOURNALINGOBJECTSTORE_H
-#define CEPH_JOURNALINGOBJECTSTORE_H
-
-#include "ObjectStore.h"
-#include "Journal.h"
-#include "FileJournal.h"
-#include "common/RWLock.h"
-
-class JournalingObjectStore : public ObjectStore {
-protected:
- Journal *journal;
- Finisher finisher;
-
-
- class SubmitManager {
- Mutex lock;
- uint64_t op_seq;
- uint64_t op_submitted;
- public:
- SubmitManager() :
- lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context),
- op_seq(0), op_submitted(0)
- {}
- uint64_t op_submit_start();
- void op_submit_finish(uint64_t op);
- void set_op_seq(uint64_t seq) {
- Mutex::Locker l(lock);
- op_submitted = op_seq = seq;
- }
- uint64_t get_op_seq() {
- return op_seq;
- }
- } submit_manager;
-
- class ApplyManager {
- Journal *&journal;
- Finisher &finisher;
-
- Mutex apply_lock;
- bool blocked;
- Cond blocked_cond;
- int open_ops;
- uint64_t max_applied_seq;
-
- Mutex com_lock;
- map<version_t, vector<Context*> > commit_waiters;
- uint64_t committing_seq, committed_seq;
-
- public:
- ApplyManager(Journal *&j, Finisher &f) :
- journal(j), finisher(f),
- apply_lock("JOS::ApplyManager::apply_lock", false, true, false, g_ceph_context),
- blocked(false),
- open_ops(0),
- max_applied_seq(0),
- com_lock("JOS::ApplyManager::com_lock", false, true, false, g_ceph_context),
- committing_seq(0), committed_seq(0) {}
- void reset() {
- assert(open_ops == 0);
- assert(blocked == false);
- max_applied_seq = 0;
- committing_seq = 0;
- committed_seq = 0;
- }
- void add_waiter(uint64_t, Context*);
- uint64_t op_apply_start(uint64_t op);
- void op_apply_finish(uint64_t op);
- bool commit_start();
- void commit_started();
- void commit_finish();
- bool is_committing() {
- Mutex::Locker l(com_lock);
- return committing_seq != committed_seq;
- }
- uint64_t get_committed_seq() {
- Mutex::Locker l(com_lock);
- return committed_seq;
- }
- uint64_t get_committing_seq() {
- Mutex::Locker l(com_lock);
- return committing_seq;
- }
- void init_seq(uint64_t fs_op_seq) {
- {
- Mutex::Locker l(com_lock);
- committed_seq = fs_op_seq;
- committing_seq = fs_op_seq;
- }
- {
- Mutex::Locker l(apply_lock);
- max_applied_seq = fs_op_seq;
- }
- }
- } apply_manager;
-
- bool replaying;
-
-protected:
- void journal_start();
- void journal_stop();
- void journal_write_close();
- int journal_replay(uint64_t fs_op_seq);
-
- void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op,
- Context *onjournal, TrackedOpRef osd_op);
-
- virtual int do_transactions(list<ObjectStore::Transaction*>& tls, uint64_t op_seq) = 0;
-
-public:
- bool is_committing() {
- return apply_manager.is_committing();
- }
- uint64_t get_committed_seq() {
- return apply_manager.get_committed_seq();
- }
-
-public:
- JournalingObjectStore(const std::string& path)
- : ObjectStore(path),
- journal(NULL),
- finisher(g_ceph_context, "JournalObjectStore"),
- apply_manager(journal, finisher),
- replaying(false) {}
-
- ~JournalingObjectStore() {
- }
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include <string>
-#include <map>
-#include <set>
-#include <vector>
-#include <errno.h>
-#include <string.h>
-
-#if defined(__FreeBSD__)
-#include <sys/param.h>
-#endif
-
-#include "osd/osd_types.h"
-#include "include/object.h"
-#include "common/config.h"
-#include "common/debug.h"
-#include "include/buffer.h"
-#include "common/ceph_crypto.h"
-#include "include/compat.h"
-#include "chain_xattr.h"
-
-#include "LFNIndex.h"
-using ceph::crypto::SHA1;
-
-#define dout_subsys ceph_subsys_filestore
-#undef dout_prefix
-#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") "
-
-
-const string LFNIndex::LFN_ATTR = "user.cephos.lfn";
-const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash.";
-const string LFNIndex::SUBDIR_PREFIX = "DIR_";
-const string LFNIndex::FILENAME_COOKIE = "long";
-const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN -
- FILENAME_COOKIE.size() -
- FILENAME_EXTRA;
-void LFNIndex::maybe_inject_failure()
-{
- if (error_injection_enabled) {
- if (current_failure > last_failure &&
- (((double)(rand() % 10000))/((double)(10000))
- < error_injection_probability)) {
- last_failure = current_failure;
- current_failure = 0;
- throw RetryException();
- }
- ++current_failure;
- }
-}
-
-// Helper to close fd's when we leave scope. This is useful when used
-// in combination with RetryException, thrown by the above.
-struct FDCloser {
- int fd;
- FDCloser(int f) : fd(f) {}
- ~FDCloser() {
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- }
-};
-
-
-/* Public methods */
-
-
-int LFNIndex::init()
-{
- return _init();
-}
-
-int LFNIndex::created(const ghobject_t &oid, const char *path)
-{
- WRAP_RETRY(
- vector<string> path_comp;
- string short_name;
- r = decompose_full_path(path, &path_comp, 0, &short_name);
- if (r < 0)
- goto out;
- r = lfn_created(path_comp, oid, short_name);
- if (r < 0)
- goto out;
- r = _created(path_comp, oid, short_name);
- if (r < 0)
- goto out;
- );
-}
-
-int LFNIndex::unlink(const ghobject_t &oid)
-{
- WRAP_RETRY(
- vector<string> path;
- string short_name;
- r = _lookup(oid, &path, &short_name, NULL);
- if (r < 0) {
- goto out;
- }
- r = _remove(path, oid, short_name);
- if (r < 0) {
- goto out;
- }
- );
-}
-
-int LFNIndex::lookup(const ghobject_t &oid,
- IndexedPath *out_path,
- int *hardlink)
-{
- WRAP_RETRY(
- vector<string> path;
- string short_name;
- r = _lookup(oid, &path, &short_name, hardlink);
- if (r < 0)
- goto out;
- string full_path = get_full_path(path, short_name);
- *out_path = IndexedPath(new Path(full_path, this));
- r = 0;
- );
-}
-
-int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
-{
- return _pre_hash_collection(pg_num, expected_num_objs);
-}
-
-
-int LFNIndex::collection_list_partial(const ghobject_t &start,
- const ghobject_t &end,
- bool sort_bitwise,
- int max_count,
- vector<ghobject_t> *ls,
- ghobject_t *next)
-{
- return _collection_list_partial(start, end, sort_bitwise, max_count, ls, next);
-}
-
-/* Derived class utility methods */
-
-int LFNIndex::fsync_dir(const vector<string> &path)
-{
- maybe_inject_failure();
- int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY);
- if (fd < 0)
- return -errno;
- FDCloser f(fd);
- maybe_inject_failure();
- int r = ::fsync(fd);
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- else
- return 0;
-}
-
-int LFNIndex::link_object(const vector<string> &from,
- const vector<string> &to,
- const ghobject_t &oid,
- const string &from_short_name)
-{
- int r;
- string from_path = get_full_path(from, from_short_name);
- string to_path;
- maybe_inject_failure();
- r = lfn_get_name(to, oid, 0, &to_path, 0);
- if (r < 0)
- return r;
- maybe_inject_failure();
- r = ::link(from_path.c_str(), to_path.c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- else
- return 0;
-}
-
-int LFNIndex::remove_objects(const vector<string> &dir,
- const map<string, ghobject_t> &to_remove,
- map<string, ghobject_t> *remaining)
-{
- set<string> clean_chains;
- for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin();
- to_clean != to_remove.end();
- ++to_clean) {
- if (!lfn_is_hashed_filename(to_clean->first)) {
- maybe_inject_failure();
- int r = ::unlink(get_full_path(dir, to_clean->first).c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- continue;
- }
- if (clean_chains.count(lfn_get_short_name(to_clean->second, 0)))
- continue;
- set<int> holes;
- map<int, pair<string, ghobject_t> > chain;
- for (int i = 0; ; ++i) {
- string short_name = lfn_get_short_name(to_clean->second, i);
- if (remaining->count(short_name)) {
- chain[i] = *(remaining->find(short_name));
- } else if (to_remove.count(short_name)) {
- holes.insert(i);
- } else {
- break;
- }
- }
-
- map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin();
- for (set<int>::iterator i = holes.begin();
- i != holes.end();
- ++i) {
- if (candidate == chain.rend() || *i > candidate->first) {
- string remove_path_name =
- get_full_path(dir, lfn_get_short_name(to_clean->second, *i));
- maybe_inject_failure();
- int r = ::unlink(remove_path_name.c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- continue;
- }
- string from = get_full_path(dir, candidate->second.first);
- string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i));
- maybe_inject_failure();
- int r = ::rename(from.c_str(), to.c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- remaining->erase(candidate->second.first);
- remaining->insert(pair<string, ghobject_t>(
- lfn_get_short_name(candidate->second.second, *i),
- candidate->second.second));
- ++candidate;
- }
- if (!holes.empty())
- clean_chains.insert(lfn_get_short_name(to_clean->second, 0));
- }
- return 0;
-}
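-
-// Worked example of the hole-filling above (illustrative): suppose a long
-// object name hashes to a chain of short names at indices 0..4, and the
-// entries at indices 1 and 3 are being removed (holes {1,3}) while 0, 2 and
-// 4 survive. The highest surviving entry (index 4) is renamed into hole 1;
-// the next candidate is then index 2, which sits below hole 3, so the file
-// at index 3 is simply unlinked. The chain ends up packed into indices 0..2
-// with no gap, which matters because the chain is probed linearly from
-// index 0 when resolving a long name.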
-
-int LFNIndex::move_objects(const vector<string> &from,
- const vector<string> &to)
-{
- map<string, ghobject_t> to_move;
- int r;
- r = list_objects(from, 0, NULL, &to_move);
- if (r < 0)
- return r;
- for (map<string,ghobject_t>::iterator i = to_move.begin();
- i != to_move.end();
- ++i) {
- string from_path = get_full_path(from, i->first);
- string to_path, to_name;
- r = lfn_get_name(to, i->second, &to_name, &to_path, 0);
- if (r < 0)
- return r;
- maybe_inject_failure();
- r = ::link(from_path.c_str(), to_path.c_str());
- if (r < 0 && errno != EEXIST)
- return -errno;
- maybe_inject_failure();
- r = lfn_created(to, i->second, to_name);
- maybe_inject_failure();
- if (r < 0)
- return r;
- }
- r = fsync_dir(to);
- if (r < 0)
- return r;
- for (map<string,ghobject_t>::iterator i = to_move.begin();
- i != to_move.end();
- ++i) {
- maybe_inject_failure();
- r = ::unlink(get_full_path(from, i->first).c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- }
- return fsync_dir(from);
-}
-
-int LFNIndex::remove_object(const vector<string> &from,
- const ghobject_t &oid)
-{
- string short_name;
- int r, exist;
- maybe_inject_failure();
- r = get_mangled_name(from, oid, &short_name, &exist);
- maybe_inject_failure();
- if (r < 0)
- return r;
- if (exist == 0)
- return -ENOENT;
- return lfn_unlink(from, oid, short_name);
-}
-
-int LFNIndex::get_mangled_name(const vector<string> &from,
- const ghobject_t &oid,
- string *mangled_name, int *hardlink)
-{
- return lfn_get_name(from, oid, mangled_name, 0, hardlink);
-}
-
-int LFNIndex::move_subdir(
- LFNIndex &from,
- LFNIndex &dest,
- const vector<string> &path,
- string dir
- )
-{
- vector<string> sub_path(path.begin(), path.end());
- sub_path.push_back(dir);
- string from_path(from.get_full_path_subdir(sub_path));
- string to_path(dest.get_full_path_subdir(sub_path));
- int r = ::rename(from_path.c_str(), to_path.c_str());
- if (r < 0)
- return -errno;
- return 0;
-}
-
-int LFNIndex::move_object(
- LFNIndex &from,
- LFNIndex &dest,
- const vector<string> &path,
- const pair<string, ghobject_t> &obj
- )
-{
- string from_path(from.get_full_path(path, obj.first));
- string to_path;
- string to_name;
- int exists;
- int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists);
- if (r < 0)
- return r;
- if (!exists) {
- r = ::link(from_path.c_str(), to_path.c_str());
- if (r < 0)
- return r;
- }
- r = dest.lfn_created(path, obj.second, to_name);
- if (r < 0)
- return r;
- r = dest.fsync_dir(path);
- if (r < 0)
- return r;
- r = from.remove_object(path, obj.second);
- if (r < 0)
- return r;
- return from.fsync_dir(path);
-}
-
-
-static int get_hobject_from_oinfo(const char *dir, const char *file,
- ghobject_t *o)
-{
- char path[PATH_MAX];
- bufferptr bp(PATH_MAX);
- snprintf(path, sizeof(path), "%s/%s", dir, file);
- // Hack, user.ceph._ is the attribute used to store the object info
- int r = chain_getxattr(path, "user.ceph._", bp.c_str(), bp.length());
- if (r < 0)
- return r;
- bufferlist bl;
- bl.push_back(bp);
- object_info_t oi(bl);
- *o = ghobject_t(oi.soid);
- return 0;
-}
-
-
-int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
- long *handle, map<string, ghobject_t> *out)
-{
- string to_list_path = get_full_path_subdir(to_list);
- DIR *dir = ::opendir(to_list_path.c_str());
- char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
- int r;
- if (!dir) {
- return -errno;
- }
-
- if (handle && *handle) {
- seekdir(dir, *handle);
- }
-
- struct dirent *de;
- int listed = 0;
- bool end = false;
- while (!::readdir_r(dir, reinterpret_cast<struct dirent*>(buf), &de)) {
- if (!de) {
- end = true;
- break;
- }
- if (max_objs > 0 && listed >= max_objs) {
- break;
- }
- if (de->d_name[0] == '.')
- continue;
- string short_name(de->d_name);
- ghobject_t obj;
- if (lfn_is_object(short_name)) {
- r = lfn_translate(to_list, short_name, &obj);
- if (r < 0) {
- r = -errno;
- goto cleanup;
- } else if (r > 0) {
- string long_name = lfn_generate_object_name(obj);
- if (!lfn_must_hash(long_name)) {
- assert(long_name == short_name);
- }
- if (index_version == HASH_INDEX_TAG)
- get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
-
- out->insert(pair<string, ghobject_t>(short_name, obj));
- ++listed;
- } else {
- continue;
- }
- }
- }
-
- if (handle && !end) {
- *handle = telldir(dir);
- }
-
- r = 0;
- cleanup:
- ::closedir(dir);
- return r;
-}
-
-int LFNIndex::list_subdirs(const vector<string> &to_list,
- vector<string> *out)
-{
- string to_list_path = get_full_path_subdir(to_list);
- DIR *dir = ::opendir(to_list_path.c_str());
- char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
- if (!dir)
- return -errno;
-
- struct dirent *de;
- while (!::readdir_r(dir, reinterpret_cast<struct dirent*>(buf), &de)) {
- if (!de) {
- break;
- }
- string short_name(de->d_name);
- string demangled_name;
- if (lfn_is_subdir(short_name, &demangled_name)) {
- out->push_back(demangled_name);
- }
- }
-
- ::closedir(dir);
- return 0;
-}
-
-int LFNIndex::create_path(const vector<string> &to_create)
-{
- maybe_inject_failure();
- int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777);
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- else
- return 0;
-}
-
-int LFNIndex::remove_path(const vector<string> &to_remove)
-{
- maybe_inject_failure();
- int r = ::rmdir(get_full_path_subdir(to_remove).c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- else
- return 0;
-}
-
-int LFNIndex::path_exists(const vector<string> &to_check, int *exists)
-{
- string full_path = get_full_path_subdir(to_check);
- struct stat buf;
- if (::stat(full_path.c_str(), &buf)) {
- int r = -errno;
- if (r == -ENOENT) {
- *exists = 0;
- return 0;
- } else {
- return r;
- }
- } else {
- *exists = 1;
- return 0;
- }
-}
-
-int LFNIndex::add_attr_path(const vector<string> &path,
- const string &attr_name,
- bufferlist &attr_value)
-{
- string full_path = get_full_path_subdir(path);
- maybe_inject_failure();
- return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
- reinterpret_cast<void *>(attr_value.c_str()),
- attr_value.length());
-}
-
-int LFNIndex::get_attr_path(const vector<string> &path,
- const string &attr_name,
- bufferlist &attr_value)
-{
- string full_path = get_full_path_subdir(path);
- size_t size = 1024; // Initial
- while (1) {
- bufferptr buf(size);
- int r = chain_getxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
- reinterpret_cast<void *>(buf.c_str()),
- size);
- if (r > 0) {
- buf.set_length(r);
- attr_value.push_back(buf);
- break;
- } else {
- r = -errno;
- if (r == -ERANGE) {
- size *= 2;
- } else {
- return r;
- }
- }
- }
- return 0;
-}
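// Editorial sketch (not part of the original source): the same grow-on-ERANGE
// pattern used by get_attr_path() above, but written against the plain
// getxattr(2) syscall instead of Ceph's chain_getxattr() wrapper; the function
// name and the 1024-byte initial guess are illustrative assumptions.
#include <sys/xattr.h>
#include <cerrno>
#include <string>
#include <vector>

static int read_xattr_whole(const std::string &path, const std::string &name,
                            std::vector<char> *out)
{
  size_t size = 1024;                      // initial guess, doubled on ERANGE
  while (true) {
    std::vector<char> buf(size);
    ssize_t r = ::getxattr(path.c_str(), name.c_str(), buf.data(), buf.size());
    if (r >= 0) {
      buf.resize(r);                       // keep only the bytes actually read
      out->swap(buf);
      return 0;
    }
    if (errno == ERANGE) {
      size *= 2;                           // value larger than buffer: retry
      continue;
    }
    return -errno;                         // any other error is fatal
  }
}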
-
-int LFNIndex::remove_attr_path(const vector<string> &path,
- const string &attr_name)
-{
- string full_path = get_full_path_subdir(path);
- string mangled_attr_name = mangle_attr_name(attr_name);
- maybe_inject_failure();
- return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str());
-}
-
-string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid)
-{
- char s[FILENAME_MAX_LEN];
- char *end = s + sizeof(s);
- char *t = s;
-
- assert(oid.generation == ghobject_t::NO_GEN);
- const char *i = oid.hobj.oid.name.c_str();
- // Escape subdir prefix
- if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
- *t++ = '\\';
- *t++ = 'd';
- i += 4;
- }
- while (*i && t < end) {
- if (*i == '\\') {
- *t++ = '\\';
- *t++ = '\\';
- } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
- *t++ = '\\';
- *t++ = '.';
- } else if (*i == '/') {
- *t++ = '\\';
- *t++ = 's';
- } else
- *t++ = *i;
- i++;
- }
-
- if (oid.hobj.snap == CEPH_NOSNAP)
- t += snprintf(t, end - t, "_head");
- else if (oid.hobj.snap == CEPH_SNAPDIR)
- t += snprintf(t, end - t, "_snapdir");
- else
- t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
-
- return string(s);
-}
-
-static void append_escaped(string::const_iterator begin,
- string::const_iterator end,
- string *out)
-{
- for (string::const_iterator i = begin; i != end; ++i) {
- if (*i == '\\') {
- out->append("\\\\");
- } else if (*i == '/') {
- out->append("\\s");
- } else if (*i == '_') {
- out->append("\\u");
- } else if (*i == '\0') {
- out->append("\\n");
- } else {
- out->append(i, i+1);
- }
- }
-}
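// Editorial sketch (not part of the original source): a self-contained copy of
// the escaping rules implemented by append_escaped() above, applied to a
// hypothetical object name.  '\' -> "\\", '/' -> "\s", '_' -> "\u",
// '\0' -> "\n"; every other character is copied through unchanged.
#include <iostream>
#include <string>

static std::string escape_like_append_escaped(const std::string &in)
{
  std::string out;
  for (char c : in) {
    if (c == '\\')      out += "\\\\";
    else if (c == '/')  out += "\\s";
    else if (c == '_')  out += "\\u";
    else if (c == '\0') out += "\\n";
    else                out += c;
  }
  return out;
}

int main()
{
  // prints: rbd\udata.1234\shead
  std::cout << escape_like_append_escaped("rbd_data.1234/head") << std::endl;
  return 0;
}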
-
-string LFNIndex::lfn_generate_object_name(const ghobject_t &oid)
-{
- if (index_version == HASH_INDEX_TAG)
- return lfn_generate_object_name_keyless(oid);
- if (index_version == HASH_INDEX_TAG_2)
- return lfn_generate_object_name_poolless(oid);
-
- string full_name;
- string::const_iterator i = oid.hobj.oid.name.begin();
- if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
- full_name.append("\\d");
- i += 4;
- } else if (oid.hobj.oid.name[0] == '.') {
- full_name.append("\\.");
- ++i;
- }
- append_escaped(i, oid.hobj.oid.name.end(), &full_name);
- full_name.append("_");
- append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
- full_name.append("_");
-
- char buf[PATH_MAX];
- char *t = buf;
- char *end = t + sizeof(buf);
- if (oid.hobj.snap == CEPH_NOSNAP)
- t += snprintf(t, end - t, "head");
- else if (oid.hobj.snap == CEPH_SNAPDIR)
- t += snprintf(t, end - t, "snapdir");
- else
- t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
- full_name += string(buf);
- full_name.append("_");
-
- append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name);
- full_name.append("_");
-
- t = buf;
- end = t + sizeof(buf);
- if (oid.hobj.pool == -1)
- t += snprintf(t, end - t, "none");
- else
- t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool);
- full_name += string(buf);
-
- if (oid.generation != ghobject_t::NO_GEN ||
- oid.shard_id != shard_id_t::NO_SHARD) {
- full_name.append("_");
-
- t = buf;
- end = t + sizeof(buf);
- t += snprintf(t, end - t, "%llx", (long long unsigned)oid.generation);
- full_name += string(buf);
-
- full_name.append("_");
-
- t = buf;
- end = t + sizeof(buf);
- t += snprintf(t, end - t, "%x", (int)oid.shard_id);
- full_name += string(buf);
- }
-
- return full_name;
-}
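// Editorial note (not part of the original source): for the current index
// version the long name built above has the layout
//   <escaped name>_<escaped key>_<head|snapdir|snap-hex>_<HASH8>_<escaped ns>_<none|pool-hex>[_<gen-hex>_<shard-hex>]
// For example (hypothetical values), an object named "foo" with no key,
// snapshot head, hash 0x6141C3C5, empty namespace and pool 1 becomes
// "foo__head_6141C3C5__1".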
-
-string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid)
-{
- if (index_version == HASH_INDEX_TAG)
- return lfn_generate_object_name_keyless(oid);
-
- assert(oid.generation == ghobject_t::NO_GEN);
- string full_name;
- string::const_iterator i = oid.hobj.oid.name.begin();
- if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
- full_name.append("\\d");
- i += 4;
- } else if (oid.hobj.oid.name[0] == '.') {
- full_name.append("\\.");
- ++i;
- }
- append_escaped(i, oid.hobj.oid.name.end(), &full_name);
- full_name.append("_");
- append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
- full_name.append("_");
-
- char snap_with_hash[PATH_MAX];
- char *t = snap_with_hash;
- char *end = t + sizeof(snap_with_hash);
- if (oid.hobj.snap == CEPH_NOSNAP)
- t += snprintf(t, end - t, "head");
- else if (oid.hobj.snap == CEPH_SNAPDIR)
- t += snprintf(t, end - t, "snapdir");
- else
- t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
- full_name += string(snap_with_hash);
- return full_name;
-}
-
-int LFNIndex::lfn_get_name(const vector<string> &path,
- const ghobject_t &oid,
- string *mangled_name, string *out_path,
- int *hardlink)
-{
- string subdir_path = get_full_path_subdir(path);
- string full_name = lfn_generate_object_name(oid);
- int r;
-
- if (!lfn_must_hash(full_name)) {
- if (mangled_name)
- *mangled_name = full_name;
- if (out_path)
- *out_path = get_full_path(path, full_name);
- if (hardlink) {
- struct stat buf;
- string full_path = get_full_path(path, full_name);
- maybe_inject_failure();
- r = ::stat(full_path.c_str(), &buf);
- if (r < 0) {
- if (errno == ENOENT)
- *hardlink = 0;
- else
- return -errno;
- } else {
- *hardlink = buf.st_nlink;
- }
- }
- return 0;
- }
-
- int i = 0;
- string candidate;
- string candidate_path;
- char buf[FILENAME_MAX_LEN + 1];
- for ( ; ; ++i) {
- candidate = lfn_get_short_name(oid, i);
- candidate_path = get_full_path(path, candidate);
- r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(),
- buf, sizeof(buf));
- if (r < 0) {
- if (errno != ENODATA && errno != ENOENT)
- return -errno;
- if (errno == ENODATA) {
- // Left over from incomplete transaction, it'll be replayed
- maybe_inject_failure();
- r = ::unlink(candidate_path.c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- }
- if (mangled_name)
- *mangled_name = candidate;
- if (out_path)
- *out_path = candidate_path;
- if (hardlink)
- *hardlink = 0;
- return 0;
- }
- assert(r > 0);
- buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
- if (!strcmp(buf, full_name.c_str())) {
- if (mangled_name)
- *mangled_name = candidate;
- if (out_path)
- *out_path = candidate_path;
- if (hardlink) {
- struct stat st;
- r = ::stat(candidate_path.c_str(), &st);
- *hardlink = st.st_nlink;
- }
- return 0;
- }
- r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(),
- buf, sizeof(buf));
- if (r > 0) {
- // only consider alt name if nlink > 1
- struct stat st;
- int rc = ::stat(candidate_path.c_str(), &st);
- if (rc < 0)
- return -errno;
- if (st.st_nlink <= 1) {
- // left over from incomplete unlink, remove
- maybe_inject_failure();
- dout(20) << __func__ << " found extra alt attr for " << candidate_path
- << ", long name " << string(buf, r) << dendl;
- rc = chain_removexattr(candidate_path.c_str(),
- get_alt_lfn_attr().c_str());
- maybe_inject_failure();
- if (rc < 0)
- return rc;
- continue;
- }
- buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
- if (!strcmp(buf, full_name.c_str())) {
- dout(20) << __func__ << " used alt attr for " << full_name << dendl;
- if (mangled_name)
- *mangled_name = candidate;
- if (out_path)
- *out_path = candidate_path;
- if (hardlink)
- *hardlink = st.st_nlink;
- return 0;
- }
- }
- }
- assert(0); // Unreachable
- return 0;
-}
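// Editorial note (not part of the original source): for names long enough to
// be hashed, the loop above probes lfn_get_short_name(oid, 0), (oid, 1), ...
// in order.  A slot whose lfn xattr equals the full long name is the object;
// a slot with no file (ENOENT) or no attr (ENODATA, cleaned up as the remains
// of an interrupted transaction) ends the search and is returned as the place
// where the object may be created.  The alt attr is consulted only while the
// file still has more than one hard link; a single-linked leftover alt attr
// is removed instead.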
-
-int LFNIndex::lfn_created(const vector<string> &path,
- const ghobject_t &oid,
- const string &mangled_name)
-{
- if (!lfn_is_hashed_filename(mangled_name))
- return 0;
- string full_path = get_full_path(path, mangled_name);
- string full_name = lfn_generate_object_name(oid);
- maybe_inject_failure();
-
- // if the main attr exists and is different, move it to the alt attr.
- char buf[FILENAME_MAX_LEN + 1];
- int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(),
- buf, sizeof(buf));
- if (r >= 0 && (r != (int)full_name.length() ||
- memcmp(buf, full_name.c_str(), full_name.length()))) {
- dout(20) << __func__ << " " << mangled_name
- << " moving old name to alt attr "
- << string(buf, r)
- << ", new name is " << full_name << dendl;
- r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(),
- buf, r);
- if (r < 0)
- return r;
- }
-
- return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
- full_name.c_str(), full_name.size());
-}
-
-int LFNIndex::lfn_unlink(const vector<string> &path,
- const ghobject_t &oid,
- const string &mangled_name)
-{
- if (!lfn_is_hashed_filename(mangled_name)) {
- string full_path = get_full_path(path, mangled_name);
- maybe_inject_failure();
- int r = ::unlink(full_path.c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- return 0;
- }
- string subdir_path = get_full_path_subdir(path);
-
-
- int i = 0;
- for ( ; ; ++i) {
- string candidate = lfn_get_short_name(oid, i);
- if (candidate == mangled_name)
- break;
- }
- int removed_index = i;
- ++i;
- for ( ; ; ++i) {
- struct stat buf;
- string to_check = lfn_get_short_name(oid, i);
- string to_check_path = get_full_path(path, to_check);
- int r = ::stat(to_check_path.c_str(), &buf);
- if (r < 0) {
- if (errno == ENOENT) {
- break;
- } else {
- return -errno;
- }
- }
- }
- string full_path = get_full_path(path, mangled_name);
- int fd = ::open(full_path.c_str(), O_RDONLY);
- if (fd < 0)
- return -errno;
- FDCloser f(fd);
- if (i == removed_index + 1) {
- maybe_inject_failure();
- int r = ::unlink(full_path.c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- } else {
- string& rename_to = full_path;
- string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
- maybe_inject_failure();
- int r = ::rename(rename_from.c_str(), rename_to.c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- }
- struct stat st;
- int r = ::fstat(fd, &st);
- if (r == 0 && st.st_nlink > 0) {
- // remove alt attr
- dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
- fsync_dir(path);
- chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
- }
- return r;
-}
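// Editorial note (not part of the original source): lfn_unlink() keeps hashed
// collision chains dense.  It finds the slot holding mangled_name
// (removed_index), scans forward to the first missing slot i, and then either
// unlinks the file (when it occupied the last slot) or renames slot i - 1 over
// it.  The final fstat()/alt-attr cleanup covers the case where the inode is
// still reachable through another hard link.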
-
-int LFNIndex::lfn_translate(const vector<string> &path,
- const string &short_name,
- ghobject_t *out)
-{
- if (!lfn_is_hashed_filename(short_name)) {
- return lfn_parse_object_name(short_name, out);
- }
- // Get lfn_attr
- string full_path = get_full_path(path, short_name);
- char attr[PATH_MAX];
- int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1);
- if (r < 0)
- return -errno;
- if (r < (int)sizeof(attr))
- attr[r] = '\0';
-
- string long_name(attr);
- return lfn_parse_object_name(long_name, out);
-}
-
-bool LFNIndex::lfn_is_object(const string &short_name)
-{
- return lfn_is_hashed_filename(short_name) || !lfn_is_subdir(short_name, 0);
-}
-
-bool LFNIndex::lfn_is_subdir(const string &name, string *demangled)
-{
- if (name.substr(0, SUBDIR_PREFIX.size()) == SUBDIR_PREFIX) {
- if (demangled)
- *demangled = demangle_path_component(name);
- return 1;
- }
- return 0;
-}
-
-static int parse_object(const char *s, ghobject_t& o)
-{
- const char *hash = s + strlen(s) - 1;
- while (*hash != '_' &&
- hash > s)
- hash--;
- const char *bar = hash - 1;
- while (*bar != '_' &&
- bar > s)
- bar--;
- if (*bar == '_') {
- char buf[bar-s + 1];
- char *t = buf;
- const char *i = s;
- while (i < bar) {
- if (*i == '\\') {
- i++;
- switch (*i) {
- case '\\': *t++ = '\\'; break;
- case '.': *t++ = '.'; break;
- case 's': *t++ = '/'; break;
- case 'd': {
- *t++ = 'D';
- *t++ = 'I';
- *t++ = 'R';
- *t++ = '_';
- break;
- }
- default: assert(0);
- }
- } else {
- *t++ = *i;
- }
- i++;
- }
- *t = 0;
- o.hobj.oid.name = string(buf, t-buf);
- if (strncmp(bar+1, "head", 4) == 0)
- o.hobj.snap = CEPH_NOSNAP;
- else if (strncmp(bar+1, "snapdir", 7) == 0)
- o.hobj.snap = CEPH_SNAPDIR;
- else
- o.hobj.snap = strtoull(bar+1, NULL, 16);
-
- uint32_t hobject_hash_input;
- sscanf(hash, "_%X", &hobject_hash_input);
- o.hobj.set_hash(hobject_hash_input);
-
- return 1;
- }
- return 0;
-}
-
-bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
-{
- bool r = parse_object(long_name.c_str(), *out);
- int64_t pool = -1;
- spg_t pg;
- if (coll().is_pg_prefix(&pg))
- pool = (int64_t)pg.pgid.pool();
- out->hobj.pool = pool;
- if (!r) return r;
- string temp = lfn_generate_object_name(*out);
- return r;
-}
-
-static bool append_unescaped(string::const_iterator begin,
- string::const_iterator end,
- string *out)
-{
- for (string::const_iterator i = begin; i != end; ++i) {
- if (*i == '\\') {
- ++i;
- if (*i == '\\')
- out->append("\\");
- else if (*i == 's')
- out->append("/");
- else if (*i == 'n')
- (*out) += '\0';
- else if (*i == 'u')
- out->append("_");
- else
- return false;
- } else {
- out->append(i, i+1);
- }
- }
- return true;
-}
-
-bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
- ghobject_t *out)
-{
- string name;
- string key;
- uint32_t hash;
- snapid_t snap;
-
- string::const_iterator current = long_name.begin();
- if (*current == '\\') {
- ++current;
- if (current == long_name.end()) {
- return false;
- } else if (*current == 'd') {
- name.append("DIR_");
- ++current;
- } else if (*current == '.') {
- name.append(".");
- ++current;
- } else {
- --current;
- }
- }
-
- string::const_iterator end = current;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end == long_name.end())
- return false;
- if (!append_unescaped(current, end, &name))
- return false;
-
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end == long_name.end())
- return false;
- if (!append_unescaped(current, end, &key))
- return false;
-
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end == long_name.end())
- return false;
- string snap_str(current, end);
-
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end != long_name.end())
- return false;
- string hash_str(current, end);
-
- if (snap_str == "head")
- snap = CEPH_NOSNAP;
- else if (snap_str == "snapdir")
- snap = CEPH_SNAPDIR;
- else
- snap = strtoull(snap_str.c_str(), NULL, 16);
- sscanf(hash_str.c_str(), "%X", &hash);
-
-
- int64_t pool = -1;
- spg_t pg;
- if (coll().is_pg_prefix(&pg))
- pool = (int64_t)pg.pgid.pool();
- (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
- return true;
-}
-
-
-bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
-{
- string name;
- string key;
- string ns;
- uint32_t hash;
- snapid_t snap;
- uint64_t pool;
- gen_t generation = ghobject_t::NO_GEN;
- shard_id_t shard_id = shard_id_t::NO_SHARD;
-
- if (index_version == HASH_INDEX_TAG)
- return lfn_parse_object_name_keyless(long_name, out);
- if (index_version == HASH_INDEX_TAG_2)
- return lfn_parse_object_name_poolless(long_name, out);
-
- string::const_iterator current = long_name.begin();
- if (*current == '\\') {
- ++current;
- if (current == long_name.end()) {
- return false;
- } else if (*current == 'd') {
- name.append("DIR_");
- ++current;
- } else if (*current == '.') {
- name.append(".");
- ++current;
- } else {
- --current;
- }
- }
-
- string::const_iterator end = current;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end == long_name.end())
- return false;
- if (!append_unescaped(current, end, &name))
- return false;
-
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end == long_name.end())
- return false;
- if (!append_unescaped(current, end, &key))
- return false;
-
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end == long_name.end())
- return false;
- string snap_str(current, end);
-
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end == long_name.end())
- return false;
- string hash_str(current, end);
-
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end == long_name.end())
- return false;
- if (!append_unescaped(current, end, &ns))
- return false;
-
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- string pstring(current, end);
-
- // Optional generation/shard_id
- string genstring, shardstring;
- if (end != long_name.end()) {
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end == long_name.end())
- return false;
- genstring = string(current, end);
-
- generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
-
- current = ++end;
- for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end != long_name.end())
- return false;
- shardstring = string(current, end);
-
- shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16);
- }
-
- if (snap_str == "head")
- snap = CEPH_NOSNAP;
- else if (snap_str == "snapdir")
- snap = CEPH_SNAPDIR;
- else
- snap = strtoull(snap_str.c_str(), NULL, 16);
- sscanf(hash_str.c_str(), "%X", &hash);
-
- if (pstring == "none")
- pool = (uint64_t)-1;
- else
- pool = strtoull(pstring.c_str(), NULL, 16);
-
- (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
- return true;
-}
-
-bool LFNIndex::lfn_is_hashed_filename(const string &name)
-{
- if (name.size() < (unsigned)FILENAME_SHORT_LEN) {
- return 0;
- }
- if (name.substr(name.size() - FILENAME_COOKIE.size(), FILENAME_COOKIE.size())
- == FILENAME_COOKIE) {
- return 1;
- } else {
- return 0;
- }
-}
-
-bool LFNIndex::lfn_must_hash(const string &long_name)
-{
- return (int)long_name.size() >= FILENAME_SHORT_LEN;
-}
-
-static inline void buf_to_hex(const unsigned char *buf, int len, char *str)
-{
- int i;
- str[0] = '\0';
- for (i = 0; i < len; i++) {
- sprintf(&str[i*2], "%02x", (int)buf[i]);
- }
-}
-
-int LFNIndex::hash_filename(const char *filename, char *hash, int buf_len)
-{
- if (buf_len < FILENAME_HASH_LEN + 1)
- return -EINVAL;
-
- char buf[FILENAME_LFN_DIGEST_SIZE];
- char hex[FILENAME_LFN_DIGEST_SIZE * 2];
-
- SHA1 h;
- h.Update((const byte *)filename, strlen(filename));
- h.Final((byte *)buf);
-
- buf_to_hex((byte *)buf, (FILENAME_HASH_LEN + 1) / 2, hex);
- strncpy(hash, hex, FILENAME_HASH_LEN);
- hash[FILENAME_HASH_LEN] = '\0';
- return 0;
-}
-
-void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len)
-{
- char hash[FILENAME_HASH_LEN + 1];
-
- assert(len >= FILENAME_SHORT_LEN + 4);
-
- strncpy(filename, old_filename, FILENAME_PREFIX_LEN);
- filename[FILENAME_PREFIX_LEN] = '\0';
- if ((int)strlen(filename) < FILENAME_PREFIX_LEN)
- return;
- if (old_filename[FILENAME_PREFIX_LEN] == '\0')
- return;
-
- hash_filename(old_filename, hash, sizeof(hash));
- int ofs = FILENAME_PREFIX_LEN;
- while (1) {
- int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str());
- if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs)
- break;
- ofs--;
- }
-}
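// Editorial note (not part of the original source): build_filename() keeps the
// first FILENAME_PREFIX_LEN characters of the long name and appends
// "_<hash>_<i>_<cookie>", trimming the prefix one character at a time until
// the whole short name fits within FILENAME_SHORT_LEN.  The index i is what
// distinguishes distinct long names that collide on the same prefix and hash.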
-
-string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
-{
- string long_name = lfn_generate_object_name(oid);
- assert(lfn_must_hash(long_name));
- char buf[FILENAME_SHORT_LEN + 4];
- build_filename(long_name.c_str(), i, buf, sizeof(buf));
- return string(buf);
-}
-
-const string &LFNIndex::get_base_path()
-{
- return base_path;
-}
-
-string LFNIndex::get_full_path_subdir(const vector<string> &rel)
-{
- string retval = get_base_path();
- for (vector<string>::const_iterator i = rel.begin();
- i != rel.end();
- ++i) {
- retval += "/";
- retval += mangle_path_component(*i);
- }
- return retval;
-}
-
-string LFNIndex::get_full_path(const vector<string> &rel, const string &name)
-{
- return get_full_path_subdir(rel) + "/" + name;
-}
-
-string LFNIndex::mangle_path_component(const string &component)
-{
- return SUBDIR_PREFIX + component;
-}
-
-string LFNIndex::demangle_path_component(const string &component)
-{
- return component.substr(SUBDIR_PREFIX.size(), component.size() - SUBDIR_PREFIX.size());
-}
-
-int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
- ghobject_t *oid, string *shortname)
-{
- const char *beginning = in + get_base_path().size();
- const char *end = beginning;
- while (1) {
- end++;
- beginning = end++;
- for ( ; *end != '\0' && *end != '/'; ++end) ;
- if (*end != '\0') {
- out->push_back(demangle_path_component(string(beginning, end - beginning)));
- continue;
- } else {
- break;
- }
- }
- *shortname = string(beginning, end - beginning);
- if (oid) {
- int r = lfn_translate(*out, *shortname, oid);
- if (r < 0)
- return r;
- }
- return 0;
-}
-
-string LFNIndex::mangle_attr_name(const string &attr)
-{
- return PHASH_ATTR_PREFIX + attr;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef OS_LFNINDEX_H
-#define OS_LFNINDEX_H
-
-#include <string>
-#include <map>
-#include <set>
-#include <vector>
-#include "include/memory.h"
-#include <exception>
-
-#include "osd/osd_types.h"
-#include "include/object.h"
-#include "common/ceph_crypto.h"
-
-#include "CollectionIndex.h"
-
-/**
- * LFNIndex also encapsulates logic for manipulating
- * subdirectories of a collection as well as the long filename
- * logic.
- *
- * The protected methods provide machinery for derived classes to
- * manipulate subdirectories and objects.
- *
- * The virtual methods are to be overridden to provide the actual
- * hashed layout.
- *
- * The user must call created() when an object is created.
- *
- * Synchronization: Calling code must ensure that there are no object
- * creations or deletions during the lifetime of a Path object (except
- * for the object at that path).
- *
- * Unless otherwise noted, methods which return an int return 0 on success
- * and a negative error code on failure.
- */
-#define WRAP_RETRY(x) { \
- bool failed = false; \
- int r = 0; \
- init_inject_failure(); \
- while (1) { \
- try { \
- if (failed) { \
- r = cleanup(); \
- assert(r == 0); \
- } \
- { x } \
- out: \
- complete_inject_failure(); \
- return r; \
- } catch (RetryException) { \
- failed = true; \
- } catch (...) { \
- assert(0); \
- } \
- } \
- return -1; \
- } \
-
-
-
-class LFNIndex : public CollectionIndex {
- /// Hash digest output size.
- static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE;
- /// Length of filename hash.
- static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE;
- /// Max filename size.
- static const int FILENAME_MAX_LEN = 4096;
- /// Length of hashed filename.
- static const int FILENAME_SHORT_LEN = 255;
- /// Length of hashed filename prefix.
- static const int FILENAME_PREFIX_LEN;
- /// Length of hashed filename cookie.
- static const int FILENAME_EXTRA = 4;
- /// Lfn cookie value.
- static const string FILENAME_COOKIE;
- /// Name of LFN attribute for storing full name.
- static const string LFN_ATTR;
- /// Prefix for subdir index attributes.
- static const string PHASH_ATTR_PREFIX;
- /// Prefix for index subdirectories.
- static const string SUBDIR_PREFIX;
-
- /// Path to Index base.
- const string base_path;
-
-protected:
- const uint32_t index_version;
-
-  struct RetryException : public exception {};
-  /// true if retry injection is enabled
-  bool error_injection_enabled;
- bool error_injection_on;
- double error_injection_probability;
- uint64_t last_failure;
- uint64_t current_failure;
- void init_inject_failure() {
- if (error_injection_on) {
- error_injection_enabled = true;
- last_failure = current_failure = 0;
- }
- }
- void maybe_inject_failure();
- void complete_inject_failure() {
- error_injection_enabled = false;
- }
-
-private:
- string lfn_attribute, lfn_alt_attribute;
- coll_t collection;
-
-public:
- /// Constructor
- LFNIndex(
- coll_t collection,
- const char *base_path, ///< [in] path to Index root
- uint32_t index_version,
- double _error_injection_probability=0)
- : CollectionIndex(collection),
- base_path(base_path),
- index_version(index_version),
- error_injection_enabled(false),
- error_injection_on(_error_injection_probability != 0),
- error_injection_probability(_error_injection_probability),
- last_failure(0), current_failure(0),
- collection(collection) {
- if (index_version == HASH_INDEX_TAG) {
- lfn_attribute = LFN_ATTR;
- } else {
- char buf[100];
- snprintf(buf, sizeof(buf), "%d", index_version);
- lfn_attribute = LFN_ATTR + string(buf);
- lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
- }
- }
-
- coll_t coll() const { return collection; }
-
- /// Virtual destructor
- virtual ~LFNIndex() {}
-
- /// @see CollectionIndex
- int init();
-
- /// @see CollectionIndex
- int cleanup() = 0;
-
- /// @see CollectionIndex
- int created(
- const ghobject_t &oid,
- const char *path
- );
-
- /// @see CollectionIndex
- int unlink(
- const ghobject_t &oid
- );
-
- /// @see CollectionIndex
- int lookup(
- const ghobject_t &oid,
- IndexedPath *path,
- int *hardlink
- );
-
- /// @see CollectionIndex;
- int pre_hash_collection(
- uint32_t pg_num,
- uint64_t expected_num_objs
- );
-
- /// @see CollectionIndex
- int collection_list_partial(
- const ghobject_t &start,
- const ghobject_t &end,
- bool sort_bitwise,
- int max_count,
- vector<ghobject_t> *ls,
- ghobject_t *next
- );
-
- virtual int _split(
-    uint32_t match,                           ///< [in] value to match
-    uint32_t bits,                            ///< [in] bits to check
-    CollectionIndex* dest                     ///< [in] destination index
- ) = 0;
-
- /// @see CollectionIndex
- int split(
- uint32_t match,
- uint32_t bits,
- CollectionIndex* dest
- ) {
- WRAP_RETRY(
- r = _split(match, bits, dest);
- goto out;
- );
- }
-
-
-protected:
- virtual int _init() = 0;
-
- /// Will be called upon object creation
- virtual int _created(
- const vector<string> &path, ///< [in] Path to subdir.
- const ghobject_t &oid, ///< [in] Object created.
- const string &mangled_name ///< [in] Mangled filename.
- ) = 0;
-
- /// Will be called to remove an object
- virtual int _remove(
- const vector<string> &path, ///< [in] Path to subdir.
- const ghobject_t &oid, ///< [in] Object to remove.
- const string &mangled_name ///< [in] Mangled filename.
- ) = 0;
-
- /// Return the path and mangled_name for oid.
- virtual int _lookup(
- const ghobject_t &oid,///< [in] Object for lookup.
- vector<string> *path, ///< [out] Path to the object.
- string *mangled_name, ///< [out] Mangled filename.
- int *exists ///< [out] True if the object exists.
- ) = 0;
-
- /// Pre-hash the collection with the given pg number and
- /// expected number of objects in the collection.
- virtual int _pre_hash_collection(
- uint32_t pg_num,
- uint64_t expected_num_objs
- ) = 0;
-
- /// @see CollectionIndex
- virtual int _collection_list_partial(
- const ghobject_t &start,
- const ghobject_t &end,
- bool sort_bitwise,
- int max_count,
- vector<ghobject_t> *ls,
- ghobject_t *next
- ) = 0;
-
-protected:
-
- /* Non-virtual utility methods */
-
- /// Sync a subdirectory
- int fsync_dir(
- const vector<string> &path ///< [in] Path to sync
- ); ///< @return Error Code, 0 on success
-
- /// Link an object from from into to
- int link_object(
- const vector<string> &from, ///< [in] Source subdirectory.
- const vector<string> &to, ///< [in] Dest subdirectory.
- const ghobject_t &oid, ///< [in] Object to move.
- const string &from_short_name ///< [in] Mangled filename of oid.
- ); ///< @return Error Code, 0 on success
-
- /**
- * Efficiently remove objects from a subdirectory
- *
-   * remove_object invalidates mangled names in the directory, requiring
-   * the mangled name of each additional object to be looked up a second
-   * time. remove_objects removes the need for those additional lookups.
-   *
-   * @param [in] dir Directory from which to remove.
-   * @param [in] to_remove Map of mangled filenames to the objects to remove.
-   * @param [in,out] remaining Map of remaining filenames to objects.
- * @return Error Code, 0 on success.
- */
- int remove_objects(
- const vector<string> &dir,
- const map<string, ghobject_t> &to_remove,
- map<string, ghobject_t> *remaining
- );
-
-
- /**
- * Moves contents of from into to.
- *
-   * Invalidates mangled names in to. If interrupted, all objects will be
- * present in to before objects are removed from from. Ignores EEXIST
- * while linking into to.
- * @return Error Code, 0 on success
- */
- int move_objects(
- const vector<string> &from, ///< [in] Source subdirectory.
- const vector<string> &to ///< [in] Dest subdirectory.
- );
-
- /**
- * Remove an object from from.
- *
- * Invalidates mangled names in from.
- * @return Error Code, 0 on success
- */
- int remove_object(
- const vector<string> &from, ///< [in] Directory from which to remove.
- const ghobject_t &to_remove ///< [in] Object to remove.
- );
-
- /**
- * Gets the filename corresponding to oid in from.
- *
- * The filename may differ between subdirectories. Furthermore,
-   * file creations or removals in from may invalidate the name.
- * @return Error code on failure, 0 on success
- */
- int get_mangled_name(
- const vector<string> &from, ///< [in] Subdirectory
- const ghobject_t &oid, ///< [in] Object
- string *mangled_name, ///< [out] Filename
-    int *hardlink ///< [out] hard link count for this file; 0 means the file does not exist
- );
-
- /// do move subdir from from to dest
- static int move_subdir(
- LFNIndex &from, ///< [in] from index
- LFNIndex &dest, ///< [in] to index
- const vector<string> &path, ///< [in] path containing dir
- string dir ///< [in] dir to move
- );
-
- /// do move object from from to dest
- static int move_object(
- LFNIndex &from, ///< [in] from index
- LFNIndex &dest, ///< [in] to index
- const vector<string> &path, ///< [in] path to split
- const pair<string, ghobject_t> &obj ///< [in] obj to move
- );
-
- /**
- * Lists objects in to_list.
- *
- * @param [in] to_list Directory to list.
- * @param [in] max_objects Max number to list.
- * @param [in,out] handle Cookie for continuing the listing.
- * Initialize to zero to start at the beginning of the directory.
- * @param [out] out Mapping of listed object filenames to objects.
- * @return Error code on failure, 0 on success
- */
- int list_objects(
- const vector<string> &to_list,
- int max_objects,
- long *handle,
- map<string, ghobject_t> *out
- );
-
- /// Lists subdirectories.
- int list_subdirs(
- const vector<string> &to_list, ///< [in] Directory to list.
- vector<string> *out ///< [out] Subdirectories listed.
- );
-
- /// Create subdirectory.
- int create_path(
- const vector<string> &to_create ///< [in] Subdirectory to create.
- );
-
- /// Remove subdirectory.
- int remove_path(
- const vector<string> &to_remove ///< [in] Subdirectory to remove.
- );
-
- /// Check whether to_check exists.
- int path_exists(
- const vector<string> &to_check, ///< [in] Subdirectory to check.
- int *exists ///< [out] 1 if it exists, 0 else
- );
-
- /// Save attr_value to attr_name attribute on path.
- int add_attr_path(
- const vector<string> &path, ///< [in] Path to modify.
- const string &attr_name, ///< [in] Name of attribute.
- bufferlist &attr_value ///< [in] Value to save.
- );
-
-  /// Read attribute attr_name on path into attr_value.
- int get_attr_path(
- const vector<string> &path, ///< [in] Path to read.
- const string &attr_name, ///< [in] Attribute to read.
- bufferlist &attr_value ///< [out] Attribute value read.
- );
-
- /// Remove attr from path
- int remove_attr_path(
- const vector<string> &path, ///< [in] path from which to remove attr
- const string &attr_name ///< [in] attr to remove
- ); ///< @return Error code, 0 on success
-
-private:
- /* lfn translation functions */
-
- /**
- * Gets the version specific lfn attribute tag
- */
- const string &get_lfn_attr() const {
- return lfn_attribute;
- }
- const string &get_alt_lfn_attr() const {
- return lfn_alt_attribute;
- }
-
- /**
-   * Gets the filename corresponding to oid in path.
- *
- * @param [in] path Path in which to get filename for oid.
- * @param [in] oid Object for which to get filename.
- * @param [out] mangled_name Filename for oid, pass NULL if not needed.
- * @param [out] full_path Fullpath for oid, pass NULL if not needed.
-   * @param [out] hardlink Hard link count of this file; 0 means the file
-   * does not exist. Pass NULL if not needed.
- * @return Error Code, 0 on success.
- */
- int lfn_get_name(
- const vector<string> &path,
- const ghobject_t &oid,
- string *mangled_name,
- string *full_path,
- int *hardlink
- );
-
- /// Adjusts path contents when oid is created at name mangled_name.
- int lfn_created(
- const vector<string> &path, ///< [in] Path to adjust.
- const ghobject_t &oid, ///< [in] Object created.
- const string &mangled_name ///< [in] Filename of created object.
- );
-
- /// Removes oid from path while adjusting path contents
- int lfn_unlink(
- const vector<string> &path, ///< [in] Path containing oid.
- const ghobject_t &oid, ///< [in] Object to remove.
- const string &mangled_name ///< [in] Filename of object to remove.
- );
-
-  /// Translate a filename into a ghobject_t.
- int lfn_translate(
- const vector<string> &path, ///< [in] Path containing the file.
- const string &short_name, ///< [in] Filename to translate.
- ghobject_t *out ///< [out] Object found.
- ); ///< @return Negative error code on error, 0 if not an object, 1 else
-
- /* manglers/demanglers */
- /// Filters object filenames
- bool lfn_is_object(
- const string &short_name ///< [in] Filename to check
- ); ///< True if short_name is an object, false otherwise
-
- /// Filters subdir filenames
- bool lfn_is_subdir(
- const string &short_name, ///< [in] Filename to check.
- string *demangled_name ///< [out] Demangled subdir name.
- ); ///< @return True if short_name is a subdir, false otherwise
-
- /// Generate object name
- string lfn_generate_object_name_keyless(
- const ghobject_t &oid ///< [in] Object for which to generate.
- ); ///< @return Generated object name.
-
- /// Generate object name
- string lfn_generate_object_name_poolless(
- const ghobject_t &oid ///< [in] Object for which to generate.
- ); ///< @return Generated object name.
-
- /// Generate object name
- string lfn_generate_object_name(
- const ghobject_t &oid ///< [in] Object for which to generate.
- ); ///< @return Generated object name.
-
- /// Parse object name
- bool lfn_parse_object_name_keyless(
- const string &long_name, ///< [in] Name to parse
- ghobject_t *out ///< [out] Resulting Object
-    ); ///< @return True if successful, false otherwise.
-
- /// Parse object name
- bool lfn_parse_object_name_poolless(
- const string &long_name, ///< [in] Name to parse
- ghobject_t *out ///< [out] Resulting Object
-    ); ///< @return True if successful, false otherwise.
-
- /// Parse object name
- bool lfn_parse_object_name(
- const string &long_name, ///< [in] Name to parse
- ghobject_t *out ///< [out] Resulting Object
-    ); ///< @return True if successful, false otherwise.
-
- /// Checks whether short_name is a hashed filename.
- bool lfn_is_hashed_filename(
- const string &short_name ///< [in] Name to check.
- ); ///< @return True if short_name is hashed, False otherwise.
-
- /// Checks whether long_name must be hashed.
- bool lfn_must_hash(
- const string &long_name ///< [in] Name to check.
- ); ///< @return True if long_name must be hashed, False otherwise.
-
- /// Generate hashed name.
- string lfn_get_short_name(
- const ghobject_t &oid, ///< [in] Object for which to generate.
- int i ///< [in] Index of hashed name to generate.
- ); ///< @return Hashed filename.
-
- /* other common methods */
- /// Gets the base path
- const string &get_base_path(); ///< @return Index base_path
-
- /// Get full path the subdir
- string get_full_path_subdir(
- const vector<string> &rel ///< [in] The subdir.
- ); ///< @return Full path to rel.
-
- /// Get full path to object
- string get_full_path(
- const vector<string> &rel, ///< [in] Path to object.
- const string &name ///< [in] Filename of object.
- ); ///< @return Fullpath to object at name in rel.
-
- /// Get mangled path component
- string mangle_path_component(
- const string &component ///< [in] Component to mangle
-    ); ///< @return Mangled component
-
- /// Demangle component
- string demangle_path_component(
- const string &component ///< [in] Subdir name to demangle
- ); ///< @return Demangled path component.
-
- /// Decompose full path into object name and filename.
- int decompose_full_path(
- const char *in, ///< [in] Full path to object.
- vector<string> *out, ///< [out] Path to object at in.
- ghobject_t *oid, ///< [out] Object at in.
- string *shortname ///< [out] Filename of object at in.
- ); ///< @return Error Code, 0 on success.
-
- /// Mangle attribute name
- string mangle_attr_name(
- const string &attr ///< [in] Attribute to mangle.
- ); ///< @return Mangled attribute name.
-
- /// Builds hashed filename
- void build_filename(
- const char *old_filename, ///< [in] Filename to convert.
- int i, ///< [in] Index of hash.
- char *filename, ///< [out] Resulting filename.
- int len ///< [in] Size of buffer for filename
- ); ///< @return Error Code, 0 on success
-
- /// Get hash of filename
- int hash_filename(
- const char *filename, ///< [in] Filename to hash.
- char *hash, ///< [out] Hash of filename.
- int len ///< [in] Size of hash buffer.
- ); ///< @return Error Code, 0 on success.
-
- friend class TestWrapLFNIndex;
-};
-typedef LFNIndex::IndexedPath IndexedPath;
-
-#endif
if ENABLE_SERVER
libos_a_SOURCES = \
- os/chain_xattr.cc \
os/fs/FS.cc \
os/bluestore/kv.cc \
os/bluestore/Allocator.cc \
os/bluestore/BlueStore.cc \
os/bluestore/FreelistManager.cc \
os/bluestore/StupidAllocator.cc \
+ os/filestore/chain_xattr.cc \
+ os/filestore/DBObjectMap.cc \
+ os/filestore/FileJournal.cc \
+ os/filestore/FileStore.cc \
+ os/filestore/GenericFileStoreBackend.cc \
+ os/filestore/HashIndex.cc \
+ os/filestore/IndexManager.cc \
+ os/filestore/JournalingObjectStore.cc \
+ os/filestore/LFNIndex.cc \
+ os/filestore/WBThrottle.cc \
os/kstore/kv.cc \
os/kstore/KStore.cc \
- os/DBObjectMap.cc \
os/GenericObjectMap.cc \
- os/FileJournal.cc \
- os/FileStore.cc \
- os/GenericFileStoreBackend.cc \
- os/HashIndex.cc \
- os/IndexManager.cc \
- os/JournalingObjectStore.cc \
- os/LFNIndex.cc \
os/MemStore.cc \
os/KeyValueStore.cc \
- os/ObjectStore.cc \
- os/WBThrottle.cc
+ os/ObjectStore.cc
if LINUX
-libos_a_SOURCES += os/BtrfsFileStoreBackend.cc
+libos_a_SOURCES += os/filestore/BtrfsFileStoreBackend.cc
endif
if WITH_LIBXFS
libos_a_SOURCES += \
os/fs/XFS.cc \
- os/XfsFileStoreBackend.cc
+ os/filestore/XfsFileStoreBackend.cc
endif
if WITH_LIBZFS
-libos_a_SOURCES += os/ZFSFileStoreBackend.cc
+libos_a_SOURCES += os/filestore/ZFSFileStoreBackend.cc
endif
libos_a_CXXFLAGS = ${AM_CXXFLAGS} -I rocksdb/include -fPIC
endif
noinst_HEADERS += \
- os/btrfs_ioctl.h \
- os/chain_xattr.h \
os/bluestore/bluefs_types.h \
os/bluestore/bluestore_types.h \
os/bluestore/kv.h \
os/bluestore/BlueStore.h \
os/bluestore/FreelistManager.h \
os/bluestore/StupidAllocator.h \
+ os/btrfs_ioctl.h \
+ os/filestore/chain_xattr.h \
+ os/filestore/BtrfsFileStoreBackend.h \
+ os/filestore/CollectionIndex.h \
+ os/filestore/DBObjectMap.h \
+ os/filestore/FileJournal.h \
+ os/filestore/FileStore.h \
+ os/filestore/FDCache.h \
+ os/filestore/GenericFileStoreBackend.h \
+ os/filestore/HashIndex.h \
+ os/filestore/IndexManager.h \
+ os/filestore/Journal.h \
+ os/filestore/JournalingObjectStore.h \
+ os/filestore/LFNIndex.h \
+ os/filestore/SequencerPosition.h \
+ os/filestore/WBThrottle.h \
+ os/filestore/XfsFileStoreBackend.h \
+	os/filestore/ZFSFileStoreBackend.h \
os/kstore/kstore_types.h \
os/kstore/KStore.h \
os/kstore/kv.h \
- os/BtrfsFileStoreBackend.h \
- os/CollectionIndex.h \
- os/DBObjectMap.h \
os/GenericObjectMap.h \
- os/FileJournal.h \
- os/FileStore.h \
- os/FDCache.h \
os/fs/FS.h \
os/fs/XFS.h \
- os/GenericFileStoreBackend.h \
- os/HashIndex.h \
- os/IndexManager.h \
- os/Journal.h \
- os/JournalingObjectStore.h \
- os/LFNIndex.h \
os/MemStore.h \
os/KeyValueStore.h \
os/ObjectMap.h \
os/ObjectStore.h \
- os/PageSet.h \
- os/SequencerPosition.h \
- os/WBThrottle.h \
- os/XfsFileStoreBackend.h \
- os/ZFSFileStoreBackend.h
+ os/PageSet.h
if WITH_LIBZFS
libos_zfs_a_SOURCES = os/ZFS.cc
#ifndef OS_KEYVALUESTORE_H
#define OS_KEYVALUESTORE_H
-#include "IndexManager.h"
-#include "SequencerPosition.h"
#include <string>
#include <vector>
#include "include/memory.h"
#include "kv/KeyValueDB.h"
+#include "common/hobject.h"
+
+class SequencerPosition;
/**
* Encapsulates the FileStore key value store
#include "common/Formatter.h"
#include "common/safe_io.h"
-#include "FileStore.h"
+#include "filestore/FileStore.h"
#include "MemStore.h"
#include "KeyValueStore.h"
#if defined(HAVE_LIBAIO)
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef __CEPH_OS_SEQUENCERPOSITION_H
-#define __CEPH_OS_SEQUENCERPOSITION_H
-
-#include "include/types.h"
-#include "include/cmp.h"
-#include "include/encoding.h"
-#include "common/Formatter.h"
-
-#include <ostream>
-
-/**
- * transaction and op offset
- */
-struct SequencerPosition {
- uint64_t seq; ///< seq
- uint32_t trans; ///< transaction in that seq (0-based)
- uint32_t op; ///< op in that transaction (0-based)
-
- SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {}
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
- ::encode(seq, bl);
- ::encode(trans, bl);
- ::encode(op, bl);
- ENCODE_FINISH(bl);
- }
- void decode(bufferlist::iterator& p) {
- DECODE_START(1, p);
- ::decode(seq, p);
- ::decode(trans, p);
- ::decode(op, p);
- DECODE_FINISH(p);
- }
- void dump(Formatter *f) const {
- f->dump_unsigned("seq", seq);
- f->dump_unsigned("trans", trans);
- f->dump_unsigned("op", op);
- }
- static void generate_test_instances(list<SequencerPosition*>& o) {
- o.push_back(new SequencerPosition);
- o.push_back(new SequencerPosition(1, 2, 3));
- o.push_back(new SequencerPosition(4, 5, 6));
- }
-};
-WRITE_CLASS_ENCODER(SequencerPosition)
-
-inline ostream& operator<<(ostream& out, const SequencerPosition& t) {
- return out << t.seq << "." << t.trans << "." << t.op;
-}
-
-WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op)
-WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op)
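// Editorial usage sketch (not part of the original header): an encode/decode
// round trip relying only on the declarations above; the helper name is an
// illustrative assumption.
inline bool sequencer_position_roundtrip_example()
{
  SequencerPosition a(1, 2, 3), b;
  bufferlist bl;
  a.encode(bl);
  bufferlist::iterator p = bl.begin();
  b.decode(p);
  return a == b;  // operator== is provided by WRITE_EQ_OPERATORS_3 above
}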
-
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "acconfig.h"
-
-#include "os/WBThrottle.h"
-#include "common/perf_counters.h"
-
-WBThrottle::WBThrottle(CephContext *cct) :
- cur_ios(0), cur_size(0),
- cct(cct),
- logger(NULL),
- stopping(true),
- lock("WBThrottle::lock", false, true, false, cct),
- fs(XFS)
-{
- {
- Mutex::Locker l(lock);
- set_from_conf();
- }
- assert(cct);
- PerfCountersBuilder b(
- cct, string("WBThrottle"),
- l_wbthrottle_first, l_wbthrottle_last);
- b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data");
- b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data");
- b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations");
- b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations");
- b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write");
- b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries");
- logger = b.create_perf_counters();
- cct->get_perfcounters_collection()->add(logger);
- for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i)
- logger->set(i, 0);
-
- cct->_conf->add_observer(this);
-}
-
-WBThrottle::~WBThrottle() {
- assert(cct);
- cct->get_perfcounters_collection()->remove(logger);
- delete logger;
- cct->_conf->remove_observer(this);
-}
-
-void WBThrottle::start()
-{
- {
- Mutex::Locker l(lock);
- stopping = false;
- }
- create();
-}
-
-void WBThrottle::stop()
-{
- {
- Mutex::Locker l(lock);
- stopping = true;
- cond.Signal();
- }
-
- join();
-}
-
-const char** WBThrottle::get_tracked_conf_keys() const
-{
- static const char* KEYS[] = {
- "filestore_wbthrottle_btrfs_bytes_start_flusher",
- "filestore_wbthrottle_btrfs_bytes_hard_limit",
- "filestore_wbthrottle_btrfs_ios_start_flusher",
- "filestore_wbthrottle_btrfs_ios_hard_limit",
- "filestore_wbthrottle_btrfs_inodes_start_flusher",
- "filestore_wbthrottle_btrfs_inodes_hard_limit",
- "filestore_wbthrottle_xfs_bytes_start_flusher",
- "filestore_wbthrottle_xfs_bytes_hard_limit",
- "filestore_wbthrottle_xfs_ios_start_flusher",
- "filestore_wbthrottle_xfs_ios_hard_limit",
- "filestore_wbthrottle_xfs_inodes_start_flusher",
- "filestore_wbthrottle_xfs_inodes_hard_limit",
- NULL
- };
- return KEYS;
-}
-
-void WBThrottle::set_from_conf()
-{
- assert(lock.is_locked());
- if (fs == BTRFS) {
- size_limits.first =
- cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher;
- size_limits.second =
- cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit;
- io_limits.first =
- cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher;
- io_limits.second =
- cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit;
- fd_limits.first =
- cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher;
- fd_limits.second =
- cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit;
- } else if (fs == XFS) {
- size_limits.first =
- cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher;
- size_limits.second =
- cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit;
- io_limits.first =
- cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher;
- io_limits.second =
- cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit;
- fd_limits.first =
- cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher;
- fd_limits.second =
- cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit;
- } else {
- assert(0 == "invalid value for fs");
- }
- cond.Signal();
-}
-
-void WBThrottle::handle_conf_change(const md_config_t *conf,
- const std::set<std::string> &changed)
-{
- Mutex::Locker l(lock);
- for (const char** i = get_tracked_conf_keys(); *i; ++i) {
- if (changed.count(*i)) {
- set_from_conf();
- return;
- }
- }
-}
-
-bool WBThrottle::get_next_should_flush(
- boost::tuple<ghobject_t, FDRef, PendingWB> *next)
-{
- assert(lock.is_locked());
- assert(next);
- while (!stopping && !beyond_limit())
- cond.Wait(lock);
- if (stopping)
- return false;
- assert(!pending_wbs.empty());
- ghobject_t obj(pop_object());
-
- ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
- pending_wbs.find(obj);
- *next = boost::make_tuple(obj, i->second.second, i->second.first);
- pending_wbs.erase(i);
- return true;
-}
-
-
-void *WBThrottle::entry()
-{
- Mutex::Locker l(lock);
- boost::tuple<ghobject_t, FDRef, PendingWB> wb;
- while (get_next_should_flush(&wb)) {
- clearing = wb.get<0>();
- cur_ios -= wb.get<2>().ios;
- logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
- logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios);
- cur_size -= wb.get<2>().size;
- logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
- logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size);
- logger->dec(l_wbthrottle_inodes_dirtied);
- logger->inc(l_wbthrottle_inodes_wb);
- lock.Unlock();
-#ifdef HAVE_FDATASYNC
- ::fdatasync(**wb.get<1>());
-#else
- ::fsync(**wb.get<1>());
-#endif
-#ifdef HAVE_POSIX_FADVISE
- if (g_conf->filestore_fadvise && wb.get<2>().nocache) {
- int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
- assert(fa_r == 0);
- }
-#endif
- lock.Lock();
- clearing = ghobject_t();
- cond.Signal();
- wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
- }
- return 0;
-}
-
-void WBThrottle::queue_wb(
- FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len,
- bool nocache)
-{
- Mutex::Locker l(lock);
- ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
- pending_wbs.find(hoid);
- if (wbiter == pending_wbs.end()) {
- wbiter = pending_wbs.insert(
- make_pair(hoid,
- make_pair(
- PendingWB(),
- fd))).first;
- logger->inc(l_wbthrottle_inodes_dirtied);
- } else {
- remove_object(hoid);
- }
-
- cur_ios++;
- logger->inc(l_wbthrottle_ios_dirtied);
- cur_size += len;
- logger->inc(l_wbthrottle_bytes_dirtied, len);
-
- wbiter->second.first.add(nocache, len, 1);
- insert_object(hoid);
- if (beyond_limit())
- cond.Signal();
-}
-
-void WBThrottle::clear()
-{
- Mutex::Locker l(lock);
- for (ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
- pending_wbs.begin();
- i != pending_wbs.end();
- ++i) {
-#ifdef HAVE_POSIX_FADVISE
- if (g_conf->filestore_fadvise && i->second.first.nocache) {
- int fa_r = posix_fadvise(**i->second.second, 0, 0, POSIX_FADV_DONTNEED);
- assert(fa_r == 0);
- }
-#endif
-
- }
- cur_ios = cur_size = 0;
- logger->set(l_wbthrottle_ios_dirtied, 0);
- logger->set(l_wbthrottle_bytes_dirtied, 0);
- logger->set(l_wbthrottle_inodes_dirtied, 0);
- pending_wbs.clear();
- lru.clear();
- rev_lru.clear();
- cond.Signal();
-}
-
-void WBThrottle::clear_object(const ghobject_t &hoid)
-{
- Mutex::Locker l(lock);
- while (clearing == hoid)
- cond.Wait(lock);
- ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
- pending_wbs.find(hoid);
- if (i == pending_wbs.end())
- return;
-
- cur_ios -= i->second.first.ios;
- logger->dec(l_wbthrottle_ios_dirtied, i->second.first.ios);
- cur_size -= i->second.first.size;
- logger->dec(l_wbthrottle_bytes_dirtied, i->second.first.size);
- logger->dec(l_wbthrottle_inodes_dirtied);
-
- pending_wbs.erase(i);
- remove_object(hoid);
- cond.Signal();
-}
-
-void WBThrottle::throttle()
-{
- Mutex::Locker l(lock);
- while (!stopping && need_flush())
- cond.Wait(lock);
-}
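// Editorial note (not part of the original source): the expected lifecycle of
// this throttle, as suggested by the implementation above and the header that
// follows (the identity of the caller is an inference, not shown in this
// diff): set_fs() and start() once the backing filesystem is known;
// queue_wb() after each buffered write to charge bytes/ios/inodes;
// throttle() before admitting more work, which blocks while need_flush()
// (the hard limit) holds, whereas beyond_limit() (the start_flusher limit)
// merely wakes the background flusher thread; clear_object() when an object
// is removed, clear() around a full sync, and stop() on shutdown.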
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2013 Inktank Storage, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef WBTHROTTLE_H
-#define WBTHROTTLE_H
-
-#include "include/unordered_map.h"
-#include <boost/tuple/tuple.hpp>
-#include "include/memory.h"
-#include "common/Formatter.h"
-#include "common/hobject.h"
-#include "include/interval_set.h"
-#include "FDCache.h"
-#include "common/Thread.h"
-#include "common/ceph_context.h"
-
-class PerfCounters;
-enum {
- l_wbthrottle_first = 999090,
- l_wbthrottle_bytes_dirtied,
- l_wbthrottle_bytes_wb,
- l_wbthrottle_ios_dirtied,
- l_wbthrottle_ios_wb,
- l_wbthrottle_inodes_dirtied,
- l_wbthrottle_inodes_wb,
- l_wbthrottle_last
-};
-
-/**
- * WBThrottle
- *
- * Tracks, throttles, and flushes outstanding IO
- */
-class WBThrottle : Thread, public md_config_obs_t {
- ghobject_t clearing;
- /* *_limits.first is the start_flusher limit and
- * *_limits.second is the hard limit
- */
-
- /// Limits on unflushed bytes
- pair<uint64_t, uint64_t> size_limits;
-
- /// Limits on unflushed ios
- pair<uint64_t, uint64_t> io_limits;
-
- /// Limits on unflushed objects
- pair<uint64_t, uint64_t> fd_limits;
-
-  uint64_t cur_ios;  ///< Currently unflushed IOs
-  uint64_t cur_size; ///< Currently unflushed bytes
-
- /**
- * PendingWB tracks the ios pending on an object.
- */
- class PendingWB {
- public:
- bool nocache;
- uint64_t size;
- uint64_t ios;
- PendingWB() : nocache(true), size(0), ios(0) {}
- void add(bool _nocache, uint64_t _size, uint64_t _ios) {
- if (!_nocache)
- nocache = false; // only nocache if all writes are nocache
- size += _size;
- ios += _ios;
- }
- };
-
- CephContext *cct;
- PerfCounters *logger;
- bool stopping;
- Mutex lock;
- Cond cond;
-
-
- /**
- * Flush objects in lru order
- */
- list<ghobject_t> lru;
- ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator> rev_lru;
- void remove_object(const ghobject_t &oid) {
- assert(lock.is_locked());
- ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator>::iterator iter =
- rev_lru.find(oid);
- if (iter == rev_lru.end())
- return;
-
- lru.erase(iter->second);
- rev_lru.erase(iter);
- }
- ghobject_t pop_object() {
- assert(!lru.empty());
- ghobject_t oid(lru.front());
- lru.pop_front();
- rev_lru.erase(oid);
- return oid;
- }
- void insert_object(const ghobject_t &oid) {
- assert(rev_lru.find(oid) == rev_lru.end());
- lru.push_back(oid);
- rev_lru.insert(make_pair(oid, --lru.end()));
- }
-
- ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs;
-
- /// get next flush to perform
- bool get_next_should_flush(
- boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
- ); ///< @return false if we are shutting down
-public:
- enum FS {
- BTRFS,
- XFS
- };
-
-private:
- FS fs;
-
- void set_from_conf();
- bool beyond_limit() const {
- if (cur_ios < io_limits.first &&
- pending_wbs.size() < fd_limits.first &&
- cur_size < size_limits.first)
- return false;
- else
- return true;
- }
- bool need_flush() const {
- if (cur_ios < io_limits.second &&
- pending_wbs.size() < fd_limits.second &&
- cur_size < size_limits.second)
- return false;
- else
- return true;
- }
-
-public:
- WBThrottle(CephContext *cct);
- ~WBThrottle();
-
- void start();
- void stop();
- /// Set fs as XFS or BTRFS
- void set_fs(FS new_fs) {
- Mutex::Locker l(lock);
- fs = new_fs;
- set_from_conf();
- }
-
- /// Queue wb on oid, fd taking throttle (does not block)
- void queue_wb(
- FDRef fd, ///< [in] FDRef to oid
- const ghobject_t &oid, ///< [in] object
- uint64_t offset, ///< [in] offset written
- uint64_t len, ///< [in] length written
- bool nocache ///< [in] try to clear out of cache after write
- );
-
- /// Clear all wb (probably due to sync)
- void clear();
-
- /// Clear object
- void clear_object(const ghobject_t &oid);
-
- /// Block until there is throttle available
- void throttle();
-
- /// md_config_obs_t
- const char** get_tracked_conf_keys() const;
- void handle_conf_change(const md_config_t *conf,
- const std::set<std::string> &changed);
-
- /// Thread
- void *entry();
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 Inktank, Inc
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "XfsFileStoreBackend.h"
-
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <sys/utsname.h>
-
-#include <xfs/xfs.h>
-
-#include "common/errno.h"
-#include "common/linux_version.h"
-#include "include/assert.h"
-#include "include/compat.h"
-
-#define dout_subsys ceph_subsys_filestore
-#undef dout_prefix
-#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") "
-
-XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs):
- GenericFileStoreBackend(fs), m_has_extsize(false) { }
-
-/*
- * Set extsize attr on a file to val. This should be a free-standing
- * function, but dout_prefix expands to a call to the protected member
- * function get_basedir_path(), which prevents that.
- */
-int XfsFileStoreBackend::set_extsize(int fd, unsigned int val)
-{
- struct fsxattr fsx;
- struct stat sb;
- int ret;
-
- if (fstat(fd, &sb) < 0) {
- ret = -errno;
- dout(0) << "set_extsize: fstat: " << cpp_strerror(ret) << dendl;
- return ret;
- }
- if (!S_ISREG(sb.st_mode)) {
- dout(0) << "set_extsize: invalid target file type" << dendl;
- return -EINVAL;
- }
-
- if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) {
- ret = -errno;
- dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- // already set?
- if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val)
- return 0;
-
- // xfs won't change extent size if any extents are allocated
- if (fsx.fsx_nextents != 0)
- return 0;
-
- fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;
- fsx.fsx_extsize = val;
-
- if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) {
- ret = -errno;
- dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- return 0;
-}
-
-int XfsFileStoreBackend::detect_features()
-{
- int ret;
-
- ret = GenericFileStoreBackend::detect_features();
- if (ret < 0)
- return ret;
-
- // extsize?
- int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600);
- if (fd < 0) {
- ret = -errno;
- dout(0) << "detect_feature: failed to create test file for extsize attr: "
- << cpp_strerror(ret) << dendl;
- goto out;
- }
- if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) {
- ret = -errno;
- dout(0) << "detect_feature: failed to unlink test file for extsize attr: "
- << cpp_strerror(ret) << dendl;
- goto out_close;
- }
-
- if (g_conf->filestore_xfs_extsize) {
- ret = set_extsize(fd, 1U << 15); // a few pages
- if (ret) {
- ret = 0;
- dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl;
- goto out_close;
- }
-
- // make sure we have 3.5 or newer, which includes this fix
- // aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d
- // for this set_extsize bug
- // http://oss.sgi.com/bugzilla/show_bug.cgi?id=874
- int ver = get_linux_version();
- if (ver == 0) {
- dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl;
- m_has_extsize = false;
- } else if (ver < KERNEL_VERSION(3, 5, 0)) {
- dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl;
- m_has_extsize = false;
- } else {
- dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl;
- m_has_extsize = true;
- }
- } else {
- dout(0) << "detect_feature: extsize is disabled by conf" << dendl;
- }
-
-out_close:
- TEMP_FAILURE_RETRY(::close(fd));
-out:
- return ret;
-}
-
-int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint)
-{
- if (!m_has_extsize)
- return -EOPNOTSUPP;
-
- assert(hint < UINT_MAX);
- return set_extsize(fd, hint);
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 Inktank, Inc
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef CEPH_XFSFILESTOREBACKEND_H
-#define CEPH_XFSFILESTOREBACKEND_H
-
-#include "GenericFileStoreBackend.h"
-
-#include "include/int_types.h"
-
-class XfsFileStoreBackend : public GenericFileStoreBackend {
-private:
- bool m_has_extsize;
- int set_extsize(int fd, unsigned int val);
-public:
- XfsFileStoreBackend(FileStore *fs);
- ~XfsFileStoreBackend() {}
- const char *get_name() {
- return "xfs";
- }
- int detect_features();
- int set_alloc_hint(int fd, uint64_t hint);
-};
-
-#endif /* CEPH_XFSFILESTOREBACKEND_H */
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "include/int_types.h"
-#include "include/types.h"
-
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-
-#include "include/compat.h"
-#include "include/linux_fiemap.h"
-#include "include/color.h"
-#include "include/buffer.h"
-#include "include/assert.h"
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-
-#include "common/errno.h"
-#include "common/config.h"
-#include "common/sync_filesystem.h"
-
-#ifdef HAVE_LIBZFS
-
-#include "ZFSFileStoreBackend.h"
-
-#define dout_subsys ceph_subsys_filestore
-#undef dout_prefix
-#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") "
-
-ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs) :
- GenericFileStoreBackend(fs), base_zh(NULL), current_zh(NULL),
- m_filestore_zfs_snap(g_conf->filestore_zfs_snap)
-{
- int ret = zfs.init();
- if (ret < 0) {
- dout(0) << "ZFSFileStoreBackend: failed to init libzfs" << dendl;
- return;
- }
-
- base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM);
- if (!base_zh) {
- dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl;
- return;
- }
-
- update_current_zh();
-}
-
-ZFSFileStoreBackend::~ZFSFileStoreBackend()
-{
- if (base_zh)
- zfs.close(base_zh);
- if (current_zh)
- zfs.close(current_zh);
-}
-
-int ZFSFileStoreBackend::update_current_zh()
-{
- char path[PATH_MAX];
- snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
- ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM);
- if (zh) {
- char *mnt;
- if (zfs.is_mounted(zh, &mnt)) {
- int ret = get_current_path() == mnt;
- free(mnt);
- if (ret) {
- current_zh = zh;
- return 0;
- }
- } else {
- int ret = zfs.mount(zh, NULL, 0);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh)
- << "' got " << cpp_strerror(ret) << dendl;
- return ret;
- }
- }
- zfs.close(zh);
- } else {
- dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl;
- return -ENOENT;
- }
-
- zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM);
- if (zh) {
- if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) {
- current_zh = zh;
- return 0;
- }
- zfs.close(zh);
- dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl;
- } else {
- dout(0) << "update_current_zh: current/ not exist" << dendl;
- }
- return -ENOENT;
-}
-
-int ZFSFileStoreBackend::detect_features()
-{
- if (!current_zh)
- dout(0) << "detect_features: null zfs handle for current/" << dendl;
- return 0;
-}
-
-bool ZFSFileStoreBackend::can_checkpoint()
-{
- return m_filestore_zfs_snap && current_zh != NULL;
-}
-
-int ZFSFileStoreBackend::create_current()
-{
- struct stat st;
- int ret = ::stat(get_current_path().c_str(), &st);
- if (ret == 0) {
- // current/ exists
- if (!S_ISDIR(st.st_mode)) {
- dout(0) << "create_current: current/ exists but is not a directory" << dendl;
- return -ENOTDIR;
- }
- return 0;
- } else if (errno != ENOENT) {
- ret = -errno;
- dout(0) << "create_current: cannot stat current/ " << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- char path[PATH_MAX];
- snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
- ret = zfs.create(path, ZFS::TYPE_FILESYSTEM);
- if (ret < 0 && errno != EEXIST) {
- ret = -errno;
- dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- ret = update_current_zh();
- return ret;
-}
-
-static int list_checkpoints_callback(ZFS::Handle *zh, void *data)
-{
- list<string> *ls = static_cast<list<string> *>(data);
- string str = ZFS::get_name(zh);
- size_t pos = str.find('@');
- assert(pos != string::npos && pos + 1 != str.length());
- ls->push_back(str.substr(pos + 1));
- return 0;
-}
-
-int ZFSFileStoreBackend::list_checkpoints(list<string>& ls)
-{
- dout(10) << "list_checkpoints:" << dendl;
- if (!current_zh)
- return -EINVAL;
-
- list<string> snaps;
- int ret = zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &snaps);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(ret) << dendl;
- return ret;
- }
- ls.swap(snaps);
- return 0;
-}
-
-int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid)
-{
- dout(10) << "create_checkpoint: '" << name << "'" << dendl;
- if (!current_zh)
- return -EINVAL;
-
- // looks like zfsonlinux doesn't flush dirty data when taking a snapshot
- int ret = sync_filesystem(get_current_fd());
- if (ret < 0) {
- ret = -errno;
- dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- char path[PATH_MAX];
- snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
- ret = zfs.snapshot(path, false);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got" << cpp_strerror(ret) << dendl;
- return ret;
- }
- if (cid)
- *cid = 0;
- return 0;
-}
-
-int ZFSFileStoreBackend::rollback_to(const string& name)
-{
- dout(10) << "rollback_to: '" << name << "'" << dendl;
- if (!current_zh)
- return -EINVAL;
-
- // umount current to avoid triggering online rollback deadlock
- int ret;
- if (zfs.is_mounted(current_zh, NULL)) {
- ret = zfs.umount(current_zh, NULL, 0);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl;
- }
- }
-
- char path[PATH_MAX];
- snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
-
- ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT);
- if (!snap_zh) {
- dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl;
- return -ENOENT;
- }
-
- ret = zfs.rollback(current_zh, snap_zh, false);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl;
- }
-
- if (!zfs.is_mounted(current_zh, NULL)) {
- int ret = zfs.mount(current_zh, NULL, 0);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl;
- return ret;
- }
- }
-
- zfs.close(snap_zh);
- return ret;
-}
-
-int ZFSFileStoreBackend::destroy_checkpoint(const string& name)
-{
- dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
- if (!current_zh)
- return -EINVAL;
-
- int ret = zfs.destroy_snaps(current_zh, name.c_str(), true);
- if (ret < 0) {
- ret = -errno;
- dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl;
- }
- return ret;
-}
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_ZFSFILESTOREBACKEND_H
-#define CEPH_ZFSFILESTOREBACKEND_H
-
-#ifdef HAVE_LIBZFS
-#include "GenericFileStoreBackend.h"
-#include "ZFS.h"
-
-class ZFSFileStoreBackend : public GenericFileStoreBackend {
-private:
- ZFS zfs;
- ZFS::Handle *base_zh;
- ZFS::Handle *current_zh;
- bool m_filestore_zfs_snap;
- int update_current_zh();
-public:
- ZFSFileStoreBackend(FileStore *fs);
- ~ZFSFileStoreBackend();
- int detect_features();
- bool can_checkpoint();
- int create_current();
- int list_checkpoints(list<string>& ls);
- int create_checkpoint(const string& name, uint64_t *cid);
- int rollback_to(const string& name);
- int destroy_checkpoint(const string& name);
-};
-#endif
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "chain_xattr.h"
-
-#include "include/int_types.h"
-
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/file.h>
-#include <errno.h>
-#include <dirent.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include <stdio.h>
-#include "include/assert.h"
-
-#if defined(__linux__)
-#include <linux/fs.h>
-#endif
-
-#include "common/xattr.h"
-#include "include/compat.h"
-
-/*
- * chaining xattrs
- *
- * In order to support xattrs that are larger than the xattr size limit that some file systems
- * impose, we use multiple xattrs to store the value of a single xattr. The xattr keys
- * are set as follows:
- * The first xattr in the chain has a key that holds the original xattr name, with any '@' char
- * escaped ("@@").
- * The chained keys consist of the first xattr's key (with the escaping) plus a suffix "@<id>",
- * where <id> is the position of the xattr in the chain.
- */
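-
-/*
- * Illustrative example (not part of the original comment; sizes taken from
- * chain_xattr.h): a 6000-byte value stored under the name "user.ceph@x" is
- * written as three chunks of at most CHAIN_XATTR_MAX_BLOCK_LEN (2048) bytes
- * under the raw keys "user.ceph@@x", "user.ceph@@x@1" and "user.ceph@@x@2".
- */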
-
-static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
-{
- int pos = 0;
-
- while (*name) {
- switch (*name) {
- case '@': /* escape it */
- pos += 2;
- assert (pos < raw_len - 1);
- *raw_name = '@';
- raw_name++;
- *raw_name = '@';
- break;
- default:
- pos++;
- assert(pos < raw_len - 1);
- *raw_name = *name;
- break;
- }
- name++;
- raw_name++;
- }
-
- if (!i) {
- *raw_name = '\0';
- } else {
- int r = snprintf(raw_name, raw_len - pos, "@%d", i);
- assert(r < raw_len - pos);
- }
-}
-
-static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first)
-{
- int pos = 0;
-
- *is_first = true;
- while (*raw_name) {
- switch (*raw_name) {
- case '@': /* escape it */
- raw_name++;
- if (!*raw_name)
- break;
- if (*raw_name != '@') {
- *is_first = false;
- goto done;
- }
-
- /* fall through */
- default:
- *name = *raw_name;
- break;
- }
- pos++;
- assert(pos < name_len);
- name++;
- raw_name++;
- }
-done:
- *name = '\0';
- return pos;
-}
-
-
-// setxattr
-
-static int getxattr_len(const char *fn, const char *name)
-{
- int i = 0, total = 0;
- char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int r;
-
- do {
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- r = sys_getxattr(fn, raw_name, 0, 0);
- if (!i && r < 0)
- return r;
- if (r < 0)
- break;
- total += r;
- i++;
- } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
- r == CHAIN_XATTR_SHORT_BLOCK_LEN);
-
- return total;
-}
-
-int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
-{
- int i = 0, pos = 0;
- char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int ret = 0;
- int r;
- size_t chunk_size;
-
- if (!size)
- return getxattr_len(fn, name);
-
- do {
- chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
-
- r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
- if (i && r == -ENODATA) {
- ret = pos;
- break;
- }
- if (r < 0) {
- ret = r;
- break;
- }
-
- if (r > 0) {
- pos += r;
- size -= r;
- }
-
- i++;
- } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
- r == CHAIN_XATTR_SHORT_BLOCK_LEN));
-
- if (r >= 0) {
- ret = pos;
- /* is there another chunk? that can happen if the last chunk read
- spanned exactly one block */
- if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
- chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- r = sys_getxattr(fn, raw_name, 0, 0);
- if (r > 0) { // there's another chunk.. the original buffer was too small
- ret = -ERANGE;
- }
- }
- }
- return ret;
-}
-
-static int chain_fgetxattr_len(int fd, const char *name)
-{
- int i = 0, total = 0;
- char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int r;
-
- do {
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- r = sys_fgetxattr(fd, raw_name, 0, 0);
- if (!i && r < 0)
- return r;
- if (r < 0)
- break;
- total += r;
- i++;
- } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
- r == CHAIN_XATTR_SHORT_BLOCK_LEN);
-
- return total;
-}
-
-int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
-{
- int i = 0, pos = 0;
- char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int ret = 0;
- int r;
- size_t chunk_size;
-
- if (!size)
- return chain_fgetxattr_len(fd, name);
-
- do {
- chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
-
- r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
- if (i && r == -ENODATA) {
- ret = pos;
- break;
- }
- if (r < 0) {
- ret = r;
- break;
- }
-
- if (r > 0) {
- pos += r;
- size -= r;
- }
-
- i++;
- } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
- r == CHAIN_XATTR_SHORT_BLOCK_LEN));
-
- if (r >= 0) {
- ret = pos;
- /* is there another chunk? that can happen if the last chunk read
- spanned exactly one block */
- if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
- chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- r = sys_fgetxattr(fd, raw_name, 0, 0);
- if (r > 0) { // there's another chunk.. the original buffer was too small
- ret = -ERANGE;
- }
- }
- }
- return ret;
-}
-
-
-// setxattr
-
-static int get_xattr_block_size(size_t size)
-{
- if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD)
- // this may fit in the inode; stripe over short attrs so that XFS
- // won't kick it out.
- return CHAIN_XATTR_SHORT_BLOCK_LEN;
- return CHAIN_XATTR_MAX_BLOCK_LEN;
-}
-
-int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk)
-{
- int i = 0, pos = 0;
- char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int ret = 0;
- size_t max_chunk_size = get_xattr_block_size(size);
-
- do {
- size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- size -= chunk_size;
-
- int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size);
- if (r < 0) {
- ret = r;
- break;
- }
- pos += chunk_size;
- ret = pos;
- i++;
- } while (size);
-
- if (ret >= 0 && !onechunk) {
- int r;
- do {
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- r = sys_removexattr(fn, raw_name);
- if (r < 0 && r != -ENODATA)
- ret = r;
- i++;
- } while (r != -ENODATA);
- }
-
- return ret;
-}
-
-int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk)
-{
- int i = 0, pos = 0;
- char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int ret = 0;
- size_t max_chunk_size = get_xattr_block_size(size);
-
- do {
- size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- size -= chunk_size;
-
- int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size);
- if (r < 0) {
- ret = r;
- break;
- }
- pos += chunk_size;
- ret = pos;
- i++;
- } while (size);
-
- if (ret >= 0 && !onechunk) {
- int r;
- do {
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- r = sys_fremovexattr(fd, raw_name);
- if (r < 0 && r != -ENODATA)
- ret = r;
- i++;
- } while (r != -ENODATA);
- }
-
- return ret;
-}
-
-
-// removexattr
-
-int chain_removexattr(const char *fn, const char *name)
-{
- int i = 0;
- char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int r;
-
- do {
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- r = sys_removexattr(fn, raw_name);
- if (!i && r < 0) {
- return r;
- }
- i++;
- } while (r >= 0);
- return 0;
-}
-
-int chain_fremovexattr(int fd, const char *name)
-{
- int i = 0;
- char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int r;
-
- do {
- get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
- r = sys_fremovexattr(fd, raw_name);
- if (!i && r < 0) {
- return r;
- }
- i++;
- } while (r >= 0);
- return 0;
-}
-
-
-// listxattr
-
-int chain_listxattr(const char *fn, char *names, size_t len) {
- int r;
-
- if (!len)
- return sys_listxattr(fn, names, len) * 2;
-
- r = sys_listxattr(fn, 0, 0);
- if (r < 0)
- return r;
-
- size_t total_len = r * 2; // should be enough
- char *full_buf = (char *)malloc(total_len);
- if (!full_buf)
- return -ENOMEM;
-
- r = sys_listxattr(fn, full_buf, total_len);
- if (r < 0) {
- free(full_buf);
- return r;
- }
-
- char *p = full_buf;
- const char *end = full_buf + r;
- char *dest = names;
- char *dest_end = names + len;
-
- while (p < end) {
- char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int attr_len = strlen(p);
- bool is_first;
- int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
- if (is_first) {
- if (dest + name_len > dest_end) {
- r = -ERANGE;
- goto done;
- }
- strcpy(dest, name);
- dest += name_len + 1;
- }
- p += attr_len + 1;
- }
- r = dest - names;
-
-done:
- free(full_buf);
- return r;
-}
-
-int chain_flistxattr(int fd, char *names, size_t len) {
- int r;
- char *p;
- const char * end;
- char *dest;
- char *dest_end;
-
- if (!len)
- return sys_flistxattr(fd, names, len) * 2;
-
- r = sys_flistxattr(fd, 0, 0);
- if (r < 0)
- return r;
-
- size_t total_len = r * 2; // should be enough
- char *full_buf = (char *)malloc(total_len);
- if (!full_buf)
- return -ENOMEM;
-
- r = sys_flistxattr(fd, full_buf, total_len);
- if (r < 0)
- goto done;
-
- p = full_buf;
- end = full_buf + r;
- dest = names;
- dest_end = names + len;
-
- while (p < end) {
- char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
- int attr_len = strlen(p);
- bool is_first;
- int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
- if (is_first) {
- if (dest + name_len > dest_end) {
- r = -ERANGE;
- goto done;
- }
- strcpy(dest, name);
- dest += name_len + 1;
- }
- p += attr_len + 1;
- }
- r = dest - names;
-
-done:
- free(full_buf);
- return r;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef __CEPH_OSD_CHAIN_XATTR_H
-#define __CEPH_OSD_CHAIN_XATTR_H
-
-#include "common/xattr.h"
-
-#include <errno.h>
-
-#if defined(__linux__)
-#include <linux/limits.h>
-#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2)
-#elif defined(__APPLE__)
-#include <sys/xattr.h>
-#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2)
-#else
-#define CHAIN_XATTR_MAX_NAME_LEN 128
-#endif
-
-#define CHAIN_XATTR_MAX_BLOCK_LEN 2048
-
-/*
- * XFS will only inline xattrs < 255 bytes, so for xattrs that are
- * likely to fit in the inode, stripe over short xattrs.
- */
-#define CHAIN_XATTR_SHORT_BLOCK_LEN 250
-#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000
-
-// wrappers to hide annoying errno handling.
-
-static inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size)
-{
- int r = ::ceph_os_fgetxattr(fd, name, val, size);
- return (r < 0 ? -errno : r);
-}
-static inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size)
-{
- int r = ::ceph_os_getxattr(fn, name, val, size);
- return (r < 0 ? -errno : r);
-}
-
-static inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size)
-{
- int r = ::ceph_os_setxattr(fn, name, val, size);
- return (r < 0 ? -errno : r);
-}
-static inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size)
-{
- int r = ::ceph_os_fsetxattr(fd, name, val, size);
- return (r < 0 ? -errno : r);
-}
-
-static inline int sys_listxattr(const char *fn, char *names, size_t len)
-{
- int r = ::ceph_os_listxattr(fn, names, len);
- return (r < 0 ? -errno : r);
-}
-static inline int sys_flistxattr(int fd, char *names, size_t len)
-{
- int r = ::ceph_os_flistxattr(fd, names, len);
- return (r < 0 ? -errno : r);
-}
-
-static inline int sys_removexattr(const char *fn, const char *name)
-{
- int r = ::ceph_os_removexattr(fn, name);
- return (r < 0 ? -errno : r);
-}
-static inline int sys_fremovexattr(int fd, const char *name)
-{
- int r = ::ceph_os_fremovexattr(fd, name);
- return (r < 0 ? -errno : r);
-}
-
-
-// wrappers to chain large values across multiple xattrs
-
-int chain_getxattr(const char *fn, const char *name, void *val, size_t size);
-int chain_fgetxattr(int fd, const char *name, void *val, size_t size);
-int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk=false);
-int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk=false);
-int chain_listxattr(const char *fn, char *names, size_t len);
-int chain_flistxattr(int fd, char *names, size_t len);
-int chain_removexattr(const char *fn, const char *name);
-int chain_fremovexattr(int fd, const char *name);
-
-#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/assert.h"
+
+#ifndef __CYGWIN__
+#include "os/btrfs_ioctl.h"
+#endif
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "BtrfsFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+
+#if defined(__linux__)
+
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
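+// Illustrative values, not part of the original source: ALIGN_DOWN(10, 4) == 8,
+// ALIGN_UP(10, 4) == 12, and ALIGN_UP(8, 4) == 8 since 8 is already aligned.
+// clone_range() below relies on these to find a block-aligned middle range.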
+
+BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs):
+ GenericFileStoreBackend(fs), has_clone_range(false),
+ has_snap_create(false), has_snap_destroy(false),
+ has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
+ m_filestore_btrfs_clone_range(g_conf->filestore_btrfs_clone_range),
+ m_filestore_btrfs_snap (g_conf->filestore_btrfs_snap) { }
+
+int BtrfsFileStoreBackend::detect_features()
+{
+ int r;
+
+ r = GenericFileStoreBackend::detect_features();
+ if (r < 0)
+ return r;
+
+ // clone_range?
+ if (m_filestore_btrfs_clone_range) {
+ int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY, 0600);
+ if (fd >= 0) {
+ if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ btrfs_ioctl_clone_range_args clone_args;
+ memset(&clone_args, 0, sizeof(clone_args));
+ clone_args.src_fd = -1;
+ r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
+ if (r < 0 && errno == EBADF) {
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl;
+ has_clone_range = true;
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
+ }
+ TEMP_FAILURE_RETRY(::close(fd));
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ } else {
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
+ }
+
+ struct btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+
+ // create test source volume
+ vol_args.fd = 0;
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args);
+ if (r != 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+ int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY);
+ if (srcfd < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ // snap_create and snap_destroy?
+ vol_args.fd = srcfd;
+ strcpy(vol_args.name, "sync_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ int err = errno;
+ if (r == 0 || errno == EEXIST) {
+ dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl;
+ has_snap_create = true;
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r == 0) {
+ dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl;
+ has_snap_destroy = true;
+ } else {
+ err = -errno;
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+
+ if (err == -EPERM && getuid() != 0) {
+ dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
+ cerr << TEXT_YELLOW
+ << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
+ << TEXT_NORMAL << std::endl;
+ } else if (err == -EOPNOTSUPP) {
+ derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
+ }
+ }
+ } else {
+ dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap) {
+ if (has_snap_destroy)
+ stable_commits = true;
+ else
+ dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
+ }
+
+ // start_sync?
+ __u64 transid = 0;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid);
+ if (r < 0) {
+ int err = errno;
+ dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl;
+ }
+ if (r == 0 && transid > 0) {
+ dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl;
+
+ // do we have wait_sync too?
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+ if (r == 0 || errno == ERANGE) {
+ dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl;
+ has_wait_sync = true;
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+
+ if (has_wait_sync) {
+ // async snap creation?
+ struct btrfs_ioctl_vol_args_v2 async_args;
+ memset(&async_args, 0, sizeof(async_args));
+ async_args.fd = srcfd;
+ async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+ strcpy(async_args.name, "async_snap_test");
+
+ // remove old one, first
+ struct stat st;
+ strcpy(vol_args.name, async_args.name);
+ if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) {
+ dout(0) << "detect_feature: removing old async_snap_test" << dendl;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+ if (r == 0 || errno == EEXIST) {
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl;
+ has_snap_create_v2 = true;
+
+ // clean up
+ strcpy(vol_args.name, "async_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ // clean up test subvol
+ if (srcfd >= 0)
+ TEMP_FAILURE_RETRY(::close(srcfd));
+
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap && !has_snap_create_v2) {
+ dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
+ << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
+ << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
+ << " performance.\n"
+ << TEXT_NORMAL;
+ }
+
+ return 0;
+}
+
+bool BtrfsFileStoreBackend::can_checkpoint()
+{
+ return stable_commits;
+}
+
+int BtrfsFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+ return -EINVAL;
+ }
+
+ struct stat basest;
+ struct statfs currentfs;
+ ret = ::fstat(get_basedir_fd(), &basest);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = ::statfs(get_current_path().c_str(), &currentfs);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) {
+ dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl;
+ stable_commits = true;
+ }
+ return 0;
+ }
+
+ struct btrfs_ioctl_vol_args volargs;
+ memset(&volargs, 0, sizeof(volargs));
+
+ volargs.fd = 0;
+ strcpy(volargs.name, "current");
+ if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) {
+ ret = -errno;
+ dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl;
+ if (::chmod(get_current_path().c_str(), 0755) < 0) {
+ ret = -errno;
+ dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ stable_commits = true;
+ return 0;
+}
+
+int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+ int ret, err = 0;
+
+ struct stat basest;
+ ret = ::fstat(get_basedir_fd(), &basest);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // get snap list
+ DIR *dir = ::opendir(get_basedir_path().c_str());
+ if (!dir) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ list<string> snaps;
+ char path[PATH_MAX];
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ struct dirent *de;
+ while (::readdir_r(dir, (struct dirent *)&buf, &de) == 0) {
+ if (!de)
+ break;
+
+ snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name);
+
+ struct stat st;
+ ret = ::stat(path, &st);
+ if (ret < 0) {
+ err = -errno;
+ dout(0) << "list_checkpoints: stat '" << path << "' failed: "
+ << cpp_strerror(err) << dendl;
+ break;
+ }
+
+ if (!S_ISDIR(st.st_mode))
+ continue;
+
+ struct statfs fs;
+ ret = ::statfs(path, &fs);
+ if (ret < 0) {
+ err = -errno;
+ dout(0) << "list_checkpoints: statfs '" << path << "' failed: "
+ << cpp_strerror(err) << dendl;
+ break;
+ }
+
+ if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev)
+ snaps.push_back(string(de->d_name));
+ }
+
+ if (::closedir(dir) < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl;
+ if (!err)
+ err = ret;
+ }
+
+ if (err)
+ return err;
+
+ ls.swap(snaps);
+ return 0;
+}
+
+int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid)
+{
+ dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+ if (has_snap_create_v2 && transid) {
+ struct btrfs_ioctl_vol_args_v2 async_args;
+ memset(&async_args, 0, sizeof(async_args));
+ async_args.fd = get_current_fd();
+ async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+
+ size_t name_size = sizeof(async_args.name);
+ strncpy(async_args.name, name.c_str(), name_size);
+ async_args.name[name_size-1] = '\0';
+
+ int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl;
+ *transid = async_args.transid;
+ } else {
+ struct btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = get_current_fd();
+
+ size_t name_size = sizeof(vol_args.name);
+ strncpy(vol_args.name, name.c_str(), name_size);
+ vol_args.name[name_size-1] = '\0';
+
+ int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (transid)
+ *transid = 0;
+ }
+ return 0;
+}
+
+int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid)
+{
+ // wait for commit
+ dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl;
+ int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl;
+ return 0;
+}
+
+int BtrfsFileStoreBackend::rollback_to(const string& name)
+{
+ dout(10) << "rollback_to: to '" << name << "'" << dendl;
+ char s[PATH_MAX];
+ btrfs_ioctl_vol_args vol_args;
+
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = 0;
+ strcpy(vol_args.name, "current");
+
+ int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (ret && errno != ENOENT) {
+ dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl;
+ snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
+ if (::rename(get_current_path().c_str(), s)) {
+ ret = -errno;
+ dout(0) << "rollback_to: error renaming old current subvol: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+
+ snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str());
+
+ // roll back
+ vol_args.fd = ::open(s, O_RDONLY);
+ if (vol_args.fd < 0) {
+ ret = -errno;
+ dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ if (ret < 0 ) {
+ ret = -errno;
+ dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl;
+ }
+ TEMP_FAILURE_RETRY(::close(vol_args.fd));
+ return ret;
+}
+
+int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
+{
+ dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+ btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = 0;
+ strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name) - 1);
+ vol_args.name[sizeof(vol_args.name) - 1] = '\0';
+
+ int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (ret) {
+ ret = -errno;
+ dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int BtrfsFileStoreBackend::syncfs()
+{
+ dout(15) << "syncfs" << dendl;
+ // do a full btrfs commit
+ int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl;
+ }
+ return ret;
+}
+
+int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl;
+ size_t blk_size = get_blksize();
+ if (!has_clone_range ||
+ srcoff % blk_size != dstoff % blk_size) {
+ dout(20) << "clone_range: using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
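+ // Offsets now share the same position within a block, so the middle of the
+ // range can be cloned and only the unaligned head/tail need a byte copy.
+ // Illustrative example (hypothetical 4096-byte blocks, not from the original
+ // source): srcoff=100, dstoff=4196, len=10000 -> copy head 100~3996, clone
+ // the aligned middle 4096~4096 to offset 8192, then copy the remaining tail.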
+
+ int err = 0;
+ int r = 0;
+
+ uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size);
+ uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size);
+ if (srcoffclone >= srcoff + len) {
+ dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ uint64_t lenclone = len - (srcoffclone - srcoff);
+ if (!ALIGNED(lenclone, blk_size)) {
+ struct stat from_stat, to_stat;
+ err = ::fstat(from, &from_stat);
+ if (err) return -errno;
+ err = ::fstat(to , &to_stat);
+ if (err) return -errno;
+
+ if (srcoff + len != (uint64_t)from_stat.st_size ||
+ dstoff + len < (uint64_t)to_stat.st_size) {
+ // Not to the end of the file, need to align length as well
+ lenclone = ALIGN_DOWN(lenclone, blk_size);
+ }
+ }
+ if (lenclone == 0) {
+ // too short
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone
+ << " to " << dstoffclone << " = " << r << dendl;
+ btrfs_ioctl_clone_range_args a;
+ a.src_fd = from;
+ a.src_offset = srcoffclone;
+ a.src_length = lenclone;
+ a.dest_offset = dstoffclone;
+ err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
+ if (err >= 0) {
+ r += err;
+ } else if (errno == EINVAL) {
+ // Still failed, might be compressed
+ dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ } else {
+ return -errno;
+ }
+
+ // Take care of any part trimmed from the front
+ if (srcoffclone != srcoff) {
+ err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+
+ // Copy end
+ if (srcoffclone + lenclone != srcoff + len) {
+ err = _copy_range(from, to,
+ srcoffclone + lenclone,
+ (srcoff + len) - (srcoffclone + lenclone),
+ dstoffclone + lenclone);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+ dout(20) << "clone_range: finished " << srcoff << "~" << len
+ << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BTRFSFILESTOREBACKEND_H
+#define CEPH_BTRFSFILESTOREBACKEND_H
+
+#if defined(__linux__)
+#include "GenericFileStoreBackend.h"
+
+class BtrfsFileStoreBackend : public GenericFileStoreBackend {
+private:
+ bool has_clone_range; ///< clone range ioctl is supported
+ bool has_snap_create; ///< snap create ioctl is supported
+ bool has_snap_destroy; ///< snap destroy ioctl is supported
+ bool has_snap_create_v2; ///< snap create v2 ioctl (async!) is supported
+ bool has_wait_sync; ///< wait sync ioctl is supported
+ bool stable_commits;
+ bool m_filestore_btrfs_clone_range;
+ bool m_filestore_btrfs_snap;
+public:
+ BtrfsFileStoreBackend(FileStore *fs);
+ ~BtrfsFileStoreBackend() {}
+ const char *get_name() {
+ return "btrfs";
+ }
+ int detect_features();
+ bool can_checkpoint();
+ int create_current();
+ int list_checkpoints(list<string>& ls);
+ int create_checkpoint(const string& name, uint64_t *cid);
+ int sync_checkpoint(uint64_t cid);
+ int rollback_to(const string& name);
+ int destroy_checkpoint(const string& name);
+ int syncfs();
+ int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+};
+#endif
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef OS_COLLECTIONINDEX_H
+#define OS_COLLECTIONINDEX_H
+
+#include <string>
+#include <vector>
+#include "include/memory.h"
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/RWLock.h"
+
+/**
+ * CollectionIndex provides an interface for manipulating indexed collections
+ */
+class CollectionIndex {
+protected:
+ /**
+ * Object encapsulating a returned path.
+ *
+ * A path to an object (existent or non-existent) becomes invalid
+ * when a different object is created in the index. Path stores
+ * a pointer to its parent CollectionIndex and the parent's coll_t
+ * so the collection can be identified for the Path's lifetime.
+ * @see IndexManager
+ * @see self_ref
+ * @see set_ref
+ */
+ class Path {
+ public:
+ /// Returned path
+ string full_path;
+ /// Ref to parent Index
+ CollectionIndex* parent_ref;
+ /// coll_t for parent Index
+ coll_t parent_coll;
+
+ /// Normal Constructor
+ Path(
+ string path, ///< [in] Path to return.
+ CollectionIndex* ref)
+ : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {}
+
+ /// Debugging Constructor
+ Path(
+ string path, ///< [in] Path to return.
+ coll_t coll) ///< [in] collection
+ : full_path(path), parent_coll(coll) {}
+
+ /// Getter for the stored path.
+ const char *path() const { return full_path.c_str(); }
+
+ /// Getter for collection
+ coll_t coll() const { return parent_coll; }
+
+ /// Getter for parent
+ CollectionIndex* get_index() const {
+ return parent_ref;
+ }
+ };
+ public:
+
+ string access_lock_name;
+ RWLock access_lock;
+ /// Type of returned paths
+ typedef ceph::shared_ptr<Path> IndexedPath;
+
+ static IndexedPath get_testing_path(string path, coll_t collection) {
+ return IndexedPath(new Path(path, collection));
+ }
+
+ static const uint32_t FLAT_INDEX_TAG = 0;
+ static const uint32_t HASH_INDEX_TAG = 1;
+ static const uint32_t HASH_INDEX_TAG_2 = 2;
+ static const uint32_t HOBJECT_WITH_POOL = 3;
+ /**
+ * For tracking Filestore collection versions.
+ *
+ * @return Collection version represented by the Index implementation
+ */
+ virtual uint32_t collection_version() = 0;
+
+ /**
+ * Returns the collection managed by this CollectionIndex
+ */
+ virtual coll_t coll() const = 0;
+
+
+ /**
+ * Initializes the index.
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int init() = 0;
+
+ /**
+ * Cleanup before replaying journal
+ *
+ * Index implementations may need to perform compound operations
+ * which may leave the collection unstable if interrupted. cleanup
+ * is called on mount to allow the CollectionIndex implementation
+ * to stabilize.
+ *
+ * @see HashIndex
+ * @return Error Code, 0 for success
+ */
+ virtual int cleanup() = 0;
+
+ /**
+ * Call when a file is created using a path returned from lookup.
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int created(
+ const ghobject_t &oid, ///< [in] Created object.
+ const char *path ///< [in] Path to created object.
+ ) = 0;
+
+ /**
+ * Removes oid from the collection
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int unlink(
+ const ghobject_t &oid ///< [in] Object to remove
+ ) = 0;
+
+ /**
+ * Gets the IndexedPath for oid.
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int lookup(
+ const ghobject_t &oid, ///< [in] Object to lookup
+ IndexedPath *path, ///< [out] Path to object
+ int *hardlink ///< [out] number of hard links of this object; *hardlink == 0 means the object does not exist.
+ ) = 0;
+
+ /**
+ * Moves objects matching @e match in the lsb @e bits
+ *
+ * dest and this must be the same subclass
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int split(
+ uint32_t match, ///< [in] value to match
+ uint32_t bits, ///< [in] bits to check
+ CollectionIndex* dest ///< [in] destination index
+ ) { assert(0); return 0; }
+
+
+ /// List contents of collection by hash
+ virtual int collection_list_partial(
+ const ghobject_t &start, ///< [in] object at which to start
+ const ghobject_t &end, ///< [in] list only objects < end
+ bool sort_bitwise, ///< [in] use bitwise sort
+ int max_count, ///< [in] return at most max_count objects
+ vector<ghobject_t> *ls, ///< [out] Listed objects
+ ghobject_t *next ///< [out] Next object to list
+ ) = 0;
+
+ /// Call prior to removing directory
+ virtual int prep_delete() { return 0; }
+
+ CollectionIndex(coll_t collection):
+ access_lock_name ("CollectionIndex::access_lock::" + collection.to_str()),
+ access_lock(access_lock_name.c_str()) {}
+
+ /*
+ * Pre-hash the collection, this collection should map to a PG folder.
+ *
+ * @param pg_num - pg number of the pool this collection belongs to.
+ * @param expected_num_objs - expected number of objects in this collection.
+ * @return 0 on success, an error code otherwise.
+ */
+ virtual int pre_hash_collection(
+ uint32_t pg_num, ///< [in] pg number of the pool this collection belongs to
+ uint64_t expected_num_objs ///< [in] expected number of objects this collection has
+ ) { assert(0); return 0; }
+
+ /// Virtual destructor
+ virtual ~CollectionIndex() {}
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+#include <iostream>
+#include <set>
+#include <map>
+#include <string>
+#include "include/memory.h"
+#include <vector>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "DBObjectMap.h"
+#include <errno.h>
+
+#include "common/debug.h"
+#include "common/config.h"
+#include "include/assert.h"
+
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore "
+
+const string DBObjectMap::USER_PREFIX = "_USER_";
+const string DBObjectMap::XATTR_PREFIX = "_AXATTR_";
+const string DBObjectMap::SYS_PREFIX = "_SYS_";
+const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_";
+const string DBObjectMap::HEADER_KEY = "HEADER";
+const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER";
+const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER";
+const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_";
+
+// Legacy
+const string DBObjectMap::LEAF_PREFIX = "_LEAF_";
+const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_";
+
+static void append_escaped(const string &in, string *out)
+{
+ for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
+ if (*i == '%') {
+ out->push_back('%');
+ out->push_back('p');
+ } else if (*i == '.') {
+ out->push_back('%');
+ out->push_back('e');
+ } else if (*i == '_') {
+ out->push_back('%');
+ out->push_back('u');
+ } else {
+ out->push_back(*i);
+ }
+ }
+}
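+// Illustrative example of the escaping above (not from the original source):
+// an object name "rbd_data.1%2" is stored as "rbd%udata%e1%p2", so a literal
+// '.' never appears inside a component and can safely delimit the fields
+// assembled in ghobject_key() below.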
+
+bool DBObjectMap::check(std::ostream &out)
+{
+ bool retval = true;
+ map<uint64_t, uint64_t> parent_to_num_children;
+ map<uint64_t, uint64_t> parent_to_actual_num_children;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ _Header header;
+ assert(header.num_children == 1);
+ header.num_children = 0; // Hack for leaf node
+ bufferlist bl = iter->value();
+ while (true) {
+ bufferlist::iterator bliter = bl.begin();
+ header.decode(bliter);
+ if (header.seq != 0)
+ parent_to_actual_num_children[header.seq] = header.num_children;
+ if (header.parent == 0)
+ break;
+
+ if (!parent_to_num_children.count(header.parent))
+ parent_to_num_children[header.parent] = 0;
+ parent_to_num_children[header.parent]++;
+ if (parent_to_actual_num_children.count(header.parent))
+ break;
+
+ set<string> to_get;
+ map<string, bufferlist> got;
+ to_get.insert(HEADER_KEY);
+ db->get(sys_parent_prefix(header), to_get, &got);
+ if (got.empty()) {
+ out << "Missing: seq " << header.parent << std::endl;
+ retval = false;
+ break;
+ } else {
+ bl = got.begin()->second;
+ }
+ }
+ }
+
+ for (map<uint64_t, uint64_t>::iterator i = parent_to_num_children.begin();
+ i != parent_to_num_children.end();
+ parent_to_num_children.erase(i++)) {
+ if (!parent_to_actual_num_children.count(i->first))
+ continue;
+ if (parent_to_actual_num_children[i->first] != i->second) {
+ out << "Invalid: seq " << i->first << " recorded children: "
+ << parent_to_actual_num_children[i->first] << " found: "
+ << i->second << std::endl;
+ retval = false;
+ }
+ parent_to_actual_num_children.erase(i->first);
+ }
+ return retval;
+}
+
+string DBObjectMap::ghobject_key(const ghobject_t &oid)
+{
+ string out;
+ append_escaped(oid.hobj.oid.name, &out);
+ out.push_back('.');
+ append_escaped(oid.hobj.get_key(), &out);
+ out.push_back('.');
+ append_escaped(oid.hobj.nspace, &out);
+ out.push_back('.');
+
+ char snap_with_hash[1000];
+ char *t = snap_with_hash;
+ char *end = t + sizeof(snap_with_hash);
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "snapdir");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+
+ if (oid.hobj.pool == -1)
+ t += snprintf(t, end - t, ".none");
+ else
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool);
+ t += snprintf(t, end - t, ".%.*X", (int)(sizeof(uint32_t)*2), oid.hobj.get_hash());
+
+ if (oid.generation != ghobject_t::NO_GEN ||
+ oid.shard_id != shard_id_t::NO_SHARD) {
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation);
+ t += snprintf(t, end - t, ".%x", (int)oid.shard_id);
+ }
+ out += string(snap_with_hash);
+ return out;
+}
+
+// ok: pglog%u3%efs1...0.none.0017B237
+// bad: plana8923501-10...4c.3.ffffffffffffffff.2
+// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2
+// returns 0 for false, 1 for true, negative for error
+int DBObjectMap::is_buggy_ghobject_key_v1(const string &in)
+{
+ int dots = 5; // skip 5 .'s
+ const char *s = in.c_str();
+ do {
+ while (*s && *s != '.')
+ ++s;
+ if (!*s) {
+ derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
+ return -EINVAL;
+ }
+ ++s;
+ } while (*s && --dots);
+ if (!*s) {
+ derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
+ return -EINVAL;
+ }
+ // we are now either at a hash value (32 bits, 8 chars) or at a generation
+ // value (64 bits) followed by '.' and a shard id; measure the length up to
+ // the next '.' (or the end of the string) to tell them apart.
+ int len = 0;
+ while (*s && *s != '.') {
+ ++s;
+ ++len;
+ }
+ if (*s == '\0') {
+ if (len != 8) {
+ derr << "hash value is not 8 chars" << dendl;
+ return -EINVAL; // the hash value is always 8 chars.
+ }
+ return 0;
+ }
+ if (*s != '.') { // the shard follows.
+ derr << "missing final . and shard id at " << (int)(s-in.c_str()) << dendl;
+ return -EINVAL;
+ }
+ return 1;
+}
+
+
+string DBObjectMap::map_header_key(const ghobject_t &oid)
+{
+ return ghobject_key(oid);
+}
+
+string DBObjectMap::header_key(uint64_t seq)
+{
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq);
+ return string(buf);
+}
+
+string DBObjectMap::complete_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX;
+}
+
+string DBObjectMap::user_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + USER_PREFIX;
+}
+
+string DBObjectMap::sys_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + SYS_PREFIX;
+}
+
+string DBObjectMap::xattr_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + XATTR_PREFIX;
+}
+
+string DBObjectMap::sys_parent_prefix(_Header header)
+{
+ return USER_PREFIX + header_key(header.parent) + SYS_PREFIX;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::init()
+{
+ invalid = false;
+ if (ready) {
+ return 0;
+ }
+ assert(!parent_iter);
+ if (header->parent) {
+ Header parent = map->lookup_parent(header);
+ if (!parent) {
+ assert(0);
+ return -EINVAL;
+ }
+ parent_iter.reset(new DBObjectMapIteratorImpl(map, parent));
+ }
+ key_iter = map->db->get_iterator(map->user_prefix(header));
+ assert(key_iter);
+ complete_iter = map->db->get_iterator(map->complete_prefix(header));
+ assert(complete_iter);
+ cur_iter = key_iter;
+ assert(cur_iter);
+ ready = true;
+ return 0;
+}
+
+ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(
+ const ghobject_t &oid)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return ObjectMapIterator(new EmptyIteratorImpl());
+ DBObjectMapIterator iter = _get_iterator(header);
+ iter->hlock.swap(hl);
+ return iter;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first()
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->seek_to_first();
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->seek_to_first();
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last()
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->seek_to_last();
+ if (r < 0)
+ return r;
+ if (parent_iter->valid())
+ r = parent_iter->next();
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->seek_to_last();
+ if (r < 0)
+ return r;
+ if (key_iter->valid())
+ r = key_iter->next();
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to)
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->lower_bound(to);
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->lower_bound(to);
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after)
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->upper_bound(after);
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->upper_bound(after);
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+bool DBObjectMap::DBObjectMapIteratorImpl::valid()
+{
+ bool valid = !invalid && ready;
+ assert(!valid || cur_iter->valid());
+ return valid;
+}
+
+bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent()
+{
+ if (parent_iter && parent_iter->valid() &&
+ (!key_iter->valid() || key_iter->key() > parent_iter->key()))
+ return true;
+ return false;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::next(bool validate)
+{
+ assert(cur_iter->valid());
+ assert(valid());
+ cur_iter->next();
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::next_parent()
+{
+ if (!parent_iter || !parent_iter->valid()) {
+ invalid = true;
+ return 0;
+ }
+ r = next();
+ if (r < 0)
+ return r;
+ if (!valid() || on_parent() || !parent_iter->valid())
+ return 0;
+
+ return lower_bound(parent_iter->key());
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test,
+ string *begin,
+ string *end)
+{
+ complete_iter->upper_bound(to_test);
+ if (complete_iter->valid())
+ complete_iter->prev();
+ else
+ complete_iter->seek_to_last();
+
+ if (!complete_iter->valid())
+ return false;
+
+ string _end;
+ if (begin)
+ *begin = complete_iter->key();
+ _end = string(complete_iter->value().c_str());
+ if (end)
+ *end = _end;
+ return (to_test >= complete_iter->key()) && (!_end.size() || _end > to_test);
+}
+
+/**
+ * Moves parent_iter to the next position both out of the complete_region and
+ * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and
+ * less than key_iter and key_iter otherwise.
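+ *
+ * Illustrative walk-through (hypothetical keys, not drawn from a real store):
+ * the parent holds {a, b, c}; this node holds {b} plus a complete entry
+ * c->"" (this node is authoritative for every key >= "c"). adjust() skips
+ * the parent's "b" (shadowed by key_iter) and its "c" (inside the complete
+ * region), so iteration yields "a" from the parent followed by "b" from
+ * this node.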
+ */
+int DBObjectMap::DBObjectMapIteratorImpl::adjust()
+{
+ string begin, end;
+ while (parent_iter && parent_iter->valid()) {
+ if (in_complete_region(parent_iter->key(), &begin, &end)) {
+ if (end.size() == 0) {
+ parent_iter->seek_to_last();
+ if (parent_iter->valid())
+ parent_iter->next();
+ } else
+ parent_iter->lower_bound(end);
+ } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) {
+ parent_iter->next();
+ } else {
+ break;
+ }
+ }
+ if (valid_parent()) {
+ cur_iter = parent_iter;
+ } else if (key_iter->valid()) {
+ cur_iter = key_iter;
+ } else {
+ invalid = true;
+ }
+ assert(invalid || cur_iter->valid());
+ return 0;
+}
+
+
+string DBObjectMap::DBObjectMapIteratorImpl::key()
+{
+ return cur_iter->key();
+}
+
+bufferlist DBObjectMap::DBObjectMapIteratorImpl::value()
+{
+ return cur_iter->value();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::status()
+{
+ return r;
+}
+
+int DBObjectMap::set_keys(const ghobject_t &oid,
+ const map<string, bufferlist> &set,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_create_map_header(hl, oid, t);
+ if (!header)
+ return -EINVAL;
+ if (check_spos(oid, header, spos))
+ return 0;
+
+ t->set(user_prefix(header), set);
+
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::set_header(const ghobject_t &oid,
+ const bufferlist &bl,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_create_map_header(hl, oid, t);
+ if (!header)
+ return -EINVAL;
+ if (check_spos(oid, header, spos))
+ return 0;
+ _set_header(header, bl, t);
+ return db->submit_transaction(t);
+}
+
+void DBObjectMap::_set_header(Header header, const bufferlist &bl,
+ KeyValueDB::Transaction t)
+{
+ map<string, bufferlist> to_set;
+ to_set[USER_HEADER_KEY] = bl;
+ t->set(sys_prefix(header), to_set);
+}
+
+int DBObjectMap::get_header(const ghobject_t &oid,
+ bufferlist *bl)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header) {
+ return 0;
+ }
+ return _get_header(header, bl);
+}
+
+int DBObjectMap::_get_header(Header header,
+ bufferlist *bl)
+{
+ map<string, bufferlist> out;
+ while (true) {
+ out.clear();
+ set<string> to_get;
+ to_get.insert(USER_HEADER_KEY);
+ int r = db->get(sys_prefix(header), to_get, &out);
+ if (r == 0 && !out.empty())
+ break;
+ if (r < 0)
+ return r;
+ Header current(header);
+ if (!current->parent)
+ break;
+ header = lookup_parent(current);
+ }
+
+ if (!out.empty())
+ bl->swap(out.begin()->second);
+ return 0;
+}
+
+int DBObjectMap::clear(const ghobject_t &oid,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+ remove_map_header(hl, oid, header, t);
+ assert(header->num_children > 0);
+ header->num_children--;
+ int r = _clear(header, t);
+ if (r < 0)
+ return r;
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::_clear(Header header,
+ KeyValueDB::Transaction t)
+{
+ while (1) {
+ if (header->num_children) {
+ set_header(header, t);
+ break;
+ }
+ clear_header(header, t);
+ if (!header->parent)
+ break;
+ Header parent = lookup_parent(header);
+ if (!parent) {
+ return -EINVAL;
+ }
+ assert(parent->num_children > 0);
+ parent->num_children--;
+ header.swap(parent);
+ }
+ return 0;
+}
+
+int DBObjectMap::merge_new_complete(Header header,
+ const map<string, string> &new_complete,
+ DBObjectMapIterator iter,
+ KeyValueDB::Transaction t)
+{
+ KeyValueDB::Iterator complete_iter = db->get_iterator(
+ complete_prefix(header)
+ );
+ map<string, string>::const_iterator i = new_complete.begin();
+ set<string> to_remove;
+ map<string, bufferlist> to_add;
+
+ string begin, end;
+ while (i != new_complete.end()) {
+ string new_begin = i->first;
+ string new_end = i->second;
+ int r = iter->in_complete_region(new_begin, &begin, &end);
+ if (r < 0)
+ return r;
+ if (r) {
+ to_remove.insert(begin);
+ new_begin = begin;
+ }
+ ++i;
+ while (i != new_complete.end()) {
+ if (!new_end.size() || i->first <= new_end) {
+ if (!new_end.size() && i->second > new_end) {
+ new_end = i->second;
+ }
+ ++i;
+ continue;
+ }
+
+ r = iter->in_complete_region(new_end, &begin, &end);
+ if (r < 0)
+ return r;
+ if (r) {
+ to_remove.insert(begin);
+ new_end = end;
+ continue;
+ }
+ break;
+ }
+ bufferlist bl;
+ bl.append(bufferptr(new_end.c_str(), new_end.size() + 1));
+ to_add.insert(make_pair(new_begin, bl));
+ }
+ t->rmkeys(complete_prefix(header), to_remove);
+ t->set(complete_prefix(header), to_add);
+ return 0;
+}
+
+int DBObjectMap::copy_up_header(Header header,
+ KeyValueDB::Transaction t)
+{
+ bufferlist bl;
+ int r = _get_header(header, &bl);
+ if (r < 0)
+ return r;
+
+ _set_header(header, bl, t);
+ return 0;
+}
+
+int DBObjectMap::need_parent(DBObjectMapIterator iter)
+{
+ int r = iter->seek_to_first();
+ if (r < 0)
+ return r;
+
+ if (!iter->valid())
+ return 0;
+
+ string begin, end;
+ if (iter->in_complete_region(iter->key(), &begin, &end) && end == "") {
+ return 0;
+ }
+ return 1;
+}
+
+int DBObjectMap::rm_keys(const ghobject_t &oid,
+ const set<string> &to_clear,
+ const SequencerPosition *spos)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (check_spos(oid, header, spos))
+ return 0;
+ t->rmkeys(user_prefix(header), to_clear);
+ if (!header->parent) {
+ return db->submit_transaction(t);
+ }
+
+ // Copy up keys from parent around to_clear
+ int keep_parent;
+ {
+ DBObjectMapIterator iter = _get_iterator(header);
+ iter->seek_to_first();
+ map<string, string> new_complete;
+ map<string, bufferlist> to_write;
+ for(set<string>::const_iterator i = to_clear.begin();
+ i != to_clear.end();
+ ) {
+ unsigned copied = 0;
+ iter->lower_bound(*i);
+ ++i;
+ if (!iter->valid())
+ break;
+ string begin = iter->key();
+ if (!iter->on_parent())
+ iter->next_parent();
+ if (new_complete.size() && new_complete.rbegin()->second == begin) {
+ begin = new_complete.rbegin()->first;
+ }
+ while (iter->valid() && copied < 20) {
+ if (!to_clear.count(iter->key()))
+ to_write[iter->key()].append(iter->value());
+ if (i != to_clear.end() && *i <= iter->key()) {
+ ++i;
+ copied = 0;
+ }
+
+ iter->next_parent();
+ copied++;
+ }
+ if (iter->valid()) {
+ new_complete[begin] = iter->key();
+ } else {
+ new_complete[begin] = "";
+ break;
+ }
+ }
+ t->set(user_prefix(header), to_write);
+ merge_new_complete(header, new_complete, iter, t);
+ keep_parent = need_parent(iter);
+ if (keep_parent < 0)
+ return keep_parent;
+ }
+ if (!keep_parent) {
+ copy_up_header(header, t);
+ Header parent = lookup_parent(header);
+ if (!parent)
+ return -EINVAL;
+ parent->num_children--;
+ _clear(parent, t);
+ header->parent = 0;
+ set_map_header(hl, oid, *header, t);
+ t->rmkeys_by_prefix(complete_prefix(header));
+ }
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::clear_keys_header(const ghobject_t &oid,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+
+ // save old attrs
+ KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
+ if (!iter)
+ return -EINVAL;
+ map<string, bufferlist> attrs;
+ for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next())
+ attrs.insert(make_pair(iter->key(), iter->value()));
+ if (iter->status())
+ return iter->status();
+
+ // remove current header
+ remove_map_header(hl, oid, header, t);
+ assert(header->num_children > 0);
+ header->num_children--;
+ int r = _clear(header, t);
+ if (r < 0)
+ return r;
+
+ // create new header
+ Header newheader = generate_new_header(oid, Header());
+ set_map_header(hl, oid, *newheader, t);
+ if (!attrs.empty())
+ t->set(xattr_prefix(newheader), attrs);
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::get(const ghobject_t &oid,
+ bufferlist *_header,
+ map<string, bufferlist> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ _get_header(header, _header);
+ ObjectMapIterator iter = _get_iterator(header);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ if (iter->status())
+ return iter->status();
+ out->insert(make_pair(iter->key(), iter->value()));
+ }
+ return 0;
+}
+
+int DBObjectMap::get_keys(const ghobject_t &oid,
+ set<string> *keys)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ ObjectMapIterator iter = _get_iterator(header);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ if (iter->status())
+ return iter->status();
+ keys->insert(iter->key());
+ }
+ return 0;
+}
+
+int DBObjectMap::scan(Header header,
+ const set<string> &in_keys,
+ set<string> *out_keys,
+ map<string, bufferlist> *out_values)
+{
+ ObjectMapIterator db_iter = _get_iterator(header);
+ for (set<string>::const_iterator key_iter = in_keys.begin();
+ key_iter != in_keys.end();
+ ++key_iter) {
+ db_iter->lower_bound(*key_iter);
+ if (db_iter->status())
+ return db_iter->status();
+ if (db_iter->valid() && db_iter->key() == *key_iter) {
+ if (out_keys)
+ out_keys->insert(*key_iter);
+ if (out_values)
+ out_values->insert(make_pair(db_iter->key(), db_iter->value()));
+ }
+ }
+ return 0;
+}
+
+int DBObjectMap::get_values(const ghobject_t &oid,
+ const set<string> &keys,
+ map<string, bufferlist> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ return scan(header, keys, 0, out);
+}
+
+int DBObjectMap::check_keys(const ghobject_t &oid,
+ const set<string> &keys,
+ set<string> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ return scan(header, keys, out, 0);
+}
+
+int DBObjectMap::get_xattrs(const ghobject_t &oid,
+ const set<string> &to_get,
+ map<string, bufferlist> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ return db->get(xattr_prefix(header), to_get, out);
+}
+
+int DBObjectMap::get_all_xattrs(const ghobject_t &oid,
+ set<string> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
+ if (!iter)
+ return -EINVAL;
+ for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next())
+ out->insert(iter->key());
+ return iter->status();
+}
+
+int DBObjectMap::set_xattrs(const ghobject_t &oid,
+ const map<string, bufferlist> &to_set,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_create_map_header(hl, oid, t);
+ if (!header)
+ return -EINVAL;
+ if (check_spos(oid, header, spos))
+ return 0;
+ t->set(xattr_prefix(header), to_set);
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::remove_xattrs(const ghobject_t &oid,
+ const set<string> &to_remove,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+ t->rmkeys(xattr_prefix(header), to_remove);
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::clone(const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos)
+{
+ if (oid == target)
+ return 0;
+
+ MapHeaderLock _l1(this, MIN_GHOBJ(oid, target, true));
+ MapHeaderLock _l2(this, MAX_GHOBJ(oid, target, true));
+ MapHeaderLock *lsource, *ltarget;
+ if (cmp_bitwise(oid, target) > 0) {
+ lsource = &_l2;
+ ltarget= &_l1;
+ } else {
+ lsource = &_l1;
+ ltarget= &_l2;
+ }
+
+ KeyValueDB::Transaction t = db->get_transaction();
+ {
+ Header destination = lookup_map_header(*ltarget, target);
+ if (destination) {
+ remove_map_header(*ltarget, target, destination, t);
+ if (check_spos(target, destination, spos))
+ return 0;
+ destination->num_children--;
+ _clear(destination, t);
+ }
+ }
+
+ Header parent = lookup_map_header(*lsource, oid);
+ if (!parent)
+ return db->submit_transaction(t);
+
+ Header source = generate_new_header(oid, parent);
+ Header destination = generate_new_header(target, parent);
+ if (spos)
+ destination->spos = *spos;
+
+ parent->num_children = 2;
+ set_header(parent, t);
+ set_map_header(*lsource, oid, *source, t);
+ set_map_header(*ltarget, target, *destination, t);
+
+ map<string, bufferlist> to_set;
+ KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent));
+ for (xattr_iter->seek_to_first();
+ xattr_iter->valid();
+ xattr_iter->next())
+ to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+ t->set(xattr_prefix(source), to_set);
+ t->set(xattr_prefix(destination), to_set);
+ t->rmkeys_by_prefix(xattr_prefix(parent));
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::upgrade_to_v2()
+{
+ dout(1) << __func__ << " start" << dendl;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ iter->seek_to_first();
+ while (iter->valid()) {
+ unsigned count = 0;
+ KeyValueDB::Transaction t = db->get_transaction();
+ set<string> remove;
+ map<string, bufferlist> add;
+ for (;
+ iter->valid() && count < 300;
+ iter->next()) {
+ dout(20) << __func__ << " key is " << iter->key() << dendl;
+ int r = is_buggy_ghobject_key_v1(iter->key());
+ if (r < 0) {
+ derr << __func__ << " bad key '" << iter->key() << "'" << dendl;
+ return r;
+ }
+ if (!r) {
+ dout(20) << __func__ << " " << iter->key() << " ok" << dendl;
+ continue;
+ }
+
+ // decode header to get oid
+ _Header hdr;
+ bufferlist bl = iter->value();
+ bufferlist::iterator bliter = bl.begin();
+ hdr.decode(bliter);
+
+ string newkey(ghobject_key(hdr.oid));
+ dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl;
+ add[newkey] = iter->value();
+ remove.insert(iter->key());
+ ++count;
+ }
+
+ if (!remove.empty()) {
+ dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl;
+ t->rmkeys(HOBJECT_TO_SEQ, remove);
+ t->set(HOBJECT_TO_SEQ, add);
+ int r = db->submit_transaction(t);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ state.v = 2;
+
+ Mutex::Locker l(header_lock);
+ KeyValueDB::Transaction t = db->get_transaction();
+ write_state(t);
+ db->submit_transaction_sync(t);
+ dout(1) << __func__ << " done" << dendl;
+ return 0;
+}
+
+int DBObjectMap::init(bool do_upgrade)
+{
+ map<string, bufferlist> result;
+ set<string> to_get;
+ to_get.insert(GLOBAL_STATE_KEY);
+ int r = db->get(SYS_PREFIX, to_get, &result);
+ if (r < 0)
+ return r;
+ if (!result.empty()) {
+ bufferlist::iterator bliter = result.begin()->second.begin();
+ state.decode(bliter);
+ if (state.v < 1) {
+ dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
+ << dendl;
+ return -ENOTSUP;
+ }
+ if (state.v < 2) { // Needs upgrade
+ if (!do_upgrade) {
+ dout(1) << "DBObjectMap requires an upgrade,"
+ << " set filestore_update_to"
+ << dendl;
+ return -ENOTSUP;
+ } else {
+ r = upgrade_to_v2();
+ if (r < 0)
+ return r;
+ }
+ }
+ } else {
+ // New store
+ state.v = 2;
+ state.seq = 1;
+ }
+ dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl;
+ return 0;
+}
+
+int DBObjectMap::sync(const ghobject_t *oid,
+ const SequencerPosition *spos) {
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (oid) {
+ assert(spos);
+ MapHeaderLock hl(this, *oid);
+ Header header = lookup_map_header(hl, *oid);
+ if (header) {
+ dout(10) << "oid: " << *oid << " setting spos to "
+ << *spos << dendl;
+ header->spos = *spos;
+ set_map_header(hl, *oid, *header, t);
+ }
+ /* It may appear that this and the identical portion of the else
+ * block below could be combined, but in this block, the transaction
+ * must be submitted under *both* the MapHeaderLock and the full
+ * header_lock.
+ *
+ * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891.
+ */
+ Mutex::Locker l(header_lock);
+ write_state(t);
+ return db->submit_transaction_sync(t);
+ } else {
+ Mutex::Locker l(header_lock);
+ write_state(t);
+ return db->submit_transaction_sync(t);
+ }
+}
+
+int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
+ assert(header_lock.is_locked_by_me());
+ dout(20) << "dbobjectmap: seq is " << state.seq << dendl;
+ KeyValueDB::Transaction t = _t ? _t : db->get_transaction();
+ bufferlist bl;
+ state.encode(bl);
+ map<string, bufferlist> to_write;
+ to_write[GLOBAL_STATE_KEY] = bl;
+ t->set(SYS_PREFIX, to_write);
+ return _t ? 0 : db->submit_transaction(t);
+}
+
+
+DBObjectMap::Header DBObjectMap::_lookup_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid)
+{
+ assert(l.get_locked() == oid);
+
+ _Header *header = new _Header();
+ {
+ Mutex::Locker l(cache_lock);
+ if (caches.lookup(oid, header)) {
+ assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+ return Header(header, RemoveOnDelete(this));
+ }
+ }
+
+ bufferlist out;
+ int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out);
+ if (r < 0 || out.length()==0) {
+ delete header;
+ return Header();
+ }
+
+ Header ret(header, RemoveOnDelete(this));
+ bufferlist::iterator iter = out.begin();
+
+ ret->decode(iter);
+ {
+ Mutex::Locker l(cache_lock);
+ caches.add(oid, *ret);
+ }
+
+ assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+ return ret;
+}
+
+DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
+ Header parent)
+{
+ Header header = Header(new _Header(), RemoveOnDelete(this));
+ header->seq = state.seq++;
+ if (parent) {
+ header->parent = parent->seq;
+ header->spos = parent->spos;
+ }
+ header->num_children = 1;
+ header->oid = oid;
+ assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+
+ write_state();
+ return header;
+}
+
+DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
+{
+ Mutex::Locker l(header_lock);
+ while (in_use.count(input->parent))
+ header_cond.Wait(header_lock);
+ map<string, bufferlist> out;
+ set<string> keys;
+ keys.insert(HEADER_KEY);
+
+ dout(20) << "lookup_parent: parent " << input->parent
+ << " for seq " << input->seq << dendl;
+ int r = db->get(sys_parent_prefix(input), keys, &out);
+ if (r < 0) {
+ assert(0);
+ return Header();
+ }
+ if (out.empty()) {
+ assert(0);
+ return Header();
+ }
+
+ Header header = Header(new _Header(), RemoveOnDelete(this));
+ header->seq = input->parent;
+ bufferlist::iterator iter = out.begin()->second.begin();
+ header->decode(iter);
+ dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
+ << header->parent << dendl;
+ in_use.insert(header->seq);
+ return header;
+}
+
+DBObjectMap::Header DBObjectMap::lookup_create_map_header(
+ const MapHeaderLock &hl,
+ const ghobject_t &oid,
+ KeyValueDB::Transaction t)
+{
+ Mutex::Locker l(header_lock);
+ Header header = _lookup_map_header(hl, oid);
+ if (!header) {
+ header = _generate_new_header(oid, Header());
+ set_map_header(hl, oid, *header, t);
+ }
+ return header;
+}
+
+void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t)
+{
+ dout(20) << "clear_header: clearing seq " << header->seq << dendl;
+ t->rmkeys_by_prefix(user_prefix(header));
+ t->rmkeys_by_prefix(sys_prefix(header));
+ t->rmkeys_by_prefix(complete_prefix(header));
+ t->rmkeys_by_prefix(xattr_prefix(header));
+ set<string> keys;
+ keys.insert(header_key(header->seq));
+ t->rmkeys(USER_PREFIX, keys);
+}
+
+void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t)
+{
+ dout(20) << "set_header: setting seq " << header->seq << dendl;
+ map<string, bufferlist> to_write;
+ header->encode(to_write[HEADER_KEY]);
+ t->set(sys_prefix(header), to_write);
+}
+
+void DBObjectMap::remove_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ Header header,
+ KeyValueDB::Transaction t)
+{
+ assert(l.get_locked() == oid);
+ dout(20) << "remove_map_header: removing " << header->seq
+ << " oid " << oid << dendl;
+ set<string> to_remove;
+ to_remove.insert(map_header_key(oid));
+ t->rmkeys(HOBJECT_TO_SEQ, to_remove);
+ {
+ Mutex::Locker l(cache_lock);
+ caches.clear(oid);
+ }
+}
+
+void DBObjectMap::set_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid, _Header header,
+ KeyValueDB::Transaction t)
+{
+ assert(l.get_locked() == oid);
+ dout(20) << "set_map_header: setting " << header.seq
+ << " oid " << oid << " parent seq "
+ << header.parent << dendl;
+ map<string, bufferlist> to_set;
+ header.encode(to_set[map_header_key(oid)]);
+ t->set(HOBJECT_TO_SEQ, to_set);
+ {
+ Mutex::Locker l(cache_lock);
+ caches.add(oid, header);
+ }
+}
+
+bool DBObjectMap::check_spos(const ghobject_t &oid,
+ Header header,
+ const SequencerPosition *spos)
+{
+ if (!spos || *spos > header->spos) {
+ stringstream out;
+ if (spos)
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
+ << *spos << dendl;
+ else
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
+ << "empty" << dendl;
+ dout(10) << " > header.spos " << header->spos << dendl;
+ return false;
+ } else {
+ dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
+ << " <= header.spos " << header->spos << dendl;
+ return true;
+ }
+}
+
+int DBObjectMap::list_objects(vector<ghobject_t> *out)
+{
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ bufferlist bl = iter->value();
+ bufferlist::iterator bliter = bl.begin();
+ _Header header;
+ header.decode(bliter);
+ out->push_back(header.oid);
+ }
+ return 0;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef DBOBJECTMAP_DB_H
+#define DBOBJECTMAP_DB_H
+
+#include "include/buffer_fwd.h"
+#include <set>
+#include <map>
+#include <string>
+
+#include <vector>
+#include "include/memory.h"
+#include <boost/scoped_ptr.hpp>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "osd/osd_types.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/simple_cache.hpp"
+#include <boost/optional/optional_io.hpp>
+
+#include "SequencerPosition.h"
+
+/**
+ * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
+ *
+ * Prefix space structure:
+ *
+ * @see complete_prefix
+ * @see user_prefix
+ * @see sys_prefix
+ *
+ * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
+ * corresponding omap header
+ * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
+ * @see State
+ * @see write_state
+ * @see init
+ * @see generate_new_header
+ * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
+ * : key->value for header->seq
+ * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
+ * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
+ * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
+ * : USER_HEADER_KEY - omap header for header->seq
+ * : HEADER_KEY - encoding of header for header->seq
+ *
+ * For each node (represented by a header), we
+ * store three mappings: the key mapping, the complete mapping, and the parent.
+ * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in
+ * this mapping indicates that the key mapping contains all entries on [x,y).
+ * Note, max string is represented by "", so ""->"" indicates that the parent
+ * is unnecessary (@see rm_keys). When looking up a key not contained in
+ * the complete set, we have to check the parent if we don't find it in the
+ * key set. During rm_keys, we copy keys from the parent and update the
+ * complete set to reflect the change @see rm_keys.
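+ *
+ * Illustrative sketch (hypothetical seq numbers and keys, not drawn from a
+ * real store): suppose header seq 7 has parent seq 3 and seq 7's
+ * COMPLETE_PREFIX space holds the single entry "a"->"m". A lookup of "k"
+ * under seq 7 first checks USER_PREFIX + header_key(7) + USER_PREFIX;
+ * because "a" <= "k" < "m", seq 7 is complete for "k" and the parent
+ * (seq 3) is not consulted even on a miss. A lookup of "z" that misses
+ * under seq 7 falls through to seq 3. An entry ""->"" would make the
+ * parent unnecessary for the entire key space.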
+ */
+class DBObjectMap : public ObjectMap {
+public:
+ boost::scoped_ptr<KeyValueDB> db;
+
+ /**
+ * Serializes access to next_seq as well as the in_use set
+ */
+ Mutex header_lock;
+ Cond header_cond;
+ Cond map_header_cond;
+
+ /**
+ * Set of headers currently in use
+ */
+ set<uint64_t> in_use;
+ set<ghobject_t, ghobject_t::BitwiseComparator> map_header_in_use;
+
+ /**
+ * Takes the map_header_in_use entry in constructor, releases in
+ * destructor
+ */
+ class MapHeaderLock {
+ DBObjectMap *db;
+ boost::optional<ghobject_t> locked;
+
+ MapHeaderLock(const MapHeaderLock &);
+ MapHeaderLock &operator=(const MapHeaderLock &);
+ public:
+ MapHeaderLock(DBObjectMap *db) : db(db) {}
+ MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
+ Mutex::Locker l(db->header_lock);
+ while (db->map_header_in_use.count(*locked))
+ db->map_header_cond.Wait(db->header_lock);
+ db->map_header_in_use.insert(*locked);
+ }
+
+ const ghobject_t &get_locked() const {
+ assert(locked);
+ return *locked;
+ }
+
+ void swap(MapHeaderLock &o) {
+ assert(db == o.db);
+
+ // centos6's boost optional doesn't seem to have swap :(
+ boost::optional<ghobject_t> _locked = o.locked;
+ o.locked = locked;
+ locked = _locked;
+ }
+
+ ~MapHeaderLock() {
+ if (locked) {
+ Mutex::Locker l(db->header_lock);
+ assert(db->map_header_in_use.count(*locked));
+ db->map_header_cond.Signal();
+ db->map_header_in_use.erase(*locked);
+ }
+ }
+ };
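+
+ // Typical usage, as in e.g. DBObjectMap::set_keys (DBObjectMap.cc): take
+ // the per-object lock, then resolve or create the header under it:
+ //
+ //   MapHeaderLock hl(this, oid);
+ //   Header header = lookup_create_map_header(hl, oid, t);
+ //
+ // The map_header_in_use entry is released when hl (or the MapHeaderLock it
+ // was swapped into, e.g. an iterator's hlock) goes out of scope.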
+
+ DBObjectMap(KeyValueDB *db) : db(db), header_lock("DBObjectMap"),
+ cache_lock("DBObjectMap::CacheLock"),
+ caches(g_conf->filestore_omap_header_cache_size)
+ {}
+
+ int set_keys(
+ const ghobject_t &oid,
+ const map<string, bufferlist> &set,
+ const SequencerPosition *spos=0
+ );
+
+ int set_header(
+ const ghobject_t &oid,
+ const bufferlist &bl,
+ const SequencerPosition *spos=0
+ );
+
+ int get_header(
+ const ghobject_t &oid,
+ bufferlist *bl
+ );
+
+ int clear(
+ const ghobject_t &oid,
+ const SequencerPosition *spos=0
+ );
+
+ int clear_keys_header(
+ const ghobject_t &oid,
+ const SequencerPosition *spos=0
+ );
+
+ int rm_keys(
+ const ghobject_t &oid,
+ const set<string> &to_clear,
+ const SequencerPosition *spos=0
+ );
+
+ int get(
+ const ghobject_t &oid,
+ bufferlist *header,
+ map<string, bufferlist> *out
+ );
+
+ int get_keys(
+ const ghobject_t &oid,
+ set<string> *keys
+ );
+
+ int get_values(
+ const ghobject_t &oid,
+ const set<string> &keys,
+ map<string, bufferlist> *out
+ );
+
+ int check_keys(
+ const ghobject_t &oid,
+ const set<string> &keys,
+ set<string> *out
+ );
+
+ int get_xattrs(
+ const ghobject_t &oid,
+ const set<string> &to_get,
+ map<string, bufferlist> *out
+ );
+
+ int get_all_xattrs(
+ const ghobject_t &oid,
+ set<string> *out
+ );
+
+ int set_xattrs(
+ const ghobject_t &oid,
+ const map<string, bufferlist> &to_set,
+ const SequencerPosition *spos=0
+ );
+
+ int remove_xattrs(
+ const ghobject_t &oid,
+ const set<string> &to_remove,
+ const SequencerPosition *spos=0
+ );
+
+ int clone(
+ const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos=0
+ );
+
+ /// Read initial state from backing store
+ int init(bool upgrade = false);
+
+ /// Upgrade store to current version
+ int upgrade_to_v2();
+
+ /// Consistency check, debug, there must be no parallel writes
+ bool check(std::ostream &out);
+
+ /// Ensure that all previous operations are durable
+ int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0);
+
+ /// Util, list all objects, there must be no other concurrent access
+ int list_objects(vector<ghobject_t> *objs ///< [out] objects
+ );
+
+ ObjectMapIterator get_iterator(const ghobject_t &oid);
+
+ static const string USER_PREFIX;
+ static const string XATTR_PREFIX;
+ static const string SYS_PREFIX;
+ static const string COMPLETE_PREFIX;
+ static const string HEADER_KEY;
+ static const string USER_HEADER_KEY;
+ static const string GLOBAL_STATE_KEY;
+ static const string HOBJECT_TO_SEQ;
+
+ /// Legacy
+ static const string LEAF_PREFIX;
+ static const string REVERSE_LEAF_PREFIX;
+
+ /// persistent state for store @see generate_new_header
+ struct State {
+ __u8 v;
+ uint64_t seq;
+ State() : v(0), seq(1) {}
+ State(uint64_t seq) : v(0), seq(seq) {}
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(2, 1, bl);
+ ::encode(v, bl);
+ ::encode(seq, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator &bl) {
+ DECODE_START(2, bl);
+ if (struct_v >= 2)
+ ::decode(v, bl);
+ else
+ v = 0;
+ ::decode(seq, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_unsigned("seq", seq);
+ }
+
+ static void generate_test_instances(list<State*> &o) {
+ o.push_back(new State(0));
+ o.push_back(new State(20));
+ }
+ } state;
+
+ struct _Header {
+ uint64_t seq;
+ uint64_t parent;
+ uint64_t num_children;
+
+ coll_t c;
+ ghobject_t oid;
+
+ SequencerPosition spos;
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(2, 1, bl);
+ ::encode(seq, bl);
+ ::encode(parent, bl);
+ ::encode(num_children, bl);
+ ::encode(c, bl);
+ ::encode(oid, bl);
+ ::encode(spos, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator &bl) {
+ DECODE_START(2, bl);
+ ::decode(seq, bl);
+ ::decode(parent, bl);
+ ::decode(num_children, bl);
+ ::decode(c, bl);
+ ::decode(oid, bl);
+ if (struct_v >= 2)
+ ::decode(spos, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("parent", parent);
+ f->dump_unsigned("num_children", num_children);
+ f->dump_stream("coll") << c;
+ f->dump_stream("oid") << oid;
+ }
+
+ static void generate_test_instances(list<_Header*> &o) {
+ o.push_back(new _Header);
+ o.push_back(new _Header);
+ o.back()->parent = 20;
+ o.back()->seq = 30;
+ }
+
+ _Header() : seq(0), parent(0), num_children(1) {}
+ };
+
+ /// String munging (public for testing)
+ static string ghobject_key(const ghobject_t &oid);
+ static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
+ static int is_buggy_ghobject_key_v1(const string &in);
+private:
+ /// Implicit lock on Header->seq
+ typedef ceph::shared_ptr<_Header> Header;
+ Mutex cache_lock;
+ SimpleLRU<ghobject_t, _Header, ghobject_t::BitwiseComparator> caches;
+
+ string map_header_key(const ghobject_t &oid);
+ string header_key(uint64_t seq);
+ string complete_prefix(Header header);
+ string user_prefix(Header header);
+ string sys_prefix(Header header);
+ string xattr_prefix(Header header);
+ string sys_parent_prefix(_Header header);
+ string sys_parent_prefix(Header header) {
+ return sys_parent_prefix(*header);
+ }
+
+ class EmptyIteratorImpl : public ObjectMapIteratorImpl {
+ public:
+ int seek_to_first() { return 0; }
+ int seek_to_last() { return 0; }
+ int upper_bound(const string &after) { return 0; }
+ int lower_bound(const string &to) { return 0; }
+ bool valid() { return false; }
+ int next(bool validate=true) { assert(0); return 0; }
+ string key() { assert(0); return ""; }
+ bufferlist value() { assert(0); return bufferlist(); }
+ int status() { return 0; }
+ };
+
+
+ /// Iterator
+ class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
+ public:
+ DBObjectMap *map;
+
+ /// NOTE: implicitly holds the MapHeaderLock on hlock->get_locked() while returned out of the class
+ MapHeaderLock hlock;
+ /// NOTE: implicit lock on header->seq AND for all ancestors
+ Header header;
+
+ /// parent_iter == NULL iff no parent
+ ceph::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
+ KeyValueDB::Iterator key_iter;
+ KeyValueDB::Iterator complete_iter;
+
+ /// cur_iter points to currently valid iterator
+ ceph::shared_ptr<ObjectMapIteratorImpl> cur_iter;
+ int r;
+
+ /// init() called, key_iter, complete_iter, parent_iter filled in
+ bool ready;
+ /// past end
+ bool invalid;
+
+ DBObjectMapIteratorImpl(DBObjectMap *map, Header header) :
+ map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}
+ int seek_to_first();
+ int seek_to_last();
+ int upper_bound(const string &after);
+ int lower_bound(const string &to);
+ bool valid();
+ int next(bool validate=true);
+ string key();
+ bufferlist value();
+ int status();
+
+ bool on_parent() {
+ return cur_iter == parent_iter;
+ }
+
+ /// skips to next valid parent entry
+ int next_parent();
+
+ /// Tests whether to_test is in complete region
+ int in_complete_region(const string &to_test, ///< [in] key to test
+ string *begin, ///< [out] beginning of region
+ string *end ///< [out] end of region
+ ); ///< @returns true if to_test is in the complete region, else false
+
+ private:
+ int init();
+ bool valid_parent();
+ int adjust();
+ };
+
+ typedef ceph::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator;
+ DBObjectMapIterator _get_iterator(Header header) {
+ return DBObjectMapIterator(new DBObjectMapIteratorImpl(this, header));
+ }
+
+ /// sys
+
+ /// Removes node corresponding to header
+ void clear_header(Header header, KeyValueDB::Transaction t);
+
+ /// Set node containing input to new contents
+ void set_header(Header input, KeyValueDB::Transaction t);
+
+ /// Remove leaf node corresponding to oid in c
+ void remove_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ Header header,
+ KeyValueDB::Transaction t);
+
+ /// Set leaf node for c and oid to the value of header
+ void set_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid, _Header header,
+ KeyValueDB::Transaction t);
+
+ /// Returns true if the op at spos has already been applied to header (and should be skipped)
+ bool check_spos(const ghobject_t &oid,
+ Header header,
+ const SequencerPosition *spos);
+
+ /// Lookup or create header for c oid
+ Header lookup_create_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ KeyValueDB::Transaction t);
+
+ /**
+ * Generate new header for c oid with new seq number
+ *
+ * Has the side effect of synchronously saving the new DBObjectMap state
+ */
+ Header _generate_new_header(const ghobject_t &oid, Header parent);
+ Header generate_new_header(const ghobject_t &oid, Header parent) {
+ Mutex::Locker l(header_lock);
+ return _generate_new_header(oid, parent);
+ }
+
+ /// Lookup leaf header for c oid
+ Header _lookup_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid);
+ Header lookup_map_header(
+ const MapHeaderLock &l2,
+ const ghobject_t &oid) {
+ Mutex::Locker l(header_lock);
+ return _lookup_map_header(l2, oid);
+ }
+
+ /// Lookup header node for input
+ Header lookup_parent(Header input);
+
+
+ /// Helpers
+ int _get_header(Header header, bufferlist *bl);
+
+ /// Scan keys in header into out_keys and out_values (if nonnull)
+ int scan(Header header,
+ const set<string> &in_keys,
+ set<string> *out_keys,
+ map<string, bufferlist> *out_values);
+
+ /// Remove header and all related prefixes
+ int _clear(Header header,
+ KeyValueDB::Transaction t);
+ /// Adds to t operations necessary to add new_complete to the complete set
+ int merge_new_complete(Header header,
+ const map<string, string> &new_complete,
+ DBObjectMapIterator iter,
+ KeyValueDB::Transaction t);
+
+ /// Writes out State (mainly next_seq)
+ int write_state(KeyValueDB::Transaction _t =
+ KeyValueDB::Transaction());
+
+ /// 0 if the complete set now contains all of the key space, < 0 on error, 1 otherwise
+ int need_parent(DBObjectMapIterator iter);
+
+ /// Copies header entry from parent @see rm_keys
+ int copy_up_header(Header header,
+ KeyValueDB::Transaction t);
+
+ /// Sets header @see set_header
+ void _set_header(Header header, const bufferlist &bl,
+ KeyValueDB::Transaction t);
+
+ /**
+ * Removes header seq lock and possibly object lock
+ * once Header is out of scope
+ * @see lookup_parent
+ * @see generate_new_header
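+ *
+ * For example, lookup_parent() returns Header(new _Header(), RemoveOnDelete(this));
+ * the header's seq stays in in_use until the last copy of that shared_ptr is
+ * destroyed, at which point this functor erases it and signals header_cond.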
+ */
+ class RemoveOnDelete {
+ public:
+ DBObjectMap *db;
+ RemoveOnDelete(DBObjectMap *db) :
+ db(db) {}
+ void operator() (_Header *header) {
+ Mutex::Locker l(db->header_lock);
+ assert(db->in_use.count(header->seq));
+ db->in_use.erase(header->seq);
+ db->header_cond.Signal();
+ delete header;
+ }
+ };
+ friend class RemoveOnDelete;
+};
+WRITE_CLASS_ENCODER(DBObjectMap::_Header)
+WRITE_CLASS_ENCODER(DBObjectMap::State)
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_FDCACHE_H
+#define CEPH_FDCACHE_H
+
+#include <memory>
+#include <errno.h>
+#include <cstdio>
+#include "common/hobject.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/shared_cache.hpp"
+#include "include/compat.h"
+#include "include/intarith.h"
+
+/**
+ * FD Cache
+ */
+class FDCache : public md_config_obs_t {
+public:
+ /**
+ * FD
+ *
+ * Wrapper for an fd. Destructor closes the fd.
+ */
+ class FD {
+ public:
+ const int fd;
+ FD(int _fd) : fd(_fd) {
+ assert(_fd >= 0);
+ }
+ int operator*() const {
+ return fd;
+ }
+ ~FD() {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ };
+
+private:
+ CephContext *cct;
+ const int registry_shards;
+ SharedLRU<ghobject_t, FD, ghobject_t::BitwiseComparator> *registry;
+
+public:
+ FDCache(CephContext *cct) : cct(cct),
+ registry_shards(cct->_conf->filestore_fd_cache_shards) {
+ assert(cct);
+ cct->_conf->add_observer(this);
+ registry = new SharedLRU<ghobject_t, FD, ghobject_t::BitwiseComparator>[registry_shards];
+ for (int i = 0; i < registry_shards; ++i) {
+ registry[i].set_cct(cct);
+ registry[i].set_size(
+ MAX((cct->_conf->filestore_fd_cache_size / registry_shards), 1));
+ }
+ }
+ ~FDCache() {
+ cct->_conf->remove_observer(this);
+ delete[] registry;
+ }
+ typedef ceph::shared_ptr<FD> FDRef;
+
+ FDRef lookup(const ghobject_t &hoid) {
+ int registry_id = hoid.hobj.get_hash() % registry_shards;
+ return registry[registry_id].lookup(hoid);
+ }
+
+ FDRef add(const ghobject_t &hoid, int fd, bool *existed) {
+ int registry_id = hoid.hobj.get_hash() % registry_shards;
+ return registry[registry_id].add(hoid, new FD(fd), existed);
+ }
+
+ /// clear cached fd for hoid, subsequent lookups will get an empty FD
+ void clear(const ghobject_t &hoid) {
+ int registry_id = hoid.hobj.get_hash() % registry_shards;
+ registry[registry_id].purge(hoid);
+ }
+
+ /// md_config_obs_t
+ const char** get_tracked_conf_keys() const {
+ static const char* KEYS[] = {
+ "filestore_fd_cache_size",
+ NULL
+ };
+ return KEYS;
+ }
+ void handle_conf_change(const md_config_t *conf,
+ const std::set<std::string> &changed) {
+ if (changed.count("filestore_fd_cache_size")) {
+ for (int i = 0; i < registry_shards; ++i)
+ registry[i].set_size(
+ MAX((conf->filestore_fd_cache_size / registry_shards), 1));
+ }
+ }
+
+};
+typedef FDCache::FDRef FDRef;
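+
+// Usage sketch, condensed from the FileStore::lfn_open pattern (path and
+// error handling here are illustrative, not taken from this diff): look up a
+// cached descriptor first and only open/insert on a miss, so repeated access
+// to the same object reuses one fd:
+//
+//   FDRef fd = fdcache.lookup(oid);
+//   if (!fd) {
+//     int raw = ::open(path.c_str(), O_RDWR);
+//     bool existed;
+//     fd = fdcache.add(oid, raw, &existed);
+//     if (existed)                             // lost a race with another opener;
+//       VOID_TEMP_FAILURE_RETRY(::close(raw)); // keep the cached fd instead
+//   }
+//   ... ::pread(**fd, buf, len, off) ...
+//
+// The wrapped fd is closed once the entry falls out of the cache and the last
+// FDRef referencing it is dropped.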
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "acconfig.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "FileJournal.h"
+#include "include/color.h"
+#include "common/perf_counters.h"
+#include "FileStore.h"
+
+#include "include/compat.h"
+
+#include <fcntl.h>
+#include <limits.h>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+
+#include "common/blkdev.h"
+#include "common/linux_version.h"
+
+#if defined(__FreeBSD__)
+#define O_DSYNC O_SYNC
+#endif
+
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+const static int64_t ONE_MEG(1 << 20);
+const static int CEPH_MINIMUM_BLOCK_SIZE(4096);
+
+int FileJournal::_open(bool forwrite, bool create)
+{
+ int flags, ret;
+
+ if (forwrite) {
+ flags = O_RDWR;
+ if (directio)
+ flags |= O_DIRECT | O_DSYNC;
+ } else {
+ flags = O_RDONLY;
+ }
+ if (create)
+ flags |= O_CREAT;
+
+ if (fd >= 0) {
+ if (TEMP_FAILURE_RETRY(::close(fd))) {
+ int err = errno;
+ derr << "FileJournal::_open: error closing old fd: "
+ << cpp_strerror(err) << dendl;
+ }
+ }
+ fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags, 0644));
+ if (fd < 0) {
+ int err = errno;
+ dout(2) << "FileJournal::_open unable to open journal "
+ << fn << ": " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+
+ struct stat st;
+ ret = ::fstat(fd, &st);
+ if (ret) {
+ ret = errno;
+ derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
+ ret = -ret;
+ goto out_fd;
+ }
+
+ if (S_ISBLK(st.st_mode)) {
+ ret = _open_block_device();
+ } else {
+ if (aio && !force_aio) {
+ derr << "FileJournal::_open: disabling aio for non-block journal. Use "
+ << "journal_force_aio to force use of aio anyway" << dendl;
+ aio = false;
+ }
+ ret = _open_file(st.st_size, st.st_blksize, create);
+ }
+
+ if (ret)
+ goto out_fd;
+
+#ifdef HAVE_LIBAIO
+ if (aio) {
+ aio_ctx = 0;
+ ret = io_setup(128, &aio_ctx);
+ if (ret < 0) {
+ ret = errno;
+ derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(ret) << dendl;
+ ret = -ret;
+ goto out_fd;
+ }
+ }
+#endif
+
+ /* We really want max_size to be a multiple of block_size. */
+ max_size -= max_size % block_size;
+
+ dout(1) << "_open " << fn << " fd " << fd
+ << ": " << max_size
+ << " bytes, block size " << block_size
+ << " bytes, directio = " << directio
+ << ", aio = " << aio
+ << dendl;
+ return 0;
+
+ out_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return ret;
+}
+
+int FileJournal::_open_block_device()
+{
+ int64_t bdev_sz = 0;
+ int ret = get_block_device_size(fd, &bdev_sz);
+ if (ret) {
+ dout(0) << __func__ << ": failed to read block device size." << dendl;
+ return -EIO;
+ }
+
+ /* Check for bdev_sz too small */
+ if (bdev_sz < ONE_MEG) {
+ dout(0) << __func__ << ": your block device must be at least "
+ << ONE_MEG << " bytes to be used for a Ceph journal." << dendl;
+ return -EINVAL;
+ }
+
+ dout(10) << __func__ << ": ignoring osd journal size. "
+ << "We'll use the entire block device (size: " << bdev_sz << ")"
+ << dendl;
+ max_size = bdev_sz;
+
+ block_size = CEPH_MINIMUM_BLOCK_SIZE;
+
+ if (g_conf->journal_discard) {
+ discard = block_device_support_discard(fn.c_str());
+ dout(10) << fn << " support discard: " << (int)discard << dendl;
+ }
+ _check_disk_write_cache();
+ return 0;
+}
+
+void FileJournal::_check_disk_write_cache() const
+{
+ ostringstream hdparm_cmd;
+ FILE *fp = NULL;
+
+ if (geteuid() != 0) {
+ dout(10) << "_check_disk_write_cache: not root, NOT checking disk write "
+ << "cache on raw block device " << fn << dendl;
+ goto done;
+ }
+
+ hdparm_cmd << "/sbin/hdparm -W " << fn;
+ fp = popen(hdparm_cmd.str().c_str(), "r");
+ if (!fp) {
+ dout(10) << "_check_disk_write_cache: failed to run /sbin/hdparm: NOT "
+ << "checking disk write cache on raw block device " << fn << dendl;
+ goto done;
+ }
+
+ while (true) {
+ char buf[256];
+ memset(buf, 0, sizeof(buf));
+ char *line = fgets(buf, sizeof(buf) - 1, fp);
+ if (!line) {
+ if (ferror(fp)) {
+ int ret = -errno;
+ derr << "_check_disk_write_cache: fgets error: " << cpp_strerror(ret)
+ << dendl;
+ goto close_f;
+ }
+ else {
+ // EOF.
+ break;
+ }
+ }
+
+ int on;
+ if (sscanf(line, " write-caching = %d", &on) != 1)
+ continue;
+ if (!on) {
+ dout(10) << "_check_disk_write_cache: disk write cache is off (good) on "
+ << fn << dendl;
+ break;
+ }
+
+ // is our kernel new enough?
+ int ver = get_linux_version();
+ if (ver == 0) {
+ dout(10) << "_check_disk_write_cache: get_linux_version failed" << dendl;
+ } else if (ver >= KERNEL_VERSION(2, 6, 33)) {
+ dout(20) << "_check_disk_write_cache: disk write cache is on, but your "
+ << "kernel is new enough to handle it correctly. (fn:"
+ << fn << ")" << dendl;
+ break;
+ }
+ derr << TEXT_RED
+ << " ** WARNING: disk write cache is ON on " << fn << ".\n"
+ << " Journaling will not be reliable on kernels prior to 2.6.33\n"
+ << " (recent kernels are safe). You can disable the write cache with\n"
+ << " 'hdparm -W 0 " << fn << "'"
+ << TEXT_NORMAL
+ << dendl;
+ break;
+ }
+
+close_f:
+ if (pclose(fp)) {
+ int ret = -errno;
+ derr << "_check_disk_write_cache: pclose failed: " << cpp_strerror(ret)
+ << dendl;
+ }
+done:
+ ;
+}
+
+int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
+ bool create)
+{
+ int ret;
+ int64_t conf_journal_sz(g_conf->osd_journal_size);
+ conf_journal_sz <<= 20;
+
+ if ((g_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) {
+ derr << "I'm sorry, I don't know how large of a journal to create. "
+ << "Please specify a block device to use as the journal OR "
+ << "set osd_journal_size in your ceph.conf" << dendl;
+ return -EINVAL;
+ }
+
+ if (create && (oldsize < conf_journal_sz)) {
+ uint64_t newsize(g_conf->osd_journal_size);
+ newsize <<= 20;
+ dout(10) << "_open extending to " << newsize << " bytes" << dendl;
+ ret = ::ftruncate(fd, newsize);
+ if (ret < 0) {
+ int err = errno;
+ derr << "FileJournal::_open_file : unable to extend journal to "
+ << newsize << " bytes: " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+#ifdef HAVE_POSIX_FALLOCATE
+ ret = ::posix_fallocate(fd, 0, newsize);
+ if (ret) {
+ derr << "FileJournal::_open_file : unable to preallocate journal to "
+ << newsize << " bytes: " << cpp_strerror(ret) << dendl;
+ return -ret;
+ }
+ max_size = newsize;
+#elif defined(__APPLE__)
+ fstore_t store;
+ store.fst_flags = F_ALLOCATECONTIG;
+ store.fst_posmode = F_PEOFPOSMODE;
+ store.fst_offset = 0;
+ store.fst_length = newsize;
+
+ ret = ::fcntl(fd, F_PREALLOCATE, &store);
+ if (ret == -1) {
+ ret = -errno;
+ derr << "FileJournal::_open_file : unable to preallocate journal to "
+ << newsize << " bytes: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ max_size = newsize;
+#else
+# error "Journal pre-allocation not supported on platform."
+#endif
+ }
+ else {
+ max_size = oldsize;
+ }
+ block_size = MAX(blksize, (blksize_t)CEPH_MINIMUM_BLOCK_SIZE);
+
+ if (create && g_conf->journal_zero_on_create) {
+ derr << "FileJournal::_open_file : zeroing journal" << dendl;
+ uint64_t write_size = 1 << 20;
+ char *buf;
+ ret = ::posix_memalign((void **)&buf, block_size, write_size);
+ if (ret != 0) {
+ return -ret;
+ }
+ memset(static_cast<void*>(buf), 0, write_size);
+ uint64_t i = 0;
+ for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {
+ ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
+ if (ret < 0) {
+ free(buf);
+ return -errno;
+ }
+ }
+ if (i < (uint64_t)max_size) {
+ ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
+ if (ret < 0) {
+ free(buf);
+ return -errno;
+ }
+ }
+ free(buf);
+ }
+
+
+ dout(10) << "_open journal is not a block device, NOT checking disk "
+ << "write cache on '" << fn << "'" << dendl;
+
+ return 0;
+}
+
+// This can not be used on an active journal
+int FileJournal::check()
+{
+ int ret;
+
+ assert(fd == -1);
+ ret = _open(false, false);
+ if (ret)
+ return ret;
+
+ ret = read_header(&header);
+ if (ret < 0)
+ goto done;
+
+ if (header.fsid != fsid) {
+ derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+ << ", invalid (someone else's?) journal" << dendl;
+ ret = -EINVAL;
+ goto done;
+ }
+
+ dout(1) << "check: header looks ok" << dendl;
+ ret = 0;
+
+ done:
+ close();
+ return ret;
+}
+
+
+int FileJournal::create()
+{
+ void *buf = 0;
+ int64_t needed_space;
+ int ret;
+ buffer::ptr bp;
+ dout(2) << "create " << fn << " fsid " << fsid << dendl;
+
+ ret = _open(true, true);
+ if (ret)
+ goto done;
+
+ // write empty header
+ header = header_t();
+ header.flags = header_t::FLAG_CRC; // enable crcs on any new journal.
+ header.fsid = fsid;
+ header.max_size = max_size;
+ header.block_size = block_size;
+ if (g_conf->journal_block_align || directio)
+ header.alignment = block_size;
+ else
+ header.alignment = 16; // at least stay word aligned on 64bit machines...
+
+ header.start = get_top();
+ header.start_seq = 0;
+
+ print_header(header);
+
+ // static zeroed buffer for alignment padding
+ delete [] zero_buf;
+ zero_buf = new char[header.alignment];
+ memset(zero_buf, 0, header.alignment);
+
+ bp = prepare_header();
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create : create write header error "
+ << cpp_strerror(ret) << dendl;
+ goto close_fd;
+ }
+
+ // zero first little bit, too.
+ ret = posix_memalign(&buf, block_size, block_size);
+ if (ret) {
+ ret = -ret;
+ derr << "FileJournal::create: failed to allocate " << block_size
+ << " bytes of memory: " << cpp_strerror(ret) << dendl;
+ goto close_fd;
+ }
+ memset(buf, 0, block_size);
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create: error zeroing first " << block_size
+ << " bytes " << cpp_strerror(ret) << dendl;
+ goto free_buf;
+ }
+
+ needed_space = ((int64_t)g_conf->osd_max_write_size) << 20;
+ needed_space += (2 * sizeof(entry_header_t)) + get_top();
+ if (header.max_size - header.start < needed_space) {
+ derr << "FileJournal::create: OSD journal is not large enough to hold "
+ << "osd_max_write_size bytes!" << dendl;
+ ret = -ENOSPC;
+ goto free_buf;
+ }
+
+ dout(2) << "create done" << dendl;
+ ret = 0;
+
+free_buf:
+ free(buf);
+ buf = 0;
+close_fd:
+ if (TEMP_FAILURE_RETRY(::close(fd)) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret)
+ << dendl;
+ }
+done:
+ fd = -1;
+ return ret;
+}
+
+// This can not be used on an active journal
+int FileJournal::peek_fsid(uuid_d& fsid)
+{
+ assert(fd == -1);
+ int r = _open(false, false);
+ if (r)
+ return r;
+ r = read_header(&header);
+ if (r < 0)
+ goto out;
+ fsid = header.fsid;
+out:
+ close();
+ return r;
+}
+
+int FileJournal::open(uint64_t fs_op_seq)
+{
+ dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl;
+
+ uint64_t next_seq = fs_op_seq + 1;
+
+ int err = _open(false);
+ if (err)
+ return err;
+
+ // assume writeable, unless...
+ read_pos = 0;
+ write_pos = get_top();
+
+ // read header?
+ err = read_header(&header);
+ if (err < 0)
+ return err;
+
+ // static zeroed buffer for alignment padding
+ delete [] zero_buf;
+ zero_buf = new char[header.alignment];
+ memset(zero_buf, 0, header.alignment);
+
+ dout(10) << "open header.fsid = " << header.fsid
+ //<< " vs expected fsid = " << fsid
+ << dendl;
+ if (header.fsid != fsid) {
+ derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+ << ", invalid (someone else's?) journal" << dendl;
+ return -EINVAL;
+ }
+ if (header.max_size > max_size) {
+ dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl;
+ return -EINVAL;
+ }
+ if (header.block_size != block_size) {
+ dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl;
+ return -EINVAL;
+ }
+ if (header.max_size % header.block_size) {
+ dout(2) << "open journal max size " << header.max_size
+ << " not a multiple of block size " << header.block_size << dendl;
+ return -EINVAL;
+ }
+ if (header.alignment != block_size && directio) {
+ dout(0) << "open journal alignment " << header.alignment << " does not match block size "
+ << block_size << " (required for direct_io journal mode)" << dendl;
+ return -EINVAL;
+ }
+ if ((header.alignment % CEPH_MINIMUM_BLOCK_SIZE) && directio) {
+ dout(0) << "open journal alignment " << header.alignment << " is not multiple of minimum block size "
+ << CEPH_MINIMUM_BLOCK_SIZE << " (required for direct_io journal mode)" << dendl;
+ return -EINVAL;
+ }
+
+ // looks like a valid header.
+ write_pos = 0; // not writeable yet
+
+ journaled_seq = header.committed_up_to;
+
+ // find next entry
+ read_pos = header.start;
+ uint64_t seq = header.start_seq;
+
+ // last_committed_seq is 1 before the start of the journal or
+ // 0 if the start is 0
+ last_committed_seq = seq > 0 ? seq - 1 : seq;
+ if (last_committed_seq < fs_op_seq) {
+ dout(2) << "open advancing committed_seq " << last_committed_seq
+ << " to fs op_seq " << fs_op_seq << dendl;
+ last_committed_seq = fs_op_seq;
+ }
+
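+ // scan forward from header.start looking for the first entry we still
+ // need to replay (fs_op_seq + 1); on success read_pos is left pointing
+ // at that entry, otherwise we stop at the end of the valid journal.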
+ while (1) {
+ bufferlist bl;
+ off64_t old_pos = read_pos;
+ if (!read_entry(bl, seq)) {
+ dout(10) << "open reached end of journal." << dendl;
+ break;
+ }
+ if (seq > next_seq) {
+ dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq
+ << ", ignoring journal contents"
+ << dendl;
+ read_pos = -1;
+ last_committed_seq = 0;
+ seq = 0;
+ return 0;
+ }
+ if (seq == next_seq) {
+ dout(10) << "open reached seq " << seq << dendl;
+ read_pos = old_pos;
+ break;
+ }
+ seq++; // next event should follow.
+ }
+
+ return 0;
+}
+
+void FileJournal::_close(int fd) const
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+void FileJournal::close()
+{
+ dout(1) << "close " << fn << dendl;
+
+ // stop writer thread
+ stop_writer();
+
+ // close
+ assert(writeq_empty());
+ assert(!must_write_header);
+ assert(fd >= 0);
+ _close(fd);
+ fd = -1;
+}
+
+
+int FileJournal::dump(ostream& out)
+{
+ return _dump(out, false);
+}
+
+int FileJournal::simple_dump(ostream& out)
+{
+ return _dump(out, true);
+}
+
+int FileJournal::_dump(ostream& out, bool simple)
+{
+ JSONFormatter f(true);
+ int ret = _fdump(f, simple);
+ f.flush(out);
+ return ret;
+}
+
+int FileJournal::_fdump(Formatter &f, bool simple)
+{
+ dout(10) << "_fdump" << dendl;
+
+ assert(fd == -1);
+ int err = _open(false, false);
+ if (err)
+ return err;
+
+ err = read_header(&header);
+ if (err < 0) {
+ close();
+ return err;
+ }
+
+ off64_t next_pos = header.start;
+
+ f.open_object_section("journal");
+
+ f.open_object_section("header");
+ f.dump_unsigned("flags", header.flags);
+ ostringstream os;
+ os << header.fsid;
+ f.dump_string("fsid", os.str());
+ f.dump_unsigned("block_size", header.block_size);
+ f.dump_unsigned("alignment", header.alignment);
+ f.dump_int("max_size", header.max_size);
+ f.dump_int("start", header.start);
+ f.dump_unsigned("committed_up_to", header.committed_up_to);
+ f.dump_unsigned("start_seq", header.start_seq);
+ f.close_section();
+
+ f.open_array_section("entries");
+ uint64_t seq = header.start_seq;
+ while (1) {
+ bufferlist bl;
+ off64_t pos = next_pos;
+
+ if (!pos) {
+ dout(2) << "_dump -- not readable" << dendl;
+ err = -EINVAL;
+ break;
+ }
+ stringstream ss;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ &ss);
+ if (result != SUCCESS) {
+ if (seq < header.committed_up_to) {
+ dout(2) << "Unable to read past sequence " << seq
+ << " but header indicates the journal has committed up through "
+ << header.committed_up_to << ", journal is corrupt" << dendl;
+ err = -EINVAL;
+ }
+ dout(25) << ss.str() << dendl;
+ dout(25) << "No further valid entries found, journal is most likely valid"
+ << dendl;
+ break;
+ }
+
+ f.open_object_section("entry");
+ f.dump_unsigned("offset", pos);
+ f.dump_unsigned("seq", seq);
+ if (simple) {
+ f.dump_unsigned("bl.length", bl.length());
+ } else {
+ f.open_array_section("transactions");
+ bufferlist::iterator p = bl.begin();
+ int trans_num = 0;
+ while (!p.end()) {
+ ObjectStore::Transaction t(p);
+ f.open_object_section("transaction");
+ f.dump_unsigned("trans_num", trans_num);
+ t.dump(&f);
+ f.close_section();
+ trans_num++;
+ }
+ f.close_section();
+ }
+ f.close_section();
+ }
+
+ f.close_section();
+ f.close_section();
+ dout(10) << "dump finish" << dendl;
+
+ close();
+ return err;
+}
+
+
+void FileJournal::start_writer()
+{
+ write_stop = false;
+ aio_stop = false;
+ write_thread.create();
+#ifdef HAVE_LIBAIO
+ if (aio)
+ write_finish_thread.create();
+#endif
+}
+
+void FileJournal::stop_writer()
+{
+ // Do nothing if writer already stopped or never started
+ if (!write_stop)
+ {
+ {
+ Mutex::Locker l(write_lock);
+ Mutex::Locker p(writeq_lock);
+ write_stop = true;
+ writeq_cond.Signal();
+ // Doesn't hurt to signal commit_cond in case thread is waiting there
+ // and caller didn't use committed_thru() first.
+ commit_cond.Signal();
+ }
+ write_thread.join();
+
+ // write journal header now so that we have less to replay on remount
+ write_header_sync();
+ }
+
+#ifdef HAVE_LIBAIO
+ // stop aio completion thread *after* writer thread has stopped
+ // and has submitted all of its io
+ if (aio && !aio_stop) {
+ aio_lock.Lock();
+ aio_stop = true;
+ aio_cond.Signal();
+ write_finish_cond.Signal();
+ aio_lock.Unlock();
+ write_finish_thread.join();
+ }
+#endif
+}
+
+
+
+void FileJournal::print_header(const header_t &header) const
+{
+ dout(10) << "header: block_size " << header.block_size
+ << " alignment " << header.alignment
+ << " max_size " << header.max_size
+ << dendl;
+ dout(10) << "header: start " << header.start << dendl;
+ dout(10) << " write_pos " << write_pos << dendl;
+}
+
+int FileJournal::read_header(header_t *hdr) const
+{
+ dout(10) << "read_header" << dendl;
+ bufferlist bl;
+
+ buffer::ptr bp = buffer::create_page_aligned(block_size);
+ char* bpdata = bp.c_str();
+ int r = ::pread(fd, bpdata, bp.length(), 0);
+
+ if (r < 0) {
+ int err = errno;
+ dout(0) << "read_header got " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+
+ // don't use bp.zero() here, because it also invalidates
+ // crc cache (which is not yet populated anyway)
+ if (bp.length() != (size_t)r) {
+ // r will always be less than or equal to bp.length()
+ bpdata += r;
+ memset(bpdata, 0, bp.length() - r);
+ }
+
+ bl.push_back(bp);
+
+ try {
+ bufferlist::iterator p = bl.begin();
+ ::decode(*hdr, p);
+ }
+ catch (buffer::error& e) {
+ derr << "read_header error decoding journal header" << dendl;
+ return -EINVAL;
+ }
+
+
+ /*
+ * Unfortunately we weren't initializing the flags field for new
+ * journals! Aie. This is safe(ish) now that we have only one
+ * flag. Probably around when we add the next flag we need to
+ * remove this or else this (eventually old) code will clobber newer
+ * code's flags.
+ */
+ if (hdr->flags > 3) {
+ derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
+ hdr->flags = 0;
+ }
+
+ print_header(*hdr);
+
+ return 0;
+}
+
+bufferptr FileJournal::prepare_header()
+{
+ bufferlist bl;
+ {
+ Mutex::Locker l(finisher_lock);
+ header.committed_up_to = journaled_seq;
+ }
+ ::encode(header, bl);
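+ // pad the encoded header out to a full get_top()-sized, page-aligned
+ // buffer so a single write covers the entire reserved header region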
+ bufferptr bp = buffer::create_page_aligned(get_top());
+ // don't use bp.zero() here, because it also invalidates
+ // crc cache (which is not yet populated anyway)
+ char* data = bp.c_str();
+ memcpy(data, bl.c_str(), bl.length());
+ data += bl.length();
+ memset(data, 0, bp.length()-bl.length());
+ return bp;
+}
+
+void FileJournal::write_header_sync()
+{
+ Mutex::Locker locker(write_lock);
+ must_write_header = true;
+ bufferlist bl;
+ do_write(bl);
+ dout(20) << __func__ << " finish" << dendl;
+}
+
+int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size)
+{
+ // already full?
+ if (full_state != FULL_NOTFULL)
+ return -ENOSPC;
+
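+ // the ring's data area is [get_top(), header.max_size); header.start is
+ // the tail (oldest live entry) and pos is the head, so the free space
+ // either wraps past the end of the ring or sits between head and tail.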
+ // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL.
+ off64_t room;
+ if (pos >= header.start)
+ room = (header.max_size - pos) + (header.start - get_top()) - 1;
+ else
+ room = header.start - pos - 1;
+ dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start
+ << " top " << get_top() << dendl;
+
+ if (do_sync_cond) {
+ if (room >= (header.max_size >> 1) &&
+ room - size < (header.max_size >> 1)) {
+ dout(10) << " passing half full mark, triggering commit" << dendl;
+ do_sync_cond->SloppySignal(); // initiate a real commit so we can trim
+ }
+ }
+
+ if (room >= size) {
+ dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl;
+ if (pos + size > header.max_size)
+ must_write_header = true;
+ return 0;
+ }
+
+ // full
+ dout(1) << "check_for_full at " << pos << " : JOURNAL FULL "
+ << pos << " >= " << room
+ << " (max_size " << header.max_size << " start " << header.start << ")"
+ << dendl;
+
+ off64_t max = header.max_size - get_top();
+ if (size > max)
+ dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl;
+
+ return -ENOSPC;
+}
+
+int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+ // gather queued writes
+ off64_t queue_pos = write_pos;
+
+ int eleft = g_conf->journal_max_write_entries;
+ unsigned bmax = g_conf->journal_max_write_bytes;
+
+ if (full_state != FULL_NOTFULL)
+ return -ENOSPC;
+
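+ // drain the writeq in batches, appending each prepared entry to bl until
+ // the queue is empty, we hit the max-entries/max-bytes limits, or the
+ // journal fills up; anything not consumed is pushed back via
+ // batch_unpop_write().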
+ while (!writeq_empty()) {
+ list<write_item> items;
+ batch_pop_write(items);
+ list<write_item>::iterator it = items.begin();
+ while (it != items.end()) {
+ int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes);
+ if (r == 0) { // prepare ok, delete it
+ items.erase(it++);
+ }
+ if (r == -ENOSPC) {
+ // the journal may be full; put the remaining items back on the writeq
+ batch_unpop_write(items);
+ if (orig_ops)
+ goto out; // commit what we have
+
+ if (logger)
+ logger->inc(l_os_j_full);
+
+ if (wait_on_full) {
+ dout(20) << "prepare_multi_write full on first entry, need to wait" << dendl;
+ } else {
+ dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl;
+
+ // throw out what we have so far
+ full_state = FULL_FULL;
+ while (!writeq_empty()) {
+ put_throttle(1, peek_write().orig_len);
+ pop_write();
+ }
+ print_header(header);
+ }
+
+ return -ENOSPC; // hrm, full on first op
+ }
+ if (eleft) {
+ if (--eleft == 0) {
+ dout(20) << "prepare_multi_write hit max events per write " << g_conf->journal_max_write_entries << dendl;
+ batch_unpop_write(items);
+ goto out;
+ }
+ }
+ if (bmax) {
+ if (bl.length() >= bmax) {
+ dout(20) << "prepare_multi_write hit max write size " << g_conf->journal_max_write_bytes << dendl;
+ batch_unpop_write(items);
+ goto out;
+ }
+ }
+ }
+ }
+
+out:
+ dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl;
+ assert((write_pos + bl.length() == queue_pos) ||
+ (write_pos + bl.length() - header.max_size + get_top() == queue_pos));
+ return 0;
+}
+
+/*
+void FileJournal::queue_write_fin(uint64_t seq, Context *fin)
+{
+ writing_seq.push_back(seq);
+ if (!waiting_for_notfull.empty()) {
+ // make sure previously unjournaled stuff waiting for UNFULL triggers
+ // _before_ newly journaled stuff does
+ dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin
+ << " until after UNFULL" << dendl;
+ C_Gather *g = new C_Gather(writeq.front().fin);
+ writing_fin.push_back(g->new_sub());
+ waiting_for_notfull.push_back(g->new_sub());
+ } else {
+ writing_fin.push_back(writeq.front().fin);
+ dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl;
+ }
+}
+*/
+
+void FileJournal::queue_completions_thru(uint64_t seq)
+{
+ assert(finisher_lock.is_locked());
+ utime_t now = ceph_clock_now(g_ceph_context);
+ list<completion_item> items;
+ batch_pop_completions(items);
+ list<completion_item>::iterator it = items.begin();
+ while (it != items.end()) {
+ completion_item& next = *it;
+ if (next.seq > seq)
+ break;
+ utime_t lat = now;
+ lat -= next.start;
+ dout(10) << "queue_completions_thru seq " << seq
+ << " queueing seq " << next.seq
+ << " " << next.finish
+ << " lat " << lat << dendl;
+ if (logger) {
+ logger->tinc(l_os_j_lat, lat);
+ }
+ if (next.finish)
+ finisher->queue(next.finish);
+ if (next.tracked_op)
+ next.tracked_op->mark_event("journaled_completion_queued");
+ items.erase(it++);
+ }
+ batch_unpop_completions(items);
+ finisher_cond.Signal();
+}
+
+
+int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+ uint64_t seq = next_write.seq;
+ bufferlist &ebl = next_write.bl;
+ off64_t size = ebl.length();
+
+ int r = check_for_full(seq, queue_pos, size);
+ if (r < 0)
+ return r; // ENOSPC or EAGAIN
+
+ uint32_t orig_len = next_write.orig_len;
+ orig_bytes += orig_len;
+ orig_ops++;
+
+ // add to write buffer
+ dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq
+ << " len " << orig_len << " -> " << size << dendl;
+
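+ // ebl was laid out by prepare_entry() with an entry_header_t at the front
+ // and an identical footer at the end; patch the final seq, magic1 (the
+ // entry's byte offset in the journal) and magic2 (fsid ^ seq ^ len) into
+ // both copies in place.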
+ unsigned seq_offset = offsetof(entry_header_t, seq);
+ unsigned magic1_offset = offsetof(entry_header_t, magic1);
+ unsigned magic2_offset = offsetof(entry_header_t, magic2);
+
+ bufferptr headerptr = ebl.buffers().front();
+ uint64_t _seq = seq;
+ uint64_t _queue_pos = queue_pos;
+ uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64());
+ headerptr.copy_in(seq_offset, sizeof(uint64_t), (char *)&_seq);
+ headerptr.copy_in(magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
+ headerptr.copy_in(magic2_offset, sizeof(uint64_t), (char *)&magic2);
+
+ bufferptr footerptr = ebl.buffers().back();
+ unsigned post_offset = footerptr.length() - sizeof(entry_header_t);
+ footerptr.copy_in(post_offset + seq_offset, sizeof(uint64_t), (char *)&_seq);
+ footerptr.copy_in(post_offset + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
+ footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2);
+
+ bl.claim_append(ebl);
+ if (next_write.tracked_op)
+ next_write.tracked_op->mark_event("write_thread_in_journal_buffer");
+
+ journalq.push_back(pair<uint64_t,off64_t>(seq, queue_pos));
+ writing_seq = seq;
+
+ queue_pos += size;
+ if (queue_pos >= header.max_size)
+ queue_pos = queue_pos + get_top() - header.max_size;
+
+ return 0;
+}
+
+void FileJournal::align_bl(off64_t pos, bufferlist& bl)
+{
+ // make sure list segments are page aligned
+ if (directio && (!bl.is_aligned(block_size) ||
+ !bl.is_n_align_sized(CEPH_MINIMUM_BLOCK_SIZE))) {
+ assert(0 == "bl should be align");
+ if ((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0 ||
+ (pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0)
+ dout(0) << "rebuild_page_aligned failed, " << bl << dendl;
+ assert((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0);
+ assert((pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0);
+ }
+}
+
+int FileJournal::write_bl(off64_t& pos, bufferlist& bl)
+{
+ int ret;
+
+ off64_t spos = ::lseek64(fd, pos, SEEK_SET);
+ if (spos < 0) {
+ ret = -errno;
+ derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = bl.write_fd(fd);
+ if (ret) {
+ derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ pos += bl.length();
+ if (pos == header.max_size)
+ pos = get_top();
+ return 0;
+}
+
+void FileJournal::do_write(bufferlist& bl)
+{
+ // nothing to do?
+ if (bl.length() == 0 && !must_write_header)
+ return;
+
+ buffer::ptr hbp;
+ if (g_conf->journal_write_header_frequency &&
+ (((++journaled_since_start) %
+ g_conf->journal_write_header_frequency) == 0)) {
+ must_write_header = true;
+ }
+
+ if (must_write_header) {
+ must_write_header = false;
+ hbp = prepare_header();
+ }
+
+ dout(15) << "do_write writing " << write_pos << "~" << bl.length()
+ << (hbp.length() ? " + header":"")
+ << dendl;
+
+ utime_t from = ceph_clock_now(g_ceph_context);
+
+ // entry
+ off64_t pos = write_pos;
+
+ // Adjust write_pos
+ align_bl(pos, bl);
+ write_pos += bl.length();
+ if (write_pos >= header.max_size)
+ write_pos = write_pos - header.max_size + get_top();
+
+ write_lock.Unlock();
+
+ // split?
+ off64_t split = 0;
+ if (pos + bl.length() > header.max_size) {
+ bufferlist first, second;
+ split = header.max_size - pos;
+ first.substr_of(bl, 0, split);
+ second.substr_of(bl, split, bl.length() - split);
+ assert(first.length() + second.length() == bl.length());
+ dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length()
+ << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl;
+
+ // Save pos so we can write the first piece second
+ off64_t first_pos = pos;
+ off64_t orig_pos;
+ pos = get_top();
+ // header too?
+ if (hbp.length()) {
+ // be sneaky: include the header in the second fragment
+ second.push_front(hbp);
+ pos = 0; // we included the header
+ }
+ // Write the second portion first, possibly including the header, so
+ // do_read_entry() won't even get a valid entry_header_t if there
+ // is a crash between the two writes.
+ orig_pos = pos;
+ if (write_bl(pos, second)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ orig_pos = first_pos;
+ if (write_bl(first_pos, first)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ assert(first_pos == get_top());
+ } else {
+ // header too?
+ if (hbp.length()) {
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) {
+ int err = errno;
+ derr << "FileJournal::do_write: pwrite(fd=" << fd
+ << ", hbp.length=" << hbp.length() << ") failed :"
+ << cpp_strerror(err) << dendl;
+ ceph_abort();
+ }
+ }
+
+ if (write_bl(pos, bl)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ }
+
+ if (!directio) {
+ dout(20) << "do_write fsync" << dendl;
+
+ /*
+ * We'd really love to have a fsync_range or fdatasync_range and do a:
+ *
+ * if (split) {
+ * ::fsync_range(fd, header.max_size - split, split);
+ * ::fsync_range(fd, get_top(), bl.length() - split);
+ * } else {
+ * ::fsync_range(fd, write_pos, bl.length());
+ * }
+ *
+ * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be
+ * too hard given that all the underlying infrastructure already exists.
+ *
+ * NOTE: using sync_file_range here would not be safe as it does not
+ * flush disk caches or commit any sort of metadata.
+ */
+ int ret = 0;
+#if defined(DARWIN) || defined(__FreeBSD__)
+ ret = ::fsync(fd);
+#else
+ ret = ::fdatasync(fd);
+#endif
+ if (ret < 0) {
+ derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+#ifdef HAVE_POSIX_FADVISE
+ if (g_conf->filestore_fadvise)
+ posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+ }
+
+ utime_t lat = ceph_clock_now(g_ceph_context) - from;
+ dout(20) << "do_write latency " << lat << dendl;
+
+ write_lock.Lock();
+
+ assert(write_pos == pos);
+ assert(write_pos % header.alignment == 0);
+
+ {
+ Mutex::Locker locker(finisher_lock);
+ journaled_seq = writing_seq;
+
+ // kick finisher?
+ // only if we haven't filled up recently!
+ if (full_state != FULL_NOTFULL) {
+ dout(10) << "do_write NOT queueing finisher seq " << journaled_seq
+ << ", full_commit_seq|full_restart_seq" << dendl;
+ } else {
+ if (plug_journal_completions) {
+ dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq
+ << " due to completion plug" << dendl;
+ } else {
+ dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+ }
+}
+
+void FileJournal::flush()
+{
+ dout(10) << "waiting for completions to empty" << dendl;
+ {
+ Mutex::Locker l(finisher_lock);
+ while (!completions_empty())
+ finisher_cond.Wait(finisher_lock);
+ }
+ dout(10) << "flush waiting for finisher" << dendl;
+ finisher->wait_for_empty();
+ dout(10) << "flush done" << dendl;
+}
+
+
+void FileJournal::write_thread_entry()
+{
+ dout(10) << "write_thread_entry start" << dendl;
+ while (1) {
+ {
+ Mutex::Locker locker(writeq_lock);
+ if (writeq.empty() && !must_write_header) {
+ if (write_stop)
+ break;
+ dout(20) << "write_thread_entry going to sleep" << dendl;
+ writeq_cond.Wait(writeq_lock);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ continue;
+ }
+ }
+
+#ifdef HAVE_LIBAIO
+ if (aio) {
+ Mutex::Locker locker(aio_lock);
+ // should we back off to limit aios in flight? try to do this
+ // adaptively so that we submit larger aios once we have lots of
+ // them in flight.
+ //
+ // NOTE: our condition here is based on aio_num (protected by
+ // aio_lock) and throttle_bytes (part of the write queue). when
+ // we sleep, we *only* wait for aio_num to change, and do not
+ // wake when more data is queued. this is not strictly correct,
+ // but should be fine given that we will have plenty of aios in
+ // flight if we hit this limit to ensure we keep the device
+ // saturated.
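+ //
+ // e.g. with 4 aios in flight exp = 8, so we wait for at least 256 bytes
+ // of queued data before submitting again; from 12 aios upward the
+ // threshold caps at 1 << 24 (16MB).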
+ while (aio_num > 0) {
+ int exp = MIN(aio_num * 2, 24);
+ long unsigned min_new = 1ull << exp;
+ long unsigned cur = throttle_bytes.get_current();
+ dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes
+ << " ... exp " << exp << " min_new " << min_new
+ << " ... pending " << cur << dendl;
+ if (cur >= min_new)
+ break;
+ dout(20) << "write_thread_entry deferring until more aios complete: "
+ << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new
+ << " bytes to start a new aio (currently " << cur << " pending)" << dendl;
+ aio_cond.Wait(aio_lock);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ }
+ }
+#endif
+
+ Mutex::Locker locker(write_lock);
+ uint64_t orig_ops = 0;
+ uint64_t orig_bytes = 0;
+
+ bufferlist bl;
+ int r = prepare_multi_write(bl, orig_ops, orig_bytes);
+ // Don't care about journal full if stopping, so drop the queue and
+ // possibly let the header get written; the loop above will notice the stop
+ if (r == -ENOSPC) {
+ if (write_stop) {
+ dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl;
+ while (!writeq_empty()) {
+ put_throttle(1, peek_write().orig_len);
+ pop_write();
+ }
+ print_header(header);
+ r = 0;
+ } else {
+ dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl;
+ commit_cond.Wait(write_lock);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ continue;
+ }
+ }
+ assert(r == 0);
+
+ if (logger) {
+ logger->inc(l_os_j_wr);
+ logger->inc(l_os_j_wr_bytes, bl.length());
+ }
+
+#ifdef HAVE_LIBAIO
+ if (aio)
+ do_aio_write(bl);
+ else
+ do_write(bl);
+#else
+ do_write(bl);
+#endif
+ put_throttle(orig_ops, orig_bytes);
+ }
+
+ dout(10) << "write_thread_entry finish" << dendl;
+}
+
+#ifdef HAVE_LIBAIO
+void FileJournal::do_aio_write(bufferlist& bl)
+{
+
+ if (g_conf->journal_write_header_frequency &&
+ (((++journaled_since_start) %
+ g_conf->journal_write_header_frequency) == 0)) {
+ must_write_header = true;
+ }
+
+ // nothing to do?
+ if (bl.length() == 0 && !must_write_header)
+ return;
+
+ buffer::ptr hbp;
+ if (must_write_header) {
+ must_write_header = false;
+ hbp = prepare_header();
+ }
+
+ // entry
+ off64_t pos = write_pos;
+
+ dout(15) << "do_aio_write writing " << pos << "~" << bl.length()
+ << (hbp.length() ? " + header":"")
+ << dendl;
+
+ // split?
+ off64_t split = 0;
+ if (pos + bl.length() > header.max_size) {
+ bufferlist first, second;
+ split = header.max_size - pos;
+ first.substr_of(bl, 0, split);
+ second.substr_of(bl, split, bl.length() - split);
+ assert(first.length() + second.length() == bl.length());
+ dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl;
+
+ if (write_aio_bl(pos, first, 0)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ assert(pos == header.max_size);
+ if (hbp.length()) {
+ // be sneaky: include the header in the second fragment
+ second.push_front(hbp);
+ pos = 0; // we included the header
+ } else
+ pos = get_top(); // no header, start after that
+ if (write_aio_bl(pos, second, writing_seq)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ } else {
+ // header too?
+ if (hbp.length()) {
+ bufferlist hbl;
+ hbl.push_back(hbp);
+ loff_t pos = 0;
+ if (write_aio_bl(pos, hbl, 0)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl;
+ ceph_abort();
+ }
+ }
+
+ if (write_aio_bl(pos, bl, writing_seq)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ }
+
+ write_pos = pos;
+ if (write_pos == header.max_size)
+ write_pos = get_top();
+ assert(write_pos % header.alignment == 0);
+}
+
+/**
+ * write a buffer using aio
+ *
+ * @param seq seq to trigger when this aio completes. if 0, do not update any state
+ * on completion.
+ */
+int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
+{
+ align_bl(pos, bl);
+
+ dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl;
+
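+ // carve the bufferlist into chunks of at most IOV_MAX-1 segments; each
+ // chunk becomes one aio, and only the final chunk carries seq so that
+ // completion state is only updated once the whole buffer is on disk.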
+ while (bl.length() > 0) {
+ int max = MIN(bl.buffers().size(), IOV_MAX-1);
+ iovec *iov = new iovec[max];
+ int n = 0;
+ unsigned len = 0;
+ for (std::list<buffer::ptr>::const_iterator p = bl.buffers().begin();
+ n < max;
+ ++p, ++n) {
+ assert(p != bl.buffers().end());
+ iov[n].iov_base = (void *)p->c_str();
+ iov[n].iov_len = p->length();
+ len += p->length();
+ }
+
+ bufferlist tbl;
+ bl.splice(0, len, &tbl); // move bytes from bl -> tbl
+
+ // take aio_lock only around aio_queue, the current aio, aio_num and
+ // aio_bytes, all of which may be modified in check_aio_completion
+ aio_lock.Lock();
+ aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq));
+ aio_info& aio = aio_queue.back();
+ aio.iov = iov;
+
+ io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos);
+
+ dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len
+ << " in " << n << dendl;
+
+ aio_num++;
+ aio_bytes += aio.len;
+
+ // save the current aio len now to update write_pos later, because the
+ // aio could be erased from aio_queue once it is done
+ uint64_t cur_len = aio.len;
+ // unlock aio_lock because following io_submit might take time to return
+ aio_lock.Unlock();
+
+ iocb *piocb = &aio.iocb;
+ int attempts = 10;
+ do {
+ int r = io_submit(aio_ctx, 1, &piocb);
+ dout(20) << "write_aio_bl io_submit return value: " << r << dendl;
+ if (r < 0) {
+ derr << "io_submit to " << aio.off << "~" << cur_len
+ << " got " << cpp_strerror(r) << dendl;
+ if (r == -EAGAIN && attempts-- > 0) {
+ usleep(500);
+ continue;
+ }
+ assert(0 == "io_submit got unexpected error");
+ } else {
+ break;
+ }
+ } while (true);
+ pos += cur_len;
+ }
+ aio_lock.Lock();
+ write_finish_cond.Signal();
+ aio_lock.Unlock();
+ return 0;
+}
+#endif
+
+void FileJournal::write_finish_thread_entry()
+{
+#ifdef HAVE_LIBAIO
+ dout(10) << "write_finish_thread_entry enter" << dendl;
+ while (true) {
+ {
+ Mutex::Locker locker(aio_lock);
+ if (aio_queue.empty()) {
+ if (aio_stop)
+ break;
+ dout(20) << "write_finish_thread_entry sleeping" << dendl;
+ write_finish_cond.Wait(aio_lock);
+ continue;
+ }
+ }
+
+ dout(20) << "write_finish_thread_entry waiting for aio(s)" << dendl;
+ io_event event[16];
+ int r = io_getevents(aio_ctx, 1, 16, event, NULL);
+ if (r < 0) {
+ if (r == -EINTR) {
+ dout(0) << "io_getevents got " << cpp_strerror(r) << dendl;
+ continue;
+ }
+ derr << "io_getevents got " << cpp_strerror(r) << dendl;
+ assert(0 == "got unexpected error from io_getevents");
+ }
+
+ {
+ Mutex::Locker locker(aio_lock);
+ for (int i=0; i<r; i++) {
+ aio_info *ai = (aio_info *)event[i].obj;
+ if (event[i].res != ai->len) {
+ derr << "aio to " << ai->off << "~" << ai->len
+ << " wrote " << event[i].res << dendl;
+ assert(0 == "unexpected aio error");
+ }
+ dout(10) << "write_finish_thread_entry aio " << ai->off
+ << "~" << ai->len << " done" << dendl;
+ ai->done = true;
+ }
+ check_aio_completion();
+ }
+ }
+ dout(10) << "write_finish_thread_entry exit" << dendl;
+#endif
+}
+
+#ifdef HAVE_LIBAIO
+/**
+ * check aio_wait for completed aio, and update state appropriately.
+ */
+void FileJournal::check_aio_completion()
+{
+ assert(aio_lock.is_locked());
+ dout(20) << "check_aio_completion" << dendl;
+
+ bool completed_something = false, signal = false;
+ uint64_t new_journaled_seq = 0;
+
+ list<aio_info>::iterator p = aio_queue.begin();
+ while (p != aio_queue.end() && p->done) {
+ dout(20) << "check_aio_completion completed seq " << p->seq << " "
+ << p->off << "~" << p->len << dendl;
+ if (p->seq) {
+ new_journaled_seq = p->seq;
+ completed_something = true;
+ }
+ aio_num--;
+ aio_bytes -= p->len;
+ aio_queue.erase(p++);
+ signal = true;
+ }
+
+ if (completed_something) {
+ // kick finisher?
+ // only if we haven't filled up recently!
+ Mutex::Locker locker(finisher_lock);
+ journaled_seq = new_journaled_seq;
+ if (full_state != FULL_NOTFULL) {
+ dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq
+ << ", full_commit_seq|full_restart_seq" << dendl;
+ } else {
+ if (plug_journal_completions) {
+ dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq
+ << " due to completion plug" << dendl;
+ } else {
+ dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+ }
+ if (signal) {
+ // maybe write queue was waiting for aio count to drop?
+ aio_cond.Signal();
+ }
+}
+#endif
+
+int FileJournal::prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl) {
+ dout(10) << "prepare_entry " << tls << dendl;
+ unsigned data_len = 0;
+ int data_align = -1; // -1 indicates that we don't care about the alignment
+ bufferlist bl;
+ for (list<ObjectStore::Transaction*>::iterator p = tls.begin();
+ p != tls.end(); ++p) {
+ ObjectStore::Transaction *t = *p;
+ if (t->get_data_length() > data_len &&
+ (int)t->get_data_length() >= g_conf->journal_align_min_size) {
+ data_len = t->get_data_length();
+ data_align = (t->get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK;
+ }
+ ::encode(*t, bl);
+ }
+ if (tbl->length()) {
+ bl.claim_append(*tbl);
+ }
+ // build this entry
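+ //
+ // On-disk entry layout, padded out to a multiple of header.alignment
+ // (pre_pad is derived from data_align to keep the dominant data blob at
+ // its preferred in-page offset):
+ //
+ //   [entry_header_t][pre_pad][payload][post_pad][entry_header_t footer]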
+ entry_header_t h;
+ unsigned head_size = sizeof(entry_header_t);
+ off64_t base_size = 2*head_size + bl.length();
+ memset(&h, 0, sizeof(h));
+ if (data_align >= 0)
+ h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK;
+ off64_t size = ROUND_UP_TO(base_size + h.pre_pad, header.alignment);
+ unsigned post_pad = size - base_size - h.pre_pad;
+ h.len = bl.length();
+ h.post_pad = post_pad;
+ h.crc32c = bl.crc32c(0);
+ dout(10) << " len " << bl.length() << " -> " << size
+ << " (head " << head_size << " pre_pad " << h.pre_pad
+ << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")"
+ << " (bl alignment " << data_align << ")"
+ << dendl;
+ bufferlist ebl;
+ // header
+ ebl.append((const char*)&h, sizeof(h));
+ if (h.pre_pad) {
+ ebl.push_back(buffer::create_static(h.pre_pad, zero_buf));
+ }
+ // payload
+ ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+ if (h.post_pad) {
+ ebl.push_back(buffer::create_static(h.post_pad, zero_buf));
+ }
+ // footer
+ ebl.append((const char*)&h, sizeof(h));
+ ebl.rebuild_aligned(CEPH_MINIMUM_BLOCK_SIZE);
+ tbl->claim(ebl);
+ return h.len;
+}
+
+void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
+ Context *oncommit, TrackedOpRef osd_op)
+{
+ // dump on queue
+ dout(5) << "submit_entry seq " << seq
+ << " len " << e.length()
+ << " (" << oncommit << ")" << dendl;
+ assert(e.length() > 0);
+
+ throttle_ops.take(1);
+ throttle_bytes.take(orig_len);
+ if (osd_op)
+ osd_op->mark_event("commit_queued_for_journal_write");
+ if (logger) {
+ logger->set(l_os_jq_max_ops, throttle_ops.get_max());
+ logger->set(l_os_jq_max_bytes, throttle_bytes.get_max());
+ logger->set(l_os_jq_ops, throttle_ops.get_current());
+ logger->set(l_os_jq_bytes, throttle_bytes.get_current());
+ }
+
+ {
+ Mutex::Locker l1(writeq_lock); // ** lock **
+ Mutex::Locker l2(completions_lock); // ** lock **
+ completions.push_back(
+ completion_item(
+ seq, oncommit, ceph_clock_now(g_ceph_context), osd_op));
+ if (writeq.empty())
+ writeq_cond.Signal();
+ writeq.push_back(write_item(seq, e, orig_len, osd_op));
+ }
+}
+
+bool FileJournal::writeq_empty()
+{
+ Mutex::Locker locker(writeq_lock);
+ return writeq.empty();
+}
+
+FileJournal::write_item &FileJournal::peek_write()
+{
+ assert(write_lock.is_locked());
+ Mutex::Locker locker(writeq_lock);
+ return writeq.front();
+}
+
+void FileJournal::pop_write()
+{
+ assert(write_lock.is_locked());
+ Mutex::Locker locker(writeq_lock);
+ writeq.pop_front();
+}
+
+void FileJournal::batch_pop_write(list<write_item> &items)
+{
+ assert(write_lock.is_locked());
+ Mutex::Locker locker(writeq_lock);
+ writeq.swap(items);
+}
+
+void FileJournal::batch_unpop_write(list<write_item> &items)
+{
+ assert(write_lock.is_locked());
+ Mutex::Locker locker(writeq_lock);
+ writeq.splice(writeq.begin(), items);
+}
+
+void FileJournal::commit_start(uint64_t seq)
+{
+ dout(10) << "commit_start" << dendl;
+
+ // was full?
+ switch (full_state) {
+ case FULL_NOTFULL:
+ break; // all good
+
+ case FULL_FULL:
+ if (seq >= journaled_seq) {
+ dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq "
+ << seq << " > journaled_seq " << journaled_seq
+ << ", moving to FULL_WAIT."
+ << dendl;
+ full_state = FULL_WAIT;
+ } else {
+ dout(1) << "FULL_FULL commit_start on seq "
+ << seq << " < journaled_seq " << journaled_seq
+ << ", remaining in FULL_FULL"
+ << dendl;
+ }
+ break;
+
+ case FULL_WAIT:
+ dout(1) << " FULL_WAIT -> FULL_NOTFULL. journal now active, setting completion plug." << dendl;
+ full_state = FULL_NOTFULL;
+ plug_journal_completions = true;
+ break;
+ }
+}
+
+/*
+ * send discard command to the journal block device
+ */
+void FileJournal::do_discard(int64_t offset, int64_t end)
+{
+ dout(10) << __func__ << " trim(" << offset << ", " << end << ")" << dendl;
+
+ offset = ROUND_UP_TO(offset, block_size);
+ if (offset >= end)
+ return;
+ end = ROUND_UP_TO(end - block_size, block_size);
+ assert(end >= offset);
+ if (offset < end)
+ if (block_device_discard(fd, offset, end - offset) < 0)
+ dout(1) << __func__ << " ioctl(BLKDISCARD) error: " << cpp_strerror(errno) << dendl;
+}
+
+void FileJournal::committed_thru(uint64_t seq)
+{
+ Mutex::Locker locker(write_lock);
+
+ if (seq < last_committed_seq) {
+ dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl;
+ assert(seq >= last_committed_seq);
+ return;
+ }
+ if (seq == last_committed_seq) {
+ dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl;
+ return;
+ }
+
+ dout(5) << "committed_thru " << seq << " (last_committed_seq " << last_committed_seq << ")" << dendl;
+ last_committed_seq = seq;
+
+ // completions!
+ {
+ Mutex::Locker locker(finisher_lock);
+ queue_completions_thru(seq);
+ if (plug_journal_completions && seq >= header.start_seq) {
+ dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
+ plug_journal_completions = false;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+
+ // adjust start pointer
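+ // drop journalq records for entries at or below seq; the new header.start
+ // and start_seq come from the oldest still-journaled entry, or from the
+ // current write_pos if nothing newer is pending.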
+ while (!journalq.empty() && journalq.front().first <= seq) {
+ journalq.pop_front();
+ }
+
+ int64_t old_start = header.start;
+ if (!journalq.empty()) {
+ header.start = journalq.front().second;
+ header.start_seq = journalq.front().first;
+ } else {
+ header.start = write_pos;
+ header.start_seq = seq + 1;
+ }
+
+ if (discard) {
+ dout(10) << __func__ << " will trim (" << old_start << ", " << header.start << ")" << dendl;
+ if (old_start < header.start)
+ do_discard(old_start, header.start - 1);
+ else {
+ do_discard(old_start, header.max_size - 1);
+ do_discard(get_top(), header.start - 1);
+ }
+ }
+
+ must_write_header = true;
+ print_header(header);
+
+ // committed but unjournaled items
+ while (!writeq_empty() && peek_write().seq <= seq) {
+ dout(15) << " dropping committed but unwritten seq " << peek_write().seq
+ << " len " << peek_write().bl.length()
+ << dendl;
+ put_throttle(1, peek_write().orig_len);
+ pop_write();
+ }
+
+ commit_cond.Signal();
+
+ dout(10) << "committed_thru done" << dendl;
+}
+
+
+void FileJournal::put_throttle(uint64_t ops, uint64_t bytes)
+{
+ uint64_t new_ops = throttle_ops.put(ops);
+ uint64_t new_bytes = throttle_bytes.put(bytes);
+ dout(5) << "put_throttle finished " << ops << " ops and "
+ << bytes << " bytes, now "
+ << new_ops << " ops and " << new_bytes << " bytes"
+ << dendl;
+
+ if (logger) {
+ logger->inc(l_os_j_ops, ops);
+ logger->inc(l_os_j_bytes, bytes);
+ logger->set(l_os_jq_ops, new_ops);
+ logger->set(l_os_jq_bytes, new_bytes);
+ logger->set(l_os_jq_max_ops, throttle_ops.get_max());
+ logger->set(l_os_jq_max_bytes, throttle_bytes.get_max());
+ }
+}
+
+int FileJournal::make_writeable()
+{
+ dout(10) << __func__ << dendl;
+ int r = _open(true);
+ if (r < 0)
+ return r;
+
+ if (read_pos > 0)
+ write_pos = read_pos;
+ else
+ write_pos = get_top();
+ read_pos = 0;
+
+ must_write_header = true;
+ start_writer();
+ return 0;
+}
+
+void FileJournal::wrap_read_bl(
+ off64_t pos,
+ int64_t olen,
+ bufferlist* bl,
+ off64_t *out_pos
+ ) const
+{
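+ // read olen bytes starting at pos, wrapping from the end of the ring
+ // (header.max_size) back to get_top() as needed; each contiguous segment
+ // is read with safe_read_exact() and appended to *bl.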
+ while (olen > 0) {
+ while (pos >= header.max_size)
+ pos = pos + get_top() - header.max_size;
+
+ int64_t len;
+ if (pos + olen > header.max_size)
+ len = header.max_size - pos; // partial
+ else
+ len = olen; // rest
+
+ int64_t actual = ::lseek64(fd, pos, SEEK_SET);
+ assert(actual == pos);
+
+ bufferptr bp = buffer::create(len);
+ int r = safe_read_exact(fd, bp.c_str(), len);
+ if (r) {
+ derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << len << " returned "
+ << r << dendl;
+ ceph_abort();
+ }
+ bl->push_back(bp);
+ pos += len;
+ olen -= len;
+ }
+ if (pos >= header.max_size)
+ pos = pos + get_top() - header.max_size;
+ if (out_pos)
+ *out_pos = pos;
+}
+
+bool FileJournal::read_entry(
+ bufferlist &bl,
+ uint64_t &next_seq,
+ bool *corrupt)
+{
+ if (corrupt)
+ *corrupt = false;
+ uint64_t seq = next_seq;
+
+ if (!read_pos) {
+ dout(2) << "read_entry -- not readable" << dendl;
+ return false;
+ }
+
+ off64_t pos = read_pos;
+ off64_t next_pos = pos;
+ stringstream ss;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ &ss);
+ if (result == SUCCESS) {
+ journalq.push_back( pair<uint64_t,off64_t>(seq, pos));
+ if (next_seq > seq) {
+ return false;
+ } else {
+ read_pos = next_pos;
+ next_seq = seq;
+ if (seq > journaled_seq)
+ journaled_seq = seq;
+ return true;
+ }
+ }
+
+ if (seq && seq < header.committed_up_to) {
+ derr << "Unable to read past sequence " << seq
+ << " but header indicates the journal has committed up through "
+ << header.committed_up_to << ", journal is corrupt" << dendl;
+ if (g_conf->journal_ignore_corruption) {
+ if (corrupt)
+ *corrupt = true;
+ return false;
+ } else {
+ assert(0);
+ }
+ }
+
+ dout(25) << ss.str() << dendl;
+ dout(2) << "No further valid entries found, journal is most likely valid"
+ << dendl;
+ return false;
+}
+
+FileJournal::read_entry_result FileJournal::do_read_entry(
+ off64_t init_pos,
+ off64_t *next_pos,
+ bufferlist *bl,
+ uint64_t *seq,
+ ostream *ss,
+ entry_header_t *_h) const
+{
+ off64_t cur_pos = init_pos;
+ bufferlist _bl;
+ if (!bl)
+ bl = &_bl;
+
+ // header
+ entry_header_t *h;
+ bufferlist hbl;
+ off64_t _next_pos;
+ wrap_read_bl(cur_pos, sizeof(*h), &hbl, &_next_pos);
+ h = reinterpret_cast<entry_header_t *>(hbl.c_str());
+
+ if (!h->check_magic(cur_pos, header.get_fsid64())) {
+ dout(25) << "read_entry " << init_pos
+ << " : bad header magic, end of journal" << dendl;
+ if (ss)
+ *ss << "bad header magic";
+ if (next_pos)
+ *next_pos = init_pos + (4<<10); // check 4k ahead
+ return MAYBE_CORRUPT;
+ }
+ cur_pos = _next_pos;
+
+ // pad + body + pad
+ if (h->pre_pad)
+ cur_pos += h->pre_pad;
+
+ bl->clear();
+ wrap_read_bl(cur_pos, h->len, bl, &cur_pos);
+
+ if (h->post_pad)
+ cur_pos += h->post_pad;
+
+ // footer
+ entry_header_t *f;
+ bufferlist fbl;
+ wrap_read_bl(cur_pos, sizeof(*f), &fbl, &cur_pos);
+ f = reinterpret_cast<entry_header_t *>(fbl.c_str());
+ if (memcmp(f, h, sizeof(*f))) {
+ if (ss)
+ *ss << "bad footer magic, partial entry";
+ if (next_pos)
+ *next_pos = cur_pos;
+ return MAYBE_CORRUPT;
+ }
+
+ if ((header.flags & header_t::FLAG_CRC) || // if explicitly enabled (new journal)
+ h->crc32c != 0) { // newer entry in old journal
+ uint32_t actual_crc = bl->crc32c(0);
+ if (actual_crc != h->crc32c) {
+ if (ss)
+ *ss << "header crc (" << h->crc32c
+ << ") doesn't match body crc (" << actual_crc << ")";
+ if (next_pos)
+ *next_pos = cur_pos;
+ return MAYBE_CORRUPT;
+ }
+ }
+
+ // yay!
+ dout(2) << "read_entry " << init_pos << " : seq " << h->seq
+ << " " << h->len << " bytes"
+ << dendl;
+
+ // ok!
+ if (seq)
+ *seq = h->seq;
+
+
+ if (next_pos)
+ *next_pos = cur_pos;
+
+ if (_h)
+ *_h = *h;
+
+ assert(cur_pos % header.alignment == 0);
+ return SUCCESS;
+}
+
+void FileJournal::throttle()
+{
+ if (throttle_ops.wait(g_conf->journal_queue_max_ops))
+ dout(2) << "throttle: waited for ops" << dendl;
+ if (throttle_bytes.wait(g_conf->journal_queue_max_bytes))
+ dout(2) << "throttle: waited for bytes" << dendl;
+}
+
+void FileJournal::get_header(
+ uint64_t wanted_seq,
+ off64_t *_pos,
+ entry_header_t *h)
+{
+ off64_t pos = header.start;
+ off64_t next_pos = pos;
+ bufferlist bl;
+ uint64_t seq = 0;
+ dout(2) << __func__ << dendl;
+ while (1) {
+ bl.clear();
+ pos = next_pos;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ 0,
+ h);
+ if (result == FAILURE || result == MAYBE_CORRUPT)
+ assert(0);
+ if (seq == wanted_seq) {
+ if (_pos)
+ *_pos = pos;
+ return;
+ }
+ }
+ assert(0); // not reachable
+}
+
+void FileJournal::corrupt(
+ int wfd,
+ off64_t corrupt_at)
+{
+ dout(2) << __func__ << dendl;
+ if (corrupt_at >= header.max_size)
+ corrupt_at = corrupt_at + get_top() - header.max_size;
+
+ int64_t actual = ::lseek64(fd, corrupt_at, SEEK_SET);
+ assert(actual == corrupt_at);
+
+ char buf[10];
+ int r = safe_read_exact(fd, buf, 1);
+ assert(r == 0);
+
+ actual = ::lseek64(wfd, corrupt_at, SEEK_SET);
+ assert(actual == corrupt_at);
+
+ buf[0]++;
+ r = safe_write(wfd, buf, 1);
+ assert(r == 0);
+}
+
+void FileJournal::corrupt_payload(
+ int wfd,
+ uint64_t seq)
+{
+ dout(2) << __func__ << dendl;
+ off64_t pos = 0;
+ entry_header_t h;
+ get_header(seq, &pos, &h);
+ off64_t corrupt_at =
+ pos + sizeof(entry_header_t) + h.pre_pad;
+ corrupt(wfd, corrupt_at);
+}
+
+
+void FileJournal::corrupt_footer_magic(
+ int wfd,
+ uint64_t seq)
+{
+ dout(2) << __func__ << dendl;
+ off64_t pos = 0;
+ entry_header_t h;
+ get_header(seq, &pos, &h);
+ off64_t corrupt_at =
+ pos + sizeof(entry_header_t) + h.pre_pad +
+ h.len + h.post_pad +
+ (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h));
+ corrupt(wfd, corrupt_at);
+}
+
+
+void FileJournal::corrupt_header_magic(
+ int wfd,
+ uint64_t seq)
+{
+ dout(2) << __func__ << dendl;
+ off64_t pos = 0;
+ entry_header_t h;
+ get_header(seq, &pos, &h);
+ off64_t corrupt_at =
+ pos +
+ (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h));
+ corrupt(wfd, corrupt_at);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILEJOURNAL_H
+#define CEPH_FILEJOURNAL_H
+
+#include <deque>
+using std::deque;
+
+#include "Journal.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+
+#ifdef HAVE_LIBAIO
+# include <libaio.h>
+#endif
+
+/**
+ * Implements journaling on top of block device or file.
+ *
+ * Lock ordering is write_lock > aio_lock > finisher_lock
+ */
+class FileJournal : public Journal {
+public:
+ /// Protected by finisher_lock
+ struct completion_item {
+ uint64_t seq;
+ Context *finish;
+ utime_t start;
+ TrackedOpRef tracked_op;
+ completion_item(uint64_t o, Context *c, utime_t s,
+ TrackedOpRef opref)
+ : seq(o), finish(c), start(s), tracked_op(opref) {}
+ completion_item() : seq(0), finish(0), start(0) {}
+ };
+ struct write_item {
+ uint64_t seq;
+ bufferlist bl;
+ uint32_t orig_len;
+ TrackedOpRef tracked_op;
+ write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) :
+ seq(s), orig_len(ol), tracked_op(opref) {
+ bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+ }
+ write_item() : seq(0), orig_len(0) {}
+ };
+
+ Mutex finisher_lock;
+ Cond finisher_cond;
+ uint64_t journaled_seq;
+ bool plug_journal_completions;
+
+ Mutex writeq_lock;
+ Cond writeq_cond;
+ list<write_item> writeq;
+ bool writeq_empty();
+ write_item &peek_write();
+ void pop_write();
+ void batch_pop_write(list<write_item> &items);
+ void batch_unpop_write(list<write_item> &items);
+
+ Mutex completions_lock;
+ list<completion_item> completions;
+ bool completions_empty() {
+ Mutex::Locker l(completions_lock);
+ return completions.empty();
+ }
+ void batch_pop_completions(list<completion_item> &items) {
+ Mutex::Locker l(completions_lock);
+ completions.swap(items);
+ }
+ void batch_unpop_completions(list<completion_item> &items) {
+ Mutex::Locker l(completions_lock);
+ completions.splice(completions.begin(), items);
+ }
+ completion_item completion_peek_front() {
+ Mutex::Locker l(completions_lock);
+ assert(!completions.empty());
+ return completions.front();
+ }
+ void completion_pop_front() {
+ Mutex::Locker l(completions_lock);
+ assert(!completions.empty());
+ completions.pop_front();
+ }
+
+ int prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl);
+
+ void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len,
+ Context *oncommit,
+ TrackedOpRef osd_op = TrackedOpRef());
+ /// End protected by finisher_lock
+
+ /*
+ * journal header
+ */
+ struct header_t {
+ enum {
+ FLAG_CRC = (1<<0),
+ // NOTE: remove kludgey weirdness in read_header() next time a flag is added.
+ };
+
+ uint64_t flags;
+ uuid_d fsid;
+ __u32 block_size;
+ __u32 alignment;
+ int64_t max_size; // max size of journal ring buffer
+ int64_t start; // offset of first entry
+ uint64_t committed_up_to; // committed up to
+
+ /**
+ * start_seq
+ *
+ * entry at header.start has sequence >= start_seq
+ *
+ * Generally, the entry at header.start will have sequence
+ * start_seq if it exists. The only exception is immediately
+ * after journal creation since the first sequence number is
+ * not known.
+ *
+ * If the first read on open fails, we can assume corruption
+ * if start_seq > committed_up_to because the entry would have
+ * a sequence >= start_seq and therefore > committed_up_to.
+ */
+ uint64_t start_seq;
+
+ header_t() :
+ flags(0), block_size(0), alignment(0), max_size(0), start(0),
+ committed_up_to(0), start_seq(0) {}
+
+ void clear() {
+ start = block_size;
+ }
+
+ uint64_t get_fsid64() const {
+ return *(uint64_t*)fsid.bytes();
+ }
+
+ void encode(bufferlist& bl) const {
+ __u32 v = 4;
+ ::encode(v, bl);
+ bufferlist em;
+ {
+ ::encode(flags, em);
+ ::encode(fsid, em);
+ ::encode(block_size, em);
+ ::encode(alignment, em);
+ ::encode(max_size, em);
+ ::encode(start, em);
+ ::encode(committed_up_to, em);
+ ::encode(start_seq, em);
+ }
+ ::encode(em, bl);
+ }
+ void decode(bufferlist::iterator& bl) {
+ __u32 v;
+ ::decode(v, bl);
+ if (v < 2) { // normally 0, but conceivably 1
+ // decode old header_t struct (pre v0.40).
+ bl.advance(4); // skip __u32 flags (it was unused by any old code)
+ flags = 0;
+ uint64_t tfsid;
+ ::decode(tfsid, bl);
+ *(uint64_t*)&fsid.bytes()[0] = tfsid;
+ *(uint64_t*)&fsid.bytes()[8] = tfsid;
+ ::decode(block_size, bl);
+ ::decode(alignment, bl);
+ ::decode(max_size, bl);
+ ::decode(start, bl);
+ committed_up_to = 0;
+ start_seq = 0;
+ return;
+ }
+ bufferlist em;
+ ::decode(em, bl);
+ bufferlist::iterator t = em.begin();
+ ::decode(flags, t);
+ ::decode(fsid, t);
+ ::decode(block_size, t);
+ ::decode(alignment, t);
+ ::decode(max_size, t);
+ ::decode(start, t);
+
+ if (v > 2)
+ ::decode(committed_up_to, t);
+ else
+ committed_up_to = 0;
+
+ if (v > 3)
+ ::decode(start_seq, t);
+ else
+ start_seq = 0;
+ }
+ } header;
+
+ struct entry_header_t {
+ uint64_t seq; // fs op seq #
+ uint32_t crc32c; // payload only. not header, pre_pad, post_pad, or footer.
+ uint32_t len;
+ uint32_t pre_pad, post_pad;
+ uint64_t magic1;
+ uint64_t magic2;
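+ // magic1 holds the entry's byte offset in the journal and magic2 mixes
+ // fsid, seq and len (see make_magic()); check_magic() uses them to reject
+ // stale or misplaced entries during replay.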
+
+ static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) {
+ return (fsid ^ seq ^ len);
+ }
+ bool check_magic(off64_t pos, uint64_t fsid) {
+ return
+ magic1 == (uint64_t)pos &&
+ magic2 == (fsid ^ seq ^ len);
+ }
+ } __attribute__((__packed__, aligned(4)));
+
+ bool journalq_empty() { return journalq.empty(); }
+
+private:
+ string fn;
+
+ char *zero_buf;
+ off64_t max_size;
+ size_t block_size;
+ bool directio, aio, force_aio;
+ bool must_write_header;
+ off64_t write_pos; // byte where the next entry to be written will go
+ off64_t read_pos; //
+ bool discard; // for block-device journals: whether discard (TRIM) is supported
+
+#ifdef HAVE_LIBAIO
+ /// state associated with an in-flight aio request
+ /// Protected by aio_lock
+ struct aio_info {
+ struct iocb iocb;
+ bufferlist bl;
+ struct iovec *iov;
+ bool done;
+ uint64_t off, len; ///< these are for debug only
+ uint64_t seq; ///< seq number to complete on aio completion, if non-zero
+
+ aio_info(bufferlist& b, uint64_t o, uint64_t s)
+ : iov(NULL), done(false), off(o), len(b.length()), seq(s) {
+ bl.claim(b);
+ memset((void*)&iocb, 0, sizeof(iocb));
+ }
+ ~aio_info() {
+ delete[] iov;
+ }
+ };
+ Mutex aio_lock;
+ Cond aio_cond;
+ Cond write_finish_cond;
+ io_context_t aio_ctx;
+ list<aio_info> aio_queue;
+ int aio_num, aio_bytes;
+ /// End protected by aio_lock
+#endif
+
+ uint64_t last_committed_seq;
+ uint64_t journaled_since_start;
+
+ /*
+ * full states cycle at the beginning of each commit epoch, when commit_start()
+ * is called.
+ * FULL - we just filled up during this epoch.
+ * WAIT - we filled up last epoch; now we have to wait until everything during
+ * that epoch commits to the fs before we can start writing over it.
+ * NOTFULL - all good, journal away.
+ */
+ enum {
+ FULL_NOTFULL = 0,
+ FULL_FULL = 1,
+ FULL_WAIT = 2,
+ } full_state;
+
+ int fd;
+
+ // in journal
+ deque<pair<uint64_t, off64_t> > journalq; // track seq offsets, so we can trim later.
+ uint64_t writing_seq;
+
+
+ // throttle
+ Throttle throttle_ops, throttle_bytes;
+
+ void put_throttle(uint64_t ops, uint64_t bytes);
+
+ // write thread
+ Mutex write_lock;
+ bool write_stop;
+ bool aio_stop;
+
+ Cond commit_cond;
+
+ int _open(bool wr, bool create=false);
+ int _open_block_device();
+ void _close(int fd) const;
+ void _check_disk_write_cache() const;
+ int _open_file(int64_t oldsize, blksize_t blksize, bool create);
+ int _dump(ostream& out, bool simple);
+ void print_header(const header_t &hdr) const;
+ int read_header(header_t *hdr) const;
+ bufferptr prepare_header();
+ void start_writer();
+ void stop_writer();
+ void write_thread_entry();
+
+ void queue_completions_thru(uint64_t seq);
+
+ int check_for_full(uint64_t seq, off64_t pos, off64_t size);
+ int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes);
+ int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos,
+ uint64_t& orig_ops, uint64_t& orig_bytes);
+ void do_write(bufferlist& bl);
+
+ void write_finish_thread_entry();
+ void check_aio_completion();
+ void do_aio_write(bufferlist& bl);
+ int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq);
+
+
+ void align_bl(off64_t pos, bufferlist& bl);
+ int write_bl(off64_t& pos, bufferlist& bl);
+
+ /// read len bytes from the journal starting at in_pos, wrapping around the ring as needed
+ void wrap_read_bl(
+ off64_t in_pos, ///< [in] start position
+ int64_t len, ///< [in] length to read
+ bufferlist* bl, ///< [out] result
+ off64_t *out_pos ///< [out] next position to read, will be wrapped
+ ) const;
+
+ void do_discard(int64_t offset, int64_t end);
+
+ class Writer : public Thread {
+ FileJournal *journal;
+ public:
+ Writer(FileJournal *fj) : journal(fj) {}
+ void *entry() {
+ journal->write_thread_entry();
+ return 0;
+ }
+ } write_thread;
+
+ class WriteFinisher : public Thread {
+ FileJournal *journal;
+ public:
+ WriteFinisher(FileJournal *fj) : journal(fj) {}
+ void *entry() {
+ journal->write_finish_thread_entry();
+ return 0;
+ }
+ } write_finish_thread;
+
+ off64_t get_top() const {
+ return ROUND_UP_TO(sizeof(header), block_size);
+ }
+
+ public:
+ FileJournal(uuid_d fsid, Finisher *fin, Cond *sync_cond, const char *f, bool dio=false, bool ai=true, bool faio=false) :
+ Journal(fsid, fin, sync_cond),
+ finisher_lock("FileJournal::finisher_lock", false, true, false, g_ceph_context),
+ journaled_seq(0),
+ plug_journal_completions(false),
+ writeq_lock("FileJournal::writeq_lock", false, true, false, g_ceph_context),
+ completions_lock(
+ "FileJournal::completions_lock", false, true, false, g_ceph_context),
+ fn(f),
+ zero_buf(NULL),
+ max_size(0), block_size(0),
+ directio(dio), aio(ai), force_aio(faio),
+ must_write_header(false),
+ write_pos(0), read_pos(0),
+ discard(false),
+#ifdef HAVE_LIBAIO
+ aio_lock("FileJournal::aio_lock"),
+ aio_ctx(0),
+ aio_num(0), aio_bytes(0),
+#endif
+ last_committed_seq(0),
+ journaled_since_start(0),
+ full_state(FULL_NOTFULL),
+ fd(-1),
+ writing_seq(0),
+ throttle_ops(g_ceph_context, "journal_ops", g_conf->journal_queue_max_ops),
+ throttle_bytes(g_ceph_context, "journal_bytes", g_conf->journal_queue_max_bytes),
+ write_lock("FileJournal::write_lock", false, true, false, g_ceph_context),
+ write_stop(true),
+ aio_stop(true),
+ write_thread(this),
+ write_finish_thread(this) {
+
+ if (aio && !directio) {
+ derr << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl;
+ aio = false;
+ }
+#ifndef HAVE_LIBAIO
+ if (aio) {
+ derr << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl;
+ aio = false;
+ }
+#endif
+ }
+ ~FileJournal() {
+ assert(fd == -1);
+ delete[] zero_buf;
+ }
+
+ int check();
+ int create();
+ int open(uint64_t fs_op_seq);
+ void close();
+ int peek_fsid(uuid_d& fsid);
+
+ int dump(ostream& out);
+ int simple_dump(ostream& out);
+ int _fdump(Formatter &f, bool simple);
+
+ void flush();
+
+ void throttle();
+
+ bool is_writeable() {
+ return read_pos == 0;
+ }
+ int make_writeable();
+
+ // writes
+ void commit_start(uint64_t seq);
+ void committed_thru(uint64_t seq);
+ bool should_commit_now() {
+ return full_state != FULL_NOTFULL && !write_stop;
+ }
+
+ void write_header_sync();
+
+ void set_wait_on_full(bool b) { wait_on_full = b; }
+
+ // reads
+
+ /// Result code for read_entry
+ enum read_entry_result {
+ SUCCESS,
+ FAILURE,
+ MAYBE_CORRUPT
+ };
+
+ /**
+ * read_entry
+ *
+ * Reads next entry starting at pos. If the entry appears
+ * clean, *bl will contain the payload, *seq will contain
+ * the sequence number, and *out_pos will reflect the next
+ * read position. If the entry is invalid *ss will contain
+ * debug text, while *seq, *out_pos, and *bl will be unchanged.
+ *
+ * If the entry suggests a corrupt log, *ss will contain debug
+ * text, *out_pos will contain the next index to check. If
+ * we find an entry in this way that returns SUCCESS, the journal
+ * is most likely corrupt.
+ */
+ read_entry_result do_read_entry(
+ off64_t pos, ///< [in] position to read
+ off64_t *next_pos, ///< [out] next position to read
+ bufferlist* bl, ///< [out] payload for successful read
+ uint64_t *seq, ///< [out] seq of successful read
+ ostream *ss, ///< [out] error output
+ entry_header_t *h = 0 ///< [out] header
+ ) const; ///< @return result code
+
+ bool read_entry(
+ bufferlist &bl,
+ uint64_t &last_seq,
+ bool *corrupt
+ );
+
+ bool read_entry(
+ bufferlist &bl,
+ uint64_t &last_seq) {
+ return read_entry(bl, last_seq, 0);
+ }
+
+ // Debug/Testing
+ void get_header(
+ uint64_t wanted_seq,
+ off64_t *_pos,
+ entry_header_t *h);
+ void corrupt(
+ int wfd,
+ off64_t corrupt_at);
+ void corrupt_payload(
+ int wfd,
+ uint64_t seq);
+ void corrupt_footer_magic(
+ int wfd,
+ uint64_t seq);
+ void corrupt_header_magic(
+ int wfd,
+ uint64_t seq);
+};
+
+WRITE_CLASS_ENCODER(FileJournal::header_t)
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "include/int_types.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include <iostream>
+#include <map>
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+
+#include "common/xattr.h"
+#include "chain_xattr.h"
+
+#if defined(DARWIN) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif // DARWIN
+
+
+#include <fstream>
+#include <sstream>
+
+#include "FileStore.h"
+#include "GenericFileStoreBackend.h"
+#include "BtrfsFileStoreBackend.h"
+#include "XfsFileStoreBackend.h"
+#include "ZFSFileStoreBackend.h"
+#include "common/BackTrace.h"
+#include "include/types.h"
+#include "FileJournal.h"
+
+#include "osd/osd_types.h"
+#include "include/color.h"
+#include "include/buffer.h"
+
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/run_cmd.h"
+#include "common/safe_io.h"
+#include "common/perf_counters.h"
+#include "common/sync_filesystem.h"
+#include "common/fd.h"
+#include "HashIndex.h"
+#include "DBObjectMap.h"
+#include "kv/KeyValueDB.h"
+
+#include "common/ceph_crypto.h"
+using ceph::crypto::SHA1;
+
+#include "include/assert.h"
+
+#include "common/config.h"
+#include "common/blkdev.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/objectstore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore(" << basedir << ") "
+
+#define COMMIT_SNAP_ITEM "snap_%llu"
+#define CLUSTER_SNAP_ITEM "clustersnap_%s"
+
+#define REPLAY_GUARD_XATTR "user.cephos.seq"
+#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
+
+// XATTR_SPILL_OUT_NAME is an xattr that records whether an object's xattrs
+// spill over into DBObjectMap. If XATTR_SPILL_OUT_NAME exists in the file's
+// xattrs and its value is XATTR_NO_SPILL_OUT, no xattrs are stored in
+// DBObjectMap.
+#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out"
+#define XATTR_NO_SPILL_OUT "0"
+#define XATTR_SPILL_OUT "1"
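+
+// A minimal sketch (illustration only) of how the spill-out marker is
+// typically consulted with chain_fgetxattr(); `fd` is an assumed open object
+// file descriptor:
+//
+//   char buf[2];
+//   int r = chain_fgetxattr(fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+//   bool spill_out = !(r >= 0 &&
+//                      !strncmp(buf, XATTR_NO_SPILL_OUT,
+//                               sizeof(XATTR_NO_SPILL_OUT)));
+//   // spill_out == true means some xattrs may also live in DBObjectMap.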
+
+//Initial features in new superblock.
+static CompatSet get_fs_initial_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+
+//Features are added here that this FileStore supports.
+static CompatSet get_fs_supported_compat_set() {
+ CompatSet compat = get_fs_initial_compat_set();
+ //Any features here can be set in code, but not in initial superblock
+ compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
+int FileStore::get_block_device_fsid(const string& path, uuid_d *fsid)
+{
+ // make sure we don't try to use aio or direct_io (and get annoying
+ // error messages from failing to do so); performance implications
+ // should be irrelevant for this use
+ FileJournal j(*fsid, 0, 0, path.c_str(), false, false);
+ return j.peek_fsid(*fsid);
+}
+
+void FileStore::FSPerfTracker::update_from_perfcounters(
+ PerfCounters &logger)
+{
+ os_commit_latency.consume_next(
+ logger.get_tavg_ms(
+ l_os_j_lat));
+ os_apply_latency.consume_next(
+ logger.get_tavg_ms(
+ l_os_apply_lat));
+}
+
+
+ostream& operator<<(ostream& out, const FileStore::OpSequencer& s)
+{
+ assert(&out);
+ return out << *s.parent;
+}
+
+int FileStore::get_cdir(coll_t cid, char *s, int len)
+{
+ const string &cid_str(cid.to_str());
+ return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_str.c_str());
+}
+
+int FileStore::get_index(coll_t cid, Index *index)
+{
+ int r = index_manager.get_index(cid, basedir, index);
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+}
+
+int FileStore::init_index(coll_t cid)
+{
+ char path[PATH_MAX];
+ get_cdir(cid, path, sizeof(path));
+ int r = index_manager.init_index(cid, path, target_version);
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+}
+
+int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path)
+{
+ IndexedPath path2;
+ if (!path)
+ path = &path2;
+ int r, exist;
+ assert(NULL != index.index);
+ r = (index.index)->lookup(oid, path, &exist);
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+ return 0;
+}
+
+int FileStore::lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length)
+{
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0)
+ return r;
+ r = ::ftruncate(**fd, length);
+ if (r < 0)
+ r = -errno;
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_truncate(**fd, length);
+ assert(rc >= 0);
+ }
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+}
+
+int FileStore::lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf)
+{
+ IndexedPath path;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+
+ r = lfn_find(oid, index, &path);
+ if (r < 0)
+ return r;
+ r = ::stat(path->path(), buf);
+ if (r < 0)
+ r = -errno;
+ return r;
+}
+
+int FileStore::lfn_open(coll_t cid,
+ const ghobject_t& oid,
+ bool create,
+ FDRef *outfd,
+ Index *index)
+{
+ assert(outfd);
+ int r = 0;
+ bool need_lock = true;
+ int flags = O_RDWR;
+
+ if (create)
+ flags |= O_CREAT;
+
+ Index index2;
+ if (!index) {
+ index = &index2;
+ }
+ if (!((*index).index)) {
+ r = get_index(cid, index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ return r;
+ }
+ } else {
+ need_lock = false;
+ }
+
+ int fd, exist;
+ assert(NULL != (*index).index);
+ if (need_lock) {
+ ((*index).index)->access_lock.get_write();
+ }
+ if (!replaying) {
+ *outfd = fdcache.lookup(oid);
+ if (*outfd) {
+ if (need_lock) {
+ ((*index).index)->access_lock.put_write();
+ }
+ return 0;
+ }
+ }
+
+
+ IndexedPath path2;
+ IndexedPath *path = &path2;
+
+ r = (*index)->lookup(oid, path, &exist);
+ if (r < 0) {
+ derr << "could not find " << oid << " in index: "
+ << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+
+ r = ::open((*path)->path(), flags, 0644);
+ if (r < 0) {
+ r = -errno;
+ dout(10) << "error opening file " << (*path)->path() << " with flags="
+ << flags << ": " << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ fd = r;
+ if (create && (!exist)) {
+ r = (*index)->created(oid, (*path)->path());
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "error creating " << oid << " (" << (*path)->path()
+ << ") in index: " << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ r = chain_fsetxattr(fd, XATTR_SPILL_OUT_NAME,
+ XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT), true);
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path()
+ << "):" << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ }
+
+ if (!replaying) {
+ bool existed;
+ *outfd = fdcache.add(oid, fd, &existed);
+ if (existed) {
+ TEMP_FAILURE_RETRY(::close(fd));
+ }
+ } else {
+ *outfd = FDRef(new FDCache::FD(fd));
+ }
+
+ if (need_lock) {
+ ((*index).index)->access_lock.put_write();
+ }
+
+ return 0;
+
+ fail:
+
+ if (need_lock) {
+ ((*index).index)->access_lock.put_write();
+ }
+
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+}
+
+void FileStore::lfn_close(FDRef fd)
+{
+}
+
+int FileStore::lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid)
+{
+ Index index_new, index_old;
+ IndexedPath path_new, path_old;
+ int exist;
+ int r;
+ bool index_same = false;
+ if (c < newcid) {
+ r = get_index(newcid, &index_new);
+ if (r < 0)
+ return r;
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ } else if (c == newcid) {
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ index_new = index_old;
+ index_same = true;
+ } else {
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ r = get_index(newcid, &index_new);
+ if (r < 0)
+ return r;
+ }
+
+ assert(NULL != index_old.index);
+ assert(NULL != index_new.index);
+
+ if (!index_same) {
+
+ RWLock::RLocker l1((index_old.index)->access_lock);
+
+ r = index_old->lookup(o, &path_old, &exist);
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+
+ RWLock::WLocker l2((index_new.index)->access_lock);
+
+ r = index_new->lookup(newoid, &path_new, &exist);
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ if (exist)
+ return -EEXIST;
+
+ dout(25) << "lfn_link path_old: " << path_old << dendl;
+ dout(25) << "lfn_link path_new: " << path_new << dendl;
+ r = ::link(path_old->path(), path_new->path());
+ if (r < 0)
+ return -errno;
+
+ r = index_new->created(newoid, path_new->path());
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ } else {
+ RWLock::WLocker l1((index_old.index)->access_lock);
+
+ r = index_old->lookup(o, &path_old, &exist);
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+
+ r = index_new->lookup(newoid, &path_new, &exist);
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ if (exist)
+ return -EEXIST;
+
+ dout(25) << "lfn_link path_old: " << path_old << dendl;
+ dout(25) << "lfn_link path_new: " << path_new << dendl;
+ r = ::link(path_old->path(), path_new->path());
+ if (r < 0)
+ return -errno;
+
+ // make sure old fd for unlinked/overwritten file is gone
+ fdcache.clear(newoid);
+
+ r = index_new->created(newoid, path_new->path());
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ }
+ return 0;
+}
+
+int FileStore::lfn_unlink(coll_t cid, const ghobject_t& o,
+ const SequencerPosition &spos,
+ bool force_clear_omap)
+{
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0) {
+ dout(25) << __func__ << " get_index failed " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ assert(NULL != index.index);
+ RWLock::WLocker l((index.index)->access_lock);
+
+ {
+ IndexedPath path;
+ int hardlink;
+ r = index->lookup(o, &path, &hardlink);
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+
+ if (!force_clear_omap) {
+ if (hardlink == 0) {
+ wbthrottle.clear_object(o); // should be only non-cache ref
+ fdcache.clear(o);
+ return 0;
+ } else if (hardlink == 1) {
+ force_clear_omap = true;
+ }
+ }
+ if (force_clear_omap) {
+ dout(20) << __func__ << ": clearing omap on " << o
+ << " in cid " << cid << dendl;
+ r = object_map->clear(o, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(25) << __func__ << " omap clear failed " << cpp_strerror(r) << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ if (g_conf->filestore_debug_inject_read_err) {
+ debug_obj_on_delete(o);
+ }
+ wbthrottle.clear_object(o); // should be only non-cache ref
+ fdcache.clear(o);
+ } else {
+ /* Ensure that replay of this op doesn't result in the object_map
+ * going away.
+ */
+ if (!backend->can_checkpoint())
+ object_map->sync(&o, &spos);
+ }
+ }
+ r = index->unlink(o);
+ if (r < 0) {
+ dout(25) << __func__ << " index unlink failed " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbits_t flags, const char *name, bool do_update) :
+ JournalingObjectStore(base),
+ internal_name(name),
+ basedir(base), journalpath(jdev),
+ generic_flags(flags),
+ blk_size(0),
+ fsid_fd(-1), op_fd(-1),
+ basedir_fd(-1), current_fd(-1),
+ backend(NULL),
+ index_manager(do_update),
+ lock("FileStore::lock"),
+ force_sync(false),
+ sync_entry_timeo_lock("sync_entry_timeo_lock"),
+ timer(g_ceph_context, sync_entry_timeo_lock),
+ stop(false), sync_thread(this),
+ fdcache(g_ceph_context),
+ wbthrottle(g_ceph_context),
+ next_osr_id(0),
+ throttle_ops(g_ceph_context, "filestore_ops",g_conf->filestore_queue_max_ops),
+ throttle_bytes(g_ceph_context, "filestore_bytes",g_conf->filestore_queue_max_bytes),
+ m_ondisk_finisher_num(g_conf->filestore_ondisk_finisher_threads),
+ m_apply_finisher_num(g_conf->filestore_apply_finisher_threads),
+ op_tp(g_ceph_context, "FileStore::op_tp", g_conf->filestore_op_threads, "filestore_op_threads"),
+ op_wq(this, g_conf->filestore_op_thread_timeout,
+ g_conf->filestore_op_thread_suicide_timeout, &op_tp),
+ logger(NULL),
+ read_error_lock("FileStore::read_error_lock"),
+ m_filestore_commit_timeout(g_conf->filestore_commit_timeout),
+ m_filestore_journal_parallel(g_conf->filestore_journal_parallel ),
+ m_filestore_journal_trailing(g_conf->filestore_journal_trailing),
+ m_filestore_journal_writeahead(g_conf->filestore_journal_writeahead),
+ m_filestore_fiemap_threshold(g_conf->filestore_fiemap_threshold),
+ m_filestore_max_sync_interval(g_conf->filestore_max_sync_interval),
+ m_filestore_min_sync_interval(g_conf->filestore_min_sync_interval),
+ m_filestore_fail_eio(g_conf->filestore_fail_eio),
+ m_filestore_fadvise(g_conf->filestore_fadvise),
+ do_update(do_update),
+ m_journal_dio(g_conf->journal_dio),
+ m_journal_aio(g_conf->journal_aio),
+ m_journal_force_aio(g_conf->journal_force_aio),
+ m_osd_rollback_to_cluster_snap(g_conf->osd_rollback_to_cluster_snap),
+ m_osd_use_stale_snap(g_conf->osd_use_stale_snap),
+ m_filestore_queue_max_ops(g_conf->filestore_queue_max_ops),
+ m_filestore_queue_max_bytes(g_conf->filestore_queue_max_bytes),
+ m_filestore_queue_committing_max_ops(g_conf->filestore_queue_committing_max_ops),
+ m_filestore_queue_committing_max_bytes(g_conf->filestore_queue_committing_max_bytes),
+ m_filestore_do_dump(false),
+ m_filestore_dump_fmt(true),
+ m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc),
+ m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size),
+ m_filestore_max_alloc_hint_size(g_conf->filestore_max_alloc_hint_size),
+ m_fs_type(0),
+ m_filestore_max_inline_xattr_size(0),
+ m_filestore_max_inline_xattrs(0)
+{
+ m_filestore_kill_at.set(g_conf->filestore_kill_at);
+ for (int i = 0; i < m_ondisk_finisher_num; ++i) {
+ ostringstream oss;
+ oss << "filestore-ondisk-" << i;
+ Finisher *f = new Finisher(g_ceph_context, oss.str());
+ ondisk_finishers.push_back(f);
+ }
+ for (int i = 0; i < m_apply_finisher_num; ++i) {
+ ostringstream oss;
+ oss << "filestore-apply-" << i;
+ Finisher *f = new Finisher(g_ceph_context, oss.str());
+ apply_finishers.push_back(f);
+ }
+
+ ostringstream oss;
+ oss << basedir << "/current";
+ current_fn = oss.str();
+
+ ostringstream sss;
+ sss << basedir << "/current/commit_op_seq";
+ current_op_seq_fn = sss.str();
+
+ ostringstream omss;
+ omss << basedir << "/current/omap";
+ omap_dir = omss.str();
+
+ // initialize logger
+ PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_first, l_os_last);
+
+ plb.add_u64(l_os_jq_max_ops, "journal_queue_max_ops", "Max operations in journal queue");
+ plb.add_u64(l_os_jq_ops, "journal_queue_ops", "Operations in journal queue");
+ plb.add_u64_counter(l_os_j_ops, "journal_ops", "Total journal entries written");
+ plb.add_u64(l_os_jq_max_bytes, "journal_queue_max_bytes", "Max data in journal queue");
+ plb.add_u64(l_os_jq_bytes, "journal_queue_bytes", "Size of journal queue");
+ plb.add_u64_counter(l_os_j_bytes, "journal_bytes", "Total operations size in journal");
+ plb.add_time_avg(l_os_j_lat, "journal_latency", "Average journal queue completing latency");
+ plb.add_u64_counter(l_os_j_wr, "journal_wr", "Journal write IOs");
+ plb.add_u64_avg(l_os_j_wr_bytes, "journal_wr_bytes", "Journal data written");
+ plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue");
+ plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations in writing to FS queue");
+ plb.add_u64_counter(l_os_ops, "ops", "Operations written to store");
+ plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue");
+ plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of writing to FS queue");
+ plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store");
+ plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency");
+ plb.add_u64(l_os_committing, "committing", "Is currently committing");
+
+ plb.add_u64_counter(l_os_commit, "commitcycle", "Commit cycles");
+ plb.add_time_avg(l_os_commit_len, "commitcycle_interval", "Average interval between commits");
+ plb.add_time_avg(l_os_commit_lat, "commitcycle_latency", "Average latency of commit");
+ plb.add_u64_counter(l_os_j_full, "journal_full", "Journal writes while full");
+ plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency");
+
+ logger = plb.create_perf_counters();
+
+ g_ceph_context->get_perfcounters_collection()->add(logger);
+ g_ceph_context->_conf->add_observer(this);
+
+ superblock.compat_features = get_fs_initial_compat_set();
+}
+
+FileStore::~FileStore()
+{
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ delete *it;
+ *it = NULL;
+ }
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ delete *it;
+ *it = NULL;
+ }
+ g_ceph_context->_conf->remove_observer(this);
+ g_ceph_context->get_perfcounters_collection()->remove(logger);
+
+ if (journal)
+ journal->logger = NULL;
+ delete logger;
+
+ if (m_filestore_do_dump) {
+ dump_stop();
+ }
+}
+
+static void get_attrname(const char *name, char *buf, int len)
+{
+ snprintf(buf, len, "user.ceph.%s", name);
+}
+
+bool parse_attrname(char **name)
+{
+ if (strncmp(*name, "user.ceph.", 10) == 0) {
+ *name += 10;
+ return true;
+ }
+ return false;
+}
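+
+// A minimal usage sketch (illustration only) of the round trip between
+// get_attrname() and parse_attrname(); the attribute name "snapset" is just
+// an example:
+//
+//   char raw[100];
+//   get_attrname("snapset", raw, sizeof(raw));  // -> "user.ceph.snapset"
+//   char *p = raw;
+//   if (parse_attrname(&p))
+//     assert(strcmp(p, "snapset") == 0);        // pointer advanced past prefix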
+
+void FileStore::collect_metadata(map<string,string> *pm)
+{
+ char partition_path[PATH_MAX];
+ char dev_node[PATH_MAX];
+ int rc = 0;
+
+ (*pm)["filestore_backend"] = backend->get_name();
+ ostringstream ss;
+ ss << "0x" << std::hex << m_fs_type << std::dec;
+ (*pm)["filestore_f_type"] = ss.str();
+
+ if (g_conf->filestore_collect_device_partition_information) {
+ rc = get_device_by_uuid(get_fsid(), "PARTUUID", partition_path,
+ dev_node);
+ } else {
+ rc = -EINVAL;
+ }
+
+ switch (rc) {
+ case -EOPNOTSUPP:
+ case -EINVAL:
+ (*pm)["backend_filestore_partition_path"] = "unknown";
+ (*pm)["backend_filestore_dev_node"] = "unknown";
+ break;
+ case -ENODEV:
+ (*pm)["backend_filestore_partition_path"] = string(partition_path);
+ (*pm)["backend_filestore_dev_node"] = "unknown";
+ break;
+ default:
+ (*pm)["backend_filestore_partition_path"] = string(partition_path);
+ (*pm)["backend_filestore_dev_node"] = string(dev_node);
+ }
+}
+
+int FileStore::statfs(struct statfs *buf)
+{
+ if (::statfs(basedir.c_str(), buf) < 0) {
+ int r = -errno;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ return 0;
+}
+
+
+void FileStore::new_journal()
+{
+ if (journalpath.length()) {
+ dout(10) << "open_journal at " << journalpath << dendl;
+ journal = new FileJournal(fsid, &finisher, &sync_cond, journalpath.c_str(),
+ m_journal_dio, m_journal_aio, m_journal_force_aio);
+ if (journal)
+ journal->logger = logger;
+ }
+ return;
+}
+
+int FileStore::dump_journal(ostream& out)
+{
+ int r;
+
+ if (!journalpath.length())
+ return -EINVAL;
+
+ FileJournal *journal = new FileJournal(fsid, &finisher, &sync_cond, journalpath.c_str(), m_journal_dio);
+ r = journal->dump(out);
+ delete journal;
+ return r;
+}
+
+FileStoreBackend *FileStoreBackend::create(long f_type, FileStore *fs)
+{
+ switch (f_type) {
+#if defined(__linux__)
+ case BTRFS_SUPER_MAGIC:
+ return new BtrfsFileStoreBackend(fs);
+# ifdef HAVE_LIBXFS
+ case XFS_SUPER_MAGIC:
+ return new XfsFileStoreBackend(fs);
+# endif
+#endif
+#ifdef HAVE_LIBZFS
+ case ZFS_SUPER_MAGIC:
+ return new ZFSFileStoreBackend(fs);
+#endif
+ default:
+ return new GenericFileStoreBackend(fs);
+ }
+}
+
+void FileStore::create_backend(long f_type)
+{
+ m_fs_type = f_type;
+
+ assert(backend == NULL);
+ backend = FileStoreBackend::create(f_type, this);
+
+ dout(0) << "backend " << backend->get_name()
+ << " (magic 0x" << std::hex << f_type << std::dec << ")"
+ << dendl;
+
+ switch (f_type) {
+#if defined(__linux__)
+ case BTRFS_SUPER_MAGIC:
+ wbthrottle.set_fs(WBThrottle::BTRFS);
+ break;
+
+ case XFS_SUPER_MAGIC:
+ // wbthrottle is constructed with fs(WBThrottle::XFS)
+ break;
+#endif
+ }
+
+ set_xattr_limits_via_conf();
+}
+
+int FileStore::mkfs()
+{
+ int ret = 0;
+ char fsid_fn[PATH_MAX];
+ uuid_d old_fsid;
+
+ dout(1) << "mkfs in " << basedir << dendl;
+ basedir_fd = ::open(basedir.c_str(), O_RDONLY);
+ if (basedir_fd < 0) {
+ ret = -errno;
+ derr << "mkfs failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // open+lock fsid
+ snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str());
+ fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT, 0644);
+ if (fsid_fd < 0) {
+ ret = -errno;
+ derr << "mkfs: failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ if (lock_fsid() < 0) {
+ ret = -EBUSY;
+ goto close_fsid_fd;
+ }
+
+ if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) {
+ if (fsid.is_zero()) {
+ fsid.generate_random();
+ dout(1) << "mkfs generated fsid " << fsid << dendl;
+ } else {
+ dout(1) << "mkfs using provided fsid " << fsid << dendl;
+ }
+
+ char fsid_str[40];
+ fsid.print(fsid_str);
+ strcat(fsid_str, "\n");
+ ret = ::ftruncate(fsid_fd, 0);
+ if (ret < 0) {
+ ret = -errno;
+ derr << "mkfs: failed to truncate fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str));
+ if (ret < 0) {
+ derr << "mkfs: failed to write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ if (::fsync(fsid_fd) < 0) {
+ ret = -errno;
+ derr << "mkfs: fsync failed: can't write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ dout(10) << "mkfs fsid is " << fsid << dendl;
+ } else {
+ if (!fsid.is_zero() && fsid != old_fsid) {
+ derr << "mkfs on-disk fsid " << old_fsid << " != provided " << fsid << dendl;
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+ fsid = old_fsid;
+ dout(1) << "mkfs fsid is already set to " << fsid << dendl;
+ }
+
+ // version stamp
+ ret = write_version_stamp();
+ if (ret < 0) {
+ derr << "mkfs: write_version_stamp() failed: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ // superblock
+ superblock.omap_backend = g_conf->filestore_omap_backend;
+ ret = write_superblock();
+ if (ret < 0) {
+ derr << "mkfs: write_superblock() failed: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ struct statfs basefs;
+ ret = ::fstatfs(basedir_fd, &basefs);
+ if (ret < 0) {
+ ret = -errno;
+ derr << "mkfs cannot fstatfs basedir "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ create_backend(basefs.f_type);
+
+ ret = backend->create_current();
+ if (ret < 0) {
+ derr << "mkfs: failed to create current/ " << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ // write initial op_seq
+ {
+ uint64_t initial_seq = 0;
+ int fd = read_op_seq(&initial_seq);
+ if (fd < 0) {
+ derr << "mkfs: failed to create " << current_op_seq_fn << ": "
+ << cpp_strerror(fd) << dendl;
+ goto close_fsid_fd;
+ }
+ if (initial_seq == 0) {
+ int err = write_op_seq(fd, 1);
+ if (err < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "mkfs: failed to write to " << current_op_seq_fn << ": "
+ << cpp_strerror(err) << dendl;
+ goto close_fsid_fd;
+ }
+
+ if (backend->can_checkpoint()) {
+ // create snap_1 too
+ current_fd = ::open(current_fn.c_str(), O_RDONLY);
+ assert(current_fd >= 0);
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull);
+ ret = backend->create_checkpoint(s, NULL);
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ if (ret < 0 && ret != -EEXIST) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "mkfs: failed to create snap_1: " << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ }
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir);
+ if (ret < 0) {
+ derr << "mkfs failed to create " << g_conf->filestore_omap_backend << dendl;
+ ret = -1;
+ goto close_fsid_fd;
+ }
+ dout(1) << g_conf->filestore_omap_backend << " db exists/created" << dendl;
+
+ // journal?
+ ret = mkjournal();
+ if (ret)
+ goto close_fsid_fd;
+
+ ret = write_meta("type", "filestore");
+ if (ret)
+ goto close_fsid_fd;
+
+ dout(1) << "mkfs done in " << basedir << dendl;
+ ret = 0;
+
+ close_fsid_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ close_basedir_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+ delete backend;
+ backend = NULL;
+ return ret;
+}
+
+int FileStore::mkjournal()
+{
+ // read fsid
+ int ret;
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
+ int fd = ::open(fn, O_RDONLY, 0644);
+ if (fd < 0) {
+ int err = errno;
+ derr << "FileStore::mkjournal: open error: " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ ret = read_fsid(fd, &fsid);
+ if (ret < 0) {
+ derr << "FileStore::mkjournal: read error: " << cpp_strerror(ret) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return ret;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+ ret = 0;
+
+ new_journal();
+ if (journal) {
+ ret = journal->check();
+ if (ret < 0) {
+ ret = journal->create();
+ if (ret)
+ derr << "mkjournal error creating journal on " << journalpath
+ << ": " << cpp_strerror(ret) << dendl;
+ else
+ dout(0) << "mkjournal created journal on " << journalpath << dendl;
+ }
+ delete journal;
+ journal = 0;
+ }
+ return ret;
+}
+
+int FileStore::read_fsid(int fd, uuid_d *uuid)
+{
+ char fsid_str[40];
+ int ret = safe_read(fd, fsid_str, sizeof(fsid_str));
+ if (ret < 0)
+ return ret;
+ if (ret == 8) {
+ // old 64-bit fsid... mirror it.
+ *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str;
+ *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str;
+ return 0;
+ }
+
+ if (ret > 36)
+ fsid_str[36] = 0;
+ if (!uuid->parse(fsid_str))
+ return -EINVAL;
+ return 0;
+}
+
+int FileStore::lock_fsid()
+{
+ struct flock l;
+ memset(&l, 0, sizeof(l));
+ l.l_type = F_WRLCK;
+ l.l_whence = SEEK_SET;
+ l.l_start = 0;
+ l.l_len = 0;
+ int r = ::fcntl(fsid_fd, F_SETLK, &l);
+ if (r < 0) {
+ int err = errno;
+ dout(0) << "lock_fsid failed to lock " << basedir << "/fsid, is another ceph-osd still running? "
+ << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ return 0;
+}
+
+bool FileStore::test_mount_in_use()
+{
+ dout(5) << "test_mount basedir " << basedir << " journal " << journalpath << dendl;
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
+
+ // verify fs isn't in use
+
+ fsid_fd = ::open(fn, O_RDWR, 0644);
+ if (fsid_fd < 0)
+ return 0; // no fsid, ok.
+ bool inuse = lock_fsid() < 0;
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ return inuse;
+}
+
+int FileStore::_detect_fs()
+{
+ struct statfs st;
+ int r = ::fstatfs(basedir_fd, &st);
+ if (r < 0)
+ return -errno;
+
+ blk_size = st.f_bsize;
+
+ create_backend(st.f_type);
+
+ r = backend->detect_features();
+ if (r < 0) {
+ derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // test xattrs
+ char fn[PATH_MAX];
+ int x = rand();
+ int y = x+1;
+ snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str());
+ int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC, 0700);
+ if (tmpfd < 0) {
+ int ret = -errno;
+ derr << "_detect_fs unable to create " << fn << ": " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x));
+ if (ret >= 0)
+ ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y));
+ if ((ret < 0) || (x != y)) {
+ derr << "Extended attributes don't appear to work. ";
+ if (ret)
+ *_dout << "Got error " + cpp_strerror(ret) + ". ";
+ *_dout << "If you are using ext3 or ext4, be sure to mount the underlying "
+ << "file system with the 'user_xattr' option." << dendl;
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+ return -ENOTSUP;
+ }
+
+ char buf[1000];
+ memset(buf, 0, sizeof(buf)); // shut up valgrind
+ chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf));
+ chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf));
+ chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf));
+ chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
+ ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
+ if (ret == -ENOSPC) {
+ dout(0) << "limited size xattrs" << dendl;
+ }
+ chain_fremovexattr(tmpfd, "user.test");
+ chain_fremovexattr(tmpfd, "user.test2");
+ chain_fremovexattr(tmpfd, "user.test3");
+ chain_fremovexattr(tmpfd, "user.test4");
+ chain_fremovexattr(tmpfd, "user.test5");
+
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+
+ return 0;
+}
+
+int FileStore::_sanity_check_fs()
+{
+ // sanity check(s)
+
+ if (((int)m_filestore_journal_writeahead +
+ (int)m_filestore_journal_parallel +
+ (int)m_filestore_journal_trailing) > 1) {
+ dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl;
+ cerr << TEXT_RED
+ << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n"
+ << " is enabled in ceph.conf. You must choose a single journal mode."
+ << TEXT_NORMAL << std::endl;
+ return -EINVAL;
+ }
+
+ if (!backend->can_checkpoint()) {
+ if (!journal || !m_filestore_journal_writeahead) {
+ dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl;
+ cerr << TEXT_RED
+ << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n"
+ << " For non-btrfs volumes, a writeahead journal is required to\n"
+ << " maintain on-disk consistency in the event of a crash. Your conf\n"
+ << " should include something like:\n"
+ << " osd journal = /path/to/journal_device_or_file\n"
+ << " filestore journal writeahead = true\n"
+ << TEXT_NORMAL;
+ }
+ }
+
+ if (!journal) {
+ dout(0) << "mount WARNING: no journal" << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: No osd journal is configured: write latency may be high.\n"
+ << " If you will not be using an osd journal, write latency may be\n"
+ << " relatively high. It can be reduced somewhat by lowering\n"
+ << " filestore_max_sync_interval, but lower values mean lower write\n"
+ << " throughput, especially with spinning disks.\n"
+ << TEXT_NORMAL;
+ }
+
+ return 0;
+}
+
+int FileStore::write_superblock()
+{
+ bufferlist bl;
+ ::encode(superblock, bl);
+ return safe_write_file(basedir.c_str(), "superblock",
+ bl.c_str(), bl.length());
+}
+
+int FileStore::read_superblock()
+{
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "superblock",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ // If the file doesn't exist, write the initial CompatSet
+ return write_superblock();
+ }
+ return ret;
+ }
+
+ bufferlist bl;
+ bl.push_back(bp);
+ bufferlist::iterator i = bl.begin();
+ ::decode(superblock, i);
+ return 0;
+}
+
+int FileStore::update_version_stamp()
+{
+ return write_version_stamp();
+}
+
+int FileStore::version_stamp_is_valid(uint32_t *version)
+{
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "store_version",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ return 0;
+ return ret;
+ }
+ bufferlist bl;
+ bl.push_back(bp);
+ bufferlist::iterator i = bl.begin();
+ ::decode(*version, i);
+ dout(10) << __func__ << " was " << *version << " vs target "
+ << target_version << dendl;
+ if (*version == target_version)
+ return 1;
+ else
+ return 0;
+}
+
+int FileStore::write_version_stamp()
+{
+ dout(1) << __func__ << " " << target_version << dendl;
+ bufferlist bl;
+ ::encode(target_version, bl);
+
+ return safe_write_file(basedir.c_str(), "store_version",
+ bl.c_str(), bl.length());
+}
+
+int FileStore::upgrade()
+{
+ dout(1) << "upgrade" << dendl;
+ uint32_t version;
+ int r = version_stamp_is_valid(&version);
+ if (r < 0)
+ return r;
+ if (r == 1)
+ return 0;
+
+ if (version < 3) {
+ derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl;
+ return -EINVAL;
+ }
+
+ // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to
+ // open up DBObjectMap with the do_upgrade flag, which we already did.
+ update_version_stamp();
+ return 0;
+}
+
+int FileStore::read_op_seq(uint64_t *seq)
+{
+ int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR, 0644);
+ if (op_fd < 0) {
+ int r = -errno;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ char s[40];
+ memset(s, 0, sizeof(s));
+ int ret = safe_read(op_fd, s, sizeof(s) - 1);
+ if (ret < 0) {
+ derr << "error reading " << current_op_seq_fn << ": " << cpp_strerror(ret) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+ assert(!m_filestore_fail_eio || ret != -EIO);
+ return ret;
+ }
+ *seq = atoll(s);
+ return op_fd;
+}
+
+int FileStore::write_op_seq(int fd, uint64_t seq)
+{
+ char s[30];
+ snprintf(s, sizeof(s), "%" PRId64 "\n", seq);
+ int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0));
+ if (ret < 0) {
+ ret = -errno;
+ assert(!m_filestore_fail_eio || ret != -EIO);
+ }
+ return ret;
+}
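+
+// A minimal sketch (illustration only) of the commit_op_seq contract implied
+// by read_op_seq()/write_op_seq(): the file holds a single ASCII decimal
+// sequence number terminated by a newline, e.g. "1234\n".
+//
+//   uint64_t seq = 0;
+//   int fd = read_op_seq(&seq);    // opens (creating if needed) and parses it
+//   if (fd >= 0) {
+//     write_op_seq(fd, seq + 1);   // pwrite()s the new value at offset 0
+//     VOID_TEMP_FAILURE_RETRY(::close(fd));
+//   }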
+
+int FileStore::mount()
+{
+ int ret;
+ char buf[PATH_MAX];
+ uint64_t initial_op_seq;
+ set<string> cluster_snaps;
+ CompatSet supported_compat_set = get_fs_supported_compat_set();
+
+ dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;
+
+ // make sure global base dir exists
+ if (::access(basedir.c_str(), R_OK | W_OK)) {
+ ret = -errno;
+ derr << "FileStore::mount: unable to access basedir '" << basedir << "': "
+ << cpp_strerror(ret) << dendl;
+ goto done;
+ }
+
+ // get fsid
+ snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str());
+ fsid_fd = ::open(buf, O_RDWR, 0644);
+ if (fsid_fd < 0) {
+ ret = -errno;
+ derr << "FileStore::mount: error opening '" << buf << "': "
+ << cpp_strerror(ret) << dendl;
+ goto done;
+ }
+
+ ret = read_fsid(fsid_fd, &fsid);
+ if (ret < 0) {
+ derr << "FileStore::mount: error reading fsid_fd: " << cpp_strerror(ret)
+ << dendl;
+ goto close_fsid_fd;
+ }
+
+ if (lock_fsid() < 0) {
+ derr << "FileStore::mount: lock_fsid failed" << dendl;
+ ret = -EBUSY;
+ goto close_fsid_fd;
+ }
+
+ dout(10) << "mount fsid is " << fsid << dendl;
+
+
+ uint32_t version_stamp;
+ ret = version_stamp_is_valid(&version_stamp);
+ if (ret < 0) {
+ derr << "FileStore::mount : error in version_stamp_is_valid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ } else if (ret == 0) {
+ if (do_update || (int)version_stamp < g_conf->filestore_update_to) {
+ derr << "FileStore::mount : stale version stamp detected: "
+ << version_stamp
+ << ". Proceeding, do_update "
+ << "is set, performing disk format upgrade."
+ << dendl;
+ do_update = true;
+ } else {
+ ret = -EINVAL;
+ derr << "FileStore::mount : stale version stamp " << version_stamp
+ << ". Please run the FileStore update script before starting the "
+ << "OSD, or set filestore_update_to to " << target_version
+ << " (currently " << g_conf->filestore_update_to << ")"
+ << dendl;
+ goto close_fsid_fd;
+ }
+ }
+
+ ret = read_superblock();
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
+ // Check if this FileStore supports all the necessary features to mount
+ if (supported_compat_set.compare(superblock.compat_features) == -1) {
+ derr << "FileStore::mount : Incompatible features set "
+ << superblock.compat_features << dendl;
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
+ // open some dir handles
+ basedir_fd = ::open(basedir.c_str(), O_RDONLY);
+ if (basedir_fd < 0) {
+ ret = -errno;
+ derr << "FileStore::mount: failed to open " << basedir << ": "
+ << cpp_strerror(ret) << dendl;
+ basedir_fd = -1;
+ goto close_fsid_fd;
+ }
+
+ // test for btrfs, xattrs, etc.
+ ret = _detect_fs();
+ if (ret < 0) {
+ derr << "FileStore::mount : error in _detect_fs: "
+ << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ {
+ list<string> ls;
+ ret = backend->list_checkpoints(ls);
+ if (ret < 0) {
+ derr << "FileStore::mount : error in _list_snaps: "<< cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ long long unsigned c, prev = 0;
+ char clustersnap[NAME_MAX];
+ for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
+ if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
+ assert(c > prev);
+ prev = c;
+ snaps.push_back(c);
+ } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
+ cluster_snaps.insert(*it);
+ }
+ }
+
+ if (m_osd_rollback_to_cluster_snap.length() &&
+ cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) {
+ derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl;
+ ret = -ENOENT;
+ goto close_basedir_fd;
+ }
+
+ char nosnapfn[200];
+ snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str());
+
+ if (backend->can_checkpoint()) {
+ if (snaps.empty()) {
+ dout(0) << "mount WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
+ } else {
+ char s[NAME_MAX];
+ uint64_t curr_seq = 0;
+
+ if (m_osd_rollback_to_cluster_snap.length()) {
+ derr << TEXT_RED
+ << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **"
+ << TEXT_NORMAL
+ << dendl;
+ assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
+ snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
+ } else {
+ {
+ int fd = read_op_seq(&curr_seq);
+ if (fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ }
+ if (curr_seq)
+ dout(10) << " current/ seq was " << curr_seq << dendl;
+ else
+ dout(10) << " current/ missing entirely (unusual, but okay)" << dendl;
+
+ uint64_t cp = snaps.back();
+ dout(10) << " most recent snap from " << snaps << " is " << cp << dendl;
+
+ // if current/ is marked as non-snapshotted, refuse to roll
+ // back (without clear direction) to avoid throwing out new
+ // data.
+ struct stat st;
+ if (::stat(nosnapfn, &st) == 0) {
+ if (!m_osd_use_stale_snap) {
+ derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl;
+ derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl;
+ derr << "config option for --osd-use-stale-snap startup argument." << dendl;
+ ret = -ENOTSUP;
+ goto close_basedir_fd;
+ }
+ derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq
+ << ", newest snap is " << cp << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: forcing the use of stale snapshot data **"
+ << TEXT_NORMAL << std::endl;
+ }
+
+ dout(10) << "mount rolling back to consistent snap " << cp << dendl;
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+ }
+
+ // drop current?
+ ret = backend->rollback_to(s);
+ if (ret) {
+ derr << "FileStore::mount: error rolling back to " << s << ": "
+ << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+ }
+ }
+ initial_op_seq = 0;
+
+ current_fd = ::open(current_fn.c_str(), O_RDONLY);
+ if (current_fd < 0) {
+ ret = -errno;
+ derr << "FileStore::mount: error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ assert(current_fd >= 0);
+
+ op_fd = read_op_seq(&initial_op_seq);
+ if (op_fd < 0) {
+ derr << "FileStore::mount: read_op_seq failed" << dendl;
+ goto close_current_fd;
+ }
+
+ dout(5) << "mount op_seq is " << initial_op_seq << dendl;
+ if (initial_op_seq == 0) {
+ derr << "mount initial op seq is 0; something is wrong" << dendl;
+ ret = -EINVAL;
+ goto close_current_fd;
+ }
+
+ if (!backend->can_checkpoint()) {
+ // mark current/ as non-snapshotted so that we don't rollback away
+ // from it.
+ int r = ::creat(nosnapfn, 0644);
+ if (r < 0) {
+ ret = -errno;
+ derr << "FileStore::mount: failed to create current/nosnap: "
+ << cpp_strerror(ret) << dendl;
+ goto close_current_fd;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(r));
+ } else {
+ // clear nosnap marker, if present.
+ ::unlink(nosnapfn);
+ }
+
+ if (!(generic_flags & SKIP_MOUNT_OMAP)) {
+ KeyValueDB * omap_store = KeyValueDB::create(g_ceph_context,
+ superblock.omap_backend,
+ omap_dir);
+ if (omap_store == NULL)
+ {
+ derr << "Error creating " << superblock.omap_backend << dendl;
+ ret = -1;
+ goto close_current_fd;
+ }
+
+ if (superblock.omap_backend == "rocksdb")
+ omap_store->init(g_conf->filestore_rocksdb_options);
+ else
+ omap_store->init();
+
+ stringstream err;
+ if (omap_store->create_and_open(err)) {
+ delete omap_store;
+ derr << "Error initializing " << superblock.omap_backend
+ << " : " << err.str() << dendl;
+ ret = -1;
+ goto close_current_fd;
+ }
+
+ DBObjectMap *dbomap = new DBObjectMap(omap_store);
+ ret = dbomap->init(do_update);
+ if (ret < 0) {
+ delete dbomap;
+ derr << "Error initializing DBObjectMap: " << ret << dendl;
+ goto close_current_fd;
+ }
+ stringstream err2;
+
+ if (g_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
+ derr << err2.str() << dendl;
+ delete dbomap;
+ ret = -EINVAL;
+ goto close_current_fd;
+ }
+ object_map.reset(dbomap);
+ }
+
+ // journal
+ new_journal();
+
+ // select journal mode?
+ if (journal) {
+ if (!m_filestore_journal_writeahead &&
+ !m_filestore_journal_parallel &&
+ !m_filestore_journal_trailing) {
+ if (!backend->can_checkpoint()) {
+ m_filestore_journal_writeahead = true;
+ dout(0) << "mount: enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
+ } else {
+ m_filestore_journal_parallel = true;
+ dout(0) << "mount: enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl;
+ }
+ } else {
+ if (m_filestore_journal_writeahead)
+ dout(0) << "mount: WRITEAHEAD journal mode explicitly enabled in conf" << dendl;
+ if (m_filestore_journal_parallel)
+ dout(0) << "mount: PARALLEL journal mode explicitly enabled in conf" << dendl;
+ if (m_filestore_journal_trailing)
+ dout(0) << "mount: TRAILING journal mode explicitly enabled in conf" << dendl;
+ }
+ if (m_filestore_journal_writeahead)
+ journal->set_wait_on_full(true);
+ } else {
+ dout(0) << "mount: no journal" << dendl;
+ }
+
+ ret = _sanity_check_fs();
+ if (ret) {
+ derr << "FileStore::mount: _sanity_check_fs failed with error "
+ << ret << dendl;
+ goto close_current_fd;
+ }
+
+ // Cleanup possibly invalid collections
+ {
+ vector<coll_t> collections;
+ ret = list_collections(collections, true);
+ if (ret < 0) {
+ derr << "Error " << ret << " while listing collections" << dendl;
+ goto close_current_fd;
+ }
+ for (vector<coll_t>::iterator i = collections.begin();
+ i != collections.end();
+ ++i) {
+ Index index;
+ ret = get_index(*i, &index);
+ if (ret < 0) {
+ derr << "Unable to mount index " << *i
+ << " with error: " << ret << dendl;
+ goto close_current_fd;
+ }
+ assert(NULL != index.index);
+ RWLock::WLocker l((index.index)->access_lock);
+
+ index->cleanup();
+ }
+ }
+
+ wbthrottle.start();
+ sync_thread.create();
+
+ if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
+ ret = journal_replay(initial_op_seq);
+ if (ret < 0) {
+ derr << "mount failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl;
+ if (ret == -ENOTTY) {
+ derr << "maybe journal is not pointing to a block device and its size "
+ << "wasn't configured?" << dendl;
+ }
+
+ // stop sync thread
+ lock.Lock();
+ stop = true;
+ sync_cond.Signal();
+ lock.Unlock();
+ sync_thread.join();
+
+ wbthrottle.stop();
+
+ goto close_current_fd;
+ }
+ }
+
+ {
+ stringstream err2;
+ if (g_conf->filestore_debug_omap_check && !object_map->check(err2)) {
+ derr << err2.str() << dendl;
+ ret = -EINVAL;
+ goto close_current_fd;
+ }
+ }
+
+ init_temp_collections();
+
+ journal_start();
+
+ op_tp.start();
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ (*it)->start();
+ }
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ (*it)->start();
+ }
+
+ timer.init();
+
+ // upgrade?
+ if (g_conf->filestore_update_to >= (int)get_target_version()) {
+ int err = upgrade();
+ if (err < 0) {
+ derr << "error converting store" << dendl;
+ umount();
+ return err;
+ }
+ }
+
+ // all okay.
+ return 0;
+
+close_current_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ current_fd = -1;
+close_basedir_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+ basedir_fd = -1;
+close_fsid_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+done:
+ assert(!m_filestore_fail_eio || ret != -EIO);
+ return ret;
+}
+
+void FileStore::init_temp_collections()
+{
+ dout(10) << __func__ << dendl;
+ vector<coll_t> ls;
+ int r = list_collections(ls, true);
+ assert(r >= 0);
+
+ dout(20) << " ls " << ls << dendl;
+
+ SequencerPosition spos;
+
+ set<coll_t> temps;
+ for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p)
+ if (p->is_temp())
+ temps.insert(*p);
+ dout(20) << " temps " << temps << dendl;
+
+ for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ if (p->is_temp())
+ continue;
+ if (p->is_meta())
+ continue;
+ coll_t temp = p->get_temp();
+ if (temps.count(temp)) {
+ temps.erase(temp);
+ } else {
+ dout(10) << __func__ << " creating " << temp << dendl;
+ r = _create_collection(temp, spos);
+ assert(r == 0);
+ }
+ }
+
+ for (set<coll_t>::iterator p = temps.begin(); p != temps.end(); ++p) {
+ dout(10) << __func__ << " removing stray " << *p << dendl;
+ r = _collection_remove_recursive(*p, spos);
+ assert(r == 0);
+ }
+}
+
+int FileStore::umount()
+{
+ dout(5) << "umount " << basedir << dendl;
+
+ flush();
+ sync();
+ do_force_sync();
+
+ lock.Lock();
+ stop = true;
+ sync_cond.Signal();
+ lock.Unlock();
+ sync_thread.join();
+ wbthrottle.stop();
+ op_tp.stop();
+
+ journal_stop();
+ if (!(generic_flags & SKIP_JOURNAL_REPLAY))
+ journal_write_close();
+
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ (*it)->stop();
+ }
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ (*it)->stop();
+ }
+
+ if (fsid_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ }
+ if (op_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+ op_fd = -1;
+ }
+ if (current_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ current_fd = -1;
+ }
+ if (basedir_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+ basedir_fd = -1;
+ }
+
+ force_sync = false;
+
+ delete backend;
+ backend = NULL;
+
+ object_map.reset();
+
+ {
+ Mutex::Locker l(sync_entry_timeo_lock);
+ timer.shutdown();
+ }
+
+ // nothing
+ return 0;
+}
+
+
+
+
+/// -----------------------------
+
+FileStore::Op *FileStore::build_op(list<Transaction*>& tls,
+ Context *onreadable,
+ Context *onreadable_sync,
+ TrackedOpRef osd_op)
+{
+ uint64_t bytes = 0, ops = 0;
+ for (list<Transaction*>::iterator p = tls.begin();
+ p != tls.end();
+ ++p) {
+ bytes += (*p)->get_num_bytes();
+ ops += (*p)->get_num_ops();
+ }
+
+ Op *o = new Op;
+ o->start = ceph_clock_now(g_ceph_context);
+ o->tls.swap(tls);
+ o->onreadable = onreadable;
+ o->onreadable_sync = onreadable_sync;
+ o->ops = ops;
+ o->bytes = bytes;
+ o->osd_op = osd_op;
+ return o;
+}
+
+
+
+void FileStore::queue_op(OpSequencer *osr, Op *o)
+{
+ // queue op on sequencer, then queue sequencer for the threadpool,
+ // so that regardless of which order the threads pick up the
+ // sequencer, the op order will be preserved.
+
+ osr->queue(o);
+
+ logger->inc(l_os_ops);
+ logger->inc(l_os_bytes, o->bytes);
+
+ dout(5) << "queue_op " << o << " seq " << o->op
+ << " " << *osr
+ << " " << o->bytes << " bytes"
+ << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)"
+ << dendl;
+ op_wq.queue(osr);
+}
+
+void FileStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle)
+{
+ // Do not call while holding the journal lock!
+ uint64_t max_ops = m_filestore_queue_max_ops;
+ uint64_t max_bytes = m_filestore_queue_max_bytes;
+
+ if (backend->can_checkpoint() && is_committing()) {
+ max_ops += m_filestore_queue_committing_max_ops;
+ max_bytes += m_filestore_queue_committing_max_bytes;
+ }
+
+ logger->set(l_os_oq_max_ops, max_ops);
+ logger->set(l_os_oq_max_bytes, max_bytes);
+
+ if (handle)
+ handle->suspend_tp_timeout();
+ if (throttle_ops.should_wait(1) ||
+ (throttle_bytes.get_current() // let single large ops through!
+ && throttle_bytes.should_wait(o->bytes))) {
+ dout(2) << "waiting " << throttle_ops.get_current() + 1 << " > " << max_ops << " ops || "
+ << throttle_bytes.get_current() + o->bytes << " > " << max_bytes << dendl;
+ }
+ throttle_ops.get();
+ throttle_bytes.get(o->bytes);
+ if (handle)
+ handle->reset_tp_timeout();
+
+ logger->set(l_os_oq_ops, throttle_ops.get_current());
+ logger->set(l_os_oq_bytes, throttle_bytes.get_current());
+}
+
+void FileStore::op_queue_release_throttle(Op *o)
+{
+ throttle_ops.put();
+ throttle_bytes.put(o->bytes);
+ logger->set(l_os_oq_ops, throttle_ops.get_current());
+ logger->set(l_os_oq_bytes, throttle_bytes.get_current());
+}
+
+void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
+{
+ wbthrottle.throttle();
+ // inject a stall?
+ if (g_conf->filestore_inject_stall) {
+ int orig = g_conf->filestore_inject_stall;
+ dout(5) << "_do_op filestore_inject_stall " << orig << ", sleeping" << dendl;
+ for (int n = 0; n < g_conf->filestore_inject_stall; n++)
+ sleep(1);
+ g_conf->set_val("filestore_inject_stall", "0");
+ dout(5) << "_do_op done stalling" << dendl;
+ }
+
+ osr->apply_lock.Lock();
+ Op *o = osr->peek_queue();
+ apply_manager.op_apply_start(o->op);
+ dout(5) << "_do_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " start" << dendl;
+ int r = _do_transactions(o->tls, o->op, &handle);
+ apply_manager.op_apply_finish(o->op);
+ dout(10) << "_do_op " << o << " seq " << o->op << " r = " << r
+ << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl;
+}
+
+void FileStore::_finish_op(OpSequencer *osr)
+{
+ list<Context*> to_queue;
+ Op *o = osr->dequeue(&to_queue);
+
+ utime_t lat = ceph_clock_now(g_ceph_context);
+ lat -= o->start;
+
+ dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl;
+ osr->apply_lock.Unlock(); // locked in _do_op
+
+ // called with tp lock held
+ op_queue_release_throttle(o);
+
+ logger->tinc(l_os_apply_lat, lat);
+
+ if (o->onreadable_sync) {
+ o->onreadable_sync->complete(0);
+ }
+ if (o->onreadable) {
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable);
+ }
+ if (!to_queue.empty()) {
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue);
+ }
+ delete o;
+}
+
+
+struct C_JournaledAhead : public Context {
+ FileStore *fs;
+ FileStore::OpSequencer *osr;
+ FileStore::Op *o;
+ Context *ondisk;
+
+ C_JournaledAhead(FileStore *f, FileStore::OpSequencer *os, FileStore::Op *o, Context *ondisk):
+ fs(f), osr(os), o(o), ondisk(ondisk) { }
+ void finish(int r) {
+ fs->_journaled_ahead(osr, o, ondisk);
+ }
+};
+
+int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
+ TrackedOpRef osd_op,
+ ThreadPool::TPHandle *handle)
+{
+ Context *onreadable;
+ Context *ondisk;
+ Context *onreadable_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &onreadable, &ondisk, &onreadable_sync);
+ if (g_conf->filestore_blackhole) {
+ dout(0) << "queue_transactions filestore_blackhole = TRUE, dropping transaction" << dendl;
+ delete ondisk;
+ delete onreadable;
+ delete onreadable_sync;
+ return 0;
+ }
+
+ utime_t start = ceph_clock_now(g_ceph_context);
+ // set up the sequencer
+ OpSequencer *osr;
+ assert(posr);
+ if (posr->p) {
+ osr = static_cast<OpSequencer *>(posr->p.get());
+ dout(5) << "queue_transactions existing " << osr << " " << *osr << dendl;
+ } else {
+ osr = new OpSequencer(next_osr_id.inc());
+ osr->set_cct(g_ceph_context);
+ osr->parent = posr;
+ posr->p = osr;
+ dout(5) << "queue_transactions new " << osr << " " << *osr << dendl;
+ }
+
+ // used to include osr information in tracepoints during transaction apply
+ for (list<ObjectStore::Transaction*>::iterator i = tls.begin(); i != tls.end(); ++i) {
+ (*i)->set_osr(osr);
+ }
+
+ if (journal && journal->is_writeable() && !m_filestore_journal_trailing) {
+ Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+ op_queue_reserve_throttle(o, handle);
+ journal->throttle();
+ // prepare and encode transaction data outside of the lock
+ bufferlist tbl;
+ int orig_len = journal->prepare_entry(o->tls, &tbl);
+ uint64_t op_num = submit_manager.op_submit_start();
+ o->op = op_num;
+
+ if (m_filestore_do_dump)
+ dump_transactions(o->tls, o->op, osr);
+
+ if (m_filestore_journal_parallel) {
+ dout(5) << "queue_transactions (parallel) " << o->op << " " << o->tls << dendl;
+
+ _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);
+
+ // queue inside submit_manager op submission lock
+ queue_op(osr, o);
+ } else if (m_filestore_journal_writeahead) {
+ dout(5) << "queue_transactions (writeahead) " << o->op << " " << o->tls << dendl;
+
+ osr->queue_journal(o->op);
+
+ _op_journal_transactions(tbl, orig_len, o->op,
+ new C_JournaledAhead(this, osr, o, ondisk),
+ osd_op);
+ } else {
+ assert(0);
+ }
+ submit_manager.op_submit_finish(op_num);
+ utime_t end = ceph_clock_now(g_ceph_context);
+ logger->tinc(l_os_queue_lat, end - start);
+ return 0;
+ }
+
+ if (!journal) {
+ Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+ dout(5) << __func__ << " (no journal) " << o << " " << tls << dendl;
+
+ op_queue_reserve_throttle(o, handle);
+
+ uint64_t op_num = submit_manager.op_submit_start();
+ o->op = op_num;
+
+ if (m_filestore_do_dump)
+ dump_transactions(o->tls, o->op, osr);
+
+ queue_op(osr, o);
+
+ if (ondisk)
+ apply_manager.add_waiter(op_num, ondisk);
+ submit_manager.op_submit_finish(op_num);
+ utime_t end = ceph_clock_now(g_ceph_context);
+ logger->tinc(l_os_queue_lat, end - start);
+ return 0;
+ }
+
+ assert(journal);
+ // prepare and encode transaction data outside of the lock
+ bufferlist tbl;
+ int orig_len = -1;
+ if (journal->is_writeable()) {
+ orig_len = journal->prepare_entry(tls, &tbl);
+ }
+ uint64_t op = submit_manager.op_submit_start();
+ dout(5) << "queue_transactions (trailing journal) " << op << " " << tls << dendl;
+
+ if (m_filestore_do_dump)
+ dump_transactions(tls, op, osr);
+
+ apply_manager.op_apply_start(op);
+ int r = do_transactions(tls, op);
+
+ if (r >= 0) {
+ _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op);
+ } else {
+ delete ondisk;
+ }
+
+ // start on_readable finisher after we queue journal item, as on_readable callback
+ // is allowed to delete the Transaction
+ if (onreadable_sync) {
+ onreadable_sync->complete(r);
+ }
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r);
+
+ submit_manager.op_submit_finish(op);
+ apply_manager.op_apply_finish(op);
+
+ utime_t end = ceph_clock_now(g_ceph_context);
+ logger->tinc(l_os_queue_lat, end - start);
+ return r;
+}
+
+void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
+{
+ dout(5) << "_journaled_ahead " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl;
+
+ // this should queue in order because the journal does its completions in order.
+ queue_op(osr, o);
+
+ list<Context*> to_queue;
+ osr->dequeue_journal(&to_queue);
+
+ // do ondisk completions async, to prevent any onreadable_sync completions
+ // getting blocked behind an ondisk completion.
+ if (ondisk) {
+ dout(10) << " queueing ondisk " << ondisk << dendl;
+ ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk);
+ }
+ if (!to_queue.empty()) {
+ ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue);
+ }
+}
+
+int FileStore::_do_transactions(
+ list<Transaction*> &tls,
+ uint64_t op_seq,
+ ThreadPool::TPHandle *handle)
+{
+ int r = 0;
+ int trans_num = 0;
+
+ for (list<Transaction*>::iterator p = tls.begin();
+ p != tls.end();
+ ++p, trans_num++) {
+ r = _do_transaction(**p, op_seq, trans_num, handle);
+ if (r < 0)
+ break;
+ if (handle)
+ handle->reset_tp_timeout();
+ }
+
+ return r;
+}
+
+void FileStore::_set_global_replay_guard(coll_t cid,
+ const SequencerPosition &spos)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ // sync all previous operations on this sequencer
+ int ret = object_map->sync();
+ if (ret < 0) {
+ derr << __func__ << " : omap sync error " << cpp_strerror(ret) << dendl;
+ assert(0 == "_set_global_replay_guard failed");
+ }
+ ret = sync_filesystem(basedir_fd);
+ if (ret < 0) {
+ derr << __func__ << " :sync_filesytem error " << cpp_strerror(ret) << dendl;
+ assert(0 == "_set_global_replay_guard failed");
+ }
+
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ int err = errno;
+ derr << __func__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+ assert(0 == "_set_global_replay_guard failed");
+ }
+
+ _inject_failure();
+
+ // then record that we did it
+ bufferlist v;
+ ::encode(spos, v);
+ int r = chain_fsetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
+ if (r < 0) {
+ derr << __func__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR
+ << " got " << cpp_strerror(r) << dendl;
+ assert(0 == "fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ ::fsync(fd);
+
+ _inject_failure();
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ dout(10) << __func__ << ": " << spos << " done" << dendl;
+}
+
+int FileStore::_check_global_replay_guard(coll_t cid,
+ const SequencerPosition& spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ dout(10) << __func__ << ": " << cid << " dne" << dendl;
+ return 1; // if collection does not exist, there is no guard, and we can replay.
+ }
+
+ char buf[100];
+ int r = chain_fgetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, buf, sizeof(buf));
+ if (r < 0) {
+ dout(20) << __func__ << " no xattr" << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return 1; // no xattr
+ }
+ bufferlist bl;
+ bl.append(buf, r);
+
+ SequencerPosition opos;
+ bufferlist::iterator p = bl.begin();
+ ::decode(opos, p);
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return spos >= opos ? 1 : -1;
+}
+
+
+void FileStore::_set_replay_guard(coll_t cid,
+ const SequencerPosition &spos,
+ bool in_progress=false)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ int err = errno;
+ derr << "_set_replay_guard " << cid << " error " << cpp_strerror(err) << dendl;
+ assert(0 == "_set_replay_guard failed");
+ }
+ _set_replay_guard(fd, spos, 0, in_progress);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+
+void FileStore::_set_replay_guard(int fd,
+ const SequencerPosition& spos,
+ const ghobject_t *hoid,
+ bool in_progress)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ dout(10) << "_set_replay_guard " << spos << (in_progress ? " START" : "") << dendl;
+
+ _inject_failure();
+
+ // first make sure the previous operation commits
+ ::fsync(fd);
+
+  // sync object_map too.  even if this object has no header or keys now,
+  // it may have had them in the past and then removed them, so always
+  // sync.
+ object_map->sync(hoid, &spos);
+
+ _inject_failure();
+
+ // then record that we did it
+ bufferlist v(40);
+ ::encode(spos, v);
+ ::encode(in_progress, v);
+ int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
+ if (r < 0) {
+ derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+ assert(0 == "fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ ::fsync(fd);
+
+ _inject_failure();
+
+ dout(10) << "_set_replay_guard " << spos << " done" << dendl;
+}
+
+void FileStore::_close_replay_guard(coll_t cid,
+ const SequencerPosition &spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ int err = errno;
+ derr << "_close_replay_guard " << cid << " error " << cpp_strerror(err) << dendl;
+ assert(0 == "_close_replay_guard failed");
+ }
+ _close_replay_guard(fd, spos);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ dout(10) << "_close_replay_guard " << spos << dendl;
+
+ _inject_failure();
+
+ // then record that we are done with this operation
+ bufferlist v(40);
+ ::encode(spos, v);
+ bool in_progress = false;
+ ::encode(in_progress, v);
+ int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
+ if (r < 0) {
+ derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+ assert(0 == "fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ ::fsync(fd);
+
+ _inject_failure();
+
+ dout(10) << "_close_replay_guard " << spos << " done" << dendl;
+}
+
+int FileStore::_check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& spos)
+{
+ if (!replaying || backend->can_checkpoint())
+ return 1;
+
+ int r = _check_global_replay_guard(cid, spos);
+ if (r < 0)
+ return r;
+
+ FDRef fd;
+ r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ dout(10) << "_check_replay_guard " << cid << " " << oid << " dne" << dendl;
+ return 1; // if file does not exist, there is no guard, and we can replay.
+ }
+ int ret = _check_replay_guard(**fd, spos);
+ lfn_close(fd);
+ return ret;
+}
+
+int FileStore::_check_replay_guard(coll_t cid, const SequencerPosition& spos)
+{
+ if (!replaying || backend->can_checkpoint())
+ return 1;
+
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ dout(10) << "_check_replay_guard " << cid << " dne" << dendl;
+ return 1; // if collection does not exist, there is no guard, and we can replay.
+ }
+ int ret = _check_replay_guard(fd, spos);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return ret;
+}
+
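+// Returns 1 if the op at spos should be replayed (no guard, or the guard is
+// older), -1 if it was already applied and must be skipped, and 0 if the
+// guard matches an op that was still in progress, so replay proceeds
+// conditionally.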
+int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
+{
+ if (!replaying || backend->can_checkpoint())
+ return 1;
+
+ char buf[100];
+ int r = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, buf, sizeof(buf));
+ if (r < 0) {
+ dout(20) << "_check_replay_guard no xattr" << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return 1; // no xattr
+ }
+ bufferlist bl;
+ bl.append(buf, r);
+
+ SequencerPosition opos;
+ bufferlist::iterator p = bl.begin();
+ ::decode(opos, p);
+ bool in_progress = false;
+ if (!p.end()) // older journals don't have this
+ ::decode(in_progress, p);
+ if (opos > spos) {
+ dout(10) << "_check_replay_guard object has " << opos << " > current pos " << spos
+ << ", now or in future, SKIPPING REPLAY" << dendl;
+ return -1;
+ } else if (opos == spos) {
+ if (in_progress) {
+ dout(10) << "_check_replay_guard object has " << opos << " == current pos " << spos
+ << ", in_progress=true, CONDITIONAL REPLAY" << dendl;
+ return 0;
+ } else {
+ dout(10) << "_check_replay_guard object has " << opos << " == current pos " << spos
+ << ", in_progress=false, SKIPPING REPLAY" << dendl;
+ return -1;
+ }
+ } else {
+ dout(10) << "_check_replay_guard object has " << opos << " < current pos " << spos
+ << ", in past, will replay" << dendl;
+ return 1;
+ }
+}
+
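+// Decode and apply each op in the transaction.  The SequencerPosition
+// (op_seq, trans_num, op index within the transaction) uniquely identifies
+// every op and is what the replay guards compare against.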
+unsigned FileStore::_do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle)
+{
+ dout(10) << "_do_transaction on " << &t << dendl;
+
+#ifdef WITH_LTTNG
+ const char *osr_name = t.get_osr() ? static_cast<OpSequencer*>(t.get_osr())->get_name().c_str() : "<NULL>";
+#endif
+
+ Transaction::iterator i = t.begin();
+
+ SequencerPosition spos(op_seq, trans_num, 0);
+ while (i.have_op()) {
+ if (handle)
+ handle->reset_tp_timeout();
+
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ _inject_failure();
+
+ switch (op->op) {
+ case Transaction::OP_NOP:
+ break;
+ case Transaction::OP_TOUCH:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ tracepoint(objectstore, touch_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _touch(cid, oid);
+ tracepoint(objectstore, touch_exit, r);
+ }
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, write_enter, osr_name, off, len);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _write(cid, oid, off, len, bl, fadvise_flags);
+ tracepoint(objectstore, write_exit, r);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ tracepoint(objectstore, zero_enter, osr_name, off, len);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _zero(cid, oid, off, len);
+ tracepoint(objectstore, zero_exit, r);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ uint64_t off = op->off;
+ tracepoint(objectstore, truncate_enter, osr_name, off);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _truncate(cid, oid, off);
+ tracepoint(objectstore, truncate_exit, r);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ tracepoint(objectstore, remove_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _remove(cid, oid, spos);
+ tracepoint(objectstore, remove_exit, r);
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, setattr_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0) {
+ map<string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ r = _setattrs(cid, oid, to_set, spos);
+ if (r == -ENOSPC)
+ dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid
+ << " name " << name << " size " << bl.length() << dendl;
+ }
+ tracepoint(objectstore, setattr_exit, r);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ tracepoint(objectstore, setattrs_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _setattrs(cid, oid, aset, spos);
+ tracepoint(objectstore, setattrs_exit, r);
+ if (r == -ENOSPC)
+ dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl;
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ string name = i.decode_string();
+ tracepoint(objectstore, rmattr_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _rmattr(cid, oid, name.c_str(), spos);
+ tracepoint(objectstore, rmattr_exit, r);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ tracepoint(objectstore, rmattrs_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _rmattrs(cid, oid, spos);
+ tracepoint(objectstore, rmattrs_exit, r);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ tracepoint(objectstore, clone_enter, osr_name);
+ r = _clone(cid, oid, noid, spos);
+ tracepoint(objectstore, clone_exit, r);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ _kludge_temp_object_collection(cid, noid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ tracepoint(objectstore, clone_range_enter, osr_name, len);
+ r = _clone_range(cid, oid, noid, off, len, off, spos);
+ tracepoint(objectstore, clone_range_exit, r);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ _kludge_temp_object_collection(cid, noid);
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ tracepoint(objectstore, clone_range2_enter, osr_name, len);
+ r = _clone_range(cid, oid, noid, srcoff, len, dstoff, spos);
+ tracepoint(objectstore, clone_range2_exit, r);
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ tracepoint(objectstore, mkcoll_enter, osr_name);
+ if (_check_replay_guard(cid, spos) > 0)
+ r = _create_collection(cid, spos);
+ tracepoint(objectstore, mkcoll_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t type = op->hint_type;
+ bufferlist hint;
+ i.decode_bl(hint);
+ bufferlist::iterator hiter = hint.begin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ ::decode(pg_num, hiter);
+ ::decode(num_objs, hiter);
+ if (_check_replay_guard(cid, spos) > 0) {
+ r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos);
+ }
+ } else {
+ // Ignore the hint
+ dout(10) << "Unrecognized collection hint type: " << type << dendl;
+ }
+ }
+ break;
+
+ case Transaction::OP_RMCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ tracepoint(objectstore, rmcoll_enter, osr_name);
+ if (_check_replay_guard(cid, spos) > 0)
+ r = _destroy_collection(cid);
+ tracepoint(objectstore, rmcoll_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ {
+ coll_t ocid = i.get_cid(op->cid);
+ coll_t ncid = i.get_cid(op->dest_cid);
+ ghobject_t oid = i.get_oid(op->oid);
+
+ assert(oid.hobj.pool >= -1);
+
+ // always followed by OP_COLL_REMOVE
+ Transaction::Op *op2 = i.decode_op();
+ coll_t ocid2 = i.get_cid(op2->cid);
+ ghobject_t oid2 = i.get_oid(op2->oid);
+ assert(op2->op == Transaction::OP_COLL_REMOVE);
+ assert(ocid2 == ocid);
+ assert(oid2 == oid);
+
+ tracepoint(objectstore, coll_add_enter);
+ r = _collection_add(ncid, ocid, oid, spos);
+ tracepoint(objectstore, coll_add_exit, r);
+ spos.op++;
+ if (r < 0)
+ break;
+ tracepoint(objectstore, coll_remove_enter, osr_name);
+ if (_check_replay_guard(ocid, oid, spos) > 0)
+ r = _remove(ocid, oid, spos);
+ tracepoint(objectstore, coll_remove_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ {
+ // WARNING: this is deprecated and buggy; only here to replay old journals.
+ coll_t ocid = i.get_cid(op->cid);
+ coll_t ncid = i.get_cid(op->dest_cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ tracepoint(objectstore, coll_move_enter);
+ r = _collection_add(ocid, ncid, oid, spos);
+ if (r == 0 &&
+ (_check_replay_guard(ocid, oid, spos) > 0))
+ r = _remove(ocid, oid, spos);
+ tracepoint(objectstore, coll_move_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ coll_t oldcid = i.get_cid(op->cid);
+ ghobject_t oldoid = i.get_oid(op->oid);
+ coll_t newcid = i.get_cid(op->dest_cid);
+ ghobject_t newoid = i.get_oid(op->dest_oid);
+ _kludge_temp_object_collection(oldcid, oldoid);
+ _kludge_temp_object_collection(newcid, newoid);
+ tracepoint(objectstore, coll_move_rename_enter);
+ r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
+ tracepoint(objectstore, coll_move_rename_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, coll_setattr_enter, osr_name);
+ if (_check_replay_guard(cid, spos) > 0)
+ r = _collection_setattr(cid, name.c_str(), bl.c_str(), bl.length());
+ tracepoint(objectstore, coll_setattr_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ string name = i.decode_string();
+ tracepoint(objectstore, coll_rmattr_enter, osr_name);
+ if (_check_replay_guard(cid, spos) > 0)
+ r = _collection_rmattr(cid, name.c_str());
+ tracepoint(objectstore, coll_rmattr_exit, r);
+ }
+ break;
+
+ case Transaction::OP_STARTSYNC:
+ tracepoint(objectstore, startsync_enter, osr_name);
+ _start_sync();
+ tracepoint(objectstore, startsync_exit);
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ {
+ r = -EOPNOTSUPP;
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ tracepoint(objectstore, omap_clear_enter, osr_name);
+ r = _omap_clear(cid, oid, spos);
+ tracepoint(objectstore, omap_clear_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ map<string, bufferlist> aset;
+ i.decode_attrset(aset);
+ tracepoint(objectstore, omap_setkeys_enter, osr_name);
+ r = _omap_setkeys(cid, oid, aset, spos);
+ tracepoint(objectstore, omap_setkeys_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ set<string> keys;
+ i.decode_keyset(keys);
+ tracepoint(objectstore, omap_rmkeys_enter, osr_name);
+ r = _omap_rmkeys(cid, oid, keys, spos);
+ tracepoint(objectstore, omap_rmkeys_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ tracepoint(objectstore, omap_rmkeyrange_enter, osr_name);
+ r = _omap_rmkeyrange(cid, oid, first, last, spos);
+ tracepoint(objectstore, omap_rmkeyrange_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, omap_setheader_enter, osr_name);
+ r = _omap_setheader(cid, oid, bl, spos);
+ tracepoint(objectstore, omap_setheader_exit, r);
+ }
+ break;
+ case Transaction::OP_SPLIT_COLLECTION:
+ {
+ assert(0 == "not legacy journal; upgrade to firefly first");
+ }
+ break;
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ coll_t dest = i.get_cid(op->dest_cid);
+ tracepoint(objectstore, split_coll2_enter, osr_name);
+ r = _split_collection(cid, bits, rem, dest, spos);
+ tracepoint(objectstore, split_coll2_exit, r);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ _kludge_temp_object_collection(cid, oid);
+ uint64_t expected_object_size = op->expected_object_size;
+ uint64_t expected_write_size = op->expected_write_size;
+ tracepoint(objectstore, setallochint_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _set_alloc_hint(cid, oid, expected_object_size,
+ expected_write_size);
+ tracepoint(objectstore, setallochint_exit, r);
+ }
+ break;
+
+ default:
+ derr << "bad op " << op->op << dendl;
+ assert(0);
+ }
+
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD))
+ // -ENOENT is normally okay
+ // ...including on a replayed OP_RMCOLL with checkpoint mode
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (op->op == Transaction::OP_SETALLOCHINT)
+ // Either EOPNOTSUPP or EINVAL most probably. EINVAL in most
+ // cases means invalid hint size (e.g. too big, not a multiple
+ // of block size, etc) or, at least on xfs, an attempt to set
+ // or change it when the file is not empty. However,
+ // OP_SETALLOCHINT is advisory, so ignore all errors.
+ ok = true;
+
+ if (replaying && !backend->can_checkpoint()) {
+ if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) {
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+ ok = true;
+ }
+ if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) {
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+ ok = true;
+ }
+ if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) {
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+ ok = true;
+ }
+ if (r == -ERANGE) {
+ dout(10) << "tolerating ERANGE on replay" << dendl;
+ ok = true;
+ }
+ if (r == -ENOENT) {
+ dout(10) << "tolerating ENOENT on replay" << dendl;
+ ok = true;
+ }
+ }
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2))
+ msg = "ENOENT on clone suggests osd bug";
+
+ if (r == -ENOSPC)
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC handling not implemented";
+
+ if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ }
+
+ dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op
+ << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl;
+ dout(0) << msg << dendl;
+ dout(0) << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+
+ if (r == -EMFILE) {
+ dump_open_fds(g_ceph_context);
+ }
+
+ assert(0 == "unexpected error");
+ }
+ }
+
+ spos.op++;
+ }
+
+ _inject_failure();
+
+ return 0; // FIXME count errors
+}
+
+ /*********************************************/
+
+
+
+// --------------------
+// objects
+
+bool FileStore::exists(coll_t cid, const ghobject_t& oid)
+{
+ tracepoint(objectstore, exists_enter, cid.c_str());
+ _kludge_temp_object_collection(cid, oid);
+ struct stat st;
+ bool retval = stat(cid, oid, &st) == 0;
+ tracepoint(objectstore, exists_exit, retval);
+ return retval;
+}
+
+int FileStore::stat(
+ coll_t cid, const ghobject_t& oid, struct stat *st, bool allow_eio)
+{
+ tracepoint(objectstore, stat_enter, cid.c_str());
+ _kludge_temp_object_collection(cid, oid);
+ int r = lfn_stat(cid, oid, st);
+ assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
+ if (r < 0) {
+ dout(10) << "stat " << cid << "/" << oid
+ << " = " << r << dendl;
+ } else {
+ dout(10) << "stat " << cid << "/" << oid
+ << " = " << r
+ << " (size " << st->st_size << ")" << dendl;
+ }
+ if (g_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, stat_exit, r);
+ return r;
+ }
+}
+
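+// Read len bytes at offset into bl; a len of 0 means read from offset to the
+// end of the object.  Applies any fadvise hints and, when sloppy CRC tracking
+// is enabled, verifies the data read against the recorded CRCs.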
+int FileStore::read(
+ coll_t cid,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags,
+ bool allow_eio)
+{
+ int got;
+ tracepoint(objectstore, read_enter, cid.c_str(), offset, len);
+ _kludge_temp_object_collection(cid, oid);
+
+ dout(15) << "read " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ dout(10) << "FileStore::read(" << cid << "/" << oid << ") open error: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (len == 0) {
+ struct stat st;
+ memset(&st, 0, sizeof(struct stat));
+ int r = ::fstat(**fd, &st);
+ assert(r == 0);
+ len = st.st_size;
+ }
+
+#ifdef HAVE_POSIX_FADVISE
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM)
+ posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM);
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)
+ posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL);
+#endif
+
+ bufferptr bptr(len); // prealloc space for entire read
+ got = safe_pread(**fd, bptr.c_str(), len, offset);
+ if (got < 0) {
+ dout(10) << "FileStore::read(" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl;
+ lfn_close(fd);
+ assert(allow_eio || !m_filestore_fail_eio || got != -EIO);
+ return got;
+ }
+ bptr.set_length(got); // properly size the buffer
+ bl.push_back(bptr); // put it in the target bufferlist
+
+#ifdef HAVE_POSIX_FADVISE
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
+ posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED);
+ if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL))
+ posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL);
+#endif
+
+ if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
+ ostringstream ss;
+ int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
+ if (errors > 0) {
+ dout(0) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
+ << got << " ... BAD CRC:\n" << ss.str() << dendl;
+ assert(0 == "bad crc on read");
+ }
+ }
+
+ lfn_close(fd);
+
+ dout(10) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
+ << got << "/" << len << dendl;
+ if (g_conf->filestore_debug_inject_read_err &&
+ debug_data_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, read_exit, got);
+ return got;
+ }
+}
+
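+// Build an offset -> length map of the allocated extents in
+// [offset, offset+len) using the backend's fiemap, merging adjacent extents
+// and clamping the result to the requested range.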
+int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m)
+{
+ struct fiemap *fiemap = NULL;
+ uint64_t i;
+ struct fiemap_extent *extent = NULL;
+ int r = 0;
+
+ r = backend->do_fiemap(fd, offset, len, &fiemap);
+ if (r < 0)
+ return r;
+
+ if (fiemap->fm_mapped_extents == 0) {
+ free(fiemap);
+ return r;
+ }
+
+ extent = &fiemap->fm_extents[0];
+
+ /* start where we were asked to start */
+ if (extent->fe_logical < offset) {
+ extent->fe_length -= offset - extent->fe_logical;
+ extent->fe_logical = offset;
+ }
+
+ i = 0;
+
+ while (i < fiemap->fm_mapped_extents) {
+ struct fiemap_extent *next = extent + 1;
+
+ dout(10) << "FileStore::fiemap() fm_mapped_extents=" << fiemap->fm_mapped_extents
+ << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl;
+
+ /* try to merge extents */
+ while ((i < fiemap->fm_mapped_extents - 1) &&
+ (extent->fe_logical + extent->fe_length == next->fe_logical)) {
+ next->fe_length += extent->fe_length;
+ next->fe_logical = extent->fe_logical;
+ extent = next;
+ next = extent + 1;
+ i++;
+ }
+
+ if (extent->fe_logical + extent->fe_length > offset + len)
+ extent->fe_length = offset + len - extent->fe_logical;
+ (*m)[extent->fe_logical] = extent->fe_length;
+ i++;
+ extent++;
+ }
+ free(fiemap);
+
+ return r;
+}
+
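+// Like _do_fiemap, but implemented with lseek(SEEK_DATA)/lseek(SEEK_HOLE):
+// walk forward from offset recording each data segment until we pass
+// offset+len or get ENXIO (end of file).  Without SEEK_HOLE/SEEK_DATA the
+// whole range is treated as data.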
+int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m)
+{
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+ off_t hole_pos, data_pos;
+ int r = 0;
+
+  // An lseek that fails with ENXIO means the offset is at or beyond the end
+  // of the file, so there is nothing more to map.
+ off_t start = offset;
+ while(start < (off_t)(offset + len)) {
+ data_pos = lseek(fd, start, SEEK_DATA);
+ if (data_pos < 0) {
+ if (errno == ENXIO)
+ break;
+ else {
+ r = -errno;
+ dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else if (data_pos > (off_t)(offset + len)) {
+ break;
+ }
+
+ hole_pos = lseek(fd, data_pos, SEEK_HOLE);
+ if (hole_pos < 0) {
+ if (errno == ENXIO) {
+ break;
+ } else {
+ r = -errno;
+ dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ if (hole_pos >= (off_t)(offset + len)) {
+ (*m)[data_pos] = offset + len - data_pos;
+ break;
+ }
+ (*m)[data_pos] = hole_pos - data_pos;
+ start = hole_pos;
+ }
+
+ return r;
+#else
+ (*m)[offset] = len;
+ return 0;
+#endif
+}
+
+int FileStore::fiemap(coll_t cid, const ghobject_t& oid,
+ uint64_t offset, size_t len,
+ bufferlist& bl)
+{
+ tracepoint(objectstore, fiemap_enter, cid.c_str(), offset, len);
+ _kludge_temp_object_collection(cid, oid);
+
+ if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) ||
+ len <= (size_t)m_filestore_fiemap_threshold) {
+ map<uint64_t, uint64_t> m;
+ m[offset] = len;
+ ::encode(m, bl);
+ return 0;
+ }
+
+ dout(15) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+ map<uint64_t, uint64_t> exomap;
+ FDRef fd;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl;
+ goto done;
+ }
+
+ if (backend->has_seek_data_hole()) {
+ dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ r = _do_seek_hole_data(**fd, offset, len, &exomap);
+ } else if (backend->has_fiemap()) {
+ dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ r = _do_fiemap(**fd, offset, len, &exomap);
+ }
+
+done:
+ if (r >= 0) {
+ lfn_close(fd);
+ ::encode(exomap, bl);
+ }
+
+ dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << exomap.size() << " " << exomap << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ tracepoint(objectstore, fiemap_exit, r);
+ return r;
+}
+
+
+int FileStore::_remove(coll_t cid, const ghobject_t& oid,
+ const SequencerPosition &spos)
+{
+ dout(15) << "remove " << cid << "/" << oid << dendl;
+ int r = lfn_unlink(cid, oid, spos);
+ dout(10) << "remove " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size)
+{
+ dout(15) << "truncate " << cid << "/" << oid << " size " << size << dendl;
+ int r = lfn_truncate(cid, oid, size);
+ dout(10) << "truncate " << cid << "/" << oid << " size " << size << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_touch(coll_t cid, const ghobject_t& oid)
+{
+ dout(15) << "touch " << cid << "/" << oid << dendl;
+
+ FDRef fd;
+ int r = lfn_open(cid, oid, true, &fd);
+ if (r < 0) {
+ return r;
+ } else {
+ lfn_close(fd);
+ }
+ dout(10) << "touch " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_write(coll_t cid, const ghobject_t& oid,
+ uint64_t offset, size_t len,
+ const bufferlist& bl, uint32_t fadvise_flags)
+{
+ dout(15) << "write " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ int r;
+
+ int64_t actual;
+
+ FDRef fd;
+ r = lfn_open(cid, oid, true, &fd);
+ if (r < 0) {
+ dout(0) << "write couldn't open " << cid << "/"
+ << oid << ": "
+ << cpp_strerror(r) << dendl;
+ goto out;
+ }
+
+ // seek
+ actual = ::lseek64(**fd, offset, SEEK_SET);
+ if (actual < 0) {
+ r = -errno;
+ dout(0) << "write lseek64 to " << offset << " failed: " << cpp_strerror(r) << dendl;
+ lfn_close(fd);
+ goto out;
+ }
+ if (actual != (int64_t)offset) {
+ dout(0) << "write lseek64 to " << offset << " gave bad offset " << actual << dendl;
+ r = -EIO;
+ lfn_close(fd);
+ goto out;
+ }
+
+ // write
+ r = bl.write_fd(**fd);
+ if (r == 0)
+ r = bl.length();
+
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_write(**fd, offset, len, bl);
+ assert(rc >= 0);
+ }
+
+ // flush?
+ if (!replaying &&
+ g_conf->filestore_wbthrottle_enable)
+ wbthrottle.queue_wb(fd, oid, offset, len,
+ fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+ lfn_close(fd);
+
+ out:
+ dout(10) << "write " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl;
+ return r;
+}
+
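+// Zero a range by punching a hole with fallocate(FALLOC_FL_PUNCH_HOLE) where
+// the kernel and filesystem support it, falling back to writing literal
+// zeros otherwise.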
+int FileStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len)
+{
+ dout(15) << "zero " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ int ret = 0;
+
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(DARWIN) && !defined(__FreeBSD__)
+ // first try to punch a hole.
+ FDRef fd;
+ ret = lfn_open(cid, oid, false, &fd);
+ if (ret < 0) {
+ goto out;
+ }
+
+ // first try fallocate
+  ret = fallocate(**fd, FALLOC_FL_PUNCH_HOLE, offset, len);
+  if (ret < 0)
+    ret = -errno;
+
+  if (ret >= 0 && m_filestore_sloppy_crc) {
+    int rc = backend->_crc_update_zero(**fd, offset, len);
+    assert(rc >= 0);
+  }
+  // close only after the CRC update above, which still needs the open fd
+  lfn_close(fd);
+
+ if (ret == 0)
+ goto out; // yay!
+ if (ret != -EOPNOTSUPP)
+ goto out; // some other error
+# endif
+#endif
+
+ // lame, kernel is old and doesn't support it.
+ // write zeros.. yuck!
+ dout(20) << "zero FALLOC_FL_PUNCH_HOLE not supported, falling back to writing zeros" << dendl;
+ {
+ bufferptr bp(len);
+ bp.zero();
+ bufferlist bl;
+ bl.push_back(bp);
+ ret = _write(cid, oid, offset, len, bl);
+ }
+
+ out:
+ dout(20) << "zero " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl;
+ return ret;
+}
+
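+// Clone an object's data, omap, and xattrs.  Clone is not idempotent, so the
+// destination is stamped with a replay guard once the copy completes.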
+int FileStore::_clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ const SequencerPosition& spos)
+{
+ dout(15) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
+
+ if (_check_replay_guard(cid, newoid, spos) < 0)
+ return 0;
+
+ int r;
+ FDRef o, n;
+ {
+ Index index;
+ r = lfn_open(cid, oldoid, false, &o, &index);
+ if (r < 0) {
+ goto out2;
+ }
+ assert(NULL != (index.index));
+ RWLock::WLocker l((index.index)->access_lock);
+
+ r = lfn_open(cid, newoid, true, &n, &index);
+ if (r < 0) {
+ goto out;
+ }
+ r = ::ftruncate(**n, 0);
+ if (r < 0) {
+ goto out3;
+ }
+ struct stat st;
+ ::fstat(**o, &st);
+ r = _do_clone_range(**o, **n, 0, st.st_size, 0);
+ if (r < 0) {
+ r = -errno;
+ goto out3;
+ }
+
+ dout(20) << "objectmap clone" << dendl;
+ r = object_map->clone(oldoid, newoid, &spos);
+ if (r < 0 && r != -ENOENT)
+ goto out3;
+ }
+
+ {
+ char buf[2];
+ map<string, bufferptr> aset;
+ r = _fgetattrs(**o, aset);
+ if (r < 0)
+ goto out3;
+
+ r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+ sizeof(XATTR_NO_SPILL_OUT), true);
+ } else {
+ r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+ sizeof(XATTR_SPILL_OUT), true);
+ }
+ if (r < 0)
+ goto out3;
+
+ r = _fsetattrs(**n, aset);
+ if (r < 0)
+ goto out3;
+ }
+
+ // clone is non-idempotent; record our work.
+ _set_replay_guard(**n, spos, &newoid);
+
+ out3:
+ lfn_close(n);
+ out:
+ lfn_close(o);
+ out2:
+ dout(10) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+}
+
+int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << "_do_clone_range copy " << srcoff << "~" << len << " to " << dstoff << dendl;
+ return backend->clone_range(from, to, srcoff, len, dstoff);
+}
+
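+// Copy only the allocated extents of the source range (found via
+// SEEK_DATA/SEEK_HOLE or fiemap), then ftruncate the destination so it covers
+// dstoff+len even if the tail of the range was a hole.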
+int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << __func__ << " " << srcoff << "~" << len << " to " << dstoff << dendl;
+ int r = 0;
+ map<uint64_t, uint64_t> exomap;
+ // fiemap doesn't allow zero length
+ if (len == 0)
+ return 0;
+
+ if (backend->has_seek_data_hole()) {
+ dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl;
+ r = _do_seek_hole_data(from, srcoff, len, &exomap);
+ } else if (backend->has_fiemap()) {
+ dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl;
+ r = _do_fiemap(from, srcoff, len, &exomap);
+ }
+
+ int64_t written = 0;
+ for (map<uint64_t, uint64_t>::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) {
+ uint64_t it_off = miter->first - srcoff + dstoff;
+ r = _do_copy_range(from, to, miter->first, miter->second, it_off, true);
+ if (r < 0) {
+ r = -errno;
+ derr << "FileStore::_do_copy_range: copy error at " << miter->first << "~" << miter->second
+ << " to " << it_off << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ written += miter->second;
+ }
+
+ if (r >= 0) {
+ if (m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+ assert(rc >= 0);
+ }
+ struct stat st;
+ r = ::fstat(to, &st);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+    if (st.st_size < (int64_t)(dstoff + len)) {
+ r = ::ftruncate(to, dstoff + len);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ }
+ r = written;
+ }
+
+ out:
+ dout(20) << __func__ << " " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+
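+// Byte-for-byte copy of a range: splice(2) through a pipe when the backend
+// supports it, otherwise a plain read/write loop.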
+int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc)
+{
+ dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << dendl;
+ int r = 0;
+ loff_t pos = srcoff;
+ loff_t end = srcoff + len;
+  int buflen = 4096 * 16;  // keep chunks within the default pipe capacity; see fcntl(2)
+
+#ifdef CEPH_HAVE_SPLICE
+ if (backend->has_splice()) {
+ int pipefd[2];
+ if (pipe(pipefd) < 0) {
+      r = -errno;
+      derr << " pipe got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ loff_t dstpos = dstoff;
+ while (pos < end) {
+ int l = MIN(end-pos, buflen);
+ r = safe_splice(from, &pos, pipefd[1], NULL, l, SPLICE_F_NONBLOCK);
+ dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl;
+ if (r < 0) {
+ derr << "FileStore::_do_copy_range: safe_splice read error at " << pos << "~" << len
+ << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ if (r == 0) {
+ // hrm, bad source range, wtf.
+ r = -ERANGE;
+ derr << "FileStore::_do_copy_range got short read result at " << pos
+ << " of fd " << from << " len " << len << dendl;
+ break;
+ }
+
+ r = safe_splice(pipefd[0], NULL, to, &dstpos, r, 0);
+ dout(10) << " safe_splice write to " << to << " len " << r
+ << " got " << r << dendl;
+ if (r < 0) {
+ derr << "FileStore::_do_copy_range: write error at " << pos << "~"
+ << r << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ }
+ close(pipefd[0]);
+ close(pipefd[1]);
+ } else
+#endif
+ {
+ int64_t actual;
+
+ actual = ::lseek64(from, srcoff, SEEK_SET);
+ if (actual != (int64_t)srcoff) {
+      r = -errno;
+ derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ actual = ::lseek64(to, dstoff, SEEK_SET);
+ if (actual != (int64_t)dstoff) {
+      r = -errno;
+ derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ char buf[buflen];
+ while (pos < end) {
+ int l = MIN(end-pos, buflen);
+ r = ::read(from, buf, l);
+ dout(25) << " read from " << pos << "~" << l << " got " << r << dendl;
+ if (r < 0) {
+ if (errno == EINTR) {
+ continue;
+ } else {
+ r = -errno;
+ derr << "FileStore::_do_copy_range: read error at " << pos << "~" << len
+ << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ }
+ if (r == 0) {
+ // hrm, bad source range, wtf.
+ r = -ERANGE;
+ derr << "FileStore::_do_copy_range got short read result at " << pos
+ << " of fd " << from << " len " << len << dendl;
+ break;
+ }
+ int op = 0;
+ while (op < r) {
+ int r2 = safe_write(to, buf+op, r-op);
+ dout(25) << " write to " << to << " len " << (r-op)
+ << " got " << r2 << dendl;
+ if (r2 < 0) {
+ r = r2;
+ derr << "FileStore::_do_copy_range: write error at " << pos << "~"
+ << r-op << ", " << cpp_strerror(r) << dendl;
+
+ break;
+ }
+ op += (r-op);
+ }
+ if (r < 0)
+ break;
+ pos += r;
+ }
+ }
+
+ assert(pos == end);
+ if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+ assert(rc >= 0);
+ }
+ dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff,
+ const SequencerPosition& spos)
+{
+ dout(15) << "clone_range " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl;
+
+ if (_check_replay_guard(cid, newoid, spos) < 0)
+ return 0;
+
+ int r;
+ FDRef o, n;
+ r = lfn_open(cid, oldoid, false, &o);
+ if (r < 0) {
+ goto out2;
+ }
+ r = lfn_open(cid, newoid, true, &n);
+ if (r < 0) {
+ goto out;
+ }
+ r = _do_clone_range(**o, **n, srcoff, len, dstoff);
+ if (r < 0) {
+ r = -errno;
+ goto out3;
+ }
+
+ // clone is non-idempotent; record our work.
+ _set_replay_guard(**n, spos, &newoid);
+
+ out3:
+ lfn_close(n);
+ out:
+ lfn_close(o);
+ out2:
+ dout(10) << "clone_range " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " "
+ << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+
+class SyncEntryTimeout : public Context {
+public:
+ SyncEntryTimeout(int commit_timeo)
+ : m_commit_timeo(commit_timeo)
+ {
+ }
+
+ void finish(int r) {
+ BackTrace *bt = new BackTrace(1);
+ generic_dout(-1) << "FileStore: sync_entry timed out after "
+ << m_commit_timeo << " seconds.\n";
+ bt->print(*_dout);
+ *_dout << dendl;
+ delete bt;
+ ceph_abort();
+ }
+private:
+ int m_commit_timeo;
+};
+
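+// Background sync thread: wake at least every max_sync_interval (or when a
+// sync is forced), then either take a backend checkpoint or syncfs and
+// persist the committed op_seq, trim old checkpoints, and complete any
+// waiting sync contexts.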
+void FileStore::sync_entry()
+{
+ lock.Lock();
+ while (!stop) {
+ utime_t max_interval;
+ max_interval.set_from_double(m_filestore_max_sync_interval);
+ utime_t min_interval;
+ min_interval.set_from_double(m_filestore_min_sync_interval);
+
+ utime_t startwait = ceph_clock_now(g_ceph_context);
+ if (!force_sync) {
+ dout(20) << "sync_entry waiting for max_interval " << max_interval << dendl;
+ sync_cond.WaitInterval(g_ceph_context, lock, max_interval);
+ } else {
+ dout(20) << "sync_entry not waiting, force_sync set" << dendl;
+ }
+
+ if (force_sync) {
+ dout(20) << "sync_entry force_sync set" << dendl;
+ force_sync = false;
+ } else {
+ // wait for at least the min interval
+ utime_t woke = ceph_clock_now(g_ceph_context);
+ woke -= startwait;
+ dout(20) << "sync_entry woke after " << woke << dendl;
+ if (woke < min_interval) {
+ utime_t t = min_interval;
+ t -= woke;
+ dout(20) << "sync_entry waiting for another " << t
+ << " to reach min interval " << min_interval << dendl;
+ sync_cond.WaitInterval(g_ceph_context, lock, t);
+ }
+ }
+
+ list<Context*> fin;
+ again:
+ fin.swap(sync_waiters);
+ lock.Unlock();
+
+ op_tp.pause();
+ if (apply_manager.commit_start()) {
+ utime_t start = ceph_clock_now(g_ceph_context);
+ uint64_t cp = apply_manager.get_committing_seq();
+
+ sync_entry_timeo_lock.Lock();
+ SyncEntryTimeout *sync_entry_timeo =
+ new SyncEntryTimeout(m_filestore_commit_timeout);
+ timer.add_event_after(m_filestore_commit_timeout, sync_entry_timeo);
+ sync_entry_timeo_lock.Unlock();
+
+ logger->set(l_os_committing, 1);
+
+ dout(15) << "sync_entry committing " << cp << dendl;
+ stringstream errstream;
+ if (g_conf->filestore_debug_omap_check && !object_map->check(errstream)) {
+ derr << errstream.str() << dendl;
+ assert(0);
+ }
+
+ if (backend->can_checkpoint()) {
+ int err = write_op_seq(op_fd, cp);
+ if (err < 0) {
+ derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+ assert(0 == "error during write_op_seq");
+ }
+
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+ uint64_t cid = 0;
+ err = backend->create_checkpoint(s, &cid);
+ if (err < 0) {
+	  derr << "snap create '" << s << "' got error " << cpp_strerror(err) << dendl;
+	  assert(0 == "create_checkpoint failed");
+ }
+
+ snaps.push_back(cp);
+ apply_manager.commit_started();
+ op_tp.unpause();
+
+ if (cid > 0) {
+ dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl;
+ err = backend->sync_checkpoint(cid);
+ if (err < 0) {
+ derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl;
+ assert(0 == "wait_sync got error");
+ }
+ dout(20) << " done waiting for checkpoint" << cid << " to complete" << dendl;
+ }
+ } else
+ {
+ apply_manager.commit_started();
+ op_tp.unpause();
+
+ object_map->sync();
+ int err = backend->syncfs();
+ if (err < 0) {
+ derr << "syncfs got " << cpp_strerror(err) << dendl;
+ assert(0 == "syncfs returned error");
+ }
+
+ err = write_op_seq(op_fd, cp);
+ if (err < 0) {
+ derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+ assert(0 == "error during write_op_seq");
+ }
+ err = ::fsync(op_fd);
+ if (err < 0) {
+ derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl;
+ assert(0 == "error during fsync of op_seq");
+ }
+ }
+
+ utime_t done = ceph_clock_now(g_ceph_context);
+ utime_t lat = done - start;
+ utime_t dur = done - startwait;
+ dout(10) << "sync_entry commit took " << lat << ", interval was " << dur << dendl;
+
+ logger->inc(l_os_commit);
+ logger->tinc(l_os_commit_lat, lat);
+ logger->tinc(l_os_commit_len, dur);
+
+ apply_manager.commit_finish();
+ wbthrottle.clear();
+
+ logger->set(l_os_committing, 0);
+
+ // remove old snaps?
+ if (backend->can_checkpoint()) {
+ char s[NAME_MAX];
+ while (snaps.size() > 2) {
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front());
+ snaps.pop_front();
+ dout(10) << "removing snap '" << s << "'" << dendl;
+ int r = backend->destroy_checkpoint(s);
+ if (r) {
+ int err = errno;
+ derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl;
+ }
+ }
+ }
+
+ dout(15) << "sync_entry committed to op_seq " << cp << dendl;
+
+ sync_entry_timeo_lock.Lock();
+ timer.cancel_event(sync_entry_timeo);
+ sync_entry_timeo_lock.Unlock();
+ } else {
+ op_tp.unpause();
+ }
+
+ lock.Lock();
+ finish_contexts(g_ceph_context, fin, 0);
+ fin.clear();
+ if (!sync_waiters.empty()) {
+ dout(10) << "sync_entry more waiters, committing again" << dendl;
+ goto again;
+ }
+ if (!stop && journal && journal->should_commit_now()) {
+ dout(10) << "sync_entry journal says we should commit again (probably is/was full)" << dendl;
+ goto again;
+ }
+ }
+ stop = false;
+ lock.Unlock();
+}
+
+void FileStore::_start_sync()
+{
+ if (!journal) { // don't do a big sync if the journal is on
+ dout(10) << "start_sync" << dendl;
+ sync_cond.Signal();
+ } else {
+ dout(10) << "start_sync - NOOP (journal is on)" << dendl;
+ }
+}
+
+void FileStore::do_force_sync()
+{
+ dout(10) << __func__ << dendl;
+ Mutex::Locker l(lock);
+ force_sync = true;
+ sync_cond.Signal();
+}
+
+void FileStore::start_sync(Context *onsafe)
+{
+ Mutex::Locker l(lock);
+ sync_waiters.push_back(onsafe);
+ sync_cond.Signal();
+ force_sync = true;
+ dout(10) << "start_sync" << dendl;
+}
+
+void FileStore::sync()
+{
+ Mutex l("FileStore::sync");
+ Cond c;
+ bool done;
+ C_SafeCond *fin = new C_SafeCond(&l, &c, &done);
+
+ start_sync(fin);
+
+ l.Lock();
+ while (!done) {
+ dout(10) << "sync waiting" << dendl;
+ c.Wait(l);
+ }
+ l.Unlock();
+ dout(10) << "sync done" << dendl;
+}
+
+void FileStore::_flush_op_queue()
+{
+ dout(10) << "_flush_op_queue draining op tp" << dendl;
+ op_wq.drain();
+ dout(10) << "_flush_op_queue waiting for apply finisher" << dendl;
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ (*it)->wait_for_empty();
+ }
+}
+
+/*
+ * flush - make every queued write readable
+ */
+void FileStore::flush()
+{
+ dout(10) << "flush" << dendl;
+
+ if (g_conf->filestore_blackhole) {
+ // wait forever
+ Mutex lock("FileStore::flush::lock");
+ Cond cond;
+ lock.Lock();
+ while (true)
+ cond.Wait(lock);
+ assert(0);
+ }
+
+ if (m_filestore_journal_writeahead) {
+ if (journal)
+ journal->flush();
+ dout(10) << "flush draining ondisk finisher" << dendl;
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ (*it)->wait_for_empty();
+ }
+ }
+
+ _flush_op_queue();
+ dout(10) << "flush complete" << dendl;
+}
+
+/*
+ * sync_and_flush - make every queued write readable AND committed to disk
+ */
+void FileStore::sync_and_flush()
+{
+ dout(10) << "sync_and_flush" << dendl;
+
+ if (m_filestore_journal_writeahead) {
+ if (journal)
+ journal->flush();
+ _flush_op_queue();
+ } else {
+ // includes m_filestore_journal_parallel
+ _flush_op_queue();
+ sync();
+ }
+ dout(10) << "sync_and_flush done" << dendl;
+}
+
+int FileStore::flush_journal()
+{
+ dout(10) << __func__ << dendl;
+ sync_and_flush();
+ sync();
+ return 0;
+}
+
+int FileStore::snapshot(const string& name)
+{
+ dout(10) << "snapshot " << name << dendl;
+ sync_and_flush();
+
+ if (!backend->can_checkpoint()) {
+ dout(0) << "snapshot " << name << " failed, not supported" << dendl;
+ return -EOPNOTSUPP;
+ }
+
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, name.c_str());
+
+ int r = backend->create_checkpoint(s, NULL);
+ if (r) {
+ r = -errno;
+ derr << "snapshot " << name << " failed: " << cpp_strerror(r) << dendl;
+ }
+
+ return r;
+}
+
+// -------------------------------
+// attributes
+
+int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp)
+{
+ char val[CHAIN_XATTR_MAX_BLOCK_LEN];
+ int l = chain_fgetxattr(fd, name, val, sizeof(val));
+ if (l >= 0) {
+ bp = buffer::create(l);
+ memcpy(bp.c_str(), val, l);
+ } else if (l == -ERANGE) {
+ l = chain_fgetxattr(fd, name, 0, 0);
+ if (l > 0) {
+ bp = buffer::create(l);
+ l = chain_fgetxattr(fd, name, bp.c_str(), l);
+ }
+ }
+ assert(!m_filestore_fail_eio || l != -EIO);
+ return l;
+}
+
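+// List the file's xattrs (retrying with a larger buffer on -ERANGE) and fetch
+// every attr recognized by parse_attrname() into aset.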
+int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset)
+{
+ // get attr list
+ char names1[100];
+ int len = chain_flistxattr(fd, names1, sizeof(names1)-1);
+ char *names2 = 0;
+ char *name = 0;
+ if (len == -ERANGE) {
+ len = chain_flistxattr(fd, 0, 0);
+ if (len < 0) {
+ assert(!m_filestore_fail_eio || len != -EIO);
+ return len;
+ }
+ dout(10) << " -ERANGE, len is " << len << dendl;
+ names2 = new char[len+1];
+ len = chain_flistxattr(fd, names2, len);
+ dout(10) << " -ERANGE, got " << len << dendl;
+ if (len < 0) {
+ assert(!m_filestore_fail_eio || len != -EIO);
+ delete[] names2;
+ return len;
+ }
+ name = names2;
+ } else if (len < 0) {
+ assert(!m_filestore_fail_eio || len != -EIO);
+ return len;
+ } else {
+ name = names1;
+ }
+ name[len] = 0;
+
+ char *end = name + len;
+ while (name < end) {
+ char *attrname = name;
+ if (parse_attrname(&name)) {
+ if (*name) {
+ dout(20) << "fgetattrs " << fd << " getting '" << name << "'" << dendl;
+ int r = _fgetattr(fd, attrname, aset[name]);
+ if (r < 0) {
+ delete[] names2;
+ return r;
+ }
+ }
+ }
+ name += strlen(name) + 1;
+ }
+
+ delete[] names2;
+ return 0;
+}
+
+int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset)
+{
+ for (map<string, bufferptr>::iterator p = aset.begin();
+ p != aset.end();
+ ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+ const char *val;
+ if (p->second.length())
+ val = p->second.c_str();
+ else
+ val = "";
+ // ??? Why do we skip setting all the other attrs if one fails?
+ int r = chain_fsetxattr(fd, n, val, p->second.length());
+ if (r < 0) {
+ derr << "FileStore::_setattrs: chain_setxattr returned " << r << dendl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+// debug EIO injection
+void FileStore::inject_data_error(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ dout(10) << __func__ << ": init error on " << oid << dendl;
+ data_error_set.insert(oid);
+}
+void FileStore::inject_mdata_error(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ dout(10) << __func__ << ": init error on " << oid << dendl;
+ mdata_error_set.insert(oid);
+}
+void FileStore::debug_obj_on_delete(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ dout(10) << __func__ << ": clear error on " << oid << dendl;
+ data_error_set.erase(oid);
+ mdata_error_set.erase(oid);
+}
+bool FileStore::debug_data_eio(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ if (data_error_set.count(oid)) {
+ dout(10) << __func__ << ": inject error on " << oid << dendl;
+ return true;
+ } else {
+ return false;
+ }
+}
+bool FileStore::debug_mdata_eio(const ghobject_t &oid) {
+ Mutex::Locker l(read_error_lock);
+ if (mdata_error_set.count(oid)) {
+ dout(10) << __func__ << ": inject error on " << oid << dendl;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+
+// objects
+
+int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp)
+{
+ tracepoint(objectstore, getattr_enter, cid.c_str());
+ _kludge_temp_object_collection(cid, oid);
+ dout(15) << "getattr " << cid << "/" << oid << " '" << name << "'" << dendl;
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = _fgetattr(**fd, n, bp);
+ lfn_close(fd);
+ if (r == -ENODATA) {
+ map<string, bufferlist> got;
+ set<string> to_get;
+ to_get.insert(string(name));
+ Index index;
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_xattrs(oid, to_get, &got);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " get_xattrs err r =" << r << dendl;
+ goto out;
+ }
+ if (got.empty()) {
+ dout(10) << __func__ << " got.size() is 0" << dendl;
+ return -ENODATA;
+ }
+ bp = bufferptr(got.begin()->second.c_str(),
+ got.begin()->second.length());
+ r = bp.length();
+ }
+ out:
+ dout(10) << "getattr " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ if (g_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, getattr_exit, r);
+ return r < 0 ? r : 0;
+ }
+}
+
+int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset)
+{
+ tracepoint(objectstore, getattrs_enter, cid.c_str());
+ _kludge_temp_object_collection(cid, oid);
+ set<string> omap_attrs;
+ map<string, bufferlist> omap_aset;
+ Index index;
+ dout(15) << "getattrs " << cid << "/" << oid << dendl;
+ FDRef fd;
+ bool spill_out = true;
+ char buf[2];
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+ spill_out = false;
+
+ r = _fgetattrs(**fd, aset);
+ if (r < 0) {
+ goto out;
+ }
+ lfn_close(fd);
+
+ if (!spill_out) {
+ dout(10) << __func__ << " no xattr exists in object_map r = " << r << dendl;
+ goto out;
+ }
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ goto out;
+ }
+ {
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+
+ r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ }
+ assert(omap_attrs.size() == omap_aset.size());
+ for (map<string, bufferlist>::iterator i = omap_aset.begin();
+ i != omap_aset.end();
+ ++i) {
+ string key(i->first);
+ aset.insert(make_pair(key,
+ bufferptr(i->second.c_str(), i->second.length())));
+ }
+ out:
+ dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+
+ if (g_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, getattrs_exit, r);
+ return r;
+ }
+}
+
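+// Set xattrs, keeping small attrs inline on the file and spilling large or
+// excess attrs out to the object_map; the XATTR_SPILL_OUT_NAME xattr records
+// whether any attrs live in the object_map.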
+int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
+ const SequencerPosition &spos)
+{
+ map<string, bufferlist> omap_set;
+ set<string> omap_remove;
+ map<string, bufferptr> inline_set;
+ map<string, bufferptr> inline_to_set;
+ FDRef fd;
+ int spill_out = -1;
+ bool incomplete_inline = false;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+ spill_out = 0;
+ else
+ spill_out = 1;
+
+ r = _fgetattrs(**fd, inline_set);
+ incomplete_inline = (r == -E2BIG);
+ assert(!m_filestore_fail_eio || r != -EIO);
+ dout(15) << "setattrs " << cid << "/" << oid
+ << (incomplete_inline ? " (incomplete_inline, forcing omap)" : "")
+ << dendl;
+
+ for (map<string,bufferptr>::iterator p = aset.begin();
+ p != aset.end();
+ ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+
+ if (incomplete_inline) {
+ chain_fremovexattr(**fd, n); // ignore any error
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+
+ if (p->second.length() > m_filestore_max_inline_xattr_size) {
+ if (inline_set.count(p->first)) {
+ inline_set.erase(p->first);
+ r = chain_fremovexattr(**fd, n);
+ if (r < 0)
+ goto out_close;
+ }
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+
+ if (!inline_set.count(p->first) &&
+ inline_set.size() >= m_filestore_max_inline_xattrs) {
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+ omap_remove.insert(p->first);
+ inline_set.insert(*p);
+
+ inline_to_set.insert(*p);
+ }
+
+ if (spill_out != 1 && !omap_set.empty()) {
+ chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+ sizeof(XATTR_SPILL_OUT));
+ }
+
+ r = _fsetattrs(**fd, inline_to_set);
+ if (r < 0)
+ goto out_close;
+
+ if (spill_out && !omap_remove.empty()) {
+ r = object_map->remove_xattrs(oid, omap_remove, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ goto out_close;
+ } else {
+ r = 0; // don't confuse the debug output
+ }
+ }
+
+ if (!omap_set.empty()) {
+ r = object_map->set_xattrs(oid, omap_set, &spos);
+ if (r < 0) {
+ dout(10) << __func__ << " could not set_xattrs r = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ goto out_close;
+ }
+ }
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << "setattrs " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
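+
+// A worked example of the spill rule above (hypothetical caller, buffers and
+// sizes; the actual thresholds are whatever set_xattr_limits_via_conf() picked):
+//
+//   map<string,bufferptr> aset;
+//   aset["_"] = bufferptr(small_buf, 4096);         // under the inline limit: stays a chained xattr
+//   aset["snapset"] = bufferptr(big_buf, 1 << 20);  // over the limit: removed from the inode, spills to omap
+//   fs->_setattrs(cid, oid, aset, spos);            // marks the file XATTR_SPILL_OUT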
+
+
+int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
+ const SequencerPosition &spos)
+{
+ dout(15) << "rmattr " << cid << "/" << oid << " '" << name << "'" << dendl;
+ FDRef fd;
+ bool spill_out = true;
+ bufferptr bp;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ spill_out = false;
+ }
+
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = chain_fremovexattr(**fd, n);
+ if (r == -ENODATA && spill_out) {
+ Index index;
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ goto out_close;
+ }
+ set<string> to_remove;
+ to_remove.insert(string(name));
+ r = object_map->remove_xattrs(oid, to_remove, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not remove_xattrs index r = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ goto out_close;
+ }
+ }
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << "rmattr " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+ return r;
+}
+
+int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
+ const SequencerPosition &spos)
+{
+ dout(15) << "rmattrs " << cid << "/" << oid << dendl;
+
+ map<string,bufferptr> aset;
+ FDRef fd;
+ set<string> omap_attrs;
+ Index index;
+ bool spill_out = true;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ spill_out = false;
+ }
+
+ r = _fgetattrs(**fd, aset);
+ if (r >= 0) {
+ for (map<string,bufferptr>::iterator p = aset.begin(); p != aset.end(); ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = chain_fremovexattr(**fd, n);
+ if (r < 0)
+ break;
+ }
+ }
+
+ if (!spill_out) {
+ dout(10) << __func__ << " no xattr exists in object_map r = " << r << dendl;
+ goto out_close;
+ }
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ goto out_close;
+ }
+ {
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ goto out_close;
+ }
+ r = object_map->remove_xattrs(oid, omap_attrs, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
+ goto out_close;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+ sizeof(XATTR_NO_SPILL_OUT));
+ }
+
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+
+
+// collections
+
+int FileStore::collection_getattr(coll_t c, const char *name,
+ void *value, size_t size)
+{
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << "collection_getattr " << fn << " '" << name << "' len " << size << dendl;
+ int r;
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ r = -errno;
+ goto out;
+ }
+ char n[PATH_MAX];
+ get_attrname(name, n, PATH_MAX);
+ r = chain_fgetxattr(fd, n, value, size);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ out:
+ dout(10) << "collection_getattr " << fn << " '" << name << "' len " << size << " = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+}
+
+int FileStore::collection_getattr(coll_t c, const char *name, bufferlist& bl)
+{
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << "collection_getattr " << fn << " '" << name << "'" << dendl;
+ char n[PATH_MAX];
+ get_attrname(name, n, PATH_MAX);
+ buffer::ptr bp;
+ int r;
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ r = -errno;
+ goto out;
+ }
+ r = _fgetattr(fd, n, bp);
+ bl.push_back(bp);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ out:
+ dout(10) << "collection_getattr " << fn << " '" << name << "' = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+}
+
+int FileStore::collection_getattrs(coll_t cid, map<string,bufferptr>& aset)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ dout(10) << "collection_getattrs " << fn << dendl;
+ int r = 0;
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ r = -errno;
+ goto out;
+ }
+ r = _fgetattrs(fd, aset);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ out:
+ dout(10) << "collection_getattrs " << fn << " = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+}
+
+
+int FileStore::_collection_setattr(coll_t c, const char *name,
+ const void *value, size_t size)
+{
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << dendl;
+ char n[PATH_MAX];
+ int r;
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ r = -errno;
+ goto out;
+ }
+ get_attrname(name, n, PATH_MAX);
+ r = chain_fsetxattr(fd, n, value, size);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ out:
+ dout(10) << "collection_setattr " << fn << " '" << name << "' len " << size << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_collection_rmattr(coll_t c, const char *name)
+{
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << "collection_rmattr " << fn << dendl;
+ char n[PATH_MAX];
+ get_attrname(name, n, PATH_MAX);
+ int r;
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ r = -errno;
+ goto out;
+ }
+ r = chain_fremovexattr(fd, n);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ out:
+ dout(10) << "collection_rmattr " << fn << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_collection_setattrs(coll_t cid, map<string,bufferptr>& aset)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ dout(15) << "collection_setattrs " << fn << dendl;
+ int r = 0;
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ r = -errno;
+ goto out;
+ }
+ for (map<string,bufferptr>::iterator p = aset.begin();
+ p != aset.end();
+ ++p) {
+ char n[PATH_MAX];
+ get_attrname(p->first.c_str(), n, PATH_MAX);
+ r = chain_fsetxattr(fd, n, p->second.c_str(), p->second.length());
+ if (r < 0)
+ break;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ out:
+ dout(10) << "collection_setattrs " << fn << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_collection_remove_recursive(const coll_t &cid,
+ const SequencerPosition &spos)
+{
+ struct stat st;
+ int r = collection_stat(cid, &st);
+ if (r < 0) {
+ if (r == -ENOENT)
+ return 0;
+ return r;
+ }
+
+ vector<ghobject_t> objects;
+ ghobject_t max;
+ while (!max.is_max()) {
+ r = collection_list(cid, max, ghobject_t::get_max(), true,
+ 300, &objects, &max);
+ if (r < 0)
+ return r;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ assert(_check_replay_guard(cid, *i, spos));
+ r = _remove(cid, *i, spos);
+ if (r < 0)
+ return r;
+ }
+ }
+ return _destroy_collection(cid);
+}
+
+// --------------------------
+// collections
+
+int FileStore::collection_version_current(coll_t c, uint32_t *version)
+{
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+
+ *version = index->collection_version();
+ if (*version == target_version)
+ return 1;
+ else
+ return 0;
+}
+
+int FileStore::list_collections(vector<coll_t>& ls)
+{
+ return list_collections(ls, false);
+}
+
+int FileStore::list_collections(vector<coll_t>& ls, bool include_temp)
+{
+ tracepoint(objectstore, list_collections_enter);
+ dout(10) << "list_collections" << dendl;
+
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/current", basedir.c_str());
+
+ int r = 0;
+ DIR *dir = ::opendir(fn);
+ if (!dir) {
+ r = -errno;
+ derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ struct dirent *de;
+ while ((r = ::readdir_r(dir, (struct dirent *)&buf, &de)) == 0) {
+ if (!de)
+ break;
+ if (de->d_type == DT_UNKNOWN) {
+ // d_type not supported (non-ext[234], btrfs), must stat
+ struct stat sb;
+ char filename[PATH_MAX];
+ snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name);
+
+ r = ::stat(filename, &sb);
+ if (r < 0) {
+ r = -errno;
+ derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ break;
+ }
+ if (!S_ISDIR(sb.st_mode)) {
+ continue;
+ }
+ } else if (de->d_type != DT_DIR) {
+ continue;
+ }
+ if (strcmp(de->d_name, "omap") == 0) {
+ continue;
+ }
+ if (de->d_name[0] == '.' &&
+ (de->d_name[1] == '\0' ||
+ (de->d_name[1] == '.' &&
+ de->d_name[2] == '\0')))
+ continue;
+ coll_t cid;
+ if (!cid.parse(de->d_name)) {
+      derr << "ignoring invalid collection '" << de->d_name << "'" << dendl;
+ continue;
+ }
+ if (!cid.is_temp() || include_temp)
+ ls.push_back(cid);
+ }
+
+ if (r > 0) {
+ derr << "trying readdir_r " << fn << ": " << cpp_strerror(r) << dendl;
+ r = -r;
+ }
+
+ ::closedir(dir);
+ assert(!m_filestore_fail_eio || r != -EIO);
+ tracepoint(objectstore, list_collections_exit, r);
+ return r;
+}
+
+int FileStore::collection_stat(coll_t c, struct stat *st)
+{
+ tracepoint(objectstore, collection_stat_enter, c.c_str());
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << "collection_stat " << fn << dendl;
+ int r = ::stat(fn, st);
+ if (r < 0)
+ r = -errno;
+ dout(10) << "collection_stat " << fn << " = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ tracepoint(objectstore, collection_stat_exit, r);
+ return r;
+}
+
+bool FileStore::collection_exists(coll_t c)
+{
+ tracepoint(objectstore, collection_exists_enter, c.c_str());
+ struct stat st;
+ bool ret = collection_stat(c, &st) == 0;
+ tracepoint(objectstore, collection_exists_exit, ret);
+ return ret;
+}
+
+bool FileStore::collection_empty(coll_t c)
+{
+ tracepoint(objectstore, collection_empty_enter, c.c_str());
+ dout(15) << "collection_empty " << c << dendl;
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return false;
+
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+
+ vector<ghobject_t> ls;
+ r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(), true,
+ 1, &ls, NULL);
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return false;
+ }
+ bool ret = ls.empty();
+ tracepoint(objectstore, collection_empty_exit, ret);
+ return ret;
+}
+int FileStore::collection_list(coll_t c, ghobject_t start, ghobject_t end,
+ bool sort_bitwise, int max,
+ vector<ghobject_t> *ls, ghobject_t *next)
+{
+ if (start.is_max())
+ return 0;
+
+ ghobject_t temp_next;
+ if (!next)
+ next = &temp_next;
+ // figure out the pool id. we need this in order to generate a
+ // meaningful 'next' value.
+ int64_t pool = -1;
+ shard_id_t shard;
+ {
+ spg_t pgid;
+ if (c.is_temp(&pgid)) {
+ pool = -2 - pgid.pool();
+ shard = pgid.shard;
+ } else if (c.is_pg(&pgid)) {
+ pool = pgid.pool();
+ shard = pgid.shard;
+ } else if (c.is_meta()) {
+ pool = -1;
+ shard = shard_id_t::NO_SHARD;
+ } else {
+      // hrm, the caller is test code!  we should kill it off.  for now,
+ // tolerate it.
+ pool = 0;
+ shard = shard_id_t::NO_SHARD;
+ }
+ dout(20) << __func__ << " pool is " << pool << " shard is " << shard
+ << " pgid " << pgid << dendl;
+ }
+ ghobject_t sep;
+ sep.hobj.pool = -1;
+ sep.set_shard(shard);
+ if (!c.is_temp() && !c.is_meta()) {
+ if (cmp_bitwise(start, sep) < 0) { // bitwise vs nibble doesn't matter here
+ dout(10) << __func__ << " first checking temp pool" << dendl;
+ coll_t temp = c.get_temp();
+ int r = collection_list(temp, start, end, sort_bitwise, max, ls, next);
+ if (r < 0)
+ return r;
+ if (*next != ghobject_t::get_max())
+ return r;
+ start = sep;
+ dout(10) << __func__ << " fall through to non-temp collection, start "
+ << start << dendl;
+ } else {
+ dout(10) << __func__ << " start " << start << " >= sep " << sep << dendl;
+ }
+ }
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+
+ r = index->collection_list_partial(start, end, sort_bitwise, max, ls, next);
+
+ if (r < 0) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ dout(20) << "objects: " << ls << dendl;
+
+ // HashIndex doesn't know the pool when constructing a 'next' value
+ if (next && !next->is_max()) {
+ next->hobj.pool = pool;
+ next->set_shard(shard);
+ dout(20) << " next " << *next << dendl;
+ }
+
+ return 0;
+}
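+
+// Listing order sketch for a regular (non-temp, non-meta) pg collection: when
+// 'start' sorts before the 'sep' sentinel, the call above first drains the
+// parallel temp collection (c.get_temp()) and only falls through to the main
+// index once the temp listing returns get_max(). A caller can therefore page
+// through both with the usual loop (batch size and helper are illustrative):
+//
+//   ghobject_t next;
+//   vector<ghobject_t> batch;
+//   do {
+//     int r = fs->collection_list(cid, next, ghobject_t::get_max(), true,
+//                                 1024, &batch, &next);
+//     if (r < 0)
+//       break;
+//     consume(batch);   // hypothetical
+//     batch.clear();
+//   } while (!next.is_max());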
+
+int FileStore::omap_get(coll_t c, const ghobject_t &hoid,
+ bufferlist *header,
+ map<string, bufferlist> *out)
+{
+ tracepoint(objectstore, omap_get_enter, c.c_str());
+ _kludge_temp_object_collection(c, hoid);
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->get(hoid, header, out);
+ if (r < 0 && r != -ENOENT) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ tracepoint(objectstore, omap_get_exit, 0);
+ return 0;
+}
+
+int FileStore::omap_get_header(
+ coll_t c,
+ const ghobject_t &hoid,
+ bufferlist *bl,
+ bool allow_eio)
+{
+ tracepoint(objectstore, omap_get_header_enter, c.c_str());
+ _kludge_temp_object_collection(c, hoid);
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->get_header(hoid, bl);
+ if (r < 0 && r != -ENOENT) {
+ assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ tracepoint(objectstore, omap_get_header_exit, 0);
+ return 0;
+}
+
+int FileStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
+{
+ tracepoint(objectstore, omap_get_keys_enter, c.c_str());
+ _kludge_temp_object_collection(c, hoid);
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->get_keys(hoid, keys);
+ if (r < 0 && r != -ENOENT) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ tracepoint(objectstore, omap_get_keys_exit, 0);
+ return 0;
+}
+
+int FileStore::omap_get_values(coll_t c, const ghobject_t &hoid,
+ const set<string> &keys,
+ map<string, bufferlist> *out)
+{
+ tracepoint(objectstore, omap_get_values_enter, c.c_str());
+ _kludge_temp_object_collection(c, hoid);
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+ Index index;
+ const char *where = 0;
+ int r = get_index(c, &index);
+ if (r < 0) {
+ where = " (get_index)";
+ goto out;
+ }
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ where = " (lfn_find)";
+ goto out;
+ }
+ }
+ r = object_map->get_values(hoid, keys, out);
+ if (r < 0 && r != -ENOENT) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ goto out;
+ }
+ r = 0;
+ out:
+ tracepoint(objectstore, omap_get_values_exit, r);
+ dout(15) << __func__ << " " << c << "/" << hoid << " = " << r
+ << where << dendl;
+ return r;
+}
+
+int FileStore::omap_check_keys(coll_t c, const ghobject_t &hoid,
+ const set<string> &keys,
+ set<string> *out)
+{
+ tracepoint(objectstore, omap_check_keys_enter, c.c_str());
+ _kludge_temp_object_collection(c, hoid);
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->check_keys(hoid, keys, out);
+ if (r < 0 && r != -ENOENT) {
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ tracepoint(objectstore, omap_check_keys_exit, 0);
+ return 0;
+}
+
+ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c,
+ const ghobject_t &hoid)
+{
+ tracepoint(objectstore, get_omap_iterator, c.c_str());
+ _kludge_temp_object_collection(c, hoid);
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " " << c << "/" << hoid << " = 0 "
+ << "(get_index failed with " << cpp_strerror(r) << ")" << dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ dout(10) << __func__ << " " << c << "/" << hoid << " = 0 "
+ << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ }
+ return object_map->get_iterator(hoid);
+}
+
+int FileStore::_collection_hint_expected_num_objs(coll_t c, uint32_t pg_num,
+ uint64_t expected_num_objs,
+ const SequencerPosition &spos)
+{
+ dout(15) << __func__ << " collection: " << c << " pg number: "
+ << pg_num << " expected number of objects: " << expected_num_objs << dendl;
+
+ if (!collection_empty(c) && !replaying) {
+    dout(0) << "Failed to give an expected number of objects hint to collection "
+	     << c << "; only an empty collection can take this kind of hint." << dendl;
+ return 0;
+ }
+
+ int ret;
+ Index index;
+ ret = get_index(c, &index);
+ if (ret < 0)
+ return ret;
+ // Pre-hash the collection
+ ret = index->pre_hash_collection(pg_num, expected_num_objs);
+ dout(10) << "pre_hash_collection " << c << " = " << ret << dendl;
+ if (ret < 0)
+ return ret;
+ _set_replay_guard(c, spos);
+
+ return 0;
+}
+
+int FileStore::_create_collection(
+ coll_t c,
+ const SequencerPosition &spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << "create_collection " << fn << dendl;
+ int r = ::mkdir(fn, 0755);
+ if (r < 0)
+ r = -errno;
+ if (r == -EEXIST && replaying)
+ r = 0;
+ dout(10) << "create_collection " << fn << " = " << r << dendl;
+
+ if (r < 0)
+ return r;
+ r = init_index(c);
+ if (r < 0)
+ return r;
+
+ // create parallel temp collection, too
+ if (!c.is_meta() && !c.is_temp()) {
+ coll_t temp = c.get_temp();
+ r = _create_collection(temp, spos);
+ if (r < 0)
+ return r;
+ }
+
+ _set_replay_guard(c, spos);
+ return 0;
+}
+
+int FileStore::_destroy_collection(coll_t c)
+{
+ int r = 0;
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << "_destroy_collection " << fn << dendl;
+ {
+ Index from;
+ r = get_index(c, &from);
+ if (r < 0)
+ goto out;
+ assert(NULL != from.index);
+ RWLock::WLocker l((from.index)->access_lock);
+
+ r = from->prep_delete();
+ if (r < 0)
+ goto out;
+ }
+ r = ::rmdir(fn);
+ if (r < 0) {
+ r = -errno;
+ goto out;
+ }
+
+ out:
+ // destroy parallel temp collection, too
+ if (!c.is_meta() && !c.is_temp()) {
+ coll_t temp = c.get_temp();
+ int r2 = _destroy_collection(temp);
+ if (r2 < 0) {
+ r = r2;
+ goto out_final;
+ }
+ }
+
+ out_final:
+ dout(10) << "_destroy_collection " << fn << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_collection_add(coll_t c, coll_t oldcid, const ghobject_t& o,
+ const SequencerPosition& spos)
+{
+ dout(15) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << dendl;
+
+ int dstcmp = _check_replay_guard(c, o, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ // check the src name too; it might have a newer guard, and we don't
+ // want to clobber it
+ int srccmp = _check_replay_guard(oldcid, o, spos);
+ if (srccmp < 0)
+ return 0;
+
+  // open guard on object so we don't replay any previous operations on the
+  // new name that would modify the source inode.
+ FDRef fd;
+ int r = lfn_open(oldcid, o, 0, &fd);
+ if (r < 0) {
+ // the source collection/object does not exist. If we are replaying, we
+ // should be safe, so just return 0 and move on.
+ assert(replaying);
+ dout(10) << "collection_add " << c << "/" << o << " from "
+ << oldcid << "/" << o << " (dne, continue replay) " << dendl;
+ return 0;
+ }
+ if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
+ _set_replay_guard(**fd, spos, &o, true);
+ }
+
+ r = lfn_link(oldcid, c, o, o);
+ if (replaying && !backend->can_checkpoint() &&
+ r == -EEXIST) // crashed between link() and set_replay_guard()
+ r = 0;
+
+ _inject_failure();
+
+ // close guard on object so we don't do this again
+ if (r == 0) {
+ _close_replay_guard(**fd, spos);
+ }
+ lfn_close(fd);
+
+ dout(10) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
+ const SequencerPosition& spos)
+{
+ dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
+ int r = 0;
+ int dstcmp, srccmp;
+
+ if (replaying) {
+ /* If the destination collection doesn't exist during replay,
+ * we need to delete the src object and continue on
+ */
+ if (!collection_exists(c))
+ goto out_rm_src;
+ }
+
+ dstcmp = _check_replay_guard(c, o, spos);
+ if (dstcmp < 0)
+ goto out_rm_src;
+
+ // check the src name too; it might have a newer guard, and we don't
+ // want to clobber it
+ srccmp = _check_replay_guard(oldcid, oldoid, spos);
+ if (srccmp < 0)
+ return 0;
+
+ {
+    // open guard on object so we don't replay any previous operations on the
+    // new name that would modify the source inode.
+ FDRef fd;
+ r = lfn_open(oldcid, oldoid, 0, &fd);
+ if (r < 0) {
+ // the source collection/object does not exist. If we are replaying, we
+ // should be safe, so just return 0 and move on.
+ assert(replaying);
+ dout(10) << __func__ << " " << c << "/" << o << " from "
+ << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl;
+ return 0;
+ }
+ if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
+ _set_replay_guard(**fd, spos, &o, true);
+ }
+
+ r = lfn_link(oldcid, c, oldoid, o);
+ if (replaying && !backend->can_checkpoint() &&
+ r == -EEXIST) // crashed between link() and set_replay_guard()
+ r = 0;
+
+ _inject_failure();
+
+ if (r == 0) {
+ // the name changed; link the omap content
+ r = object_map->clone(oldoid, o, &spos);
+ if (r == -ENOENT)
+ r = 0;
+ }
+
+ _inject_failure();
+
+ lfn_close(fd);
+ fd = FDRef();
+
+ if (r == 0)
+ r = lfn_unlink(oldcid, oldoid, spos, true);
+
+ if (r == 0)
+ r = lfn_open(c, o, 0, &fd);
+
+ // close guard on object so we don't do this again
+ if (r == 0)
+ _close_replay_guard(**fd, spos);
+
+ lfn_close(fd);
+ }
+
+ dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid
+ << " = " << r << dendl;
+ return r;
+
+ out_rm_src:
+ // remove source
+ if (_check_replay_guard(oldcid, oldoid, spos) > 0) {
+ r = lfn_unlink(oldcid, oldoid, spos, true);
+ }
+
+ dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid
+ << " = " << r << dendl;
+ return r;
+}
+
+void FileStore::_inject_failure()
+{
+ if (m_filestore_kill_at.read()) {
+ int final = m_filestore_kill_at.dec();
+ dout(5) << "_inject_failure " << (final+1) << " -> " << final << dendl;
+ if (final == 0) {
+ derr << "_inject_failure KILLING" << dendl;
+ g_ceph_context->_log->flush();
+ _exit(1);
+ }
+ }
+}
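+
+// For example, with filestore_kill_at = 3 (an illustrative value), the first
+// two calls above only decrement the counter; the third call sees it hit
+// zero, flushes the log, and _exit(1)s the process.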
+
+int FileStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
+ const SequencerPosition &spos) {
+ dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->clear_keys_header(hoid, &spos);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ return 0;
+}
+
+int FileStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid,
+ const map<string, bufferlist> &aset,
+ const SequencerPosition &spos) {
+ dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+ Index index;
+ int r;
+  // treat pgmeta as a logical object; skip the existence check
+ if (hoid.is_pgmeta())
+ goto skip;
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(20) << __func__ << " get_index got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ dout(20) << __func__ << " lfn_find got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+skip:
+ r = object_map->set_keys(hoid, aset, &spos);
+ dout(20) << __func__ << " " << cid << "/" << hoid << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid,
+ const set<string> &keys,
+ const SequencerPosition &spos) {
+ dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+ Index index;
+ int r;
+  // treat pgmeta as a logical object; skip the existence check
+ if (hoid.is_pgmeta())
+ goto skip;
+
+ r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+skip:
+ r = object_map->rm_keys(hoid, keys, &spos);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ return 0;
+}
+
+int FileStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid,
+ const string& first, const string& last,
+ const SequencerPosition &spos) {
+ dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl;
+ set<string> keys;
+ {
+ ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid);
+ if (!iter)
+ return -ENOENT;
+ for (iter->lower_bound(first); iter->valid() && iter->key() < last;
+ iter->next()) {
+ keys.insert(iter->key());
+ }
+ }
+ return _omap_rmkeys(cid, hoid, keys, spos);
+}
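+
+// The range is effectively half-open: keys are gathered from
+// lower_bound(first) while iter->key() < last, so 'first' itself is removed
+// but 'last' is not. A hypothetical call clearing ["0000000000", "0000000100"):
+//
+//   fs->_omap_rmkeyrange(cid, hoid, "0000000000", "0000000100", spos);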
+
+int FileStore::_omap_setheader(coll_t cid, const ghobject_t &hoid,
+ const bufferlist &bl,
+ const SequencerPosition &spos)
+{
+ dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+ {
+ assert(NULL != index.index);
+ RWLock::RLocker l((index.index)->access_lock);
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ return object_map->set_header(hoid, bl, &spos);
+}
+
+int FileStore::_split_collection(coll_t cid,
+ uint32_t bits,
+ uint32_t rem,
+ coll_t dest,
+ const SequencerPosition &spos)
+{
+ int r;
+ {
+ dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
+ if (!collection_exists(cid)) {
+ dout(2) << __func__ << ": " << cid << " DNE" << dendl;
+ assert(replaying);
+ return 0;
+ }
+ if (!collection_exists(dest)) {
+ dout(2) << __func__ << ": " << dest << " DNE" << dendl;
+ assert(replaying);
+ return 0;
+ }
+
+ int dstcmp = _check_replay_guard(dest, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ int srccmp = _check_replay_guard(cid, spos);
+ if (srccmp < 0)
+ return 0;
+
+ _set_global_replay_guard(cid, spos);
+ _set_replay_guard(cid, spos, true);
+ _set_replay_guard(dest, spos, true);
+
+ Index from;
+ r = get_index(cid, &from);
+
+ Index to;
+ if (!r)
+ r = get_index(dest, &to);
+
+ if (!r) {
+ assert(NULL != from.index);
+ RWLock::WLocker l1((from.index)->access_lock);
+
+ assert(NULL != to.index);
+ RWLock::WLocker l2((to.index)->access_lock);
+
+ r = from->split(rem, bits, to.index);
+ }
+
+ _close_replay_guard(cid, spos);
+ _close_replay_guard(dest, spos);
+ }
+ if (g_conf->filestore_debug_verify_split) {
+ vector<ghobject_t> objects;
+ ghobject_t next;
+ while (1) {
+ collection_list(
+ cid,
+ next, ghobject_t::get_max(),
+ true,
+ get_ideal_list_max(),
+ &objects,
+ &next);
+ if (objects.empty())
+ break;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ dout(20) << __func__ << ": " << *i << " still in source "
+ << cid << dendl;
+ assert(!i->match(bits, rem));
+ }
+ objects.clear();
+ }
+ next = ghobject_t();
+ while (1) {
+ collection_list(
+ dest,
+ next, ghobject_t::get_max(),
+ true,
+ get_ideal_list_max(),
+ &objects,
+ &next);
+ if (objects.empty())
+ break;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+	dout(20) << __func__ << ": " << *i << " now in dest "
+		 << dest << dendl;
+ assert(i->match(bits, rem));
+ }
+ objects.clear();
+ }
+ }
+ return r;
+}
+
+int FileStore::_set_alloc_hint(coll_t cid, const ghobject_t& oid,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size)
+{
+ dout(15) << "set_alloc_hint " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl;
+
+ FDRef fd;
+ int ret;
+
+ ret = lfn_open(cid, oid, false, &fd);
+ if (ret < 0)
+ goto out;
+
+ {
+ // TODO: a more elaborate hint calculation
+ uint64_t hint = MIN(expected_write_size, m_filestore_max_alloc_hint_size);
+
+ ret = backend->set_alloc_hint(**fd, hint);
+ dout(20) << "set_alloc_hint hint " << hint << " ret " << ret << dendl;
+ }
+
+ lfn_close(fd);
+out:
+ dout(10) << "set_alloc_hint " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl;
+ assert(!m_filestore_fail_eio || ret != -EIO);
+ return ret;
+}
+
+const char** FileStore::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "filestore_min_sync_interval",
+ "filestore_max_sync_interval",
+ "filestore_queue_max_ops",
+ "filestore_queue_max_bytes",
+ "filestore_queue_committing_max_ops",
+ "filestore_queue_committing_max_bytes",
+ "filestore_commit_timeout",
+ "filestore_dump_file",
+ "filestore_kill_at",
+ "filestore_fail_eio",
+ "filestore_fadvise",
+ "filestore_sloppy_crc",
+ "filestore_sloppy_crc_block_size",
+ "filestore_max_alloc_hint_size",
+ NULL
+ };
+ return KEYS;
+}
+
+void FileStore::handle_conf_change(const struct md_config_t *conf,
+ const std::set <std::string> &changed)
+{
+ if (changed.count("filestore_max_inline_xattr_size") ||
+ changed.count("filestore_max_inline_xattr_size_xfs") ||
+ changed.count("filestore_max_inline_xattr_size_btrfs") ||
+ changed.count("filestore_max_inline_xattr_size_other") ||
+ changed.count("filestore_max_inline_xattrs") ||
+ changed.count("filestore_max_inline_xattrs_xfs") ||
+ changed.count("filestore_max_inline_xattrs_btrfs") ||
+ changed.count("filestore_max_inline_xattrs_other")) {
+ Mutex::Locker l(lock);
+ set_xattr_limits_via_conf();
+ }
+ if (changed.count("filestore_min_sync_interval") ||
+ changed.count("filestore_max_sync_interval") ||
+ changed.count("filestore_queue_max_ops") ||
+ changed.count("filestore_queue_max_bytes") ||
+ changed.count("filestore_queue_committing_max_ops") ||
+ changed.count("filestore_queue_committing_max_bytes") ||
+ changed.count("filestore_kill_at") ||
+ changed.count("filestore_fail_eio") ||
+ changed.count("filestore_sloppy_crc") ||
+ changed.count("filestore_sloppy_crc_block_size") ||
+ changed.count("filestore_max_alloc_hint_size") ||
+ changed.count("filestore_fadvise")) {
+ Mutex::Locker l(lock);
+ m_filestore_min_sync_interval = conf->filestore_min_sync_interval;
+ m_filestore_max_sync_interval = conf->filestore_max_sync_interval;
+ m_filestore_queue_max_ops = conf->filestore_queue_max_ops;
+ m_filestore_queue_max_bytes = conf->filestore_queue_max_bytes;
+ m_filestore_queue_committing_max_ops = conf->filestore_queue_committing_max_ops;
+ m_filestore_queue_committing_max_bytes = conf->filestore_queue_committing_max_bytes;
+ m_filestore_kill_at.set(conf->filestore_kill_at);
+ m_filestore_fail_eio = conf->filestore_fail_eio;
+ m_filestore_fadvise = conf->filestore_fadvise;
+ m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
+ m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
+ m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size;
+ throttle_ops.reset_max(conf->filestore_queue_max_ops);
+ throttle_bytes.reset_max(conf->filestore_queue_max_bytes);
+ }
+ if (changed.count("filestore_commit_timeout")) {
+ Mutex::Locker l(sync_entry_timeo_lock);
+ m_filestore_commit_timeout = conf->filestore_commit_timeout;
+ }
+ if (changed.count("filestore_dump_file")) {
+ if (conf->filestore_dump_file.length() &&
+ conf->filestore_dump_file != "-") {
+ dump_start(conf->filestore_dump_file);
+ } else {
+ dump_stop();
+ }
+ }
+}
+
+void FileStore::dump_start(const std::string& file)
+{
+ dout(10) << "dump_start " << file << dendl;
+ if (m_filestore_do_dump) {
+ dump_stop();
+ }
+ m_filestore_dump_fmt.reset();
+ m_filestore_dump_fmt.open_array_section("dump");
+ m_filestore_dump.open(file.c_str());
+ m_filestore_do_dump = true;
+}
+
+void FileStore::dump_stop()
+{
+ dout(10) << "dump_stop" << dendl;
+ m_filestore_do_dump = false;
+ if (m_filestore_dump.is_open()) {
+ m_filestore_dump_fmt.close_section();
+ m_filestore_dump_fmt.flush(m_filestore_dump);
+ m_filestore_dump.flush();
+ m_filestore_dump.close();
+ }
+}
+
+void FileStore::dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t seq, OpSequencer *osr)
+{
+ m_filestore_dump_fmt.open_array_section("transactions");
+ unsigned trans_num = 0;
+ for (list<ObjectStore::Transaction*>::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) {
+ m_filestore_dump_fmt.open_object_section("transaction");
+ m_filestore_dump_fmt.dump_string("osr", osr->get_name());
+ m_filestore_dump_fmt.dump_unsigned("seq", seq);
+ m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num);
+ (*i)->dump(&m_filestore_dump_fmt);
+ m_filestore_dump_fmt.close_section();
+ }
+ m_filestore_dump_fmt.close_section();
+ m_filestore_dump_fmt.flush(m_filestore_dump);
+ m_filestore_dump.flush();
+}
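+
+// Roughly, each dumped op appends a "transactions" array to the "dump"
+// section opened in dump_start(), one object per Transaction in the op
+// (field values here are invented):
+//
+//   "transactions": [
+//     { "osr": "1.2f", "seq": 4711, "trans_num": 0, ...Transaction::dump()... }
+//   ]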
+
+void FileStore::set_xattr_limits_via_conf()
+{
+ uint32_t fs_xattr_size;
+ uint32_t fs_xattrs;
+
+ switch (m_fs_type) {
+#if defined(__linux__)
+ case XFS_SUPER_MAGIC:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs;
+ break;
+ case BTRFS_SUPER_MAGIC:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs;
+ break;
+#endif
+ default:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_other;
+ break;
+ }
+
+ // Use override value if set
+ if (g_conf->filestore_max_inline_xattr_size)
+ m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size;
+ else
+ m_filestore_max_inline_xattr_size = fs_xattr_size;
+
+ // Use override value if set
+ if (g_conf->filestore_max_inline_xattrs)
+ m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs;
+ else
+ m_filestore_max_inline_xattrs = fs_xattrs;
+}
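+
+// Precedence example (values are illustrative): on an XFS store with
+// filestore_max_inline_xattr_size = 0 and
+// filestore_max_inline_xattr_size_xfs = 65536, the xfs-specific limit wins
+// because the generic override is unset; any non-zero generic value would
+// instead take precedence over every per-fs default. The same rule applies
+// to filestore_max_inline_xattrs.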
+
+// -- FSSuperblock --
+
+void FSSuperblock::encode(bufferlist &bl) const
+{
+ ENCODE_START(2, 1, bl);
+ compat_features.encode(bl);
+ ::encode(omap_backend, bl);
+ ENCODE_FINISH(bl);
+}
+
+void FSSuperblock::decode(bufferlist::iterator &bl)
+{
+ DECODE_START(2, bl);
+ compat_features.decode(bl);
+ if (struct_v >= 2)
+ ::decode(omap_backend, bl);
+ else
+ omap_backend = "leveldb";
+ DECODE_FINISH(bl);
+}
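+
+// Compatibility sketch: a superblock written by the v1 encoder carries no
+// omap_backend field, so it decodes with struct_v == 1 and falls back to
+// "leveldb"; v2 superblocks record the configured backend (e.g. "rocksdb",
+// as in generate_test_instances below) explicitly.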
+
+void FSSuperblock::dump(Formatter *f) const
+{
+ f->open_object_section("compat");
+ compat_features.dump(f);
+ f->dump_string("omap_backend", omap_backend);
+ f->close_section();
+}
+
+void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o)
+{
+ FSSuperblock z;
+ o.push_back(new FSSuperblock(z));
+ CompatSet::FeatureSet feature_compat;
+ CompatSet::FeatureSet feature_ro_compat;
+ CompatSet::FeatureSet feature_incompat;
+ feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ z.compat_features = CompatSet(feature_compat, feature_ro_compat,
+ feature_incompat);
+ o.push_back(new FSSuperblock(z));
+ z.omap_backend = "rocksdb";
+ o.push_back(new FSSuperblock(z));
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILESTORE_H
+#define CEPH_FILESTORE_H
+
+#include "include/types.h"
+
+#include <map>
+#include <deque>
+#include <boost/scoped_ptr.hpp>
+#include <fstream>
+using namespace std;
+
+#include "include/unordered_map.h"
+
+#include "include/assert.h"
+
+#include "os/ObjectStore.h"
+#include "JournalingObjectStore.h"
+
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+
+#include "common/Mutex.h"
+#include "HashIndex.h"
+#include "IndexManager.h"
+#include "os/ObjectMap.h"
+#include "SequencerPosition.h"
+#include "FDCache.h"
+#include "WBThrottle.h"
+
+#include "include/uuid.h"
+
+
+// from include/linux/falloc.h:
+#ifndef FALLOC_FL_PUNCH_HOLE
+# define FALLOC_FL_PUNCH_HOLE 0x2
+#endif
+
+#if defined(__linux__)
+# ifndef BTRFS_SUPER_MAGIC
+#define BTRFS_SUPER_MAGIC 0x9123683E
+# endif
+# ifndef XFS_SUPER_MAGIC
+#define XFS_SUPER_MAGIC 0x58465342
+# endif
+# ifndef ZFS_SUPER_MAGIC
+#define ZFS_SUPER_MAGIC 0x2fc12fc1
+# endif
+#endif
+
+
+class FileStoreBackend;
+
+#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
+
+class FSSuperblock {
+public:
+ CompatSet compat_features;
+ string omap_backend;
+
+ FSSuperblock() { }
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<FSSuperblock*>& o);
+};
+WRITE_CLASS_ENCODER(FSSuperblock)
+
+inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
+{
+ return out << "sb(" << sb.compat_features << "): "
+ << sb.omap_backend;
+}
+
+class FileStore : public JournalingObjectStore,
+ public md_config_obs_t
+{
+ static const uint32_t target_version = 4;
+public:
+ uint32_t get_target_version() {
+ return target_version;
+ }
+
+ static int get_block_device_fsid(const string& path, uuid_d *fsid);
+
+ struct FSPerfTracker {
+ PerfCounters::avg_tracker<uint64_t> os_commit_latency;
+ PerfCounters::avg_tracker<uint64_t> os_apply_latency;
+
+ objectstore_perf_stat_t get_cur_stats() const {
+ objectstore_perf_stat_t ret;
+ ret.filestore_commit_latency = os_commit_latency.avg();
+ ret.filestore_apply_latency = os_apply_latency.avg();
+ return ret;
+ }
+
+ void update_from_perfcounters(PerfCounters &logger);
+ } perf_tracker;
+ objectstore_perf_stat_t get_cur_stats() {
+ perf_tracker.update_from_perfcounters(*logger);
+ return perf_tracker.get_cur_stats();
+ }
+
+private:
+ string internal_name; ///< internal name, used to name the perfcounter instance
+ string basedir, journalpath;
+ osflagbits_t generic_flags;
+ std::string current_fn;
+ std::string current_op_seq_fn;
+ std::string omap_dir;
+ uuid_d fsid;
+
+ size_t blk_size; ///< fs block size
+
+ int fsid_fd, op_fd, basedir_fd, current_fd;
+
+ FileStoreBackend *backend;
+
+ void create_backend(long f_type);
+
+ deque<uint64_t> snaps;
+
+ // Indexed Collections
+ IndexManager index_manager;
+ int get_index(coll_t c, Index *index);
+ int init_index(coll_t c);
+
+ void _kludge_temp_object_collection(coll_t& cid, const ghobject_t& oid) {
+ // - normal temp case: cid is pg, object is temp (pool < -1)
+ // - hammer temp case: cid is pg (or already temp), object pool is -1
+ if (cid.is_pg() && (oid.hobj.pool < -1 ||
+ oid.hobj.pool == -1))
+ cid = cid.get_temp();
+ }
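+  // Example for _kludge_temp_object_collection above (illustrative numbers):
+  // an object with hobj.pool == -5 (i.e. -2 - 3, the temp namespace of pool 3)
+  // arriving under a pg collection id is redirected to cid.get_temp().
+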
+ void init_temp_collections();
+
+ // ObjectMap
+ boost::scoped_ptr<ObjectMap> object_map;
+
+ // helper fns
+ int get_cdir(coll_t cid, char *s, int len);
+
+ /// read a uuid from fd
+ int read_fsid(int fd, uuid_d *uuid);
+
+ /// lock fsid_fd
+ int lock_fsid();
+
+ // sync thread
+ Mutex lock;
+ bool force_sync;
+ Cond sync_cond;
+
+ Mutex sync_entry_timeo_lock;
+ SafeTimer timer;
+
+ list<Context*> sync_waiters;
+ bool stop;
+ void sync_entry();
+ struct SyncThread : public Thread {
+ FileStore *fs;
+ SyncThread(FileStore *f) : fs(f) {}
+ void *entry() {
+ fs->sync_entry();
+ return 0;
+ }
+ } sync_thread;
+
+ // -- op workqueue --
+ struct Op {
+ utime_t start;
+ uint64_t op;
+ list<Transaction*> tls;
+ Context *onreadable, *onreadable_sync;
+ uint64_t ops, bytes;
+ TrackedOpRef osd_op;
+ };
+ class OpSequencer : public Sequencer_impl {
+ Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
+ list<Op*> q;
+ list<uint64_t> jq;
+ list<pair<uint64_t, Context*> > flush_commit_waiters;
+ Cond cond;
+ public:
+ Sequencer *parent;
+ Mutex apply_lock; // for apply mutual exclusion
+ int id;
+
+ /// get_max_uncompleted
+ bool _get_max_uncompleted(
+ uint64_t *seq ///< [out] max uncompleted seq
+ ) {
+ assert(qlock.is_locked());
+ assert(seq);
+ *seq = 0;
+ if (q.empty() && jq.empty())
+ return true;
+
+ if (!q.empty())
+ *seq = q.back()->op;
+ if (!jq.empty() && jq.back() > *seq)
+ *seq = jq.back();
+
+ return false;
+ } /// @returns true if both queues are empty
+
+ /// get_min_uncompleted
+ bool _get_min_uncompleted(
+ uint64_t *seq ///< [out] min uncompleted seq
+ ) {
+ assert(qlock.is_locked());
+ assert(seq);
+ *seq = 0;
+ if (q.empty() && jq.empty())
+ return true;
+
+ if (!q.empty())
+ *seq = q.front()->op;
+ if (!jq.empty() && jq.front() < *seq)
+ *seq = jq.front();
+
+ return false;
+ } /// @returns true if both queues are empty
+
+ void _wake_flush_waiters(list<Context*> *to_queue) {
+ uint64_t seq;
+ if (_get_min_uncompleted(&seq))
+ seq = -1;
+
+ for (list<pair<uint64_t, Context*> >::iterator i =
+ flush_commit_waiters.begin();
+ i != flush_commit_waiters.end() && i->first < seq;
+ flush_commit_waiters.erase(i++)) {
+ to_queue->push_back(i->second);
+ }
+ }
+
+ void queue_journal(uint64_t s) {
+ Mutex::Locker l(qlock);
+ jq.push_back(s);
+ }
+ void dequeue_journal(list<Context*> *to_queue) {
+ Mutex::Locker l(qlock);
+ jq.pop_front();
+ cond.Signal();
+ _wake_flush_waiters(to_queue);
+ }
+ void queue(Op *o) {
+ Mutex::Locker l(qlock);
+ q.push_back(o);
+ }
+ Op *peek_queue() {
+ Mutex::Locker l(qlock);
+ assert(apply_lock.is_locked());
+ return q.front();
+ }
+
+ Op *dequeue(list<Context*> *to_queue) {
+ assert(to_queue);
+ assert(apply_lock.is_locked());
+ Mutex::Locker l(qlock);
+ Op *o = q.front();
+ q.pop_front();
+ cond.Signal();
+
+ _wake_flush_waiters(to_queue);
+ return o;
+ }
+
+ void flush() {
+ Mutex::Locker l(qlock);
+
+ while (g_conf->filestore_blackhole)
+ cond.Wait(qlock); // wait forever
+
+
+ // get max for journal _or_ op queues
+ uint64_t seq = 0;
+ if (!q.empty())
+ seq = q.back()->op;
+ if (!jq.empty() && jq.back() > seq)
+ seq = jq.back();
+
+ if (seq) {
+        // wait for everything prior to our watermark to drain through either/both queues
+ while ((!q.empty() && q.front()->op <= seq) ||
+ (!jq.empty() && jq.front() <= seq))
+ cond.Wait(qlock);
+ }
+ }
+ bool flush_commit(Context *c) {
+ Mutex::Locker l(qlock);
+ uint64_t seq = 0;
+ if (_get_max_uncompleted(&seq)) {
+ return true;
+ } else {
+ flush_commit_waiters.push_back(make_pair(seq, c));
+ return false;
+ }
+ }
+
+ OpSequencer(int i)
+ : qlock("FileStore::OpSequencer::qlock", false, false),
+ parent(0),
+ apply_lock("FileStore::OpSequencer::apply_lock", false, false),
+ id(i) {}
+ ~OpSequencer() {
+ assert(q.empty());
+ }
+
+ const string& get_name() const {
+ return parent->get_name();
+ }
+ };
+
+ friend ostream& operator<<(ostream& out, const OpSequencer& s);
+
+ FDCache fdcache;
+ WBThrottle wbthrottle;
+
+ atomic_t next_osr_id;
+ deque<OpSequencer*> op_queue;
+ Throttle throttle_ops, throttle_bytes;
+ const int m_ondisk_finisher_num;
+ const int m_apply_finisher_num;
+ vector<Finisher*> ondisk_finishers;
+ vector<Finisher*> apply_finishers;
+
+ ThreadPool op_tp;
+ struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
+ FileStore *store;
+ OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
+ : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {}
+
+ bool _enqueue(OpSequencer *osr) {
+ store->op_queue.push_back(osr);
+ return true;
+ }
+ void _dequeue(OpSequencer *o) {
+ assert(0);
+ }
+ bool _empty() {
+ return store->op_queue.empty();
+ }
+ OpSequencer *_dequeue() {
+ if (store->op_queue.empty())
+ return NULL;
+ OpSequencer *osr = store->op_queue.front();
+ store->op_queue.pop_front();
+ return osr;
+ }
+ void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) {
+ store->_do_op(osr, handle);
+ }
+ using ThreadPool::WorkQueue<OpSequencer>::_process;
+ void _process_finish(OpSequencer *osr) {
+ store->_finish_op(osr);
+ }
+ void _clear() {
+ assert(store->op_queue.empty());
+ }
+ } op_wq;
+
+ void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
+ void _finish_op(OpSequencer *o);
+ Op *build_op(list<Transaction*>& tls,
+ Context *onreadable, Context *onreadable_sync,
+ TrackedOpRef osd_op);
+ void queue_op(OpSequencer *osr, Op *o);
+ void op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle = NULL);
+ void op_queue_release_throttle(Op *o);
+ void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
+ friend struct C_JournaledAhead;
+
+ void new_journal();
+
+ PerfCounters *logger;
+
+public:
+ int lfn_find(const ghobject_t& oid, const Index& index,
+ IndexedPath *path = NULL);
+ int lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length);
+ int lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf);
+ int lfn_open(
+ coll_t cid,
+ const ghobject_t& oid,
+ bool create,
+ FDRef *outfd,
+ Index *index = 0);
+
+ void lfn_close(FDRef fd);
+ int lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid) ;
+ int lfn_unlink(coll_t cid, const ghobject_t& o, const SequencerPosition &spos,
+ bool force_clear_omap=false);
+
+public:
+ FileStore(const std::string &base, const std::string &jdev,
+ osflagbits_t flags = 0,
+ const char *internal_name = "filestore", bool update_to=false);
+ ~FileStore();
+
+ int _detect_fs();
+ int _sanity_check_fs();
+
+ bool test_mount_in_use();
+ int read_op_seq(uint64_t *seq);
+ int write_op_seq(int, uint64_t seq);
+ int mount();
+ int umount();
+ unsigned get_max_object_name_length() {
+ // not safe for all file systems, btw! use the tunable to limit this.
+ return 4096;
+ }
+ unsigned get_max_attr_name_length() {
+ // xattr limit is 128; leave room for our prefixes (user.ceph._),
+ // some margin, and cap at 100
+ return 100;
+ }
+ int mkfs();
+ int mkjournal();
+ bool wants_journal() {
+ return true;
+ }
+ bool allows_journal() {
+ return true;
+ }
+ bool needs_journal() {
+ return false;
+ }
+
+ int write_version_stamp();
+ int version_stamp_is_valid(uint32_t *version);
+ int update_version_stamp();
+ int upgrade();
+
+ bool can_sort_nibblewise() {
+ return true; // i support legacy sort order
+ }
+
+ void collect_metadata(map<string,string> *pm);
+
+ int statfs(struct statfs *buf);
+
+ int _do_transactions(
+ list<Transaction*> &tls, uint64_t op_seq,
+ ThreadPool::TPHandle *handle);
+ int do_transactions(list<Transaction*> &tls, uint64_t op_seq) {
+ return _do_transactions(tls, op_seq, 0);
+ }
+ unsigned _do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle);
+
+ int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL);
+
+ /**
+ * set replay guard xattr on given file
+ *
+ * This will ensure that we will not replay this (or any previous) operation
+ * against this particular inode/object.
+ *
+ * @param fd open file descriptor for the file/object
+ * @param spos sequencer position of the last operation we should not replay
+ */
+ void _set_replay_guard(int fd,
+ const SequencerPosition& spos,
+ const ghobject_t *oid=0,
+ bool in_progress=false);
+ void _set_replay_guard(coll_t cid,
+ const SequencerPosition& spos,
+ bool in_progress);
+ void _set_global_replay_guard(coll_t cid,
+ const SequencerPosition &spos);
+
+ /// close a replay guard opened with in_progress=true
+ void _close_replay_guard(int fd, const SequencerPosition& spos);
+ void _close_replay_guard(coll_t cid, const SequencerPosition& spos);
+
+ /**
+ * check replay guard xattr on given file
+ *
+ * Check the current position against any marker on the file that
+ * indicates which operations have already been applied. If the
+ * current or a newer operation has been marked as applied, we
+ * should not replay the current operation again.
+ *
+   * If we are not replaying the journal, we always return true.  It
+   * is only on replay that we might return false, indicating that the
+ * operation should not be performed (again).
+ *
+ * @param fd open fd on the file/object in question
+   * @param spos sequencer position for an operation we could apply/replay
+ * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
+ */
+ int _check_replay_guard(int fd, const SequencerPosition& spos);
+ int _check_replay_guard(coll_t cid, const SequencerPosition& spos);
+ int _check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& pos);
+ int _check_global_replay_guard(coll_t cid, const SequencerPosition& spos);
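+
+  // A minimal sketch of how the guard helpers combine (it mirrors the pattern
+  // used by _collection_add; fd and spos come from the caller):
+  //
+  //   int cmp = _check_replay_guard(cid, oid, spos);
+  //   if (cmp < 0)
+  //     return 0;                           // spos (or newer) already applied
+  //   if (cmp > 0)                          // 0 means guard already in progress
+  //     _set_replay_guard(**fd, spos, &oid, true);
+  //   ... perform the operation ...
+  //   _close_replay_guard(**fd, spos);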
+
+ // ------------------
+ // objects
+ int pick_object_revision_lt(ghobject_t& oid) {
+ return 0;
+ }
+ bool exists(coll_t cid, const ghobject_t& oid);
+ int stat(
+ coll_t cid,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false);
+ int read(
+ coll_t cid,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags = 0,
+ bool allow_eio = false);
+ int _do_fiemap(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m);
+ int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m);
+ int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
+
+ int _touch(coll_t cid, const ghobject_t& oid);
+ int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len,
+ const bufferlist& bl, uint32_t fadvise_flags = 0);
+ int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len);
+ int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size);
+ int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ const SequencerPosition& spos);
+ int _clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff,
+ const SequencerPosition& spos);
+ int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
+ int _remove(coll_t cid, const ghobject_t& oid, const SequencerPosition &spos);
+
+ int _fgetattr(int fd, const char *name, bufferptr& bp);
+ int _fgetattrs(int fd, map<string,bufferptr>& aset);
+ int _fsetattrs(int fd, map<string, bufferptr> &aset);
+
+ void _start_sync();
+
+ void do_force_sync();
+ void start_sync(Context *onsafe);
+ void sync();
+ void _flush_op_queue();
+ void flush();
+ void sync_and_flush();
+
+ int flush_journal();
+ int dump_journal(ostream& out);
+
+ void set_fsid(uuid_d u) {
+ fsid = u;
+ }
+ uuid_d get_fsid() { return fsid; }
+
+  // DEBUG read error injection; an object is removed from both sets on delete()
+ Mutex read_error_lock;
+ set<ghobject_t, ghobject_t::BitwiseComparator> data_error_set; // read() will return -EIO
+ set<ghobject_t, ghobject_t::BitwiseComparator> mdata_error_set; // getattr(),stat() will return -EIO
+ void inject_data_error(const ghobject_t &oid);
+ void inject_mdata_error(const ghobject_t &oid);
+ void debug_obj_on_delete(const ghobject_t &oid);
+ bool debug_data_eio(const ghobject_t &oid);
+ bool debug_mdata_eio(const ghobject_t &oid);
+
+ int snapshot(const string& name);
+
+ // attrs
+ int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp);
+ int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset);
+
+ int _setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
+ const SequencerPosition &spos);
+ int _rmattr(coll_t cid, const ghobject_t& oid, const char *name,
+ const SequencerPosition &spos);
+ int _rmattrs(coll_t cid, const ghobject_t& oid,
+ const SequencerPosition &spos);
+
+ int collection_getattr(coll_t c, const char *name, void *value, size_t size);
+ int collection_getattr(coll_t c, const char *name, bufferlist& bl);
+ int collection_getattrs(coll_t cid, map<string,bufferptr> &aset);
+
+ int _collection_setattr(coll_t c, const char *name, const void *value, size_t size);
+ int _collection_rmattr(coll_t c, const char *name);
+ int _collection_setattrs(coll_t cid, map<string,bufferptr> &aset);
+ int _collection_remove_recursive(const coll_t &cid,
+ const SequencerPosition &spos);
+
+ // collections
+ int collection_list(coll_t c, ghobject_t start, ghobject_t end,
+ bool sort_bitwise, int max,
+ vector<ghobject_t> *ls, ghobject_t *next);
+ int list_collections(vector<coll_t>& ls);
+ int list_collections(vector<coll_t>& ls, bool include_temp);
+ int collection_version_current(coll_t c, uint32_t *version);
+ int collection_stat(coll_t c, struct stat *st);
+ bool collection_exists(coll_t c);
+ bool collection_empty(coll_t c);
+
+ // omap (see ObjectStore.h for documentation)
+ int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header,
+ map<string, bufferlist> *out);
+ int omap_get_header(
+ coll_t c,
+ const ghobject_t &oid,
+ bufferlist *out,
+ bool allow_eio = false);
+ int omap_get_keys(coll_t c, const ghobject_t &oid, set<string> *keys);
+ int omap_get_values(coll_t c, const ghobject_t &oid, const set<string> &keys,
+ map<string, bufferlist> *out);
+ int omap_check_keys(coll_t c, const ghobject_t &oid, const set<string> &keys,
+ set<string> *out);
+ ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const ghobject_t &oid);
+
+ int _create_collection(coll_t c, const SequencerPosition &spos);
+ int _destroy_collection(coll_t c);
+ /**
+ * Give an expected number of objects hint to the collection.
+ *
+ * @param c - collection id.
+ * @param pg_num - pg number of the pool this collection belongs to
+ * @param expected_num_objs - expected number of objects in this collection
+ * @param spos - sequence position
+ *
+ * @return 0 on success, an error code otherwise
+ */
+ int _collection_hint_expected_num_objs(coll_t c, uint32_t pg_num,
+ uint64_t expected_num_objs,
+ const SequencerPosition &spos);
+ int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid,
+ const SequencerPosition& spos);
+ int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
+ const SequencerPosition& spos);
+
+ int _set_alloc_hint(coll_t cid, const ghobject_t& oid,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+ void dump_start(const std::string& file);
+ void dump_stop();
+ void dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t seq, OpSequencer *osr);
+
+private:
+ void _inject_failure();
+
+ // omap
+ int _omap_clear(coll_t cid, const ghobject_t &oid,
+ const SequencerPosition &spos);
+ int _omap_setkeys(coll_t cid, const ghobject_t &oid,
+ const map<string, bufferlist> &aset,
+ const SequencerPosition &spos);
+ int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set<string> &keys,
+ const SequencerPosition &spos);
+ int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid,
+ const string& first, const string& last,
+ const SequencerPosition &spos);
+ int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl,
+ const SequencerPosition &spos);
+ int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest,
+ const SequencerPosition &spos);
+ int _split_collection_create(coll_t cid, uint32_t bits, uint32_t rem,
+ coll_t dest,
+ const SequencerPosition &spos);
+
+ virtual const char** get_tracked_conf_keys() const;
+ virtual void handle_conf_change(const struct md_config_t *conf,
+ const std::set <std::string> &changed);
+ float m_filestore_commit_timeout;
+ bool m_filestore_journal_parallel;
+ bool m_filestore_journal_trailing;
+ bool m_filestore_journal_writeahead;
+ int m_filestore_fiemap_threshold;
+ double m_filestore_max_sync_interval;
+ double m_filestore_min_sync_interval;
+ bool m_filestore_fail_eio;
+ bool m_filestore_fadvise;
+ int do_update;
+ bool m_journal_dio, m_journal_aio, m_journal_force_aio;
+ std::string m_osd_rollback_to_cluster_snap;
+ bool m_osd_use_stale_snap;
+ int m_filestore_queue_max_ops;
+ int m_filestore_queue_max_bytes;
+ int m_filestore_queue_committing_max_ops;
+ int m_filestore_queue_committing_max_bytes;
+ bool m_filestore_do_dump;
+ std::ofstream m_filestore_dump;
+ JSONFormatter m_filestore_dump_fmt;
+ atomic_t m_filestore_kill_at;
+ bool m_filestore_sloppy_crc;
+ int m_filestore_sloppy_crc_block_size;
+ uint64_t m_filestore_max_alloc_hint_size;
+ long m_fs_type;
+
+  // Determine xattr handling based on fs type
+ void set_xattr_limits_via_conf();
+ uint32_t m_filestore_max_inline_xattr_size;
+ uint32_t m_filestore_max_inline_xattrs;
+
+ FSSuperblock superblock;
+
+ /**
+ * write_superblock()
+ *
+   * Write superblock to persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int write_superblock();
+
+ /**
+ * read_superblock()
+ *
+ * Fill in FileStore::superblock by reading persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int read_superblock();
+
+ friend class FileStoreBackend;
+ friend class TestFileStore;
+};
+
+ostream& operator<<(ostream& out, const FileStore::OpSequencer& s);
+
+struct fiemap;
+
+class FileStoreBackend {
+private:
+ FileStore *filestore;
+protected:
+ int get_basedir_fd() {
+ return filestore->basedir_fd;
+ }
+ int get_current_fd() {
+ return filestore->current_fd;
+ }
+ int get_op_fd() {
+ return filestore->op_fd;
+ }
+ size_t get_blksize() {
+ return filestore->blk_size;
+ }
+ const string& get_basedir_path() {
+ return filestore->basedir;
+ }
+ const string& get_current_path() {
+ return filestore->current_fn;
+ }
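+  // Prefer a sparse-aware copy when the backend can enumerate extents
+  // (FIEMAP) or seek over holes (SEEK_DATA/SEEK_HOLE); otherwise fall back
+  // to a plain byte-for-byte copy.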
+ int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
+ if (has_fiemap() || has_seek_data_hole()) {
+ return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
+ } else {
+ return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
+ }
+ }
+ int get_crc_block_size() {
+ return filestore->m_filestore_sloppy_crc_block_size;
+ }
+
+public:
+ FileStoreBackend(FileStore *fs) : filestore(fs) {}
+ virtual ~FileStoreBackend() {}
+
+ static FileStoreBackend *create(long f_type, FileStore *fs);
+
+ virtual const char *get_name() = 0;
+ virtual int detect_features() = 0;
+ virtual int create_current() = 0;
+ virtual bool can_checkpoint() = 0;
+ virtual int list_checkpoints(list<string>& ls) = 0;
+ virtual int create_checkpoint(const string& name, uint64_t *cid) = 0;
+ virtual int sync_checkpoint(uint64_t id) = 0;
+ virtual int rollback_to(const string& name) = 0;
+ virtual int destroy_checkpoint(const string& name) = 0;
+ virtual int syncfs() = 0;
+ virtual bool has_fiemap() = 0;
+ virtual bool has_seek_data_hole() = 0;
+ virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
+ virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
+ virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
+ virtual bool has_splice() const = 0;
+
+ // hooks for (sloppy) crc tracking
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
+ virtual int _crc_update_truncate(int fd, loff_t off) = 0;
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) = 0;
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out) = 0;
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "GenericFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+
+#include "common/SloppyCRCMap.h"
+#include "os/filestore/chain_xattr.h"
+
+#define SLOPPY_CRC_XATTR "user.cephos.scrc"
+
+
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
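+// e.g. ALIGN_DOWN(3990, 4096) == 0, ALIGN_UP(3990, 4096) == 4096,
+// and ALIGNED(8192, 4096) is true (illustrative values).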
+
+GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
+ FileStoreBackend(fs),
+ ioctl_fiemap(false),
+ seek_data_hole(false),
+ m_filestore_fiemap(g_conf->filestore_fiemap),
+ m_filestore_seek_data_hole(g_conf->filestore_seek_data_hole),
+ m_filestore_fsync_flushes_journal_data(g_conf->filestore_fsync_flushes_journal_data),
+ m_filestore_splice(false) {}
+
+int GenericFileStoreBackend::detect_features()
+{
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
+
+ int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644);
+ if (fd < 0) {
+ fd = -errno;
+ derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
+ return fd;
+ }
+
+ // ext4 has a bug in older kernels where fiemap will return an empty
+ // result in some cases. this is a file layout that triggers the bug
+ // on 2.6.34-rc5.
+ int v[] = {
+ 0x0000000000016000, 0x0000000000007000,
+ 0x000000000004a000, 0x0000000000007000,
+ 0x0000000000060000, 0x0000000000001000,
+ 0x0000000000061000, 0x0000000000008000,
+ 0x0000000000069000, 0x0000000000007000,
+ 0x00000000000a3000, 0x000000000000c000,
+ 0x000000000024e000, 0x000000000000c000,
+ 0x000000000028b000, 0x0000000000009000,
+ 0x00000000002b1000, 0x0000000000003000,
+ 0, 0
+ };
+ for (int i=0; v[i]; i++) {
+ int off = v[i++];
+ int len = v[i];
+
+ // write a large extent
+ char buf[len];
+ memset(buf, 1, sizeof(buf));
+ int r = ::lseek(fd, off, SEEK_SET);
+ if (r < 0) {
+ r = -errno;
+ derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+ r = write(fd, buf, sizeof(buf));
+    if (r < 0) {
+      r = -errno;
+      derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+ }
+
+ // fiemap an extent inside that
+ if (!m_filestore_fiemap) {
+ dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ struct fiemap *fiemap;
+ int r = do_fiemap(fd, 2430421, 59284, &fiemap);
+ if (r < 0) {
+ dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ if (fiemap->fm_mapped_extents == 0) {
+ dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
+ ioctl_fiemap = true;
+ }
+ free(fiemap);
+ }
+ }
+
+ // SEEK_DATA/SEEK_HOLE detection
+ if (!m_filestore_seek_data_hole) {
+ dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
+ seek_data_hole = false;
+ } else {
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+ // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
+ // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
+ // Fall back to use fiemap.
+ off_t hole_pos;
+
+ hole_pos = lseek(fd, 0, SEEK_HOLE);
+ if (hole_pos < 0) {
+ if (errno == EINVAL) {
+ dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
+ seek_data_hole = false;
+ } else {
+        int err = -errno;
+        derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(err) << dendl;
+        VOID_TEMP_FAILURE_RETRY(::close(fd));
+        return err;
+ }
+ } else {
+ dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
+ seek_data_hole = true;
+ }
+#endif
+ }
+
+  // splice detection
+#ifdef CEPH_HAVE_SPLICE
+ if (!m_filestore_splice) {
+ int pipefd[2];
+ loff_t off_in = 0;
+ int r;
+ if ((r = pipe(pipefd)) < 0)
+      dout(0) << "detect_features: failed to create pipe for splice detection: " << cpp_strerror(errno) << dendl;
+ else {
+ lseek(fd, 0, SEEK_SET);
+ r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
+ if (!(r < 0 && errno == EINVAL)) {
+ m_filestore_splice = true;
+ dout(0) << "detect_features: splice is supported" << dendl;
+ } else
+ dout(0) << "detect_features: splice is NOT supported" << dendl;
+ close(pipefd[0]);
+ close(pipefd[1]);
+ }
+ }
+#endif
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+
+ bool have_syncfs = false;
+#ifdef HAVE_SYS_SYNCFS
+ if (::syncfs(get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
+ }
+#elif defined(SYS_syncfs)
+ if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+ }
+#elif defined(__NR_syncfs)
+ if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+ }
+#endif
+ if (!have_syncfs) {
+ dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
+ if (m_filestore_fsync_flushes_journal_data) {
+ dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
+ } else {
+ dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
+ dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
+ }
+ }
+
+ return 0;
+}
+
+int GenericFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
+ ret = -EINVAL;
+ }
+ } else {
+ ret = ::mkdir(get_current_path().c_str(), 0755);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl;
+ }
+ }
+ return ret;
+}
+
+int GenericFileStoreBackend::syncfs()
+{
+ int ret;
+ if (m_filestore_fsync_flushes_journal_data) {
+ dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
+ // make the file system's journal commit.
+ // this works with ext3, but NOT ext4
+ ret = ::fsync(get_op_fd());
+ if (ret < 0)
+ ret = -errno;
+ } else {
+ dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
+ ret = sync_filesystem(get_current_fd());
+ }
+ return ret;
+}
+
+int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
+{
+ struct fiemap *fiemap = NULL;
+ struct fiemap *_realloc_fiemap = NULL;
+ int size;
+ int ret;
+
+  fiemap = (struct fiemap*)calloc(1, sizeof(struct fiemap));
+ if (!fiemap)
+ return -ENOMEM;
+ /*
+   * XFS has a fiemap bug: given (offset=3990, len=4096) it reports
+   * (logical=4096, len=4096), leaking the range [3990, 4096). The kernel
+   * commit "xfs: fix rounding error of fiemap length parameter"
+   * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7) fixes this.
+   * Here we align the offset down to CEPH_PAGE_SIZE to avoid the bug.
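+   * For illustration (assuming CEPH_PAGE_SIZE == 4096, an example value
+   * only): start = 3990, len = 4096 gives
+   * fm_start = 3990 - (3990 % 4096) = 0 and
+   * fm_length = 4096 + (3990 % 4096) = 8086, so the queried range still
+   * covers all of the original [3990, 3990 + 4096).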
+ */
+ fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
+ fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
+ fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
+
+#if defined(DARWIN) || defined(__FreeBSD__)
+ ret = -ENOTSUP;
+ goto done_err;
+#else
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
+ ret = -errno;
+ goto done_err;
+ }
+#endif
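+  // The first FIEMAP call above ran with fm_extent_count == 0, so it only
+  // filled in fm_mapped_extents; grow the buffer to hold that many extent
+  // records and issue a second call below to actually fetch them.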
+ size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
+
+ _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
+ if (!_realloc_fiemap) {
+ ret = -ENOMEM;
+ goto done_err;
+ } else {
+ fiemap = _realloc_fiemap;
+ }
+
+ memset(fiemap->fm_extents, 0, size);
+
+ fiemap->fm_extent_count = fiemap->fm_mapped_extents;
+ fiemap->fm_mapped_extents = 0;
+
+#if defined(DARWIN) || defined(__FreeBSD__)
+ ret = -ENOTSUP;
+ goto done_err;
+#else
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
+ ret = -errno;
+ goto done_err;
+ }
+ *pfiemap = fiemap;
+#endif
+ return 0;
+
+done_err:
+ *pfiemap = NULL;
+ free(fiemap);
+ return ret;
+}
+
+
+int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
+{
+ char buf[100];
+ bufferptr bp;
+ int r = 0;
+ int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
+ if (l == -ENODATA) {
+ return 0;
+ }
+ if (l >= 0) {
+ bp = buffer::create(l);
+ memcpy(bp.c_str(), buf, l);
+ } else if (l == -ERANGE) {
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
+ if (l > 0) {
+ bp = buffer::create(l);
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
+ }
+ }
+ bufferlist bl;
+ bl.append(bp);
+ bufferlist::iterator p = bl.begin();
+ try {
+ ::decode(*cm, p);
+ }
+ catch (buffer::error &e) {
+ r = -EIO;
+ }
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
+{
+ bufferlist bl;
+ ::encode(*cm, bl);
+ int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm.write(off, len, bl, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.truncate(off);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.zero(off, len);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff)
+{
+ SloppyCRCMap scm_src(get_crc_block_size());
+ SloppyCRCMap scm_dst(get_crc_block_size());
+ int r = _crc_load_or_init(srcfd, &scm_src);
+ if (r < 0)
+ return r;
+ r = _crc_load_or_init(destfd, &scm_dst);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(destfd, &scm_dst);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ return scm.read(off, len, bl, out);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_GENERICFILESTOREBACKEND_H
+#define CEPH_GENERICFILESTOREBACKEND_H
+
+#include "FileStore.h"
+
+class SloppyCRCMap;
+
+class GenericFileStoreBackend : public FileStoreBackend {
+private:
+ bool ioctl_fiemap;
+ bool seek_data_hole;
+ bool m_filestore_fiemap;
+ bool m_filestore_seek_data_hole;
+ bool m_filestore_fsync_flushes_journal_data;
+ bool m_filestore_splice;
+public:
+ GenericFileStoreBackend(FileStore *fs);
+ virtual ~GenericFileStoreBackend() {}
+
+ virtual const char *get_name() {
+ return "generic";
+ }
+ virtual int detect_features();
+ virtual int create_current();
+ virtual bool can_checkpoint() { return false; }
+ virtual int list_checkpoints(list<string>& ls) { return 0; }
+ virtual int create_checkpoint(const string& name, uint64_t *cid) { return -EOPNOTSUPP; }
+ virtual int sync_checkpoint(uint64_t id) { return -EOPNOTSUPP; }
+ virtual int rollback_to(const string& name) { return -EOPNOTSUPP; }
+ virtual int destroy_checkpoint(const string& name) { return -EOPNOTSUPP; }
+ virtual int syncfs();
+ virtual bool has_fiemap() { return ioctl_fiemap; }
+ virtual bool has_seek_data_hole() { return seek_data_hole; }
+ virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap);
+ virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+ virtual int set_alloc_hint(int fd, uint64_t hint) { return -EOPNOTSUPP; }
+ virtual bool has_splice() const { return m_filestore_splice; }
+private:
+ int _crc_load_or_init(int fd, SloppyCRCMap *cm);
+ int _crc_save(int fd, SloppyCRCMap *cm);
+public:
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl);
+ virtual int _crc_update_truncate(int fd, loff_t off);
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len);
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff);
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out);
+};
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/types.h"
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+#include <errno.h>
+
+#include "HashIndex.h"
+
+#include "common/debug.h"
+#define dout_subsys ceph_subsys_filestore
+
+const string HashIndex::SUBDIR_ATTR = "contents";
+const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
+
+/// hex digit to integer value
+int hex_to_int(char c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ if (c >= 'A' && c <= 'F')
+ return c - 'A' + 10;
+ assert(0);
+}
+
+/// int value to hex digit
+char int_to_hex(int v)
+{
+ assert(v < 16);
+ if (v < 10)
+ return '0' + v;
+ return 'A' + v - 10;
+}
+
+/// reverse bits in a nibble (0..15)
+int reverse_nibble_bits(int in)
+{
+ assert(in < 16);
+ return
+ ((in & 8) >> 3) |
+ ((in & 4) >> 1) |
+ ((in & 2) << 1) |
+ ((in & 1) << 3);
+}
+
+/// reverse nibble bits in a hex digit
+char reverse_hexdigit_bits(char c)
+{
+ return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
+}
+
+/// reverse nibble bits in a hex string
+string reverse_hexdigit_bits_string(string s)
+{
+ for (unsigned i=0; i<s.size(); ++i)
+ s[i] = reverse_hexdigit_bits(s[i]);
+ return s;
+}
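+// For illustration: reverse_hexdigit_bits_string("A4") == "52", since the
+// nibble 0xA (binary 1010) reverses to 0x5 (0101) and 0x4 (0100) to 0x2 (0010).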
+
+/// compare hex digit (as length 1 string) bitwise
+bool cmp_hexdigit_bitwise(const string& l, const string& r)
+{
+ assert(l.length() == 1 && r.length() == 1);
+ int lv = hex_to_int(l[0]);
+ int rv = hex_to_int(r[0]);
+ assert(lv < 16);
+ assert(rv < 16);
+ return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
+}
+
+/// compare hex digit string bitwise
+bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
+{
+ string ll = reverse_hexdigit_bits_string(l);
+ string rr = reverse_hexdigit_bits_string(r);
+ return ll < rr;
+}
+
+int HashIndex::cleanup() {
+ bufferlist bl;
+ int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0) {
+ // No in progress operations!
+ return 0;
+ }
+ bufferlist::iterator i = bl.begin();
+ InProgressOp in_progress(i);
+ subdir_info_s info;
+ r = get_info(in_progress.path, &info);
+ if (r == -ENOENT) {
+ return end_split_or_merge(in_progress.path);
+ } else if (r < 0) {
+ return r;
+ }
+
+ if (in_progress.is_split())
+ return complete_split(in_progress.path, info);
+ else if (in_progress.is_merge())
+ return complete_merge(in_progress.path, info);
+ else if (in_progress.is_col_split()) {
+ for (vector<string>::iterator i = in_progress.path.begin();
+ i != in_progress.path.end();
+ ++i) {
+ vector<string> path(in_progress.path.begin(), i);
+ int r = reset_attr(path);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+ }
+ else
+ return -EINVAL;
+}
+
+int HashIndex::reset_attr(
+ const vector<string> &path)
+{
+ int exists = 0;
+ int r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists)
+ return 0;
+ map<string, ghobject_t> objects;
+ vector<string> subdirs;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ subdir_info_s info;
+ info.hash_level = path.size();
+ info.objs = objects.size();
+ info.subdirs = subdirs.size();
+ return set_info(path, info);
+}
+
+int HashIndex::col_split_level(
+ HashIndex &from,
+ HashIndex &to,
+ const vector<string> &path,
+ uint32_t inbits,
+ uint32_t match,
+ unsigned *mkdirred)
+{
+  /* For each subdir, move, recurse, or ignore, depending on how the low-order
+   * bits of the hash represented by the subdir path compare with the inbits
+   * and match arguments passed in.
+ */
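+  /* Illustrative example (the values below are invented for this comment):
+   * with inbits = 3 and match = 0x5, the subdir path ["5"] maps to bits = 4
+   * and hash = 0x5. Since bits >= inbits, match_hash(0x5, 3, 0x5) compares
+   * the low 3 bits (0x5 & 0x7 == 0x5 == match), so that subdir is moved,
+   * while ["4"] (low bits 0x4 != 0x5) is skipped.
+   */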
+ vector<string> subdirs;
+ int r = from.list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, ghobject_t> objects;
+ r = from.list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+
+ set<string> to_move;
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ uint32_t bits = 0;
+ uint32_t hash = 0;
+ vector<string> sub_path(path.begin(), path.end());
+ sub_path.push_back(*i);
+ path_to_hobject_hash_prefix(sub_path, &bits, &hash);
+ if (bits < inbits) {
+ if (hobject_t::match_hash(hash, bits, match)) {
+ r = col_split_level(
+ from,
+ to,
+ sub_path,
+ inbits,
+ match,
+ mkdirred);
+ if (r < 0)
+ return r;
+ if (*mkdirred > path.size())
+ *mkdirred = path.size();
+ } // else, skip, doesn't need to be moved or recursed into
+ } else {
+ if (hobject_t::match_hash(hash, inbits, match)) {
+ to_move.insert(*i);
+ }
+ } // else, skip, doesn't need to be moved or recursed into
+ }
+
+ /* Then, do the same for each object */
+ map<string, ghobject_t> objs_to_move;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (i->second.match(inbits, match)) {
+ objs_to_move.insert(*i);
+ }
+ }
+
+ if (objs_to_move.empty() && to_move.empty())
+ return 0;
+
+ // Make parent directories as needed
+ while (*mkdirred < path.size()) {
+ ++*mkdirred;
+ int exists = 0;
+ vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
+ r = to.path_exists(creating_path, &exists);
+ if (r < 0)
+ return r;
+ if (exists)
+ continue;
+ subdir_info_s info;
+ info.objs = 0;
+ info.subdirs = 0;
+ info.hash_level = creating_path.size();
+ if (*mkdirred < path.size() - 1)
+ info.subdirs = 1;
+ r = to.start_col_split(creating_path);
+ if (r < 0)
+ return r;
+ r = to.create_path(creating_path);
+ if (r < 0)
+ return r;
+ r = to.set_info(creating_path, info);
+ if (r < 0)
+ return r;
+ r = to.end_split_or_merge(creating_path);
+ if (r < 0)
+ return r;
+ }
+
+ subdir_info_s from_info;
+ subdir_info_s to_info;
+ r = from.get_info(path, &from_info);
+ if (r < 0)
+ return r;
+ r = to.get_info(path, &to_info);
+ if (r < 0)
+ return r;
+
+ from.start_col_split(path);
+ to.start_col_split(path);
+
+ // Do subdir moves
+ for (set<string>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ from_info.subdirs--;
+ to_info.subdirs++;
+ r = move_subdir(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+ for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
+ i != objs_to_move.end();
+ ++i) {
+ from_info.objs--;
+ to_info.objs++;
+ r = move_object(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+
+ r = to.set_info(path, to_info);
+ if (r < 0)
+ return r;
+ r = from.set_info(path, from_info);
+ if (r < 0)
+ return r;
+ from.end_split_or_merge(path);
+ to.end_split_or_merge(path);
+ return 0;
+}
+
+int HashIndex::_split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest) {
+ assert(collection_version() == dest->collection_version());
+ unsigned mkdirred = 0;
+ return col_split_level(
+ *this,
+ *static_cast<HashIndex*>(dest),
+ vector<string>(),
+ bits,
+ match,
+ &mkdirred);
+}
+
+int HashIndex::_init() {
+ subdir_info_s info;
+ vector<string> path;
+ return set_info(path, info);
+}
+
+/* LFNIndex virtual method implementations */
+int HashIndex::_created(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name) {
+ subdir_info_s info;
+ int r;
+ r = get_info(path, &info);
+ if (r < 0)
+ return r;
+ info.objs++;
+ r = set_info(path, info);
+ if (r < 0)
+ return r;
+
+ if (must_split(info)) {
+ int r = initiate_split(path, info);
+ if (r < 0)
+ return r;
+ return complete_split(path, info);
+ } else {
+ return 0;
+ }
+}
+
+int HashIndex::_remove(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name) {
+ int r;
+ r = remove_object(path, oid);
+ if (r < 0)
+ return r;
+ subdir_info_s info;
+ r = get_info(path, &info);
+ if (r < 0)
+ return r;
+ info.objs--;
+ r = set_info(path, info);
+ if (r < 0)
+ return r;
+ if (must_merge(info)) {
+ r = initiate_merge(path, info);
+ if (r < 0)
+ return r;
+ return complete_merge(path, info);
+ } else {
+ return 0;
+ }
+}
+
+int HashIndex::_lookup(const ghobject_t &oid,
+ vector<string> *path,
+ string *mangled_name,
+ int *hardlink) {
+ vector<string> path_comp;
+ get_path_components(oid, &path_comp);
+ vector<string>::iterator next = path_comp.begin();
+ int exists;
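+  // Walk down the hashed directory tree one path component at a time and
+  // stop at the deepest directory that already exists; the object's link,
+  // if any, lives there.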
+ while (1) {
+ int r = path_exists(*path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists) {
+ if (path->empty())
+ return -ENOENT;
+ path->pop_back();
+ break;
+ }
+ if (next == path_comp.end())
+ break;
+ path->push_back(*(next++));
+ }
+ return get_mangled_name(*path, oid, mangled_name, hardlink);
+}
+
+int HashIndex::_collection_list_partial(const ghobject_t &start,
+ const ghobject_t &end,
+ bool sort_bitwise,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
+ vector<string> path;
+ ghobject_t _next;
+ if (!next)
+ next = &_next;
+ *next = start;
+ dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
+ return list_by_hash(path, end, sort_bitwise, max_count, next, ls);
+}
+
+int HashIndex::prep_delete() {
+ return recursive_remove(vector<string>());
+}
+
+int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
+ int ret;
+ vector<string> path;
+ subdir_info_s root_info;
+  // Make sure there are neither objects nor sub-folders
+  // in this collection
+ ret = get_info(path, &root_info);
+ if (ret < 0)
+ return ret;
+
+ // Do the folder splitting first
+ ret = pre_split_folder(pg_num, expected_num_objs);
+ if (ret < 0)
+ return ret;
+ // Initialize the folder info starting from root
+ return init_split_folder(path, 0);
+}
+
+int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
+{
+ // If folder merging is enabled (by setting the threshold positive),
+ // no need to split
+ if (merge_threshold > 0)
+ return 0;
+ const coll_t c = coll();
+ // Do not split if the expected number of objects in this collection is zero (by default)
+ if (expected_num_objs == 0)
+ return 0;
+
+  // Calculate the number of leaf folders (which actually store files)
+  // that need to be created
+ const uint64_t objs_per_folder = (uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier * 16;
+  uint64_t leavies = expected_num_objs / objs_per_folder;
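+  // For illustration only (these numbers are not taken from the config):
+  // with merge_threshold = 10 and split_multiplier = 2, objs_per_folder is
+  // 10 * 2 * 16 = 320, so expected_num_objs = 1000000 yields
+  // leavies = 1000000 / 320 = 3125 leaf folders to pre-create.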
+ // No need to split
+ if (leavies == 0 || expected_num_objs == objs_per_folder)
+ return 0;
+
+ spg_t spgid;
+ if (!c.is_pg_prefix(&spgid))
+ return -EINVAL;
+ const ps_t ps = spgid.pgid.ps();
+
+ // the most significant bits of pg_num
+ const int pg_num_bits = calc_num_bits(pg_num - 1);
+ ps_t tmp_id = ps;
+  // calculate the number of levels at which we create only one sub-folder
+ int num = pg_num_bits / 4;
+ // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
+ // so that splitting starts at level 3
+ if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
+ --num;
+ }
+
+ int ret;
+ // Start with creation that only has one subfolder
+ vector<string> paths;
+ int dump_num = num;
+ while (num-- > 0) {
+ ps_t v = tmp_id & 0x0000000f;
+ paths.push_back(to_hex(v));
+ ret = create_path(paths);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ tmp_id = tmp_id >> 4;
+ }
+
+ // Starting from here, we can split by creating multiple subfolders
+ const int left_bits = pg_num_bits - dump_num * 4;
+  // this variable denotes how many bits (at this level) can be
+  // used for sub-folder splitting
+ int split_bits = 4 - left_bits;
+  // the logic below is inspired by rados.h#ceph_stable_mod;
+  // it basically determines how many sub-folders we should
+  // create for splitting
+ assert(pg_num_bits > 0); // otherwise BAD_SHIFT
+ if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
+ ++split_bits;
+ }
+ const uint32_t subs = (1 << split_bits);
+ // Calculate how many levels we create starting from here
+ int level = 0;
+ leavies /= subs;
+ while (leavies > 1) {
+ ++level;
+ leavies = leavies >> 4;
+ }
+ for (uint32_t i = 0; i < subs; ++i) {
+ assert(split_bits <= 4); // otherwise BAD_SHIFT
+ int v = tmp_id | (i << ((4 - split_bits) % 4));
+ paths.push_back(to_hex(v));
+ ret = create_path(paths);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ ret = recursive_create_path(paths, level);
+ if (ret < 0)
+ return ret;
+ paths.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
+{
+ // Get the number of sub directories for the current path
+ vector<string> subdirs;
+ int ret = list_subdirs(path, &subdirs);
+ if (ret < 0)
+ return ret;
+ subdir_info_s info;
+ info.subdirs = subdirs.size();
+ info.hash_level = hash_level;
+ ret = set_info(path, info);
+ if (ret < 0)
+ return ret;
+ ret = fsync_dir(path);
+ if (ret < 0)
+ return ret;
+
+ // Do the same for subdirs
+ vector<string>::const_iterator iter;
+ for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
+ path.push_back(*iter);
+ ret = init_split_folder(path, hash_level + 1);
+ if (ret < 0)
+ return ret;
+ path.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::recursive_create_path(vector<string>& path, int level)
+{
+ if (level == 0)
+ return 0;
+ for (int i = 0; i < 16; ++i) {
+ path.push_back(to_hex(i));
+ int ret = create_path(path);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ ret = recursive_create_path(path, level - 1);
+ if (ret < 0)
+ return ret;
+ path.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::recursive_remove(const vector<string> &path) {
+ vector<string> subdirs;
+ int r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, ghobject_t> objects;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ if (!objects.empty())
+ return -ENOTEMPTY;
+ vector<string> subdir(path);
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ subdir.push_back(*i);
+ r = recursive_remove(subdir);
+ if (r < 0)
+ return r;
+ subdir.pop_back();
+ }
+ return remove_path(path);
+}
+
+int HashIndex::start_col_split(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::start_split(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::SPLIT, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::start_merge(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::MERGE, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::end_split_or_merge(const vector<string> &path) {
+ return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
+}
+
+int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
+ bufferlist buf;
+ int r = get_attr_path(path, SUBDIR_ATTR, buf);
+ if (r < 0)
+ return r;
+ bufferlist::iterator bufiter = buf.begin();
+ info->decode(bufiter);
+ assert(path.size() == (unsigned)info->hash_level);
+ return 0;
+}
+
+int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
+ bufferlist buf;
+ assert(path.size() == (unsigned)info.hash_level);
+ info.encode(buf);
+ return add_attr_path(path, SUBDIR_ATTR, buf);
+}
+
+bool HashIndex::must_merge(const subdir_info_s &info) {
+ return (info.hash_level > 0 &&
+ merge_threshold > 0 &&
+ info.objs < (unsigned)merge_threshold &&
+ info.subdirs == 0);
+}
+
+bool HashIndex::must_split(const subdir_info_s &info) {
+ return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
+ info.objs > ((unsigned)(abs(merge_threshold)) * 16 * split_multiplier));
+
+}
+
+int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
+ return start_merge(path);
+}
+
+int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
+ vector<string> dst = path;
+ dst.pop_back();
+ subdir_info_s dstinfo;
+ int r, exists;
+ r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ r = get_info(dst, &dstinfo);
+ if (r < 0)
+ return r;
+ if (exists) {
+ r = move_objects(path, dst);
+ if (r < 0)
+ return r;
+ r = reset_attr(dst);
+ if (r < 0)
+ return r;
+ r = remove_path(path);
+ if (r < 0)
+ return r;
+ }
+ if (must_merge(dstinfo)) {
+ r = initiate_merge(dst, dstinfo);
+ if (r < 0)
+ return r;
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return complete_merge(dst, dstinfo);
+ }
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
+ return start_split(path);
+}
+
+int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
+ int level = info.hash_level;
+ map<string, ghobject_t> objects;
+ vector<string> dst = path;
+ int r;
+ dst.push_back("");
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ vector<string> subdirs_vec;
+ r = list_subdirs(path, &subdirs_vec);
+ if (r < 0)
+ return r;
+ set<string> subdirs;
+ subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
+ map<string, map<string, ghobject_t> > mapped;
+ map<string, ghobject_t> moved;
+ int num_moved = 0;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ vector<string> new_path;
+ get_path_components(i->second, &new_path);
+ mapped[new_path[level]][i->first] = i->second;
+ }
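+  // 'mapped' now groups the objects by the next-level subdir (the hash
+  // character at position 'level') that each one belongs under.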
+ for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
+ i != mapped.end();
+ ) {
+ dst[level] = i->first;
+    /* If the info already exists, it must be correct;
+     * we may be picking up a partially finished split. */
+ subdir_info_s temp;
+ // subdir has already been fully copied
+ if (subdirs.count(i->first) && !get_info(dst, &temp)) {
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ }
+ ++i;
+ continue;
+ }
+
+ subdir_info_s info_new;
+ info_new.objs = i->second.size();
+ info_new.subdirs = 0;
+ info_new.hash_level = level + 1;
+ if (must_merge(info_new) && !subdirs.count(i->first)) {
+ mapped.erase(i++);
+ continue;
+ }
+
+ // Subdir doesn't yet exist
+ if (!subdirs.count(i->first)) {
+ info.subdirs += 1;
+ r = create_path(dst);
+ if (r < 0)
+ return r;
+ } // else subdir has been created but only partially copied
+
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ r = link_object(path, dst, j->second, j->first);
+ // May be a partially finished split
+ if (r < 0 && r != -EEXIST) {
+ return r;
+ }
+ }
+
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+
+ // Presence of info must imply that all objects have been copied
+ r = set_info(dst, info_new);
+ if (r < 0)
+ return r;
+
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+
+ ++i;
+ }
+ r = remove_objects(path, moved, &objects);
+ if (r < 0)
+ return r;
+ info.objs = objects.size();
+ r = reset_attr(path);
+ if (r < 0)
+ return r;
+ r = fsync_dir(path);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+void HashIndex::get_path_components(const ghobject_t &oid,
+ vector<string> *path) {
+ char buf[MAX_HASH_LEVEL + 1];
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
+
+ // Path components are the hex characters of oid.hobj.hash, least
+ // significant first
+ for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
+ path->push_back(string(&buf[i], 1));
+ }
+}
+
+string HashIndex::get_hash_str(uint32_t hash) {
+ char buf[MAX_HASH_LEVEL + 1];
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
+ string retval;
+ for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
+ retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
+ }
+ return retval;
+}
+
+string HashIndex::get_path_str(const ghobject_t &oid) {
+ assert(!oid.is_max());
+ return get_hash_str(oid.hobj.get_hash());
+}
+
+uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
+ while (prefix.size() < sizeof(uint32_t) * 2) {
+ prefix.push_back('0');
+ }
+ uint32_t hash;
+ sscanf(prefix.c_str(), "%x", &hash);
+ // nibble reverse
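+  // e.g. 0x76540000 becomes 0x00004567 (illustrative value)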
+ hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
+ hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
+ hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
+ return hash;
+}
+
+int HashIndex::get_path_contents_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t *next_object,
+ set<string, CmpHexdigitStringBitwise> *hash_prefixes,
+ set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
+{
+ map<string, ghobject_t> rev_objects;
+ int r;
+ r = list_objects(path, 0, 0, &rev_objects);
+ if (r < 0)
+ return r;
+ // bitwise sort
+ for (map<string, ghobject_t>::iterator i = rev_objects.begin();
+ i != rev_objects.end();
+ ++i) {
+ if (next_object && cmp_bitwise(i->second, *next_object) < 0)
+ continue;
+ string hash_prefix = get_path_str(i->second);
+ hash_prefixes->insert(hash_prefix);
+ objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
+ }
+ vector<string> subdirs;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ // sort subdirs bitwise (by reversing hex digit nibbles)
+ std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
+
+ // Local to this function, we will convert the prefix strings
+ // (previously simply the reversed hex digits) to also have each
+ // digit's nibbles reversed. This will make the strings sort
+ // bitwise.
+ string cur_prefix;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ cur_prefix.append(reverse_hexdigit_bits_string(*i));
+ }
+ string next_object_string;
+ if (next_object)
+ next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
+ if (next_object) {
+ if (next_object->is_max())
+ continue;
+ if (candidate < next_object_string.substr(0, candidate.size()))
+ continue;
+ }
+ // re-reverse the hex digit nibbles for the caller
+ hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
+ }
+ return 0;
+}
+
+int HashIndex::get_path_contents_by_hash_nibblewise(
+ const vector<string> &path,
+ const ghobject_t *next_object,
+ set<string> *hash_prefixes,
+ set<pair<string, ghobject_t>, CmpPairNibblewise > *objects)
+{
+ map<string, ghobject_t> rev_objects;
+ int r;
+ r = list_objects(path, 0, 0, &rev_objects);
+ if (r < 0)
+ return r;
+
+ for (map<string, ghobject_t>::iterator i = rev_objects.begin();
+ i != rev_objects.end();
+ ++i) {
+ string hash_prefix = get_path_str(i->second);
+ if (next_object && cmp_nibblewise(i->second, *next_object) < 0)
+ continue;
+ hash_prefixes->insert(hash_prefix);
+ objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
+ }
+
+ vector<string> subdirs;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ // sort nibblewise (string sort of (reversed) hex digits)
+ std::sort(subdirs.begin(), subdirs.end());
+
+ string cur_prefix;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ cur_prefix.append(*i);
+ }
+ string next_object_string;
+ if (next_object)
+ next_object_string = get_path_str(*next_object);
+
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ string candidate = cur_prefix + *i;
+ if (next_object) {
+ if (next_object->is_max())
+ continue;
+ if (candidate < next_object_string.substr(0, candidate.size()))
+ continue;
+ }
+ hash_prefixes->insert(cur_prefix + *i);
+ }
+ return 0;
+}
+
+int HashIndex::list_by_hash(const vector<string> &path,
+ const ghobject_t &end,
+ bool sort_bitwise,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ assert(out);
+ if (sort_bitwise)
+ return list_by_hash_bitwise(path, end, max_count, next, out);
+ else
+ return list_by_hash_nibblewise(path, end, max_count, next, out);
+}
+
+int HashIndex::list_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t& end,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ vector<string> next_path = path;
+ next_path.push_back("");
+ set<string, CmpHexdigitStringBitwise> hash_prefixes;
+ set<pair<string, ghobject_t>, CmpPairBitwise> objects;
+ int r = get_path_contents_by_hash_bitwise(path,
+ next,
+ &hash_prefixes,
+ &objects);
+ if (r < 0)
+ return r;
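+  // Walk the prefixes in bitwise order: a prefix with no objects behind it
+  // names a subdirectory, so recurse into it; otherwise emit the objects
+  // sharing that prefix until end or max_count is hit.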
+ for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
+ i != hash_prefixes.end();
+ ++i) {
+ dout(20) << __func__ << " prefix " << *i << dendl;
+ set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
+ make_pair(*i, ghobject_t()));
+ if (j == objects.end() || j->first != *i) {
+ *(next_path.rbegin()) = *(i->rbegin());
+ ghobject_t next_recurse;
+ if (next)
+ next_recurse = *next;
+ r = list_by_hash_bitwise(next_path,
+ end,
+ max_count,
+ &next_recurse,
+ out);
+
+ if (r < 0)
+ return r;
+ if (!next_recurse.is_max()) {
+ if (next)
+ *next = next_recurse;
+ return 0;
+ }
+ } else {
+ while (j != objects.end() && j->first == *i) {
+ if (max_count > 0 && out->size() == (unsigned)max_count) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (cmp_bitwise(j->second, end) >= 0) {
+ if (next)
+ *next = ghobject_t::get_max();
+ return 0;
+ }
+ if (!next || cmp_bitwise(j->second, *next) >= 0) {
+ dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
+ out->push_back(j->second);
+ }
+ ++j;
+ }
+ }
+ }
+ if (next)
+ *next = ghobject_t::get_max();
+ return 0;
+}
+
+int HashIndex::list_by_hash_nibblewise(
+ const vector<string> &path,
+ const ghobject_t& end,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ vector<string> next_path = path;
+ next_path.push_back("");
+ set<string> hash_prefixes;
+ set<pair<string, ghobject_t>, CmpPairNibblewise> objects;
+ int r = get_path_contents_by_hash_nibblewise(path,
+ next,
+ &hash_prefixes,
+ &objects);
+ if (r < 0)
+ return r;
+ for (set<string>::iterator i = hash_prefixes.begin();
+ i != hash_prefixes.end();
+ ++i) {
+ dout(20) << __func__ << " prefix " << *i << dendl;
+ set<pair<string, ghobject_t>, CmpPairNibblewise >::iterator j =
+ objects.lower_bound(make_pair(*i, ghobject_t()));
+ if (j == objects.end() || j->first != *i) {
+ *(next_path.rbegin()) = *(i->rbegin());
+ ghobject_t next_recurse;
+ if (next)
+ next_recurse = *next;
+ r = list_by_hash_nibblewise(next_path,
+ end,
+ max_count,
+ &next_recurse,
+ out);
+
+ if (r < 0)
+ return r;
+ if (!next_recurse.is_max()) {
+ if (next)
+ *next = next_recurse;
+ return 0;
+ }
+ } else {
+ while (j != objects.end() && j->first == *i) {
+ if (max_count > 0 && out->size() == (unsigned)max_count) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (cmp_nibblewise(j->second, end) >= 0) {
+ if (next)
+ *next = ghobject_t::get_max();
+ return 0;
+ }
+ if (!next || cmp_nibblewise(j->second, *next) >= 0) {
+ out->push_back(j->second);
+ }
+ ++j;
+ }
+ }
+ }
+ if (next)
+ *next = ghobject_t::get_max();
+ return 0;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_HASHINDEX_H
+#define CEPH_HASHINDEX_H
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "LFNIndex.h"
+
+extern string reverse_hexdigit_bits_string(string l);
+
+/**
+ * Implements collection prehashing.
+ *
+ * @verbatim
+ * (root) - 0 - 0
+ * - 1
+ * - E
+ * - 1
+ * - 2 - D - 0
+ * .
+ * .
+ * .
+ * - F - 0
+ * @endverbatim
+ *
+ * A file is located at the longest existing directory from the root
+ * given by the hex characters in the hash beginning with the least
+ * significant.
+ *
+ * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
+ * would be located in (root)/2/D/0/
+ *
+ * Subdirectories are created when the number of objects in a directory
+ * exceeds abs(merge_threshold) * 16 * split_multiplier. The number of objects in a directory
+ * is encoded as subdir_info_s in an xattr on the directory.
+ */
+class HashIndex : public LFNIndex {
+private:
+ /// Attribute name for storing subdir info @see subdir_info_s
+ static const string SUBDIR_ATTR;
+ /// Attribute name for storing in progress op tag
+ static const string IN_PROGRESS_OP_TAG;
+  /// Size of the object hash, in bits
+ static const int PATH_HASH_LEN = 32;
+ /// Max length of hashed path
+ static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4);
+
+ /**
+   * Merges occur when the number of objects drops below
+   * merge_threshold and splits occur when the number of objects
+   * exceeds 16 * abs(merge_threshold) * split_multiplier.
+   * Note that if merge_threshold is less than zero, merging will never occur.
+ */
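+  // For illustration only (values invented here): with merge_threshold = 10
+  // and split_multiplier = 2, a subdir splits once it holds more than
+  // 10 * 16 * 2 = 320 objects and merges once it drops below 10 objects
+  // (merging additionally requires hash_level > 0 and no subdirs).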
+ int merge_threshold;
+ int split_multiplier;
+
+ /// Encodes current subdir state for determining when to split/merge.
+ struct subdir_info_s {
+ uint64_t objs; ///< Objects in subdir.
+ uint32_t subdirs; ///< Subdirs in subdir.
+ uint32_t hash_level; ///< Hashlevel of subdir.
+
+ subdir_info_s() : objs(0), subdirs(0), hash_level(0) {}
+
+ void encode(bufferlist &bl) const
+ {
+ __u8 v = 1;
+ ::encode(v, bl);
+ ::encode(objs, bl);
+ ::encode(subdirs, bl);
+ ::encode(hash_level, bl);
+ }
+
+ void decode(bufferlist::iterator &bl)
+ {
+ __u8 v;
+ ::decode(v, bl);
+ assert(v == 1);
+ ::decode(objs, bl);
+ ::decode(subdirs, bl);
+ ::decode(hash_level, bl);
+ }
+ };
+
+ /// Encodes in progress split or merge
+ struct InProgressOp {
+ static const int SPLIT = 0;
+ static const int MERGE = 1;
+ static const int COL_SPLIT = 2;
+ int op;
+ vector<string> path;
+
+ InProgressOp(int op, const vector<string> &path)
+ : op(op), path(path) {}
+
+ InProgressOp(bufferlist::iterator &bl) {
+ decode(bl);
+ }
+
+ bool is_split() const { return op == SPLIT; }
+ bool is_col_split() const { return op == COL_SPLIT; }
+ bool is_merge() const { return op == MERGE; }
+
+ void encode(bufferlist &bl) const {
+ __u8 v = 1;
+ ::encode(v, bl);
+ ::encode(op, bl);
+ ::encode(path, bl);
+ }
+
+ void decode(bufferlist::iterator &bl) {
+ __u8 v;
+ ::decode(v, bl);
+ assert(v == 1);
+ ::decode(op, bl);
+ ::decode(path, bl);
+ }
+ };
+
+
+public:
+ /// Constructor.
+ HashIndex(
+ coll_t collection, ///< [in] Collection
+ const char *base_path, ///< [in] Path to the index root.
+    int merge_at, ///< [in] Merge threshold.
+    int split_multiple, ///< [in] Split threshold.
+ uint32_t index_version,///< [in] Index version
+ double retry_probability=0) ///< [in] retry probability
+ : LFNIndex(collection, base_path, index_version, retry_probability),
+ merge_threshold(merge_at),
+ split_multiplier(split_multiple) {}
+
+ /// @see CollectionIndex
+ uint32_t collection_version() { return index_version; }
+
+ /// @see CollectionIndex
+ int cleanup();
+
+ /// @see CollectionIndex
+ int prep_delete();
+
+ /// @see CollectionIndex
+ int _split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest
+ );
+
+protected:
+ int _init();
+
+ int _created(
+ const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name
+ );
+ int _remove(
+ const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name
+ );
+ int _lookup(
+ const ghobject_t &oid,
+ vector<string> *path,
+ string *mangled_name,
+ int *hardlink
+ );
+
+ /**
+ * Pre-hash the collection to create folders according to the expected number
+ * of objects in this collection.
+ */
+ int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ );
+
+ int _collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ bool sort_bitwise,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next
+ );
+private:
+ /// Recursively remove path and its subdirs
+ int recursive_remove(
+ const vector<string> &path ///< [in] path to remove
+ ); /// @return Error Code, 0 on success
+ /// Tag root directory at beginning of col_split
+ int start_col_split(
+ const vector<string> &path ///< [in] path to split
+ ); ///< @return Error Code, 0 on success
+ /// Tag root directory at beginning of split
+ int start_split(
+ const vector<string> &path ///< [in] path to split
+ ); ///< @return Error Code, 0 on success
+  /// Tag root directory at beginning of merge
+ int start_merge(
+ const vector<string> &path ///< [in] path to merge
+ ); ///< @return Error Code, 0 on success
+ /// Remove tag at end of split or merge
+ int end_split_or_merge(
+    const vector<string> &path ///< [in] path being split or merged
+ ); ///< @return Error Code, 0 on success
+ /// Gets info from the xattr on the subdir represented by path
+ int get_info(
+ const vector<string> &path, ///< [in] Path from which to read attribute.
+ subdir_info_s *info ///< [out] Attribute value
+ ); /// @return Error Code, 0 on success
+
+ /// Sets info to the xattr on the subdir represented by path
+ int set_info(
+ const vector<string> &path, ///< [in] Path on which to set attribute.
+ const subdir_info_s &info ///< [in] Value to set
+ ); /// @return Error Code, 0 on success
+
+  /// Encapsulates logic for when to merge.
+  bool must_merge(
+    const subdir_info_s &info ///< [in] Info to check
+  ); /// @return True if info must be merged, False otherwise
+
+  /// Encapsulates logic for when to split.
+  bool must_split(
+    const subdir_info_s &info ///< [in] Info to check
+  ); /// @return True if info must be split, False otherwise
+
+ /// Initiates merge
+ int initiate_merge(
+ const vector<string> &path, ///< [in] Subdir to merge
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Completes merge
+ int complete_merge(
+ const vector<string> &path, ///< [in] Subdir to merge
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Resets attr to match actual subdir contents
+ int reset_attr(
+ const vector<string> &path ///< [in] path to cleanup
+ );
+
+ /// Initiate Split
+ int initiate_split(
+ const vector<string> &path, ///< [in] Subdir to split
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Completes Split
+ int complete_split(
+ const vector<string> &path, ///< [in] Subdir to split
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Determine path components from hoid hash
+ void get_path_components(
+ const ghobject_t &oid, ///< [in] Object for which to get path components
+ vector<string> *path ///< [out] Path components for hoid.
+ );
+
+ /// Pre-hash and split folders to avoid runtime splitting
+ /// according to the given expected object number.
+ int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs);
+
+ /// Initialize the folder (dir info) with the given hash
+ /// level and number of its subdirs.
+ int init_split_folder(vector<string> &path, uint32_t hash_level);
+
+ /// do collection split for path
+ static int col_split_level(
+ HashIndex &from, ///< [in] from index
+ HashIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path to split
+ uint32_t bits, ///< [in] num bits to match
+ uint32_t match, ///< [in] bits to match
+ unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred
+ );
+
+
+ /**
+ * Get string representation of ghobject_t/hash
+ *
+ * e.g: 0x01234567 -> "76543210"
+ */
+ static string get_path_str(
+ const ghobject_t &oid ///< [in] Object to get hash string for
+ ); ///< @return Hash string for hoid.
+
+ /// Get string from hash, @see get_path_str
+ static string get_hash_str(
+ uint32_t hash ///< [in] Hash to convert to a string.
+ ); ///< @return String representation of hash
+
+  /// Get hash from hash prefix string e.g. "FFFFAB" -> 0x00BAFFFF
+ static uint32_t hash_prefix_to_hash(
+ string prefix ///< [in] string to convert
+ ); ///< @return Hash
+
+ /// Get hash mod from path
+ static void path_to_hobject_hash_prefix(
+ const vector<string> &path,///< [in] path to convert
+ uint32_t *bits, ///< [out] bits
+ uint32_t *hash ///< [out] hash
+ ) {
+ string hash_str;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ hash_str.push_back(*i->begin());
+ }
+ uint32_t rev_hash = hash_prefix_to_hash(hash_str);
+ if (hash)
+ *hash = rev_hash;
+ if (bits)
+ *bits = path.size() * 4;
+ }
+
+ /// Calculate the number of bits.
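+  /// e.g. calc_num_bits(4095) == 12 and calc_num_bits(4096) == 13
+  /// (illustrative values).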
+ static int calc_num_bits(uint64_t n) {
+ int ret = 0;
+ while (n > 0) {
+ n = n >> 1;
+ ret++;
+ }
+ return ret;
+ }
+
+ /// Convert a number to hex string (upper case).
+ static string to_hex(int n) {
+ assert(n >= 0 && n < 16);
+ char c = (n <= 9 ? ('0' + n) : ('A' + n - 10));
+ string str;
+ str.append(1, c);
+ return str;
+ }
+
+ struct CmpPairNibblewise {
+ bool operator()(const pair<string, ghobject_t>& l,
+ const pair<string, ghobject_t>& r)
+ {
+ if (l.first < r.first)
+ return true;
+ if (l.first > r.first)
+ return false;
+ if (cmp_nibblewise(l.second, r.second) < 0)
+ return true;
+ return false;
+ }
+ };
+
+ struct CmpPairBitwise {
+ bool operator()(const pair<string, ghobject_t>& l,
+ const pair<string, ghobject_t>& r)
+ {
+ if (l.first < r.first)
+ return true;
+ if (l.first > r.first)
+ return false;
+ if (cmp_bitwise(l.second, r.second) < 0)
+ return true;
+ return false;
+ }
+ };
+
+ struct CmpHexdigitStringBitwise {
+ bool operator()(const string& l, const string& r) {
+ return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r);
+ }
+ };
+
+ /// Get path contents by hash
+ int get_path_contents_by_hash_bitwise(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t *next_object, /// [in] list > *next_object
+ set<string, CmpHexdigitStringBitwise> *hash_prefixes, /// [out] prefixes in dir
+ set<pair<string, ghobject_t>, CmpPairBitwise> *objects /// [out] objects
+ );
+ int get_path_contents_by_hash_nibblewise(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t *next_object, /// [in] list > *next_object
+ set<string> *hash_prefixes, /// [out] prefixes in dir
+ set<pair<string, ghobject_t>, CmpPairNibblewise> *objects /// [out] objects
+ );
+
+ /// List objects in collection in ghobject_t order
+ int list_by_hash(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t &end, /// [in] List only objects < end
+ bool sort_bitwise, /// [in] sort bitwise
+ int max_count, /// [in] List at most max_count
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
+ ); ///< @return Error Code, 0 on success
+ /// List objects in collection in ghobject_t order
+ int list_by_hash_bitwise(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t &end, /// [in] List only objects < end
+ int max_count, /// [in] List at most max_count
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
+ ); ///< @return Error Code, 0 on success
+ int list_by_hash_nibblewise(
+ const vector<string> &path, /// [in] Path to list
+ const ghobject_t &end, /// [in] List only objects < end
+ int max_count, /// [in] List at most max_count
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
+ ); ///< @return Error Code, 0 on success
+
+  /// Create the given levels of subdirectories from the given root.
+  /// The contents of *path* are not changed by this function.
+ int recursive_create_path(vector<string>& path, int level);
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/memory.h"
+#include "include/unordered_map.h"
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include <errno.h>
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+
+#include "IndexManager.h"
+#include "HashIndex.h"
+#include "CollectionIndex.h"
+
+#include "chain_xattr.h"
+
+static int set_version(const char *path, uint32_t version) {
+ bufferlist bl;
+ ::encode(version, bl);
+ return chain_setxattr(path, "user.cephos.collection_version", bl.c_str(),
+ bl.length(), true);
+}
+
+static int get_version(const char *path, uint32_t *version) {
+ bufferptr bp(PATH_MAX);
+ int r = chain_getxattr(path, "user.cephos.collection_version",
+ bp.c_str(), bp.length());
+  if (r < 0) {
+    if (r == -ENOENT) {
+      // no version attribute: treat as a pre-versioning (version 0) collection
+      *version = 0;
+      return 0;
+    } else {
+      return r;
+    }
+  }
+ bp.set_length(r);
+ bufferlist bl;
+ bl.push_back(bp);
+ bufferlist::iterator i = bl.begin();
+ ::decode(*version, i);
+ return 0;
+}
+
+IndexManager::~IndexManager() {
+
+ for (ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.begin();
+ it != col_indices.end(); ++it) {
+
+ delete it->second;
+ it->second = NULL;
+ }
+ col_indices.clear();
+}
+
+
+int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
+ Mutex::Locker l(lock);
+ int r = set_version(path, version);
+ if (r < 0)
+ return r;
+ HashIndex index(c, path, g_conf->filestore_merge_threshold,
+ g_conf->filestore_split_multiple,
+ version,
+ g_conf->filestore_index_retry_probability);
+ return index.init();
+}
+
+int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) {
+ if (upgrade) {
+ // Need to check the collection generation
+ int r;
+ uint32_t version = 0;
+ r = get_version(path, &version);
+ if (r < 0)
+ return r;
+
+ switch (version) {
+ case CollectionIndex::FLAT_INDEX_TAG:
+ case CollectionIndex::HASH_INDEX_TAG: // fall through
+ case CollectionIndex::HASH_INDEX_TAG_2: // fall through
+ case CollectionIndex::HOBJECT_WITH_POOL: {
+ // Must be a HashIndex
+ *index = new HashIndex(c, path, g_conf->filestore_merge_threshold,
+ g_conf->filestore_split_multiple, version);
+ return 0;
+ }
+ default: assert(0);
+ }
+
+ } else {
+ // No need to check
+ *index = new HashIndex(c, path, g_conf->filestore_merge_threshold,
+ g_conf->filestore_split_multiple,
+ CollectionIndex::HOBJECT_WITH_POOL,
+ g_conf->filestore_index_retry_probability);
+ return 0;
+ }
+}
+
+int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) {
+
+ Mutex::Locker l(lock);
+ ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c);
+ if (it == col_indices.end()) {
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/current/%s", baseDir.c_str(), c.to_str().c_str());
+ CollectionIndex* colIndex = NULL;
+ int r = build_index(c, path, &colIndex);
+ if (r < 0)
+ return r;
+ col_indices[c] = colIndex;
+ index->index = colIndex;
+ } else {
+ index->index = it->second;
+ }
+ return 0;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef OS_INDEXMANAGER_H
+#define OS_INDEXMANAGER_H
+
+#include "include/memory.h"
+#include "include/unordered_map.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+
+#include "CollectionIndex.h"
+#include "HashIndex.h"
+
+
+/// Public type for Index
+struct Index {
+ CollectionIndex *index;
+
+ Index() : index(NULL) {}
+ Index(CollectionIndex* index) : index(index) {}
+
+ CollectionIndex *operator->() { return index; }
+ CollectionIndex &operator*() { return *index; }
+};
+
+
+/**
+ * Encapsulates mutual exclusion for CollectionIndexes.
+ *
+ * Allowing a modification (removal or addition of an object) to occur
+ * while a read is occurring (lookup of an object's path and use of
+ * that path) may result in the path becoming invalid. Thus, during
+ * the lifetime of a CollectionIndex object and any paths returned
+ * by it, no other concurrent accesses may be allowed.
+ * This is enforced by using CollectionIndex::access_lock
+ */
+class IndexManager {
+ Mutex lock; ///< Lock for Index Manager
+ bool upgrade;
+ ceph::unordered_map<coll_t, CollectionIndex* > col_indices;
+
+ /**
+ * Index factory
+ *
+ * Encapsulates logic for handling legacy FileStore
+ * layouts
+ *
+ * @param [in] c Collection for which to get index
+ * @param [in] path Path to collection
+ * @param [out] index Index for c
+ * @return error code
+ */
+ int build_index(coll_t c, const char *path, CollectionIndex **index);
+public:
+ /// Constructor
+ IndexManager(bool upgrade) : lock("IndexManager lock"),
+ upgrade(upgrade) {}
+
+ ~IndexManager();
+
+ /**
+ * Reserve and return index for c
+ *
+ * @param [in] c Collection for which to get index
+ * @param [in] baseDir base directory of collections
+ * @param [out] index Index for c
+ * @return error code
+ */
+ int get_index(coll_t c, const string& baseDir, Index *index);
+
+ /**
+ * Initialize index for collection c at path
+ *
+ * @param [in] c Collection for which to init Index
+ * @param [in] path Path to collection
+ * @param [in] filestore_version version of containing FileStore
+ * @return error code
+ */
+ int init_index(coll_t c, const char *path, uint32_t filestore_version);
+};
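+
+/*
+ * Illustrative usage sketch (not part of the upstream header; coll, oid, path
+ * and hardlink below are hypothetical caller-side variables).  A caller asks
+ * the manager for an Index and then drives the underlying CollectionIndex
+ * through operator->, honoring the access_lock contract described above:
+ *
+ *   IndexManager mgr(false);                  // no legacy-upgrade handling
+ *   Index idx;
+ *   int r = mgr.get_index(coll, "/var/lib/ceph/osd/ceph-0", &idx);
+ *   if (r >= 0)
+ *     r = idx->lookup(oid, &path, &hardlink); // any CollectionIndex operation
+ */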
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_JOURNAL_H
+#define CEPH_JOURNAL_H
+
+#include <errno.h>
+
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "common/Finisher.h"
+#include "common/TrackedOp.h"
+#include "os/ObjectStore.h"
+
+class PerfCounters;
+
+class Journal {
+protected:
+ uuid_d fsid;
+ Finisher *finisher;
+public:
+ PerfCounters *logger;
+protected:
+ Cond *do_sync_cond;
+ bool wait_on_full;
+
+public:
+ Journal(uuid_d f, Finisher *fin, Cond *c=0) :
+ fsid(f), finisher(fin), logger(NULL),
+ do_sync_cond(c),
+ wait_on_full(false) { }
+ virtual ~Journal() { }
+
+ virtual int check() = 0; ///< check if journal appears valid
+ virtual int create() = 0; ///< create a fresh journal
+ virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal
+ virtual void close() = 0; ///< close an open journal
+
+ virtual void flush() = 0;
+ virtual void throttle() = 0;
+
+ virtual int dump(ostream& out) { return -EOPNOTSUPP; }
+
+ void set_wait_on_full(bool b) { wait_on_full = b; }
+
+ // writes
+ virtual bool is_writeable() = 0;
+ virtual int make_writeable() = 0;
+ virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
+ Context *oncommit,
+ TrackedOpRef osd_op = TrackedOpRef()) = 0;
+ virtual void commit_start(uint64_t seq) = 0;
+ virtual void committed_thru(uint64_t seq) = 0;
+
+ /// Read next journal entry - asserts on invalid journal
+ virtual bool read_entry(
+ bufferlist &bl, ///< [out] payload on successful read
+ uint64_t &seq ///< [in,out] sequence number on last successful read
+ ) = 0; ///< @return true on successful read, false on journal end
+
+ virtual bool should_commit_now() = 0;
+
+ virtual int prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl) = 0;
+
+ // reads/recovery
+
+};
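+
+/*
+ * Illustrative lifecycle sketch (not part of the upstream header): a concrete
+ * Journal implementation is typically driven roughly as below; error handling
+ * is elided and the variables are hypothetical.
+ *
+ *   Journal *j = ...;                              // some concrete journal
+ *   j->create();                                   // format a fresh journal, or
+ *   j->open(fs_op_seq);                            //   open an existing one
+ *   j->make_writeable();                           // allow submissions
+ *   j->submit_entry(seq, bl, orig_len, oncommit);  // queue a journaled write
+ *   j->commit_start(seq);                          // backing-store commit begins
+ *   j->committed_thru(seq);                        // entries <= seq may be trimmed
+ *   j->close();
+ */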
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "JournalingObjectStore.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+
+
+void JournalingObjectStore::journal_start()
+{
+ dout(10) << "journal_start" << dendl;
+ finisher.start();
+}
+
+void JournalingObjectStore::journal_stop()
+{
+ dout(10) << "journal_stop" << dendl;
+ finisher.stop();
+}
+
+// journal_replay() makes the journal writeable; this closes it back out.
+void JournalingObjectStore::journal_write_close()
+{
+ if (journal) {
+ journal->close();
+ delete journal;
+ journal = 0;
+ }
+ apply_manager.reset();
+}
+
+int JournalingObjectStore::journal_replay(uint64_t fs_op_seq)
+{
+ dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl;
+
+ if (g_conf->journal_replay_from) {
+ dout(0) << "journal_replay forcing replay from " << g_conf->journal_replay_from
+ << " instead of " << fs_op_seq << dendl;
+ // the previous op is the last one committed
+ fs_op_seq = g_conf->journal_replay_from - 1;
+ }
+
+ uint64_t op_seq = fs_op_seq;
+ apply_manager.init_seq(fs_op_seq);
+
+ if (!journal) {
+ submit_manager.set_op_seq(op_seq);
+ return 0;
+ }
+
+ int err = journal->open(op_seq);
+ if (err < 0) {
+ dout(3) << "journal_replay open failed with "
+ << cpp_strerror(err) << dendl;
+ delete journal;
+ journal = 0;
+ return err;
+ }
+
+ replaying = true;
+
+ int count = 0;
+ while (1) {
+ bufferlist bl;
+ uint64_t seq = op_seq + 1;
+ if (!journal->read_entry(bl, seq)) {
+ dout(3) << "journal_replay: end of journal, done." << dendl;
+ break;
+ }
+
+ if (seq <= op_seq) {
+ dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << op_seq << dendl;
+ continue;
+ }
+ assert(op_seq == seq-1);
+
+ dout(3) << "journal_replay: applying op seq " << seq << dendl;
+ bufferlist::iterator p = bl.begin();
+ list<Transaction*> tls;
+ while (!p.end()) {
+ Transaction *t = new Transaction(p);
+ tls.push_back(t);
+ }
+
+ apply_manager.op_apply_start(seq);
+ int r = do_transactions(tls, seq);
+ apply_manager.op_apply_finish(seq);
+
+    op_seq = seq;
+    count++;
+
+ while (!tls.empty()) {
+ delete tls.front();
+ tls.pop_front();
+ }
+
+ dout(3) << "journal_replay: r = " << r << ", op_seq now " << op_seq << dendl;
+ }
+
+ replaying = false;
+
+ submit_manager.set_op_seq(op_seq);
+
+ // done reading, make writeable.
+ err = journal->make_writeable();
+ if (err < 0)
+ return err;
+
+ return count;
+}
+
+
+// ------------------------------------
+
+uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op)
+{
+ Mutex::Locker l(apply_lock);
+ while (blocked) {
+ // note: this only happens during journal replay
+ dout(10) << "op_apply_start blocked, waiting" << dendl;
+ blocked_cond.Wait(apply_lock);
+ }
+ dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> " << (open_ops+1) << dendl;
+ assert(!blocked);
+ assert(op > committed_seq);
+ open_ops++;
+ return op;
+}
+
+void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op)
+{
+ Mutex::Locker l(apply_lock);
+ dout(10) << "op_apply_finish " << op << " open_ops " << open_ops
+ << " -> " << (open_ops-1)
+ << ", max_applied_seq " << max_applied_seq << " -> " << MAX(op, max_applied_seq)
+ << dendl;
+ --open_ops;
+ assert(open_ops >= 0);
+
+ // signal a blocked commit_start (only needed during journal replay)
+ if (blocked) {
+ blocked_cond.Signal();
+ }
+
+  // there can be multiple applies in flight; track the max value we
+  // see.  Note that we can't _read_ this value and learn anything
+  // meaningful unless/until we've quiesced all in-flight applies.
+ if (op > max_applied_seq)
+ max_applied_seq = op;
+}
+
+uint64_t JournalingObjectStore::SubmitManager::op_submit_start()
+{
+ lock.Lock();
+ uint64_t op = ++op_seq;
+ dout(10) << "op_submit_start " << op << dendl;
+ return op;
+}
+
+void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op)
+{
+ dout(10) << "op_submit_finish " << op << dendl;
+ if (op != op_submitted + 1) {
+ dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
+ << ", OUT OF ORDER" << dendl;
+ assert(0 == "out of order op_submit_finish");
+ }
+ op_submitted = op;
+ lock.Unlock();
+}
+
+
+// ------------------------------------------
+
+void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c)
+{
+ Mutex::Locker l(com_lock);
+ assert(c);
+ commit_waiters[op].push_back(c);
+}
+
+bool JournalingObjectStore::ApplyManager::commit_start()
+{
+ bool ret = false;
+
+ uint64_t _committing_seq = 0;
+ {
+ Mutex::Locker l(apply_lock);
+ dout(10) << "commit_start max_applied_seq " << max_applied_seq
+ << ", open_ops " << open_ops
+ << dendl;
+ blocked = true;
+ while (open_ops > 0) {
+ dout(10) << "commit_start waiting for " << open_ops << " open ops to drain" << dendl;
+ blocked_cond.Wait(apply_lock);
+ }
+ assert(open_ops == 0);
+ dout(10) << "commit_start blocked, all open_ops have completed" << dendl;
+ {
+ Mutex::Locker l(com_lock);
+ if (max_applied_seq == committed_seq) {
+ dout(10) << "commit_start nothing to do" << dendl;
+ blocked = false;
+ assert(commit_waiters.empty());
+ goto out;
+ }
+
+ _committing_seq = committing_seq = max_applied_seq;
+
+ dout(10) << "commit_start committing " << committing_seq
+ << ", still blocked" << dendl;
+ }
+ }
+ ret = true;
+
+ out:
+ if (journal)
+ journal->commit_start(_committing_seq); // tell the journal too
+ return ret;
+}
+
+void JournalingObjectStore::ApplyManager::commit_started()
+{
+ Mutex::Locker l(apply_lock);
+ // allow new ops. (underlying fs should now be committing all prior ops)
+ dout(10) << "commit_started committing " << committing_seq << ", unblocking" << dendl;
+ blocked = false;
+ blocked_cond.Signal();
+}
+
+void JournalingObjectStore::ApplyManager::commit_finish()
+{
+ Mutex::Locker l(com_lock);
+ dout(10) << "commit_finish thru " << committing_seq << dendl;
+
+ if (journal)
+ journal->committed_thru(committing_seq);
+
+ committed_seq = committing_seq;
+
+ map<version_t, vector<Context*> >::iterator p = commit_waiters.begin();
+ while (p != commit_waiters.end() &&
+ p->first <= committing_seq) {
+ finisher.queue(p->second);
+ commit_waiters.erase(p++);
+ }
+}
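+
+// Illustrative commit cycle (derived from the three methods above; the caller
+// shown is hypothetical): a sync thread would drive one cycle roughly as
+//
+//   if (apply_manager.commit_start()) {  // quiesce appliers, latch committing_seq
+//     uint64_t cp = apply_manager.get_committing_seq();
+//     apply_manager.commit_started();    // let new applies proceed
+//     /* ... sync the backing filesystem through cp ... */
+//     apply_manager.commit_finish();     // record committed_seq, queue waiters
+//   }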
+
+void JournalingObjectStore::_op_journal_transactions(
+ bufferlist& tbl, uint32_t orig_len, uint64_t op,
+ Context *onjournal, TrackedOpRef osd_op)
+{
+ if (osd_op.get())
+ dout(10) << "op_journal_transactions " << op << " reqid_t "
+ << (static_cast<OpRequest *>(osd_op.get()))->get_reqid() << dendl;
+ else
+ dout(10) << "op_journal_transactions " << op << dendl;
+
+ if (journal && journal->is_writeable()) {
+ journal->submit_entry(op, tbl, orig_len, onjournal, osd_op);
+ } else if (onjournal) {
+ apply_manager.add_waiter(op, onjournal);
+ }
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_JOURNALINGOBJECTSTORE_H
+#define CEPH_JOURNALINGOBJECTSTORE_H
+
+#include "os/ObjectStore.h"
+#include "Journal.h"
+#include "FileJournal.h"
+#include "common/RWLock.h"
+
+class JournalingObjectStore : public ObjectStore {
+protected:
+ Journal *journal;
+ Finisher finisher;
+
+
+ class SubmitManager {
+ Mutex lock;
+ uint64_t op_seq;
+ uint64_t op_submitted;
+ public:
+ SubmitManager() :
+ lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context),
+ op_seq(0), op_submitted(0)
+ {}
+ uint64_t op_submit_start();
+ void op_submit_finish(uint64_t op);
+ void set_op_seq(uint64_t seq) {
+ Mutex::Locker l(lock);
+ op_submitted = op_seq = seq;
+ }
+ uint64_t get_op_seq() {
+ return op_seq;
+ }
+ } submit_manager;
+
+ class ApplyManager {
+ Journal *&journal;
+ Finisher &finisher;
+
+ Mutex apply_lock;
+ bool blocked;
+ Cond blocked_cond;
+ int open_ops;
+ uint64_t max_applied_seq;
+
+ Mutex com_lock;
+ map<version_t, vector<Context*> > commit_waiters;
+ uint64_t committing_seq, committed_seq;
+
+ public:
+ ApplyManager(Journal *&j, Finisher &f) :
+ journal(j), finisher(f),
+ apply_lock("JOS::ApplyManager::apply_lock", false, true, false, g_ceph_context),
+ blocked(false),
+ open_ops(0),
+ max_applied_seq(0),
+ com_lock("JOS::ApplyManager::com_lock", false, true, false, g_ceph_context),
+ committing_seq(0), committed_seq(0) {}
+ void reset() {
+ assert(open_ops == 0);
+ assert(blocked == false);
+ max_applied_seq = 0;
+ committing_seq = 0;
+ committed_seq = 0;
+ }
+ void add_waiter(uint64_t, Context*);
+ uint64_t op_apply_start(uint64_t op);
+ void op_apply_finish(uint64_t op);
+ bool commit_start();
+ void commit_started();
+ void commit_finish();
+ bool is_committing() {
+ Mutex::Locker l(com_lock);
+ return committing_seq != committed_seq;
+ }
+ uint64_t get_committed_seq() {
+ Mutex::Locker l(com_lock);
+ return committed_seq;
+ }
+ uint64_t get_committing_seq() {
+ Mutex::Locker l(com_lock);
+ return committing_seq;
+ }
+ void init_seq(uint64_t fs_op_seq) {
+ {
+ Mutex::Locker l(com_lock);
+ committed_seq = fs_op_seq;
+ committing_seq = fs_op_seq;
+ }
+ {
+ Mutex::Locker l(apply_lock);
+ max_applied_seq = fs_op_seq;
+ }
+ }
+ } apply_manager;
+
+ bool replaying;
+
+protected:
+ void journal_start();
+ void journal_stop();
+ void journal_write_close();
+ int journal_replay(uint64_t fs_op_seq);
+
+ void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op,
+ Context *onjournal, TrackedOpRef osd_op);
+
+ virtual int do_transactions(list<ObjectStore::Transaction*>& tls, uint64_t op_seq) = 0;
+
+public:
+ bool is_committing() {
+ return apply_manager.is_committing();
+ }
+ uint64_t get_committed_seq() {
+ return apply_manager.get_committed_seq();
+ }
+
+public:
+ JournalingObjectStore(const std::string& path)
+ : ObjectStore(path),
+ journal(NULL),
+ finisher(g_ceph_context, "JournalObjectStore"),
+ apply_manager(journal, finisher),
+ replaying(false) {}
+
+ ~JournalingObjectStore() {
+ }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include <errno.h>
+#include <string.h>
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+#include "common/ceph_crypto.h"
+#include "include/compat.h"
+#include "chain_xattr.h"
+
+#include "LFNIndex.h"
+using ceph::crypto::SHA1;
+
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") "
+
+
+const string LFNIndex::LFN_ATTR = "user.cephos.lfn";
+const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash.";
+const string LFNIndex::SUBDIR_PREFIX = "DIR_";
+const string LFNIndex::FILENAME_COOKIE = "long";
+const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN -
+ FILENAME_COOKIE.size() -
+ FILENAME_EXTRA;
+void LFNIndex::maybe_inject_failure()
+{
+ if (error_injection_enabled) {
+ if (current_failure > last_failure &&
+ (((double)(rand() % 10000))/((double)(10000))
+ < error_injection_probability)) {
+ last_failure = current_failure;
+ current_failure = 0;
+ throw RetryException();
+ }
+ ++current_failure;
+ }
+}
+
+// Helper to close fd's when we leave scope. This is useful when used
+// in combination with RetryException, thrown by the above.
+struct FDCloser {
+ int fd;
+ FDCloser(int f) : fd(f) {}
+ ~FDCloser() {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+};
+
+
+/* Public methods */
+
+
+int LFNIndex::init()
+{
+ return _init();
+}
+
+int LFNIndex::created(const ghobject_t &oid, const char *path)
+{
+ WRAP_RETRY(
+ vector<string> path_comp;
+ string short_name;
+ r = decompose_full_path(path, &path_comp, 0, &short_name);
+ if (r < 0)
+ goto out;
+ r = lfn_created(path_comp, oid, short_name);
+ if (r < 0)
+ goto out;
+ r = _created(path_comp, oid, short_name);
+ if (r < 0)
+ goto out;
+ );
+}
+
+int LFNIndex::unlink(const ghobject_t &oid)
+{
+ WRAP_RETRY(
+ vector<string> path;
+ string short_name;
+ r = _lookup(oid, &path, &short_name, NULL);
+ if (r < 0) {
+ goto out;
+ }
+ r = _remove(path, oid, short_name);
+ if (r < 0) {
+ goto out;
+ }
+ );
+}
+
+int LFNIndex::lookup(const ghobject_t &oid,
+ IndexedPath *out_path,
+ int *hardlink)
+{
+ WRAP_RETRY(
+ vector<string> path;
+ string short_name;
+ r = _lookup(oid, &path, &short_name, hardlink);
+ if (r < 0)
+ goto out;
+ string full_path = get_full_path(path, short_name);
+ *out_path = IndexedPath(new Path(full_path, this));
+ r = 0;
+ );
+}
+
+int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
+{
+ return _pre_hash_collection(pg_num, expected_num_objs);
+}
+
+
+int LFNIndex::collection_list_partial(const ghobject_t &start,
+ const ghobject_t &end,
+ bool sort_bitwise,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next)
+{
+ return _collection_list_partial(start, end, sort_bitwise, max_count, ls, next);
+}
+
+/* Derived class utility methods */
+
+int LFNIndex::fsync_dir(const vector<string> &path)
+{
+ maybe_inject_failure();
+ int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY);
+ if (fd < 0)
+ return -errno;
+ FDCloser f(fd);
+ maybe_inject_failure();
+ int r = ::fsync(fd);
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::link_object(const vector<string> &from,
+ const vector<string> &to,
+ const ghobject_t &oid,
+ const string &from_short_name)
+{
+ int r;
+ string from_path = get_full_path(from, from_short_name);
+ string to_path;
+ maybe_inject_failure();
+ r = lfn_get_name(to, oid, 0, &to_path, 0);
+ if (r < 0)
+ return r;
+ maybe_inject_failure();
+ r = ::link(from_path.c_str(), to_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::remove_objects(const vector<string> &dir,
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining)
+{
+ set<string> clean_chains;
+ for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin();
+ to_clean != to_remove.end();
+ ++to_clean) {
+ if (!lfn_is_hashed_filename(to_clean->first)) {
+ maybe_inject_failure();
+ int r = ::unlink(get_full_path(dir, to_clean->first).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ continue;
+ }
+ if (clean_chains.count(lfn_get_short_name(to_clean->second, 0)))
+ continue;
+ set<int> holes;
+ map<int, pair<string, ghobject_t> > chain;
+ for (int i = 0; ; ++i) {
+ string short_name = lfn_get_short_name(to_clean->second, i);
+ if (remaining->count(short_name)) {
+ chain[i] = *(remaining->find(short_name));
+ } else if (to_remove.count(short_name)) {
+ holes.insert(i);
+ } else {
+ break;
+ }
+ }
+
+ map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin();
+ for (set<int>::iterator i = holes.begin();
+ i != holes.end();
+ ++i) {
+ if (candidate == chain.rend() || *i > candidate->first) {
+ string remove_path_name =
+ get_full_path(dir, lfn_get_short_name(to_clean->second, *i));
+ maybe_inject_failure();
+ int r = ::unlink(remove_path_name.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ continue;
+ }
+ string from = get_full_path(dir, candidate->second.first);
+ string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i));
+ maybe_inject_failure();
+ int r = ::rename(from.c_str(), to.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ remaining->erase(candidate->second.first);
+ remaining->insert(pair<string, ghobject_t>(
+ lfn_get_short_name(candidate->second.second, *i),
+ candidate->second.second));
+ ++candidate;
+ }
+ if (!holes.empty())
+ clean_chains.insert(lfn_get_short_name(to_clean->second, 0));
+ }
+ return 0;
+}
+
+int LFNIndex::move_objects(const vector<string> &from,
+ const vector<string> &to)
+{
+ map<string, ghobject_t> to_move;
+ int r;
+ r = list_objects(from, 0, NULL, &to_move);
+ if (r < 0)
+ return r;
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ string from_path = get_full_path(from, i->first);
+ string to_path, to_name;
+ r = lfn_get_name(to, i->second, &to_name, &to_path, 0);
+ if (r < 0)
+ return r;
+ maybe_inject_failure();
+ r = ::link(from_path.c_str(), to_path.c_str());
+ if (r < 0 && errno != EEXIST)
+ return -errno;
+ maybe_inject_failure();
+ r = lfn_created(to, i->second, to_name);
+ maybe_inject_failure();
+ if (r < 0)
+ return r;
+ }
+ r = fsync_dir(to);
+ if (r < 0)
+ return r;
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ maybe_inject_failure();
+ r = ::unlink(get_full_path(from, i->first).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ return fsync_dir(from);
+}
+
+int LFNIndex::remove_object(const vector<string> &from,
+ const ghobject_t &oid)
+{
+ string short_name;
+ int r, exist;
+ maybe_inject_failure();
+ r = get_mangled_name(from, oid, &short_name, &exist);
+ maybe_inject_failure();
+ if (r < 0)
+ return r;
+ if (exist == 0)
+ return -ENOENT;
+ return lfn_unlink(from, oid, short_name);
+}
+
+int LFNIndex::get_mangled_name(const vector<string> &from,
+ const ghobject_t &oid,
+ string *mangled_name, int *hardlink)
+{
+ return lfn_get_name(from, oid, mangled_name, 0, hardlink);
+}
+
+int LFNIndex::move_subdir(
+ LFNIndex &from,
+ LFNIndex &dest,
+ const vector<string> &path,
+ string dir
+ )
+{
+ vector<string> sub_path(path.begin(), path.end());
+ sub_path.push_back(dir);
+ string from_path(from.get_full_path_subdir(sub_path));
+ string to_path(dest.get_full_path_subdir(sub_path));
+ int r = ::rename(from_path.c_str(), to_path.c_str());
+ if (r < 0)
+ return -errno;
+ return 0;
+}
+
+int LFNIndex::move_object(
+ LFNIndex &from,
+ LFNIndex &dest,
+ const vector<string> &path,
+ const pair<string, ghobject_t> &obj
+ )
+{
+ string from_path(from.get_full_path(path, obj.first));
+ string to_path;
+ string to_name;
+ int exists;
+ int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists) {
+ r = ::link(from_path.c_str(), to_path.c_str());
+ if (r < 0)
+ return r;
+ }
+ r = dest.lfn_created(path, obj.second, to_name);
+ if (r < 0)
+ return r;
+ r = dest.fsync_dir(path);
+ if (r < 0)
+ return r;
+ r = from.remove_object(path, obj.second);
+ if (r < 0)
+ return r;
+ return from.fsync_dir(path);
+}
+
+
+static int get_hobject_from_oinfo(const char *dir, const char *file,
+ ghobject_t *o)
+{
+ char path[PATH_MAX];
+ bufferptr bp(PATH_MAX);
+ snprintf(path, sizeof(path), "%s/%s", dir, file);
+ // Hack, user.ceph._ is the attribute used to store the object info
+ int r = chain_getxattr(path, "user.ceph._", bp.c_str(), bp.length());
+ if (r < 0)
+ return r;
+ bufferlist bl;
+ bl.push_back(bp);
+ object_info_t oi(bl);
+ *o = ghobject_t(oi.soid);
+ return 0;
+}
+
+
+int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
+ long *handle, map<string, ghobject_t> *out)
+{
+ string to_list_path = get_full_path_subdir(to_list);
+ DIR *dir = ::opendir(to_list_path.c_str());
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ int r;
+ if (!dir) {
+ return -errno;
+ }
+
+ if (handle && *handle) {
+ seekdir(dir, *handle);
+ }
+
+ struct dirent *de;
+ int listed = 0;
+ bool end = false;
+ while (!::readdir_r(dir, reinterpret_cast<struct dirent*>(buf), &de)) {
+ if (!de) {
+ end = true;
+ break;
+ }
+ if (max_objs > 0 && listed >= max_objs) {
+ break;
+ }
+ if (de->d_name[0] == '.')
+ continue;
+ string short_name(de->d_name);
+ ghobject_t obj;
+ if (lfn_is_object(short_name)) {
+ r = lfn_translate(to_list, short_name, &obj);
+ if (r < 0) {
+ r = -errno;
+ goto cleanup;
+ } else if (r > 0) {
+ string long_name = lfn_generate_object_name(obj);
+ if (!lfn_must_hash(long_name)) {
+ assert(long_name == short_name);
+ }
+ if (index_version == HASH_INDEX_TAG)
+ get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
+
+ out->insert(pair<string, ghobject_t>(short_name, obj));
+ ++listed;
+ } else {
+ continue;
+ }
+ }
+ }
+
+ if (handle && !end) {
+ *handle = telldir(dir);
+ }
+
+ r = 0;
+ cleanup:
+ ::closedir(dir);
+ return r;
+}
+
+int LFNIndex::list_subdirs(const vector<string> &to_list,
+ vector<string> *out)
+{
+ string to_list_path = get_full_path_subdir(to_list);
+ DIR *dir = ::opendir(to_list_path.c_str());
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ if (!dir)
+ return -errno;
+
+ struct dirent *de;
+ while (!::readdir_r(dir, reinterpret_cast<struct dirent*>(buf), &de)) {
+ if (!de) {
+ break;
+ }
+ string short_name(de->d_name);
+ string demangled_name;
+ if (lfn_is_subdir(short_name, &demangled_name)) {
+ out->push_back(demangled_name);
+ }
+ }
+
+ ::closedir(dir);
+ return 0;
+}
+
+int LFNIndex::create_path(const vector<string> &to_create)
+{
+ maybe_inject_failure();
+ int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777);
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::remove_path(const vector<string> &to_remove)
+{
+ maybe_inject_failure();
+ int r = ::rmdir(get_full_path_subdir(to_remove).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::path_exists(const vector<string> &to_check, int *exists)
+{
+ string full_path = get_full_path_subdir(to_check);
+ struct stat buf;
+ if (::stat(full_path.c_str(), &buf)) {
+ int r = -errno;
+ if (r == -ENOENT) {
+ *exists = 0;
+ return 0;
+ } else {
+ return r;
+ }
+ } else {
+ *exists = 1;
+ return 0;
+ }
+}
+
+int LFNIndex::add_attr_path(const vector<string> &path,
+ const string &attr_name,
+ bufferlist &attr_value)
+{
+ string full_path = get_full_path_subdir(path);
+ maybe_inject_failure();
+ return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
+ reinterpret_cast<void *>(attr_value.c_str()),
+ attr_value.length());
+}
+
+int LFNIndex::get_attr_path(const vector<string> &path,
+ const string &attr_name,
+ bufferlist &attr_value)
+{
+ string full_path = get_full_path_subdir(path);
+ size_t size = 1024; // Initial
+ while (1) {
+ bufferptr buf(size);
+ int r = chain_getxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
+ reinterpret_cast<void *>(buf.c_str()),
+ size);
+ if (r > 0) {
+ buf.set_length(r);
+ attr_value.push_back(buf);
+ break;
+ } else {
+ r = -errno;
+ if (r == -ERANGE) {
+ size *= 2;
+ } else {
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
+int LFNIndex::remove_attr_path(const vector<string> &path,
+ const string &attr_name)
+{
+ string full_path = get_full_path_subdir(path);
+ string mangled_attr_name = mangle_attr_name(attr_name);
+ maybe_inject_failure();
+ return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str());
+}
+
+string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid)
+{
+ char s[FILENAME_MAX_LEN];
+ char *end = s + sizeof(s);
+ char *t = s;
+
+ assert(oid.generation == ghobject_t::NO_GEN);
+ const char *i = oid.hobj.oid.name.c_str();
+ // Escape subdir prefix
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+ *t++ = '\\';
+ *t++ = 'd';
+ i += 4;
+ }
+ while (*i && t < end) {
+ if (*i == '\\') {
+ *t++ = '\\';
+ *t++ = '\\';
+ } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
+ *t++ = '\\';
+ *t++ = '.';
+ } else if (*i == '/') {
+ *t++ = '\\';
+ *t++ = 's';
+ } else
+ *t++ = *i;
+ i++;
+ }
+
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "_head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "_snapdir");
+ else
+ t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+
+ return string(s);
+}
+
+static void append_escaped(string::const_iterator begin,
+ string::const_iterator end,
+ string *out)
+{
+ for (string::const_iterator i = begin; i != end; ++i) {
+ if (*i == '\\') {
+ out->append("\\\\");
+ } else if (*i == '/') {
+ out->append("\\s");
+ } else if (*i == '_') {
+ out->append("\\u");
+ } else if (*i == '\0') {
+ out->append("\\n");
+ } else {
+ out->append(i, i+1);
+ }
+ }
+}
+
+string LFNIndex::lfn_generate_object_name(const ghobject_t &oid)
+{
+ if (index_version == HASH_INDEX_TAG)
+ return lfn_generate_object_name_keyless(oid);
+ if (index_version == HASH_INDEX_TAG_2)
+ return lfn_generate_object_name_poolless(oid);
+
+ string full_name;
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+ full_name.append("\\d");
+ i += 4;
+ } else if (oid.hobj.oid.name[0] == '.') {
+ full_name.append("\\.");
+ ++i;
+ }
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
+ full_name.append("_");
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
+ full_name.append("_");
+
+ char buf[PATH_MAX];
+ char *t = buf;
+ char *end = t + sizeof(buf);
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "snapdir");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+ full_name += string(buf);
+ full_name.append("_");
+
+ append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name);
+ full_name.append("_");
+
+ t = buf;
+ end = t + sizeof(buf);
+ if (oid.hobj.pool == -1)
+ t += snprintf(t, end - t, "none");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool);
+ full_name += string(buf);
+
+ if (oid.generation != ghobject_t::NO_GEN ||
+ oid.shard_id != shard_id_t::NO_SHARD) {
+ full_name.append("_");
+
+ t = buf;
+ end = t + sizeof(buf);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.generation);
+ full_name += string(buf);
+
+ full_name.append("_");
+
+ t = buf;
+ end = t + sizeof(buf);
+ t += snprintf(t, end - t, "%x", (int)oid.shard_id);
+ full_name += string(buf);
+ }
+
+ return full_name;
+}
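+
+// Illustrative example of the layout produced above (the values are
+// hypothetical): an object named "foo" with no key, the head snapshot,
+// hash 0x8F2A91C3, an empty namespace and pool 1 comes out roughly as
+//
+//   foo__head_8F2A91C3__1
+//
+// with "_<generation>_<shard>" appended only when a non-default generation or
+// shard id is present.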
+
+string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid)
+{
+ if (index_version == HASH_INDEX_TAG)
+ return lfn_generate_object_name_keyless(oid);
+
+ assert(oid.generation == ghobject_t::NO_GEN);
+ string full_name;
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+ full_name.append("\\d");
+ i += 4;
+ } else if (oid.hobj.oid.name[0] == '.') {
+ full_name.append("\\.");
+ ++i;
+ }
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
+ full_name.append("_");
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
+ full_name.append("_");
+
+ char snap_with_hash[PATH_MAX];
+ char *t = snap_with_hash;
+ char *end = t + sizeof(snap_with_hash);
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "snapdir");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+ full_name += string(snap_with_hash);
+ return full_name;
+}
+
+int LFNIndex::lfn_get_name(const vector<string> &path,
+ const ghobject_t &oid,
+ string *mangled_name, string *out_path,
+ int *hardlink)
+{
+ string subdir_path = get_full_path_subdir(path);
+ string full_name = lfn_generate_object_name(oid);
+ int r;
+
+ if (!lfn_must_hash(full_name)) {
+ if (mangled_name)
+ *mangled_name = full_name;
+ if (out_path)
+ *out_path = get_full_path(path, full_name);
+ if (hardlink) {
+ struct stat buf;
+ string full_path = get_full_path(path, full_name);
+ maybe_inject_failure();
+ r = ::stat(full_path.c_str(), &buf);
+ if (r < 0) {
+ if (errno == ENOENT)
+ *hardlink = 0;
+ else
+ return -errno;
+ } else {
+ *hardlink = buf.st_nlink;
+ }
+ }
+ return 0;
+ }
+
+ int i = 0;
+ string candidate;
+ string candidate_path;
+ char buf[FILENAME_MAX_LEN + 1];
+ for ( ; ; ++i) {
+ candidate = lfn_get_short_name(oid, i);
+ candidate_path = get_full_path(path, candidate);
+ r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(),
+ buf, sizeof(buf));
+ if (r < 0) {
+ if (errno != ENODATA && errno != ENOENT)
+ return -errno;
+ if (errno == ENODATA) {
+ // Left over from incomplete transaction, it'll be replayed
+ maybe_inject_failure();
+ r = ::unlink(candidate_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink)
+ *hardlink = 0;
+ return 0;
+ }
+ assert(r > 0);
+ buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
+ if (!strcmp(buf, full_name.c_str())) {
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+      if (hardlink) {
+	struct stat st;
+	r = ::stat(candidate_path.c_str(), &st);
+	if (r < 0)
+	  return -errno;
+	*hardlink = st.st_nlink;
+      }
+ return 0;
+ }
+ r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(),
+ buf, sizeof(buf));
+ if (r > 0) {
+ // only consider alt name if nlink > 1
+ struct stat st;
+ int rc = ::stat(candidate_path.c_str(), &st);
+ if (rc < 0)
+ return -errno;
+ if (st.st_nlink <= 1) {
+ // left over from incomplete unlink, remove
+ maybe_inject_failure();
+ dout(20) << __func__ << " found extra alt attr for " << candidate_path
+ << ", long name " << string(buf, r) << dendl;
+ rc = chain_removexattr(candidate_path.c_str(),
+ get_alt_lfn_attr().c_str());
+ maybe_inject_failure();
+ if (rc < 0)
+ return rc;
+ continue;
+ }
+ buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
+ if (!strcmp(buf, full_name.c_str())) {
+ dout(20) << __func__ << " used alt attr for " << full_name << dendl;
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink)
+ *hardlink = st.st_nlink;
+ return 0;
+ }
+ }
+ }
+ assert(0); // Unreachable
+ return 0;
+}
+
+int LFNIndex::lfn_created(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name)
+{
+ if (!lfn_is_hashed_filename(mangled_name))
+ return 0;
+ string full_path = get_full_path(path, mangled_name);
+ string full_name = lfn_generate_object_name(oid);
+ maybe_inject_failure();
+
+ // if the main attr exists and is different, move it to the alt attr.
+ char buf[FILENAME_MAX_LEN + 1];
+ int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(),
+ buf, sizeof(buf));
+ if (r >= 0 && (r != (int)full_name.length() ||
+ memcmp(buf, full_name.c_str(), full_name.length()))) {
+ dout(20) << __func__ << " " << mangled_name
+ << " moving old name to alt attr "
+ << string(buf, r)
+ << ", new name is " << full_name << dendl;
+ r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(),
+ buf, r);
+ if (r < 0)
+ return r;
+ }
+
+ return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
+ full_name.c_str(), full_name.size());
+}
+
+int LFNIndex::lfn_unlink(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name)
+{
+ if (!lfn_is_hashed_filename(mangled_name)) {
+ string full_path = get_full_path(path, mangled_name);
+ maybe_inject_failure();
+ int r = ::unlink(full_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ return 0;
+ }
+ string subdir_path = get_full_path_subdir(path);
+
+
+ int i = 0;
+ for ( ; ; ++i) {
+ string candidate = lfn_get_short_name(oid, i);
+ if (candidate == mangled_name)
+ break;
+ }
+ int removed_index = i;
+ ++i;
+ for ( ; ; ++i) {
+ struct stat buf;
+ string to_check = lfn_get_short_name(oid, i);
+ string to_check_path = get_full_path(path, to_check);
+ int r = ::stat(to_check_path.c_str(), &buf);
+ if (r < 0) {
+ if (errno == ENOENT) {
+ break;
+ } else {
+ return -errno;
+ }
+ }
+ }
+ string full_path = get_full_path(path, mangled_name);
+ int fd = ::open(full_path.c_str(), O_RDONLY);
+ if (fd < 0)
+ return -errno;
+ FDCloser f(fd);
+ if (i == removed_index + 1) {
+ maybe_inject_failure();
+ int r = ::unlink(full_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ } else {
+ string& rename_to = full_path;
+ string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
+ maybe_inject_failure();
+ int r = ::rename(rename_from.c_str(), rename_to.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ struct stat st;
+ int r = ::fstat(fd, &st);
+ if (r == 0 && st.st_nlink > 0) {
+ // remove alt attr
+ dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
+ fsync_dir(path);
+ chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
+ }
+ return r;
+}
+
+int LFNIndex::lfn_translate(const vector<string> &path,
+ const string &short_name,
+ ghobject_t *out)
+{
+ if (!lfn_is_hashed_filename(short_name)) {
+ return lfn_parse_object_name(short_name, out);
+ }
+ // Get lfn_attr
+ string full_path = get_full_path(path, short_name);
+ char attr[PATH_MAX];
+ int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1);
+ if (r < 0)
+ return -errno;
+ if (r < (int)sizeof(attr))
+ attr[r] = '\0';
+
+ string long_name(attr);
+ return lfn_parse_object_name(long_name, out);
+}
+
+bool LFNIndex::lfn_is_object(const string &short_name)
+{
+ return lfn_is_hashed_filename(short_name) || !lfn_is_subdir(short_name, 0);
+}
+
+bool LFNIndex::lfn_is_subdir(const string &name, string *demangled)
+{
+ if (name.substr(0, SUBDIR_PREFIX.size()) == SUBDIR_PREFIX) {
+ if (demangled)
+ *demangled = demangle_path_component(name);
+ return 1;
+ }
+ return 0;
+}
+
+static int parse_object(const char *s, ghobject_t& o)
+{
+ const char *hash = s + strlen(s) - 1;
+ while (*hash != '_' &&
+ hash > s)
+ hash--;
+ const char *bar = hash - 1;
+ while (*bar != '_' &&
+ bar > s)
+ bar--;
+ if (*bar == '_') {
+ char buf[bar-s + 1];
+ char *t = buf;
+ const char *i = s;
+ while (i < bar) {
+ if (*i == '\\') {
+ i++;
+ switch (*i) {
+ case '\\': *t++ = '\\'; break;
+ case '.': *t++ = '.'; break;
+ case 's': *t++ = '/'; break;
+ case 'd': {
+ *t++ = 'D';
+ *t++ = 'I';
+ *t++ = 'R';
+ *t++ = '_';
+ break;
+ }
+ default: assert(0);
+ }
+ } else {
+ *t++ = *i;
+ }
+ i++;
+ }
+ *t = 0;
+ o.hobj.oid.name = string(buf, t-buf);
+ if (strncmp(bar+1, "head", 4) == 0)
+ o.hobj.snap = CEPH_NOSNAP;
+ else if (strncmp(bar+1, "snapdir", 7) == 0)
+ o.hobj.snap = CEPH_SNAPDIR;
+ else
+ o.hobj.snap = strtoull(bar+1, NULL, 16);
+
+ uint32_t hobject_hash_input;
+ sscanf(hash, "_%X", &hobject_hash_input);
+ o.hobj.set_hash(hobject_hash_input);
+
+ return 1;
+ }
+ return 0;
+}
+
+bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
+{
+ bool r = parse_object(long_name.c_str(), *out);
+ int64_t pool = -1;
+ spg_t pg;
+ if (coll().is_pg_prefix(&pg))
+ pool = (int64_t)pg.pgid.pool();
+ out->hobj.pool = pool;
+ if (!r) return r;
+ string temp = lfn_generate_object_name(*out);
+ return r;
+}
+
+static bool append_unescaped(string::const_iterator begin,
+ string::const_iterator end,
+ string *out)
+{
+ for (string::const_iterator i = begin; i != end; ++i) {
+ if (*i == '\\') {
+ ++i;
+ if (*i == '\\')
+ out->append("\\");
+ else if (*i == 's')
+ out->append("/");
+ else if (*i == 'n')
+ (*out) += '\0';
+ else if (*i == 'u')
+ out->append("_");
+ else
+ return false;
+ } else {
+ out->append(i, i+1);
+ }
+ }
+ return true;
+}
+
+bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
+ ghobject_t *out)
+{
+ string name;
+ string key;
+ uint32_t hash;
+ snapid_t snap;
+
+ string::const_iterator current = long_name.begin();
+ if (*current == '\\') {
+ ++current;
+ if (current == long_name.end()) {
+ return false;
+ } else if (*current == 'd') {
+ name.append("DIR_");
+ ++current;
+ } else if (*current == '.') {
+ name.append(".");
+ ++current;
+ } else {
+ --current;
+ }
+ }
+
+ string::const_iterator end = current;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ if (!append_unescaped(current, end, &name))
+ return false;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ if (!append_unescaped(current, end, &key))
+ return false;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ string snap_str(current, end);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end != long_name.end())
+ return false;
+ string hash_str(current, end);
+
+ if (snap_str == "head")
+ snap = CEPH_NOSNAP;
+ else if (snap_str == "snapdir")
+ snap = CEPH_SNAPDIR;
+ else
+ snap = strtoull(snap_str.c_str(), NULL, 16);
+ sscanf(hash_str.c_str(), "%X", &hash);
+
+
+ int64_t pool = -1;
+ spg_t pg;
+ if (coll().is_pg_prefix(&pg))
+ pool = (int64_t)pg.pgid.pool();
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
+ return true;
+}
+
+
+bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
+{
+ string name;
+ string key;
+ string ns;
+ uint32_t hash;
+ snapid_t snap;
+ uint64_t pool;
+ gen_t generation = ghobject_t::NO_GEN;
+ shard_id_t shard_id = shard_id_t::NO_SHARD;
+
+ if (index_version == HASH_INDEX_TAG)
+ return lfn_parse_object_name_keyless(long_name, out);
+ if (index_version == HASH_INDEX_TAG_2)
+ return lfn_parse_object_name_poolless(long_name, out);
+
+ string::const_iterator current = long_name.begin();
+ if (*current == '\\') {
+ ++current;
+ if (current == long_name.end()) {
+ return false;
+ } else if (*current == 'd') {
+ name.append("DIR_");
+ ++current;
+ } else if (*current == '.') {
+ name.append(".");
+ ++current;
+ } else {
+ --current;
+ }
+ }
+
+ string::const_iterator end = current;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ if (!append_unescaped(current, end, &name))
+ return false;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ if (!append_unescaped(current, end, &key))
+ return false;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ string snap_str(current, end);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ string hash_str(current, end);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ if (!append_unescaped(current, end, &ns))
+ return false;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ string pstring(current, end);
+
+ // Optional generation/shard_id
+ string genstring, shardstring;
+ if (end != long_name.end()) {
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ genstring = string(current, end);
+
+ generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end != long_name.end())
+ return false;
+ shardstring = string(current, end);
+
+ shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16);
+ }
+
+ if (snap_str == "head")
+ snap = CEPH_NOSNAP;
+ else if (snap_str == "snapdir")
+ snap = CEPH_SNAPDIR;
+ else
+ snap = strtoull(snap_str.c_str(), NULL, 16);
+ sscanf(hash_str.c_str(), "%X", &hash);
+
+ if (pstring == "none")
+ pool = (uint64_t)-1;
+ else
+ pool = strtoull(pstring.c_str(), NULL, 16);
+
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
+ return true;
+}
+
+bool LFNIndex::lfn_is_hashed_filename(const string &name)
+{
+ if (name.size() < (unsigned)FILENAME_SHORT_LEN) {
+ return 0;
+ }
+ if (name.substr(name.size() - FILENAME_COOKIE.size(), FILENAME_COOKIE.size())
+ == FILENAME_COOKIE) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+bool LFNIndex::lfn_must_hash(const string &long_name)
+{
+ return (int)long_name.size() >= FILENAME_SHORT_LEN;
+}
+
+static inline void buf_to_hex(const unsigned char *buf, int len, char *str)
+{
+ int i;
+ str[0] = '\0';
+ for (i = 0; i < len; i++) {
+ sprintf(&str[i*2], "%02x", (int)buf[i]);
+ }
+}
+
+int LFNIndex::hash_filename(const char *filename, char *hash, int buf_len)
+{
+ if (buf_len < FILENAME_HASH_LEN + 1)
+ return -EINVAL;
+
+ char buf[FILENAME_LFN_DIGEST_SIZE];
+ char hex[FILENAME_LFN_DIGEST_SIZE * 2];
+
+ SHA1 h;
+ h.Update((const byte *)filename, strlen(filename));
+ h.Final((byte *)buf);
+
+ buf_to_hex((byte *)buf, (FILENAME_HASH_LEN + 1) / 2, hex);
+ strncpy(hash, hex, FILENAME_HASH_LEN);
+ hash[FILENAME_HASH_LEN] = '\0';
+ return 0;
+}
+
+void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len)
+{
+ char hash[FILENAME_HASH_LEN + 1];
+
+ assert(len >= FILENAME_SHORT_LEN + 4);
+
+ strncpy(filename, old_filename, FILENAME_PREFIX_LEN);
+ filename[FILENAME_PREFIX_LEN] = '\0';
+ if ((int)strlen(filename) < FILENAME_PREFIX_LEN)
+ return;
+ if (old_filename[FILENAME_PREFIX_LEN] == '\0')
+ return;
+
+ hash_filename(old_filename, hash, sizeof(hash));
+ int ofs = FILENAME_PREFIX_LEN;
+ while (1) {
+ int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str());
+ if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs)
+ break;
+ ofs--;
+ }
+}
+
+string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
+{
+ string long_name = lfn_generate_object_name(oid);
+ assert(lfn_must_hash(long_name));
+ char buf[FILENAME_SHORT_LEN + 4];
+ build_filename(long_name.c_str(), i, buf, sizeof(buf));
+ return string(buf);
+}
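+
+// Illustrative shape of a hashed short name (lengths follow the constants in
+// LFNIndex.h; the digits shown are hypothetical): the first
+// FILENAME_PREFIX_LEN characters of the long name, then an underscore, the
+// leading hex digits of the long name's SHA-1, the chain index i, and the
+// "long" cookie, e.g.
+//
+//   <first FILENAME_PREFIX_LEN chars>_0A1B2C3D4E5F60718293_0_long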
+
+const string &LFNIndex::get_base_path()
+{
+ return base_path;
+}
+
+string LFNIndex::get_full_path_subdir(const vector<string> &rel)
+{
+ string retval = get_base_path();
+ for (vector<string>::const_iterator i = rel.begin();
+ i != rel.end();
+ ++i) {
+ retval += "/";
+ retval += mangle_path_component(*i);
+ }
+ return retval;
+}
+
+string LFNIndex::get_full_path(const vector<string> &rel, const string &name)
+{
+ return get_full_path_subdir(rel) + "/" + name;
+}
+
+string LFNIndex::mangle_path_component(const string &component)
+{
+ return SUBDIR_PREFIX + component;
+}
+
+string LFNIndex::demangle_path_component(const string &component)
+{
+ return component.substr(SUBDIR_PREFIX.size(), component.size() - SUBDIR_PREFIX.size());
+}
+
+int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
+ ghobject_t *oid, string *shortname)
+{
+ const char *beginning = in + get_base_path().size();
+ const char *end = beginning;
+ while (1) {
+ end++;
+ beginning = end++;
+ for ( ; *end != '\0' && *end != '/'; ++end) ;
+ if (*end != '\0') {
+ out->push_back(demangle_path_component(string(beginning, end - beginning)));
+ continue;
+ } else {
+ break;
+ }
+ }
+ *shortname = string(beginning, end - beginning);
+ if (oid) {
+ int r = lfn_translate(*out, *shortname, oid);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
+string LFNIndex::mangle_attr_name(const string &attr)
+{
+ return PHASH_ATTR_PREFIX + attr;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef OS_LFNINDEX_H
+#define OS_LFNINDEX_H
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include "include/memory.h"
+#include <exception>
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/ceph_crypto.h"
+
+#include "CollectionIndex.h"
+
+/**
+ * LFNIndex also encapsulates logic for manipulating
+ * subdirectories of a collection as well as the long filename
+ * logic.
+ *
+ * The protected methods provide machinery for derived classes to
+ * manipulate subdirectories and objects.
+ *
+ * The virtual methods are to be overridden to provide the actual
+ * hashed layout.
+ *
+ * The user must call created() when an object is created.
+ *
+ * Synchronization: Calling code must ensure that there are no object
+ * creations or deletions during the lifetime of a Path object (except
+ * of an object at that path).
+ *
+ * Unless otherwise noted, methods which return an int return 0 on success
+ * and a negative error code on failure.
+ */
+#define WRAP_RETRY(x) { \
+ bool failed = false; \
+ int r = 0; \
+ init_inject_failure(); \
+ while (1) { \
+ try { \
+ if (failed) { \
+ r = cleanup(); \
+ assert(r == 0); \
+ } \
+ { x } \
+ out: \
+ complete_inject_failure(); \
+ return r; \
+ } catch (RetryException) { \
+ failed = true; \
+ } catch (...) { \
+ assert(0); \
+ } \
+ } \
+ return -1; \
+ } \
+
+
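+// Illustrative expansion of WRAP_RETRY (mirrors how LFNIndex.cc uses it, e.g.
+// in LFNIndex::created; the operation below is hypothetical): the wrapped body
+// is retried until it completes without maybe_inject_failure() throwing
+// RetryException, with cleanup() run before every retry.
+//
+//   int LFNIndex::some_op(const ghobject_t &oid) {
+//     WRAP_RETRY(
+//       r = _some_virtual_op(oid);   // hypothetical derived-class hook
+//       if (r < 0)
+//         goto out;
+//       );
+//   }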
+
+class LFNIndex : public CollectionIndex {
+ /// Hash digest output size.
+ static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE;
+ /// Length of filename hash.
+ static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE;
+ /// Max filename size.
+ static const int FILENAME_MAX_LEN = 4096;
+ /// Length of hashed filename.
+ static const int FILENAME_SHORT_LEN = 255;
+ /// Length of hashed filename prefix.
+ static const int FILENAME_PREFIX_LEN;
+ /// Length of hashed filename cookie.
+ static const int FILENAME_EXTRA = 4;
+ /// Lfn cookie value.
+ static const string FILENAME_COOKIE;
+ /// Name of LFN attribute for storing full name.
+ static const string LFN_ATTR;
+ /// Prefix for subdir index attributes.
+ static const string PHASH_ATTR_PREFIX;
+ /// Prefix for index subdirectories.
+ static const string SUBDIR_PREFIX;
+
+ /// Path to Index base.
+ const string base_path;
+
+protected:
+ const uint32_t index_version;
+
+ /// true if retry injection is enabled
+ struct RetryException : public exception {};
+ bool error_injection_enabled;
+ bool error_injection_on;
+ double error_injection_probability;
+ uint64_t last_failure;
+ uint64_t current_failure;
+ void init_inject_failure() {
+ if (error_injection_on) {
+ error_injection_enabled = true;
+ last_failure = current_failure = 0;
+ }
+ }
+ void maybe_inject_failure();
+ void complete_inject_failure() {
+ error_injection_enabled = false;
+ }
+
+private:
+ string lfn_attribute, lfn_alt_attribute;
+ coll_t collection;
+
+public:
+ /// Constructor
+ LFNIndex(
+ coll_t collection,
+ const char *base_path, ///< [in] path to Index root
+ uint32_t index_version,
+ double _error_injection_probability=0)
+ : CollectionIndex(collection),
+ base_path(base_path),
+ index_version(index_version),
+ error_injection_enabled(false),
+ error_injection_on(_error_injection_probability != 0),
+ error_injection_probability(_error_injection_probability),
+ last_failure(0), current_failure(0),
+ collection(collection) {
+ if (index_version == HASH_INDEX_TAG) {
+ lfn_attribute = LFN_ATTR;
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", index_version);
+ lfn_attribute = LFN_ATTR + string(buf);
+ lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
+ }
+ }
+
+ coll_t coll() const { return collection; }
+
+ /// Virtual destructor
+ virtual ~LFNIndex() {}
+
+ /// @see CollectionIndex
+ int init();
+
+ /// @see CollectionIndex
+ int cleanup() = 0;
+
+ /// @see CollectionIndex
+ int created(
+ const ghobject_t &oid,
+ const char *path
+ );
+
+ /// @see CollectionIndex
+ int unlink(
+ const ghobject_t &oid
+ );
+
+ /// @see CollectionIndex
+ int lookup(
+ const ghobject_t &oid,
+ IndexedPath *path,
+ int *hardlink
+ );
+
+ /// @see CollectionIndex
+ int pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ );
+
+ /// @see CollectionIndex
+ int collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ bool sort_bitwise,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next
+ );
+
+ virtual int _split(
+ uint32_t match, ///< [in] value to match
+ uint32_t bits, ///< [in] bits to check
+ CollectionIndex* dest ///< [in] destination index
+ ) = 0;
+
+ /// @see CollectionIndex
+ int split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest
+ ) {
+ WRAP_RETRY(
+ r = _split(match, bits, dest);
+ goto out;
+ );
+ }
+
+
+protected:
+ virtual int _init() = 0;
+
+ /// Will be called upon object creation
+ virtual int _created(
+ const vector<string> &path, ///< [in] Path to subdir.
+ const ghobject_t &oid, ///< [in] Object created.
+ const string &mangled_name ///< [in] Mangled filename.
+ ) = 0;
+
+ /// Will be called to remove an object
+ virtual int _remove(
+ const vector<string> &path, ///< [in] Path to subdir.
+ const ghobject_t &oid, ///< [in] Object to remove.
+ const string &mangled_name ///< [in] Mangled filename.
+ ) = 0;
+
+ /// Return the path and mangled_name for oid.
+ virtual int _lookup(
+ const ghobject_t &oid,///< [in] Object for lookup.
+ vector<string> *path, ///< [out] Path to the object.
+ string *mangled_name, ///< [out] Mangled filename.
+ int *exists ///< [out] True if the object exists.
+ ) = 0;
+
+ /// Pre-hash the collection with the given pg number and
+ /// expected number of objects in the collection.
+ virtual int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) = 0;
+
+ /// @see CollectionIndex
+ virtual int _collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ bool sort_bitwise,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) = 0;
+
+protected:
+
+ /* Non-virtual utility methods */
+
+ /// Sync a subdirectory
+ int fsync_dir(
+ const vector<string> &path ///< [in] Path to sync
+ ); ///< @return Error Code, 0 on success
+
+ /// Link an object from from into to
+ int link_object(
+ const vector<string> &from, ///< [in] Source subdirectory.
+ const vector<string> &to, ///< [in] Dest subdirectory.
+ const ghobject_t &oid, ///< [in] Object to move.
+ const string &from_short_name ///< [in] Mangled filename of oid.
+ ); ///< @return Error Code, 0 on success
+
+ /**
+ * Efficiently remove objects from a subdirectory
+ *
+ * remove_object invalidates mangled names in the directory, requiring
+ * the mangled name of each additional object to be looked up a second
+ * time. remove_objects avoids these additional lookups.
+ *
+ * @param [in] dir Directory from which to remove.
+ * @param [in] to_remove Map from mangled filename to object to remove.
+ * @param [in,out] remaining Map from filename to object.
+ * @return Error Code, 0 on success.
+ */
+ int remove_objects(
+ const vector<string> &dir,
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining
+ );
+
+
+ /**
+ * Moves contents of from into to.
+ *
+ * Invalidates mangled names in to. If interrupted, all objects will be
+ * present in to before objects are removed from from. Ignores EEXIST
+ * while linking into to.
+ * @return Error Code, 0 on success
+ */
+ int move_objects(
+ const vector<string> &from, ///< [in] Source subdirectory.
+ const vector<string> &to ///< [in] Dest subdirectory.
+ );
+
+ /**
+ * Remove an object from from.
+ *
+ * Invalidates mangled names in from.
+ * @return Error Code, 0 on success
+ */
+ int remove_object(
+ const vector<string> &from, ///< [in] Directory from which to remove.
+ const ghobject_t &to_remove ///< [in] Object to remove.
+ );
+
+ /**
+ * Gets the filename corresponding to oid in from.
+ *
+ * The filename may differ between subdirectories. Furthermore,
+ * file creations or removals in from may invalidate the name.
+ * @return Error code on failure, 0 on success
+ */
+ int get_mangled_name(
+ const vector<string> &from, ///< [in] Subdirectory
+ const ghobject_t &oid, ///< [in] Object
+ string *mangled_name, ///< [out] Filename
+ int *hardlink ///< [out] Hardlink count for this file; 0 means it does not exist.
+ );
+
+ /// do move subdir from from to dest
+ static int move_subdir(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path containing dir
+ string dir ///< [in] dir to move
+ );
+
+ /// do move object from from to dest
+ static int move_object(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path to split
+ const pair<string, ghobject_t> &obj ///< [in] obj to move
+ );
+
+ /**
+ * Lists objects in to_list.
+ *
+ * @param [in] to_list Directory to list.
+ * @param [in] max_objects Max number to list.
+ * @param [in,out] handle Cookie for continuing the listing.
+ * Initialize to zero to start at the beginning of the directory.
+ * @param [out] out Mapping of listed object filenames to objects.
+ * @return Error code on failure, 0 on success
+ */
+ int list_objects(
+ const vector<string> &to_list,
+ int max_objects,
+ long *handle,
+ map<string, ghobject_t> *out
+ );
+
+ /// Lists subdirectories.
+ int list_subdirs(
+ const vector<string> &to_list, ///< [in] Directory to list.
+ vector<string> *out ///< [out] Subdirectories listed.
+ );
+
+ /// Create subdirectory.
+ int create_path(
+ const vector<string> &to_create ///< [in] Subdirectory to create.
+ );
+
+ /// Remove subdirectory.
+ int remove_path(
+ const vector<string> &to_remove ///< [in] Subdirectory to remove.
+ );
+
+ /// Check whether to_check exists.
+ int path_exists(
+ const vector<string> &to_check, ///< [in] Subdirectory to check.
+ int *exists ///< [out] 1 if it exists, 0 else
+ );
+
+ /// Save attr_value to attr_name attribute on path.
+ int add_attr_path(
+ const vector<string> &path, ///< [in] Path to modify.
+ const string &attr_name, ///< [in] Name of attribute.
+ bufferlist &attr_value ///< [in] Value to save.
+ );
+
+ /// Read attribute attr_name on path into attr_value.
+ int get_attr_path(
+ const vector<string> &path, ///< [in] Path to read.
+ const string &attr_name, ///< [in] Attribute to read.
+ bufferlist &attr_value ///< [out] Attribute value read.
+ );
+
+ /// Remove attr from path
+ int remove_attr_path(
+ const vector<string> &path, ///< [in] path from which to remove attr
+ const string &attr_name ///< [in] attr to remove
+ ); ///< @return Error code, 0 on success
+
+private:
+ /* lfn translation functions */
+
+ /**
+ * Gets the version specific lfn attribute tag
+ */
+ const string &get_lfn_attr() const {
+ return lfn_attribute;
+ }
+ const string &get_alt_lfn_attr() const {
+ return lfn_alt_attribute;
+ }
+
+ /**
+ * Gets the filename corresponding to oid in path.
+ *
+ * @param [in] path Path in which to get filename for oid.
+ * @param [in] oid Object for which to get filename.
+ * @param [out] mangled_name Filename for oid, pass NULL if not needed.
+ * @param [out] full_path Fullpath for oid, pass NULL if not needed.
+ * @param [out] hardlink Hardlink count of this file; 0 means it does not
+ * exist. Pass NULL if not needed.
+ * @return Error Code, 0 on success.
+ */
+ int lfn_get_name(
+ const vector<string> &path,
+ const ghobject_t &oid,
+ string *mangled_name,
+ string *full_path,
+ int *hardlink
+ );
+
+ /// Adjusts path contents when oid is created at name mangled_name.
+ int lfn_created(
+ const vector<string> &path, ///< [in] Path to adjust.
+ const ghobject_t &oid, ///< [in] Object created.
+ const string &mangled_name ///< [in] Filename of created object.
+ );
+
+ /// Removes oid from path while adjusting path contents
+ int lfn_unlink(
+ const vector<string> &path, ///< [in] Path containing oid.
+ const ghobject_t &oid, ///< [in] Object to remove.
+ const string &mangled_name ///< [in] Filename of object to remove.
+ );
+
+ /// Translate a filename into a ghobject_t.
+ int lfn_translate(
+ const vector<string> &path, ///< [in] Path containing the file.
+ const string &short_name, ///< [in] Filename to translate.
+ ghobject_t *out ///< [out] Object found.
+ ); ///< @return Negative error code on error, 0 if not an object, 1 otherwise
+
+ /* manglers/demanglers */
+ /// Filters object filenames
+ bool lfn_is_object(
+ const string &short_name ///< [in] Filename to check
+ ); ///< True if short_name is an object, false otherwise
+
+ /// Filters subdir filenames
+ bool lfn_is_subdir(
+ const string &short_name, ///< [in] Filename to check.
+ string *demangled_name ///< [out] Demangled subdir name.
+ ); ///< @return True if short_name is a subdir, false otherwise
+
+ /// Generate object name
+ string lfn_generate_object_name_keyless(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ string lfn_generate_object_name_poolless(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ string lfn_generate_object_name(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Parse object name
+ bool lfn_parse_object_name_keyless(
+ const string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return True if successful, False otherwise.
+
+ /// Parse object name
+ bool lfn_parse_object_name_poolless(
+ const string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return True if successful, False otherwise.
+
+ /// Parse object name
+ bool lfn_parse_object_name(
+ const string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return True if successful, False otherwise.
+
+ /// Checks whether short_name is a hashed filename.
+ bool lfn_is_hashed_filename(
+ const string &short_name ///< [in] Name to check.
+ ); ///< @return True if short_name is hashed, False otherwise.
+
+ /// Checks whether long_name must be hashed.
+ bool lfn_must_hash(
+ const string &long_name ///< [in] Name to check.
+ ); ///< @return True if long_name must be hashed, False otherwise.
+
+ /// Generate hashed name.
+ string lfn_get_short_name(
+ const ghobject_t &oid, ///< [in] Object for which to generate.
+ int i ///< [in] Index of hashed name to generate.
+ ); ///< @return Hashed filename.
+
+ /* other common methods */
+ /// Gets the base path
+ const string &get_base_path(); ///< @return Index base_path
+
+ /// Get full path to the subdir
+ string get_full_path_subdir(
+ const vector<string> &rel ///< [in] The subdir.
+ ); ///< @return Full path to rel.
+
+ /// Get full path to object
+ string get_full_path(
+ const vector<string> &rel, ///< [in] Path to object.
+ const string &name ///< [in] Filename of object.
+ ); ///< @return Fullpath to object at name in rel.
+
+ /// Get mangled path component
+ string mangle_path_component(
+ const string &component ///< [in] Component to mangle
+ ); ///< @return Mangled component.
+
+ /// Demangle component
+ string demangle_path_component(
+ const string &component ///< [in] Subdir name to demangle
+ ); ///< @return Demangled path component.
+
+ /// Decompose full path into object name and filename.
+ int decompose_full_path(
+ const char *in, ///< [in] Full path to object.
+ vector<string> *out, ///< [out] Path to object at in.
+ ghobject_t *oid, ///< [out] Object at in.
+ string *shortname ///< [out] Filename of object at in.
+ ); ///< @return Error Code, 0 on success.
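+ // Example (illustrative): for a full path of the form
+ // "<base_path>/A/B/objfile", out receives the demangled components
+ // {A, B}, shortname is set to "objfile", and, if oid is non-NULL,
+ // lfn_translate() is used to recover the object it names.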
+
+ /// Mangle attribute name
+ string mangle_attr_name(
+ const string &attr ///< [in] Attribute to mangle.
+ ); ///< @return Mangled attribute name.
+
+ /// Builds hashed filename
+ void build_filename(
+ const char *old_filename, ///< [in] Filename to convert.
+ int i, ///< [in] Index of hash.
+ char *filename, ///< [out] Resulting filename.
+ int len ///< [in] Size of buffer for filename
+ ); ///< @return Error Code, 0 on success
+
+ /// Get hash of filename
+ int hash_filename(
+ const char *filename, ///< [in] Filename to hash.
+ char *hash, ///< [out] Hash of filename.
+ int len ///< [in] Size of hash buffer.
+ ); ///< @return Error Code, 0 on success.
+
+ friend class TestWrapLFNIndex;
+};
+typedef LFNIndex::IndexedPath IndexedPath;
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OS_SEQUENCERPOSITION_H
+#define __CEPH_OS_SEQUENCERPOSITION_H
+
+#include "include/types.h"
+#include "include/cmp.h"
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+#include <ostream>
+
+/**
+ * transaction and op offset
+ */
+struct SequencerPosition {
+ uint64_t seq; ///< seq
+ uint32_t trans; ///< transaction in that seq (0-based)
+ uint32_t op; ///< op in that transaction (0-based)
+
+ SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(seq, bl);
+ ::encode(trans, bl);
+ ::encode(op, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::iterator& p) {
+ DECODE_START(1, p);
+ ::decode(seq, p);
+ ::decode(trans, p);
+ ::decode(op, p);
+ DECODE_FINISH(p);
+ }
+ void dump(Formatter *f) const {
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("trans", trans);
+ f->dump_unsigned("op", op);
+ }
+ static void generate_test_instances(list<SequencerPosition*>& o) {
+ o.push_back(new SequencerPosition);
+ o.push_back(new SequencerPosition(1, 2, 3));
+ o.push_back(new SequencerPosition(4, 5, 6));
+ }
+};
+WRITE_CLASS_ENCODER(SequencerPosition)
+
+inline ostream& operator<<(ostream& out, const SequencerPosition& t) {
+ return out << t.seq << "." << t.trans << "." << t.op;
+}
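+// For example, SequencerPosition(4, 5, 6) prints as "4.5.6" (seq.trans.op).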
+
+WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op)
+WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op)
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+
+#include "os/filestore/WBThrottle.h"
+#include "common/perf_counters.h"
+
+WBThrottle::WBThrottle(CephContext *cct) :
+ cur_ios(0), cur_size(0),
+ cct(cct),
+ logger(NULL),
+ stopping(true),
+ lock("WBThrottle::lock", false, true, false, cct),
+ fs(XFS)
+{
+ {
+ Mutex::Locker l(lock);
+ set_from_conf();
+ }
+ assert(cct);
+ PerfCountersBuilder b(
+ cct, string("WBThrottle"),
+ l_wbthrottle_first, l_wbthrottle_last);
+ b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data");
+ b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data");
+ b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations");
+ b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations");
+ b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write");
+ b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries");
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+ for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i)
+ logger->set(i, 0);
+
+ cct->_conf->add_observer(this);
+}
+
+WBThrottle::~WBThrottle() {
+ assert(cct);
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+ cct->_conf->remove_observer(this);
+}
+
+void WBThrottle::start()
+{
+ {
+ Mutex::Locker l(lock);
+ stopping = false;
+ }
+ create();
+}
+
+void WBThrottle::stop()
+{
+ {
+ Mutex::Locker l(lock);
+ stopping = true;
+ cond.Signal();
+ }
+
+ join();
+}
+
+const char** WBThrottle::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "filestore_wbthrottle_btrfs_bytes_start_flusher",
+ "filestore_wbthrottle_btrfs_bytes_hard_limit",
+ "filestore_wbthrottle_btrfs_ios_start_flusher",
+ "filestore_wbthrottle_btrfs_ios_hard_limit",
+ "filestore_wbthrottle_btrfs_inodes_start_flusher",
+ "filestore_wbthrottle_btrfs_inodes_hard_limit",
+ "filestore_wbthrottle_xfs_bytes_start_flusher",
+ "filestore_wbthrottle_xfs_bytes_hard_limit",
+ "filestore_wbthrottle_xfs_ios_start_flusher",
+ "filestore_wbthrottle_xfs_ios_hard_limit",
+ "filestore_wbthrottle_xfs_inodes_start_flusher",
+ "filestore_wbthrottle_xfs_inodes_hard_limit",
+ NULL
+ };
+ return KEYS;
+}
+
+void WBThrottle::set_from_conf()
+{
+ assert(lock.is_locked());
+ if (fs == BTRFS) {
+ size_limits.first =
+ cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher;
+ size_limits.second =
+ cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit;
+ io_limits.first =
+ cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher;
+ io_limits.second =
+ cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit;
+ fd_limits.first =
+ cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher;
+ fd_limits.second =
+ cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit;
+ } else if (fs == XFS) {
+ size_limits.first =
+ cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher;
+ size_limits.second =
+ cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit;
+ io_limits.first =
+ cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher;
+ io_limits.second =
+ cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit;
+ fd_limits.first =
+ cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher;
+ fd_limits.second =
+ cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit;
+ } else {
+ assert(0 == "invalid value for fs");
+ }
+ cond.Signal();
+}
+
+void WBThrottle::handle_conf_change(const md_config_t *conf,
+ const std::set<std::string> &changed)
+{
+ Mutex::Locker l(lock);
+ for (const char** i = get_tracked_conf_keys(); *i; ++i) {
+ if (changed.count(*i)) {
+ set_from_conf();
+ return;
+ }
+ }
+}
+
+bool WBThrottle::get_next_should_flush(
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next)
+{
+ assert(lock.is_locked());
+ assert(next);
+ while (!stopping && !beyond_limit())
+ cond.Wait(lock);
+ if (stopping)
+ return false;
+ assert(!pending_wbs.empty());
+ ghobject_t obj(pop_object());
+
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+ pending_wbs.find(obj);
+ *next = boost::make_tuple(obj, i->second.second, i->second.first);
+ pending_wbs.erase(i);
+ return true;
+}
+
+
+void *WBThrottle::entry()
+{
+ Mutex::Locker l(lock);
+ boost::tuple<ghobject_t, FDRef, PendingWB> wb;
+ while (get_next_should_flush(&wb)) {
+ clearing = wb.get<0>();
+ cur_ios -= wb.get<2>().ios;
+ logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
+ logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios);
+ cur_size -= wb.get<2>().size;
+ logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
+ logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size);
+ logger->dec(l_wbthrottle_inodes_dirtied);
+ logger->inc(l_wbthrottle_inodes_wb);
+ lock.Unlock();
+#ifdef HAVE_FDATASYNC
+ ::fdatasync(**wb.get<1>());
+#else
+ ::fsync(**wb.get<1>());
+#endif
+#ifdef HAVE_POSIX_FADVISE
+ if (g_conf->filestore_fadvise && wb.get<2>().nocache) {
+ int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
+ assert(fa_r == 0);
+ }
+#endif
+ lock.Lock();
+ clearing = ghobject_t();
+ cond.Signal();
+ wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
+ }
+ return 0;
+}
+
+void WBThrottle::queue_wb(
+ FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len,
+ bool nocache)
+{
+ Mutex::Locker l(lock);
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
+ pending_wbs.find(hoid);
+ if (wbiter == pending_wbs.end()) {
+ wbiter = pending_wbs.insert(
+ make_pair(hoid,
+ make_pair(
+ PendingWB(),
+ fd))).first;
+ logger->inc(l_wbthrottle_inodes_dirtied);
+ } else {
+ remove_object(hoid);
+ }
+
+ cur_ios++;
+ logger->inc(l_wbthrottle_ios_dirtied);
+ cur_size += len;
+ logger->inc(l_wbthrottle_bytes_dirtied, len);
+
+ wbiter->second.first.add(nocache, len, 1);
+ insert_object(hoid);
+ if (beyond_limit())
+ cond.Signal();
+}
+
+void WBThrottle::clear()
+{
+ Mutex::Locker l(lock);
+ for (ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+ pending_wbs.begin();
+ i != pending_wbs.end();
+ ++i) {
+#ifdef HAVE_POSIX_FADVISE
+ if (g_conf->filestore_fadvise && i->second.first.nocache) {
+ int fa_r = posix_fadvise(**i->second.second, 0, 0, POSIX_FADV_DONTNEED);
+ assert(fa_r == 0);
+ }
+#endif
+
+ }
+ cur_ios = cur_size = 0;
+ logger->set(l_wbthrottle_ios_dirtied, 0);
+ logger->set(l_wbthrottle_bytes_dirtied, 0);
+ logger->set(l_wbthrottle_inodes_dirtied, 0);
+ pending_wbs.clear();
+ lru.clear();
+ rev_lru.clear();
+ cond.Signal();
+}
+
+void WBThrottle::clear_object(const ghobject_t &hoid)
+{
+ Mutex::Locker l(lock);
+ while (clearing == hoid)
+ cond.Wait(lock);
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+ pending_wbs.find(hoid);
+ if (i == pending_wbs.end())
+ return;
+
+ cur_ios -= i->second.first.ios;
+ logger->dec(l_wbthrottle_ios_dirtied, i->second.first.ios);
+ cur_size -= i->second.first.size;
+ logger->dec(l_wbthrottle_bytes_dirtied, i->second.first.size);
+ logger->dec(l_wbthrottle_inodes_dirtied);
+
+ pending_wbs.erase(i);
+ remove_object(hoid);
+ cond.Signal();
+}
+
+void WBThrottle::throttle()
+{
+ Mutex::Locker l(lock);
+ while (!stopping && need_flush())
+ cond.Wait(lock);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef WBTHROTTLE_H
+#define WBTHROTTLE_H
+
+#include "include/unordered_map.h"
+#include <boost/tuple/tuple.hpp>
+#include "include/memory.h"
+#include "common/Formatter.h"
+#include "common/hobject.h"
+#include "include/interval_set.h"
+#include "FDCache.h"
+#include "common/Thread.h"
+#include "common/ceph_context.h"
+
+class PerfCounters;
+enum {
+ l_wbthrottle_first = 999090,
+ l_wbthrottle_bytes_dirtied,
+ l_wbthrottle_bytes_wb,
+ l_wbthrottle_ios_dirtied,
+ l_wbthrottle_ios_wb,
+ l_wbthrottle_inodes_dirtied,
+ l_wbthrottle_inodes_wb,
+ l_wbthrottle_last
+};
+
+/**
+ * WBThrottle
+ *
+ * Tracks, throttles, and flushes outstanding IO
+ */
+class WBThrottle : Thread, public md_config_obs_t {
+ ghobject_t clearing;
+ /* *_limits.first is the start_flusher limit and
+ * *_limits.second is the hard limit
+ */
+
+ /// Limits on unflushed bytes
+ pair<uint64_t, uint64_t> size_limits;
+
+ /// Limits on unflushed ios
+ pair<uint64_t, uint64_t> io_limits;
+
+ /// Limits on unflushed objects
+ pair<uint64_t, uint64_t> fd_limits;
+
+ uint64_t cur_ios; ///< Currently unflushed IOs
+ uint64_t cur_size; ///< Currently unflushed bytes
+
+ /**
+ * PendingWB tracks the ios pending on an object.
+ */
+ class PendingWB {
+ public:
+ bool nocache;
+ uint64_t size;
+ uint64_t ios;
+ PendingWB() : nocache(true), size(0), ios(0) {}
+ void add(bool _nocache, uint64_t _size, uint64_t _ios) {
+ if (!_nocache)
+ nocache = false; // only nocache if all writes are nocache
+ size += _size;
+ ios += _ios;
+ }
+ };
+
+ CephContext *cct;
+ PerfCounters *logger;
+ bool stopping;
+ Mutex lock;
+ Cond cond;
+
+
+ /**
+ * Flush objects in lru order
+ */
+ list<ghobject_t> lru;
+ ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator> rev_lru;
+ void remove_object(const ghobject_t &oid) {
+ assert(lock.is_locked());
+ ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator>::iterator iter =
+ rev_lru.find(oid);
+ if (iter == rev_lru.end())
+ return;
+
+ lru.erase(iter->second);
+ rev_lru.erase(iter);
+ }
+ ghobject_t pop_object() {
+ assert(!lru.empty());
+ ghobject_t oid(lru.front());
+ lru.pop_front();
+ rev_lru.erase(oid);
+ return oid;
+ }
+ void insert_object(const ghobject_t &oid) {
+ assert(rev_lru.find(oid) == rev_lru.end());
+ lru.push_back(oid);
+ rev_lru.insert(make_pair(oid, --lru.end()));
+ }
+
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs;
+
+ /// get next flush to perform
+ bool get_next_should_flush(
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
+ ); ///< @return false if we are shutting down
+public:
+ enum FS {
+ BTRFS,
+ XFS
+ };
+
+private:
+ FS fs;
+
+ void set_from_conf();
+ bool beyond_limit() const {
+ if (cur_ios < io_limits.first &&
+ pending_wbs.size() < fd_limits.first &&
+ cur_size < size_limits.first)
+ return false;
+ else
+ return true;
+ }
+ bool need_flush() const {
+ if (cur_ios < io_limits.second &&
+ pending_wbs.size() < fd_limits.second &&
+ cur_size < size_limits.second)
+ return false;
+ else
+ return true;
+ }
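+ // Illustrative example (values are made up, not defaults): with
+ // size_limits = {40 MB, 400 MB}, the flusher thread starts writing back
+ // once 40 MB are dirty (beyond_limit()), while callers of throttle()
+ // only block once 400 MB are dirty (need_flush()).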
+
+public:
+ WBThrottle(CephContext *cct);
+ ~WBThrottle();
+
+ void start();
+ void stop();
+ /// Set fs as XFS or BTRFS
+ void set_fs(FS new_fs) {
+ Mutex::Locker l(lock);
+ fs = new_fs;
+ set_from_conf();
+ }
+
+ /// Queue writeback on oid (via fd), taking throttle; does not block
+ void queue_wb(
+ FDRef fd, ///< [in] FDRef to oid
+ const ghobject_t &oid, ///< [in] object
+ uint64_t offset, ///< [in] offset written
+ uint64_t len, ///< [in] length written
+ bool nocache ///< [in] try to clear out of cache after write
+ );
+
+ /// Clear all wb (probably due to sync)
+ void clear();
+
+ /// Clear object
+ void clear_object(const ghobject_t &oid);
+
+ /// Block until there is throttle available
+ void throttle();
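+ // Typical call sequence (sketch): set_fs() and start() during setup,
+ // queue_wb() after each buffered write, throttle() before admitting more
+ // work, clear()/clear_object() around syncs and object removal, and
+ // stop() on shutdown.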
+
+ /// md_config_obs_t
+ const char** get_tracked_conf_keys() const;
+ void handle_conf_change(const md_config_t *conf,
+ const std::set<std::string> &changed);
+
+ /// Thread
+ void *entry();
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "XfsFileStoreBackend.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/utsname.h>
+
+#include <xfs/xfs.h>
+
+#include "common/errno.h"
+#include "common/linux_version.h"
+#include "include/assert.h"
+#include "include/compat.h"
+
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") "
+
+XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs):
+ GenericFileStoreBackend(fs), m_has_extsize(false) { }
+
+/*
+ * Set extsize attr on a file to val. Should be a free-standing
+ * function, but dout_prefix expanding to a call to get_basedir_path()
+ * protected member function won't let it.
+ */
+int XfsFileStoreBackend::set_extsize(int fd, unsigned int val)
+{
+ struct fsxattr fsx;
+ struct stat sb;
+ int ret;
+
+ if (fstat(fd, &sb) < 0) {
+ ret = -errno;
+ dout(0) << "set_extsize: fstat: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (!S_ISREG(sb.st_mode)) {
+ dout(0) << "set_extsize: invalid target file type" << dendl;
+ return -EINVAL;
+ }
+
+ if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) {
+ ret = -errno;
+ dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // already set?
+ if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val)
+ return 0;
+
+ // xfs won't change extent size if any extents are allocated
+ if (fsx.fsx_nextents != 0)
+ return 0;
+
+ fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;
+ fsx.fsx_extsize = val;
+
+ if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) {
+ ret = -errno;
+ dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int XfsFileStoreBackend::detect_features()
+{
+ int ret;
+
+ ret = GenericFileStoreBackend::detect_features();
+ if (ret < 0)
+ return ret;
+
+ // extsize?
+ int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600);
+ if (fd < 0) {
+ ret = -errno;
+ dout(0) << "detect_feature: failed to create test file for extsize attr: "
+ << cpp_strerror(ret) << dendl;
+ goto out;
+ }
+ if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) {
+ ret = -errno;
+ dout(0) << "detect_feature: failed to unlink test file for extsize attr: "
+ << cpp_strerror(ret) << dendl;
+ goto out_close;
+ }
+
+ if (g_conf->filestore_xfs_extsize) {
+ ret = set_extsize(fd, 1U << 15); // a few pages
+ if (ret) {
+ ret = 0;
+ dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl;
+ goto out_close;
+ }
+
+ // make sure we have 3.5 or newer, which includes this fix
+ // aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d
+ // for this set_extsize bug
+ // http://oss.sgi.com/bugzilla/show_bug.cgi?id=874
+ int ver = get_linux_version();
+ if (ver == 0) {
+ dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl;
+ m_has_extsize = false;
+ } else if (ver < KERNEL_VERSION(3, 5, 0)) {
+ dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl;
+ m_has_extsize = false;
+ } else {
+ dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl;
+ m_has_extsize = true;
+ }
+ } else {
+ dout(0) << "detect_feature: extsize is disabled by conf" << dendl;
+ }
+
+out_close:
+ TEMP_FAILURE_RETRY(::close(fd));
+out:
+ return ret;
+}
+
+int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint)
+{
+ if (!m_has_extsize)
+ return -EOPNOTSUPP;
+
+ assert(hint < UINT_MAX);
+ return set_extsize(fd, hint);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_XFSFILESTOREBACKEND_H
+#define CEPH_XFSFILESTOREBACKEND_H
+
+#include "GenericFileStoreBackend.h"
+
+#include "include/int_types.h"
+
+class XfsFileStoreBackend : public GenericFileStoreBackend {
+private:
+ bool m_has_extsize;
+ int set_extsize(int fd, unsigned int val);
+public:
+ XfsFileStoreBackend(FileStore *fs);
+ ~XfsFileStoreBackend() {}
+ const char *get_name() {
+ return "xfs";
+ }
+ int detect_features();
+ int set_alloc_hint(int fd, uint64_t hint);
+};
+
+#endif /* CEPH_XFSFILESTOREBACKEND_H */
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/assert.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+
+#ifdef HAVE_LIBZFS
+
+#include "ZFSFileStoreBackend.h"
+
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") "
+
+ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs) :
+ GenericFileStoreBackend(fs), base_zh(NULL), current_zh(NULL),
+ m_filestore_zfs_snap(g_conf->filestore_zfs_snap)
+{
+ int ret = zfs.init();
+ if (ret < 0) {
+ dout(0) << "ZFSFileStoreBackend: failed to init libzfs" << dendl;
+ return;
+ }
+
+ base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM);
+ if (!base_zh) {
+ dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl;
+ return;
+ }
+
+ update_current_zh();
+}
+
+ZFSFileStoreBackend::~ZFSFileStoreBackend()
+{
+ if (base_zh)
+ zfs.close(base_zh);
+ if (current_zh)
+ zfs.close(current_zh);
+}
+
+int ZFSFileStoreBackend::update_current_zh()
+{
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+ ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM);
+ if (zh) {
+ char *mnt;
+ if (zfs.is_mounted(zh, &mnt)) {
+ int ret = get_current_path() == mnt;
+ free(mnt);
+ if (ret) {
+ current_zh = zh;
+ return 0;
+ }
+ } else {
+ int ret = zfs.mount(zh, NULL, 0);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh)
+ << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+ zfs.close(zh);
+ } else {
+ dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl;
+ return -ENOENT;
+ }
+
+ zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM);
+ if (zh) {
+ if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) {
+ current_zh = zh;
+ return 0;
+ }
+ zfs.close(zh);
+ dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl;
+ } else {
+ dout(0) << "update_current_zh: current/ not exist" << dendl;
+ }
+ return -ENOENT;
+}
+
+int ZFSFileStoreBackend::detect_features()
+{
+ if (!current_zh)
+ dout(0) << "detect_features: null zfs handle for current/" << dendl;
+ return 0;
+}
+
+bool ZFSFileStoreBackend::can_checkpoint()
+{
+ return m_filestore_zfs_snap && current_zh != NULL;
+}
+
+int ZFSFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+ return -ENOTDIR;
+ }
+ return 0;
+ } else if (errno != ENOENT) {
+ ret = -errno;
+ dout(0) << "create_current: cannot stat current/ " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+ ret = zfs.create(path, ZFS::TYPE_FILESYSTEM);
+ if (ret < 0 && errno != EEXIST) {
+ ret = -errno;
+ dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ ret = update_current_zh();
+ return ret;
+}
+
+static int list_checkpoints_callback(ZFS::Handle *zh, void *data)
+{
+ list<string> *ls = static_cast<list<string> *>(data);
+ string str = ZFS::get_name(zh);
+ size_t pos = str.find('@');
+ assert(pos != string::npos && pos + 1 != str.length());
+ ls->push_back(str.substr(pos + 1));
+ return 0;
+}
+
+int ZFSFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+ dout(10) << "list_checkpoints:" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ list<string> snaps;
+ int ret = zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &snaps);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ls.swap(snaps);
+ return 0;
+}
+
+int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid)
+{
+ dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ // looks like zfsonlinux doesn't flush dirty data when taking snapshot
+ int ret = sync_filesystem(get_current_fd());
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+ ret = zfs.snapshot(path, false);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got" << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (cid)
+ *cid = 0;
+ return 0;
+}
+
+int ZFSFileStoreBackend::rollback_to(const string& name)
+{
+ dout(10) << "rollback_to: '" << name << "'" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ // umount current to avoid triggering online rollback deadlock
+ int ret;
+ if (zfs.is_mounted(current_zh, NULL)) {
+ ret = zfs.umount(current_zh, NULL, 0);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl;
+ }
+ }
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+
+ ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT);
+ if (!snap_zh) {
+ dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl;
+ return -ENOENT;
+ }
+
+ ret = zfs.rollback(current_zh, snap_zh, false);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl;
+ }
+
+ if (!zfs.is_mounted(current_zh, NULL)) {
+ int ret = zfs.mount(current_zh, NULL, 0);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+
+ zfs.close(snap_zh);
+ return ret;
+}
+
+int ZFSFileStoreBackend::destroy_checkpoint(const string& name)
+{
+ dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ int ret = zfs.destroy_snaps(current_zh, name.c_str(), true);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl;
+ }
+ return ret;
+}
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ZFSFILESTOREBACKEND_H
+#define CEPH_ZFSFILESTOREBACKEND_H
+
+#ifdef HAVE_LIBZFS
+#include "GenericFileStoreBackend.h"
+#include "ZFS.h"
+
+class ZFSFileStoreBackend : public GenericFileStoreBackend {
+private:
+ ZFS zfs;
+ ZFS::Handle *base_zh;
+ ZFS::Handle *current_zh;
+ bool m_filestore_zfs_snap;
+ int update_current_zh();
+public:
+ ZFSFileStoreBackend(FileStore *fs);
+ ~ZFSFileStoreBackend();
+ int detect_features();
+ bool can_checkpoint();
+ int create_current();
+ int list_checkpoints(list<string>& ls);
+ int create_checkpoint(const string& name, uint64_t *cid);
+ int rollback_to(const string& name);
+ int destroy_checkpoint(const string& name);
+};
+#endif
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "chain_xattr.h"
+
+#include "include/int_types.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <stdio.h>
+#include "include/assert.h"
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include "common/xattr.h"
+#include "include/compat.h"
+
+/*
+ * chaining xattrs
+ *
+ * In order to support xattrs that are larger than the xattr size limit that some file systems
+ * impose, we use multiple xattrs to store the value of a single xattr. The xattr keys
+ * are set as follows:
+ * The first xattr in the chain has a key that holds the original xattr name, with any '@' char
+ * being escaped ("@@").
+ * The chained keys have the first xattr's key (with the escaping) plus a suffix "@<id>",
+ * where <id> is the position of the xattr in the chain.
+ */
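+/*
+ * Example (derived from get_raw_xattr_name() below): a long value stored
+ * under the name "user.ceph@x" and split into three chunks is written to
+ * the raw keys "user.ceph@@x", "user.ceph@@x@1" and "user.ceph@@x@2".
+ */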
+
+static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
+{
+ int pos = 0;
+
+ while (*name) {
+ switch (*name) {
+ case '@': /* escape it */
+ pos += 2;
+ assert (pos < raw_len - 1);
+ *raw_name = '@';
+ raw_name++;
+ *raw_name = '@';
+ break;
+ default:
+ pos++;
+ assert(pos < raw_len - 1);
+ *raw_name = *name;
+ break;
+ }
+ name++;
+ raw_name++;
+ }
+
+ if (!i) {
+ *raw_name = '\0';
+ } else {
+ int r = snprintf(raw_name, raw_len - pos, "@%d", i);
+ assert(r < raw_len - pos);
+ }
+}
+
+static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first)
+{
+ int pos = 0;
+
+ *is_first = true;
+ while (*raw_name) {
+ switch (*raw_name) {
+ case '@': /* escape it */
+ raw_name++;
+ if (!*raw_name)
+ break;
+ if (*raw_name != '@') {
+ *is_first = false;
+ goto done;
+ }
+
+ /* fall through */
+ default:
+ *name = *raw_name;
+ break;
+ }
+ pos++;
+ assert(pos < name_len);
+ name++;
+ raw_name++;
+ }
+done:
+ *name = '\0';
+ return pos;
+}
+
+
+// setxattr
+
+static int getxattr_len(const char *fn, const char *name)
+{
+ int i = 0, total = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_getxattr(fn, raw_name, 0, 0);
+ if (!i && r < 0)
+ return r;
+ if (r < 0)
+ break;
+ total += r;
+ i++;
+ } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN);
+
+ return total;
+}
+
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ int r;
+ size_t chunk_size;
+
+ if (!size)
+ return getxattr_len(fn, name);
+
+ do {
+ chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+ r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+
+ if (r > 0) {
+ pos += r;
+ size -= r;
+ }
+
+ i++;
+ } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+ if (r >= 0) {
+ ret = pos;
+ /* is there another chunk? that can happen if the last read exactly
+ spans one block */
+ if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_getxattr(fn, raw_name, 0, 0);
+ if (r > 0) { // there's another chunk.. the original buffer was too small
+ ret = -ERANGE;
+ }
+ }
+ }
+ return ret;
+}
+
+static int chain_fgetxattr_len(int fd, const char *name)
+{
+ int i = 0, total = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fgetxattr(fd, raw_name, 0, 0);
+ if (!i && r < 0)
+ return r;
+ if (r < 0)
+ break;
+ total += r;
+ i++;
+ } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN);
+
+ return total;
+}
+
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ int r;
+ size_t chunk_size;
+
+ if (!size)
+ return chain_fgetxattr_len(fd, name);
+
+ do {
+ chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+ r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+
+ if (r > 0) {
+ pos += r;
+ size -= r;
+ }
+
+ i++;
+ } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+ if (r >= 0) {
+ ret = pos;
+ /* is there another chunk? that can happen if the last read exactly
+ spans one block */
+ if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fgetxattr(fd, raw_name, 0, 0);
+ if (r > 0) { // there's another chunk.. the original buffer was too small
+ ret = -ERANGE;
+ }
+ }
+ }
+ return ret;
+}
+
+
+// setxattr
+
+static int get_xattr_block_size(size_t size)
+{
+ if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD)
+ // this may fit in the inode; stripe over short attrs so that XFS
+ // won't kick it out.
+ return CHAIN_XATTR_SHORT_BLOCK_LEN;
+ return CHAIN_XATTR_MAX_BLOCK_LEN;
+}
+
+int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ size_t max_chunk_size = get_xattr_block_size(size);
+
+ do {
+ size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ size -= chunk_size;
+
+ int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size);
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+ pos += chunk_size;
+ ret = pos;
+ i++;
+ } while (size);
+
+ if (ret >= 0 && !onechunk) {
+ int r;
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_removexattr(fn, raw_name);
+ if (r < 0 && r != -ENODATA)
+ ret = r;
+ i++;
+ } while (r != -ENODATA);
+ }
+
+ return ret;
+}
+
+int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ size_t max_chunk_size = get_xattr_block_size(size);
+
+ do {
+ size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ size -= chunk_size;
+
+ int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+ pos += chunk_size;
+ ret = pos;
+ i++;
+ } while (size);
+
+ if (ret >= 0 && !onechunk) {
+ int r;
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fremovexattr(fd, raw_name);
+ if (r < 0 && r != -ENODATA)
+ ret = r;
+ i++;
+ } while (r != -ENODATA);
+ }
+
+ return ret;
+}
+
+
+// removexattr
+
+int chain_removexattr(const char *fn, const char *name)
+{
+ int i = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_removexattr(fn, raw_name);
+ if (!i && r < 0) {
+ return r;
+ }
+ i++;
+ } while (r >= 0);
+ return 0;
+}
+
+int chain_fremovexattr(int fd, const char *name)
+{
+ int i = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fremovexattr(fd, raw_name);
+ if (!i && r < 0) {
+ return r;
+ }
+ i++;
+ } while (r >= 0);
+ return 0;
+}
+
+
+// listxattr
+
+int chain_listxattr(const char *fn, char *names, size_t len) {
+ int r;
+
+ if (!len)
+ return sys_listxattr(fn, names, len) * 2;
+
+ r = sys_listxattr(fn, 0, 0);
+ if (r < 0)
+ return r;
+
+ size_t total_len = r * 2; // should be enough
+ char *full_buf = (char *)malloc(total_len);
+ if (!full_buf)
+ return -ENOMEM;
+
+ r = sys_listxattr(fn, full_buf, total_len);
+ if (r < 0) {
+ free(full_buf);
+ return r;
+ }
+
+ char *p = full_buf;
+ const char *end = full_buf + r;
+ char *dest = names;
+ char *dest_end = names + len;
+
+ while (p < end) {
+ char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int attr_len = strlen(p);
+ bool is_first;
+ int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+ if (is_first) {
+ if (dest + name_len > dest_end) {
+ r = -ERANGE;
+ goto done;
+ }
+ strcpy(dest, name);
+ dest += name_len + 1;
+ }
+ p += attr_len + 1;
+ }
+ r = dest - names;
+
+done:
+ free(full_buf);
+ return r;
+}
+
+int chain_flistxattr(int fd, char *names, size_t len) {
+ int r;
+ char *p;
+ const char * end;
+ char *dest;
+ char *dest_end;
+
+ if (!len)
+ return sys_flistxattr(fd, names, len) * 2;
+
+ r = sys_flistxattr(fd, 0, 0);
+ if (r < 0)
+ return r;
+
+ size_t total_len = r * 2; // should be enough
+ char *full_buf = (char *)malloc(total_len);
+ if (!full_buf)
+ return -ENOMEM;
+
+ r = sys_flistxattr(fd, full_buf, total_len);
+ if (r < 0)
+ goto done;
+
+ p = full_buf;
+ end = full_buf + r;
+ dest = names;
+ dest_end = names + len;
+
+ while (p < end) {
+ char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int attr_len = strlen(p);
+ bool is_first;
+ int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+ if (is_first) {
+ if (dest + name_len > dest_end) {
+ r = -ERANGE;
+ goto done;
+ }
+ strcpy(dest, name);
+ dest += name_len + 1;
+ }
+ p += attr_len + 1;
+ }
+ r = dest - names;
+
+done:
+ free(full_buf);
+ return r;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OSD_CHAIN_XATTR_H
+#define __CEPH_OSD_CHAIN_XATTR_H
+
+#include "common/xattr.h"
+
+#include <errno.h>
+
+#if defined(__linux__)
+#include <linux/limits.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2)
+#elif defined(__APPLE__)
+#include <sys/xattr.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2)
+#else
+#define CHAIN_XATTR_MAX_NAME_LEN 128
+#endif
+
+#define CHAIN_XATTR_MAX_BLOCK_LEN 2048
+
+/*
+ * XFS will only inline xattrs < 255 bytes, so for xattrs that are
+ * likely to fit in the inode, stripe over short xattrs.
+ */
+#define CHAIN_XATTR_SHORT_BLOCK_LEN 250
+#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000
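+/*
+ * Example (derived from get_xattr_block_size() and chain_setxattr()): a
+ * 600-byte value (<= CHAIN_XATTR_SHORT_LEN_THRESHOLD) is striped as
+ * 250 + 250 + 100 byte chunks, while a 3000-byte value is striped as
+ * 2048 + 952 byte chunks.
+ */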
+
+// wrappers to hide annoying errno handling.
+
+static inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+ int r = ::ceph_os_fgetxattr(fd, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+static inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+ int r = ::ceph_os_getxattr(fn, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+
+static inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size)
+{
+ int r = ::ceph_os_setxattr(fn, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+static inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size)
+{
+ int r = ::ceph_os_fsetxattr(fd, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+
+static inline int sys_listxattr(const char *fn, char *names, size_t len)
+{
+ int r = ::ceph_os_listxattr(fn, names, len);
+ return (r < 0 ? -errno : r);
+}
+static inline int sys_flistxattr(int fd, char *names, size_t len)
+{
+ int r = ::ceph_os_flistxattr(fd, names, len);
+ return (r < 0 ? -errno : r);
+}
+
+static inline int sys_removexattr(const char *fn, const char *name)
+{
+ int r = ::ceph_os_removexattr(fn, name);
+ return (r < 0 ? -errno : r);
+}
+static inline int sys_fremovexattr(int fd, const char *name)
+{
+ int r = ::ceph_os_fremovexattr(fd, name);
+ return (r < 0 ? -errno : r);
+}
+
+
+// wrappers to chain large values across multiple xattrs
+
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size);
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size);
+int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk=false);
+int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk=false);
+int chain_listxattr(const char *fn, char *names, size_t len);
+int chain_flistxattr(int fd, char *names, size_t len);
+int chain_removexattr(const char *fn, const char *name);
+int chain_fremovexattr(int fd, const char *name);
+
+#endif
#include "include/buffer.h"
#include "test/ObjectMap/KeyValueDBMemory.h"
#include "kv/KeyValueDB.h"
-#include "os/DBObjectMap.h"
-#include "os/HashIndex.h"
+#include "os/filestore/DBObjectMap.h"
+#include "os/filestore/HashIndex.h"
#include <sys/types.h>
#include "global/global_init.h"
#include "common/ceph_argparse.h"
#include "detailed_stat_collector.h"
#include "distribution.h"
#include "global/global_init.h"
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include "testfilestore_backend.h"
#include "common/perf_counters.h"
#include "os/ObjectStore.h"
TYPE(ObjectStore::Transaction)
-#include "os/SequencerPosition.h"
+#include "os/filestore/SequencerPosition.h"
TYPE(SequencerPosition)
#include "os/bluestore/bluestore_types.h"
#include "mon/mon_types.h"
TYPE(LevelDBStoreStats)
-#include "os/DBObjectMap.h"
+#include "os/filestore/DBObjectMap.h"
TYPE(DBObjectMap::_Header)
TYPE(DBObjectMap::State)
#include "common/ceph_argparse.h"
#include "global/global_init.h"
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include <gtest/gtest.h>
class TestFileStore {
#include <map>
#include <boost/scoped_ptr.hpp>
#include "common/debug.h"
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include "common/config.h"
#include "FileStoreDiff.h"
#include <map>
#include <boost/scoped_ptr.hpp>
#include "common/debug.h"
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include "common/config.h"
class FileStoreDiff {
#ifndef FILESTORE_TRACKER_H
#define FILESTORE_TRACKER_H
#include "test/common/ObjectContents.h"
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include "kv/KeyValueDB.h"
#include <boost/scoped_ptr.hpp>
#include <list>
#include <stdio.h>
#include <signal.h>
-#include "os/chain_xattr.h"
+#include "os/filestore/chain_xattr.h"
#include "include/Context.h"
#include "common/errno.h"
#include "common/ceph_argparse.h"
#include <time.h>
#include <sys/mount.h>
#include "os/ObjectStore.h"
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include "os/KeyValueStore.h"
#include "include/Context.h"
#include "common/ceph_argparse.h"
#include <iostream>
#include <sstream>
#include <boost/scoped_ptr.hpp>
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include "global/global_init.h"
#include "common/ceph_argparse.h"
#include "common/debug.h"
#include "common/ceph_argparse.h"
#include "global/global_init.h"
#include "common/debug.h"
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include "DeterministicOpSequence.h"
#include "FileStoreDiff.h"
#include <stdio.h>
#include <signal.h>
-#include "os/LFNIndex.h"
-#include "os/chain_xattr.h"
+#include "os/filestore/LFNIndex.h"
+#include "os/filestore/chain_xattr.h"
#include "common/ceph_argparse.h"
#include "global/global_init.h"
#include <gtest/gtest.h>
#include "global/global_init.h"
#include "common/config.h"
#include "common/Finisher.h"
-#include "os/FileJournal.h"
+#include "os/filestore/FileJournal.h"
#include "include/Context.h"
#include "common/Mutex.h"
#include "common/safe_io.h"
-#include "os/JournalingObjectStore.h"
+#include "os/filestore/JournalingObjectStore.h"
Finisher *finisher;
Cond sync_cond;
#include <iostream>
#include "common/ceph_argparse.h"
#include "common/debug.h"
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include "global/global_init.h"
#include "include/assert.h"
#include <string.h>
#include <iostream>
#include <sstream>
-#include "os/FileStore.h"
+#include "os/filestore/FileStore.h"
#include "include/Context.h"
#include "common/ceph_argparse.h"
#include "global/global_init.h"
#include "global/global_init.h"
#include "os/ObjectStore.h"
-#include "os/FileJournal.h"
+#include "os/filestore/FileJournal.h"
#include "osd/PGLog.h"
#include "osd/OSD.h"
#include "common/errno.h"
#include "global/global_init.h"
-#include "os/DBObjectMap.h"
+#include "os/filestore/DBObjectMap.h"
#include "kv/KeyValueDB.h"
namespace po = boost::program_options;