BuildRequires: libblkid-devel >= 2.17
BuildRequires: cryptsetup-devel
BuildRequires: libcurl-devel
+BuildRequires: libcap-devel
BuildRequires: libcap-ng-devel
BuildRequires: fmt-devel >= 6.2.1
BuildRequires: pkgconfig(libudev)
%dir %{_libdir}/ceph
%dir %{_libdir}/ceph/erasure-code
%{_libdir}/ceph/erasure-code/libec_*.so*
+%dir %{_libdir}/ceph/extblkdev
+%{_libdir}/ceph/extblkdev/libceph_*.so*
%dir %{_libdir}/ceph/compressor
%{_libdir}/ceph/compressor/libceph_*.so*
%{_unitdir}/ceph-crash.service
usr/bin/ceph-kvstore-tool
usr/libexec/ceph/ceph_common.sh
usr/lib/ceph/erasure-code/*
+usr/lib/ceph/extblkdev/*
usr/lib/rados-classes/*
usr/sbin/ceph-create-keys
usr/share/doc/ceph/sample.ceph.conf
libcrypto++-dev <pkg.ceph.crimson>,
libcryptsetup-dev,
libcap-ng-dev,
+ libcap-dev,
libcunit1-dev,
libcurl4-openssl-dev,
libevent-dev,
add_subdirectory(crush)
add_subdirectory(msg)
add_subdirectory(arch)
+add_subdirectory(extblkdev)
set(ceph_common_objs
$<TARGET_OBJECTS:common-auth-objs>
$<TARGET_OBJECTS:common_mountcephfs_objs>
$<TARGET_OBJECTS:crush_objs>)
set(ceph_common_deps
- json_spirit erasure_code arch crc32
+ json_spirit erasure_code extblkdev arch crc32
${LIB_RESOLV}
Boost::thread
Boost::system
ceph_osd.cc)
add_executable(ceph-osd ${ceph_osd_srcs})
-add_dependencies(ceph-osd erasure_code_plugins)
+add_dependencies(ceph-osd erasure_code_plugins extblkdev_plugins)
target_link_libraries(ceph-osd osd os global-static common
${ALLOC_LIBS}
${BLKID_LIBRARIES})
#include "acconfig.h"
#include "common/ceph_mutex.h"
#include "include/common_fwd.h"
+#include "extblkdev/ExtBlkDevInterface.h"
#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
#include "aio/aio.h"
uint64_t get_optimal_io_size() const { return optimal_io_size; }
/// hook to provide utilization of thinly-provisioned device
- virtual bool get_thin_utilization(uint64_t *total, uint64_t *avail) const {
- return false;
+ virtual int get_ebd_state(ExtBlkDevState &state) const {
+ return -ENOENT;
}
virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0;
endif()
if(HAVE_LIBAIO)
- target_link_libraries(blk PUBLIC ${AIO_LIBRARIES})
+ target_link_libraries(blk PUBLIC ${AIO_LIBRARIES} extblkdev)
endif(HAVE_LIBAIO)
if(WITH_SPDK)
support_discard = blkdev_buffered.support_discard();
optimal_io_size = blkdev_buffered.get_optimal_io_size();
this->devname = devname;
- _detect_vdo();
+ // check if any extended block device plugin recognizes this device
+ // detect_vdo has moved into the VDO plugin
+ int rc = extblkdev::detect_device(cct, devname, ebd_impl);
+ if (rc != 0) {
+ dout(20) << __func__ << " no plugin volume maps to " << devname << dendl;
+ }
}
}
_discard_stop();
_pre_close();
- if (vdo_fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
- vdo_fd = -1;
- }
+ extblkdev::release_device(ebd_impl);
for (int i = 0; i < WRITE_LIFE_MAX; i++) {
assert(fd_directs[i] >= 0);
} else {
(*pm)[prefix + "type"] = "ssd";
}
- if (vdo_fd >= 0) {
- (*pm)[prefix + "vdo"] = "true";
- uint64_t total, avail;
- get_vdo_utilization(vdo_fd, &total, &avail);
- (*pm)[prefix + "vdo_physical_size"] = stringify(total);
+ // if compression device detected, collect meta data for device
+ // VDO specific meta data has moved into VDO plugin
+ if (ebd_impl) {
+ ebd_impl->collect_metadata(prefix, pm);
}
{
return 0;
}
-void KernelDevice::_detect_vdo()
-{
- vdo_fd = get_vdo_stats_handle(devname.c_str(), &vdo_name);
- if (vdo_fd >= 0) {
- dout(1) << __func__ << " VDO volume " << vdo_name
- << " maps to " << devname << dendl;
- } else {
- dout(20) << __func__ << " no VDO volume maps to " << devname << dendl;
- }
- return;
-}
-
-bool KernelDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const
+int KernelDevice::get_ebd_state(ExtBlkDevState &state) const
{
- if (vdo_fd < 0) {
- return false;
+ // use compression driver plugin to determine physical size and availability
+ // VDO specific get_thin_utilization has moved into VDO plugin
+ if (ebd_impl) {
+ return ebd_impl->get_state(state);
}
- return get_vdo_utilization(vdo_fd, total, avail);
+ return -ENOENT;
}
int KernelDevice::choose_fd(bool buffered, int write_hint) const
#include "aio/aio.h"
#include "BlockDevice.h"
+#include "extblkdev/ExtBlkDevPlugin.h"
#define RW_IO_MAX (INT_MAX & CEPH_PAGE_MASK)
bool enable_wrt = true;
bool aio, dio;
- int vdo_fd = -1; ///< fd for vdo sysfs directory
- std::string vdo_name;
+ ExtBlkDevInterfaceRef ebd_impl; // structure for retrieving compression state from extended block device
std::string devname; ///< kernel dev name (/sys/block/$devname), if any
void debug_aio_link(aio_t& aio);
void debug_aio_unlink(aio_t& aio);
- void _detect_vdo();
int choose_fd(bool buffered, int write_hint) const;
ceph::unique_leakable_ptr<buffer::raw> create_custom_aligned(size_t len, IOContext* ioc) const;
}
int get_devices(std::set<std::string> *ls) const override;
- bool get_thin_utilization(uint64_t *total, uint64_t *avail) const override;
+ int get_ebd_state(ExtBlkDevState &state) const override;
int read(uint64_t off, uint64_t len, ceph::buffer::list *pbl,
IOContext *ioc,
#include "mon/MonClient.h"
#include "include/ceph_features.h"
#include "common/config.h"
+#include "extblkdev/ExtBlkDevPlugin.h"
#include "mon/MonMap.h"
forker.exit(0);
}
+ {
+ int r = extblkdev::preload(g_ceph_context);
+ if (r < 0) {
+ derr << "Failed preloading extblkdev plugins, error code: " << r << dendl;
+ forker.exit(1);
+ }
+ }
+
string magic;
uuid_d cluster_fsid, osd_fsid;
ceph_release_t require_osd_release = ceph_release_t::unknown;
}
}
-int _get_vdo_stats_handle(const char *devname, std::string *vdo_name)
-{
- int vdo_fd = -1;
-
- // we need to go from the raw devname (e.g., dm-4) to the VDO volume name.
- // currently the best way seems to be to look at /dev/mapper/* ...
- std::string expect = std::string("../") + devname; // expected symlink target
- DIR *dir = ::opendir("/dev/mapper");
- if (!dir) {
- return -1;
- }
- struct dirent *de = nullptr;
- while ((de = ::readdir(dir))) {
- if (de->d_name[0] == '.')
- continue;
- char fn[4096], target[4096];
- snprintf(fn, sizeof(fn), "/dev/mapper/%s", de->d_name);
- int r = readlink(fn, target, sizeof(target));
- if (r < 0 || r >= (int)sizeof(target))
- continue;
- target[r] = 0;
- if (expect == target) {
- snprintf(fn, sizeof(fn), "/sys/kvdo/%s/statistics", de->d_name);
- vdo_fd = ::open(fn, O_RDONLY|O_CLOEXEC); //DIRECTORY);
- if (vdo_fd >= 0) {
- *vdo_name = de->d_name;
- break;
- }
- }
- }
- closedir(dir);
- return vdo_fd;
-}
-
-int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
-{
- std::set<std::string> devs = { devname };
- while (!devs.empty()) {
- std::string dev = *devs.begin();
- devs.erase(devs.begin());
- int fd = _get_vdo_stats_handle(dev.c_str(), vdo_name);
- if (fd >= 0) {
- // yay, it's vdo
- return fd;
- }
- // ok, see if there are constituent devices
- if (dev.find("dm-") == 0) {
- get_dm_parents(dev, &devs);
- }
- }
- return -1;
-}
-
-int64_t get_vdo_stat(int vdo_fd, const char *property)
-{
- int64_t ret = 0;
- int fd = ::openat(vdo_fd, property, O_RDONLY|O_CLOEXEC);
- if (fd < 0) {
- return 0;
- }
- char buf[1024];
- int r = ::read(fd, buf, sizeof(buf) - 1);
- if (r > 0) {
- buf[r] = 0;
- ret = atoll(buf);
- }
- TEMP_FAILURE_RETRY(::close(fd));
- return ret;
-}
-
-bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
-{
- int64_t block_size = get_vdo_stat(fd, "block_size");
- int64_t physical_blocks = get_vdo_stat(fd, "physical_blocks");
- int64_t overhead_blocks_used = get_vdo_stat(fd, "overhead_blocks_used");
- int64_t data_blocks_used = get_vdo_stat(fd, "data_blocks_used");
- if (!block_size
- || !physical_blocks
- || !overhead_blocks_used
- || !data_blocks_used) {
- return false;
- }
- int64_t avail_blocks =
- physical_blocks - overhead_blocks_used - data_blocks_used;
- *total = block_size * physical_blocks;
- *avail = block_size * avail_blocks;
- return true;
-}
-
std::string _decode_model_enc(const std::string& in)
{
auto v = boost::replace_all_copy(in, "\\x20", " ");
{
}
-int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
-{
- return -1;
-}
-
-int64_t get_vdo_stat(int fd, const char *property)
-{
- return 0;
-}
-
-bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
-{
- return false;
-}
-
std::string get_device_id(const std::string& devname,
std::string *err)
{
{
}
-int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
-{
- return -1;
-}
-
-int64_t get_vdo_stat(int fd, const char *property)
-{
- return 0;
-}
-
-bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
-{
- return false;
-}
-
std::string get_device_id(const std::string& devname,
std::string *err)
{
{
}
-int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
-{
- return -1;
-}
-
-int64_t get_vdo_stat(int fd, const char *property)
-{
- return 0;
-}
-
-bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
-{
- return false;
-}
-
std::string get_device_id(const std::string& devname,
std::string *err)
{
extern void get_raw_devices(const std::string& in,
std::set<std::string> *ls);
-// for VDO
-/// return an op fd for the sysfs stats dir, if this is a VDO device
-extern int get_vdo_stats_handle(const char *devname, std::string *vdo_name);
-extern int64_t get_vdo_stat(int fd, const char *property);
-extern bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail);
-
class BlkDev {
public:
BlkDev(int fd);
- osd_numa_auto_affinity
flags:
- startup
+- name: set_keepcaps
+ type: bool
+ level: advanced
+ desc: set the keepcaps flag before changing UID, preserving the permitted capability set
+ long_desc: When ceph switches from root to the ceph uid, all capabilities in all sets are erased. If
+ a component that is capability aware needs a specific capability, the keepcaps flag maintains
+ the permitted capability set, allowing the capabilities in the effective set to be activated as needed.
+ default: false
+ flags:
+ - startup
- name: osd_smart_report_timeout
type: uint
level: advanced
default: 512
fmt_desc: The maximum number of objects per backfill scan.p
with_legacy: true
+- name: osd_extblkdev_plugins
+ type: str
+ level: advanced
+ desc: extended block device plugins to load, provide compression feedback at runtime
+ default: vdo
+ flags:
+ - startup
# minimum number of peers
- name: osd_heartbeat_min_peers
type: int
{
}
-int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
-{
- return -1;
-}
-
-int64_t get_vdo_stat(int fd, const char *property)
-{
- return 0;
-}
-
-bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
-{
- return false;
-}
-
std::string get_device_id(const std::string& devname,
std::string *err)
{
--- /dev/null
+## extended block device plugins
+
+set(extblkdev_plugin_dir ${CEPH_INSTALL_PKGLIBDIR}/extblkdev)
+
+add_subdirectory(vdo)
+
+add_library(extblkdev STATIC ExtBlkDevPlugin.cc)
+
+# libcap is only used by the capability handling code in ExtBlkDevPlugin.cc,
+# which is compiled under "#ifdef __linux__". Link it on Linux only, so that
+# other non-Windows platforms do not need libcap to build.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+  target_link_libraries(extblkdev cap)
+endif()
+
+add_custom_target(extblkdev_plugins DEPENDS
+  ceph_ebd_vdo)
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * (C) Copyright IBM Corporation 2022
+ * Author: Martin Ohmacht <mohmacht@us.ibm.com>
+ *
+ * Based on the file ceph/src/erasure-code/ErasureCodeInterface.h
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_EXT_BLK_DEV_INTERFACE_H
+#define CEPH_EXT_BLK_DEV_INTERFACE_H
+
+/*! @file ExtBlkDevInterface.h
+ @brief Interface provided by extended block device plugins
+
+ Block devices with vendor-specific capabilities rely on plugins implementing
+ **ExtBlkDevInterface** to provide access to their capabilities.
+
+ Methods returning an **int** return **0** on success and a
+ negative value on error.
+ */
+
+#include <string>
+#include <map>
+#include <ostream>
+#include <memory>
+#ifdef __linux__
+#include <sys/capability.h>
+#else
+typedef void *cap_t;
+#endif
+
+#include "common/PluginRegistry.h"
+
+namespace ceph {
+  /**
+   * Capacity snapshot reported by an extended block device plugin.
+   *
+   * Holds total and available space for both the logical (exposed) device
+   * and the physical storage behind it. All four values start at 0 and are
+   * filled in by a plugin's get_state() implementation through the setters.
+   * The getters are const so the state can be read through a const
+   * reference.
+   */
+  class ExtBlkDevState {
+    uint64_t logical_total = 0;
+    uint64_t logical_avail = 0;
+    uint64_t physical_total = 0;
+    uint64_t physical_avail = 0;
+  public:
+    uint64_t get_logical_total() const { return logical_total; }
+    uint64_t get_logical_avail() const { return logical_avail; }
+    uint64_t get_physical_total() const { return physical_total; }
+    uint64_t get_physical_avail() const { return physical_avail; }
+    void set_logical_total(uint64_t alogical_total) { logical_total = alogical_total; }
+    void set_logical_avail(uint64_t alogical_avail) { logical_avail = alogical_avail; }
+    void set_physical_total(uint64_t aphysical_total) { physical_total = aphysical_total; }
+    void set_physical_avail(uint64_t aphysical_avail) { physical_avail = aphysical_avail; }
+  };
+
+
+ class ExtBlkDevInterface {
+ public:
+ virtual ~ExtBlkDevInterface() {}
+
+ /**
+ * Initialize the instance if device logdevname is supported
+ *
+ * Return 0 on success or a negative errno on error
+ *
+ * @param [in] logdevname name of device to check for support by this plugin
+ * @return 0 on success or a negative errno on error.
+ */
+ virtual int init(const std::string& logdevname) = 0;
+
+ /**
+ * Return the name of the underlying device detected by the **init** method
+ *
+ * @return the name of the underlying device
+ */
+ virtual const std::string& get_devname() const = 0;
+
+ /**
+ * Provide status of underlying physical storage after compression
+ *
+ * Return 0 on success or a negative errno on error.
+ *
+ * @param [out] state current state of the underlying device
+ * @return 0 on success or a negative errno on error.
+ */
+ virtual int get_state(ExtBlkDevState& state) = 0;
+
+ /**
+ * Populate property map with meta data of device.
+ *
+ * @param [in] prefix prefix to be prepended to all map keys added by this method
+ * @param [in,out] pm property map of the device, to be extended by attributes detected by this plugin
+ * @return 0 on success or a negative errno on error.
+ */
+ virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) = 0;
+ };
+
+ typedef std::shared_ptr<ExtBlkDevInterface> ExtBlkDevInterfaceRef;
+
+ class ExtBlkDevPlugin : public Plugin {
+ public:
+
+ explicit ExtBlkDevPlugin(CephContext *cct) : Plugin(cct) {}
+ virtual ~ExtBlkDevPlugin() {}
+
+ /**
+ * Indicate plugin-required capabilities in permitted set
+ * If a plugin requires a capability to be active in the
+ * permitted set when invoked, it must indicate so by setting
+ * the required flags in the cap_t structure passed into this method.
+ * The cap_t structure is empty when passed into the method, and only the
+ * method's modifications to the permitted set are used by ceph.
+ * The plugin must elevate the capabilities into the effective
+ * set at a later point when needed during the invocation of its
+ * other methods, and is responsible for restoring the effective set
+ * before returning from the method.
+ *
+ * @param [out] caps capability set indicating the necessary capabilities
+ * @return 0 on success or a negative errno on error.
+ */
+ virtual int get_required_cap_set(cap_t caps) = 0;
+
+ /**
+ * Factory method, creating ExtBlkDev instances
+ *
+ * @param [in] logdevname name of the logical device, may be composed of physical devices
+ * @param [out] ext_blk_dev object created on successful device support detection
+ * @return 0 on success or a negative errno on error.
+ */
+ virtual int factory(const std::string& logdevname,
+ ExtBlkDevInterfaceRef& ext_blk_dev) = 0;
+ };
+
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * (C) Copyright IBM Corporation 2022
+ * Author: Martin Ohmacht <mohmacht@us.ibm.com>
+ *
+ * Based on the file ceph/src/erasure-code/ErasureCodePlugin.cc
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+
+#include "ceph_ver.h"
+#include "ExtBlkDevPlugin.h"
+#include "common/errno.h"
+#include "include/dlfcn_compat.h"
+#include "include/str_list.h"
+#include "include/ceph_assert.h"
+#include "common/ceph_context.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_bdev
+#define dout_context cct
+
+using namespace std;
+
+namespace ceph {
+
+ namespace extblkdev {
+
+
+#ifdef __linux__
+ // iterate across plugins and determine each capability's requirement
+ // merge requirements into merge_caps set
+    int get_required_caps(CephContext *cct, cap_t &merge_caps)
+    {
+      // Scratch capability set handed to one plugin at a time. It is freed
+      // on every exit path by the guard declared right after allocation.
+      cap_t scratch = cap_init();
+      if (scratch == nullptr) {
+        return -errno;
+      }
+      auto free_scratch = make_scope_guard([&] { cap_free(scratch); });
+
+      auto registry = cct->get_plugin_registry();
+      std::lock_guard l(registry->lock);
+
+      // nothing to merge unless extblkdev type plugins were preloaded
+      const auto ebd_plugins = registry->plugins.find("extblkdev");
+      if (ebd_plugins == registry->plugins.end()) {
+        return 0;
+      }
+
+      for (auto& [plugin_name, plugin] : ebd_plugins->second) {
+        // every plugin starts from an empty set
+        if (cap_clear(scratch) < 0) {
+          return -errno;
+        }
+        auto ebdplugin = dynamic_cast<ExtBlkDevPlugin*>(plugin);
+        if (ebdplugin == nullptr) {
+          derr << __func__ << " Is not an extblkdev plugin: " << plugin_name << dendl;
+          return -ENOENT;
+        }
+        // let the plugin mark the capabilities it needs
+        const int rc = ebdplugin->get_required_cap_set(scratch);
+        if (rc != 0) {
+          return rc;
+        }
+        // copy every permitted bit the plugin set into the merged set
+        for (int cap = 0; cap <= CAP_LAST_CAP; ++cap) {
+          cap_flag_value_t flag;
+          if (cap_get_flag(scratch, cap, CAP_PERMITTED, &flag) < 0) {
+            return -errno;
+          }
+          if (flag == CAP_CLEAR) {
+            continue;
+          }
+          cap_value_t wanted = cap;
+          if (cap_set_flag(merge_caps, CAP_PERMITTED, 1, &wanted, CAP_SET) < 0) {
+            return -errno;
+          }
+        }
+      }
+      return 0;
+    }
+
+    /**
+     * Trim away all capabilities of this process that are not explicitly
+     * set in merge_caps.
+     *
+     * Every capability that is clear in the permitted set of merge_caps but
+     * set in the permitted set of the process is cleared from both the
+     * permitted and the effective set of the process. cap_set_proc() is only
+     * called if at least one capability was actually cleared.
+     *
+     * @param [in] cct context used for logging
+     * @param [in] merge_caps capability set whose permitted flags are kept
+     * @return 0 on success or a negative errno on error.
+     */
+    int trim_caps(CephContext *cct, cap_t &merge_caps)
+    {
+      cap_t proc_caps = nullptr;
+      auto close_caps_on_return = make_scope_guard([&] {
+        if (proc_caps != nullptr) {
+          cap_free(proc_caps);
+        }
+      });
+      bool changed = false;
+      // get process capability set
+      proc_caps = cap_get_proc();
+      if (proc_caps == nullptr) {
+        // capture errno before logging, the log call may overwrite it
+        const int err = errno;
+        dout(1) << " cap_get_proc failed with errno: " << err << dendl;
+        return -err;
+      }
+      {
+        char *cap_str = cap_to_text(proc_caps, 0);
+        if (cap_str != nullptr){
+          dout(10) << " cap_get_proc yields: " << cap_str << dendl;
+          cap_free(cap_str);
+        }
+      }
+      // iterate over capabilities
+      for (int i = 0; i <= CAP_LAST_CAP; ++i) {
+        cap_flag_value_t val;
+        if (cap_get_flag(merge_caps, i, CAP_PERMITTED, &val) < 0) {
+          return -errno;
+        }
+        if (val == CAP_CLEAR) {
+          if (cap_get_flag(proc_caps, i, CAP_PERMITTED, &val) < 0) {
+            return -errno;
+          }
+          if (val != CAP_CLEAR) {
+            // if bit clear in merged set, but set in process set, clear in process set
+            changed = true;
+            cap_value_t arr[1];
+            arr[0] = i;
+            if (cap_set_flag(proc_caps, CAP_PERMITTED, 1, arr, CAP_CLEAR) < 0) {
+              return -errno;
+            }
+            if (cap_set_flag(proc_caps, CAP_EFFECTIVE, 1, arr, CAP_CLEAR) < 0) {
+              return -errno;
+            }
+          }
+        }
+      }
+      // apply reduced capability set to process
+      if (changed) {
+        char *cap_str = cap_to_text(proc_caps, 0);
+        if (cap_str != nullptr){
+          dout(10) << " new caps for cap_set_proc: " << cap_str << dendl;
+          cap_free(cap_str);
+        }
+        if (cap_set_proc(proc_caps) < 0) {
+          // capture errno before logging, the log call may overwrite it
+          const int err = errno;
+          dout(1) << " cap_set_proc failed with errno: " << err << dendl;
+          return -err;
+        }
+      }
+      return 0;
+    }
+
+    int limit_caps(CephContext *cct)
+    {
+      // Set that accumulates the capabilities requested by all plugins;
+      // freed on every exit path once it has been allocated.
+      cap_t wanted = cap_init();
+      if (wanted == nullptr) {
+        return -errno;
+      }
+      auto free_wanted = make_scope_guard([&] { cap_free(wanted); });
+
+      // collect what the plugins need, then drop everything else
+      if (int rc = get_required_caps(cct, wanted); rc != 0) {
+        return rc;
+      }
+      return trim_caps(cct, wanted);
+    }
+#endif
+
+    // Load every extblkdev plugin listed in the osd_extblkdev_plugins option.
+    // On Linux, unless the process still runs as root, the process
+    // capabilities are afterwards limited via limit_caps().
+    // Returns 0 on success, otherwise the error of the first failing step.
+    int preload(CephContext *cct)
+    {
+      const auto& conf = cct->_conf;
+      std::string configured = conf.get_val<std::string>("osd_extblkdev_plugins");
+      dout(10) << "starting preload of extblkdev plugins: " << configured << dendl;
+
+      std::list<std::string> names;
+      get_str_list(configured, names);
+
+      auto registry = cct->get_plugin_registry();
+      {
+        // the registry lock is held for the whole load loop
+        std::lock_guard l(registry->lock);
+        for (auto& plugin_name : names) {
+          dout(10) << "starting load of extblkdev plugin: " << plugin_name << dendl;
+          const int rc = registry->load("extblkdev", std::string("ebd_") + plugin_name);
+          if (rc != 0) {
+            derr << __func__ << " failed preloading extblkdev plugin: " << plugin_name << dendl;
+            return rc;
+          }
+          dout(10) << "successful load of extblkdev plugin: " << plugin_name << dendl;
+        }
+      }
+#ifdef __linux__
+      // while still running as root the privileges are meant to be used,
+      // so capabilities are only trimmed for a non-root effective uid
+      return geteuid() == 0 ? 0 : limit_caps(cct);
+#else
+      return 0;
+#endif
+    }
+
+
+    // Ask each preloaded extblkdev plugin in turn whether it supports
+    // logdevname. The first plugin whose factory succeeds fills ebd_impl and
+    // ends the scan. Returns 0 in that case, -ENOENT if no extblkdev plugin
+    // is registered or an entry is not an ExtBlkDevPlugin, otherwise the
+    // result of the last factory call.
+    int detect_device(CephContext *cct,
+                      const std::string &logdevname,
+                      ExtBlkDevInterfaceRef& ebd_impl)
+    {
+      auto registry = cct->get_plugin_registry();
+      std::lock_guard l(registry->lock);
+      auto ebd_plugins = registry->plugins.find("extblkdev");
+      if (ebd_plugins == registry->plugins.end()) {
+        return -ENOENT;
+      }
+
+      int rc = -ENOENT;
+      for (auto& [plugin_name, plugin] : ebd_plugins->second) {
+        dout(10) << __func__ << " Trying to detect block device " << logdevname
+                 << " using plugin " << plugin_name << dendl;
+        auto ebdplugin = dynamic_cast<ExtBlkDevPlugin*>(plugin);
+        if (ebdplugin == nullptr) {
+          derr << __func__ << " Is not an extblkdev plugin: " << plugin_name << dendl;
+          return -ENOENT;
+        }
+        rc = ebdplugin->factory(logdevname, ebd_impl);
+        if (rc == 0) {
+          dout(1) << __func__ << " using plugin " << plugin_name << ", " << "volume " << ebd_impl->get_devname()
+                  << " maps to " << logdevname << dendl;
+          return 0;
+        }
+      }
+      dout(10) << __func__ << " no plugin volume maps to " << logdevname << dendl;
+      return rc;
+    }
+
+    // Drop the caller's reference to the device object; always returns 0.
+    int release_device(ExtBlkDevInterfaceRef& ebd_impl)
+    {
+      // reset() on an empty reference is a no-op, so no check is needed
+      ebd_impl.reset();
+      return 0;
+    }
+
+ }
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * (C) Copyright IBM Corporation 2022
+ * Author: Martin Ohmacht <mohmacht@us.ibm.com>
+ *
+ * Based on the file ceph/src/erasure-code/ErasureCodePlugin.h
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_EXT_BLK_DEV_PLUGIN_H
+#define CEPH_EXT_BLK_DEV_PLUGIN_H
+
+#include "ExtBlkDevInterface.h"
+
+namespace ceph {
+
+ namespace extblkdev {
+ int preload(CephContext *cct);
+ int detect_device(CephContext *cct,
+ const std::string &logdevname,
+ ExtBlkDevInterfaceRef& ebd_impl);
+ int release_device(ExtBlkDevInterfaceRef& ebd_impl);
+ }
+}
+
+#endif
--- /dev/null
+# vdo plugin
+
+set(vdo_srcs
+ ExtBlkDevPluginVdo.cc
+ ExtBlkDevVdo.cc
+)
+
+add_library(ceph_ebd_vdo SHARED ${vdo_srcs})
+install(TARGETS ceph_ebd_vdo DESTINATION ${extblkdev_plugin_dir})
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * (C) Copyright IBM Corporation 2022
+ * Author: Martin Ohmacht <mohmacht@us.ibm.com>
+ *
+ * Based on the file src/erasure-code/clay/ErasureCodePluginClay.cc
+ * Copyright (C) 2018 Indian Institute of Science <office.ece@iisc.ac.in>
+ *
+ * Author: Myna Vajha <mynaramana@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "ceph_ver.h"
+#include "ExtBlkDevPluginVdo.h"
+#include "common/ceph_context.h"
+
+
+// The VDO plugin requests no capabilities: caps is left unmodified and 0 is returned.
+int ExtBlkDevPluginVdo::get_required_cap_set(cap_t caps)
+{
+ return 0;
+}
+
+
+/**
+ * Create a VDO device object for logdevname.
+ *
+ * A new ExtBlkDevVdo is initialized with logdevname. ext_blk_dev is only
+ * assigned if initialization succeeds; on failure it is left untouched.
+ *
+ * @param [in] logdevname name of the logical device to probe
+ * @param [out] ext_blk_dev receives the new device object on success
+ * @return 0 on success, otherwise the error returned by ExtBlkDevVdo::init.
+ */
+int ExtBlkDevPluginVdo::factory(const std::string& logdevname,
+                                ceph::ExtBlkDevInterfaceRef& ext_blk_dev)
+{
+  // shared ownership from the start, the object is released automatically
+  // on the error path
+  auto vdo = std::make_shared<ExtBlkDevVdo>(cct);
+  const int r = vdo->init(logdevname);
+  if (r != 0) {
+    return r;
+  }
+  ext_blk_dev = vdo;
+  return 0;
+}
+
+const char *__ceph_plugin_version() { return CEPH_GIT_NICE_VER; }
+
+/**
+ * Plugin entry point: create the VDO plugin object and register it.
+ *
+ * The plugin object is handed over to the registry only if add() succeeds;
+ * otherwise it is destroyed here. Allocation failure is reported by an
+ * exception from operator new, so there is no null result to check.
+ *
+ * @param [in] cct context providing the plugin registry
+ * @param [in] type plugin type to register under
+ * @param [in] name plugin name to register under
+ * @return the result of the registry's add() call (0 on success).
+ */
+int __ceph_plugin_init(CephContext *cct,
+                       const std::string& type,
+                       const std::string& name)
+{
+  auto plg = std::make_unique<ExtBlkDevPluginVdo>(cct);
+  const int rc = cct->get_plugin_registry()->add(type, name, plg.get());
+  if (rc == 0) {
+    // the registry keeps the raw pointer from now on
+    plg.release();
+  }
+  return rc;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * (C) Copyright IBM Corporation 2022
+ * Author: Martin Ohmacht <mohmacht@us.ibm.com>
+ *
+ * Based on the file src/erasure-code/clay/ErasureCodePluginClay.h
+ * Copyright (C) 2018 Indian Institute of Science <office.ece@iisc.ac.in>
+ *
+ * Author: Myna Vajha <mynaramana@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_EXT_BLK_DEV_PLUGIN_VDO_H
+#define CEPH_EXT_BLK_DEV_PLUGIN_VDO_H
+
+#include "ExtBlkDevVdo.h"
+
+class ExtBlkDevPluginVdo : public ceph::ExtBlkDevPlugin {
+public:
+ explicit ExtBlkDevPluginVdo(CephContext *cct) : ExtBlkDevPlugin(cct) {}
+ int get_required_cap_set(cap_t caps) override;
+ int factory(const std::string& logdevname,
+ ceph::ExtBlkDevInterfaceRef& ext_blk_dev) override;
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * (C) Copyright IBM Corporation 2022
+ * Author: Martin Ohmacht <mohmacht@us.ibm.com>
+ *
+ * Based on the file ceph/src/common/blkdev.cc
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "ExtBlkDevVdo.h"
+#include "common/blkdev.h"
+#include "include/stringify.h"
+#include <errno.h>
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_bdev
+#define dout_context cct
+#undef dout_prefix
+#define dout_prefix *_dout << "vdo(" << this << ") "
+
+
+// Look for a /dev/mapper entry that is a symlink to "../<devname>" and whose
+// /sys/kvdo/<entry>/statistics directory can be opened. On a match the entry
+// name and the open directory fd are stored in name and vdo_dir_fd.
+// Returns 0 on a match, -errno if /dev/mapper cannot be opened, and -ENOENT
+// if no entry matches.
+int ExtBlkDevVdo::_get_vdo_stats_handle(const std::string& devname)
+{
+  dout(10) << __func__ << " VDO init checking device: " << devname << dendl;
+
+  DIR *mapper = ::opendir("/dev/mapper");
+  if (mapper == nullptr) {
+    return -errno;
+  }
+
+  // we need to go from the raw devname (e.g., dm-4) to the VDO volume name,
+  // so compare each symlink target against the expected relative path
+  const std::string expect = std::string("../") + devname;
+  int rc = -ENOENT;
+  for (struct dirent *entry = ::readdir(mapper);
+       entry != nullptr;
+       entry = ::readdir(mapper)) {
+    if (entry->d_name[0] == '.') {
+      continue;
+    }
+    char path[4096], link_target[4096];
+    snprintf(path, sizeof(path), "/dev/mapper/%s", entry->d_name);
+    const int len = readlink(path, link_target, sizeof(link_target));
+    if (len < 0 || len >= (int)sizeof(link_target)) {
+      continue;
+    }
+    link_target[len] = 0;
+    if (expect != link_target) {
+      continue;
+    }
+    snprintf(path, sizeof(path), "/sys/kvdo/%s/statistics", entry->d_name);
+    const int stats_fd = ::open(path, O_RDONLY|O_CLOEXEC);
+    if (stats_fd < 0) {
+      continue;
+    }
+    name = entry->d_name;
+    vdo_dir_fd = stats_fd;
+    rc = 0;
+    break;
+  }
+  closedir(mapper);
+  return rc;
+}
+
+// Starting with logdevname, probe each pending device for a VDO statistics
+// directory. Devices named "dm-*" that do not match contribute their parents
+// (via get_dm_parents) to the pending set.
+// Returns 0 as soon as one device matches, -ENOENT once the set is exhausted.
+int ExtBlkDevVdo::get_vdo_stats_handle()
+{
+  std::set<std::string> pending{logdevname};
+  while (!pending.empty()) {
+    const auto first = pending.begin();
+    const std::string candidate = *first;
+    pending.erase(first);
+    if (_get_vdo_stats_handle(candidate) == 0) {
+      // yay, it's vdo
+      return 0;
+    }
+    // ok, see if there are constituent devices
+    if (candidate.find("dm-") == 0) {
+      get_dm_parents(candidate, &pending);
+    }
+  }
+  return -ENOENT;
+}
+
+// Read the file `property` relative to vdo_dir_fd and parse its content with
+// atoll. Returns 0 if the file cannot be opened or nothing could be read.
+int64_t ExtBlkDevVdo::get_vdo_stat(const char *property)
+{
+  const int fd = ::openat(vdo_dir_fd, property, O_RDONLY|O_CLOEXEC);
+  if (fd < 0) {
+    return 0;
+  }
+  char buf[1024];
+  int64_t value = 0;
+  // leave room for the terminating NUL
+  const int nread = ::read(fd, buf, sizeof(buf) - 1);
+  if (nread > 0) {
+    buf[nread] = 0;
+    value = atoll(buf);
+  }
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  return value;
+}
+
+
+int ExtBlkDevVdo::init(const std::string& alogdevname)
+{
+ logdevname = alogdevname;
+ // locate and open the VDO statistics directory for logdevname; the result of that lookup (0 or a negative errno) is returned as is
+ return get_vdo_stats_handle();
+}
+
+
+// Read the VDO statistics and fill state with logical and physical
+// total/available sizes (block counts multiplied by block_size).
+// Returns -1 and leaves state untouched if block_size, physical_blocks,
+// overhead_blocks_used, data_blocks_used or logical_blocks reads as zero;
+// returns 0 otherwise.
+int ExtBlkDevVdo::get_state(ceph::ExtBlkDevState& state)
+{
+  const int64_t block_size = get_vdo_stat("block_size");
+  const int64_t physical_blocks = get_vdo_stat("physical_blocks");
+  const int64_t overhead_blocks_used = get_vdo_stat("overhead_blocks_used");
+  const int64_t data_blocks_used = get_vdo_stat("data_blocks_used");
+  const int64_t logical_blocks = get_vdo_stat("logical_blocks");
+  const int64_t logical_blocks_used = get_vdo_stat("logical_blocks_used");
+
+  // logical_blocks_used is deliberately not part of this check
+  const bool any_zero = block_size == 0
+    || physical_blocks == 0
+    || overhead_blocks_used == 0
+    || data_blocks_used == 0
+    || logical_blocks == 0;
+  if (any_zero) {
+    dout(1) << __func__ << " VDO sysfs provided zero value for at least one statistic: " << dendl;
+    dout(1) << __func__ << " VDO block_size: " << block_size << dendl;
+    dout(1) << __func__ << " VDO physical_blocks: " << physical_blocks << dendl;
+    dout(1) << __func__ << " VDO overhead_blocks_used: " << overhead_blocks_used << dendl;
+    dout(1) << __func__ << " VDO data_blocks_used: " << data_blocks_used << dendl;
+    dout(1) << __func__ << " VDO logical_blocks: " << logical_blocks << dendl;
+    return -1;
+  }
+
+  const int64_t physical_avail_blocks =
+    physical_blocks - overhead_blocks_used - data_blocks_used;
+  const int64_t logical_avail_blocks = logical_blocks - logical_blocks_used;
+
+  state.set_logical_total(block_size * logical_blocks);
+  state.set_logical_avail(block_size * logical_avail_blocks);
+  state.set_physical_total(block_size * physical_blocks);
+  state.set_physical_avail(block_size * physical_avail_blocks);
+  return 0;
+}
+
+// Add "<prefix>vdo" = "true" and "<prefix>vdo_physical_size" to the property
+// map. If get_state() fails, its error is returned and the map is not
+// modified.
+int ExtBlkDevVdo::collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm)
+{
+  ceph::ExtBlkDevState state;
+  if (const int rc = get_state(state); rc != 0) {
+    return rc;
+  }
+  auto& props = *pm;
+  props[prefix + "vdo"] = "true";
+  props[prefix + "vdo_physical_size"] = stringify(state.get_physical_total());
+  return 0;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * (C) Copyright IBM Corporation 2022
+ * Author: Martin Ohmacht <mohmacht@us.ibm.com>
+ *
+ * Based on the file ceph/src/common/blkdev.cc
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * And also based on the file src/erasure-code/clay/ErasureCodeClay.h
+ * Copyright (C) 2018 Indian Institute of Science <office.ece@iisc.ac.in>
+ *
+ * Author: Myna Vajha <mynaramana@gmail.com>
+ *
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_EXT_BLK_DEV_VDO_H
+#define CEPH_EXT_BLK_DEV_VDO_H
+
+#include "extblkdev/ExtBlkDevInterface.h"
+#include "include/compat.h"
+
+/**
+ * ExtBlkDevInterface implementation for VDO volumes.
+ *
+ * init() resolves a logical device name to a VDO volume and keeps a file
+ * descriptor for the volume's sysfs statistics directory. That descriptor
+ * is used to read individual statistics and is closed by the destructor.
+ * Because the object owns the descriptor, it cannot be copied.
+ */
+class ExtBlkDevVdo final : public ceph::ExtBlkDevInterface
+{
+  int vdo_dir_fd = -1;    ///< fd for vdo sysfs directory
+  std::string name;       ///< name of the underlying vdo device
+  std::string logdevname; ///< name of the top level logical device
+  CephContext *cct;
+public:
+  explicit ExtBlkDevVdo(CephContext *cct) : cct(cct) {}
+  // copying would make two objects close the same descriptor
+  ExtBlkDevVdo(const ExtBlkDevVdo&) = delete;
+  ExtBlkDevVdo& operator=(const ExtBlkDevVdo&) = delete;
+  ~ExtBlkDevVdo() override {
+    if (vdo_dir_fd >= 0)
+      VOID_TEMP_FAILURE_RETRY(::close(vdo_dir_fd));
+  }
+  /// probe a single device name; stores name and vdo_dir_fd on success
+  int _get_vdo_stats_handle(const std::string& devname);
+  /// probe logdevname and, for dm devices, their parents
+  int get_vdo_stats_handle();
+  /// read one statistic relative to vdo_dir_fd; 0 if it cannot be read
+  int64_t get_vdo_stat(const char *property);
+  int init(const std::string& logdevname) override;
+  const std::string& get_devname() const override { return name; }
+  int get_state(ceph::ExtBlkDevState& state) override;
+  int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) override;
+};
+
+#endif
#include "common/signal.h"
#include "common/version.h"
#include "erasure-code/ErasureCodePlugin.h"
+#include "extblkdev/ExtBlkDevPlugin.h"
#include "global/global_context.h"
#include "global/global_init.h"
#include "global/pidfile.h"
<< std::endl;
exit(1);
}
+#if defined(HAVE_SYS_PRCTL_H)
+ if (g_conf().get_val<bool>("set_keepcaps")) {
+ if (prctl(PR_SET_KEEPCAPS, 1) == -1) {
+ cerr << "warning: unable to set keepcaps flag: " << cpp_strerror(errno) << std::endl;
+ }
+ }
+#endif
if (setuid(uid) != 0) {
cerr << "unable to setuid " << uid << ": " << cpp_strerror(errno)
<< std::endl;
- buf->omap_allocated;
}
- uint64_t thin_total, thin_avail;
- if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
- buf->total += thin_total;
+ ExtBlkDevState ebd_state;
+ int rc = bdev->get_ebd_state(ebd_state);
+ if (rc == 0) {
+ buf->total += ebd_state.get_physical_total();
// we are limited by both the size of the virtual device and the
// underlying physical device.
- bfree = std::min(bfree, thin_avail);
+ bfree = std::min(bfree, ebd_state.get_physical_avail());
- buf->allocated = thin_total - thin_avail;
+ buf->allocated = ebd_state.get_physical_total() - ebd_state.get_physical_avail();
} else {
buf->total += bdev->get_size();
}
(*pm)["backend_filestore_dev_node"] = string(dev_node);
devname = dev_node;
}
- if (rc == 0 && vdo_fd >= 0) {
- (*pm)["vdo"] = "true";
- (*pm)["vdo_physical_size"] =
- stringify(4096 * get_vdo_stat(vdo_fd, "physical_blocks"));
+ // if compression device detected, collect meta data for device
+ // VDO specific meta data has moved into VDO plugin
+ if (rc == 0 && ebd_impl) {
+ ebd_impl->collect_metadata("", pm);
}
if (journal) {
journal->collect_metadata(pm);
buf0->omap_allocated += object_map->get_db()->get_estimated_size(kv_usage);
}
- uint64_t thin_total, thin_avail;
- if (get_vdo_utilization(vdo_fd, &thin_total, &thin_avail)) {
- buf0->total = thin_total;
- bfree = std::min(bfree, thin_avail);
- buf0->allocated = thin_total - thin_avail;
- buf0->data_stored = bfree;
+ if (ebd_impl) {
+ ExtBlkDevState state;
+ int rc = ebd_impl->get_state(state);
+ if (rc == 0){
+ buf0->total = state.get_physical_total();
+ bfree = std::min(bfree, state.get_physical_avail());
+ buf0->allocated = state.get_physical_total() - state.get_physical_avail();
+ buf0->data_stored = bfree;
+ } else {
+ buf0->total = buf.f_blocks * buf.f_bsize;
+ buf0->allocated = bfree;
+ buf0->data_stored = bfree;
+ }
} else {
buf0->total = buf.f_blocks * buf.f_bsize;
buf0->allocated = bfree;
return r;
}
- // vdo
- {
- char dev_node[PATH_MAX];
- if (int rc = BlkDev{fsid_fd}.wholedisk(dev_node, PATH_MAX); rc == 0) {
- vdo_fd = get_vdo_stats_handle(dev_node, &vdo_name);
- if (vdo_fd >= 0) {
- dout(0) << __func__ << " VDO volume " << vdo_name << " for " << dev_node
- << dendl;
- }
- }
+ // check if any extended block device plugin recognizes this device
+ // detect_vdo has moved into the VDO plugin
+ int rc = extblkdev::detect_device(cct, devname, ebd_impl);
+ if (rc != 0) {
+ dout(20) << __func__ << " no plugin volume maps to " << devname << dendl;
}
// test xattrs
(*it)->stop();
}
- if (vdo_fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
- vdo_fd = -1;
- }
+ extblkdev::release_device(ebd_impl);
if (fsid_fd >= 0) {
VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
fsid_fd = -1;
#include "WBThrottle.h"
#include "include/uuid.h"
+#include "extblkdev/ExtBlkDevPlugin.h"
#if defined(__linux__)
# ifndef BTRFS_SUPER_MAGIC
std::string devname;
- int vdo_fd = -1;
- std::string vdo_name;
+ ExtBlkDevInterfaceRef ebd_impl; // structure for retrieving compression state from extended block device
deque<uint64_t> snaps;