From: Lucian Petrut Date: Thu, 30 Jul 2020 11:40:31 +0000 (+0000) Subject: rbd: allow mounting images on Windows X-Git-Tag: v16.1.0~543^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4af1ae2093c6dfc915b043afb2e53a67d2ee1e0d;p=ceph.git rbd: allow mounting images on Windows This change will allow mapping rbd images on Windows, leveraging the WNBD[1] Virtual Storport Miniport driver [2]. The behavior and CLI is similar to the Linux rbd-nbd, with a few notable differences: * device paths cannot be requested. The disk number and path will be picked by Windows. If a device path is provided by the user when mapping an image, it will be used as an identifier, which can also be used when unmapping the image. * the "show" command was added, which describes a specific mapping. This can be used for retrieving the disk path. * the "service" command was added, allowing rbd-wnbd to run as a Windows service. All mappings are currently perisistent, being recreated when the service stops, unless explicitly unmapped. The service disconnects the mappings when being stopped. * the "list" command also includes a "status" column. The purpose of the "service" mode is to ensure that mappings survive reboots and that the Windows service start order can be adjusted so that rbd images can be mapped before starting services that may depend on it, such as VMMS. The mapped images can either be consumed by the host directly or exposed to Hyper-V VMs. While at it, we'll skip building rbd-mirror as it's quite unlikely that this daemon is going to be used on Windows for now. [1] https://github.com/cloudbase/wnbd [2] https://docs.microsoft.com/en-us/windows-hardware/drivers/storage/overview-of-storage-virtual-miniport-drivers Signed-off-by: Lucian Petrut Signed-off-by: Alin Gabriel Serdean --- diff --git a/src/common/win32/service.cc b/src/common/win32/service.cc index df0e834137cb..20ee5ccae6c5 100644 --- a/src/common/win32/service.cc +++ b/src/common/win32/service.cc @@ -61,18 +61,22 @@ void WINAPI ServiceBase::run() s_service->hstatus = RegisterServiceCtrlHandler( "", (LPHANDLER_FUNCTION)control_handler); if (!s_service->hstatus) { + lderr(s_service->cct) << "Could not initialize service control handler. " + << "Error: " << GetLastError() << dendl; return; } s_service->set_status(SERVICE_START_PENDING); // TODO: should we expect exceptions? + ldout(s_service->cct, 5) << "Starting service." << dendl; int err = s_service->run_hook(); if (err) { lderr(s_service->cct) << "Failed to start service. Error code: " << err << dendl; s_service->set_status(SERVICE_STOPPED); } else { + ldout(s_service->cct, 5) << "Successfully started service." << dendl; s_service->set_status(SERVICE_RUNNING); } } @@ -87,6 +91,7 @@ void ServiceBase::shutdown() derr << "Shutdown service hook failed. Error code: " << err << dendl; set_status(original_state); } else { + dout(5) << "Shutdown hook completed." << dendl; set_status(SERVICE_STOPPED); } } @@ -101,6 +106,7 @@ void ServiceBase::stop() derr << "Service stop hook failed. Error code: " << err << dendl; set_status(original_state); } else { + dout(5) << "Successfully stopped service." << dendl; set_status(SERVICE_STOPPED); } } @@ -132,6 +138,12 @@ void ServiceBase::set_status(DWORD current_state, DWORD exit_code) { status.dwWin32ExitCode = exit_code; if (hstatus) { + dout(5) << "Updating service service status (" << current_state + << ") and exit code(" << exit_code << ")." << dendl; ::SetServiceStatus(hstatus, &status); + } else { + derr << "Service control handler not initialized. Cannot " + << "update service status (" << current_state + << ") and exit code(" << exit_code << ")." << dendl; } } diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index 5be9fd3413b6..bf108bd87fdf 100644 --- a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -137,6 +137,9 @@ if(WITH_RBD) if(LINUX) add_subdirectory(rbd_nbd) endif() + if(WIN32) + add_subdirectory(rbd_wnbd) + endif() if(FREEBSD) add_subdirectory(rbd_ggate) endif() diff --git a/src/tools/rbd_wnbd/CMakeLists.txt b/src/tools/rbd_wnbd/CMakeLists.txt new file mode 100644 index 000000000000..38f463961298 --- /dev/null +++ b/src/tools/rbd_wnbd/CMakeLists.txt @@ -0,0 +1,10 @@ +add_executable(rbd-wnbd wnbd_handler.cc rbd_wnbd.cc) +set_target_properties( + rbd-wnbd PROPERTIES COMPILE_FLAGS + "-fpermissive -I${WNBD_INCLUDE_DIRS}") +target_link_libraries( + rbd-wnbd setupapi rpcrt4 + ${WNBD_LIBRARIES} + ${Boost_FILESYSTEM_LIBRARY} + librbd librados global) +install(TARGETS rbd-wnbd DESTINATION bin) diff --git a/src/tools/rbd_wnbd/rbd_wnbd.cc b/src/tools/rbd_wnbd/rbd_wnbd.cc new file mode 100644 index 000000000000..eb7a9a52301a --- /dev/null +++ b/src/tools/rbd_wnbd/rbd_wnbd.cc @@ -0,0 +1,1272 @@ +/* + * rbd-wnbd - RBD in userspace + * + * Copyright (C) 2020 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * +*/ + +#include "include/int_types.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "wnbd_handler.h" +#include "rbd_wnbd.h" + +#include +#include +#include + +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "common/ceph_argparse.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/version.h" +#include "common/win32/service.h" +#include "common/admin_socket_client.h" + +#include "global/global_init.h" + +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" + +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "rbd-wnbd: " + +using boost::locale::conv::utf_to_utf; + +std::wstring to_wstring(const std::string& str) +{ + return utf_to_utf(str.c_str(), str.c_str() + str.size()); +} + +std::string to_string(const std::wstring& str) +{ + return utf_to_utf(str.c_str(), str.c_str() + str.size()); +} + +bool is_process_running(DWORD pid) +{ + HANDLE process = OpenProcess(SYNCHRONIZE, FALSE, pid); + DWORD ret = WaitForSingleObject(process, 0); + CloseHandle(process); + return ret == WAIT_TIMEOUT; +} + +DWORD WNBDActiveDiskIterator::fetch_list( + PWNBD_CONNECTION_LIST* conn_list) +{ + DWORD curr_buff_sz = 0; + DWORD buff_sz = 0; + DWORD err = 0; + PWNBD_CONNECTION_LIST tmp_list = NULL; + + // We're using a loop because other connections may show up by the time + // we retry. + do { + if (tmp_list) + free(tmp_list); + + if (buff_sz) { + tmp_list = (PWNBD_CONNECTION_LIST) calloc(1, buff_sz); + if (!tmp_list) { + derr << "Could not allocate " << buff_sz << " bytes." << dendl; + err = ERROR_NOT_ENOUGH_MEMORY; + break; + } + } + + curr_buff_sz = buff_sz; + // If the buffer is too small, the return value is 0 and "BufferSize" + // will contain the required size. This is counterintuitive, but + // Windows drivers can't return a buffer as well as a non-zero status. + err = WnbdList(tmp_list, &buff_sz); + if (err) + break; + } while (curr_buff_sz < buff_sz); + + if (err) { + if (tmp_list) + free(tmp_list); + } else { + *conn_list = tmp_list; + } + return err; +} + +WNBDActiveDiskIterator::WNBDActiveDiskIterator() +{ + DWORD status = WNBDActiveDiskIterator::fetch_list(&conn_list); + if (status) { + error = EINVAL; + } +} + +WNBDActiveDiskIterator::~WNBDActiveDiskIterator() +{ + if (conn_list) { + free(conn_list); + conn_list = NULL; + } +} + +bool WNBDActiveDiskIterator::get(Config *cfg) +{ + index += 1; + *cfg = Config(); + + if (!conn_list || index >= (int)conn_list->Count) { + return false; + } + + auto conn_info = conn_list->Connections[index]; + auto conn_props = conn_info.Properties; + + if (strncmp(conn_props.Owner, RBD_WNBD_OWNER_NAME, WNBD_MAX_OWNER_LENGTH)) { + dout(10) << "Ignoring disk: " << conn_props.InstanceName + << ". Owner: " << conn_props.Owner << dendl; + return this->get(cfg); + } + + error = load_mapping_config_from_registry(conn_props.InstanceName, cfg); + if (error) { + derr << "Could not load registry disk info for: " + << conn_props.InstanceName << ". Error: " << error << dendl; + return false; + } + + cfg->disk_number = conn_info.DiskNumber; + cfg->serial_number = std::string(conn_props.SerialNumber); + cfg->pid = conn_props.Pid; + cfg->active = cfg->disk_number > 0 && is_process_running(conn_props.Pid); + cfg->wnbd_mapped = true; + + return true; +} + +RegistryDiskIterator::RegistryDiskIterator() +{ + reg_key = new RegistryKey(g_ceph_context, HKEY_LOCAL_MACHINE, + SERVICE_REG_KEY, false); + if (!reg_key->hKey) { + if (!reg_key->missingKey) + error = EINVAL; + return; + } + + if (RegQueryInfoKey(reg_key->hKey, NULL, NULL, NULL, &subkey_count, + NULL, NULL, NULL, NULL, NULL, NULL, NULL)) { + derr << "Could not query registry key: " << SERVICE_REG_KEY << dendl; + error = EINVAL; + return; + } +} + +bool RegistryDiskIterator::get(Config *cfg) +{ + index += 1; + *cfg = Config(); + + if (!reg_key->hKey || !subkey_count) { + return false; + } + + char subkey_name[MAX_PATH] = {0}; + DWORD subkey_name_sz = MAX_PATH; + int err = RegEnumKeyEx( + reg_key->hKey, index, subkey_name, &subkey_name_sz, + NULL, NULL, NULL, NULL); + if (err == ERROR_NO_MORE_ITEMS) { + return false; + } else if (err) { + derr << "Could not enumerate registry. Error: " << err << dendl; + error = EINVAL; + return false; + } + + if (load_mapping_config_from_registry(subkey_name, cfg)) { + error = EINVAL; + return false; + }; + + return true; +} + +// Iterate over all RBD mappings, getting info from the registry and the driver. +bool WNBDDiskIterator::get(Config *cfg) +{ + *cfg = Config(); + + bool found_active = active_iterator.get(cfg); + if (found_active) { + active_devices.insert(cfg->devpath); + return true; + } + + error = active_iterator.get_error(); + if (error) { + dout(5) << ": WNBD iterator error: " << error << dendl; + return false; + } + + while(registry_iterator.get(cfg)) { + if (active_devices.find(cfg->devpath) != active_devices.end()) { + // Skip active devices that were already yielded. + continue; + } + return true; + } + + error = registry_iterator.get_error(); + if (error) { + dout(5) << ": Registry iterator error: " << error << dendl; + } + return false; +} + +// Spawn a subprocess using the specified command line, which is expected +// to be a "rbd-wnbd map" command. A pipe is passed to the child process, +// which will allow it to communicate the mapping status +int map_device_using_suprocess(std::string command_line) +{ + SECURITY_ATTRIBUTES sa; + STARTUPINFO si; + PROCESS_INFORMATION pi; + HANDLE read_pipe = NULL, write_pipe = NULL; + char buffer[4096]; + char ch; + DWORD err = 0, ret = 0; + int exit_code = 0; + + dout(5) << __func__ << ": command_line: " << command_line << dendl; + + // We may get a command line containing an old pipe handle when + // recreating mappings, so we'll have to remove it. + std::regex pattern("(--pipe-handle [\'\"]?\\d+[\'\"]?)"); + command_line = std::regex_replace(command_line, pattern, ""); + + // Set the security attribute such that a process created will + // inherit the pipe handles. + sa.nLength = sizeof(sa); + sa.lpSecurityDescriptor = NULL; + sa.bInheritHandle = TRUE; + + // Create an anonymous pipe to communicate with the child. */ + if (!CreatePipe(&read_pipe, &write_pipe, &sa, 0)) { + err = GetLastError(); + derr << "CreatePipe failed: " << win32_strerror(err) << dendl; + exit_code = -ECHILD; + goto finally; + } + + GetStartupInfo(&si); + + // Pass an extra argument '--pipe-handle ' + ret = snprintf( + buffer, sizeof(buffer), "%s %s %lld", + command_line.c_str(), "--pipe-handle", + (intptr_t)write_pipe); + if ((uint64_t) ret > sizeof(buffer)) + { + derr << "Command too long: " << command_line.c_str() << dendl; + exit_code = -EINVAL; + goto finally; + } + + // Create a detached child + if (!CreateProcess(NULL, buffer, NULL, NULL, TRUE, DETACHED_PROCESS, + NULL, NULL, &si, &pi)) { + err = GetLastError(); + derr << "CreateProcess failed: " << win32_strerror(err) << dendl; + exit_code = -ECHILD; + goto finally; + } + + // Close one end of the pipe in the parent. + CloseHandle(write_pipe); + write_pipe = NULL; + + // Block and wait for child to say it is ready. + dout(5) << __func__ << ": waiting for child notification." << dendl; + if (!ReadFile(read_pipe, &ch, 1, NULL, NULL)) { + err = GetLastError(); + derr << "Could not start RBD daemon. Receiving child process reply " + "failed with: " << win32_strerror(err) << dendl; + + // Give the child process a chance to exit so that we can retrieve the + // exit code. + WaitForSingleObject(pi.hProcess, 5000); + if (!is_process_running(pi.dwProcessId)) { + GetExitCodeProcess(pi.hProcess, (PDWORD)&exit_code); + derr << "Daemon failed with: " << cpp_strerror(exit_code) << dendl; + } else { + // The process closed the pipe without notifying us or exiting. + // This is quite unlikely, but we'll terminate the process. + dout(5) << "Terminating unresponsive process." << dendl; + TerminateProcess(pi.hProcess, 1); + exit_code = -EINVAL; + } + } else { + dout(5) << __func__ << ": received child notification." << dendl; + } + + finally: + if (write_pipe) + CloseHandle(write_pipe); + if (read_pipe) + CloseHandle(read_pipe); + return exit_code; +} + +BOOL WINAPI console_handler_routine(DWORD dwCtrlType) +{ + dout(5) << "Received control signal: " << dwCtrlType + << ". Exiting." << dendl; + + std::unique_lock l{shutdown_lock}; + if (handler) + handler->shutdown(); + + return true; +} + +int save_config_to_registry(Config* cfg) +{ + std::string strKey{ SERVICE_REG_KEY }; + strKey.append("\\"); + strKey.append(cfg->devpath); + auto reg_key = RegistryKey( + g_ceph_context, HKEY_LOCAL_MACHINE, strKey.c_str(), true); + if (!reg_key.hKey) { + return -EINVAL; + } + + int ret_val = 0; + // Registry writes are immediately available to other processes. + // Still, we'll do a flush to ensure that the mapping can be + // recreated after a system crash. + if (reg_key.set("pid", getpid()) || + reg_key.set("devpath", cfg->devpath) || + reg_key.set("poolname", cfg->poolname) || + reg_key.set("nsname", cfg->nsname) || + reg_key.set("imgname", cfg->imgname) || + reg_key.set("snapname", cfg->snapname) || + reg_key.set("command_line", GetCommandLine()) || + reg_key.set("admin_sock_path", g_conf()->admin_socket) || + reg_key.flush()) { + ret_val = -EINVAL; + } + + return ret_val; +} + +int remove_config_from_registry(Config* cfg) +{ + std::string strKey{ SERVICE_REG_KEY }; + strKey.append("\\"); + strKey.append(cfg->devpath); + return RegistryKey::remove( + g_ceph_context, HKEY_LOCAL_MACHINE, strKey.c_str()); +} + +int load_mapping_config_from_registry(string devpath, Config* cfg) +{ + std::string strKey{ SERVICE_REG_KEY }; + strKey.append("\\"); + strKey.append(devpath); + auto reg_key = RegistryKey( + g_ceph_context, HKEY_LOCAL_MACHINE, strKey.c_str(), false); + if (!reg_key.hKey) { + return -EINVAL; + } + + reg_key.get("devpath", cfg->devpath); + reg_key.get("poolname", cfg->poolname); + reg_key.get("nsname", cfg->nsname); + reg_key.get("imgname", cfg->imgname); + reg_key.get("snapname", cfg->snapname); + reg_key.get("command_line", cfg->command_line); + reg_key.get("admin_sock_path", cfg->admin_sock_path); + + return 0; +} + +int restart_registered_mappings(int worker_count) +{ + Config cfg; + WNBDDiskIterator iterator; + int err = 0, r; + + boost::asio::thread_pool pool(worker_count); + while (iterator.get(&cfg)) { + if (cfg.command_line.empty()) { + derr << "Could not recreate mapping, missing command line: " + << cfg.devpath << dendl; + err = -EINVAL; + continue; + } + if (cfg.wnbd_mapped) { + dout(5) << __func__ << ": device already mapped: " + << cfg.devpath << dendl; + continue; + } + + boost::asio::post(pool, + [&, cfg]() mutable + { + dout(5) << "Remapping: " << cfg.devpath << dendl; + + // We'll try to map all devices and return a non-zero value + // if any of them fails. + r = map_device_using_suprocess(cfg.command_line); + if (r) { + err = r; + derr << "Could not crecreate mapping: " + << cfg.devpath << ". Error: " << r << dendl; + } else { + dout(5) << "Successfully remapped: " << cfg.devpath << dendl; + } + }); + } + pool.join(); + + r = iterator.get_error(); + if (r) { + derr << "Could not fetch all mappings. Error: " << r << dendl; + err = r; + } + + return err; +} + +int disconnect_all_mappings( + bool unregister, + bool hard_disconnect, + int soft_disconnect_timeout, + int worker_count) +{ + // Although not generally recommended, soft_disconnect_timeout can be 0, + // which means infinite timeout. + ceph_assert(soft_disconnect_timeout >= 0); + ceph_assert(worker_count > 0); + int64_t timeout_ms = soft_disconnect_timeout * 1000; + + Config cfg; + WNBDActiveDiskIterator iterator; + int err = 0, r; + + boost::asio::thread_pool pool(worker_count); + LARGE_INTEGER start_t, counter_freq; + QueryPerformanceFrequency(&counter_freq); + QueryPerformanceCounter(&start_t); + while (iterator.get(&cfg)) { + boost::asio::post(pool, + [&, cfg]() mutable + { + LARGE_INTEGER curr_t, elapsed_ms; + QueryPerformanceCounter(&curr_t); + elapsed_ms.QuadPart = curr_t.QuadPart - start_t.QuadPart; + elapsed_ms.QuadPart *= 1000; + elapsed_ms.QuadPart /= counter_freq.QuadPart; + + int64_t time_left_ms = max((int64_t)0, timeout_ms - elapsed_ms.QuadPart); + + cfg.hard_disconnect = hard_disconnect || !time_left_ms; + cfg.hard_disconnect_fallback = true; + cfg.soft_disconnect_timeout = time_left_ms / 1000; + + dout(5) << "Removing mapping: " << cfg.devpath + << ". Timeout: " << cfg.soft_disconnect_timeout + << "ms. Hard disconnect: " << cfg.hard_disconnect + << dendl; + + r = do_unmap(&cfg, unregister); + if (r) { + err = r; + derr << "Could not remove mapping: " << cfg.devpath + << ". Error: " << r << dendl; + } else { + dout(5) << "Successfully removed mapping: " << cfg.devpath << dendl; + } + }); + } + pool.join(); + + r = iterator.get_error(); + if (r) { + derr << "Could not fetch all mappings. Error: " << r << dendl; + err = r; + } + + return err; +} + +class RBDService : public ServiceBase { + private: + bool hard_disconnect; + int soft_disconnect_timeout; + int thread_count; + + public: + RBDService(bool _hard_disconnect, + int _soft_disconnect_timeout, + int _thread_count) + : ServiceBase(g_ceph_context) + , hard_disconnect(_hard_disconnect) + , soft_disconnect_timeout(_soft_disconnect_timeout) + , thread_count(_thread_count) + { + } + + int run_hook() override { + return restart_registered_mappings(thread_count); + } + // Invoked when the service is requested to stop. + int stop_hook() override { + return disconnect_all_mappings( + false, hard_disconnect, soft_disconnect_timeout, thread_count); + } + // Invoked when the system is shutting down. + int shutdown_hook() override { + return stop_hook(); + } +}; + +static void usage() +{ + const char* usage_str =R"( +Usage: rbd-wnbd [options] map Map an image to wnbd device + [options] unmap Unmap wnbd device + [options] list List mapped wnbd devices + [options] show Show mapped wnbd device + stats Show IO counters + [options] service Windows service entrypoint, + handling device lifecycle + +Map options: + --device Optional mapping unique identifier + --exclusive Forbid writes by other clients + --read-only Map read-only + --io-req-workers The number of workers that dispatch IO requests. + Default: 4 + --io-reply-workers The number of workers that dispatch IO replies. + Default: 4 + +Unmap options: + --hard-disconnect Skip attempting a soft disconnect + --no-hard-disconnect-fallback Immediately return an error if the soft + disconnect fails instead of attempting a hard + disconnect as fallback + --soft-disconnect-timeout Soft disconnect timeout in seconds. The soft + disconnect operation uses PnP to notify the + Windows storage stack that the device is going to + be disconnectd. Storage drivers can block this + operation if there are pending operations, + unflushed caches or open handles. Default: 15 + +Service options: + --hard-disconnect Skip attempting a soft disconnect + --soft-disconnect-timeout Cummulative soft disconnect timeout in seconds, + used when disconnecting existing mappings. A hard + disconnect will be issuedwhen hitting the timeout. + --service-thread-count The number of workers used when mapping or + unmapping images. Default: 8 + +Show|List options: + --format plain|json|xml Output format (default: plain) + --pretty-format Pretty formatting (json and xml) + +Common options: + --wnbd-log-level libwnbd.dll log level + +)"; + + std::cout << usage_str; + generic_server_usage(); +} + + +static Command cmd = None; + +int construct_devpath_if_missing(Config* cfg) +{ + // Windows doesn't allow us to request specific disk paths when mapping an + // image. This will just be used by rbd-wnbd and wnbd as an identifier. + if (cfg->devpath.empty()) { + if (cfg->imgname.empty()) { + derr << "Missing image name." << dendl; + return -EINVAL; + } + + if (!cfg->poolname.empty()) { + cfg->devpath += cfg->poolname; + cfg->devpath += '/'; + } + if (!cfg->nsname.empty()) { + cfg->devpath += cfg->nsname; + cfg->devpath += '/'; + } + + cfg->devpath += cfg->imgname; + + if (!cfg->snapname.empty()) { + cfg->devpath += '@'; + cfg->devpath += cfg->snapname; + } + } + + return 0; +} + +boost::intrusive_ptr do_global_init( + int argc, const char *argv[], Config *cfg) +{ + std::vector args; + argv_to_vec(argc, argv, args); + + code_environment_t code_env; + int flags; + + switch(cmd) { + case Connect: + code_env = CODE_ENVIRONMENT_DAEMON; + flags = CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS; + break; + case Service: + code_env = CODE_ENVIRONMENT_DAEMON; + flags = CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS | + CINIT_FLAG_NO_MON_CONFIG | + CINIT_FLAG_NO_DAEMON_ACTIONS; + break; + default: + code_env = CODE_ENVIRONMENT_UTILITY; + flags = CINIT_FLAG_NO_MON_CONFIG; + break; + } + + global_pre_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, code_env, flags); + // Avoid cluttering the console when spawning a mapping that will run + // in the background. + if (g_conf()->daemonize && !cfg->parent_pipe) { + flags |= CINIT_FLAG_NO_DAEMON_ACTIONS; + } + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + code_env, flags, FALSE); + + // There's no fork on Windows, we should be safe calling this anytime. + common_init_finish(g_ceph_context); + global_init_chdir(g_ceph_context); + + return cct; +} + +static int do_map(Config *cfg) +{ + int r; + + librados::Rados rados; + librbd::RBD rbd; + librados::IoCtx io_ctx; + librbd::Image image; + librbd::image_info_t info; + + if (g_conf()->daemonize && !cfg->parent_pipe) { + return map_device_using_suprocess(GetCommandLine()); + } + + dout(0) << "Mapping RBD image: " << cfg->devpath << dendl; + + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + derr << "rbd-wnbd: couldn't initialize rados: " << cpp_strerror(r) + << dendl; + goto close_ret; + } + + r = rados.connect(); + if (r < 0) { + derr << "rbd-wnbd: couldn't connect to rados: " << cpp_strerror(r) + << dendl; + goto close_ret; + } + + r = rados.ioctx_create(cfg->poolname.c_str(), io_ctx); + if (r < 0) { + derr << "rbd-wnbd: couldn't create IO context: " << cpp_strerror(r) + << dendl; + goto close_ret; + } + + io_ctx.set_namespace(cfg->nsname); + + r = rbd.open(io_ctx, image, cfg->imgname.c_str()); + if (r < 0) { + derr << "rbd-wnbd: couldn't open rbd image: " << cpp_strerror(r) + << dendl; + goto close_ret; + } + + if (cfg->exclusive) { + r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE); + if (r < 0) { + derr << "rbd-wnbd: failed to acquire exclusive lock: " << cpp_strerror(r) + << dendl; + goto close_ret; + } + } + + if (!cfg->snapname.empty()) { + r = image.snap_set(cfg->snapname.c_str()); + if (r < 0) { + derr << "rbd-wnbd: couldn't use snapshot: " << cpp_strerror(r) + << dendl; + goto close_ret; + } + } + + r = image.stat(info, sizeof(info)); + if (r < 0) + goto close_ret; + + if (info.size > _UI64_MAX) { + r = -EFBIG; + derr << "rbd-wnbd: image is too large (" << byte_u_t(info.size) + << ", max is " << byte_u_t(_UI64_MAX) << ")" << dendl; + goto close_ret; + } + + r = save_config_to_registry(cfg); + if (r < 0) + goto close_ret; + + handler = new WnbdHandler(image, cfg->devpath, + info.size / RBD_WNBD_BLKSIZE, + RBD_WNBD_BLKSIZE, + !cfg->snapname.empty() || cfg->readonly, + g_conf().get_val("rbd_cache"), + cfg->io_req_workers, + cfg->io_reply_workers); + handler->start(); + + // We're informing the parent processes that the initialization + // was successful. + if (cfg->parent_pipe) { + if (!WriteFile((HANDLE)cfg->parent_pipe, "a", 1, NULL, NULL)) { + // TODO: consider exiting in this case. The parent didn't wait for us, + // maybe it was killed after a timeout. + int err = GetLastError(); + derr << "Failed to communicate with the parent: " + << win32_strerror(err) << dendl; + } else { + dout(5) << __func__ << ": submitted parent notification." << dendl; + } + + global_init_postfork_finish(g_ceph_context); + } + + handler->wait(); + handler->shutdown(); + +close_ret: + std::unique_lock l{shutdown_lock}; + + image.close(); + io_ctx.close(); + rados.shutdown(); + if (handler) { + delete handler; + handler = nullptr; + } + + return r; +} + +static int do_unmap(Config *cfg, bool unregister) +{ + WNBD_REMOVE_OPTIONS remove_options = {0}; + remove_options.Flags.HardRemove = cfg->hard_disconnect; + remove_options.Flags.HardRemoveFallback = cfg->hard_disconnect_fallback; + remove_options.SoftRemoveTimeoutMs = cfg->soft_disconnect_timeout * 1000; + remove_options.SoftRemoveRetryIntervalMs = SOFT_REMOVE_RETRY_INTERVAL * 1000; + + int err = WnbdRemoveEx(cfg->devpath.c_str(), &remove_options); + if (err && err != ERROR_FILE_NOT_FOUND) { + return -EINVAL; + } + + if (unregister) { + err = remove_config_from_registry(cfg); + if (err) { + derr << "rbd-nbd: failed to unregister device: " + << cfg->devpath << ". Error: " << err << dendl; + return -EINVAL; + } + } + return 0; +} + +static int parse_imgpath(const std::string &imgpath, Config *cfg, + std::ostream *err_msg) +{ + std::regex pattern("^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$"); + std::smatch match; + if (!std::regex_match(imgpath, match, pattern)) { + derr << "rbd-wnbd: invalid spec '" << imgpath << "'" << dendl; + return -EINVAL; + } + + if (match[1].matched) { + cfg->poolname = match[1]; + } + + if (match[2].matched) { + cfg->nsname = match[2]; + } + + cfg->imgname = match[3]; + + if (match[4].matched) + cfg->snapname = match[4]; + + return 0; +} + +static int do_list_mapped_devices(const std::string &format, bool pretty_format) +{ + std::unique_ptr f; + TextTable tbl; + + if (format == "json") { + f.reset(new JSONFormatter(pretty_format)); + } else if (format == "xml") { + f.reset(new XMLFormatter(pretty_format)); + } else if (!format.empty() && format != "plain") { + derr << "rbd-nbd: invalid output format: " << format << dendl; + return -EINVAL; + } + + if (f) { + f->open_array_section("devices"); + } else { + tbl.define_column("id", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("image", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("device", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("disk_number", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("status", TextTable::LEFT, TextTable::LEFT); + } + + Config cfg; + WNBDDiskIterator wnbd_disk_iterator; + + while (wnbd_disk_iterator.get(&cfg)) { + const char* status = cfg.active ? + WNBD_STATUS_ACTIVE : WNBD_STATUS_INACTIVE; + + if (f) { + f->open_object_section("device"); + f->dump_int("id", cfg.pid ? cfg.pid : -1); + f->dump_string("device", cfg.devpath); + f->dump_string("pool", cfg.poolname); + f->dump_string("namespace", cfg.nsname); + f->dump_string("image", cfg.imgname); + f->dump_string("snap", cfg.snapname); + f->dump_int("disk_number", cfg.disk_number ? cfg.disk_number : -1); + f->dump_string("status", status); + f->close_section(); + } else { + if (cfg.snapname.empty()) { + cfg.snapname = "-"; + } + tbl << (cfg.pid ? cfg.pid : -1) << cfg.poolname << cfg.nsname + << cfg.imgname << cfg.snapname << cfg.devpath + << cfg.disk_number << status << TextTable::endrow; + } + } + int error = wnbd_disk_iterator.get_error(); + if (error) { + derr << "Could not get disk list: " << error << dendl; + return -error; + } + + if (f) { + f->close_section(); + f->flush(std::cout); + } else { + std::cout << tbl; + } + + return 0; +} + +static int do_show_mapped_device(std::string format, bool pretty_format, + std::string devpath) +{ + std::unique_ptr f; + TextTable tbl; + + if (format.empty() || format == "plain") { + format = "json"; + pretty_format = true; + } + if (format == "json") { + f.reset(new JSONFormatter(pretty_format)); + } else if (format == "xml") { + f.reset(new XMLFormatter(pretty_format)); + } else { + derr << "rbd-nbd: invalid output format: " << format << dendl; + return -EINVAL; + } + + Config cfg; + int error = load_mapping_config_from_registry(devpath, &cfg); + if (error) { + derr << "Could not load registry disk info for: " + << devpath << ". Error: " << error << dendl; + return error; + } + + WNBD_CONNECTION_INFO conn_info = { 0 }; + // If the device is currently disconnected but there is a persistent + // mapping record, we'll show that. + DWORD ret = WnbdShow(devpath.c_str(), &conn_info); + if (ret && ret != ERROR_FILE_NOT_FOUND) { + return -EINVAL; + } + + auto conn_props = conn_info.Properties; + f->open_object_section("device"); + f->dump_int("id", conn_props.Pid ? conn_props.Pid : -1); + f->dump_string("device", cfg.devpath); + f->dump_string("pool", cfg.poolname); + f->dump_string("namespace", cfg.nsname); + f->dump_string("image", cfg.imgname); + f->dump_string("snap", cfg.snapname); + f->dump_int("disk_number", conn_info.DiskNumber ? conn_info.DiskNumber : -1); + f->dump_string("status", cfg.active ? WNBD_STATUS_ACTIVE : WNBD_STATUS_INACTIVE); + f->dump_string("pnp_device_id", to_string(conn_info.PNPDeviceID)); + f->dump_int("readonly", conn_props.Flags.ReadOnly); + f->dump_int("block_size", conn_props.BlockSize); + f->dump_int("block_count", conn_props.BlockCount); + f->dump_int("flush_enabled", conn_props.Flags.FlushSupported); + f->close_section(); + f->flush(std::cout); + + return 0; +} + +static int do_stats(std::string search_devpath) +{ + Config cfg; + WNBDDiskIterator wnbd_disk_iterator; + + while (wnbd_disk_iterator.get(&cfg)) { + if (cfg.devpath != search_devpath) + continue; + + AdminSocketClient client = AdminSocketClient(cfg.admin_sock_path); + std::string output; + std::string result = client.do_request("{\"prefix\":\"wnbd stats\"}", + &output); + if (!result.empty()) { + std::cerr << "Admin socket error: " << result << std::endl; + return -EINVAL; + } + + std::cout << output << std::endl; + return 0; + } + int error = wnbd_disk_iterator.get_error(); + if (!error) { + error = ENOENT; + } + + derr << "Could not find the specified disk." << dendl; + return -error; +} + +static int parse_args(std::vector& args, + std::ostream *err_msg, + Command *command, Config *cfg) +{ + std::string conf_file_list; + std::string cluster; + CephInitParameters iparams = ceph_argparse_early_args( + args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list); + + ConfigProxy config{false}; + config->name = iparams.name; + config->cluster = cluster; + + if (!conf_file_list.empty()) { + config.parse_config_files(conf_file_list.c_str(), nullptr, 0); + } else { + config.parse_config_files(nullptr, nullptr, 0); + } + config.parse_env(CEPH_ENTITY_TYPE_CLIENT); + config.parse_argv(args); + cfg->poolname = config.get_val("rbd_default_pool"); + + std::vector::iterator i; + std::ostringstream err; + + // TODO: consider using boost::program_options like Device.cc does. + // This should simplify argument parsing. Also, some arguments must be tied + // to specific commands, for example the disconnect timeout. Luckily, + // this is enforced by the "rbd device" wrapper. + for (i = args.begin(); i != args.end(); ) { + if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) { + return HELP_INFO; + } else if (ceph_argparse_flag(args, i, "-v", "--version", (char*)NULL)) { + return VERSION_INFO; + } else if (ceph_argparse_witharg(args, i, &cfg->devpath, "--device", (char *)NULL)) { + } else if (ceph_argparse_witharg(args, i, &cfg->format, err, "--format", + (char *)NULL)) { + } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) { + cfg->readonly = true; + } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) { + cfg->exclusive = true; + } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) { + cfg->pretty_format = true; + } else if (ceph_argparse_witharg(args, i, &cfg->parent_pipe, err, + "--pipe-handle", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-wnbd: " << err.str(); + return -EINVAL; + } + if (cfg->parent_pipe < 0) { + *err_msg << "rbd-wnbd: Invalid argument for pipe-handle!"; + return -EINVAL; + } + } else if (ceph_argparse_witharg(args, i, (int*)&cfg->wnbd_log_level, + err, "--wnbd-log-level", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->wnbd_log_level < 0) { + *err_msg << "rbd-nbd: Invalid argument for wnbd-log-level"; + return -EINVAL; + } + } else if (ceph_argparse_witharg(args, i, (int*)&cfg->io_req_workers, + err, "--io-req-workers", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->io_req_workers <= 0) { + *err_msg << "rbd-nbd: Invalid argument for io-req-workers"; + return -EINVAL; + } + } else if (ceph_argparse_witharg(args, i, (int*)&cfg->io_reply_workers, + err, "--io-reply-workers", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->io_reply_workers <= 0) { + *err_msg << "rbd-nbd: Invalid argument for io-reply-workers"; + return -EINVAL; + } + } else if (ceph_argparse_witharg(args, i, (int*)&cfg->service_thread_count, + err, "--service-thread-count", (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->service_thread_count <= 0) { + *err_msg << "rbd-nbd: Invalid argument for service-thread-count"; + return -EINVAL; + } + } else if (ceph_argparse_flag(args, i, "--hard-disconnect", (char *)NULL)) { + cfg->hard_disconnect = true; + } else if (ceph_argparse_flag(args, i, + "--no-hard-disconnect-fallback", (char *)NULL)) { + cfg->hard_disconnect_fallback = false; + } else if (ceph_argparse_witharg(args, i, + (int*)&cfg->soft_disconnect_timeout, + err, "--soft-disconnect-timeout", + (char *)NULL)) { + if (!err.str().empty()) { + *err_msg << "rbd-nbd: " << err.str(); + return -EINVAL; + } + if (cfg->soft_disconnect_timeout < 0) { + *err_msg << "rbd-nbd: Invalid argument for soft-disconnect-timeout"; + return -EINVAL; + } + } else { + ++i; + } + } + + Command cmd = None; + if (args.begin() != args.end()) { + if (strcmp(*args.begin(), "map") == 0) { + cmd = Connect; + } else if (strcmp(*args.begin(), "unmap") == 0) { + cmd = Disconnect; + } else if (strcmp(*args.begin(), "list") == 0) { + cmd = List; + } else if (strcmp(*args.begin(), "show") == 0) { + cmd = Show; + } else if (strcmp(*args.begin(), "service") == 0) { + cmd = Service; + } else if (strcmp(*args.begin(), "stats") == 0) { + cmd = Stats; + } else { + *err_msg << "rbd-wnbd: unknown command: " << *args.begin(); + return -EINVAL; + } + args.erase(args.begin()); + } + + if (cmd == None) { + *err_msg << "rbd-wnbd: must specify command"; + return -EINVAL; + } + + switch (cmd) { + case Connect: + case Disconnect: + case Show: + case Stats: + if (args.begin() == args.end()) { + *err_msg << "rbd-wnbd: must specify wnbd device or image-or-snap-spec"; + return -EINVAL; + } + if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) { + return -EINVAL; + } + args.erase(args.begin()); + break; + default: + //shut up gcc; + break; + } + + if (args.begin() != args.end()) { + *err_msg << "rbd-wnbd: unknown args: " << *args.begin(); + return -EINVAL; + } + + *command = cmd; + return 0; +} + +static int rbd_wnbd(int argc, const char *argv[]) +{ + int r; + Config cfg; + std::vector args; + argv_to_vec(argc, argv, args); + + // Avoid using dout before calling "do_global_init" + if (args.empty()) { + std::cout << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + + std::ostringstream err_msg; + r = parse_args(args, &err_msg, &cmd, &cfg); + if (r == HELP_INFO) { + usage(); + return 0; + } else if (r == VERSION_INFO) { + std::cout << pretty_version_to_str() << std::endl; + return 0; + } else if (r < 0) { + std::cout << err_msg.str() << std::endl; + return r; + } + + auto cct = do_global_init(argc, argv, &cfg); + + WnbdSetLogger(WnbdHandler::LogMessage); + WnbdSetLogLevel(cfg.wnbd_log_level); + + switch (cmd) { + case Connect: + if (construct_devpath_if_missing(&cfg)) { + return -EINVAL; + } + r = do_map(&cfg); + if (r < 0) + return r; + break; + case Disconnect: + if (construct_devpath_if_missing(&cfg)) { + return -EINVAL; + } + r = do_unmap(&cfg, true); + if (r < 0) + return r; + break; + case List: + r = do_list_mapped_devices(cfg.format, cfg.pretty_format); + if (r < 0) + return r; + break; + case Show: + if (construct_devpath_if_missing(&cfg)) { + return r; + } + r = do_show_mapped_device(cfg.format, cfg.pretty_format, cfg.devpath); + if (r < 0) + return r; + break; + case Service: + { + RBDService service(cfg.hard_disconnect, cfg.soft_disconnect_timeout, + cfg.service_thread_count); + // This call will block until the service stops. + r = RBDService::initialize(&service); + if (r < 0) + return r; + break; + } + case Stats: + if (construct_devpath_if_missing(&cfg)) { + return -EINVAL; + } + return do_stats(cfg.devpath); + default: + usage(); + break; + } + + return 0; +} + +int main(int argc, const char *argv[]) +{ + SetConsoleCtrlHandler(console_handler_routine, true); + // Avoid the Windows Error Reporting dialog. + SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX); + int r = rbd_wnbd(argc, argv); + if (r < 0) { + return r; + } + return 0; +} diff --git a/src/tools/rbd_wnbd/rbd_wnbd.h b/src/tools/rbd_wnbd/rbd_wnbd.h new file mode 100644 index 000000000000..e6d08659fe57 --- /dev/null +++ b/src/tools/rbd_wnbd/rbd_wnbd.h @@ -0,0 +1,169 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef RBD_WNBD_H +#define RBD_WNBD_H + +#include +#include +#include + +#include "include/compat.h" +#include "common/win32/registry.h" + +#include "wnbd_handler.h" + +#define SERVICE_REG_KEY "SYSTEM\\CurrentControlSet\\Services\\rbd-wnbd" + +#define RBD_WNBD_BLKSIZE 512UL + +#define HELP_INFO 1 +#define VERSION_INFO 2 + +#define WNBD_STATUS_ACTIVE "active" +#define WNBD_STATUS_INACTIVE "inactive" + +#define DEFAULT_SERVICE_THREAD_COUNT 8 + +static WnbdHandler* handler = nullptr; +ceph::mutex shutdown_lock = ceph::make_mutex("RbdWnbd::ShutdownLock"); + +struct Config { + bool exclusive = false; + bool readonly = false; + + intptr_t parent_pipe = 0; + + std::string poolname; + std::string nsname; + std::string imgname; + std::string snapname; + std::string devpath; + + std::string format; + bool pretty_format = false; + + bool hard_disconnect = false; + int soft_disconnect_timeout = DEFAULT_SOFT_REMOVE_TIMEOUT; + bool hard_disconnect_fallback = true; + + // TODO: consider moving those fields to a separate structure. Those + // provide connection information without actually being configurable. + // The disk number is provided by Windows. + int disk_number = -1; + int pid = 0; + std::string serial_number; + bool active = false; + bool wnbd_mapped = false; + std::string command_line; + std::string admin_sock_path; + + WnbdLogLevel wnbd_log_level = WnbdLogLevelInfo; + int io_req_workers = DEFAULT_IO_WORKER_COUNT; + int io_reply_workers = DEFAULT_IO_WORKER_COUNT; + int service_thread_count = DEFAULT_SERVICE_THREAD_COUNT; +}; + +enum Command { + None, + Connect, + Disconnect, + List, + Show, + Service, + Stats +}; + +bool is_process_running(DWORD pid); + +void daemonize_complete(HANDLE parent_pipe); +void unmap_at_exit(); + +int disconnect_all_mappings( + bool unregister, + bool hard_disconnect, + int soft_disconnect_timeout, + int worker_count); +int restart_registered_mappings(int worker_count); +int map_device_using_suprocess(std::string command_line); + +int construct_devpath_if_missing(Config* cfg); +int save_config_to_registry(Config* cfg); +int remove_config_from_registry(Config* cfg); +int load_mapping_config_from_registry(std::string devpath, Config* cfg); + +BOOL WINAPI console_handler_routine(DWORD dwCtrlType); + +static int parse_args(std::vector& args, + std::ostream *err_msg, + Command *command, Config *cfg); +static int do_unmap(Config *cfg, bool unregister); + + +class BaseIterator { + public: + virtual ~BaseIterator() {}; + virtual bool get(Config *cfg) = 0; + + int get_error() { + return error; + } + protected: + int error = 0; + int index = -1; +}; + +// Iterate over mapped devices, retrieving info from the driver. +class WNBDActiveDiskIterator : public BaseIterator { + public: + WNBDActiveDiskIterator(); + ~WNBDActiveDiskIterator(); + + bool get(Config *cfg); + + private: + PWNBD_CONNECTION_LIST conn_list = NULL; + + static DWORD fetch_list(PWNBD_CONNECTION_LIST* conn_list); +}; + + +// Iterate over the Windows registry key, retrieving registered mappings. +class RegistryDiskIterator : public BaseIterator { + public: + RegistryDiskIterator(); + ~RegistryDiskIterator() { + delete reg_key; + } + + bool get(Config *cfg); + private: + DWORD subkey_count = 0; + char subkey_name[MAX_PATH]; + + RegistryKey* reg_key = NULL; +}; + +// Iterate over all RBD mappings, getting info from the registry and driver. +class WNBDDiskIterator : public BaseIterator { + public: + bool get(Config *cfg); + + private: + // We'll keep track of the active devices. + std::set active_devices; + + WNBDActiveDiskIterator active_iterator; + RegistryDiskIterator registry_iterator; +}; + +#endif // RBD_WNBD_H diff --git a/src/tools/rbd_wnbd/wnbd_handler.cc b/src/tools/rbd_wnbd/wnbd_handler.cc new file mode 100644 index 000000000000..ecfa47240f2d --- /dev/null +++ b/src/tools/rbd_wnbd/wnbd_handler.cc @@ -0,0 +1,430 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rbd + +#include "wnbd_handler.h" + +#define _NTSCSI_USER_MODE_ +#include +#include + +#include + +#include "common/debug.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "common/SubProcess.h" +#include "common/Formatter.h" + +#include "global/global_context.h" + +WnbdHandler::~WnbdHandler() +{ + if (started && wnbd_disk) { + dout(10) << __func__ << ": terminating" << dendl; + + shutdown(); + reply_tpool->join(); + + WnbdClose(wnbd_disk); + + started = false; + + delete reply_tpool; + delete admin_hook; + } +} + +int WnbdHandler::wait() +{ + int err = 0; + if (started && wnbd_disk) { + dout(10) << __func__ << ": waiting" << dendl; + + err = WnbdWaitDispatcher(wnbd_disk); + if (err) { + derr << __func__ << " failed waiting for dispatcher to stop: " + << err << dendl; + } + } + + return err; +} + +int WnbdAdminHook::call (std::string_view command, const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& errss, + bufferlist& out) { + if (command == "wnbd stats") { + return m_handler->dump_stats(f); + } + return -ENOSYS; + } + +int WnbdHandler::dump_stats(Formatter *f) +{ + if (!f) { + return -EINVAL; + } + + WNBD_USR_STATS stats = { 0 }; + DWORD err = WnbdGetUserspaceStats(wnbd_disk, &stats); + if (err) { + derr << "Failed to retrieve WNBD userspace stats. Error: " << err << dendl; + return -EINVAL; + } + + f->open_object_section("stats"); + f->dump_int("TotalReceivedRequests", stats.TotalReceivedRequests); + f->dump_int("TotalSubmittedRequests", stats.TotalSubmittedRequests); + f->dump_int("TotalReceivedReplies", stats.TotalReceivedReplies); + f->dump_int("UnsubmittedRequests", stats.UnsubmittedRequests); + f->dump_int("PendingSubmittedRequests", stats.PendingSubmittedRequests); + f->dump_int("PendingReplies", stats.PendingReplies); + f->dump_int("ReadErrors", stats.ReadErrors); + f->dump_int("WriteErrors", stats.WriteErrors); + f->dump_int("FlushErrors", stats.FlushErrors); + f->dump_int("UnmapErrors", stats.UnmapErrors); + f->dump_int("InvalidRequests", stats.InvalidRequests); + f->dump_int("TotalRWRequests", stats.TotalRWRequests); + f->dump_int("TotalReadBlocks", stats.TotalReadBlocks); + f->dump_int("TotalWrittenBlocks", stats.TotalWrittenBlocks); + + f->close_section(); + return 0; +} + +void WnbdHandler::shutdown() +{ + std::unique_lock l{shutdown_lock}; + if (!terminated && wnbd_disk) { + // We're requesting the disk to be removed but continue serving IO + // requests until the driver sends us the "Disconnect" event. + // TODO: expose PWNBD_REMOVE_OPTIONS, we're using the defaults ATM. + WnbdRemove(wnbd_disk, NULL); + wait(); + terminated = true; + } +} + +void WnbdHandler::aio_callback(librbd::completion_t cb, void *arg) +{ + librbd::RBD::AioCompletion *aio_completion = + reinterpret_cast(cb); + + WnbdHandler::IOContext* ctx = static_cast(arg); + int ret = aio_completion->get_return_value(); + + dout(20) << __func__ << ": " << *ctx << dendl; + + if (ret == -EINVAL) { + // if shrinking an image, a pagecache writeback might reference + // extents outside of the range of the new image extents + dout(0) << __func__ << ": masking IO out-of-bounds error" << *ctx << dendl; + ctx->data.clear(); + ret = 0; + } + + if (ret < 0) { + ctx->err_code = -ret; + // TODO: check the actual error. + ctx->set_sense(SCSI_SENSE_MEDIUM_ERROR, + SCSI_ADSENSE_UNRECOVERED_ERROR); + } else if ((ctx->req_type == WnbdReqTypeRead) && + ret < static_cast(ctx->req_size)) { + int pad_byte_count = static_cast (ctx->req_size) - ret; + ctx->data.append_zero(pad_byte_count); + dout(20) << __func__ << ": " << *ctx << ": Pad byte count: " + << pad_byte_count << dendl; + ctx->err_code = 0; + } else { + ctx->err_code = 0; + } + + boost::asio::post( + *ctx->handler->reply_tpool, + [&, ctx]() + { + ctx->handler->send_io_response(ctx); + }); + + aio_completion->release(); +} + +void WnbdHandler::send_io_response(WnbdHandler::IOContext *ctx) { + std::unique_ptr pctx{ctx}; + ceph_assert(WNBD_DEFAULT_MAX_TRANSFER_LENGTH >= pctx->data.length()); + + WNBD_IO_RESPONSE wnbd_rsp = {0}; + wnbd_rsp.RequestHandle = pctx->req_handle; + wnbd_rsp.RequestType = pctx->req_type; + wnbd_rsp.Status = pctx->wnbd_status; + int err = 0; + + // Use TLS to store an overlapped structure so that we avoid + // recreating one each time we send a reply. + static boost::thread_specific_ptr overlapped_tls( + // Cleanup routine + [](LPOVERLAPPED p_overlapped) + { + if (p_overlapped->hEvent) { + CloseHandle(p_overlapped->hEvent); + } + delete p_overlapped; + }); + + LPOVERLAPPED overlapped = overlapped_tls.get(); + if (!overlapped) + { + overlapped = new OVERLAPPED{0}; + HANDLE overlapped_evt = CreateEventA(0, TRUE, TRUE, NULL); + if (!overlapped_evt) { + err = GetLastError(); + derr << "Could not create event. Error: " << err << dendl; + return; + } + + overlapped->hEvent = overlapped_evt; + overlapped_tls.reset(overlapped); + } + + if (!ResetEvent(overlapped->hEvent)) { + err = GetLastError(); + derr << "Could not reset event. Error: " << err << dendl; + return; + } + + err = WnbdSendResponseEx( + pctx->handler->wnbd_disk, + &wnbd_rsp, + pctx->data.c_str(), + pctx->data.length(), + overlapped); + if (err == ERROR_IO_PENDING) { + DWORD returned_bytes = 0; + err = 0; + // We've got ERROR_IO_PENDING, which means that the operation is in + // progress. We'll use GetOverlappedResult to wait for it to complete + // and then retrieve the result. + if (!GetOverlappedResult(pctx->handler->wnbd_disk, overlapped, + &returned_bytes, TRUE)) { + err = GetLastError(); + derr << "Could not send response. Request id: " << wnbd_rsp.RequestHandle + << ". Error: " << err << dendl; + } + } +} + +void WnbdHandler::IOContext::set_sense(uint8_t sense_key, uint8_t asc, uint64_t info) +{ + WnbdSetSenseEx(&wnbd_status, sense_key, asc, info); +} + +void WnbdHandler::IOContext::set_sense(uint8_t sense_key, uint8_t asc) +{ + WnbdSetSense(&wnbd_status, sense_key, asc); +} + +void WnbdHandler::Read( + PWNBD_DISK Disk, + UINT64 RequestHandle, + PVOID Buffer, + UINT64 BlockAddress, + UINT32 BlockCount, + BOOLEAN ForceUnitAccess) +{ + WnbdHandler* handler = nullptr; + ceph_assert(!WnbdGetUserContext(Disk, (PVOID*)&handler)); + + WnbdHandler::IOContext* ctx = new WnbdHandler::IOContext(); + ctx->handler = handler; + ctx->req_handle = RequestHandle; + ctx->req_type = WnbdReqTypeRead; + ctx->req_size = BlockCount * handler->block_size; + ctx->req_from = BlockAddress * handler->block_size; + ceph_assert(ctx->req_size <= WNBD_DEFAULT_MAX_TRANSFER_LENGTH); + + int op_flags = 0; + if (ForceUnitAccess) { + op_flags |= LIBRADOS_OP_FLAG_FADVISE_FUA; + } + + dout(20) << *ctx << ": start" << dendl; + + librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(ctx, aio_callback); + handler->image.aio_read2(ctx->req_from, ctx->req_size, ctx->data, c, op_flags); + + dout(20) << *ctx << ": submitted" << dendl; +} + +void WnbdHandler::Write( + PWNBD_DISK Disk, + UINT64 RequestHandle, + PVOID Buffer, + UINT64 BlockAddress, + UINT32 BlockCount, + BOOLEAN ForceUnitAccess) +{ + WnbdHandler* handler = nullptr; + ceph_assert(!WnbdGetUserContext(Disk, (PVOID*)&handler)); + + WnbdHandler::IOContext* ctx = new WnbdHandler::IOContext(); + ctx->handler = handler; + ctx->req_handle = RequestHandle; + ctx->req_type = WnbdReqTypeWrite; + ctx->req_size = BlockCount * handler->block_size; + ctx->req_from = BlockAddress * handler->block_size; + + bufferptr ptr((char*)Buffer, ctx->req_size); + ctx->data.push_back(ptr); + + int op_flags = 0; + if (ForceUnitAccess) { + op_flags |= LIBRADOS_OP_FLAG_FADVISE_FUA; + } + + dout(20) << *ctx << ": start" << dendl; + + librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(ctx, aio_callback); + handler->image.aio_write2(ctx->req_from, ctx->req_size, ctx->data, c, op_flags); + + dout(20) << *ctx << ": submitted" << dendl; +} + +void WnbdHandler::Flush( + PWNBD_DISK Disk, + UINT64 RequestHandle, + UINT64 BlockAddress, + UINT32 BlockCount) +{ + WnbdHandler* handler = nullptr; + ceph_assert(!WnbdGetUserContext(Disk, (PVOID*)&handler)); + + WnbdHandler::IOContext* ctx = new WnbdHandler::IOContext(); + ctx->handler = handler; + ctx->req_handle = RequestHandle; + ctx->req_type = WnbdReqTypeFlush; + ctx->req_size = BlockCount * handler->block_size; + ctx->req_from = BlockAddress * handler->block_size; + + dout(20) << *ctx << ": start" << dendl; + + librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(ctx, aio_callback); + handler->image.aio_flush(c); + + dout(20) << *ctx << ": submitted" << dendl; +} + +void WnbdHandler::Unmap( + PWNBD_DISK Disk, + UINT64 RequestHandle, + PWNBD_UNMAP_DESCRIPTOR Descriptors, + UINT32 Count) +{ + WnbdHandler* handler = nullptr; + ceph_assert(!WnbdGetUserContext(Disk, (PVOID*)&handler)); + ceph_assert(1 == Count); + + WnbdHandler::IOContext* ctx = new WnbdHandler::IOContext(); + ctx->handler = handler; + ctx->req_handle = RequestHandle; + ctx->req_type = WnbdReqTypeUnmap; + ctx->req_size = Descriptors[0].BlockCount * handler->block_size; + ctx->req_from = Descriptors[0].BlockAddress * handler->block_size; + + dout(20) << *ctx << ": start" << dendl; + + librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(ctx, aio_callback); + handler->image.aio_discard(ctx->req_from, ctx->req_size, c); + + dout(20) << *ctx << ": submitted" << dendl; +} + +void WnbdHandler::LogMessage( + WnbdLogLevel LogLevel, + const char* Message, + const char* FileName, + UINT32 Line, + const char* FunctionName) +{ + // We're already passing the log level to WNBD, so we'll use the highest + // log level here. + dout(0) << "libwnbd.dll!" << FunctionName << " " + << WnbdLogLevelToStr(LogLevel) << " " << Message << dendl; +} + + +int WnbdHandler::start() +{ + int err = 0; + WNBD_PROPERTIES wnbd_props = {0}; + + instance_name.copy(wnbd_props.InstanceName, sizeof(wnbd_props.InstanceName)); + ceph_assert(strlen(RBD_WNBD_OWNER_NAME) < WNBD_MAX_OWNER_LENGTH); + strncpy(wnbd_props.Owner, RBD_WNBD_OWNER_NAME, WNBD_MAX_OWNER_LENGTH); + + wnbd_props.BlockCount = block_count; + wnbd_props.BlockSize = block_size; + wnbd_props.MaxUnmapDescCount = 1; + + wnbd_props.Flags.ReadOnly = readonly; + wnbd_props.Flags.UnmapSupported = 1; + if (rbd_cache_enabled) { + wnbd_props.Flags.FUASupported = 1; + wnbd_props.Flags.FlushSupported = 1; + } + + err = WnbdCreate(&wnbd_props, &RbdWnbdInterface, this, &wnbd_disk); + if (err) + goto exit; + + started = true; + + err = WnbdStartDispatcher(wnbd_disk, io_req_workers); + if (err) { + derr << "Could not start WNBD dispatcher. Error: " << err << dendl; + } + +exit: + return err; +} + +std::ostream &operator<<(std::ostream &os, const WnbdHandler::IOContext &ctx) { + + os << "[" << std::hex << ctx.req_handle; + + switch (ctx.req_type) + { + case WnbdReqTypeRead: + os << " READ "; + break; + case WnbdReqTypeWrite: + os << " WRITE "; + break; + case WnbdReqTypeFlush: + os << " FLUSH "; + break; + case WnbdReqTypeUnmap: + os << " TRIM "; + break; + default: + os << " UNKNOWN(" << ctx.req_type << ") "; + break; + } + + os << ctx.req_from << "~" << ctx.req_size << " " + << std::dec << ntohl(ctx.err_code) << "]"; + + return os; +} diff --git a/src/tools/rbd_wnbd/wnbd_handler.h b/src/tools/rbd_wnbd/wnbd_handler.h new file mode 100644 index 000000000000..3fcdbdd5a11a --- /dev/null +++ b/src/tools/rbd_wnbd/wnbd_handler.h @@ -0,0 +1,186 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef WNBD_HANDLER_H +#define WNBD_HANDLER_H + +#include + +#include "common/admin_socket.h" +#include "common/ceph_context.h" +#include "common/Thread.h" + +#include "include/rbd/librbd.hpp" +#include "include/xlist.h" + +#include "global/global_context.h" + +// TODO: make this configurable. +#define RBD_WNBD_MAX_TRANSFER 2 * 1024 * 1024 +#define SOFT_REMOVE_RETRY_INTERVAL 2 +#define DEFAULT_SOFT_REMOVE_TIMEOUT 15 +#define DEFAULT_IO_WORKER_COUNT 4 + +// Not defined by mingw. +#ifndef SCSI_ADSENSE_UNRECOVERED_ERROR +#define SCSI_ADSENSE_UNRECOVERED_ERROR 0x11 +#endif + +// The following will be assigned to the "Owner" field of the WNBD +// parameters, which can be used to determine the application managing +// a disk. We'll ignore other disks. +#define RBD_WNBD_OWNER_NAME "ceph-rbd-wnbd" + +class WnbdHandler; + +class WnbdAdminHook : public AdminSocketHook { + WnbdHandler *m_handler; + +public: + explicit WnbdAdminHook(WnbdHandler *handler) : + m_handler(handler) { + g_ceph_context->get_admin_socket()->register_command( + "wnbd stats", this, "get WNBD stats"); + } + ~WnbdAdminHook() override { + g_ceph_context->get_admin_socket()->unregister_commands(this); + } + + int call(std::string_view command, const cmdmap_t& cmdmap, + Formatter *f, std::ostream& errss, bufferlist& out) override; +}; + + +class WnbdHandler +{ +private: + librbd::Image ℑ + std::string instance_name; + uint32_t block_count; + uint32_t block_size; + bool readonly; + bool rbd_cache_enabled; + uint32_t io_req_workers; + uint32_t io_reply_workers; + WnbdAdminHook* admin_hook; + boost::asio::thread_pool* reply_tpool; + +public: + WnbdHandler(librbd::Image& _image, std::string _instance_name, + uint32_t _block_count, uint32_t _block_size, + bool _readonly, bool _rbd_cache_enabled, + uint32_t _io_req_workers, + uint32_t _io_reply_workers) + : image(_image) + , instance_name(_instance_name) + , block_count(_block_count) + , block_size(_block_size) + , readonly(_readonly) + , rbd_cache_enabled(_rbd_cache_enabled) + , io_req_workers(_io_req_workers) + , io_reply_workers(_io_reply_workers) + { + admin_hook = new WnbdAdminHook(this); + // Instead of relying on librbd's own thread pool, we're going to use a + // separate one. This allows us to make assumptions on the threads that + // are going to send the IO replies and thus be able to cache Windows + // OVERLAPPED structures. + reply_tpool = new boost::asio::thread_pool(_io_reply_workers); + } + + int start(); + // Wait for the handler to stop, which normally happens when the driver + // passes the "Disconnect" request. + int wait(); + void shutdown(); + + int dump_stats(Formatter *f); + + ~WnbdHandler(); + + static VOID LogMessage( + WnbdLogLevel LogLevel, + const char* Message, + const char* FileName, + UINT32 Line, + const char* FunctionName); + +private: + ceph::mutex shutdown_lock = ceph::make_mutex("WnbdHandler::DisconnectLocker"); + bool started = false; + bool terminated = false; + WNBD_DISK* wnbd_disk = nullptr; + + struct IOContext + { + xlist::item item; + WnbdHandler *handler = nullptr; + WNBD_STATUS wnbd_status = {0}; + WnbdRequestType req_type = WnbdReqTypeUnknown; + uint64_t req_handle = 0; + uint32_t err_code = 0; + uint32_t req_size; + uint32_t req_from; + bufferlist data; + + IOContext() + : item(this) + {} + + void set_sense(uint8_t sense_key, uint8_t asc, uint64_t info); + void set_sense(uint8_t sense_key, uint8_t asc); + }; + + friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx); + + void send_io_response(IOContext *ctx); + + static void aio_callback(librbd::completion_t cb, void *arg); + + // WNBD IO entry points + static void Read( + PWNBD_DISK Disk, + UINT64 RequestHandle, + PVOID Buffer, + UINT64 BlockAddress, + UINT32 BlockCount, + BOOLEAN ForceUnitAccess); + static void Write( + PWNBD_DISK Disk, + UINT64 RequestHandle, + PVOID Buffer, + UINT64 BlockAddress, + UINT32 BlockCount, + BOOLEAN ForceUnitAccess); + static void Flush( + PWNBD_DISK Disk, + UINT64 RequestHandle, + UINT64 BlockAddress, + UINT32 BlockCount); + static void Unmap( + PWNBD_DISK Disk, + UINT64 RequestHandle, + PWNBD_UNMAP_DESCRIPTOR Descriptors, + UINT32 Count); + + static constexpr WNBD_INTERFACE RbdWnbdInterface = + { + Read, + Write, + Flush, + Unmap, + }; +}; + +std::ostream &operator<<(std::ostream &os, const WnbdHandler::IOContext &ctx); + +#endif // WNBD_HANDLER_H diff --git a/win32_build.sh b/win32_build.sh index 6abb9f5d6220..0b9b888baefb 100755 --- a/win32_build.sh +++ b/win32_build.sh @@ -36,8 +36,8 @@ backtraceDir="${depsToolsetDir}/backtrace" snappyDir="${depsToolsetDir}/snappy" winLibDir="${depsToolsetDir}/windows/lib" generatorUsed="Unix Makefiles" - -pyVersion=`python -c "import sys; print('%d.%d' % (sys.version_info.major, sys.version_info.minor))"` +wnbdSrcDir="${depsSrcDir}/wnbd" +wnbdLibDir="${depsToolsetDir}/wnbd/lib" depsDirs="$lz4Dir;$curlDir;$sslDir;$boostDir;$zlibDir;$backtraceDir;$snappyDir" depsDirs+=";$winLibDir" @@ -83,7 +83,6 @@ fi # or circular), we'll have to stick to static linking. cmake -D CMAKE_PREFIX_PATH=$depsDirs \ -D CMAKE_TOOLCHAIN_FILE="$CEPH_DIR/cmake/toolchains/mingw32.cmake" \ - -D MGR_PYTHON_VERSION=$pyVersion \ -D WITH_RDMA=OFF -D WITH_OPENLDAP=OFF \ -D WITH_GSSAPI=OFF -D WITH_FUSE=OFF -D WITH_XFS=OFF \ -D WITH_BLUESTORE=OFF -D WITH_LEVELDB=OFF \ @@ -100,6 +99,8 @@ cmake -D CMAKE_PREFIX_PATH=$depsDirs \ -D Boost_THREADAPI="pthread" \ -D ENABLE_GIT_VERSION=$ENABLE_GIT_VERSION \ -D ALLOCATOR="$ALLOCATOR" -D CMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE \ + -D WNBD_INCLUDE_DIRS="$wnbdSrcDir/include" \ + -D WNBD_LIBRARIES="$wnbdLibDir/libwnbd.a" \ -G "$generatorUsed" \ $CEPH_DIR 2>&1 | tee "${BUILD_DIR}/cmake.log" @@ -113,7 +114,7 @@ if [[ -z $SKIP_BUILD ]]; then make_targets["src/tools"]="ceph-conf ceph_radosacl ceph_scratchtool rados" make_targets["src/tools/immutable_object_cache"]="all" make_targets["src/tools/rbd"]="all" - make_targets["src/tools/rbd_mirror"]="all" + make_targets["src/tools/rbd_wnbd"]="all" make_targets["src/compressor"]="all" for target_subdir in "${!make_targets[@]}"; do diff --git a/win32_deps_build.sh b/win32_deps_build.sh index e31eadd7cb89..9ae28c54f32a 100755 --- a/win32_deps_build.sh +++ b/win32_deps_build.sh @@ -37,8 +37,10 @@ snappyTag="1.1.7" # Additional Windows libraries, which aren't provided by Mingw winLibDir="${depsToolsetDir}/windows/lib" - -MINGW_PREFIX="x86_64-w64-mingw32-" +wnbdUrl="https://github.com/cloudbase/wnbd" +wnbdTag="master" +wnbdSrcDir="${depsSrcDir}/wnbd" +wnbdLibDir="${depsToolsetDir}/wnbd/lib" function _make() { make -j $NUM_WORKERS $@