From 83aa1a9e73458f298f8dea2ad0eb7f1c0cd850be Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Mon, 11 Jan 2021 22:12:27 -0500 Subject: [PATCH] librbd/migration: basic QCOW v1 format handler The initial implementation does not support backing files, compression, nor (deprecated) encryption. The former two features will be added in a future commit. Signed-off-by: Jason Dillaman --- doc/rbd/rbd-live-migration.rst | 18 +- qa/workunits/rbd/cli_migration.sh | 70 +- src/librbd/CMakeLists.txt | 1 + src/librbd/io/ReadResult.cc | 4 +- src/librbd/io/ReadResult.h | 1 + src/librbd/migration/QCOW.h | 467 ++++++++++ src/librbd/migration/QCOWFormat.cc | 981 ++++++++++++++++++++++ src/librbd/migration/QCOWFormat.h | 146 ++++ src/librbd/migration/SourceSpecBuilder.cc | 3 + 9 files changed, 1686 insertions(+), 5 deletions(-) create mode 100644 src/librbd/migration/QCOW.h create mode 100644 src/librbd/migration/QCOWFormat.cc create mode 100644 src/librbd/migration/QCOWFormat.h diff --git a/doc/rbd/rbd-live-migration.rst b/doc/rbd/rbd-live-migration.rst index a0ba991fd6c..c81670fd5ea 100644 --- a/doc/rbd/rbd-live-migration.rst +++ b/doc/rbd/rbd-live-migration.rst @@ -143,8 +143,9 @@ The general format for the ``source-spec`` JSON is as follows:: } } -The following formats are currently supported: ``native`` and ``raw``. The -following streams are currently supported: ``file``, ``http``, and ``s3``. +The following formats are currently supported: ``native``, ``qcow``, and +``raw``. The following streams are currently supported: ``file``, ``http``, and +``s3``. Formats ~~~~~~~ @@ -176,6 +177,19 @@ it utilizes native Ceph operations. For example, to import from the image "snap_name": "snap1" } +The ``qcow`` format can be used to describe a QCOW (QEMU copy-on-write) block +device. Only the original QCOW (v1) format is currently supported, but support +for QCOW2 will be added in the near future. The ``qcow`` format data can be +linked to any supported stream source described below. 
For example, +its base ``source-spec`` JSON is encoded as follows:: + + { + "type": "qcow", + "stream": { + + } + } + The ``raw`` format can be used to describe a thick-provisioned, raw block device export (i.e. `rbd export --export-format 1 `). The ``raw`` format data can be linked to any supported stream source described below. For example, diff --git a/qa/workunits/rbd/cli_migration.sh b/qa/workunits/rbd/cli_migration.sh index 5f24f8ac2ce..cf665940c45 100755 --- a/qa/workunits/rbd/cli_migration.sh +++ b/qa/workunits/rbd/cli_migration.sh @@ -63,12 +63,29 @@ remove_images() { done } +show_diff() +{ + local file1=$1 + local file2=$2 + + xxd "${file1}" > "${file1}.xxd" + xxd "${file2}" > "${file2}.xxd" + sdiff -s "${file1}.xxd" "${file2}.xxd" | head -n 64 + rm -f "${file1}.xxd" "${file2}.xxd" +} + compare_images() { local src_image=$1 local dst_image=$2 + local ret=0 export_raw_image ${dst_image} - cmp "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}" + if ! cmp "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}" + then + show_diff "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}" + ret=1 + fi + return ${ret} } test_import_native_format() { @@ -124,6 +141,56 @@ EOF remove_image "${dest_image}" } +test_import_qcow_format() { + case "$(lsb_release --id --short)" in + RedHatEnterpriseWorkstation|RedHatEnterpriseServer|RedHatEnterprise|CentOS) + # QCOW format not included in EL variants + return + ;; + *) + ;; + esac + + local base_image=$1 + local dest_image=$2 + + qemu-img convert -f raw -O qcow rbd:rbd/${base_image} ${TEMPDIR}/${base_image}.qcow + qemu-img info -f qcow ${TEMPDIR}/${base_image}.qcow + + cat > ${TEMPDIR}/spec.json <ictx->cct; ldout(cct, 10) << "C_ImageReadRequest: r=" << r << dendl; - if (r >= 0) { + if (r >= 0 || (ignore_enoent && r == -ENOENT)) { striper::LightweightBufferExtents buffer_extents; size_t length = 0; for (auto &image_extent : image_extents) { buffer_extents.emplace_back(buffer_offset + length, image_extent.second); length += 
image_extent.second; } - ceph_assert(length == bl.length()); + ceph_assert(r == -ENOENT || length == bl.length()); aio_completion->lock.lock(); aio_completion->read_result.m_destriper.add_partial_result( diff --git a/src/librbd/io/ReadResult.h b/src/librbd/io/ReadResult.h index 1dfd15b6988..12a1e78cc63 100644 --- a/src/librbd/io/ReadResult.h +++ b/src/librbd/io/ReadResult.h @@ -30,6 +30,7 @@ public: uint64_t buffer_offset = 0; Extents image_extents; bufferlist bl; + bool ignore_enoent = false; C_ImageReadRequest(AioCompletion *aio_completion, uint64_t buffer_offset, diff --git a/src/librbd/migration/QCOW.h b/src/librbd/migration/QCOW.h new file mode 100644 index 00000000000..93ba65ed86c --- /dev/null +++ b/src/librbd/migration/QCOW.h @@ -0,0 +1,467 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* Based on QEMU block/qcow.cc and block/qcow2.h, which has this license: */ + +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef CEPH_LIBRBD_MIGRATION_QCOW2_H +#define CEPH_LIBRBD_MIGRATION_QCOW2_H + +#include "include/ceph_assert.h" +#include "include/int_types.h" +#include "librbd/migration/QCOW.h" +#include + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 +#define QCOW_CRYPT_LUKS 2 + +#define QCOW_MAX_CRYPT_CLUSTERS 32 +#define QCOW_MAX_SNAPSHOTS 65536 + +/* Field widths in qcow2 mean normal cluster offsets cannot reach + * 64PB; depending on cluster size, compressed clusters can have a + * smaller limit (64PB for up to 16k clusters, then ramps down to + * 512TB for 2M clusters). */ +#define QCOW_MAX_CLUSTER_OFFSET ((1ULL << 56) - 1) + +/* 8 MB refcount table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_REFTABLE_SIZE (1ULL << 23) + +/* 32 MB L1 table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_L1_SIZE (1ULL << 25) + +/* Allow for an average of 1k per snapshot table entry, should be plenty of + * space for snapshot names and IDs */ +#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS) + +/* Maximum amount of extra data per snapshot table entry to accept */ +#define QCOW_MAX_SNAPSHOT_EXTRA_DATA 1024 + +/* Bitmap header extension constraints */ +#define QCOW2_MAX_BITMAPS 65535 +#define QCOW2_MAX_BITMAP_DIRECTORY_SIZE (1024 * QCOW2_MAX_BITMAPS) + +/* Maximum of parallel sub-request per guest request */ +#define QCOW2_MAX_WORKERS 8 + +/* indicate that the refcount of the referenced cluster is exactly one. 
*/ +#define QCOW_OFLAG_COPIED (1ULL << 63) +/* indicate that the cluster is compressed (they never have the copied flag) */ +#define QCOW_OFLAG_COMPRESSED (1ULL << 62) +/* The cluster reads as all zeros */ +#define QCOW_OFLAG_ZERO (1ULL << 0) + +#define QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER 32 + +/* The subcluster X [0..31] is allocated */ +#define QCOW_OFLAG_SUB_ALLOC(X) (1ULL << (X)) +/* The subcluster X [0..31] reads as zeroes */ +#define QCOW_OFLAG_SUB_ZERO(X) (QCOW_OFLAG_SUB_ALLOC(X) << 32) +/* Subclusters [X, Y) (0 <= X <= Y <= 32) are allocated */ +#define QCOW_OFLAG_SUB_ALLOC_RANGE(X, Y) \ + (QCOW_OFLAG_SUB_ALLOC(Y) - QCOW_OFLAG_SUB_ALLOC(X)) +/* Subclusters [X, Y) (0 <= X <= Y <= 32) read as zeroes */ +#define QCOW_OFLAG_SUB_ZERO_RANGE(X, Y) \ + (QCOW_OFLAG_SUB_ALLOC_RANGE(X, Y) << 32) +/* L2 entry bitmap with all allocation bits set */ +#define QCOW_L2_BITMAP_ALL_ALLOC (QCOW_OFLAG_SUB_ALLOC_RANGE(0, 32)) +/* L2 entry bitmap with all "read as zeroes" bits set */ +#define QCOW_L2_BITMAP_ALL_ZEROES (QCOW_OFLAG_SUB_ZERO_RANGE(0, 32)) + +/* Size of normal and extended L2 entries */ +#define QCOW_L2E_SIZE_NORMAL (sizeof(uint64_t)) +#define QCOW_L2E_SIZE_EXTENDED (sizeof(uint64_t) * 2) + +/* Size of L1 table entries */ +#define QCOW_L1E_SIZE (sizeof(uint64_t)) + +/* Size of reftable entries */ +#define QCOW_REFTABLE_ENTRY_SIZE (sizeof(uint64_t)) + +#define QCOW_MIN_CLUSTER_BITS 9 +#define QCOW_MAX_CLUSTER_BITS 21 + +/* Defined in the qcow2 spec (compressed cluster descriptor) */ +#define QCOW2_COMPRESSED_SECTOR_SIZE 512U +#define QCOW2_COMPRESSED_SECTOR_MASK (~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL)) + +#define QCOW_L2_CACHE_SIZE 16 + +/* Must be at least 2 to cover COW */ +#define QCOW_MIN_L2_CACHE_SIZE 2 /* cache entries */ + +/* Must be at least 4 to cover all cases of refcount table growth */ +#define QCOW_MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */ + +#define QCOW_DEFAULT_L2_CACHE_MAX_SIZE (1ULL << 25) +#define QCOW_DEFAULT_CACHE_CLEAN_INTERVAL 600 /* seconds */ + 
+#define QCOW_DEFAULT_CLUSTER_SIZE 65536 + +#define QCOW2_OPT_DATA_FILE "data-file" +#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts" +#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request" +#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" +#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other" +#define QCOW2_OPT_OVERLAP "overlap-check" +#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template" +#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header" +#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1" +#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2" +#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table" +#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block" +#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table" +#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1" +#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2" +#define QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY "overlap-check.bitmap-directory" +#define QCOW2_OPT_CACHE_SIZE "cache-size" +#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size" +#define QCOW2_OPT_L2_CACHE_ENTRY_SIZE "l2-cache-entry-size" +#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size" +#define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval" + +typedef struct QCowHeaderProbe { + uint32_t magic; + uint32_t version; +} __attribute__((__packed__)) QCowHeaderProbe; + +typedef struct QCowHeaderV1 +{ + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t mtime; + uint64_t size; /* in bytes */ + uint8_t cluster_bits; + uint8_t l2_bits; + uint16_t padding; + uint32_t crypt_method; + uint64_t l1_table_offset; +} __attribute__((__packed__)) QCowHeaderV1; + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t cluster_bits; + uint64_t size; /* in bytes */ + uint32_t crypt_method; + 
uint32_t l1_size; /* XXX: save number of clusters instead ? */ + uint64_t l1_table_offset; + uint64_t refcount_table_offset; + uint32_t refcount_table_clusters; + uint32_t nb_snapshots; + uint64_t snapshots_offset; + + /* The following fields are only valid for version >= 3 */ + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + uint32_t refcount_order; + uint32_t header_length; + + /* Additional fields */ + uint8_t compression_type; + + /* header must be a multiple of 8 */ + uint8_t padding[7]; +} __attribute__((__packed__)) QCowHeader; + +typedef struct QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} __attribute__((__packed__)) QCowSnapshotHeader; + +typedef struct QCowSnapshotExtraData { + uint64_t vm_state_size_large; + uint64_t disk_size; + uint64_t icount; +} __attribute__((__packed__)) QCowSnapshotExtraData; + + +typedef struct QCowSnapshot { + uint64_t l1_table_offset; + uint32_t l1_size; + char *id_str; + char *name; + uint64_t disk_size; + uint64_t vm_state_size; + uint32_t date_sec; + uint32_t date_nsec; + uint64_t vm_clock_nsec; + /* icount value for the moment when snapshot was taken */ + uint64_t icount; + /* Size of all extra data, including QCowSnapshotExtraData if available */ + uint32_t extra_data_size; + /* Data beyond QCowSnapshotExtraData, if any */ + void *unknown_extra_data; +} QCowSnapshot; + +typedef struct Qcow2CryptoHeaderExtension { + uint64_t offset; + uint64_t length; +} __attribute__((__packed__)) Qcow2CryptoHeaderExtension; + +typedef struct Qcow2UnknownHeaderExtension { + uint32_t magic; + uint32_t len; + uint8_t data[]; +} Qcow2UnknownHeaderExtension; + +enum { + 
QCOW2_FEAT_TYPE_INCOMPATIBLE = 0, + QCOW2_FEAT_TYPE_COMPATIBLE = 1, + QCOW2_FEAT_TYPE_AUTOCLEAR = 2, +}; + +/* Incompatible feature bits */ +enum { + QCOW2_INCOMPAT_DIRTY_BITNR = 0, + QCOW2_INCOMPAT_CORRUPT_BITNR = 1, + QCOW2_INCOMPAT_DATA_FILE_BITNR = 2, + QCOW2_INCOMPAT_COMPRESSION_BITNR = 3, + QCOW2_INCOMPAT_EXTL2_BITNR = 4, + QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + QCOW2_INCOMPAT_CORRUPT = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR, + QCOW2_INCOMPAT_DATA_FILE = 1 << QCOW2_INCOMPAT_DATA_FILE_BITNR, + QCOW2_INCOMPAT_COMPRESSION = 1 << QCOW2_INCOMPAT_COMPRESSION_BITNR, + QCOW2_INCOMPAT_EXTL2 = 1 << QCOW2_INCOMPAT_EXTL2_BITNR, + + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY + | QCOW2_INCOMPAT_CORRUPT + | QCOW2_INCOMPAT_DATA_FILE + | QCOW2_INCOMPAT_COMPRESSION + | QCOW2_INCOMPAT_EXTL2, +}; + +/* Compatible feature bits */ +enum { + QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, + QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + + QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, +}; + +/* Autoclear feature bits */ +enum { + QCOW2_AUTOCLEAR_BITMAPS_BITNR = 0, + QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR = 1, + QCOW2_AUTOCLEAR_BITMAPS = 1 << QCOW2_AUTOCLEAR_BITMAPS_BITNR, + QCOW2_AUTOCLEAR_DATA_FILE_RAW = 1 << QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR, + + QCOW2_AUTOCLEAR_MASK = QCOW2_AUTOCLEAR_BITMAPS + | QCOW2_AUTOCLEAR_DATA_FILE_RAW, +}; + +enum qcow2_discard_type { + QCOW2_DISCARD_NEVER = 0, + QCOW2_DISCARD_ALWAYS, + QCOW2_DISCARD_REQUEST, + QCOW2_DISCARD_SNAPSHOT, + QCOW2_DISCARD_OTHER, + QCOW2_DISCARD_MAX +}; + +typedef struct Qcow2Feature { + uint8_t type; + uint8_t bit; + char name[46]; +} __attribute__((__packed__)) Qcow2Feature; + +typedef struct Qcow2DiscardRegion { + uint64_t offset; + uint64_t bytes; +} Qcow2DiscardRegion; + +typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array, + uint64_t index); +typedef void Qcow2SetRefcountFunc(void *refcount_array, + uint64_t index, uint64_t value); + +typedef struct Qcow2BitmapHeaderExt { + 
uint32_t nb_bitmaps; + uint32_t reserved32; + uint64_t bitmap_directory_size; + uint64_t bitmap_directory_offset; +} __attribute__((__packed__)) Qcow2BitmapHeaderExt; + +#define QCOW_RC_CACHE_SIZE QCOW_L2_CACHE_SIZE; + +typedef struct Qcow2COWRegion { + /** + * Offset of the COW region in bytes from the start of the first cluster + * touched by the request. + */ + unsigned offset; + + /** Number of bytes to copy */ + unsigned nb_bytes; +} Qcow2COWRegion; + +/** + * Describes an in-flight (part of a) write request that writes to clusters + * that are not referenced in their L2 table yet. + */ +typedef struct QCowL2Meta +{ + /** Guest offset of the first newly allocated cluster */ + uint64_t offset; + + /** Host offset of the first newly allocated cluster */ + uint64_t alloc_offset; + + /** Number of newly allocated clusters */ + int nb_clusters; + + /** Do not free the old clusters */ + bool keep_old_clusters; + + /** + * The COW Region between the start of the first allocated cluster and the + * area the guest actually writes to. + */ + Qcow2COWRegion cow_start; + + /** + * The COW Region between the area the guest actually writes to and the + * end of the last allocated cluster. + */ + Qcow2COWRegion cow_end; + + /* + * Indicates that COW regions are already handled and do not require + * any more processing. + */ + bool skip_cow; + + /** + * Indicates that this is not a normal write request but a preallocation. + * If the image has extended L2 entries this means that no new individual + * subclusters will be marked as allocated in the L2 bitmap (but any + * existing contents of that bitmap will be kept). 
+ */ + bool prealloc; + + /** Pointer to next L2Meta of the same write request */ + struct QCowL2Meta *next; +} QCowL2Meta; + +typedef enum QCow2ClusterType { + QCOW2_CLUSTER_UNALLOCATED, + QCOW2_CLUSTER_ZERO_PLAIN, + QCOW2_CLUSTER_ZERO_ALLOC, + QCOW2_CLUSTER_NORMAL, + QCOW2_CLUSTER_COMPRESSED, +} QCow2ClusterType; + +typedef enum QCow2MetadataOverlap { + QCOW2_OL_MAIN_HEADER_BITNR = 0, + QCOW2_OL_ACTIVE_L1_BITNR = 1, + QCOW2_OL_ACTIVE_L2_BITNR = 2, + QCOW2_OL_REFCOUNT_TABLE_BITNR = 3, + QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4, + QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5, + QCOW2_OL_INACTIVE_L1_BITNR = 6, + QCOW2_OL_INACTIVE_L2_BITNR = 7, + QCOW2_OL_BITMAP_DIRECTORY_BITNR = 8, + + QCOW2_OL_MAX_BITNR = 9, + + QCOW2_OL_NONE = 0, + QCOW2_OL_MAIN_HEADER = (1 << QCOW2_OL_MAIN_HEADER_BITNR), + QCOW2_OL_ACTIVE_L1 = (1 << QCOW2_OL_ACTIVE_L1_BITNR), + QCOW2_OL_ACTIVE_L2 = (1 << QCOW2_OL_ACTIVE_L2_BITNR), + QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR), + QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR), + QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR), + QCOW2_OL_INACTIVE_L1 = (1 << QCOW2_OL_INACTIVE_L1_BITNR), + /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv + * reads. 
*/ + QCOW2_OL_INACTIVE_L2 = (1 << QCOW2_OL_INACTIVE_L2_BITNR), + QCOW2_OL_BITMAP_DIRECTORY = (1 << QCOW2_OL_BITMAP_DIRECTORY_BITNR), +} QCow2MetadataOverlap; + +/* Perform all overlap checks which can be done in constant time */ +#define QCOW2_OL_CONSTANT \ + (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \ + QCOW2_OL_SNAPSHOT_TABLE | QCOW2_OL_BITMAP_DIRECTORY) + +/* Perform all overlap checks which don't require disk access */ +#define QCOW2_OL_CACHED \ + (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \ + QCOW2_OL_INACTIVE_L1) + +/* Perform all overlap checks */ +#define QCOW2_OL_ALL \ + (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) + +#define QCOW_L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define QCOW_L2E_OFFSET_MASK 0x00fffffffffffe00ULL +#define QCOW_L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL + +#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL + +#define INV_OFFSET (-1ULL) + +static inline uint64_t l2meta_cow_start(QCowL2Meta *m) +{ + return m->offset + m->cow_start.offset; +} + +static inline uint64_t l2meta_cow_end(QCowL2Meta *m) +{ + return m->offset + m->cow_end.offset + m->cow_end.nb_bytes; +} + +static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2) +{ + return r1 > r2 ? 
r1 - r2 : r2 - r1; +} + +#endif // CEPH_LIBRBD_MIGRATION_QCOW2_H diff --git a/src/librbd/migration/QCOWFormat.cc b/src/librbd/migration/QCOWFormat.cc new file mode 100644 index 00000000000..1e8956bdbe8 --- /dev/null +++ b/src/librbd/migration/QCOWFormat.cc @@ -0,0 +1,981 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/QCOWFormat.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/intarith.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ReadResult.h" +#include "librbd/migration/SnapshotInterface.h" +#include "librbd/migration/SourceSpecBuilder.h" +#include "librbd/migration/StreamInterface.h" +#include +#include +#include +#include +#include +#include +#include + +#define dout_subsys ceph_subsys_rbd + +namespace librbd { +namespace migration { + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat: " \ + << __func__ << ": " + +namespace { + +struct ClusterExtent { + uint64_t cluster_offset; + uint64_t cluster_length; + uint64_t intra_cluster_offset; + uint64_t image_offset; + uint64_t buffer_offset; + + ClusterExtent(uint64_t cluster_offset, uint64_t cluster_length, + uint64_t intra_cluster_offset, uint64_t image_offset, + uint64_t buffer_offset) + : cluster_offset(cluster_offset), cluster_length(cluster_length), + intra_cluster_offset(intra_cluster_offset), image_offset(image_offset), + buffer_offset(buffer_offset) { + } +}; + +typedef std::vector ClusterExtents; + +void populate_cluster_extents(CephContext* cct, uint64_t cluster_size, + const io::Extents& image_extents, + ClusterExtents* cluster_extents) { + uint64_t buffer_offset = 0; + for (auto [image_offset, image_length] : image_extents) { + while (image_length > 0) { + auto intra_cluster_offset = image_offset & (cluster_size - 1); + auto 
intra_cluster_length = cluster_size - intra_cluster_offset; + auto cluster_length = std::min(image_length, intra_cluster_length); + + ldout(cct, 20) << "image_offset=" << image_offset << ", " + << "image_length=" << image_length << ", " + << "cluster_length=" << cluster_length << dendl; + + + cluster_extents->emplace_back(0, cluster_length, intra_cluster_offset, + image_offset, buffer_offset); + + image_offset += cluster_length; + image_length -= cluster_length; + buffer_offset += cluster_length; + } + } +} + +} // anonymous namespace + +template +struct QCOWFormat::Cluster { + const uint64_t cluster_offset; + bufferlist cluster_data_bl; + + Cluster(uint64_t cluster_offset) : cluster_offset(cluster_offset) { + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat::ClusterCache: " \ + << this << " " << __func__ << ": " + +template +class QCOWFormat::ClusterCache { +public: + ClusterCache(QCOWFormat* qcow_format) + : qcow_format(qcow_format), + m_strand(*qcow_format->m_image_ctx->asio_engine) { + } + + void get_cluster(uint64_t cluster_offset, uint64_t cluster_length, + uint64_t intra_cluster_offset, bufferlist* bl, + Context* on_finish) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl; + + // cache state machine runs in a single strand thread + boost::asio::dispatch( + m_strand, + [this, cluster_offset, cluster_length, intra_cluster_offset, bl, + on_finish]() { + execute_get_cluster(cluster_offset, cluster_length, + intra_cluster_offset, bl, on_finish); + }); + } + +private: + typedef std::tuple Completion; + typedef std::list Completions; + + QCOWFormat* qcow_format; + boost::asio::io_context::strand m_strand; + + std::shared_ptr cluster; + std::unordered_map> clusters; + std::unordered_map cluster_completions; + + void execute_get_cluster(uint64_t cluster_offset, uint64_t cluster_length, + uint64_t intra_cluster_offset, bufferlist* bl, + Context* on_finish) { + auto 
cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl; + + if (cluster && cluster->cluster_offset == cluster_offset) { + // most-recent cluster matches + bl->substr_of(cluster->cluster_data_bl, intra_cluster_offset, + cluster_length); + boost::asio::post(*qcow_format->m_image_ctx->asio_engine, + [on_finish]() { on_finish->complete(0); }); + return; + } + + // record callback for cluster + cluster_completions[cluster_offset].emplace_back( + intra_cluster_offset, cluster_length, bl, on_finish); + if (clusters.count(cluster_offset) == 0) { + // start the new read request + auto cluster = std::make_shared(cluster_offset); + clusters[cluster_offset] = cluster; + + read_cluster(cluster); + } + } + + void read_cluster(std::shared_ptr cluster) { + auto cct = qcow_format->m_image_ctx->cct; + + uint64_t stream_offset = cluster->cluster_offset; + uint64_t stream_length = qcow_format->m_cluster_size; + if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) { + // compressed clusters encode the compressed length in the lower bits + stream_offset = cluster->cluster_offset & + qcow_format->m_cluster_offset_mask; + stream_length = (cluster->cluster_offset >> + (63 - qcow_format->m_cluster_bits)) & + (qcow_format->m_cluster_size - 1); + } + + ldout(cct, 20) << "cluster_offset=" << cluster->cluster_offset << ", " + << "stream_offset=" << stream_offset << ", " + << "stream_length=" << stream_length << dendl; + + // read the cluster into the cache entry + auto ctx = new LambdaContext([this, cluster](int r) { + boost::asio::post(m_strand, [this, cluster, r]() { + handle_read_cluster(r, cluster); }); }); + qcow_format->m_stream->read({{stream_offset, stream_length}}, + &cluster->cluster_data_bl, ctx); + } + + void handle_read_cluster(int r, std::shared_ptr cluster) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "cluster_offset=" << cluster->cluster_offset << dendl; + + auto completions = 
std::move(cluster_completions[cluster->cluster_offset]); + cluster_completions.erase(cluster->cluster_offset); + clusters.erase(cluster->cluster_offset); + + if (r < 0) { + lderr(cct) << "failed to read cluster offset " << cluster->cluster_offset + << ": " << cpp_strerror(r) << dendl; + } else { + if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) { + bufferlist compressed_bl{std::move(cluster->cluster_data_bl)}; + cluster->cluster_data_bl.clear(); + + // TODO + lderr(cct) << "support for compressed clusters is not available" + << dendl; + r = -EINVAL; + } + } + + // complete the IO back to caller + boost::asio::post(*qcow_format->m_image_ctx->asio_engine, + [r, cluster, completions=std::move(completions)]() { + for (auto completion : completions) { + if (r >= 0) { + std::get<2>(completion)->substr_of( + cluster->cluster_data_bl, + std::get<0>(completion), + std::get<1>(completion)); + } + std::get<3>(completion)->complete(r); + } + }); + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat::L2TableCache: " \ + << this << " " << __func__ << ": " + +template +class QCOWFormat::L2TableCache { +public: + L2TableCache(QCOWFormat* qcow_format, uint32_t l2_bits) + : qcow_format(qcow_format), l2_bits(l2_bits), l2_size(1UL << l2_bits), + m_strand(*qcow_format->m_image_ctx->asio_engine), + l2_cache_entries(QCOW_L2_CACHE_SIZE) { + } + + void get_cluster_offset(uint64_t image_offset, uint64_t* cluster_offset, + Context* on_finish) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "image_offset=" << image_offset << dendl; + + // cache state machine runs in a single strand thread + boost::asio::dispatch( + m_strand, [this, image_offset, cluster_offset, on_finish]() { + requests.emplace_back(image_offset, cluster_offset, on_finish); + }); + dispatch_get_cluster_offset(); + } + +private: + QCOWFormat* qcow_format; + uint32_t l2_bits; + uint32_t l2_size; + + boost::asio::io_context::strand m_strand; + + struct Request { 
+ uint64_t image_offset; + uint64_t* cluster_offset; + Context* on_finish; + + Request(uint64_t image_offset, uint64_t* cluster_offset, Context* on_finish) + : image_offset(image_offset), cluster_offset(cluster_offset), + on_finish(on_finish) { + } + }; + typedef std::deque Requests; + + struct L2Cache { + uint64_t l2_offset = 0; + uint64_t* l2_table = nullptr; + bufferlist l2_table_bl; + + uint32_t count = 0; + bool in_flight = false; + + int ret_val = 0; + }; + std::vector l2_cache_entries; + + Requests requests; + + void dispatch_get_cluster_offset() { + boost::asio::dispatch(m_strand, [this]() { execute_get_cluster_offset(); }); + } + + void execute_get_cluster_offset() { + auto cct = qcow_format->m_image_ctx->cct; + if (requests.empty()) { + return; + } + + auto request = requests.front(); + auto l1_index = request.image_offset >> + (l2_bits + qcow_format->m_cluster_bits); + auto l2_offset = qcow_format->m_l1_table[l1_index] & + qcow_format->m_cluster_mask; + auto l2_index = (request.image_offset >> qcow_format->m_cluster_bits) & + (l2_size - 1); + ldout(cct, 20) << "image_offset=" << request.image_offset << ", " + << "l1_index=" << l1_index << ", " + << "l2_offset=" << l2_offset << ", " + << "l2_index=" << l2_index << dendl; + + int r = 0; + if (l2_offset == 0) { + // L2 table has not been allocated for specified offset + ldout(cct, 20) << "image_offset=" << request.image_offset << ", " + << "cluster_offset=DNE" << dendl; + *request.cluster_offset = 0; + r = -ENOENT; + } else { + const uint64_t* l2_table = nullptr; + r = l2_table_lookup(l2_offset, &l2_table); + if (r < 0) { + lderr(cct) << "failed to load L2 table: l2_offset=" << l2_offset << ": " + << cpp_strerror(r) << dendl; + } else if (l2_table == nullptr) { + // table not in cache -- will restart once its loaded + return; + } else { + *request.cluster_offset = be64toh(l2_table[l2_index]); + ldout(cct, 20) << "image_offset=" << request.image_offset << ", " + << "cluster_offset=" << 
*request.cluster_offset << dendl; + } + } + + // complete the L2 cache request + boost::asio::post(*qcow_format->m_image_ctx->asio_engine, + [r, ctx=request.on_finish]() { ctx->complete(r); }); + requests.pop_front(); + + // process next request (if any) + dispatch_get_cluster_offset(); + } + + int l2_table_lookup(uint64_t l2_offset, const uint64_t** cluster_offset) { + auto cct = qcow_format->m_image_ctx->cct; + + *cluster_offset = nullptr; + + // find a match in the existing cache + for (auto idx = 0U; idx < l2_cache_entries.size(); ++idx) { + auto& l2_cache = l2_cache_entries[idx]; + if (l2_cache.l2_offset == l2_offset) { + if (l2_cache.in_flight) { + ldout(cct, 20) << "l2_offset=" << l2_offset << ", " + << "index=" << idx << " (in-flight)" << dendl; + return 0; + } + + if (l2_cache.ret_val < 0) { + ldout(cct, 20) << "l2_offset=" << l2_offset << ", " + << "index=" << idx << " (error)" << dendl; + l2_cache = L2Cache{}; + return l2_cache.ret_val; + } + + ++l2_cache.count; + if (l2_cache.count == std::numeric_limits::max()) { + for (auto& entry : l2_cache_entries) { + entry.count >>= 1; + } + } + + *cluster_offset = l2_cache.l2_table; + return 0; + } + } + + // find the least used entry + int32_t min_idx = -1; + uint32_t min_count = std::numeric_limits::max(); + for (uint32_t idx = 0U; idx < l2_cache_entries.size(); ++idx) { + auto& l2_cache = l2_cache_entries[idx]; + if (l2_cache.in_flight) { + continue; + } + + if (l2_cache.count > 0) { + --l2_cache.count; + } + if (l2_cache.count < min_count) { + min_count = l2_cache.count; + min_idx = idx; + } + } + + if (min_idx == -1) { + // no space in the cache due to in-flight requests + ldout(cct, 20) << "l2_offset=" << l2_offset << ", " + << "index=DNE (cache busy)" << dendl; + return 0; + } + + ldout(cct, 20) << "l2_offset=" << l2_offset << ", " + << "index=" << min_idx << " (loading)" << dendl; + auto& l2_cache = l2_cache_entries[min_idx]; + l2_cache.l2_offset = l2_offset; + l2_cache.count = 1; + l2_cache.in_flight = 
true; + + // read the L2 table into the L2 cache entry + auto ctx = new LambdaContext([this, index=min_idx, l2_offset](int r) { + boost::asio::post(m_strand, [this, index, l2_offset, r]() { + handle_l2_table_lookup(r, index, l2_offset); }); }); + l2_cache.l2_table_bl.clear(); + qcow_format->m_stream->read( + {{l2_offset, l2_size * sizeof(uint64_t)}}, &l2_cache.l2_table_bl, ctx); + + return 0; + } + + void handle_l2_table_lookup(int r, uint32_t index, uint64_t l2_offset) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "l2_offset=" << l2_offset << ", " + << "index=" << index << dendl; + + auto& l2_cache = l2_cache_entries[index]; + ceph_assert(l2_cache.in_flight); + l2_cache.in_flight = false; + + if (r < 0) { + lderr(cct) << "failed to load L2 table: " + << "l2_offset=" << l2_cache.l2_offset << ": " + << cpp_strerror(r) << dendl; + l2_cache.ret_val = r; + } else { + l2_cache.l2_table = reinterpret_cast( + l2_cache.l2_table_bl.c_str()); + } + + // restart the state machine + dispatch_get_cluster_offset(); + } + +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat::ReadRequest: " \ + << this << " " << __func__ << ": " + +template +class QCOWFormat::ReadRequest { +public: + ReadRequest(QCOWFormat* qcow_format, io::AioCompletion* aio_comp, + io::Extents&& image_extents) + : qcow_format(qcow_format), aio_comp(aio_comp), + image_extents(std::move(image_extents)) { + } + + void send() { + get_cluster_offsets(); + } + +private: + QCOWFormat* qcow_format; + io::AioCompletion* aio_comp; + + io::Extents image_extents; + size_t image_extents_idx = 0; + uint32_t image_extent_offset = 0; + + ClusterExtents cluster_extents; + + void get_cluster_offsets() { + auto cct = qcow_format->m_image_ctx->cct; + populate_cluster_extents(cct, qcow_format->m_cluster_size, image_extents, + &cluster_extents); + + ldout(cct, 20) << dendl; + auto ctx = new LambdaContext([this](int r) { + handle_get_cluster_offsets(r); }); 
+ auto gather_ctx = new C_Gather(cct, ctx); + + for (auto& cluster_extent : cluster_extents) { + auto sub_ctx = new LambdaContext( + [this, &cluster_extent, on_finish=gather_ctx->new_sub()](int r) { + handle_get_cluster_offset(r, cluster_extent, on_finish); }); + qcow_format->m_l2_table_cache->get_cluster_offset( + cluster_extent.image_offset, &cluster_extent.cluster_offset, sub_ctx); + } + + gather_ctx->activate(); + } + + void handle_get_cluster_offset(int r, const ClusterExtent& cluster_extent, + Context* on_finish) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "image_offset=" << cluster_extent.image_offset << ", " + << "cluster_offset=" << cluster_extent.cluster_offset + << dendl; + + if (r == -ENOENT) { + ldout(cct, 20) << "image offset DNE in QCOW image" << dendl; + r = 0; + } else if (r < 0) { + lderr(cct) << "failed to map image offset " << cluster_extent.image_offset + << ": " << cpp_strerror(r) << dendl; + } + + on_finish->complete(r); + } + + void handle_get_cluster_offsets(int r) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to retrieve cluster extents: " << cpp_strerror(r) + << dendl; + aio_comp->fail(r); + delete this; + return; + } + + read_clusters(); + } + + void read_clusters() { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << dendl; + + aio_comp->set_request_count(cluster_extents.size()); + for (auto& cluster_extent : cluster_extents) { + auto read_ctx = new io::ReadResult::C_ImageReadRequest( + aio_comp, cluster_extent.buffer_offset, + {{cluster_extent.image_offset, cluster_extent.cluster_length}}); + read_ctx->ignore_enoent = true; + + auto log_ctx = new LambdaContext( + [this, cct=qcow_format->m_image_ctx->cct, + image_offset=cluster_extent.image_offset, + image_length=cluster_extent.cluster_length, ctx=read_ctx](int r) { + handle_read_cluster(cct, r, image_offset, image_length, ctx); + }); + + if 
(cluster_extent.cluster_offset == 0) { + // QCOW header is at offset 0, implies cluster DNE + log_ctx->complete(-ENOENT); + } else { + // request the (sub)cluster from the cluster cache + qcow_format->m_cluster_cache->get_cluster( + cluster_extent.cluster_offset, cluster_extent.cluster_length, + cluster_extent.intra_cluster_offset, &read_ctx->bl, log_ctx); + } + } + + delete this; + } + + void handle_read_cluster(CephContext* cct, int r, uint64_t image_offset, + uint64_t image_length, Context* on_finish) { + ldout(cct, 20) << "r=" << r << ", " + << "image_offset=" << image_offset << ", " + << "image_length=" << image_length << dendl; + + if (r != -ENOENT && r < 0) { + lderr(cct) << "failed to read image extent " << image_offset << "~" + << image_length << ": " << cpp_strerror(r) << dendl; + } + + on_finish->complete(r); + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat::" \ + << "ListSnapsRequest: " << this << " " \ + << __func__ << ": " + +template +class QCOWFormat::ListSnapsRequest { +public: + ListSnapsRequest(QCOWFormat* qcow_format, io::Extents&& image_extents, + io::SparseExtents* sparse_extents, Context* on_finish) + : qcow_format(qcow_format), image_extents(std::move(image_extents)), + sparse_extents(sparse_extents), on_finish(on_finish) { + } + + void send() { + list_snaps(); + } + +private: + QCOWFormat* qcow_format; + io::Extents image_extents; + io::SparseExtents* sparse_extents; + Context* on_finish; + + ClusterExtents cluster_extents; + + void list_snaps() { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << dendl; + + populate_cluster_extents(cct, qcow_format->m_cluster_size, image_extents, + &cluster_extents); + + auto ctx = new LambdaContext([this](int r) { + handle_list_snaps(r); }); + auto gather_ctx = new C_Gather(cct, ctx); + for (auto& cluster_extent : cluster_extents) { + auto ctx = new LambdaContext( + [this, cluster_extent=&cluster_extent, + ctx=gather_ctx->new_sub()](int r) { + 
boost::asio::post( + qcow_format->m_strand, [this, cluster_extent, ctx, r]() { + handle_get_cluster_offset( r, *cluster_extent, ctx); + }); + }); + qcow_format->m_l2_table_cache->get_cluster_offset( + cluster_extent.image_offset, &cluster_extent.cluster_offset, ctx); + } + + gather_ctx->activate(); + } + + void handle_get_cluster_offset( + int r, const ClusterExtent& cluster_extent, Context* on_finish) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "image_offset=" << cluster_extent.image_offset << ", " + << "image_length=" << cluster_extent.cluster_length << ", " + << "cluster_offset=" << cluster_extent.cluster_offset + << dendl; + + if (r == -ENOENT) { + r = 0; + } else if (r >= 0 && cluster_extent.cluster_offset != 0) { + sparse_extents->insert( + cluster_extent.image_offset, cluster_extent.cluster_length, + {io::SPARSE_EXTENT_STATE_DATA, cluster_extent.cluster_length}); + } + + on_finish->complete(r); + } + + void handle_list_snaps(int r) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + on_finish->complete(r); + delete this; + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat: " << this \ + << " " << __func__ << ": " + +template +QCOWFormat::QCOWFormat( + I* image_ctx, const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder) + : m_image_ctx(image_ctx), m_json_object(json_object), + m_source_spec_builder(source_spec_builder), + m_strand(*image_ctx->asio_engine) { +} + +template +void QCOWFormat::open(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + int r = m_source_spec_builder->build_stream(m_json_object, &m_stream); + if (r < 0) { + lderr(cct) << "failed to build migration stream handler" << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_open(r, on_finish); }); + m_stream->open(ctx); +} 
+ +template +void QCOWFormat::handle_open(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to open QCOW image: " << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + probe(on_finish); +} + +template +void QCOWFormat::probe(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_probe(r, on_finish); }); + m_bl.clear(); + m_stream->read({{0, 8}}, &m_bl, ctx); +} + +template +void QCOWFormat::handle_probe(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to probe QCOW image: " << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + auto header_probe = *reinterpret_cast( + m_bl.c_str()); + header_probe.magic = be32toh(header_probe.magic); + header_probe.version = be32toh(header_probe.version); + + if (header_probe.magic != QCOW_MAGIC) { + lderr(cct) << "invalid QCOW header magic" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_bl.clear(); + if (header_probe.version == 1) { + read_v1_header(on_finish); + return; + } else if (header_probe.version == 2) { + read_v2_header(on_finish); + return; + } else { + lderr(cct) << "invalid QCOW header version " << header_probe.version + << dendl; + on_finish->complete(-EINVAL); + return; + } +} + +template +void QCOWFormat::read_v1_header(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_read_v1_header(r, on_finish); }); + m_bl.clear(); + m_stream->read({{0, sizeof(QCowHeaderV1)}}, &m_bl, ctx); +} + +template +void QCOWFormat::handle_read_v1_header(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to read QCOW header: " 
<< cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + auto header = *reinterpret_cast(m_bl.c_str()); + + // byte-swap important fields + header.magic = be32toh(header.magic); + header.version = be32toh(header.version); + header.backing_file_offset = be64toh(header.backing_file_offset); + header.backing_file_size = be32toh(header.backing_file_size); + header.size = be64toh(header.size); + header.crypt_method = be32toh(header.crypt_method); + header.l1_table_offset = be64toh(header.l1_table_offset); + + if (header.magic != QCOW_MAGIC || header.version != 1) { + // honestly shouldn't happen since we've already validated it + lderr(cct) << "header is not QCOW" << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (header.cluster_bits < QCOW_MIN_CLUSTER_BITS || + header.cluster_bits > QCOW_MAX_CLUSTER_BITS) { + lderr(cct) << "invalid cluster bits: " << header.cluster_bits << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (header.l2_bits < (QCOW_MIN_CLUSTER_BITS - 3) || + header.l2_bits > (QCOW_MAX_CLUSTER_BITS - 3)) { + lderr(cct) << "invalid L2 bits: " << header.l2_bits << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (header.crypt_method != QCOW_CRYPT_NONE) { + lderr(cct) << "invalid or unsupported encryption method" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_size = header.size; + if (p2roundup(m_size, static_cast(512)) != m_size) { + lderr(cct) << "image size is not a multiple of block size" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_backing_file_offset = header.backing_file_offset; + m_backing_file_size = header.backing_file_size; + + m_cluster_bits = header.cluster_bits; + m_cluster_size = 1UL << header.cluster_bits; + m_cluster_offset_mask = (1ULL << (63 - header.cluster_bits)) - 1; + m_cluster_mask = ~QCOW_OFLAG_COMPRESSED; + + uint32_t l2_bits = header.l2_bits; + uint32_t shift = m_cluster_bits + l2_bits; + m_l1_size = (m_size + (1LL << shift) - 1) >> shift; + 
m_l1_table_offset = header.l1_table_offset; + if (m_size > (std::numeric_limits::max() - (1ULL << shift)) || + m_l1_size > (std::numeric_limits::max() / sizeof(uint64_t))) { + lderr(cct) << "image size too big: " << m_size << dendl; + on_finish->complete(-EINVAL); + return; + } + + ldout(cct, 15) << "size=" << m_size << ", " + << "cluster_bits=" << m_cluster_bits << ", " + << "l2_bits=" << l2_bits << dendl; + + // allocate memory for L1 table and L2 + cluster caches + m_l2_table_cache = std::make_unique(this, l2_bits); + m_cluster_cache = std::make_unique(this); + + read_l1_table(on_finish); +} + +template +void QCOWFormat::read_v2_header(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_read_v2_header(r, on_finish); }); + m_bl.clear(); + m_stream->read({{0, sizeof(QCowHeader)}}, &m_bl, ctx); +} + +template +void QCOWFormat::handle_read_v2_header(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + // TODO add support for QCOW2 + on_finish->complete(-ENOTSUP); +} + +template +void QCOWFormat::read_l1_table(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_read_l1_table(r, on_finish); }); + m_stream->read({{m_l1_table_offset, + m_l1_size * sizeof(uint64_t)}}, &m_l1_table_bl, ctx); +} + +template +void QCOWFormat::handle_read_l1_table(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to read L1 table: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + // translate the L1 table (big-endian -> CPU endianess) + m_l1_table = reinterpret_cast(m_l1_table_bl.c_str()); + for (auto idx = 0UL; idx < m_l1_size; ++idx) { + m_l1_table[idx] = be64toh(m_l1_table[idx]); + } + + read_backing_file(on_finish); +} + +template 
+void QCOWFormat::read_backing_file(Context* on_finish) { + if (m_backing_file_offset == 0 || m_backing_file_size == 0) { + // all data is within the specified file + on_finish->complete(0); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + // TODO add support for backing files + on_finish->complete(-ENOTSUP); +} + +template +void QCOWFormat::close(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + m_stream->close(on_finish); +} + +template +void QCOWFormat::get_snapshots(SnapInfos* snap_infos, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + snap_infos->clear(); + + // TODO QCOW2 supports snapshots, not QCOW + on_finish->complete(0); +} + +template +void QCOWFormat::get_image_size(uint64_t snap_id, uint64_t* size, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + *size = m_size; + on_finish->complete(0); +} + +template +bool QCOWFormat::read( + io::AioCompletion* aio_comp, uint64_t snap_id, io::Extents&& image_extents, + io::ReadResult&& read_result, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "snap_id=" << snap_id << ", " + << "image_extents=" << image_extents << dendl; + + if (snap_id != CEPH_NOSNAP) { + // TODO add QCOW2 snapshot support + lderr(cct) << "snapshots are not supported" << dendl; + aio_comp->fail(-EINVAL); + return true; + } + + aio_comp->read_result = std::move(read_result); + aio_comp->read_result.set_image_extents(image_extents); + + auto read_request = new ReadRequest(this, aio_comp, std::move(image_extents)); + read_request->send(); + + return true; +} + +template +void QCOWFormat::list_snaps(io::Extents&& image_extents, + io::SnapIds&& snap_ids, int list_snaps_flags, + io::SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << 
image_extents << dendl; + + // TODO add QCOW2 snapshot support + + // QCOW does not support snapshots so just use cluster existence for delta + auto snapshot = &(*snapshot_delta)[{CEPH_NOSNAP, CEPH_NOSNAP}]; + auto list_snaps_request = new ListSnapsRequest( + this, io::Extents{image_extents}, snapshot, on_finish); + list_snaps_request->send(); +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::QCOWFormat; diff --git a/src/librbd/migration/QCOWFormat.h b/src/librbd/migration/QCOWFormat.h new file mode 100644 index 00000000000..ebcb062917a --- /dev/null +++ b/src/librbd/migration/QCOWFormat.h @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +
+#ifndef CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H +#define CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H + +#include "include/int_types.h" +#include "librbd/Types.h" +#include "librbd/migration/FormatInterface.h" +#include "librbd/migration/QCOW.h" +#include "json_spirit/json_spirit.h" +#include +#include +#include +#include +#include + +struct Context; + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; + +namespace migration { + +template struct SourceSpecBuilder; +struct StreamInterface; + +template +class QCOWFormat : public FormatInterface {
public: + static QCOWFormat* create( + ImageCtxT* image_ctx, const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder) { + return new QCOWFormat(image_ctx, json_object, source_spec_builder); + } + + QCOWFormat(ImageCtxT* image_ctx, const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder); + QCOWFormat(const QCOWFormat&) = delete; + QCOWFormat& operator=(const QCOWFormat&) = delete; + + void open(Context* on_finish) override; + void close(Context* on_finish) override; + + void get_snapshots(SnapInfos* snap_infos, Context* on_finish) override; + void get_image_size(uint64_t snap_id, uint64_t* size, + Context* on_finish) 
override; + + bool read(io::AioCompletion* aio_comp, uint64_t snap_id, + io::Extents&& image_extents, io::ReadResult&& read_result, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) override; + + void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids, + int list_snaps_flags, io::SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, + Context* on_finish) override; + +private: + /** + * @verbatim + * + * + * | + * v + * OPEN + * | + * v + * PROBE + * | + * |\---> READ V1 HEADER ----\ + * | | + * \---> READ V2 HEADER ----\| + * | + * v + * READ L1 TABLE + * | + * v + * READ BACKING FILE + * | + * /-------------------------/ + * | + * v + * + * + * @endverbatim + */ + + struct Cluster; + struct ClusterCache; + struct L2TableCache; + struct ReadRequest; + struct ListSnapsRequest; + + ImageCtxT* m_image_ctx; + json_spirit::mObject m_json_object; + const SourceSpecBuilder* m_source_spec_builder; + + boost::asio::io_context::strand m_strand; + std::shared_ptr m_stream; + + bufferlist m_bl; + + uint64_t m_size = 0; + + uint64_t m_backing_file_offset = 0; + uint32_t m_backing_file_size = 0; + + uint32_t m_cluster_bits = 0; + uint32_t m_cluster_size = 0; + uint64_t m_cluster_offset_mask = 0; + uint64_t m_cluster_mask = 0; + + uint32_t m_l1_size = 0; + uint64_t m_l1_table_offset = 0; + uint64_t* m_l1_table = nullptr; + bufferlist m_l1_table_bl; + + std::unique_ptr m_l2_table_cache; + std::unique_ptr m_cluster_cache; + + void handle_open(int r, Context* on_finish); + + void probe(Context* on_finish); + void handle_probe(int r, Context* on_finish); + + void read_v1_header(Context* on_finish); + void handle_read_v1_header(int r, Context* on_finish); + + void read_v2_header(Context* on_finish); + void handle_read_v2_header(int r, Context* on_finish); + + void read_l1_table(Context* on_finish); + void handle_read_l1_table(int r, Context* on_finish); + + void read_backing_file(Context* on_finish); +}; + +} // namespace migration +} // 
namespace librbd + +extern template class librbd::migration::QCOWFormat; + +#endif // CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H diff --git a/src/librbd/migration/SourceSpecBuilder.cc b/src/librbd/migration/SourceSpecBuilder.cc index 526522fba60..214d7ce0e5d 100644 --- a/src/librbd/migration/SourceSpecBuilder.cc +++ b/src/librbd/migration/SourceSpecBuilder.cc @@ -8,6 +8,7 @@ #include "librbd/migration/HttpStream.h" #include "librbd/migration/S3Stream.h" #include "librbd/migration/NativeFormat.h" +#include "librbd/migration/QCOWFormat.h" #include "librbd/migration/RawFormat.h" #include "librbd/migration/RawSnapshot.h" @@ -64,6 +65,8 @@ int SourceSpecBuilder::build_format( if (type == "native") { format->reset(NativeFormat::create(m_image_ctx, source_spec_object, import_only)); + } else if (type == "qcow") { + format->reset(QCOWFormat::create(m_image_ctx, source_spec_object, this)); } else if (type == "raw") { format->reset(RawFormat::create(m_image_ctx, source_spec_object, this)); } else { -- 2.39.5