]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd/migration: basic QCOW v1 format handler
authorJason Dillaman <dillaman@redhat.com>
Tue, 12 Jan 2021 03:12:27 +0000 (22:12 -0500)
committerJason Dillaman <dillaman@redhat.com>
Thu, 14 Jan 2021 14:35:34 +0000 (09:35 -0500)
The initial implementation supports neither backing files, compression, nor
(deprecated) encryption. Support for backing files and compression will be added in a future commit.

Signed-off-by: Jason Dillaman <dillaman@redhat.com>
doc/rbd/rbd-live-migration.rst
qa/workunits/rbd/cli_migration.sh
src/librbd/CMakeLists.txt
src/librbd/io/ReadResult.cc
src/librbd/io/ReadResult.h
src/librbd/migration/QCOW.h [new file with mode: 0644]
src/librbd/migration/QCOWFormat.cc [new file with mode: 0644]
src/librbd/migration/QCOWFormat.h [new file with mode: 0644]
src/librbd/migration/SourceSpecBuilder.cc

index a0ba991fd6cc45b2f90cb62c18f49a9a81b671b1..c81670fd5eaad0db6fe156dff0e6982db4fa453f 100644 (file)
@@ -143,8 +143,9 @@ The general format for the ``source-spec`` JSON is as follows::
             }
         }
 
-The following formats are currently supported: ``native`` and ``raw``. The
-following streams are currently supported: ``file``, ``http``, and ``s3``.
+The following formats are currently supported: ``native``, ``qcow``, and
+``raw``. The following streams are currently supported: ``file``, ``http``, and
+``s3``.
 
 Formats
 ~~~~~~~
@@ -176,6 +177,19 @@ it utilizes native Ceph operations. For example, to import from the image
             "snap_name": "snap1"
         }
 
+The ``qcow`` format can be used to describe a QCOW (QEMU copy-on-write) block
+device. Only the original QCOW (v1) format is currently supported, but support
+for QCOW2 will be added in the near future. The ``qcow`` format data can be
+linked to any supported stream source described below. For example,
+its base ``source-spec`` JSON is encoded as follows::
+
+        {
+            "type": "qcow",
+            "stream": {
+                <stream unique parameters for HEAD, non-snapshot revision>
+            }
+        }
+
 The ``raw`` format can be used to describe a thick-provisioned, raw block device
 export (i.e. `rbd export --export-format 1 <snap-spec>`). The ``raw`` format
 data can be linked to any supported stream source described below. For example,
index 5f24f8ac2cef39def02bdd42cdfa45dca1a6fa20..cf665940c45ba5b1ad4aa84e938c8b40261271ab 100755 (executable)
@@ -63,12 +63,29 @@ remove_images() {
     done
 }
 
+show_diff()
+{
+    # Print (at most 64 lines of) a side-by-side hex diff of two files.
+    # Temporary "<file>.xxd" dumps are written next to each input and
+    # removed again afterwards.
+    local lhs=$1
+    local rhs=$2
+    local lhs_dump="${lhs}.xxd"
+    local rhs_dump="${rhs}.xxd"
+
+    xxd "${lhs}" > "${lhs_dump}"
+    xxd "${rhs}" > "${rhs_dump}"
+    sdiff -s "${lhs_dump}" "${rhs_dump}" | head -n 64
+    rm -f "${lhs_dump}" "${rhs_dump}"
+}
+
 compare_images() {
+    # Byte-compare ${TEMPDIR}/<src_image> against ${TEMPDIR}/<dst_image>
+    # after calling export_raw_image on dst_image (presumably that helper
+    # writes the raw export to ${TEMPDIR}/<dst_image> -- it is defined
+    # outside this hunk). On a mismatch a truncated hex diff is printed
+    # and 1 is returned; otherwise 0 is returned.
     local src_image=$1
     local dst_image=$2
+    local ret=0
 
     export_raw_image ${dst_image}
-    cmp "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}"
+    if ! cmp "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}"
+    then
+        show_diff "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}"
+        ret=1
+    fi
+    return ${ret}
 }
 
 test_import_native_format() {
@@ -124,6 +141,56 @@ EOF
     remove_image "${dest_image}"
 }
 
+test_import_qcow_format() {
+    # Import-only migration test for the "qcow" source format.
+    #   $1: base image (converted to a QCOW file used as the source)
+    #   $2: destination image name for the migration
+    # The destination is compared against the base image after every
+    # migration state transition.
+    case "$(lsb_release --id --short)" in
+    RedHatEnterpriseWorkstation|RedHatEnterpriseServer|RedHatEnterprise|CentOS)
+        # QCOW format not included in EL variants
+        return
+        ;;
+    *)
+        ;;
+    esac
+
+    local base_image=$1
+    local dest_image=$2
+
+    # convert the RBD base image into a QCOW file and dump its metadata
+    qemu-img convert -f raw -O qcow rbd:rbd/${base_image} ${TEMPDIR}/${base_image}.qcow
+    qemu-img info -f qcow ${TEMPDIR}/${base_image}.qcow
+
+    # source-spec: "qcow" format layered on top of a local "file" stream
+    cat > ${TEMPDIR}/spec.json <<EOF
+{
+  "type": "qcow",
+  "stream": {
+    "type": "file",
+    "file_path": "${TEMPDIR}/${base_image}.qcow"
+  }
+}
+EOF
+    cat ${TEMPDIR}/spec.json
+
+    # prepare -> compare -> abort: the migration can be rolled back
+    rbd migration prepare --import-only \
+        --source-spec-path ${TEMPDIR}/spec.json ${dest_image}
+
+    compare_images "${base_image}" "${dest_image}"
+
+    rbd migration abort ${dest_image}
+
+    # prepare again and run the full prepare -> execute -> commit cycle,
+    # verifying the image contents after each step
+    rbd migration prepare --import-only \
+        --source-spec-path ${TEMPDIR}/spec.json ${dest_image}
+
+    compare_images "${base_image}" "${dest_image}"
+
+    rbd migration execute ${dest_image}
+
+    compare_images "${base_image}" "${dest_image}"
+
+    rbd migration commit ${dest_image}
+
+    compare_images "${base_image}" "${dest_image}"
+
+    remove_image "${dest_image}"
+}
+
 test_import_raw_format() {
     local base_image=$1
     local dest_image=$2
@@ -211,6 +278,7 @@ create_base_image ${IMAGE1}
 export_base_image ${IMAGE1}
 
 test_import_native_format ${IMAGE1} ${IMAGE2}
+test_import_qcow_format ${IMAGE1} ${IMAGE2}
 test_import_raw_format ${IMAGE1} ${IMAGE2}
 
 echo OK
index a5819e17b08839e4b2cba61916aa75fbbf223a6a..e3c9d29e9e62d601778f2c11e5f5d4176e396215 100644 (file)
@@ -140,6 +140,7 @@ set(librbd_internal_srcs
   migration/ImageDispatch.cc
   migration/NativeFormat.cc
   migration/OpenSourceImageRequest.cc
+  migration/QCOWFormat.cc
   migration/RawFormat.cc
   migration/RawSnapshot.cc
   migration/S3Stream.cc
index d8c03e1da38b1793d1379f7d8a5d6609be4bb24d..c4053fee6b3e33d694096b92ee6d1ea4dd6b01f2 100644 (file)
@@ -153,14 +153,14 @@ void ReadResult::C_ImageReadRequest::finish(int r) {
   CephContext *cct = aio_completion->ictx->cct;
   ldout(cct, 10) << "C_ImageReadRequest: r=" << r
                  << dendl;
-  if (r >= 0) {
+  if (r >= 0 || (ignore_enoent && r == -ENOENT)) {
     striper::LightweightBufferExtents buffer_extents;
     size_t length = 0;
     for (auto &image_extent : image_extents) {
       buffer_extents.emplace_back(buffer_offset + length, image_extent.second);
       length += image_extent.second;
     }
-    ceph_assert(length == bl.length());
+    ceph_assert(r == -ENOENT || length == bl.length());
 
     aio_completion->lock.lock();
     aio_completion->read_result.m_destriper.add_partial_result(
index 1dfd15b69880a08a2f200ee9fd652aa9be6d1837..12a1e78cc635c35fda064988f20732c33283a6c8 100644 (file)
@@ -30,6 +30,7 @@ public:
     uint64_t buffer_offset = 0;
     Extents image_extents;
     bufferlist bl;
+    bool ignore_enoent = false;
 
     C_ImageReadRequest(AioCompletion *aio_completion,
                        uint64_t buffer_offset,
diff --git a/src/librbd/migration/QCOW.h b/src/librbd/migration/QCOW.h
new file mode 100644 (file)
index 0000000..93ba65e
--- /dev/null
@@ -0,0 +1,467 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/* Based on QEMU block/qcow.c and block/qcow2.h, which have this license: */
+
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef CEPH_LIBRBD_MIGRATION_QCOW2_H
+#define CEPH_LIBRBD_MIGRATION_QCOW2_H
+
+#include "include/ceph_assert.h"
+#include "include/int_types.h"
+#include "librbd/migration/QCOW.h"
+#include <endian.h>
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES  1
+#define QCOW_CRYPT_LUKS 2
+
+#define QCOW_MAX_CRYPT_CLUSTERS 32
+#define QCOW_MAX_SNAPSHOTS 65536
+
+/* Field widths in qcow2 mean normal cluster offsets cannot reach
+ * 64PB; depending on cluster size, compressed clusters can have a
+ * smaller limit (64PB for up to 16k clusters, then ramps down to
+ * 512TB for 2M clusters).  */
+#define QCOW_MAX_CLUSTER_OFFSET ((1ULL << 56) - 1)
+
+/* 8 MB refcount table is enough for 2 PB images at 64k cluster size
+ * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
+#define QCOW_MAX_REFTABLE_SIZE (1ULL << 23)
+
+/* 32 MB L1 table is enough for 2 PB images at 64k cluster size
+ * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
+#define QCOW_MAX_L1_SIZE (1ULL << 25)
+
+/* Allow for an average of 1k per snapshot table entry, should be plenty of
+ * space for snapshot names and IDs */
+#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS)
+
+/* Maximum amount of extra data per snapshot table entry to accept */
+#define QCOW_MAX_SNAPSHOT_EXTRA_DATA 1024
+
+/* Bitmap header extension constraints */
+#define QCOW2_MAX_BITMAPS 65535
+#define QCOW2_MAX_BITMAP_DIRECTORY_SIZE (1024 * QCOW2_MAX_BITMAPS)
+
+/* Maximum of parallel sub-request per guest request */
+#define QCOW2_MAX_WORKERS 8
+
+/* indicate that the refcount of the referenced cluster is exactly one. */
+#define QCOW_OFLAG_COPIED     (1ULL << 63)
+/* indicate that the cluster is compressed (they never have the copied flag) */
+#define QCOW_OFLAG_COMPRESSED (1ULL << 62)
+/* The cluster reads as all zeros */
+#define QCOW_OFLAG_ZERO (1ULL << 0)
+
+#define QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER 32
+
+/* The subcluster X [0..31] is allocated */
+#define QCOW_OFLAG_SUB_ALLOC(X)   (1ULL << (X))
+/* The subcluster X [0..31] reads as zeroes */
+#define QCOW_OFLAG_SUB_ZERO(X)    (QCOW_OFLAG_SUB_ALLOC(X) << 32)
+/* Subclusters [X, Y) (0 <= X <= Y <= 32) are allocated */
+#define QCOW_OFLAG_SUB_ALLOC_RANGE(X, Y) \
+    (QCOW_OFLAG_SUB_ALLOC(Y) - QCOW_OFLAG_SUB_ALLOC(X))
+/* Subclusters [X, Y) (0 <= X <= Y <= 32) read as zeroes */
+#define QCOW_OFLAG_SUB_ZERO_RANGE(X, Y) \
+    (QCOW_OFLAG_SUB_ALLOC_RANGE(X, Y) << 32)
+/* L2 entry bitmap with all allocation bits set */
+#define QCOW_L2_BITMAP_ALL_ALLOC  (QCOW_OFLAG_SUB_ALLOC_RANGE(0, 32))
+/* L2 entry bitmap with all "read as zeroes" bits set */
+#define QCOW_L2_BITMAP_ALL_ZEROES (QCOW_OFLAG_SUB_ZERO_RANGE(0, 32))
+
+/* Size of normal and extended L2 entries */
+#define QCOW_L2E_SIZE_NORMAL   (sizeof(uint64_t))
+#define QCOW_L2E_SIZE_EXTENDED (sizeof(uint64_t) * 2)
+
+/* Size of L1 table entries */
+#define QCOW_L1E_SIZE (sizeof(uint64_t))
+
+/* Size of reftable entries */
+#define QCOW_REFTABLE_ENTRY_SIZE (sizeof(uint64_t))
+
+#define QCOW_MIN_CLUSTER_BITS 9
+#define QCOW_MAX_CLUSTER_BITS 21
+
+/* Defined in the qcow2 spec (compressed cluster descriptor) */
+#define QCOW2_COMPRESSED_SECTOR_SIZE 512U
+#define QCOW2_COMPRESSED_SECTOR_MASK (~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL))
+
+#define QCOW_L2_CACHE_SIZE 16
+
+/* Must be at least 2 to cover COW */
+#define QCOW_MIN_L2_CACHE_SIZE 2 /* cache entries */
+
+/* Must be at least 4 to cover all cases of refcount table growth */
+#define QCOW_MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
+
+#define QCOW_DEFAULT_L2_CACHE_MAX_SIZE (1ULL << 25)
+#define QCOW_DEFAULT_CACHE_CLEAN_INTERVAL 600  /* seconds */
+
+#define QCOW_DEFAULT_CLUSTER_SIZE 65536
+
+#define QCOW2_OPT_DATA_FILE "data-file"
+#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts"
+#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
+#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
+#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
+#define QCOW2_OPT_OVERLAP "overlap-check"
+#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template"
+#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header"
+#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1"
+#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2"
+#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table"
+#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block"
+#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table"
+#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1"
+#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2"
+#define QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY "overlap-check.bitmap-directory"
+#define QCOW2_OPT_CACHE_SIZE "cache-size"
+#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
+#define QCOW2_OPT_L2_CACHE_ENTRY_SIZE "l2-cache-entry-size"
+#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"
+#define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval"
+
+/*
+ * Leading two fields shared by QCowHeaderV1 and QCowHeader (both start with
+ * magic followed by version). Packed, so the struct is exactly 8 bytes.
+ * NOTE(review): presumably read first to decide which full header layout
+ * applies -- confirm against the header-parsing code.
+ */
+typedef struct QCowHeaderProbe {
+    uint32_t magic;
+    uint32_t version;
+} __attribute__((__packed__)) QCowHeaderProbe;
+
+/*
+ * Header layout for the original (version 1) QCOW format. The struct is
+ * packed so that it has no compiler-inserted padding.
+ * NOTE(review): multi-byte fields are presumably stored big-endian on disk
+ * (this header pulls in <endian.h>) -- confirm against the parsing code.
+ */
+typedef struct QCowHeaderV1
+{
+    uint32_t magic;               /* presumably matched against QCOW_MAGIC */
+    uint32_t version;
+    uint64_t backing_file_offset; /* backing file name location (see size below) */
+    uint32_t backing_file_size;
+    uint32_t mtime;
+    uint64_t size; /* in bytes */
+    uint8_t cluster_bits;         /* presumably log2 of the cluster size -- TODO confirm */
+    uint8_t l2_bits;              /* presumably log2 of entries per L2 table -- TODO confirm */
+    uint16_t padding;
+    uint32_t crypt_method;        /* presumably one of the QCOW_CRYPT_* values */
+    uint64_t l1_table_offset;
+} __attribute__((__packed__)) QCowHeaderV1;
+
+typedef struct QCowHeader {
+    uint32_t magic;
+    uint32_t version;
+    uint64_t backing_file_offset;
+    uint32_t backing_file_size;
+    uint32_t cluster_bits;
+    uint64_t size; /* in bytes */
+    uint32_t crypt_method;
+    uint32_t l1_size; /* XXX: save number of clusters instead ? */
+    uint64_t l1_table_offset;
+    uint64_t refcount_table_offset;
+    uint32_t refcount_table_clusters;
+    uint32_t nb_snapshots;
+    uint64_t snapshots_offset;
+
+    /* The following fields are only valid for version >= 3 */
+    uint64_t incompatible_features;
+    uint64_t compatible_features;
+    uint64_t autoclear_features;
+
+    uint32_t refcount_order;
+    uint32_t header_length;
+
+    /* Additional fields */
+    uint8_t compression_type;
+
+    /* header must be a multiple of 8 */
+    uint8_t padding[7];
+} __attribute__((__packed__)) QCowHeader;
+
+typedef struct QCowSnapshotHeader {
+    /* header is 8 byte aligned */
+    uint64_t l1_table_offset;
+
+    uint32_t l1_size;
+    uint16_t id_str_size;
+    uint16_t name_size;
+
+    uint32_t date_sec;
+    uint32_t date_nsec;
+
+    uint64_t vm_clock_nsec;
+
+    uint32_t vm_state_size;
+    uint32_t extra_data_size; /* for extension */
+    /* extra data follows */
+    /* id_str follows */
+    /* name follows  */
+} __attribute__((__packed__)) QCowSnapshotHeader;
+
+typedef struct QCowSnapshotExtraData {
+    uint64_t vm_state_size_large;
+    uint64_t disk_size;
+    uint64_t icount;
+} __attribute__((__packed__)) QCowSnapshotExtraData;
+
+
+typedef struct QCowSnapshot {
+    uint64_t l1_table_offset;
+    uint32_t l1_size;
+    char *id_str;
+    char *name;
+    uint64_t disk_size;
+    uint64_t vm_state_size;
+    uint32_t date_sec;
+    uint32_t date_nsec;
+    uint64_t vm_clock_nsec;
+    /* icount value for the moment when snapshot was taken */
+    uint64_t icount;
+    /* Size of all extra data, including QCowSnapshotExtraData if available */
+    uint32_t extra_data_size;
+    /* Data beyond QCowSnapshotExtraData, if any */
+    void *unknown_extra_data;
+} QCowSnapshot;
+
+typedef struct Qcow2CryptoHeaderExtension {
+    uint64_t offset;
+    uint64_t length;
+} __attribute__((__packed__)) Qcow2CryptoHeaderExtension;
+
+typedef struct Qcow2UnknownHeaderExtension {
+    uint32_t magic;
+    uint32_t len;
+    uint8_t data[];
+} Qcow2UnknownHeaderExtension;
+
+enum {
+    QCOW2_FEAT_TYPE_INCOMPATIBLE    = 0,
+    QCOW2_FEAT_TYPE_COMPATIBLE      = 1,
+    QCOW2_FEAT_TYPE_AUTOCLEAR       = 2,
+};
+
+/* Incompatible feature bits */
+enum {
+    QCOW2_INCOMPAT_DIRTY_BITNR      = 0,
+    QCOW2_INCOMPAT_CORRUPT_BITNR    = 1,
+    QCOW2_INCOMPAT_DATA_FILE_BITNR  = 2,
+    QCOW2_INCOMPAT_COMPRESSION_BITNR = 3,
+    QCOW2_INCOMPAT_EXTL2_BITNR      = 4,
+    QCOW2_INCOMPAT_DIRTY            = 1 << QCOW2_INCOMPAT_DIRTY_BITNR,
+    QCOW2_INCOMPAT_CORRUPT          = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR,
+    QCOW2_INCOMPAT_DATA_FILE        = 1 << QCOW2_INCOMPAT_DATA_FILE_BITNR,
+    QCOW2_INCOMPAT_COMPRESSION      = 1 << QCOW2_INCOMPAT_COMPRESSION_BITNR,
+    QCOW2_INCOMPAT_EXTL2            = 1 << QCOW2_INCOMPAT_EXTL2_BITNR,
+
+    QCOW2_INCOMPAT_MASK             = QCOW2_INCOMPAT_DIRTY
+                                    | QCOW2_INCOMPAT_CORRUPT
+                                    | QCOW2_INCOMPAT_DATA_FILE
+                                    | QCOW2_INCOMPAT_COMPRESSION
+                                    | QCOW2_INCOMPAT_EXTL2,
+};
+
+/* Compatible feature bits */
+enum {
+    QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0,
+    QCOW2_COMPAT_LAZY_REFCOUNTS       = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
+
+    QCOW2_COMPAT_FEAT_MASK            = QCOW2_COMPAT_LAZY_REFCOUNTS,
+};
+
+/* Autoclear feature bits */
+enum {
+    QCOW2_AUTOCLEAR_BITMAPS_BITNR       = 0,
+    QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR = 1,
+    QCOW2_AUTOCLEAR_BITMAPS             = 1 << QCOW2_AUTOCLEAR_BITMAPS_BITNR,
+    QCOW2_AUTOCLEAR_DATA_FILE_RAW       = 1 << QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR,
+
+    QCOW2_AUTOCLEAR_MASK                = QCOW2_AUTOCLEAR_BITMAPS
+                                        | QCOW2_AUTOCLEAR_DATA_FILE_RAW,
+};
+
+enum qcow2_discard_type {
+    QCOW2_DISCARD_NEVER = 0,
+    QCOW2_DISCARD_ALWAYS,
+    QCOW2_DISCARD_REQUEST,
+    QCOW2_DISCARD_SNAPSHOT,
+    QCOW2_DISCARD_OTHER,
+    QCOW2_DISCARD_MAX
+};
+
+typedef struct Qcow2Feature {
+    uint8_t type;
+    uint8_t bit;
+    char    name[46];
+} __attribute__((__packed__)) Qcow2Feature;
+
+typedef struct Qcow2DiscardRegion {
+    uint64_t offset;
+    uint64_t bytes;
+} Qcow2DiscardRegion;
+
+typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array,
+                                      uint64_t index);
+typedef void Qcow2SetRefcountFunc(void *refcount_array,
+                                  uint64_t index, uint64_t value);
+
+typedef struct Qcow2BitmapHeaderExt {
+    uint32_t nb_bitmaps;
+    uint32_t reserved32;
+    uint64_t bitmap_directory_size;
+    uint64_t bitmap_directory_offset;
+} __attribute__((__packed__)) Qcow2BitmapHeaderExt;
+
+/* Refcount cache uses the same number of entries as the L2 cache. No
+ * trailing semicolon: the macro must be usable inside expressions. */
+#define QCOW_RC_CACHE_SIZE QCOW_L2_CACHE_SIZE
+
+typedef struct Qcow2COWRegion {
+    /**
+     * Offset of the COW region in bytes from the start of the first cluster
+     * touched by the request.
+     */
+    unsigned    offset;
+
+    /** Number of bytes to copy */
+    unsigned    nb_bytes;
+} Qcow2COWRegion;
+
+/**
+ * Describes an in-flight (part of a) write request that writes to clusters
+ * that are not referenced in their L2 table yet.
+ */
+typedef struct QCowL2Meta
+{
+    /** Guest offset of the first newly allocated cluster */
+    uint64_t offset;
+
+    /** Host offset of the first newly allocated cluster */
+    uint64_t alloc_offset;
+
+    /** Number of newly allocated clusters */
+    int nb_clusters;
+
+    /** Do not free the old clusters */
+    bool keep_old_clusters;
+
+    /**
+     * The COW Region between the start of the first allocated cluster and the
+     * area the guest actually writes to.
+     */
+    Qcow2COWRegion cow_start;
+
+    /**
+     * The COW Region between the area the guest actually writes to and the
+     * end of the last allocated cluster.
+     */
+    Qcow2COWRegion cow_end;
+
+    /*
+     * Indicates that COW regions are already handled and do not require
+     * any more processing.
+     */
+    bool skip_cow;
+
+    /**
+     * Indicates that this is not a normal write request but a preallocation.
+     * If the image has extended L2 entries this means that no new individual
+     * subclusters will be marked as allocated in the L2 bitmap (but any
+     * existing contents of that bitmap will be kept).
+     */
+    bool prealloc;
+
+    /** Pointer to next L2Meta of the same write request */
+    struct QCowL2Meta *next;
+} QCowL2Meta;
+
+typedef enum QCow2ClusterType {
+    QCOW2_CLUSTER_UNALLOCATED,
+    QCOW2_CLUSTER_ZERO_PLAIN,
+    QCOW2_CLUSTER_ZERO_ALLOC,
+    QCOW2_CLUSTER_NORMAL,
+    QCOW2_CLUSTER_COMPRESSED,
+} QCow2ClusterType;
+
+typedef enum QCow2MetadataOverlap {
+    QCOW2_OL_MAIN_HEADER_BITNR      = 0,
+    QCOW2_OL_ACTIVE_L1_BITNR        = 1,
+    QCOW2_OL_ACTIVE_L2_BITNR        = 2,
+    QCOW2_OL_REFCOUNT_TABLE_BITNR   = 3,
+    QCOW2_OL_REFCOUNT_BLOCK_BITNR   = 4,
+    QCOW2_OL_SNAPSHOT_TABLE_BITNR   = 5,
+    QCOW2_OL_INACTIVE_L1_BITNR      = 6,
+    QCOW2_OL_INACTIVE_L2_BITNR      = 7,
+    QCOW2_OL_BITMAP_DIRECTORY_BITNR = 8,
+
+    QCOW2_OL_MAX_BITNR              = 9,
+
+    QCOW2_OL_NONE             = 0,
+    QCOW2_OL_MAIN_HEADER      = (1 << QCOW2_OL_MAIN_HEADER_BITNR),
+    QCOW2_OL_ACTIVE_L1        = (1 << QCOW2_OL_ACTIVE_L1_BITNR),
+    QCOW2_OL_ACTIVE_L2        = (1 << QCOW2_OL_ACTIVE_L2_BITNR),
+    QCOW2_OL_REFCOUNT_TABLE   = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR),
+    QCOW2_OL_REFCOUNT_BLOCK   = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR),
+    QCOW2_OL_SNAPSHOT_TABLE   = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR),
+    QCOW2_OL_INACTIVE_L1      = (1 << QCOW2_OL_INACTIVE_L1_BITNR),
+    /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv
+     * reads. */
+    QCOW2_OL_INACTIVE_L2      = (1 << QCOW2_OL_INACTIVE_L2_BITNR),
+    QCOW2_OL_BITMAP_DIRECTORY = (1 << QCOW2_OL_BITMAP_DIRECTORY_BITNR),
+} QCow2MetadataOverlap;
+
+/* Perform all overlap checks which can be done in constant time */
+#define QCOW2_OL_CONSTANT \
+    (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \
+     QCOW2_OL_SNAPSHOT_TABLE | QCOW2_OL_BITMAP_DIRECTORY)
+
+/* Perform all overlap checks which don't require disk access */
+#define QCOW2_OL_CACHED \
+    (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \
+     QCOW2_OL_INACTIVE_L1)
+
+/* Perform all overlap checks */
+#define QCOW2_OL_ALL \
+    (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2)
+
+#define QCOW_L1E_OFFSET_MASK 0x00fffffffffffe00ULL
+#define QCOW_L2E_OFFSET_MASK 0x00fffffffffffe00ULL
+#define QCOW_L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL
+
+#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL
+
+#define INV_OFFSET (-1ULL)
+
+/* Guest offset at which the leading COW region of @m begins: the guest
+ * offset of the first newly allocated cluster plus the region's offset
+ * within it. */
+static inline uint64_t l2meta_cow_start(QCowL2Meta *m)
+{
+    return m->offset + m->cow_start.offset;
+}
+
+/* Guest offset one byte past the trailing COW region of @m (region start
+ * plus the number of bytes to copy). */
+static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
+{
+    return m->offset + m->cow_end.offset + m->cow_end.nb_bytes;
+}
+
+/* Absolute difference of two refcounts; the larger value is always the
+ * minuend so the unsigned subtraction cannot wrap. */
+static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)
+{
+    return r1 > r2 ? r1 - r2 : r2 - r1;
+}
+
+#endif // CEPH_LIBRBD_MIGRATION_QCOW2_H
diff --git a/src/librbd/migration/QCOWFormat.cc b/src/librbd/migration/QCOWFormat.cc
new file mode 100644 (file)
index 0000000..1e8956b
--- /dev/null
@@ -0,0 +1,981 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/QCOWFormat.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/intarith.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/migration/SnapshotInterface.h"
+#include "librbd/migration/SourceSpecBuilder.h"
+#include "librbd/migration/StreamInterface.h"
+#include <boost/asio/dispatch.hpp>
+#include <boost/asio/post.hpp>
+#include <deque>
+#include <endian.h>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace librbd {
+namespace migration {
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat: " \
+                           << __func__ << ": "
+
+namespace {
+
+// One slice of an image read request that lies within a single cluster.
+struct ClusterExtent {
+  // offset of the backing cluster (populate_cluster_extents() stores 0 here;
+  // presumably resolved later through the L2 table -- TODO confirm)
+  uint64_t cluster_offset;
+  // number of requested bytes that fall inside this cluster
+  uint64_t cluster_length;
+  // byte offset of the slice relative to the start of its cluster
+  uint64_t intra_cluster_offset;
+  // byte offset of the slice within the image
+  uint64_t image_offset;
+  // byte offset of the slice within the caller's result buffer
+  uint64_t buffer_offset;
+
+  ClusterExtent(uint64_t cluster_off, uint64_t cluster_len,
+                uint64_t intra_cluster_off, uint64_t image_off,
+                uint64_t buffer_off)
+    : cluster_offset{cluster_off},
+      cluster_length{cluster_len},
+      intra_cluster_offset{intra_cluster_off},
+      image_offset{image_off},
+      buffer_offset{buffer_off} {
+  }
+};
+
+typedef std::vector<ClusterExtent> ClusterExtents;
+
+void populate_cluster_extents(CephContext* cct, uint64_t cluster_size,
+                              const io::Extents& image_extents,
+                              ClusterExtents* cluster_extents) {
+  uint64_t buffer_offset = 0;
+  for (auto [image_offset, image_length] : image_extents) {
+    while (image_length > 0) {
+      auto intra_cluster_offset = image_offset & (cluster_size - 1);
+      auto intra_cluster_length = cluster_size - intra_cluster_offset;
+      auto cluster_length = std::min(image_length, intra_cluster_length);
+
+      ldout(cct, 20) << "image_offset=" << image_offset << ", "
+                     << "image_length=" << image_length << ", "
+                     << "cluster_length=" << cluster_length << dendl;
+
+
+      cluster_extents->emplace_back(0, cluster_length, intra_cluster_offset,
+                                   image_offset, buffer_offset);
+
+      image_offset += cluster_length;
+      image_length -= cluster_length;
+      buffer_offset += cluster_length;
+    }
+  }
+}
+
+} // anonymous namespace
+
+// Cache entry for a single cluster: the (immutable) cluster offset it was
+// created for plus the buffer the stream read is stored into.
+template <typename I>
+struct QCOWFormat<I>::Cluster {
+  const uint64_t cluster_offset;
+  bufferlist cluster_data_bl;
+
+  Cluster(uint64_t cluster_offset)
+    : cluster_offset{cluster_offset} {
+  }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat::ClusterCache: " \
+                           << this << " " << __func__ << ": "
+
+template <typename I>
+class QCOWFormat<I>::ClusterCache {
+public:
+  ClusterCache(QCOWFormat* qcow_format)
+    : qcow_format(qcow_format),
+      m_strand(*qcow_format->m_image_ctx->asio_engine) {
+  }
+
+  void get_cluster(uint64_t cluster_offset, uint64_t cluster_length,
+                   uint64_t intra_cluster_offset, bufferlist* bl,
+                   Context* on_finish) {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl;
+
+    // cache state machine runs in a single strand thread
+    boost::asio::dispatch(
+      m_strand,
+      [this, cluster_offset, cluster_length, intra_cluster_offset, bl,
+       on_finish]() {
+        execute_get_cluster(cluster_offset, cluster_length,
+                            intra_cluster_offset, bl, on_finish);
+      });
+  }
+
+private:
+  typedef std::tuple<uint64_t, uint64_t, bufferlist*, Context*> Completion;
+  typedef std::list<Completion> Completions;
+
+  QCOWFormat* qcow_format;
+  boost::asio::io_context::strand m_strand;
+
+  std::shared_ptr<Cluster> cluster;
+  std::unordered_map<uint64_t, std::shared_ptr<Cluster>> clusters;
+  std::unordered_map<uint64_t, Completions> cluster_completions;
+
+  void execute_get_cluster(uint64_t cluster_offset, uint64_t cluster_length,
+                           uint64_t intra_cluster_offset, bufferlist* bl,
+                           Context* on_finish) {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl;
+
+    if (cluster && cluster->cluster_offset == cluster_offset) {
+      // most-recent cluster matches
+      bl->substr_of(cluster->cluster_data_bl, intra_cluster_offset,
+                    cluster_length);
+      boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
+                        [on_finish]() { on_finish->complete(0); });
+      return;
+    }
+
+    // record callback for cluster
+    cluster_completions[cluster_offset].emplace_back(
+      intra_cluster_offset, cluster_length, bl, on_finish);
+    if (clusters.count(cluster_offset) == 0) {
+      // start the new read request
+      auto cluster = std::make_shared<Cluster>(cluster_offset);
+      clusters[cluster_offset] = cluster;
+
+      read_cluster(cluster);
+    }
+  }
+
+  void read_cluster(std::shared_ptr<Cluster> cluster) {
+    auto cct = qcow_format->m_image_ctx->cct;
+
+    uint64_t stream_offset = cluster->cluster_offset;
+    uint64_t stream_length = qcow_format->m_cluster_size;
+    if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) {
+      // compressed clusters encode the compressed length in the lower bits
+      stream_offset = cluster->cluster_offset &
+                      qcow_format->m_cluster_offset_mask;
+      stream_length = (cluster->cluster_offset >>
+                        (63 - qcow_format->m_cluster_bits)) &
+                      (qcow_format->m_cluster_size - 1);
+    }
+
+    ldout(cct, 20) << "cluster_offset=" << cluster->cluster_offset << ", "
+                   << "stream_offset=" << stream_offset << ", "
+                   << "stream_length=" << stream_length << dendl;
+
+    // read the cluster into the cache entry
+    auto ctx = new LambdaContext([this, cluster](int r) {
+      boost::asio::post(m_strand, [this, cluster, r]() {
+        handle_read_cluster(r, cluster); }); });
+    qcow_format->m_stream->read({{stream_offset, stream_length}},
+                                &cluster->cluster_data_bl, ctx);
+  }
+
+  void handle_read_cluster(int r, std::shared_ptr<Cluster> cluster) {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << "r=" << r << ", "
+                   << "cluster_offset=" << cluster->cluster_offset << dendl;
+
+    auto completions = std::move(cluster_completions[cluster->cluster_offset]);
+    cluster_completions.erase(cluster->cluster_offset);
+    clusters.erase(cluster->cluster_offset);
+
+    if (r < 0) {
+      lderr(cct) << "failed to read cluster offset " << cluster->cluster_offset
+                 << ": " << cpp_strerror(r) << dendl;
+    } else {
+      if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) {
+        bufferlist compressed_bl{std::move(cluster->cluster_data_bl)};
+        cluster->cluster_data_bl.clear();
+
+        // TODO
+        lderr(cct) << "support for compressed clusters is not available"
+                   << dendl;
+        r = -EINVAL;
+      }
+    }
+
+    // complete the IO back to caller
+    boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
+                      [r, cluster, completions=std::move(completions)]() {
+      for (auto completion : completions) {
+        if (r >= 0) {
+          std::get<2>(completion)->substr_of(
+            cluster->cluster_data_bl,
+            std::get<0>(completion),
+            std::get<1>(completion));
+        }
+        std::get<3>(completion)->complete(r);
+      }
+    });
+  }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat::L2TableCache: " \
+                           << this << " " << __func__ << ": "
+
+template <typename I>
+class QCOWFormat<I>::L2TableCache {
+public:
+  L2TableCache(QCOWFormat* qcow_format, uint32_t l2_bits)
+    : qcow_format(qcow_format), l2_bits(l2_bits), l2_size(1UL << l2_bits),
+      m_strand(*qcow_format->m_image_ctx->asio_engine),
+      l2_cache_entries(QCOW_L2_CACHE_SIZE) {
+  }
+
+  void get_cluster_offset(uint64_t image_offset, uint64_t* cluster_offset,
+                          Context* on_finish) {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << "image_offset=" << image_offset << dendl;
+
+    // cache state machine runs in a single strand thread
+    boost::asio::dispatch(
+      m_strand, [this, image_offset, cluster_offset, on_finish]() {
+        requests.emplace_back(image_offset, cluster_offset, on_finish);
+      });
+    dispatch_get_cluster_offset();
+  }
+
+private:
+  QCOWFormat* qcow_format;
+  uint32_t l2_bits;
+  uint32_t l2_size;
+
+  boost::asio::io_context::strand m_strand;
+
+  struct Request {
+    uint64_t image_offset;
+    uint64_t* cluster_offset;
+    Context* on_finish;
+
+    Request(uint64_t image_offset, uint64_t* cluster_offset, Context* on_finish)
+      : image_offset(image_offset), cluster_offset(cluster_offset),
+        on_finish(on_finish) {
+    }
+  };
+  typedef std::deque<Request> Requests;
+
+  struct L2Cache {
+    uint64_t l2_offset = 0;
+    uint64_t* l2_table = nullptr;
+    bufferlist l2_table_bl;
+
+    uint32_t count = 0;
+    bool in_flight = false;
+
+    int ret_val = 0;
+  };
+  std::vector<L2Cache> l2_cache_entries;
+
+  Requests requests;
+
+  void dispatch_get_cluster_offset() {
+    boost::asio::dispatch(m_strand, [this]() { execute_get_cluster_offset(); });
+  }
+
+  void execute_get_cluster_offset() {
+    auto cct = qcow_format->m_image_ctx->cct;
+    if (requests.empty()) {
+      return;
+    }
+
+    auto request = requests.front();
+    auto l1_index = request.image_offset >>
+                    (l2_bits + qcow_format->m_cluster_bits);
+    auto l2_offset = qcow_format->m_l1_table[l1_index] &
+                     qcow_format->m_cluster_mask;
+    auto l2_index = (request.image_offset >> qcow_format->m_cluster_bits) &
+                    (l2_size - 1);
+    ldout(cct, 20) << "image_offset=" << request.image_offset << ", "
+                   << "l1_index=" << l1_index << ", "
+                   << "l2_offset=" << l2_offset << ", "
+                   << "l2_index=" << l2_index << dendl;
+
+    int r = 0;
+    if (l2_offset == 0) {
+      // L2 table has not been allocated for specified offset
+      ldout(cct, 20) << "image_offset=" << request.image_offset << ", "
+                     << "cluster_offset=DNE" << dendl;
+      *request.cluster_offset = 0;
+      r = -ENOENT;
+    } else {
+      const uint64_t* l2_table = nullptr;
+      r = l2_table_lookup(l2_offset, &l2_table);
+      if (r < 0) {
+        lderr(cct) << "failed to load L2 table: l2_offset=" << l2_offset << ": "
+                   << cpp_strerror(r) << dendl;
+      } else if (l2_table == nullptr) {
+        // table not in cache -- will restart once its loaded
+        return;
+      } else {
+        *request.cluster_offset = be64toh(l2_table[l2_index]);
+        ldout(cct, 20) << "image_offset=" << request.image_offset << ", "
+                       << "cluster_offset=" << *request.cluster_offset << dendl;
+      }
+    }
+
+    // complete the L2 cache request
+    boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
+                      [r, ctx=request.on_finish]() { ctx->complete(r); });
+    requests.pop_front();
+
+    // process next request (if any)
+    dispatch_get_cluster_offset();
+  }
+
+  int l2_table_lookup(uint64_t l2_offset, const uint64_t** cluster_offset) {
+    auto cct = qcow_format->m_image_ctx->cct;
+
+    *cluster_offset = nullptr;
+
+    // find a match in the existing cache
+    for (auto idx = 0U; idx < l2_cache_entries.size(); ++idx) {
+      auto& l2_cache = l2_cache_entries[idx];
+      if (l2_cache.l2_offset == l2_offset) {
+        if (l2_cache.in_flight) {
+          ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
+                         << "index=" << idx << " (in-flight)" << dendl;
+          return 0;
+        }
+
+        if (l2_cache.ret_val < 0) {
+          ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
+                         << "index=" << idx << " (error)" << dendl;
+          l2_cache = L2Cache{};
+          return l2_cache.ret_val;
+        }
+
+        ++l2_cache.count;
+        if (l2_cache.count == std::numeric_limits<uint32_t>::max()) {
+          for (auto& entry : l2_cache_entries) {
+            entry.count >>= 1;
+          }
+        }
+
+        *cluster_offset = l2_cache.l2_table;
+        return 0;
+      }
+    }
+
+    // find the least used entry
+    int32_t min_idx = -1;
+    uint32_t min_count = std::numeric_limits<uint32_t>::max();
+    for (uint32_t idx = 0U; idx < l2_cache_entries.size(); ++idx) {
+      auto& l2_cache = l2_cache_entries[idx];
+      if (l2_cache.in_flight) {
+        continue;
+      }
+
+      if (l2_cache.count > 0) {
+        --l2_cache.count;
+      }
+      if (l2_cache.count < min_count) {
+        min_count = l2_cache.count;
+        min_idx = idx;
+      }
+    }
+
+    if (min_idx == -1) {
+      // no space in the cache due to in-flight requests
+      ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
+                     << "index=DNE (cache busy)" << dendl;
+      return 0;
+    }
+
+    ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
+                   << "index=" << min_idx << " (loading)" << dendl;
+    auto& l2_cache = l2_cache_entries[min_idx];
+    l2_cache.l2_offset = l2_offset;
+    l2_cache.count = 1;
+    l2_cache.in_flight = true;
+
+    // read the L2 table into the L2 cache entry
+    auto ctx = new LambdaContext([this, index=min_idx, l2_offset](int r) {
+      boost::asio::post(m_strand, [this, index, l2_offset, r]() {
+        handle_l2_table_lookup(r, index, l2_offset); }); });
+    l2_cache.l2_table_bl.clear();
+    qcow_format->m_stream->read(
+      {{l2_offset, l2_size * sizeof(uint64_t)}}, &l2_cache.l2_table_bl, ctx);
+
+    return 0;
+  }
+
+  void handle_l2_table_lookup(int r, uint32_t index, uint64_t l2_offset) {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << "r=" << r << ", "
+                   << "l2_offset=" << l2_offset << ", "
+                   << "index=" << index << dendl;
+
+    auto& l2_cache = l2_cache_entries[index];
+    ceph_assert(l2_cache.in_flight);
+    l2_cache.in_flight = false;
+
+    if (r < 0) {
+      lderr(cct) << "failed to load L2 table: "
+                 << "l2_offset=" << l2_cache.l2_offset << ": "
+                 << cpp_strerror(r) << dendl;
+      l2_cache.ret_val = r;
+    } else {
+      l2_cache.l2_table = reinterpret_cast<uint64_t*>(
+        l2_cache.l2_table_bl.c_str());
+    }
+
+    // restart the state machine
+    dispatch_get_cluster_offset();
+  }
+
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat::ReadRequest: " \
+                           << this << " " << __func__ << ": "
+
+// Self-deleting request that services a single image read: it resolves the
+// cluster offset of every cluster extent via the L2 table cache and then
+// requests the data for each extent from the cluster cache.
+template <typename I>
+class QCOWFormat<I>::ReadRequest {
+public:
+  ReadRequest(QCOWFormat* qcow_format, io::AioCompletion* aio_comp,
+              io::Extents&& image_extents)
+    : qcow_format(qcow_format), aio_comp(aio_comp),
+      image_extents(std::move(image_extents)) {
+  }
+
+  // start the request; the object deletes itself (see
+  // handle_get_cluster_offsets() and read_clusters())
+  void send() {
+    get_cluster_offsets();
+  }
+
+private:
+  QCOWFormat* qcow_format;
+  io::AioCompletion* aio_comp;
+
+  io::Extents image_extents;
+  // NOTE(review): the two members below are not referenced anywhere else in
+  // this class
+  size_t image_extents_idx = 0;
+  uint32_t image_extent_offset = 0;
+
+  ClusterExtents cluster_extents;
+
+  // Build cluster_extents from the image extents and ask the L2 table cache
+  // for the cluster offset of each one; handle_get_cluster_offsets() runs
+  // once all lookups have completed.
+  void get_cluster_offsets() {
+    auto cct = qcow_format->m_image_ctx->cct;
+    populate_cluster_extents(cct, qcow_format->m_cluster_size, image_extents,
+                             &cluster_extents);
+
+    ldout(cct, 20) << dendl;
+    auto ctx = new LambdaContext([this](int r) {
+      handle_get_cluster_offsets(r); });
+    auto gather_ctx = new C_Gather(cct, ctx);
+
+    // the lambdas reference elements of the cluster_extents member, which
+    // is not modified again after being populated above
+    for (auto& cluster_extent : cluster_extents) {
+      auto sub_ctx = new LambdaContext(
+        [this, &cluster_extent, on_finish=gather_ctx->new_sub()](int r) {
+          handle_get_cluster_offset(r, cluster_extent, on_finish); });
+      qcow_format->m_l2_table_cache->get_cluster_offset(
+        cluster_extent.image_offset, &cluster_extent.cluster_offset, sub_ctx);
+    }
+
+    gather_ctx->activate();
+  }
+
+  // Completion of a single cluster offset lookup. -ENOENT (no cluster
+  // mapped at this image offset) is not treated as an error.
+  void handle_get_cluster_offset(int r, const ClusterExtent& cluster_extent,
+                                 Context* on_finish) {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << "r=" << r << ", "
+                   << "image_offset=" << cluster_extent.image_offset << ", "
+                   << "cluster_offset=" << cluster_extent.cluster_offset
+                   << dendl;
+
+    if (r == -ENOENT) {
+      ldout(cct, 20) << "image offset DNE in QCOW image" << dendl;
+      r = 0;
+    } else if (r < 0) {
+      lderr(cct) << "failed to map image offset " << cluster_extent.image_offset
+                 << ": " << cpp_strerror(r) << dendl;
+    }
+
+    on_finish->complete(r);
+  }
+
+  // Completion of all cluster offset lookups: fail the AIO completion and
+  // delete the request on error, otherwise continue with the data reads.
+  void handle_get_cluster_offsets(int r) {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << "r=" << r << dendl;
+
+    if (r < 0) {
+      lderr(cct) << "failed to retrieve cluster extents: " << cpp_strerror(r)
+                 << dendl;
+      aio_comp->fail(r);
+      delete this;
+      return;
+    }
+
+    read_clusters();
+  }
+
+  // Issue one read per cluster extent and then delete the request.
+  void read_clusters() {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << dendl;
+
+    aio_comp->set_request_count(cluster_extents.size());
+    for (auto& cluster_extent : cluster_extents) {
+      auto read_ctx = new io::ReadResult::C_ImageReadRequest(
+        aio_comp, cluster_extent.buffer_offset,
+        {{cluster_extent.image_offset, cluster_extent.cluster_length}});
+      read_ctx->ignore_enoent = true;
+
+      // everything the completion needs is captured by value because this
+      // request is deleted below, before the reads necessarily complete
+      auto log_ctx = new LambdaContext(
+        [this, cct=qcow_format->m_image_ctx->cct,
+         image_offset=cluster_extent.image_offset,
+         image_length=cluster_extent.cluster_length, ctx=read_ctx](int r) {
+          handle_read_cluster(cct, r, image_offset, image_length, ctx);
+        });
+
+      if (cluster_extent.cluster_offset == 0) {
+        // QCOW header is at offset 0, implies cluster DNE
+        log_ctx->complete(-ENOENT);
+      } else {
+        // request the (sub)cluster from the cluster cache
+        qcow_format->m_cluster_cache->get_cluster(
+          cluster_extent.cluster_offset, cluster_extent.cluster_length,
+          cluster_extent.intra_cluster_offset, &read_ctx->bl, log_ctx);
+      }
+    }
+
+    delete this;
+  }
+
+  // Log the outcome of a single cluster read and forward it to on_finish.
+  // NOTE(review): this can be invoked through the lambda above after
+  // read_clusters() has deleted the request; it only uses its arguments
+  // (cct is passed in explicitly) and must not touch any member.
+  void handle_read_cluster(CephContext* cct, int r, uint64_t image_offset,
+                           uint64_t image_length, Context* on_finish) {
+    ldout(cct, 20) << "r=" << r << ", "
+                   << "image_offset=" << image_offset << ", "
+                   << "image_length=" << image_length << dendl;
+
+    if (r != -ENOENT && r < 0) {
+      lderr(cct) << "failed to read image extent " << image_offset << "~"
+                 << image_length << ": " << cpp_strerror(r) << dendl;
+    }
+
+    on_finish->complete(r);
+  }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat::" \
+                           << "ListSnapsRequest: " << this << " " \
+                           << __func__ << ": "
+
+template <typename I>
+class QCOWFormat<I>::ListSnapsRequest {
+public:
+  ListSnapsRequest(QCOWFormat* qcow_format, io::Extents&& image_extents,
+                   io::SparseExtents* sparse_extents, Context* on_finish)
+    : qcow_format(qcow_format), image_extents(std::move(image_extents)),
+      sparse_extents(sparse_extents), on_finish(on_finish) {
+  }
+
+  void send() {
+    list_snaps();
+  }
+
+private:
+  QCOWFormat* qcow_format;
+  io::Extents image_extents;
+  io::SparseExtents* sparse_extents;
+  Context* on_finish;
+
+  ClusterExtents cluster_extents;
+
+  void list_snaps() {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << dendl;
+
+    populate_cluster_extents(cct, qcow_format->m_cluster_size, image_extents,
+                             &cluster_extents);
+
+    auto ctx = new LambdaContext([this](int r) {
+      handle_list_snaps(r); });
+    auto gather_ctx = new C_Gather(cct, ctx);
+    for (auto& cluster_extent : cluster_extents) {
+      auto ctx = new LambdaContext(
+        [this, cluster_extent=&cluster_extent,
+         ctx=gather_ctx->new_sub()](int r) {
+          boost::asio::post(
+            qcow_format->m_strand, [this, cluster_extent, ctx, r]() {
+              handle_get_cluster_offset( r, *cluster_extent, ctx);
+            });
+        });
+      qcow_format->m_l2_table_cache->get_cluster_offset(
+        cluster_extent.image_offset, &cluster_extent.cluster_offset, ctx);
+    }
+
+    gather_ctx->activate();
+  }
+
+  void handle_get_cluster_offset(
+      int r, const ClusterExtent& cluster_extent, Context* on_finish) {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << "r=" << r << ", "
+                   << "image_offset=" << cluster_extent.image_offset << ", "
+                   << "image_length=" << cluster_extent.cluster_length << ", "
+                   << "cluster_offset=" << cluster_extent.cluster_offset
+                   << dendl;
+
+    if (r == -ENOENT) {
+      r = 0;
+    } else if (r >= 0 && cluster_extent.cluster_offset != 0) {
+      sparse_extents->insert(
+        cluster_extent.image_offset, cluster_extent.cluster_length,
+        {io::SPARSE_EXTENT_STATE_DATA, cluster_extent.cluster_length});
+    }
+
+    on_finish->complete(r);
+  }
+
+  void handle_list_snaps(int r) {
+    auto cct = qcow_format->m_image_ctx->cct;
+    ldout(cct, 20) << "r=" << r << dendl;
+
+    on_finish->complete(r);
+    delete this;
+  }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::QCOWFormat: " << this \
+                           << " " << __func__ << ": "
+
+// Stores the construction parameters and creates the strand on the image's
+// asio engine; no I/O is performed until open() is invoked.
+template <typename I>
+QCOWFormat<I>::QCOWFormat(
+    I* image_ctx,
+    const json_spirit::mObject& json_object,
+    const SourceSpecBuilder<I>* source_spec_builder)
+  : m_image_ctx(image_ctx),
+    m_json_object(json_object),
+    m_source_spec_builder(source_spec_builder),
+    m_strand(*image_ctx->asio_engine) {
+}
+
+template <typename I>
+void QCOWFormat<I>::open(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << dendl;
+
+  int r = m_source_spec_builder->build_stream(m_json_object, &m_stream);
+  if (r < 0) {
+    lderr(cct) << "failed to build migration stream handler" << cpp_strerror(r)
+               << dendl;
+    on_finish->complete(r);
+    return;
+  }
+
+  auto ctx = new LambdaContext([this, on_finish](int r) {
+    handle_open(r, on_finish); });
+  m_stream->open(ctx);
+}
+
+// Completion of the stream open: probe the image header on success,
+// otherwise propagate the error to on_finish.
+template <typename I>
+void QCOWFormat<I>::handle_open(int r, Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    probe(on_finish);
+    return;
+  }
+
+  lderr(cct) << "failed to open QCOW image: " << cpp_strerror(r)
+             << dendl;
+  on_finish->complete(r);
+}
+
+// Read the first 8 bytes of the stream so that handle_probe() can inspect
+// the header magic and version.
+template <typename I>
+void QCOWFormat<I>::probe(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << dendl;
+
+  m_bl.clear();
+  m_stream->read(
+    {{0, 8}}, &m_bl,
+    new LambdaContext([this, on_finish](int r) {
+      handle_probe(r, on_finish); }));
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_probe(int r, Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to probe QCOW image: " << cpp_strerror(r)
+               << dendl;
+    on_finish->complete(r);
+    return;
+  }
+
+  auto header_probe = *reinterpret_cast<QCowHeaderProbe*>(
+    m_bl.c_str());
+  header_probe.magic = be32toh(header_probe.magic);
+  header_probe.version = be32toh(header_probe.version);
+
+  if (header_probe.magic != QCOW_MAGIC) {
+    lderr(cct) << "invalid QCOW header magic" << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  m_bl.clear();
+  if (header_probe.version == 1) {
+    read_v1_header(on_finish);
+    return;
+  } else if (header_probe.version == 2) {
+    read_v2_header(on_finish);
+    return;
+  } else {
+    lderr(cct) << "invalid QCOW header version " << header_probe.version
+               << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+}
+
+// Read sizeof(QCowHeaderV1) bytes from the start of the stream into m_bl;
+// handle_read_v1_header() parses the result.
+template <typename I>
+void QCOWFormat<I>::read_v1_header(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << dendl;
+
+  m_bl.clear();
+  m_stream->read(
+    {{0, sizeof(QCowHeaderV1)}}, &m_bl,
+    new LambdaContext([this, on_finish](int r) {
+      handle_read_v1_header(r, on_finish); }));
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_read_v1_header(int r, Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to read QCOW header: " << cpp_strerror(r) << dendl;
+    on_finish->complete(r);
+    return;
+  }
+
+  auto header = *reinterpret_cast<QCowHeaderV1*>(m_bl.c_str());
+
+  // byte-swap important fields
+  header.magic = be32toh(header.magic);
+  header.version = be32toh(header.version);
+  header.backing_file_offset = be64toh(header.backing_file_offset);
+  header.backing_file_size = be32toh(header.backing_file_size);
+  header.size = be64toh(header.size);
+  header.crypt_method = be32toh(header.crypt_method);
+  header.l1_table_offset = be64toh(header.l1_table_offset);
+
+  if (header.magic != QCOW_MAGIC || header.version != 1) {
+    // honestly shouldn't happen since we've already validated it
+    lderr(cct) << "header is not QCOW" << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  if (header.cluster_bits < QCOW_MIN_CLUSTER_BITS ||
+      header.cluster_bits > QCOW_MAX_CLUSTER_BITS) {
+    lderr(cct) << "invalid cluster bits: " << header.cluster_bits << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  if (header.l2_bits < (QCOW_MIN_CLUSTER_BITS - 3) ||
+      header.l2_bits > (QCOW_MAX_CLUSTER_BITS - 3)) {
+    lderr(cct) << "invalid L2 bits: " << header.l2_bits << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  if (header.crypt_method != QCOW_CRYPT_NONE) {
+    lderr(cct) << "invalid or unsupported encryption method" << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  m_size = header.size;
+  if (p2roundup(m_size, static_cast<uint64_t>(512)) != m_size) {
+    lderr(cct) << "image size is not a multiple of block size" << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  m_backing_file_offset = header.backing_file_offset;
+  m_backing_file_size = header.backing_file_size;
+
+  m_cluster_bits = header.cluster_bits;
+  m_cluster_size = 1UL << header.cluster_bits;
+  m_cluster_offset_mask = (1ULL << (63 - header.cluster_bits)) - 1;
+  m_cluster_mask = ~QCOW_OFLAG_COMPRESSED;
+
+  uint32_t l2_bits = header.l2_bits;
+  uint32_t shift = m_cluster_bits + l2_bits;
+  m_l1_size = (m_size + (1LL << shift) - 1) >> shift;
+  m_l1_table_offset = header.l1_table_offset;
+  if (m_size > (std::numeric_limits<uint64_t>::max() - (1ULL << shift)) ||
+      m_l1_size > (std::numeric_limits<int32_t>::max() / sizeof(uint64_t))) {
+    lderr(cct) << "image size too big: " << m_size << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  ldout(cct, 15) << "size=" << m_size << ", "
+                 << "cluster_bits=" << m_cluster_bits << ", "
+                 << "l2_bits=" << l2_bits << dendl;
+
+  // allocate memory for L1 table and L2 + cluster caches
+  m_l2_table_cache = std::make_unique<L2TableCache>(this, l2_bits);
+  m_cluster_cache = std::make_unique<ClusterCache>(this);
+
+  read_l1_table(on_finish);
+}
+
+// Read sizeof(QCowHeader) bytes from the start of the stream into m_bl;
+// handle_read_v2_header() receives the result.
+template <typename I>
+void QCOWFormat<I>::read_v2_header(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << dendl;
+
+  m_bl.clear();
+  m_stream->read(
+    {{0, sizeof(QCowHeader)}}, &m_bl,
+    new LambdaContext([this, on_finish](int r) {
+      handle_read_v2_header(r, on_finish); }));
+}
+
+// QCOW2 headers are not handled yet: the read result is only logged and
+// on_finish always completes with -ENOTSUP.
+template <typename I>
+void QCOWFormat<I>::handle_read_v2_header(int r, Context* on_finish) {
+  ldout(m_image_ctx->cct, 10) << "r=" << r << dendl;
+
+  // TODO add support for QCOW2
+  on_finish->complete(-ENOTSUP);
+}
+
+// Read the whole L1 table (m_l1_size 64-bit entries starting at
+// m_l1_table_offset) into m_l1_table_bl.
+template <typename I>
+void QCOWFormat<I>::read_l1_table(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << dendl;
+
+  const auto l1_table_length = m_l1_size * sizeof(uint64_t);
+  auto ctx = new LambdaContext([this, on_finish](int r) {
+    handle_read_l1_table(r, on_finish); });
+  m_stream->read({{m_l1_table_offset, l1_table_length}}, &m_l1_table_bl, ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_read_l1_table(int r, Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to read L1 table: " << cpp_strerror(r) << dendl;
+    on_finish->complete(r);
+    return;
+  }
+
+  // translate the L1 table (big-endian -> CPU endianess)
+  m_l1_table = reinterpret_cast<uint64_t*>(m_l1_table_bl.c_str());
+  for (auto idx = 0UL; idx < m_l1_size; ++idx) {
+    m_l1_table[idx] = be64toh(m_l1_table[idx]);
+  }
+
+  read_backing_file(on_finish);
+}
+
+// Final step of open: succeeds when the header references no backing file,
+// otherwise fails with -ENOTSUP since backing files are not handled yet.
+template <typename I>
+void QCOWFormat<I>::read_backing_file(Context* on_finish) {
+  const bool has_backing_file =
+    (m_backing_file_offset != 0 && m_backing_file_size != 0);
+  if (!has_backing_file) {
+    // all data is within the specified file
+    on_finish->complete(0);
+    return;
+  }
+
+  ldout(m_image_ctx->cct, 10) << dendl;
+
+  // TODO add support for backing files
+  on_finish->complete(-ENOTSUP);
+}
+
+// Close the underlying stream; on_finish is handed to the stream's close().
+template <typename I>
+void QCOWFormat<I>::close(Context* on_finish) {
+  ldout(m_image_ctx->cct, 10) << dendl;
+
+  m_stream->close(on_finish);
+}
+
+// Report the image's snapshots: always an empty set, completed with 0.
+template <typename I>
+void QCOWFormat<I>::get_snapshots(SnapInfos* snap_infos, Context* on_finish) {
+  ldout(m_image_ctx->cct, 10) << dendl;
+
+  // TODO QCOW2 supports snapshots, not QCOW
+  snap_infos->clear();
+  on_finish->complete(0);
+}
+
+// Return the image size parsed from the header; snap_id is not consulted.
+template <typename I>
+void QCOWFormat<I>::get_image_size(uint64_t snap_id, uint64_t* size,
+                                  Context* on_finish) {
+  ldout(m_image_ctx->cct, 10) << dendl;
+
+  *size = m_size;
+  on_finish->complete(0);
+}
+
+template <typename I>
+bool QCOWFormat<I>::read(
+    io::AioCompletion* aio_comp, uint64_t snap_id, io::Extents&& image_extents,
+    io::ReadResult&& read_result, int op_flags, int read_flags,
+    const ZTracer::Trace &parent_trace) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "snap_id=" << snap_id << ", "
+                 << "image_extents=" << image_extents << dendl;
+
+  if (snap_id != CEPH_NOSNAP) {
+    // TODO add QCOW2 snapshot support
+    lderr(cct) << "snapshots are not supported" << dendl;
+    aio_comp->fail(-EINVAL);
+    return true;
+  }
+
+  aio_comp->read_result = std::move(read_result);
+  aio_comp->read_result.set_image_extents(image_extents);
+
+  auto read_request = new ReadRequest(this, aio_comp, std::move(image_extents));
+  read_request->send();
+
+  return true;
+}
+
+template <typename I>
+void QCOWFormat<I>::list_snaps(io::Extents&& image_extents,
+                              io::SnapIds&& snap_ids, int list_snaps_flags,
+                              io::SnapshotDelta* snapshot_delta,
+                              const ZTracer::Trace &parent_trace,
+                              Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+  // TODO add QCOW2 snapshot support
+
+  // QCOW does support snapshots so just use cluster existence for delta
+  auto snapshot = &(*snapshot_delta)[{CEPH_NOSNAP, CEPH_NOSNAP}];
+  auto list_snaps_request = new ListSnapsRequest(
+    this, io::Extents{image_extents}, snapshot, on_finish);
+  list_snaps_request->send();
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::QCOWFormat<librbd::ImageCtx>;
diff --git a/src/librbd/migration/QCOWFormat.h b/src/librbd/migration/QCOWFormat.h
new file mode 100644 (file)
index 0000000..ebcb062
--- /dev/null
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H
+#define CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H
+
+#include "include/int_types.h"
+#include "librbd/Types.h"
+#include "librbd/migration/FormatInterface.h"
+#include "librbd/migration/QCOW.h"
+#include "json_spirit/json_spirit.h"
+#include <boost/asio/io_context_strand.hpp>
+#include <boost/iostreams/filter/zlib.hpp>
+#include <deque>
+#include <vector>
+#include <memory>
+
+struct Context;
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+
+namespace migration {
+
+template <typename> struct SourceSpecBuilder;
+struct StreamInterface;
+
+template <typename ImageCtxT>
+class QCOWFormat : public FormatInterface {
+public:
+  static QCOWFormat* create(
+      ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+      const SourceSpecBuilder<ImageCtxT>* source_spec_builder) {
+    return new QCOWFormat(image_ctx, json_object, source_spec_builder);
+  }
+
+  QCOWFormat(ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+             const SourceSpecBuilder<ImageCtxT>* source_spec_builder);
+  QCOWFormat(const QCOWFormat&) = delete;
+  QCOWFormat& operator=(const QCOWFormat&) = delete;
+
+  void open(Context* on_finish) override;
+  void close(Context* on_finish) override;
+
+  void get_snapshots(SnapInfos* snap_infos, Context* on_finish) override;
+  void get_image_size(uint64_t snap_id, uint64_t* size,
+                      Context* on_finish) override;
+
+  bool read(io::AioCompletion* aio_comp, uint64_t snap_id,
+            io::Extents&& image_extents, io::ReadResult&& read_result,
+            int op_flags, int read_flags,
+            const ZTracer::Trace &parent_trace) override;
+
+  void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids,
+                  int list_snaps_flags, io::SnapshotDelta* snapshot_delta,
+                  const ZTracer::Trace &parent_trace,
+                  Context* on_finish) override;
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   *  OPEN
+   *    |
+   *    v
+   *  PROBE
+   *    |
+   *    |\---> READ V1 HEADER ----\
+   *    |                         |
+   *    \---> READ V2 HEADER ----\|
+   *                              |
+   *                              v
+   *                        READ L1 TABLE
+   *                              |
+   *                              v
+   *                        READ BACKING FILE
+   *                              |
+   *    /-------------------------/
+   *    |
+   *    v
+   * <opened>
+   *
+   * @endverbatim
+   */
+
+  struct Cluster;
+  struct ClusterCache;
+  struct L2TableCache;
+  struct ReadRequest;
+  struct ListSnapsRequest;
+
+  ImageCtxT* m_image_ctx;
+  json_spirit::mObject m_json_object;
+  const SourceSpecBuilder<ImageCtxT>* m_source_spec_builder;
+
+  boost::asio::io_context::strand m_strand;
+  std::shared_ptr<StreamInterface> m_stream;
+
+  bufferlist m_bl;
+
+  uint64_t m_size = 0;
+
+  uint64_t m_backing_file_offset = 0;
+  uint32_t m_backing_file_size = 0;
+
+  uint32_t m_cluster_bits = 0;
+  uint32_t m_cluster_size = 0;
+  uint64_t m_cluster_offset_mask = 0;
+  uint64_t m_cluster_mask = 0;
+
+  uint32_t m_l1_size = 0;
+  uint64_t m_l1_table_offset = 0;
+  uint64_t* m_l1_table = nullptr;
+  bufferlist m_l1_table_bl;
+
+  std::unique_ptr<L2TableCache> m_l2_table_cache;
+  std::unique_ptr<ClusterCache> m_cluster_cache;
+
+  void handle_open(int r, Context* on_finish);
+
+  void probe(Context* on_finish);
+  void handle_probe(int r, Context* on_finish);
+
+  void read_v1_header(Context* on_finish);
+  void handle_read_v1_header(int r, Context* on_finish);
+
+  void read_v2_header(Context* on_finish);
+  void handle_read_v2_header(int r, Context* on_finish);
+
+  void read_l1_table(Context* on_finish);
+  void handle_read_l1_table(int r, Context* on_finish);
+
+  void read_backing_file(Context* on_finish);
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::QCOWFormat<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H
index 526522fba60b8cbf39481ec64b558ea71164687f..214d7ce0e5d4408e596babb8f0fc44656c8717ab 100644 (file)
@@ -8,6 +8,7 @@
 #include "librbd/migration/HttpStream.h"
 #include "librbd/migration/S3Stream.h"
 #include "librbd/migration/NativeFormat.h"
+#include "librbd/migration/QCOWFormat.h"
 #include "librbd/migration/RawFormat.h"
 #include "librbd/migration/RawSnapshot.h"
 
@@ -64,6 +65,8 @@ int SourceSpecBuilder<I>::build_format(
   if (type == "native") {
     format->reset(NativeFormat<I>::create(m_image_ctx, source_spec_object,
                                           import_only));
+  } else if (type == "qcow") {
+    format->reset(QCOWFormat<I>::create(m_image_ctx, source_spec_object, this));
   } else if (type == "raw") {
     format->reset(RawFormat<I>::create(m_image_ctx, source_spec_object, this));
   } else {