git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
os/bluestore: introduce offline DB/WAL volume migration for
authorIgor Fedotov <ifedotov@suse.com>
Tue, 17 Jul 2018 13:06:36 +0000 (16:06 +0300)
committerIgor Fedotov <ifedotov@suse.com>
Wed, 17 Oct 2018 19:39:25 +0000 (22:39 +0300)
ceph-bluestore-tool.

Signed-off-by: Igor Fedotov <ifedotov@suse.com>
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
src/os/bluestore/bluestore_tool.cc

index 058541a30edb0a266ac4973468238ef41de2b4be..5d28a22ddc438b14677020d01cd3f5c08f00d7ff 100644 (file)
@@ -377,7 +377,7 @@ int BlueFS::mkfs(uuid_d osd_uuid)
 
   // write supers
   super.log_fnode = log_file->fnode;
-  _write_super();
+  _write_super(BDEV_DB);
   flush_bdev();
 
   // clean up
@@ -491,6 +491,31 @@ void BlueFS::umount()
   _shutdown_logger();
 }
 
+int BlueFS::prepare_new_device(int id)
+{
+  dout(1) << __func__ << dendl;
+
+  if(id == BDEV_NEWDB) {
+    int new_log_dev_cur = BDEV_WAL;
+    int new_log_dev_next = BDEV_WAL;
+    if (!bdev[BDEV_WAL]) {
+      new_log_dev_cur = BDEV_NEWDB;
+      new_log_dev_next = BDEV_DB;
+    }
+    _rewrite_log_sync(false,
+      BDEV_NEWDB,
+      new_log_dev_cur,
+      new_log_dev_next,
+      RENAME_DB2SLOW);
+    //}
+  } else if(id == BDEV_NEWWAL) {
+    _rewrite_log_sync(false, BDEV_DB, BDEV_NEWWAL, BDEV_WAL, REMOVE_WAL);
+  } else {
+    assert(false);
+  }
+  return 0;
+}
+
 void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
 {
   if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
@@ -516,7 +541,7 @@ int BlueFS::fsck()
   return 0;
 }
 
-int BlueFS::_write_super()
+int BlueFS::_write_super(int dev)
 {
   // build superblock
   bufferlist bl;
@@ -529,7 +554,7 @@ int BlueFS::_write_super()
   ceph_assert(bl.length() <= get_super_length());
   bl.append_zero(get_super_length() - bl.length());
 
-  bdev[BDEV_DB]->write(get_super_offset(), bl, false);
+  bdev[dev]->write(get_super_offset(), bl, false);
   dout(20) << __func__ << " v " << super.version
            << " crc 0x" << std::hex << crc
            << " offset 0x" << get_super_offset() << std::dec
@@ -972,6 +997,261 @@ int BlueFS::log_dump()
   return 0;
 }
 
+int BlueFS::device_migrate_to_existing(
+  CephContext *cct,
+  const set<int>& devs_source,
+  int dev_target)
+{
+  vector<byte> buf;
+  bool buffered = cct->_conf->bluefs_buffered_io;
+
+  assert(dev_target < (int)MAX_BDEV);
+
+  int flags = 0;
+  flags |= devs_source.count(BDEV_DB) ?
+    (REMOVE_DB | RENAME_SLOW2DB) : 0;
+  flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+  int dev_target_new = dev_target;
+
+  // Slow device without separate DB one is addressed via BDEV_DB
+  // Hence need renaming.
+  if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
+    dev_target_new = BDEV_DB;
+    dout(0) << __func__ << " super to be written to " << dev_target << dendl;
+  }
+
+  for (auto& p : file_map) {
+    //do not copy log
+    if (p.second->fnode.ino == 1) {
+      continue;
+    }
+    auto& fnode_extents = p.second->fnode.extents;
+
+    for (auto ext_it = fnode_extents.begin();
+      ext_it != p.second->fnode.extents.end();
+      ++ext_it) {
+      if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
+       bluefs_extent_t old_ext = *ext_it;
+       PExtentVector extents;
+       auto l =
+         _allocate_without_fallback(dev_target, old_ext.length, &extents);
+       if (l == 0) {
+         buf.resize(old_ext.length);
+         int r = bdev[old_ext.bdev]->read_random(
+           old_ext.offset,
+           old_ext.length,
+           (char*)&buf.at(0),
+           buffered);
+         if (r != 0) {
+           derr << __func__ << " failed to read 0x" << std::hex
+             << old_ext.offset << "~" <<old_ext.length << std::dec
+             << " from " << (int)dev_target << dendl;
+           return -EIO;
+         }
+
+         assert(extents.size() > 0);
+         uint64_t src_buf_pos = 0;
+         {
+           // overwrite existing extent
+           *ext_it=
+             bluefs_extent_t(dev_target_new, extents[0].offset, extents[0].length);
+           bufferlist bl;
+           bl.append((char*)&buf.at(src_buf_pos), extents[0].length);
+           int r = bdev[dev_target]->write(extents[0].offset, bl, buffered);
+           ceph_assert(r == 0);
+           src_buf_pos += extents[0].length;
+         }
+         // then insert more extents if needed
+         for( size_t i = 1; i < extents.size(); ++i) {
+           bufferlist bl;
+           bl.append((char*)&buf.at(src_buf_pos), extents[i].length);
+           ++ext_it;
+           ext_it = fnode_extents.emplace(ext_it, dev_target_new,
+             extents[i].offset, extents[i].length);
+           int r = bdev[dev_target]->write(extents[i].offset, bl, buffered);
+           ceph_assert(r == 0);
+           src_buf_pos += extents[i].length;
+         }
+         {
+           PExtentVector to_release;
+           to_release.emplace_back(old_ext.offset, old_ext.length);
+           alloc[old_ext.bdev]->release(to_release);
+         }
+
+       } else {
+         derr << __func__ << " unable to allocate len 0x" << std::hex
+           << old_ext.length << std::dec << " from " << (int)dev_target
+           << dendl;
+         return -ENOSPC;
+       }
+      } else if (dev_target != dev_target_new && ext_it->bdev == dev_target) {
+       ext_it->bdev = dev_target_new;
+      }
+    }
+    auto& prefer_bdev = p.second->fnode.prefer_bdev;
+    if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) {
+      prefer_bdev = dev_target_new;
+    }
+  }
+  // new logging device in the current naming scheme
+  int new_log_dev_cur = bdev[BDEV_WAL] ?
+    BDEV_WAL :
+    bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
+
+  // new logging device in new naming scheme
+  int new_log_dev_next = new_log_dev_cur;
+
+  if (devs_source.count(new_log_dev_cur)) {
+    // SLOW device is addressed via BDEV_DB too hence either WAL or DB
+    new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
+      BDEV_DB :
+      BDEV_WAL;
+
+    dout(0) << __func__ << " log moved from " << new_log_dev_cur
+      << " to " << new_log_dev_next << dendl;
+
+    new_log_dev_cur =
+      (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
+        BDEV_SLOW :
+        new_log_dev_next;
+  }
+
+  _rewrite_log_sync(
+    false,
+    (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
+    new_log_dev_cur,
+    new_log_dev_next,
+    flags);
+  return 0;
+}
+
+int BlueFS::device_migrate_to_new(
+  CephContext *cct,
+  const set<int>& devs_source,
+  int dev_target)
+{
+  vector<byte> buf;
+  bool buffered = cct->_conf->bluefs_buffered_io;
+
+  assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
+
+  int flags = 0;
+
+  flags |= devs_source.count(BDEV_DB) ?
+    (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
+    0;
+  flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+  int dev_target_new = dev_target;
+
+  for (auto& p : file_map) {
+    //do not copy log
+    if (p.second->fnode.ino == 1) {
+      continue;
+    }
+    auto& fnode_extents = p.second->fnode.extents;
+
+    for (auto ext_it = fnode_extents.begin();
+      ext_it != p.second->fnode.extents.end();
+      ++ext_it) {
+      if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
+       bluefs_extent_t old_ext = *ext_it;
+       PExtentVector extents;
+       auto l =
+         _allocate_without_fallback(dev_target, old_ext.length, &extents);
+       if (l == 0) {
+         buf.resize(old_ext.length);
+         int r = bdev[old_ext.bdev]->read_random(
+           old_ext.offset,
+           old_ext.length,
+           (char*)&buf.at(0),
+           buffered);
+         dout(10)<<__func__<<" read = "<<r<<dendl;
+         if (r != 0) {
+           derr << __func__ << " failed to read 0x" << std::hex
+             << old_ext.offset << "~" <<old_ext.length << std::dec
+             << " from " << (int)dev_target << dendl;
+           return -EIO;
+         }
+
+         assert(extents.size() > 0);
+         uint64_t src_buf_pos = 0;
+         {
+           // overwrite existing extent
+           *ext_it=
+             bluefs_extent_t(dev_target_new, extents[0].offset, extents[0].length);
+           bufferlist bl;
+           bl.append((char*)&buf.at(src_buf_pos), extents[0].length);
+           int r = bdev[dev_target]->write(extents[0].offset, bl, buffered);
+           ceph_assert(r == 0);
+           src_buf_pos += extents[0].length;
+         }
+         // then insert more extents if needed
+         for( size_t i = 1; i < extents.size(); ++i) {
+           bufferlist bl;
+           bl.append((char*)&buf.at(src_buf_pos), extents[i].length);
+           ++ext_it;
+           ext_it = fnode_extents.emplace(ext_it, dev_target_new,
+             extents[i].offset, extents[i].length);
+           int r = bdev[dev_target]->write(extents[i].offset, bl, buffered);
+           ceph_assert(r == 0);
+           src_buf_pos += extents[i].length;
+         }
+         {
+           PExtentVector to_release;
+           to_release.emplace_back(old_ext.offset, old_ext.length);
+           alloc[old_ext.bdev]->release(to_release);
+         }
+       } else {
+         derr << __func__ << " unable to allocate len 0x" << std::hex
+           << old_ext.length << std::dec << " from " << (int)dev_target
+           << dendl;
+         return -ENOSPC;
+       }
+      } else if (dev_target != dev_target_new && ext_it->bdev == dev_target) {
+       ext_it->bdev = dev_target_new;
+      }
+    }
+    auto& prefer_bdev = p.second->fnode.prefer_bdev;
+    if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) {
+      prefer_bdev = dev_target_new;
+    }
+  }
+  // new logging device in the current naming scheme
+  int new_log_dev_cur =
+    bdev[BDEV_NEWWAL] ?
+      BDEV_NEWWAL :
+      bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
+        BDEV_WAL :
+       bdev[BDEV_NEWDB] ?
+         BDEV_NEWDB :
+         bdev[BDEV_DB] && !(flags & REMOVE_DB)?
+           BDEV_DB :
+           BDEV_SLOW;
+
+  // new logging device in new naming scheme
+  int new_log_dev_next =
+    new_log_dev_cur == BDEV_NEWWAL ?
+      BDEV_WAL :
+      new_log_dev_cur == BDEV_NEWDB ?
+       BDEV_DB :
+        new_log_dev_cur;
+
+  int super_dev =
+    dev_target == BDEV_NEWDB ?
+      BDEV_NEWDB :
+      bdev[BDEV_DB] ?
+        BDEV_DB :
+       BDEV_SLOW;
+
+  _rewrite_log_sync(
+    false,
+    super_dev,
+    new_log_dev_cur,
+    new_log_dev_next,
+    flags);
+  return 0;
+}
+
 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
 {
   auto p = file_map.find(ino);
@@ -1204,7 +1484,8 @@ bool BlueFS::_should_compact_log()
   return true;
 }
 
-void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
+void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
+                                       int flags)
 {
   t->seq = 1;
   t->uuid = super.uuid;
@@ -1214,17 +1495,63 @@ void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
   for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
     interval_set<uint64_t>& p = block_all[bdev];
     for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
-      dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
+      auto bdev_new = bdev;
+      if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
+       continue;
+      }
+      if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
+       continue;
+      }
+      if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
+       bdev_new = BDEV_DB;
+      }
+      if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
+       bdev_new = BDEV_SLOW;
+      }
+      if (bdev == BDEV_NEWDB) {
+       // REMOVE_DB xor RENAME_DB
+       ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
+       ceph_assert(!(flags & RENAME_SLOW2DB));
+       bdev_new = BDEV_DB;
+      }
+      if (bdev == BDEV_NEWWAL) {
+       ceph_assert(flags & REMOVE_WAL);
+       bdev_new = BDEV_WAL;
+      }
+      dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
                << std::hex << q.get_start() << "~" << q.get_len() << std::dec
                << dendl;
-      t->op_alloc_add(bdev, q.get_start(), q.get_len());
+      t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
     }
   }
   for (auto& p : file_map) {
     if (p.first == 1)
       continue;
-    dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
     ceph_assert(p.first > 1);
+
+    for(auto& e : p.second->fnode.extents) {
+      auto bdev = e.bdev;
+      auto bdev_new = bdev;
+      ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
+      if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
+       bdev_new = BDEV_DB;
+      }
+      if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
+       bdev_new = BDEV_SLOW;
+      }
+      if (bdev == BDEV_NEWDB) {
+       // REMOVE_DB xor RENAME_DB
+       ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
+       ceph_assert(!(flags & RENAME_SLOW2DB));
+       bdev_new = BDEV_DB;
+      }
+      if (bdev == BDEV_NEWWAL) {
+       ceph_assert(flags & REMOVE_WAL);
+       bdev_new = BDEV_WAL;
+      }
+      e.bdev = bdev_new;
+    }
+    dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
     t->op_file_update(p.second->fnode);
   }
   for (auto& p : dir_map) {
@@ -1241,13 +1568,32 @@ void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
+// Synchronous log compaction. Delegates to _rewrite_log_sync() with fallback
+// allocation enabled, the superblock written to BDEV_DB, the log kept on the
+// log file's current prefer_bdev (same id before and after) and no
+// remove/rename flags, then bumps the l_bluefs_log_compactions counter.
 void BlueFS::_compact_log_sync()
 {
   dout(10) << __func__ << dendl;
+  _rewrite_log_sync(true,
+    BDEV_DB,
+    log_writer->file->fnode.prefer_bdev,
+    log_writer->file->fnode.prefer_bdev,
+    0);
+  logger->inc(l_bluefs_log_compactions);
+}
+
+void BlueFS::_rewrite_log_sync(bool allocate_with_fallback,
+                              int super_dev,
+                              int log_dev,
+                              int log_dev_new,
+                              int flags)
+{
   File *log_file = log_writer->file.get();
 
   // clear out log (be careful who calls us!!!)
   log_t.clear();
 
+  dout(20) << __func__ << " super_dev:" << super_dev
+                       << " log_dev:" << log_dev
+                       << " log_dev_new:" << log_dev_new
+                      << " flags:" << flags
+                      << dendl;
   bluefs_transaction_t t;
-  _compact_log_dump_metadata(&t);
+  _compact_log_dump_metadata(&t, flags);
 
   dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
   t.op_jump_seq(log_seq);
@@ -1261,9 +1607,22 @@ void BlueFS::_compact_log_sync()
 
   mempool::bluefs::vector<bluefs_extent_t> old_extents;
   uint64_t old_allocated = 0;
+  int r;
   log_file->fnode.swap_extents(old_extents, old_allocated);
-  int r = _allocate(log_file->fnode.prefer_bdev, need, &log_file->fnode);
-  ceph_assert(r == 0);
+  if (allocate_with_fallback) {
+    r = _allocate(log_dev, need, &log_file->fnode);
+    ceph_assert(r == 0);
+  } else {
+    PExtentVector extents;
+    r = _allocate_without_fallback(log_dev,
+                              need,
+                              &extents);
+    ceph_assert(r == 0);
+    for (auto& p : extents) {
+      log_file->fnode.append_extent(
+       bluefs_extent_t(log_dev, p.offset, p.length));
+    }
+  }
 
   _close_writer(log_writer);
 
@@ -1282,18 +1641,24 @@ void BlueFS::_compact_log_sync()
 #endif
   flush_bdev();
 
-  dout(10) << __func__ << " writing super" << dendl;
   super.log_fnode = log_file->fnode;
+  // rename device if needed
+  if (log_dev != log_dev_new) {
+    dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
+    for (auto& p : super.log_fnode.extents) {
+      p.bdev = log_dev_new;
+    }
+  }
+  dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
+
   ++super.version;
-  _write_super();
+  _write_super(super_dev);
   flush_bdev();
 
   dout(10) << __func__ << " release old log extents " << old_extents << dendl;
   for (auto& r : old_extents) {
     pending_release[r.bdev].insert(r.offset, r.length);
   }
-
-  logger->inc(l_bluefs_log_compactions);
 }
 
 /*
@@ -1360,7 +1725,7 @@ void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
   bluefs_transaction_t t;
   //avoid record two times in log_t and _compact_log_dump_metadata.
   log_t.clear();
-  _compact_log_dump_metadata(&t);
+  _compact_log_dump_metadata(&t, 0);
 
   // conservative estimate for final encoded size
   new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
@@ -1432,7 +1797,7 @@ void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
   dout(10) << __func__ << " writing super" << dendl;
   super.log_fnode = log_file->fnode;
   ++super.version;
-  _write_super();
+  _write_super(BDEV_DB);
 
   lock.unlock();
   flush_bdev();
@@ -1815,8 +2180,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
   logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
   for (unsigned i = 0; i < MAX_BDEV; ++i) {
     if (bdev[i]) {
-      ceph_assert(h->iocv[i]);
-      if (h->iocv[i]->has_pending_aios()) {
+      if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
         bdev[i]->aio_submit(h->iocv[i]);
       }
     }
@@ -1982,6 +2346,40 @@ void BlueFS::flush_bdev()
   }
 }
 
+int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
+                     PExtentVector* extents)
+{
+  dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
+           << " from " << (int)id << dendl;
+  assert(id < alloc.size());
+  uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;
+
+  uint64_t left = round_up_to(len, min_alloc_size);
+
+  if (!alloc[id]) {
+    return -ENOENT;
+  }
+  extents->reserve(4);  // 4 should be (more than) enough for most allocations
+  int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
+  if (alloc_len < (int64_t)left) {
+    if (alloc_len != 0) {
+      alloc[id]->release(*extents);
+    }
+    if (bdev[id])
+      derr << __func__ << " failed to allocate 0x" << std::hex << left
+          << " on bdev " << (int)id
+          << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
+    else
+      derr << __func__ << " failed to allocate 0x" << std::hex << left
+          << " on bdev " << (int)id << ", dne" << std::dec << dendl;
+    if (alloc[id])
+      alloc[id]->dump();
+    return -ENOSPC;
+  }
+
+  return 0;
+}
+
 int BlueFS::_allocate(uint8_t id, uint64_t len,
                      bluefs_fnode_t* node)
 {
@@ -2187,9 +2585,10 @@ void BlueFS::_close_writer(FileWriter *h)
   dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
   for (unsigned i=0; i<MAX_BDEV; ++i) {
     if (bdev[i]) {
-      ceph_assert(h->iocv[i]);
-      h->iocv[i]->aio_wait();
-      bdev[i]->queue_reap_ioc(h->iocv[i]);
+      if (h->iocv[i]) {
+       h->iocv[i]->aio_wait();
+       bdev[i]->queue_reap_ioc(h->iocv[i]);
+      }
     }
   }
   delete h;
index 333c16b54bbd3a0c7d6c8967eadf5919e0dfe9bc..9043634c8c9985dc36c13ee0f04ab1012f20c0de 100644 (file)
@@ -42,10 +42,12 @@ enum {
 class BlueFS {
 public:
   CephContext* cct;
-  static constexpr unsigned MAX_BDEV = 3;
+  static constexpr unsigned MAX_BDEV = 5;
   static constexpr unsigned BDEV_WAL = 0;
   static constexpr unsigned BDEV_DB = 1;
   static constexpr unsigned BDEV_SLOW = 2;
+  static constexpr unsigned BDEV_NEWWAL = 3;
+  static constexpr unsigned BDEV_NEWDB = 4;
 
   enum {
     WRITER_UNKNOWN,
@@ -273,6 +275,9 @@ private:
 
   int _allocate(uint8_t bdev, uint64_t len,
                bluefs_fnode_t* node);
+  int _allocate_without_fallback(uint8_t id, uint64_t len,
+                                PExtentVector* extents);
+
   int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
   int _flush(FileWriter *h, bool force);
   int _fsync(FileWriter *h, std::unique_lock<std::mutex>& l);
@@ -287,10 +292,24 @@ private:
                          uint64_t jump_to = 0);
   uint64_t _estimate_log_size();
   bool _should_compact_log();
-  void _compact_log_dump_metadata(bluefs_transaction_t *t);
+
+  enum {
+    REMOVE_DB = 1,
+    REMOVE_WAL = 2,
+    RENAME_SLOW2DB = 4,
+    RENAME_DB2SLOW = 8,
+  };
+  void _compact_log_dump_metadata(bluefs_transaction_t *t,
+                                 int flags);
   void _compact_log_sync();
   void _compact_log_async(std::unique_lock<std::mutex>& l);
 
+  void _rewrite_log_sync(bool allocate_with_fallback,
+                        int super_dev,
+                        int log_dev,
+                        int new_log_dev,
+                        int flags);
+
   //void _aio_finish(void *priv);
 
   void _flush_bdev_safely(FileWriter *h);
@@ -316,7 +335,7 @@ private:
   void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);
 
   int _open_super();
-  int _write_super();
+  int _write_super(int dev);
   int _replay(bool noop, bool to_stdout = false); ///< replay journal
 
   FileWriter *_create_writer(FileRef f);
@@ -339,6 +358,7 @@ public:
   int mkfs(uuid_d osd_uuid);
   int mount();
   void umount();
+  int prepare_new_device(int id);
   
   int log_dump();
 
@@ -346,6 +366,15 @@ public:
   void get_devices(set<string> *ls);
   int fsck();
 
+  int device_migrate_to_new(
+    CephContext *cct,
+    const set<int>& devs_source,
+    int dev_target);
+  int device_migrate_to_existing(
+    CephContext *cct,
+    const set<int>& devs_source,
+    int dev_target);
+
   uint64_t get_used();
   uint64_t get_total(unsigned id);
   uint64_t get_free(unsigned id);
index 280fba4a9175f65bd8a9156ebc71ed657a4a46cd..5521045833d280cb3bdb5913a17f365005209dbd 100644 (file)
@@ -4885,6 +4885,156 @@ bool BlueStore::test_mount_in_use()
   return ret;
 }
 
+// Create the BlueFS instance, register its block devices and mount it.
+//
+// Devices are looked up under `path`: "block.db" (optional), "block"
+// (shared, always added) and "block.wal" (optional). When `create` is true
+// the initial extents are handed to BlueFS and mkfs() is run before mount().
+//
+// Returns 0 on success. On failure returns a negative errno value (or
+// -EINVAL for a bad bluefs_alloc_size); in that case `bluefs` has been
+// deleted and reset to NULL.
+int BlueStore::_open_bluefs(bool create)
+{
+  int r;
+  bluefs = new BlueFS(cct);
+
+  string bfn;
+  struct stat st;
+
+  // optional dedicated DB device
+  bfn = path + "/block.db";
+  if (::stat(bfn.c_str(), &st) == 0) {
+    r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn,
+         create && cct->_conf->bdev_enable_discard);
+    if (r < 0) {
+      derr << __func__ << " add block device(" << bfn << ") returned: "
+            << cpp_strerror(r) << dendl;
+      goto free_bluefs;
+    }
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
+      r = _check_or_set_bdev_label(
+       bfn,
+       bluefs->get_block_device_size(BlueFS::BDEV_DB),
+        "bluefs db", create);
+      if (r < 0) {
+        derr << __func__
+             << " check block device(" << bfn << ") label returned: "
+              << cpp_strerror(r) << dendl;
+        goto free_bluefs;
+      }
+    }
+    if (create) {
+      // everything past the first SUPER_RESERVED bytes goes to BlueFS
+      bluefs->add_block_extent(
+       BlueFS::BDEV_DB,
+       SUPER_RESERVED,
+       bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
+    }
+    // with a dedicated DB the shared device is registered in the SLOW slot
+    bluefs_shared_bdev = BlueFS::BDEV_SLOW;
+    bluefs_single_shared_device = false;
+  } else {
+    r = -errno;
+    // stat() failed: if lstat() fails as well the path is treated as absent
+    // and the shared device takes the DB slot; otherwise the entry exists
+    // but its target is unusable, which is an error.
+    if (::lstat(bfn.c_str(), &st) == -1) {
+      r = 0;
+      bluefs_shared_bdev = BlueFS::BDEV_DB;
+    } else {
+      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+           << cpp_strerror(r) << dendl;
+      goto free_bluefs;
+    }
+  }
+
+  // shared device
+  bfn = path + "/block";
+  // never trim here
+  r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false);
+  if (r < 0) {
+    derr << __func__ << " add block device(" << bfn << ") returned: "
+         << cpp_strerror(r) << dendl;
+    goto free_bluefs;
+  }
+  if (create) {
+    // note: we always leave the first SUPER_RESERVED (8k) of the device unused
+    uint64_t initial =
+      bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
+                         cct->_conf->bluestore_bluefs_gift_ratio);
+    initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
+    if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
+      derr << __func__ << " bluefs_alloc_size 0x" << std::hex
+           << cct->_conf->bluefs_alloc_size << " is not a multiple of "
+           << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+      r = -EINVAL;
+      goto free_bluefs;
+    }
+    // align to bluefs's alloc_size
+    initial = p2roundup(initial, cct->_conf->bluefs_alloc_size);
+    // put bluefs in the middle of the device in case it is an HDD
+    uint64_t start = p2align((bdev->get_size() - initial) / 2,
+                             cct->_conf->bluefs_alloc_size);
+    //avoiding superblock overwrite
+    ceph_assert(cct->_conf->bluefs_alloc_size > _get_ondisk_reserved());
+    start = std::max(cct->_conf->bluefs_alloc_size, start);
+
+    // hand the extent to BlueFS and remember it in bluefs_extents
+    bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
+    bluefs_extents.insert(start, initial);
+  }
+
+  // optional dedicated WAL device
+  bfn = path + "/block.wal";
+  if (::stat(bfn.c_str(), &st) == 0) {
+    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
+      create && cct->_conf->bdev_enable_discard);
+    if (r < 0) {
+      derr << __func__ << " add block device(" << bfn << ") returned: "
+           << cpp_strerror(r) << dendl;
+      goto free_bluefs;
+    }
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
+      r = _check_or_set_bdev_label(
+       bfn,
+       bluefs->get_block_device_size(BlueFS::BDEV_WAL),
+        "bluefs wal", create);
+      if (r < 0) {
+        derr << __func__ << " check block device(" << bfn
+              << ") label returned: " << cpp_strerror(r) << dendl;
+        goto free_bluefs;
+      }
+    }
+
+    if (create) {
+      // everything past the first BDEV_LABEL_BLOCK_SIZE bytes goes to BlueFS
+      bluefs->add_block_extent(
+        BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
+         bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
+         BDEV_LABEL_BLOCK_SIZE);
+    }
+    kv_options["separate_wal_dir"] = "1";
+    bluefs_single_shared_device = false;
+  } else {
+    r = -errno;
+    // same stat()/lstat() distinction as for block.db above
+    if (::lstat(bfn.c_str(), &st) == -1) {
+      kv_options.erase("separate_wal_dir");
+      r = 0;
+    } else {
+      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+           << cpp_strerror(r) << dendl;
+      goto free_bluefs;
+    }
+  }
+
+  if (create) {
+    // NOTE(review): the return value of mkfs() is not checked here
+    bluefs->mkfs(fsid);
+  }
+  r = bluefs->mount();
+  if (r < 0) {
+    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
+    goto free_bluefs;
+  }
+  
+  return 0;
+  // common error exit: drop the BlueFS instance (it is not unmounted here)
+free_bluefs:
+  ceph_assert(bluefs);
+  delete bluefs;
+  bluefs = NULL;
+  return r;
+}
+
+// Unmount BlueFS, destroy the instance and reset the pointer to NULL.
+// bluefs is dereferenced unconditionally; the callers visible in this change
+// (_open_db error path, _close_db) guard the call with `if (bluefs)`.
+void BlueStore::_close_bluefs()
+{
+  bluefs->umount();
+  delete bluefs;
+  bluefs = NULL;
+}
 int BlueStore::_open_db(bool create, bool to_repair_db)
 {
   int r;
@@ -4938,138 +5088,9 @@ int BlueStore::_open_db(bool create, bool to_repair_db)
       derr << " backend must be rocksdb to use bluefs" << dendl;
       return -EINVAL;
     }
-    bluefs = new BlueFS(cct);
-
-    string bfn;
-    struct stat st;
-
-    bfn = path + "/block.db";
-    if (::stat(bfn.c_str(), &st) == 0) {
-      r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn,
-           create && cct->_conf->bdev_enable_discard);
-      if (r < 0) {
-        derr << __func__ << " add block device(" << bfn << ") returned: " 
-             << cpp_strerror(r) << dendl;
-        goto free_bluefs;
-      }
-
-      if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
-        r = _check_or_set_bdev_label(
-         bfn,
-         bluefs->get_block_device_size(BlueFS::BDEV_DB),
-          "bluefs db", create);
-        if (r < 0) {
-          derr << __func__
-              << " check block device(" << bfn << ") label returned: "
-               << cpp_strerror(r) << dendl;
-          goto free_bluefs;
-        }
-      }
-      if (create) {
-       bluefs->add_block_extent(
-         BlueFS::BDEV_DB,
-         SUPER_RESERVED,
-         bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
-      }
-      bluefs_shared_bdev = BlueFS::BDEV_SLOW;
-      bluefs_single_shared_device = false;
-    } else {
-      r = -errno;
-      if (::lstat(bfn.c_str(), &st) == -1) {
-       r = 0;
-       bluefs_shared_bdev = BlueFS::BDEV_DB;
-      } else {
-       derr << __func__ << " " << bfn << " symlink exists but target unusable: "
-            << cpp_strerror(r) << dendl;
-       goto free_bluefs;
-      }
-    }
-
-    // shared device
-    bfn = path + "/block";
-    // never trim here
-    r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false);
-    if (r < 0) {
-      derr << __func__ << " add block device(" << bfn << ") returned: " 
-          << cpp_strerror(r) << dendl;
-      goto free_bluefs;
-    }
-    if (create) {
-      // note: we always leave the first SUPER_RESERVED (8k) of the device unused
-      uint64_t initial =
-       bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
-                           cct->_conf->bluestore_bluefs_gift_ratio);
-      initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
-      if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
-       derr << __func__ << " bluefs_alloc_size 0x" << std::hex
-            << cct->_conf->bluefs_alloc_size << " is not a multiple of "
-            << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
-       r = -EINVAL;
-       goto free_bluefs;
-      }
-      // align to bluefs's alloc_size
-      initial = p2roundup(initial, cct->_conf->bluefs_alloc_size);
-      // put bluefs in the middle of the device in case it is an HDD
-      uint64_t start = p2align((bdev->get_size() - initial) / 2,
-                              cct->_conf->bluefs_alloc_size);
-      //avoiding superblock overwrite
-      ceph_assert(cct->_conf->bluefs_alloc_size > _get_ondisk_reserved());
-      start = std::max(cct->_conf->bluefs_alloc_size, start);
-
-      bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
-      bluefs_extents.insert(start, initial);
-    }
-
-    bfn = path + "/block.wal";
-    if (::stat(bfn.c_str(), &st) == 0) {
-      r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
-       create && cct->_conf->bdev_enable_discard);
-      if (r < 0) {
-        derr << __func__ << " add block device(" << bfn << ") returned: " 
-            << cpp_strerror(r) << dendl;
-        goto free_bluefs;                      
-      }
 
-      if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
-        r = _check_or_set_bdev_label(
-         bfn,
-         bluefs->get_block_device_size(BlueFS::BDEV_WAL),
-          "bluefs wal", create);
-        if (r < 0) {
-          derr << __func__ << " check block device(" << bfn
-               << ") label returned: " << cpp_strerror(r) << dendl;
-          goto free_bluefs;
-        }
-      }
+    r = _open_bluefs(create);
 
-      if (create) {
-       bluefs->add_block_extent(
-         BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
-         bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
-          BDEV_LABEL_BLOCK_SIZE);
-      }
-      kv_options["separate_wal_dir"] = "1";
-      bluefs_single_shared_device = false;
-    } else {
-      r = -errno;
-      if (::lstat(bfn.c_str(), &st) == -1) {
-       kv_options.erase("separate_wal_dir");
-       r = 0;
-      } else {
-       derr << __func__ << " " << bfn << " symlink exists but target unusable: "
-            << cpp_strerror(r) << dendl;
-       goto free_bluefs;
-      }
-    }
-
-    if (create) {
-      bluefs->mkfs(fsid);
-    }
-    r = bluefs->mount();
-    if (r < 0) {
-      derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
-      goto free_bluefs;
-    }
     if (cct->_conf->bluestore_bluefs_env_mirror) {
       rocksdb::Env *a = new BlueRocksEnv(bluefs);
       rocksdb::Env *b = rocksdb::Env::Default();
@@ -5143,9 +5164,7 @@ int BlueStore::_open_db(bool create, bool to_repair_db)
   if (!db) {
     derr << __func__ << " error creating db" << dendl;
     if (bluefs) {
-      bluefs->umount();
-      delete bluefs;
-      bluefs = NULL;
+      _close_bluefs();
     }
     // delete env manually here since we can't depend on db to do this
     // under this case
@@ -5194,12 +5213,6 @@ int BlueStore::_open_db(bool create, bool to_repair_db)
   dout(1) << __func__ << " opened " << kv_backend
          << " path " << fn << " options " << options << dendl;
   return 0;
-
-free_bluefs:
-  ceph_assert(bluefs);
-  delete bluefs;
-  bluefs = NULL;
-  return r;
 }
 
 void BlueStore::_close_db()
@@ -5208,9 +5221,7 @@ void BlueStore::_close_db()
   delete db;
   db = NULL;
   if (bluefs) {
-    bluefs->umount();
-    delete bluefs;
-    bluefs = NULL;
+    _close_bluefs();
   }
 }
 
@@ -5312,9 +5323,10 @@ int BlueStore::allocate_bluefs_freespace(uint64_t size)
              << bluefs_extents << std::dec << dendl;
     synct->set(PREFIX_SUPER, "bluefs_extents", bl);
 
-    int r = db->submit_transaction_sync(synct);
-    assert(r == 0);
+    synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
 
+    int r = db->submit_transaction_sync(synct);
+    ceph_assert(r == 0);
   }
   return 0;
 }
@@ -5403,13 +5415,13 @@ int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
 
     if (alloc_len <= 0) {
       dout(0) << __func__ << " no allocate on 0x" << std::hex << gift
-              << " min_alloc_size 0x" << cct->_conf->bluefs_alloc_size
+              << " bluefs_alloc_size 0x" << cct->_conf->bluefs_alloc_size
              << std::dec << dendl;
       _dump_alloc_on_rebalance_failure();
       return 0;
     } else if (alloc_len < (int64_t)gift) {
       dout(0) << __func__ << " insufficient allocate on 0x" << std::hex << gift
-              << " min_alloc_size 0x" << cct->_conf->bluefs_alloc_size
+              << " bluefs_alloc_size 0x" << cct->_conf->bluefs_alloc_size
              << " allocated 0x" << alloc_len
              << std::dec << dendl;
       _dump_alloc_on_rebalance_failure();
@@ -5796,6 +5808,277 @@ int BlueStore::mkfs()
   return r;
 }
 
+int BlueStore::_mount_for_bluefs()  // limited mount: path, fsid (read + lock) and BlueFS only
+{
+  int r = _open_path();
+  ceph_assert(r == 0);  // every step aborts on failure instead of returning an error
+  r = _open_fsid(false);
+  ceph_assert(r == 0);
+  r = _read_fsid(&fsid);
+  ceph_assert(r == 0);
+  r = _lock_fsid();
+  ceph_assert(r == 0);
+  r = _open_bluefs(false);  // the argument is the `create` flag (false here)
+  ceph_assert(r == 0);
+  return r;  // always 0 when reached, given the asserts above
+}
+
+void BlueStore::_umount_for_bluefs()  // counterpart of _mount_for_bluefs()
+{
+  _close_bluefs();  // teardown runs in the reverse order of the opens in _mount_for_bluefs()
+  _close_fsid();
+  _close_path();
+}
+
+int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
+{
+  // Offline helper: attach a new WAL (BDEV_NEWWAL) or DB (BDEV_NEWDB) volume
+  // at dev_path to BlueFS. Returns -EIO when BlueFS is disabled, 0 otherwise;
+  // every other failing step aborts through ceph_assert.
+  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+  int r;
+  ceph_assert(path_fd < 0);
+
+  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  r = _mount_for_bluefs();
+
+  int reserved = 0;
+  if (id == BlueFS::BDEV_NEWWAL) {
+    string p = path + "/block.wal";
+    r = _setup_block_symlink_or_file("block.wal", dev_path,
+       cct->_conf->bluestore_block_wal_size,
+       true);
+    ceph_assert(r == 0);
+
+    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
+      cct->_conf->bdev_enable_discard);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+      r = _check_or_set_bdev_label(
+       p,
+       bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+        "bluefs wal",
+       true);
+      ceph_assert(r == 0);
+    }
+
+    reserved = BDEV_LABEL_BLOCK_SIZE;
+  } else if (id == BlueFS::BDEV_NEWDB) {
+    string p = path + "/block.db";
+    r = _setup_block_symlink_or_file("block.db", dev_path,
+       cct->_conf->bluestore_block_db_size,
+       true);
+    ceph_assert(r == 0);
+
+    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
+      cct->_conf->bdev_enable_discard);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+      r = _check_or_set_bdev_label(
+       p,
+       bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+        "bluefs db",
+       true);
+      ceph_assert(r == 0);
+    }
+    reserved = SUPER_RESERVED;
+  }
+  // Remount BlueFS after attaching the device; a failed remount aborts here.
+  bluefs->umount();
+  r = bluefs->mount();
+  ceph_assert(r == 0);
+
+  bluefs->add_block_extent(
+    id,
+    reserved,
+    bluefs->get_block_device_size(id) - reserved);
+
+  r = bluefs->prepare_new_device(id);
+  ceph_assert(r == 0);
+
+  dout(0) << __func__ << " success" << dendl;
+
+  _umount_for_bluefs();
+  return r;
+}
+
+int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
+  int id)
+{
+  // Offline helper: move BlueFS data from the devs_source devices onto the
+  // already attached device `id` (BDEV_SLOW or BDEV_DB), then unlink the
+  // block.db / block.wal entries of migrated sources. Returns 0 or -errno.
+  dout(10) << __func__ << " id:" << id << dendl;
+  ceph_assert(path_fd < 0);
+  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  int r = _mount_for_bluefs();
+
+  // require bluestore_bluefs_min_free to be free at target device!
+  uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
+  for(auto src_id : devs_source) {
+    used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
+  }
+  uint64_t target_free = bluefs->get_free(id);
+  if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
+    // will need to remount full BlueStore instance to allocate more space
+    _umount_for_bluefs();
+
+    r = mount();
+    ceph_assert(r == 0);
+    dout(1) << __func__
+            << " Allocating more space at slow device for BlueFS: +"
+           << used_space - target_free << " bytes" << dendl;
+    r = allocate_bluefs_freespace(used_space - target_free);
+    umount();
+    if (r != 0) {
+      derr << __func__
+          << " can't migrate, unable to allocate extra space: "
+          << used_space - target_free << " at target:" << id << dendl;
+      return -ENOSPC;
+    }
+
+    r = _mount_for_bluefs();
+    ceph_assert(r == 0);
+  } else if (target_free < used_space) {
+    derr << __func__ << " can't migrate, free space at target: " << target_free
+         << " is less than required space: " << used_space << dendl;
+    r = -ENOSPC;
+    goto shutdown;  // leave via shutdown so the limited BlueFS mount is released
+  }
+  r = bluefs->device_migrate_to_existing(cct, devs_source, id);
+  if (r < 0) {
+    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+    goto shutdown;
+  }
+
+  if (devs_source.count(BlueFS::BDEV_DB)) {
+    r = unlink(string(path + "/block.db").c_str());
+    ceph_assert(r == 0);
+  }
+  if (devs_source.count(BlueFS::BDEV_WAL)) {
+    r = unlink(string(path + "/block.wal").c_str());
+    ceph_assert(r == 0);
+  }
+
+shutdown:
+  _umount_for_bluefs();
+  return r;
+}
+
+int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
+  int id,
+  const string& dev_path)
+{
+  // Offline helper: attach dev_path as a new WAL/DB volume, migrate BlueFS
+  // data from devs_source onto it, then swap the block.wal/block.db links.
+  // Returns 0 on success or a negative error code.
+  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+  int r;
+  ceph_assert(path_fd < 0);
+
+  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  r = _mount_for_bluefs();
+
+  int reserved = 0;
+  string link_db;
+  string link_wal;
+  if (devs_source.count(BlueFS::BDEV_DB) &&
+      bluefs_shared_bdev != BlueFS::BDEV_DB) {
+    link_db = path + "/block.db";
+  }
+  if (devs_source.count(BlueFS::BDEV_WAL)) {
+    link_wal = path + "/block.wal";
+  }
+
+  size_t target_size = 0;
+  string target_name;
+  if (id == BlueFS::BDEV_NEWWAL) {
+    target_name = "block.wal";
+    target_size = cct->_conf->bluestore_block_wal_size;
+
+    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
+      cct->_conf->bdev_enable_discard);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+      r = _check_or_set_bdev_label(
+       dev_path,
+       bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+        "bluefs wal",
+       true);
+      ceph_assert(r == 0);
+    }
+    reserved = BDEV_LABEL_BLOCK_SIZE;
+  } else if (id == BlueFS::BDEV_NEWDB) {
+    target_name = "block.db";
+    target_size = cct->_conf->bluestore_block_db_size;
+
+    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
+      cct->_conf->bdev_enable_discard);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+      r = _check_or_set_bdev_label(
+       dev_path,
+       bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+        "bluefs db",
+       true);
+      ceph_assert(r == 0);
+    }
+    reserved = SUPER_RESERVED;
+  }
+  // Remount BlueFS after attaching the device; a failed remount aborts here.
+  bluefs->umount();
+  r = bluefs->mount();
+  ceph_assert(r == 0);
+
+  bluefs->add_block_extent(
+    id, reserved, bluefs->get_block_device_size(id) - reserved);
+
+  r = bluefs->device_migrate_to_new(cct, devs_source, id);
+
+  if (r < 0) {
+    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+    goto shutdown;
+  }
+
+  if (!link_db.empty()) {
+    r = unlink(link_db.c_str());
+    ceph_assert(r == 0);
+  }
+  if (!link_wal.empty()) {
+    r = unlink(link_wal.c_str());
+    ceph_assert(r == 0);
+  }
+  r = _setup_block_symlink_or_file(target_name, dev_path, target_size, true);
+  ceph_assert(r == 0);
+  dout(0) << __func__ << " success" << dendl;
+
+shutdown:
+  _umount_for_bluefs();
+  return r;
+}
+
 void BlueStore::set_cache_shards(unsigned num)
 {
   dout(10) << __func__ << " " << num << dendl;
@@ -8509,6 +8792,21 @@ int BlueStore::_open_super_meta()
 
   // bluefs alloc
   if (cct->_conf->bluestore_bluefs) {
+    {
+      bluefs_extents.clear();
+      bufferlist bl;
+      db->get(PREFIX_SUPER, "bluefs_extents_back", &bl);
+      auto p = bl.cbegin();
+      try {
+       decode(bluefs_extents, p);
+      }
+      catch (buffer::error& e) {
+       dout(0) << __func__ << " unable to read bluefs_extents_back" << dendl;
+       //return -EIO;
+      }
+      dout(10) << __func__ << " bluefs_extents_back 0x" << std::hex << bluefs_extents
+              << std::dec << dendl;
+    }
     bluefs_extents.clear();
     bufferlist bl;
     db->get(PREFIX_SUPER, "bluefs_extents", &bl);
@@ -8522,6 +8820,7 @@ int BlueStore::_open_super_meta()
     }
     dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
             << std::dec << dendl;
+    
   }
 
   // ondisk format
index 098311f32cec3efad1f1e408f015d59abb8fc97d..e2d48a951201729269bf20a50e86208e2a8a6bbb 100644 (file)
@@ -2123,6 +2123,15 @@ private:
   // its initialization (and outside of _open_bdev)
   void _validate_bdev();
   void _close_bdev();
+
+  int _open_bluefs(bool create);
+  void _close_bluefs();
+
+  // Limited (u)mount intended for BlueFS operations only
+  int _mount_for_bluefs();
+  void _umount_for_bluefs();
+
+
   /*
    * @warning to_repair_db means that we open this db to repair it, will not
    * hold the rocksdb's file lock.
@@ -2356,6 +2365,13 @@ public:
     f->close_section();
   }
 
+  int add_new_bluefs_device(int id, const string& path);  // id: BlueFS::BDEV_NEWWAL or BDEV_NEWDB
+  int migrate_to_existing_bluefs_device(const set<int>& devs_source,
+    int id);  // id (target): BlueFS::BDEV_SLOW or BDEV_DB
+  int migrate_to_new_bluefs_device(const set<int>& devs_source,
+    int id,
+    const string& path);  // id: BlueFS::BDEV_NEWWAL or BDEV_NEWDB
+
 public:
   int statfs(struct store_statfs_t *buf) override;
 
index 0c4ed2fcbae5a7321124d267e673dd5ce83f8164..601b72879e3295cd14a08be132ad2498c84b262b 100644 (file)
@@ -88,47 +88,67 @@ const char* find_device_path(
   return nullptr;
 }
 
-void add_devices(
-  BlueFS *fs,
+void parse_devices(  // classify devs by their bdev label and record path -> BlueFS slot id in *got
   CephContext *cct,
-  const vector<string>& devs)
+  const vector<string>& devs,
+  map<string, int>* got,  // out: device path -> BlueFS::BDEV_* id; dereferenced unconditionally
+  bool* has_db,  // out, may be null: whether a "bluefs db" label was seen
+  bool* has_wal)  // out, may be null: whether a "bluefs wal" label was seen
 {
   string main;
-  set<int> got;
-  for (auto& i : devs) {
+  bool was_db = false;
+  if (has_wal) {
+    *has_wal = false;
+  }
+  if (has_db) {
+    *has_db = false;
+  }
+  for (auto& d : devs) {
     bluestore_bdev_label_t label;
-    int r = BlueStore::_read_bdev_label(cct, i, &label);
+    int r = BlueStore::_read_bdev_label(cct, d, &label);
     if (r < 0) {
-      cerr << "unable to read label for " << i << ": "
+      cerr << "unable to read label for " << d << ": "
           << cpp_strerror(r) << std::endl;
       exit(EXIT_FAILURE);
     }
     int id = -1;
     if (label.description == "main")
-      main = i;
-    else if (label.description == "bluefs db")
+      main = d;  // slot for the main device is decided after the loop
+    else if (label.description == "bluefs db") {
       id = BlueFS::BDEV_DB;
-    else if (label.description == "bluefs wal")
+      was_db = true;
+      if (has_db) {
+       *has_db = true;
+      }
+    }
+    else if (label.description == "bluefs wal") {
       id = BlueFS::BDEV_WAL;
-    if (id >= 0) {
-      got.insert(id);
-      cout << " slot " << id << " " << i << std::endl;
-      int r = fs->add_block_device(id, i, false);
-      if (r < 0) {
-       cerr << "unable to open " << i << ": " << cpp_strerror(r) << std::endl;
-       exit(EXIT_FAILURE);
+      if (has_wal) {
+       *has_wal = true;
       }
     }
+    if (id >= 0) {
+      got->emplace(d, id);
+    }
   }
   if (main.length()) {
-    int id = BlueFS::BDEV_DB;
-    if (got.count(BlueFS::BDEV_DB))
-      id = BlueFS::BDEV_SLOW;
-    cout << " slot " << id << " " << main << std::endl;
-    int r = fs->add_block_device(id, main, false);
+    int id = was_db ? BlueFS::BDEV_SLOW : BlueFS::BDEV_DB;  // main is the slow slot only if a db-labelled device was seen
+    got->emplace(main, id);
+  }
+}
+
+void add_devices(
+  BlueFS *fs,
+  CephContext *cct,
+  const vector<string>& devs)
+{
+  map<string, int> got;
+  parse_devices(cct, devs, &got, nullptr, nullptr);
+  for(auto e : got) {
+    cout << " slot " << e.second << " " << e.first << std::endl;
+    int r = fs->add_block_device(e.second, e.first, false);
     if (r < 0) {
-      cerr << "unable to open " << main << ": " << cpp_strerror(r)
-          << std::endl;
+      cerr << "unable to open " << e.first << ": " << cpp_strerror(r) << std::endl;
       exit(EXIT_FAILURE);
     }
   }
@@ -185,6 +205,8 @@ int main(int argc, char **argv)
 {
   string out_dir;
   vector<string> devs;
+  vector<string> devs_source;
+  string dev_target;
   string path;
   string action;
   string log_file;
@@ -199,13 +221,15 @@ int main(int argc, char **argv)
     ("log-file,l", po::value<string>(&log_file), "log file")
     ("log-level", po::value<int>(&log_level), "log level (30=most, 20=lots, 10=some, 1=little)")
     ("dev", po::value<vector<string>>(&devs), "device(s)")
+    ("devs-source", po::value<vector<string>>(&devs_source), "bluefs-dev-migrate source device(s)")
+    ("dev-target", po::value<string>(&dev_target), "target/resulting device")
     ("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)")
     ("key,k", po::value<string>(&key), "label metadata key name")
     ("value,v", po::value<string>(&value), "label metadata value")
     ;
   po::options_description po_positional("Positional options");
   po_positional.add_options()
-    ("command", po::value<string>(&action), "fsck, repair, bluefs-export, bluefs-bdev-sizes, bluefs-bdev-expand, show-label, set-label-key, rm-label-key, prime-osd-dir, bluefs-log-dump")
+    ("command", po::value<string>(&action), "fsck, repair, bluefs-export, bluefs-bdev-sizes, bluefs-bdev-expand, bluefs-bdev-new-db, bluefs-bdev-new-wal, bluefs-bdev-migrate, show-label, set-label-key, rm-label-key, prime-osd-dir, bluefs-log-dump")
     ;
   po::options_description po_all("All options");
   po_all.add(po_options).add(po_positional);
@@ -292,6 +316,31 @@ int main(int argc, char **argv)
     }
     inferring_bluefs_devices(devs, path);
   }
+  if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (dev_target.empty()) {
+      cout << "NOTICE: --dev-target option omitted, will allocate as a file" << std::endl;
+    }
+    inferring_bluefs_devices(devs, path);
+  }
+  if (action == "bluefs-bdev-migrate") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    inferring_bluefs_devices(devs, path);
+    if (devs_source.size() == 0) {
+      cerr << "must specify source devices with --devs-source" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (dev_target.empty()) {
+      cerr << "must specify target device with --dev-target" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
 
   vector<const char*> args;
   if (log_file.size()) {
@@ -592,6 +641,157 @@ int main(int argc, char **argv)
     delete fs;
   } else if (action == "bluefs-log-dump") {
     log_dump(cct.get(), path, devs);
+  } else if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+    map<string, int> cur_devs_map;
+    bool need_db = action == "bluefs-bdev-new-db";
+
+    bool has_wal = false;
+    bool has_db = false;
+    parse_devices(cct.get(), devs, &cur_devs_map, &has_db, &has_wal);
+
+    if (has_db && has_wal) {
+      cerr << "can't allocate new device, both WAL and DB exist"
+           << std::endl;
+      exit(EXIT_FAILURE);
+    } else if (need_db && has_db) {
+      cerr << "can't allocate new DB device, already exists"
+           << std::endl;
+      exit(EXIT_FAILURE);
+    } else if (!need_db && has_wal) {
+      cerr << "can't allocate new WAL device, already exists"
+           << std::endl;
+      exit(EXIT_FAILURE);
+    } else {
+      // Create either DB or WAL volume
+      BlueStore bluestore(cct.get(), path);
+
+      char target_path[PATH_MAX] = "";
+      if(!dev_target.empty()) {
+       if (realpath(dev_target.c_str(), target_path) == nullptr) {
+         cerr << "failed to retrieve absolute path for " << dev_target
+              << ": " << cpp_strerror(errno)
+              << std::endl;
+       }
+      }
+      int r = bluestore.add_new_bluefs_device(
+       need_db ? BlueFS::BDEV_NEWDB : BlueFS::BDEV_NEWWAL,
+       target_path);
+      if (r == 0) {
+       cout << (need_db ? "DB" : "WAL") << " device added " << target_path
+            << std::endl;
+      } else {
+       cerr << "failed to add " << (need_db ? "DB" : "WAL") << " device:"
+            << cpp_strerror(r)
+            << std::endl;
+      }
+    }
+  } else if (action == "bluefs-bdev-migrate") {
+    map<string, int> cur_devs_map;
+    set<int> src_dev_ids;
+    map<string, int> src_devs;
+
+
+    parse_devices(cct.get(), devs, &cur_devs_map, nullptr, nullptr);
+    for (auto& s :  devs_source) {
+      auto i = cur_devs_map.find(s);
+      if (i != cur_devs_map.end()) {
+       src_devs.emplace(*i);
+       src_dev_ids.emplace(i->second);
+      } else {
+       cerr << "can't migrate " << s << ", not a valid bluefs volume "
+             << std::endl;
+       exit(EXIT_FAILURE);
+      }
+    }
+
+    auto i = cur_devs_map.find(dev_target);
+
+    if (i != cur_devs_map.end()) {
+      // Migrate to an existing BlueFS volume
+
+      auto dev_target_id = i->second;
+      if (dev_target_id == BlueFS::BDEV_WAL) {
+       // currently we're unable to migrate to WAL device since there is no space
+       // reserved for superblock
+       cerr << "Migrate to WAL device isn't supported." << std::endl;
+       exit(EXIT_FAILURE);
+      }
+
+      bool need_db = dev_target_id == BlueFS::BDEV_NEWDB;
+
+      BlueStore bluestore(cct.get(), path);
+      int r = bluestore.migrate_to_existing_bluefs_device(
+       src_dev_ids,
+       dev_target_id);
+      if (r == 0) {
+       for(auto src : src_devs) {
+         if (src.second != BlueFS::BDEV_SLOW) {
+           cout << " device removed:" << src.second << " " << src.first
+                << std::endl;
+         }
+       }
+      } else {
+       cerr << "failed to migrate to existing BlueFS device: "
+            << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_DB)
+            << " " << dev_target
+            << cpp_strerror(r)
+            << std::endl;
+      }
+      ceph_assert(r == 0);
+    } else {
+      // Migrate to a new BlueFS volume
+      // via creating either DB or WAL volume
+      int dev_target_id;
+      if (src_dev_ids.count(BlueFS::BDEV_DB)) {
+       // if we have DB device in the source list - we create DB device
+       // (and may be remove WAL).
+       dev_target_id = BlueFS::BDEV_NEWDB;
+      } else if (src_dev_ids.count(BlueFS::BDEV_WAL)) {
+       dev_target_id = BlueFS::BDEV_NEWWAL;
+      } else {
+        cerr << "Unable to migrate Slow volume to new location, "
+               "please allocate new DB or WAL with "
+               "--bluefs-bdev-new-db(wal) command"
+            << std::endl;
+       exit(EXIT_FAILURE);
+      }
+
+      BlueStore bluestore(cct.get(), path);
+
+      char target_path[PATH_MAX] = "";
+      if(!dev_target.empty()) {
+       if (realpath(dev_target.c_str(), target_path) == nullptr) {
+         cerr << "failed to retrieve absolute path for " << dev_target
+              << ": " << cpp_strerror(errno)
+              << std::endl;
+       }
+      }
+      bool need_db = dev_target_id == BlueFS::BDEV_NEWDB;
+      int r = bluestore.migrate_to_new_bluefs_device(
+       src_dev_ids,
+       dev_target_id,
+       target_path);
+      if (r == 0) {
+       for(auto src : src_devs) {
+         if (src.second != BlueFS::BDEV_SLOW) {
+           cout << " device removed:" << src.second << " " << src.first
+                << std::endl;
+         }
+       }
+       cout << " device added: "
+            << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_DB)
+            << " " << target_path
+            << std::endl;
+      } else {
+       cerr << "failed to migrate to new BlueFS device: "
+            << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_DB)
+            << " " << target_path
+            << cpp_strerror(r)
+            << std::endl;
+      }
+
+      ceph_assert(r == 0);
+    }
   } else {
     cerr << "unrecognized action " << action << std::endl;
     return 1;