From 631041a7b788eaaf010f1e39f58b5546ab5580fc Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Thu, 3 May 2018 20:22:15 +0300 Subject: [PATCH] os/bluestore: add new bitmap allocator Signed-off-by: Igor Fedotov --- src/os/CMakeLists.txt | 2 + src/os/bluestore/Allocator.cc | 3 +- src/os/bluestore/BitmapFastAllocator.cc | 73 ++ src/os/bluestore/BitmapFastAllocator.h | 49 ++ src/os/bluestore/fastbmap_allocator_impl.cc | 359 ++++++++ src/os/bluestore/fastbmap_allocator_impl.h | 775 ++++++++++++++++++ src/test/objectstore/Allocator_test.cc | 6 +- src/test/objectstore/CMakeLists.txt | 11 + .../objectstore/fastbmap_allocator_test.cc | 584 +++++++++++++ 9 files changed, 1857 insertions(+), 5 deletions(-) create mode 100755 src/os/bluestore/BitmapFastAllocator.cc create mode 100755 src/os/bluestore/BitmapFastAllocator.h create mode 100755 src/os/bluestore/fastbmap_allocator_impl.cc create mode 100755 src/os/bluestore/fastbmap_allocator_impl.h create mode 100755 src/test/objectstore/fastbmap_allocator_test.cc diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt index bd2af99ec3a7f..60c21e6df15cb 100644 --- a/src/os/CMakeLists.txt +++ b/src/os/CMakeLists.txt @@ -28,9 +28,11 @@ if(WITH_BLUESTORE) bluestore/BlueRocksEnv.cc bluestore/BlueStore.cc bluestore/bluestore_types.cc + bluestore/fastbmap_allocator_impl.cc bluestore/FreelistManager.cc bluestore/StupidAllocator.cc bluestore/BitMapAllocator.cc + bluestore/BitmapFastAllocator.cc bluestore/BitAllocator.cc ) endif(WITH_BLUESTORE) diff --git a/src/os/bluestore/Allocator.cc b/src/os/bluestore/Allocator.cc index 059e088e1822b..d91568ef184f9 100644 --- a/src/os/bluestore/Allocator.cc +++ b/src/os/bluestore/Allocator.cc @@ -4,6 +4,7 @@ #include "Allocator.h" #include "StupidAllocator.h" #include "BitMapAllocator.h" +#include "BitmapFastAllocator.h" #include "common/debug.h" #define dout_subsys ceph_subsys_bluestore @@ -14,7 +15,7 @@ Allocator *Allocator::create(CephContext* cct, string type, if (type == "stupid") { return new StupidAllocator(cct); } else if (type == "bitmap") { - return new BitMapAllocator(cct, size, block_size); + return new BitmapFastAllocator(cct, size, block_size); } lderr(cct) << "Allocator::" << __func__ << " unknown alloc type " << type << dendl; diff --git a/src/os/bluestore/BitmapFastAllocator.cc b/src/os/bluestore/BitmapFastAllocator.cc new file mode 100755 index 0000000000000..9139e817e65c0 --- /dev/null +++ b/src/os/bluestore/BitmapFastAllocator.cc @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "BitmapFastAllocator.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << "fbmap_alloc 0x" << this << " " + +BitmapFastAllocator::BitmapFastAllocator(CephContext* _cct, + int64_t capacity, + int64_t alloc_unit) : + cct(_cct) +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << capacity << "/" + << alloc_unit << std::dec << dendl; + _init(capacity, alloc_unit, false); +} + +int64_t BitmapFastAllocator::allocate( + uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, + int64_t hint, PExtentVector *extents) +{ + uint64_t allocated = 0; + + if (hint != 0) { + last_pos = hint; + } + + _allocate_l2(want_size, alloc_unit, max_alloc_size, &last_pos, + &allocated, extents); + if (!allocated) { + return -ENOSPC; + } + for (auto e : *extents) { + ldout(cct, 10) << __func__ + << " 0x" << std::hex << e.offset << "~" << e.length + << "/" << alloc_unit << "," << max_alloc_size << 
"," << hint + << std::dec << dendl; + } + return int64_t(allocated); +} + +void BitmapFastAllocator::release( + const interval_set& release_set) +{ + for (auto r : release_set) { + ldout(cct, 10) << __func__ << " 0x" << std::hex << r.first << "~" << r.second + << std::dec << dendl; + } + _free_l2(release_set); +} + + +void BitmapFastAllocator::init_add_free(uint64_t offset, uint64_t length) +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + _mark_free(offset, length); +} +void BitmapFastAllocator::init_rm_free(uint64_t offset, uint64_t length) +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + _mark_allocated(offset, length); +} + +void BitmapFastAllocator::shutdown() +{ + ldout(cct, 1) << __func__ << dendl; + last_pos = 0; +} diff --git a/src/os/bluestore/BitmapFastAllocator.h b/src/os/bluestore/BitmapFastAllocator.h new file mode 100755 index 0000000000000..ac58f2a5a6368 --- /dev/null +++ b/src/os/bluestore/BitmapFastAllocator.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H +#define CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H + +#include + +#include "Allocator.h" +#include "os/bluestore/bluestore_types.h" +#include "fastbmap_allocator_impl.h" +#include "include/mempool.h" +#include "common/debug.h" + +class BitmapFastAllocator : public Allocator, + public AllocatorLevel02 { + CephContext* cct; + uint64_t last_pos = 0; + +public: + BitmapFastAllocator(CephContext* _cct, int64_t capacity, int64_t alloc_unit); + ~BitmapFastAllocator() override + { + } + + + int64_t allocate( + uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, + int64_t hint, PExtentVector *extents) override; + + void release( + const interval_set& release_set) override; + + uint64_t get_free() override + { + return get_available(); + } + + void dump() override + { + } + + void init_add_free(uint64_t offset, uint64_t length) override; + void init_rm_free(uint64_t offset, uint64_t length) override; + + void shutdown() override; +}; + +#endif diff --git a/src/os/bluestore/fastbmap_allocator_impl.cc b/src/os/bluestore/fastbmap_allocator_impl.cc new file mode 100755 index 0000000000000..eced4d43089ed --- /dev/null +++ b/src/os/bluestore/fastbmap_allocator_impl.cc @@ -0,0 +1,359 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Bitmap based in-memory allocator implementation. 
+ * Author: Igor Fedotov, ifedotov@suse.com + * + */ + +#include "fastbmap_allocator_impl.h" + +uint64_t AllocatorLevel::l0_dives = 0; +uint64_t AllocatorLevel::l0_iterations = 0; +uint64_t AllocatorLevel::l0_inner_iterations = 0; +uint64_t AllocatorLevel::alloc_fragments = 0; +uint64_t AllocatorLevel::alloc_fragments_fast = 0; +uint64_t AllocatorLevel::l2_allocs = 0; + +void AllocatorLevel01Loose::_analyze_partials(uint64_t pos_start, + uint64_t pos_end, uint64_t length, uint64_t min_length, int mode, + search_ctx_t* ctx) +{ + auto d = CHILD_PER_SLOT; + assert((pos_start % d) == 0); + assert((pos_end % d) == 0); + + uint64_t l0_w = slotset_width * CHILD_PER_SLOT_L0; + + uint64_t l1_pos = pos_start; + bool prev_pos_partial = false; + uint64_t next_free_l1_pos = 0; + for (auto pos = pos_start / d; pos < pos_end / d; ++pos) { + slot_t slot_val = l1[pos]; + // FIXME minor: code below can be optimized to check slot_val against + // all_slot_set(_clear) + + for (auto c = 0; c < d; c++) { + switch (slot_val & L1_ENTRY_MASK) { + case L1_ENTRY_FREE: + prev_pos_partial = false; + if (!ctx->free_count) { + ctx->free_l1_pos = l1_pos; + } else if (l1_pos != next_free_l1_pos){ + break; + } + next_free_l1_pos = l1_pos + 1; + ++ctx->free_count; + if (mode == STOP_ON_EMPTY) { + return; + } + break; + case L1_ENTRY_FULL: + prev_pos_partial = false; + break; + case L1_ENTRY_PARTIAL: + uint64_t l; + uint64_t p0 = 0; + uint64_t p1 = 0; + ++ctx->partial_count; + + if (!prev_pos_partial) { + l = _get_longest_from_l0(l1_pos * l0_w, (l1_pos + 1) * l0_w, &p0, &p1); + prev_pos_partial = true; + } else { + l = _get_longest_from_l0((l1_pos - 1) * l0_w, (l1_pos + 1) * l0_w, &p0, &p1); + } + if (l >= length) { + if ((ctx->min_affordable_len == 0) || + ((ctx->min_affordable_len != 0) && + (l - length < ctx->min_affordable_len - length))) { + ctx->min_affordable_len = l; + ctx->affordable_l0_pos_start = p0; + ctx->affordable_l0_pos_end = p1; + } + } + if (l > ctx->max_len) { + ctx->max_len = l; + ctx->max_l0_pos_start = p0; + ctx->max_l0_pos_end = p1; + } + if (mode == STOP_ON_PARTIAL) { + return; + } + break; + } + slot_val >>= L1_ENTRY_WIDTH; + ++l1_pos; + } + } + ctx->fully_processed = true; +} + +void AllocatorLevel01Loose::_mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end) +{ + if (l0_pos == l0_pos_end) { + return; + } + auto d0 = bits_per_slotset; + uint64_t l1_w = CHILD_PER_SLOT; + // this should be aligned with slotset boundaries + assert(0 == (l0_pos % d0)); + assert(0 == (l0_pos_end % d0)); + + int64_t idx = l0_pos / bits_per_slot; + int64_t idx_end = l0_pos_end / bits_per_slot; + bool was_all_free = true; + bool was_all_allocated = true; + + auto l1_pos = l0_pos / d0; + + while (idx < idx_end) { + if (l0[idx] == all_slot_clear) { + was_all_free = false; + + // if not all prev slots are allocated then no need to check the + // current slot set, it's partial + ++idx; + idx = + was_all_allocated ? idx : p2roundup(idx, int64_t(slotset_width)); + } else if (l0[idx] == all_slot_set) { + // all free + was_all_allocated = false; + // if not all prev slots are free then no need to check the + // current slot set, it's partial + ++idx; + idx = was_all_free ? 
idx : p2roundup(idx, int64_t(slotset_width)); + } else { + // no need to check the current slot set, it's partial + was_all_free = false; + was_all_allocated = false; + ++idx; + idx = p2roundup(idx, int64_t(slotset_width)); + } + if ((idx % slotset_width) == 0) { + + uint64_t shift = (l1_pos % l1_w) * L1_ENTRY_WIDTH; + slot_t& slot_val = l1[l1_pos / l1_w]; + slot_val &= ~(uint64_t(L1_ENTRY_MASK) << shift); + + if (was_all_allocated) { + assert(!was_all_free); + slot_val |= uint64_t(L1_ENTRY_FULL) << shift; + } else if (was_all_free) { + assert(!was_all_allocated); + slot_val |= uint64_t(L1_ENTRY_FREE) << shift; + } else { + slot_val |= uint64_t(L1_ENTRY_PARTIAL) << shift; + } + was_all_free = true; + was_all_allocated = true; + ++l1_pos; + } + } +} + +void AllocatorLevel01Loose::_mark_alloc_l0(int64_t l0_pos_start, + int64_t l0_pos_end) +{ + auto d0 = CHILD_PER_SLOT_L0; + + int64_t pos = l0_pos_start; + slot_t bits = (slot_t)1 << (l0_pos_start % d0); + + while (pos < std::min(l0_pos_end, (int64_t)p2roundup(l0_pos_start, d0))) { + l0[pos / d0] &= ~bits; + bits <<= 1; + pos++; + } + + while (pos < std::min(l0_pos_end, (int64_t)p2align(l0_pos_end, d0))) { + l0[pos / d0] = all_slot_clear; + pos += d0; + } + bits = 1; + while (pos < l0_pos_end) { + l0[pos / d0] &= ~bits; + bits <<= 1; + pos++; + } +} + +interval_t AllocatorLevel01Loose::_allocate_l1(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t pos_start, uint64_t pos_end) +{ + interval_t res = { 0, 0 }; + uint64_t l0_w = slotset_width * CHILD_PER_SLOT_L0; + + if (length <= l0_granularity) { + search_ctx_t ctx; + _analyze_partials(pos_start, pos_end, l0_granularity, l0_granularity, + STOP_ON_PARTIAL, &ctx); + + // check partially free slot sets first (including neighboring), + // full length match required. + if (ctx.min_affordable_len) { + // allocate as specified + assert(ctx.min_affordable_len >= length); + auto pos_end = ctx.affordable_l0_pos_start + 1; + _mark_alloc_l1_l0(ctx.affordable_l0_pos_start, pos_end); + res = interval_t(ctx.affordable_l0_pos_start * l0_granularity, length); + return res; + } + + // allocate from free slot sets + if (ctx.free_count) { + auto l = std::min(length, ctx.free_count * l1_granularity); + assert((l % l0_granularity) == 0); + auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity; + + _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end); + res = interval_t(ctx.free_l1_pos * l1_granularity, l); + return res; + } + } else if (length == l1_granularity) { + search_ctx_t ctx; + _analyze_partials(pos_start, pos_end, length, min_length, STOP_ON_EMPTY, &ctx); + + // allocate exactly matched entry if any + if (ctx.free_count) { + auto l = std::min(length, ctx.free_count * l1_granularity); + assert((l % l0_granularity) == 0); + auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity; + + _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end); + res = interval_t(ctx.free_l1_pos * l1_granularity, l); + + return res; + } + + // we can terminate earlier on free entry only + assert(ctx.fully_processed); + + // check partially free slot sets first (including neighboring), + // full length match required. 
+ if (ctx.min_affordable_len) { + assert(ctx.min_affordable_len >= length); + assert((length % l0_granularity) == 0); + auto pos_end = ctx.affordable_l0_pos_start + length / l0_granularity; + _mark_alloc_l1_l0(ctx.affordable_l0_pos_start, pos_end); + res = interval_t(ctx.affordable_l0_pos_start * l0_granularity, length); + return res; + } + if (ctx.max_len >= min_length) { + assert((ctx.max_len % l0_granularity) == 0); + auto pos_end = ctx.max_l0_pos_start + ctx.max_len / l0_granularity; + _mark_alloc_l1_l0(ctx.max_l0_pos_start, pos_end); + res = interval_t(ctx.max_l0_pos_start * l0_granularity, ctx.max_len); + return res; + } + } else { + search_ctx_t ctx; + _analyze_partials(pos_start, pos_end, length, min_length, NO_STOP, &ctx); + assert(ctx.fully_processed); + // check partially free slot sets first (including neighboring), + // full length match required. + if (ctx.min_affordable_len) { + assert(ctx.min_affordable_len >= length); + assert((length % l0_granularity) == 0); + auto pos_end = ctx.affordable_l0_pos_start + length / l0_granularity; + _mark_alloc_l1_l0(ctx.affordable_l0_pos_start, pos_end); + res = interval_t(ctx.affordable_l0_pos_start * l0_granularity, length); + return res; + } + // allocate exactly matched entry if any + if (ctx.free_count) { + + auto l = std::min(length, ctx.free_count * l1_granularity); + assert((l % l0_granularity) == 0); + auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity; + + _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end); + res = interval_t(ctx.free_l1_pos * l1_granularity, l); + return res; + } + if (ctx.max_len >= min_length) { + assert((ctx.max_len % l0_granularity) == 0); + auto pos_end = ctx.max_l0_pos_start + ctx.max_len / l0_granularity; + _mark_alloc_l1_l0(ctx.max_l0_pos_start, pos_end); + res = interval_t(ctx.max_l0_pos_start * l0_granularity, ctx.max_len); + return res; + } + } + return res; +} + +bool AllocatorLevel01Loose::_allocate_l1(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t l1_pos_start, uint64_t l1_pos_end, + uint64_t* allocated, + interval_vector_t* res) +{ + uint64_t d0 = CHILD_PER_SLOT_L0; + uint64_t d1 = CHILD_PER_SLOT; + + assert(0 == (l1_pos_start % (slotset_width * d1))); + assert(0 == (l1_pos_end % (slotset_width * d1))); + if (min_length != l0_granularity) { + // probably not the most effecient way but + // don't care much about that at the moment + bool has_space = true; + while (length > *allocated && has_space) { + interval_t i = + _allocate_l1(length - *allocated, min_length, max_length, + l1_pos_start, l1_pos_end); + if (i.length == 0) { + has_space = false; + } else { + _fragment_and_emplace(max_length, i.offset, i.length, res); + *allocated += i.length; + } + } + } else { + uint64_t l0_w = slotset_width * d0; + + for (auto idx = l1_pos_start / d1; + idx < l1_pos_end / d1 && length > *allocated; + ++idx) { + slot_t& slot_val = l1[idx]; + if (slot_val == all_slot_clear) { + continue; + } else if (slot_val == all_slot_set) { + uint64_t to_alloc = std::min(length - *allocated, + l1_granularity * d1); + *allocated += to_alloc; + ++alloc_fragments_fast; + _fragment_and_emplace(max_length, idx * d1 * l1_granularity, to_alloc, + res); + _mark_alloc_l1_l0(idx * d1 * bits_per_slotset, idx * d1 * bits_per_slotset + to_alloc / l0_granularity); + continue; + } + auto free_pos = find_next_set_bit(slot_val, 0); + assert(free_pos < bits_per_slot); + do { + assert(length > *allocated); + + bool empty; + empty = _allocate_l0(length, min_length, max_length, + (idx * d1 + free_pos / 
L1_ENTRY_WIDTH) * l0_w, + (idx * d1 + free_pos / L1_ENTRY_WIDTH + 1) * l0_w, + allocated, + res); + slot_val &= (~slot_t(L1_ENTRY_MASK)) << free_pos; + if (empty) { + // the next line is no op with the current L1_ENTRY_FULL but left + // as-is for the sake of uniformity and to avoid potential errors + // in future + slot_val |= slot_t(L1_ENTRY_FULL) << free_pos; + } else { + slot_val |= slot_t(L1_ENTRY_PARTIAL) << free_pos; + } + if (length <= *allocated || slot_val == all_slot_clear) { + break; + } + free_pos = find_next_set_bit(slot_val, free_pos + L1_ENTRY_WIDTH); + } while (free_pos < bits_per_slot); + } + } + return _is_empty_l1(l1_pos_start, l1_pos_end); +} diff --git a/src/os/bluestore/fastbmap_allocator_impl.h b/src/os/bluestore/fastbmap_allocator_impl.h new file mode 100755 index 0000000000000..d8f7b1d8b76bd --- /dev/null +++ b/src/os/bluestore/fastbmap_allocator_impl.h @@ -0,0 +1,775 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Bitmap based in-memory allocator implementation. + * Author: Igor Fedotov, ifedotov@suse.com + * + */ + +#ifndef __FAST_BITMAP_ALLOCATOR_IMPL_H +#define __FAST_BITMAP_ALLOCATOR_IMPL_H +#include "include/intarith.h" + +#include +#include +#include + +typedef uint64_t slot_t; + +#ifdef NON_CEPH_BUILD +#include +struct interval_t +{ + uint64_t offset = 0; + uint32_t length = 0; + + interval_t() {} + interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {} + interval_t(const interval_t &ext) : + offset(ext.offset), length(ext.length) {} +}; +typedef std::vector interval_vector_t; +typedef std::vector slot_vector_t; +#else +#include "include/assert.h" +#include "os/bluestore/bluestore_types.h" +#include "include/mempool.h" + +typedef bluestore_pextent_t interval_t; +typedef PExtentVector interval_vector_t; + +typedef mempool::bluestore_alloc::vector slot_vector_t; + +#endif + +// fitting into cache line on x86_64 +static const size_t slotset_width = 8; // 8 slots per set +static const size_t slotset_bytes = sizeof(slot_t) * slotset_width; +static const size_t bits_per_slot = sizeof(slot_t) * 8; +static const size_t bits_per_slotset = slotset_bytes * 8; +static const slot_t all_slot_set = 0xffffffffffffffff; +static const slot_t all_slot_clear = 0; + +inline size_t find_next_set_bit(slot_t slot_val, size_t start_pos) +{ +#ifdef __GNUC__ + if (start_pos == 0) { + start_pos = __builtin_ffsll(slot_val); + return start_pos ? 
start_pos - 1 : bits_per_slot; + } +#endif + slot_t mask = slot_t(1) << start_pos; + while (start_pos < bits_per_slot && !(slot_val & mask)) { + mask <<= 1; + ++start_pos; + } + return start_pos; +} + + +class AllocatorLevel +{ +protected: + + virtual uint64_t _children_per_slot() const = 0; + virtual uint64_t _level_granularity() const = 0; + +public: + static uint64_t l0_dives; + static uint64_t l0_iterations; + static uint64_t l0_inner_iterations; + static uint64_t alloc_fragments; + static uint64_t alloc_fragments_fast; + static uint64_t l2_allocs; + + virtual ~AllocatorLevel() + {} + +}; + +class AllocatorLevel01 : public AllocatorLevel +{ +protected: + slot_vector_t l0; // set bit means free entry + slot_vector_t l1; + uint64_t l0_granularity = 0; // space per entry + uint64_t l1_granularity = 0; // space per entry + + uint64_t _level_granularity() const override + { + return l1_granularity; + } + + inline bool _is_slot_fully_allocated(uint64_t idx) const { + return l1[idx] == all_slot_clear; + } +}; + +template +class AllocatorLevel02; + +class AllocatorLevel01Loose : public AllocatorLevel01 +{ + enum { + L1_ENTRY_WIDTH = 2, + L1_ENTRY_MASK = (1 << L1_ENTRY_WIDTH) - 1, + L1_ENTRY_FULL = 0x00, + L1_ENTRY_PARTIAL = 0x01, + L1_ENTRY_FREE = 0x03, + CHILD_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, // 32 + CHILD_PER_SLOT_L0 = bits_per_slot, // 64 + }; + uint64_t _children_per_slot() const override + { + return CHILD_PER_SLOT; + } + + uint64_t _get_longest_from_l0(uint64_t pos0, uint64_t pos1, + uint64_t* pos0_res, uint64_t* pos1_res) const + { + uint64_t res = 0; + if (pos0 >= pos1) { + return res; + } + + uint64_t res_candidate = 0; + + auto pos = pos0; + uint64_t pos_free_start = 0; + bool was_free = false; + auto d = slotset_width * 8; + slot_t bits = l0[pos / d]; + bits >>= pos % d; + bool end_loop = false; + do { + if ((pos % d) == 0) { + bits = l0[pos / d]; + if (pos1 - pos >= d) { + if (bits == all_slot_set) { + // slot is totally free + if (was_free) { + res_candidate += d; + } else { + was_free = true; + res_candidate = d; + pos_free_start = pos; + } + pos += d; + end_loop = pos >= pos1; + if (end_loop && res < res_candidate) { + *pos0_res = pos_free_start; + *pos1_res = pos; + res = res_candidate; + res_candidate = 0; + } + continue; + } + if (bits == all_slot_clear) { + // slot is totally allocated + if (was_free) { + if (res < res_candidate) { + *pos0_res = pos_free_start; + *pos1_res = pos; + res = res_candidate; + res_candidate = 0; + } + was_free = false; + } + pos += d; + end_loop = pos >= pos1; + continue; + } + } + } //if ((pos % d) == 0) + + if ((bits & 1) == 1) { + // item is free + ++res_candidate; + if (!was_free) { + pos_free_start = pos; + was_free = true; + } + } + end_loop = ++pos >= pos1; + if (was_free && (end_loop || !(bits & 1))) { + if (res < res_candidate) { + *pos0_res = pos_free_start; + *pos1_res = pos; + res = res_candidate; + } + res_candidate = 0; + was_free = false; + } + bits >>= 1; + } while (!end_loop); + res *= l0_granularity; + return res; + } + + inline void _fragment_and_emplace(uint64_t max_length, uint64_t offset, + uint64_t len, + interval_vector_t* res) + { + if (max_length) { + while (len > max_length) { + res->emplace_back(offset, max_length); + offset += max_length; + len -= max_length; + } + } + res->emplace_back(offset, len); + } + + bool _allocate_l0(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t l0_pos0, uint64_t l0_pos1, + uint64_t* allocated, + interval_vector_t* res) + { + uint64_t d0 = CHILD_PER_SLOT_L0; 
+ + ++l0_dives; + + assert(l0_pos0 < l0_pos1); + assert(length > *allocated); + assert(0 == (l0_pos0 % (slotset_width * d0))); + assert(0 == (l0_pos1 % (slotset_width * d0))); + assert(max_length == 0 || max_length >= min_length); + assert(max_length == 0 || (max_length % min_length) == 0); + assert(((length - *allocated) % l0_granularity) == 0); + + uint64_t need_entries = (length - *allocated) / l0_granularity; + + for (auto idx = l0_pos0 / d0; (idx < l0_pos1 / d0) && (length > *allocated); + ++idx) { + ++l0_iterations; + slot_t& slot_val = l0[idx]; + auto base = idx * d0; + if (slot_val == all_slot_clear) { + continue; + } else if (slot_val == all_slot_set) { + uint64_t to_alloc = std::min(need_entries, d0); + *allocated += to_alloc * l0_granularity; + ++alloc_fragments; + need_entries -= to_alloc; + + _fragment_and_emplace(max_length, base * l0_granularity, + to_alloc * l0_granularity, res); + + if (to_alloc == d0) { + slot_val = all_slot_clear; + } else { + _mark_alloc_l0(base, base + to_alloc); + } + continue; + } + + auto free_pos = find_next_set_bit(slot_val, 0); + assert(free_pos < bits_per_slot); + auto next_pos = free_pos + 1; + while (next_pos < bits_per_slot && + (next_pos - free_pos) < need_entries) { + ++l0_inner_iterations; + + if (0 == (slot_val & (slot_t(1) << next_pos))) { + auto to_alloc = (next_pos - free_pos); + *allocated += to_alloc * l0_granularity; + ++alloc_fragments; + need_entries -= to_alloc; + _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity, + to_alloc * l0_granularity, res); + _mark_alloc_l0(base + free_pos, base + next_pos); + free_pos = find_next_set_bit(slot_val, next_pos + 1); + next_pos = free_pos + 1; + } else { + ++next_pos; + } + } + if (need_entries && free_pos < bits_per_slot) { + auto to_alloc = std::min(need_entries, d0 - free_pos); + *allocated += to_alloc * l0_granularity; + ++alloc_fragments; + need_entries -= to_alloc; + _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity, + to_alloc * l0_granularity, res); + _mark_alloc_l0(base + free_pos, base + free_pos + to_alloc); + } + } + return _is_empty_l0(l0_pos0, l0_pos1); + } + +protected: + + friend class AllocatorLevel02; + + void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true) + { + l0_granularity = _alloc_unit; + // 512 bits at L0 mapped to L1 entry + l1_granularity = l0_granularity * bits_per_slotset; + + // capacity to have slot alignment at l1 + auto aligned_capacity = + p2roundup((int64_t)capacity, + int64_t(l1_granularity * slotset_width * _children_per_slot())); + size_t slot_count = + aligned_capacity / l1_granularity / _children_per_slot(); + // we use set bit(s) as a marker for (partially) free entry + l1.resize(slot_count, mark_as_free ? all_slot_set : all_slot_clear); + + // l0 slot count + slot_count = aligned_capacity / _alloc_unit / bits_per_slot; + // we use set bit(s) as a marker for (partially) free entry + l0.resize(slot_count, mark_as_free ? 
all_slot_set : all_slot_clear); + + if (mark_as_free) { + auto l0_pos_no_use = p2roundup((int64_t)capacity, (int64_t)l0_granularity) / l0_granularity; + _mark_alloc_l1_l0(l0_pos_no_use, aligned_capacity / l0_granularity); + } + } + + struct search_ctx_t + { + size_t partial_count = 0; + size_t free_count = 0; + uint64_t free_l1_pos = 0; + + uint64_t max_len = 0; + uint64_t max_l0_pos_start = 0; + uint64_t max_l0_pos_end = 0; + uint64_t min_affordable_len = 0; + uint64_t affordable_l0_pos_start = 0; + uint64_t affordable_l0_pos_end = 0; + + bool fully_processed = false; + + void reset() + { + *this = search_ctx_t(); + } + }; + enum { + NO_STOP, + STOP_ON_EMPTY, + STOP_ON_PARTIAL, + }; + void _analyze_partials(uint64_t pos_start, uint64_t pos_end, + uint64_t length, uint64_t min_length, int mode, + search_ctx_t* ctx); + + void _mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end); + void _mark_alloc_l0(int64_t l0_pos_start, int64_t l0_pos_end); + + void _mark_alloc_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end) + { + _mark_alloc_l0(l0_pos_start, l0_pos_end); + l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset)); + l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset)); + _mark_l1_on_l0(l0_pos_start, l0_pos_end); + } + + void _mark_free_l0(int64_t l0_pos_start, int64_t l0_pos_end) + { + auto d0 = CHILD_PER_SLOT_L0; + + auto pos = l0_pos_start; + slot_t bits = (slot_t)1 << (l0_pos_start % d0); + slot_t& val_s = l0[pos / d0]; + int64_t pos_e = std::min(l0_pos_end, (int64_t)p2roundup(l0_pos_start + 1, d0)); + while (pos < pos_e) { + val_s |= bits; + bits <<= 1; + pos++; + } + pos_e = std::min(l0_pos_end, (int64_t)p2align(l0_pos_end, d0)); + auto idx = pos / d0; + while (pos < pos_e) { + l0[idx++] = all_slot_set; + pos += d0; + } + bits = 1; + uint64_t& val_e = l0[pos / d0]; + while (pos < l0_pos_end) { + val_e |= bits; + bits <<= 1; + pos++; + } + } + + void _mark_free_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end) + { + _mark_free_l0(l0_pos_start, l0_pos_end); + l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset)); + l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset)); + _mark_l1_on_l0(l0_pos_start, l0_pos_end); + } + + bool _is_empty_l0(uint64_t l0_pos, uint64_t l0_pos_end) + { + bool no_free = true; + uint64_t d = slotset_width * CHILD_PER_SLOT_L0; + assert(0 == (l0_pos % d)); + assert(0 == (l0_pos_end % d)); + + auto idx = l0_pos / CHILD_PER_SLOT_L0; + auto idx_end = l0_pos_end / CHILD_PER_SLOT_L0; + while (idx < idx_end && no_free) { + no_free = l0[idx] == all_slot_clear; + ++idx; + } + return no_free; + } + bool _is_empty_l1(uint64_t l1_pos, uint64_t l1_pos_end) + { + bool no_free = true; + uint64_t d = slotset_width * _children_per_slot(); + assert(0 == (l1_pos % d)); + assert(0 == (l1_pos_end % d)); + + auto idx = l1_pos / CHILD_PER_SLOT; + auto idx_end = l1_pos_end / CHILD_PER_SLOT; + while (idx < idx_end && no_free) { + no_free = _is_slot_fully_allocated(idx); + ++idx; + } + return no_free; + } + + interval_t _allocate_l1(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t pos_start, uint64_t pos_end); + + bool _allocate_l1(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t l1_pos_start, uint64_t l1_pos_end, + uint64_t* allocated, + interval_vector_t* res); + + uint64_t _mark_alloc_l1(const interval_t& r) + { + uint64_t l0_pos_start = r.offset / l0_granularity; + uint64_t l0_pos_end = p2roundup(r.offset + r.length, l0_granularity) / l0_granularity; + _mark_alloc_l1_l0(l0_pos_start, l0_pos_end); + return 
l0_granularity * (l0_pos_end - l0_pos_start); + } + + uint64_t _free_l1(uint64_t offs, uint64_t len) + { + uint64_t l0_pos_start = offs / l0_granularity; + uint64_t l0_pos_end = p2roundup(offs + len, l0_granularity) / l0_granularity; + _mark_free_l1_l0(l0_pos_start, l0_pos_end); + return l0_granularity * (l0_pos_end - l0_pos_start); + } + +public: + uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0) + { + if (pos1 == 0) { + pos1 = l1.size() * CHILD_PER_SLOT; + } + auto avail = debug_get_free(pos0, pos1); + return (pos1 - pos0) * l1_granularity - avail; + } + + uint64_t debug_get_free(uint64_t l1_pos0 = 0, uint64_t l1_pos1 = 0) + { + assert(0 == (l1_pos0 % CHILD_PER_SLOT)); + assert(0 == (l1_pos1 % CHILD_PER_SLOT)); + + auto idx0 = l1_pos0 * slotset_width; + auto idx1 = l1_pos1 * slotset_width; + + if (idx1 == 0) { + idx1 = l0.size(); + } + + uint64_t res = 0; + for (uint64_t i = idx0; i < idx1; ++i) { + auto v = l0[i]; + if (v == all_slot_set) { + res += CHILD_PER_SLOT_L0; + } else if (v != all_slot_clear) { + size_t cnt = 0; +#ifdef __GNUC__ + cnt = __builtin_popcountll(v); +#else + // Kernighan's Alg to count set bits + while (v) { + v &= (v - 1); + cnt++; + } +#endif + res += cnt; + } + } + return res * l0_granularity; + } +}; + +class AllocatorLevel01Compact : public AllocatorLevel01 +{ + uint64_t _children_per_slot() const override + { + return 8; + } +public: +}; + +template +class AllocatorLevel02 : public AllocatorLevel +{ +public: + uint64_t debug_get_free(uint64_t pos0 = 0, uint64_t pos1 = 0) + { + std::lock_guard l(lock); + return l1.debug_get_free(pos0 * l1._children_per_slot() * bits_per_slot, + pos1 * l1._children_per_slot() * bits_per_slot); + } + uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0) + { + std::lock_guard l(lock); + return l1.debug_get_allocated(pos0 * l1._children_per_slot() * bits_per_slot, + pos1 * l1._children_per_slot() * bits_per_slot); + } + + uint64_t get_available() + { + std::lock_guard l(lock); + return available; + } + +protected: + std::mutex lock; + L1 l1; + slot_vector_t l2; + uint64_t l2_granularity = 0; // space per entry + uint64_t available = 0; + + enum { + CHILD_PER_SLOT = bits_per_slot, // 64 + }; + + uint64_t _children_per_slot() const override + { + return CHILD_PER_SLOT; + } + uint64_t _level_granularity() const override + { + return l2_granularity; + } + + void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true) + { + l1._init(capacity, _alloc_unit, mark_as_free); + + l2_granularity = + l1._level_granularity() * l1._children_per_slot() * slotset_width; + + // capacity to have slot alignment at l2 + auto aligned_capacity = + p2roundup((int64_t)capacity, (int64_t)l2_granularity * CHILD_PER_SLOT); + size_t elem_count = aligned_capacity / l2_granularity / CHILD_PER_SLOT; + // we use set bit(s) as a marker for (partially) free entry + l2.resize(elem_count, mark_as_free ? 
all_slot_set : all_slot_clear); + + if (mark_as_free) { + // capacity to have slotset alignment at l1 + auto l2_pos_no_use = + p2roundup((int64_t)capacity, (int64_t)l2_granularity) / l2_granularity; + _mark_l2_allocated(l2_pos_no_use, aligned_capacity / l2_granularity); + available = p2align(capacity, _alloc_unit); + } else { + available = 0; + } + } + + void _mark_l2_allocated(int64_t l2_pos, int64_t l2_pos_end) + { + auto d = CHILD_PER_SLOT; + assert(0 <= l2_pos_end); + assert((int64_t)l2.size() >= (l2_pos_end / d)); + + while (l2_pos < l2_pos_end) { + l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d)); + ++l2_pos; + } + } + + void _mark_l2_free(int64_t l2_pos, int64_t l2_pos_end) + { + auto d = CHILD_PER_SLOT; + assert(0 <= l2_pos_end); + assert((int64_t)l2.size() >= (l2_pos_end / d)); + + while (l2_pos < l2_pos_end) { + l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d)); + ++l2_pos; + } + } + + void _mark_l2_on_l1(int64_t l2_pos, int64_t l2_pos_end) + { + auto d = CHILD_PER_SLOT; + assert(0 <= l2_pos_end); + assert((int64_t)l2.size() >= (l2_pos_end / d)); + + auto idx = l2_pos * slotset_width; + auto idx_end = l2_pos_end * slotset_width; + bool all_allocated = true; + while (idx < idx_end) { + if (!l1._is_slot_fully_allocated(idx)) { + all_allocated = false; + idx = p2roundup(int64_t(++idx), int64_t(slotset_width)); + } + else { + ++idx; + } + if ((idx % slotset_width) == 0) { + if (all_allocated) { + l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d)); + } + else { + l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d)); + } + all_allocated = true; + ++l2_pos; + } + } + } + + void _allocate_l2(uint64_t length, + uint64_t min_length, + uint64_t max_length, + uint64_t* pos_start, + uint64_t* allocated, + interval_vector_t* res) + { + uint64_t prev_allocated = *allocated; + uint64_t d = CHILD_PER_SLOT; + assert((*pos_start % d) == 0); + assert(min_length <= l2_granularity); + assert(max_length == 0 || max_length >= min_length); + assert(max_length == 0 || (max_length % min_length) == 0); + + uint64_t l1_w = slotset_width * l1._children_per_slot(); + + auto last_pos = *pos_start; + + auto l2_pos = *pos_start; + std::lock_guard l(lock); + + if (available < min_length) { + return; + } + auto pos = *pos_start / d; + auto pos_end = l2.size(); + // outer loop below is intended to optimize the performance by + // avoiding 'modulo' operations inside the internal loop. 
+ // Looks like they have negative impact on the performance + for (auto i = 0; i < 2; ++i) { + for(; length > *allocated && pos < pos_end; ++pos) { + slot_t& slot_val = l2[pos]; + size_t free_pos = 0; + bool all_set = false; + if (slot_val == all_slot_clear) { + l2_pos += d; + last_pos = l2_pos; + continue; + } else if (slot_val == all_slot_set) { + free_pos = 0; + all_set = true; + } else { + free_pos = find_next_set_bit(slot_val, 0); + assert(free_pos < bits_per_slot); + } + do { + assert(length > *allocated); + bool empty = l1._allocate_l1(length, + min_length, + max_length, + (l2_pos + free_pos)* l1_w, + (l2_pos + free_pos + 1) * l1_w, + allocated, + res); + if (empty) { + slot_val &= ~(slot_t(1) << free_pos); + } + if (length <= *allocated || slot_val == all_slot_clear) { + break; + } + ++free_pos; + if (!all_set) { + free_pos = find_next_set_bit(slot_val, free_pos); + } + } while (free_pos < bits_per_slot); + last_pos = l2_pos; + l2_pos += d; + } + l2_pos = 0; + pos = 0; + pos_end = *pos_start; + } + + ++l2_allocs; + auto allocated_here = *allocated - prev_allocated; + assert(available >= allocated_here); + available -= allocated_here; + *pos_start = last_pos; + } + +#ifndef NON_CEPH_BUILD + // to provide compatibility with BlueStore's allocator interface + void _free_l2(const interval_set & rr) + { + uint64_t released = 0; + std::lock_guard l(lock); + for (auto r : rr) { + released += l1._free_l1(r.first, r.second); + uint64_t l2_pos = r.first / l2_granularity; + uint64_t l2_pos_end = p2roundup(int64_t(r.first + r.second), int64_t(l2_granularity)) / l2_granularity; + + _mark_l2_free(l2_pos, l2_pos_end); + } + available += released; + } +#endif + + template + void _free_l2(const T& rr) + { + uint64_t released = 0; + std::lock_guard l(lock); + for (auto r : rr) { + released += l1._free_l1(r.offset, r.length); + uint64_t l2_pos = r.offset / l2_granularity; + uint64_t l2_pos_end = p2roundup(int64_t(r.offset + r.length), int64_t(l2_granularity)) / l2_granularity; + + _mark_l2_free(l2_pos, l2_pos_end); + } + available += released; + } + + void _mark_allocated(uint64_t o, uint64_t len) + { + uint64_t l2_pos = o / l2_granularity; + uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity; + + std::lock_guard l(lock); + auto allocated = l1._mark_alloc_l1(interval_t(o, len)); + assert(available >= allocated); + available -= allocated; + _mark_l2_on_l1(l2_pos, l2_pos_end); + } + + void _mark_free(uint64_t o, uint64_t len) + { + uint64_t l2_pos = o / l2_granularity; + uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity; + + std::lock_guard l(lock); + available += l1._free_l1(o, len); + _mark_l2_free(l2_pos, l2_pos_end); + } +}; + +#endif diff --git a/src/test/objectstore/Allocator_test.cc b/src/test/objectstore/Allocator_test.cc index 1dda2a9992cdd..3e4f0bb095afd 100644 --- a/src/test/objectstore/Allocator_test.cc +++ b/src/test/objectstore/Allocator_test.cc @@ -293,7 +293,6 @@ TEST_P(AllocTest, test_alloc_bench_seq) for (uint64_t i = 0; i < capacity; i += want_size) { tmp.clear(); - EXPECT_EQ(0, alloc->reserve(want_size)); EXPECT_EQ(want_size, alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); if (0 == (i % (1 * 1024 * _1m))) { std::cout << "alloc " << i / 1024 / 1024 << " mb of " @@ -411,14 +410,13 @@ TEST_P(AllocTest, test_alloc_bench) for (uint64_t i = 0; i < capacity * 2; ) { uint32_t want = alloc_unit << u1(rng); - auto r = alloc->reserve(want); - if (r != 0) { + auto r = alloc->allocate(want, alloc_unit, 0, 0, 
&tmp); + if (r <= 0) { break; } i += want; tmp.clear(); - EXPECT_EQ(want, alloc->allocate(want, alloc_unit, 0, 0, &tmp)); for(auto a : tmp) { bool full = !at.push(a.offset, a.length); EXPECT_EQ(full, false); diff --git a/src/test/objectstore/CMakeLists.txt b/src/test/objectstore/CMakeLists.txt index 7017d6419f25d..e2666870a703a 100644 --- a/src/test/objectstore/CMakeLists.txt +++ b/src/test/objectstore/CMakeLists.txt @@ -103,6 +103,7 @@ if(WITH_BLUESTORE) add_ceph_unittest(unittest_bit_alloc) target_link_libraries(unittest_bit_alloc os global) + add_executable(unittest_alloc Allocator_test.cc $ @@ -110,6 +111,16 @@ if(WITH_BLUESTORE) add_ceph_unittest(unittest_alloc) target_link_libraries(unittest_alloc os global) + add_executable(unittest_fastbmap_allocator + fastbmap_allocator_test.cc + $ + ) + add_ceph_unittest(unittest_fastbmap_allocator) + target_link_libraries(unittest_fastbmap_allocator os global) + + set_target_properties(unittest_fastbmap_allocator PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_bluefs add_executable(unittest_bluefs test_bluefs.cc diff --git a/src/test/objectstore/fastbmap_allocator_test.cc b/src/test/objectstore/fastbmap_allocator_test.cc new file mode 100755 index 0000000000000..acddb3f3723a5 --- /dev/null +++ b/src/test/objectstore/fastbmap_allocator_test.cc @@ -0,0 +1,584 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "os/bluestore/fastbmap_allocator_impl.h" + +class TestAllocatorLevel01 : public AllocatorLevel01Loose +{ +public: + void init(uint64_t capacity, uint64_t alloc_unit) + { + _init(capacity, alloc_unit); + } + interval_t allocate_l1(uint64_t length, uint64_t min_length, + uint64_t pos_start, uint64_t pos_end) + { + return _allocate_l1(length, min_length, 0, pos_start, pos_end); + } + void free_l1(const interval_t& r) + { + _free_l1(r.offset, r.length); + } +}; + +class TestAllocatorLevel02 : public AllocatorLevel02 +{ +public: + void init(uint64_t capacity, uint64_t alloc_unit) + { + _init(capacity, alloc_unit); + } + void allocate_l2(uint64_t length, uint64_t min_length, + uint64_t* allocated0, + interval_vector_t* res) + { + uint64_t allocated = 0; + uint64_t last_pos = 0; // do hint support so far + _allocate_l2(length, min_length, 0, &last_pos, &allocated, res); + *allocated0 += allocated; + } + void free_l2(const interval_vector_t& r) + { + _free_l2(r); + } +}; + +const uint64_t _1m = 1024 * 1024; +const uint64_t _2m = 2 * 1024 * 1024; + +TEST(TestAllocatorLevel01, test_l1) +{ + TestAllocatorLevel01 al1; + uint64_t num_l1_entries = 3 * 256; + uint64_t capacity = num_l1_entries * 512 * 4096; + al1.init(capacity, 0x1000); + ASSERT_TRUE(capacity == al1.debug_get_free()); + + auto i1 = al1.allocate_l1(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i1.offset == 0); + ASSERT_TRUE(i1.length == 0x1000); + ASSERT_TRUE(capacity - 0x1000 == al1.debug_get_free()); + + auto i2 = al1.allocate_l1(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.offset == 0x1000); + ASSERT_TRUE(i2.length == 0x1000); + al1.free_l1(i2); + al1.free_l1(i1); + i1 = al1.allocate_l1(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i1.offset == 0); + ASSERT_TRUE(i1.length == 0x1000); + i2 = al1.allocate_l1(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.offset == 0x1000); + ASSERT_TRUE(i2.length == 0x1000); + al1.free_l1(i1); + al1.free_l1(i2); + + i1 = al1.allocate_l1(0x2000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i1.offset == 0); + ASSERT_TRUE(i1.length == 
0x2000); + + i2 = al1.allocate_l1(0x3000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.offset == 0x2000); + ASSERT_TRUE(i2.length == 0x3000); + + al1.free_l1(i1); + al1.free_l1(i2); + + i1 = al1.allocate_l1(0x2000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i1.offset == 0); + ASSERT_TRUE(i1.length == 0x2000); + + i2 = al1.allocate_l1(2 * 1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.offset == 2 * 1024 * 1024); + ASSERT_TRUE(i2.length == 2 * 1024 * 1024); + + al1.free_l1(i1); + i1 = al1.allocate_l1(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i1.offset == 0); + ASSERT_TRUE(i1.length == 1024 * 1024); + + auto i3 = al1.allocate_l1(1024 * 1024 + 0x1000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i3.offset == 2 * 2 * 1024 * 1024); + ASSERT_TRUE(i3.length == 1024 * 1024 + 0x1000); + + // here we have the following layout: + // Alloc: 0~1M, 2M~2M, 4M~1M+4K + // Free: 1M~1M, 4M+4K ~ 2M-4K, 6M ~... + // + auto i4 = al1.allocate_l1(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i4.offset == 1 * 1024 * 1024); + ASSERT_TRUE(i4.length == 1024 * 1024); + al1.free_l1(i4); + + i4 = al1.allocate_l1(1024 * 1024 - 0x1000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i4.offset == 5 * 1024 * 1024 + 0x1000); + ASSERT_TRUE(i4.length == 1024 * 1024 - 0x1000); + al1.free_l1(i4); + + i4 = al1.allocate_l1(1024 * 1024 + 0x1000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i4.offset == 6 * 1024 * 1024); + //ASSERT_TRUE(i4.offset == 5 * 1024 * 1024 + 0x1000); + ASSERT_TRUE(i4.length == 1024 * 1024 + 0x1000); + + al1.free_l1(i1); + al1.free_l1(i2); + al1.free_l1(i3); + al1.free_l1(i4); + + i1 = al1.allocate_l1(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i1.offset == 0); + ASSERT_TRUE(i1.length == 1024 * 1024); + + i2 = al1.allocate_l1(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.offset == 1 * 1024 * 1024); + ASSERT_TRUE(i2.length == 1024 * 1024 ); + + i3 = al1.allocate_l1(512 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i3.offset == 2 * 1024 * 1024); + ASSERT_TRUE(i3.length == 512 * 1024); + + i4 = al1.allocate_l1(1536 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i4.offset == (2 * 1024 + 512) * 1024 ); + ASSERT_TRUE(i4.length == 1536 * 1024); + // making a hole 1.5 Mb length + al1.free_l1(i2); + al1.free_l1(i3); + // and trying to fill it + i2 = al1.allocate_l1(1536 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.offset == 1024 * 1024); + ASSERT_TRUE(i2.length == 1536 * 1024); + + al1.free_l1(i2); + // and trying to fill it partially + i2 = al1.allocate_l1(1528 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.offset == 1024 * 1024); + ASSERT_TRUE(i2.length == 1528 * 1024); + + i3 = al1.allocate_l1(8 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i3.offset == 2552 * 1024); + ASSERT_TRUE(i3.length == 8 * 1024); + + al1.free_l1(i2); + // here we have the following layout: + // Alloc: 0~1M, 2552K~8K, num_l1_entries0K~1.5M + // Free: 1M~1528K, 4M ~... 
+ // + i2 = al1.allocate_l1(1536 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.offset == 4 * 1024 * 1024); + ASSERT_TRUE(i2.length == 1536 * 1024); + + al1.free_l1(i1); + al1.free_l1(i2); + al1.free_l1(i3); + al1.free_l1(i4); + ASSERT_TRUE(capacity == al1.debug_get_free()); + + for (uint64_t i = 0; i < capacity; i += _2m) { + i1 = al1.allocate_l1(_2m, _2m, 0, num_l1_entries); + ASSERT_TRUE(i1.offset == i); + ASSERT_TRUE(i1.length == _2m); + } + ASSERT_TRUE(0 == al1.debug_get_free()); + i2 = al1.allocate_l1(_2m, _2m, 0, num_l1_entries); + ASSERT_TRUE(i2.length == 0); + ASSERT_TRUE(0 == al1.debug_get_free()); + + al1.free_l1(i1); + i2 = al1.allocate_l1(_2m, _2m, 0, num_l1_entries); + ASSERT_TRUE(i2 == i1); + al1.free_l1(i2); + i2 = al1.allocate_l1(_1m, _1m, 0, num_l1_entries); + ASSERT_TRUE(i2.offset == i1.offset); + ASSERT_TRUE(i2.length == _1m); + + i3 = al1.allocate_l1(_2m, _2m, 0, num_l1_entries); + ASSERT_TRUE(i3.length == 0); + + i3 = al1.allocate_l1(_2m, _1m, 0, num_l1_entries); + ASSERT_TRUE(i3.length == _1m); + + i4 = al1.allocate_l1(_2m, _1m, 0, num_l1_entries); + ASSERT_TRUE(i4.length == 0); + + al1.free_l1(i2); + i2 = al1.allocate_l1(_2m, _2m, 0, num_l1_entries); + ASSERT_TRUE(i2.length == 0); + + i2 = al1.allocate_l1(_2m, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.length == _1m); + + al1.free_l1(i2); + al1.free_l1(i3); + ASSERT_TRUE(_2m == al1.debug_get_free()); + + i1 = al1.allocate_l1(_2m - 3 * 0x1000, 0x1000, 0, num_l1_entries); + i2 = al1.allocate_l1(0x1000, 0x1000, 0, num_l1_entries); + i3 = al1.allocate_l1(0x1000, 0x1000, 0, num_l1_entries); + i4 = al1.allocate_l1(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(0 == al1.debug_get_free()); + + al1.free_l1(i2); + al1.free_l1(i4); + + i2 = al1.allocate_l1(0x4000, 0x2000, 0, num_l1_entries); + ASSERT_TRUE(i2.length == 0); + i2 = al1.allocate_l1(0x4000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i2.length == 0x1000); + + al1.free_l1(i3); + i3 = al1.allocate_l1(0x6000, 0x3000, 0, num_l1_entries); + ASSERT_TRUE(i3.length == 0); + i3 = al1.allocate_l1(0x6000, 0x1000, 0, num_l1_entries); + ASSERT_TRUE(i3.length == 0x2000); + ASSERT_TRUE(0 == al1.debug_get_free()); + + std::cout << "Done L1" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2) +{ + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 64;// *512; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2" << std::endl; + + uint64_t allocated1 = 0; + interval_vector_t a1; + al2.allocate_l2(0x2000, 0x2000, &allocated1, &a1); + ASSERT_TRUE(allocated1 == 0x2000); + ASSERT_TRUE(a1[0].offset == 0); + ASSERT_TRUE(a1[0].length == 0x2000); + + // limit query range in debug_get_free for the sake of performance + ASSERT_TRUE(0x2000 == al2.debug_get_allocated(0, 1)); + ASSERT_TRUE(0 == al2.debug_get_allocated(1, 2)); + + uint64_t allocated2 = 0; + interval_vector_t a2; + al2.allocate_l2(0x2000, 0x2000, &allocated2, &a2); + ASSERT_TRUE(allocated2 == 0x2000); + ASSERT_TRUE(a2[0].offset == 0x2000); + ASSERT_TRUE(a2[0].length == 0x2000); + // limit query range in debug_get_free for the sake of performance + ASSERT_TRUE(0x4000 == al2.debug_get_allocated(0, 1)); + ASSERT_TRUE(0 == al2.debug_get_allocated(1, 2)); + + al2.free_l2(a1); + + allocated2 = 0; + a2.clear(); + al2.allocate_l2(0x1000, 0x1000, &allocated2, &a2); + ASSERT_TRUE(allocated2 == 0x1000); + ASSERT_TRUE(a2[0].offset == 0x0000); + ASSERT_TRUE(a2[0].length == 0x1000); + // limit query range in debug_get_free for the sake of performance + ASSERT_TRUE(0x3000 
== al2.debug_get_allocated(0, 1)); + ASSERT_TRUE(0 == al2.debug_get_allocated(1, 2)); + + uint64_t allocated3 = 0; + interval_vector_t a3; + al2.allocate_l2(0x2000, 0x1000, &allocated3, &a3); + ASSERT_TRUE(allocated3 == 0x2000); + ASSERT_TRUE(a3.size() == 2); + ASSERT_TRUE(a3[0].offset == 0x1000); + ASSERT_TRUE(a3[0].length == 0x1000); + ASSERT_TRUE(a3[1].offset == 0x4000); + ASSERT_TRUE(a3[1].length == 0x1000); + // limit query range in debug_get_free for the sake of performance + ASSERT_TRUE(0x5000 == al2.debug_get_allocated(0, 1)); + ASSERT_TRUE(0 == al2.debug_get_allocated(1, 2)); + { + interval_vector_t r; + r.emplace_back(0x0, 0x5000); + al2.free_l2(r); + } + + a3.clear(); + allocated3 = 0; + al2.allocate_l2(_1m, _1m, &allocated3, &a3); + ASSERT_TRUE(a3.size() == 1); + ASSERT_TRUE(a3[0].offset == 0); + ASSERT_TRUE(a3[0].length == _1m); + + al2.free_l2(a3); + + a3.clear(); + allocated3 = 0; + al2.allocate_l2(4 * _1m, _1m, &allocated3, &a3); + ASSERT_TRUE(a3.size() == 1); + ASSERT_TRUE(a3[0].offset == 0); + ASSERT_TRUE(a3[0].length == 4 * _1m); + + al2.free_l2(a3); + +#ifndef _DEBUG + for (uint64_t i = 0; i < capacity; i += 0x1000) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 1); + ASSERT_TRUE(a4[0].offset == i); + ASSERT_TRUE(a4[0].length == 0x1000); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } +#else + for (uint64_t i = 0; i < capacity; i += _2m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_2m, _2m, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 1); + ASSERT_TRUE(a4[0].offset == i); + ASSERT_TRUE(a4[0].length == _2m); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } +#endif + + ASSERT_TRUE(0 == al2.debug_get_free()); + for (uint64_t i = 0; i < capacity; i += _1m) { + interval_vector_t r; + r.emplace_back(interval_t(i, _1m)); + al2.free_l2(r); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "free1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_TRUE(capacity == al2.debug_get_free()); + + for (uint64_t i = 0; i < capacity; i += _1m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 1); + ASSERT_TRUE(allocated4 == _1m); + ASSERT_TRUE(a4[0].offset == i); + ASSERT_TRUE(a4[0].length == _1m); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc2 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_TRUE(0 == al2.debug_get_free()); + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 0); + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 0); + + for (uint64_t i = 0; i < capacity; i += 0x2000) { + interval_vector_t r; + r.emplace_back(interval_t(i, 0x1000)); + al2.free_l2(r); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "free2 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_TRUE(capacity / 2 == al2.debug_get_free()); + + // unable to allocate due to fragmentation + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 0); + + for (uint64_t i = 0; i < capacity; i += 2 * _1m) { + a4.clear(); + allocated4 = 0; + al2.allocate_l2(_1m, 0x1000, &allocated4, 
&a4); + ASSERT_TRUE(a4.size() == _1m / 0x1000); + ASSERT_TRUE(allocated4 == _1m); + ASSERT_TRUE(a4[0].offset == i); + ASSERT_TRUE(a4[0].length == 0x1000); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc3 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_TRUE(0 == al2.debug_get_free()); + + std::cout << "Done L2" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2_huge) +{ + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 4 * 512; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 1 TB + al2.init(capacity, 0x1000); + std::cout << "Init L2 Huge" << std::endl; + + for (uint64_t i = 0; i < capacity; i += _1m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 1); + ASSERT_TRUE(allocated4 == 0x1000); + ASSERT_TRUE(a4[0].offset == i); + ASSERT_TRUE(a4[0].length == 0x1000); + + allocated4 = 0; + a4.clear(); + al2.allocate_l2(_1m - 0x1000, _1m - 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 1); + ASSERT_TRUE(allocated4 == _1m - 0x1000); + ASSERT_TRUE(a4[0].offset == i + 0x1000); + ASSERT_TRUE(a4[0].length == _1m - 0x1000); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "allocH " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + for (uint64_t i = 0; i < capacity; i += _1m) { + interval_vector_t a4; + a4.emplace_back(i, 0x1000); + al2.free_l2(a4); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "freeH1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + { + std::cout << "Try" << std::endl; + time_t t = time(NULL); + for (int i = 0; i < 10; ++i) { + uint64_t allocated = 0; + interval_vector_t a; + al2.allocate_l2(0x2000, 0x2000, &allocated, &a); + ASSERT_TRUE(a.size() == 0); + } + std::cout << "End try in " << time(NULL) - t << " seconds" << std::endl; + } + { + std::cout << "Try" << std::endl; + time_t t = time(NULL); + for (int i = 0; i < 10; ++i) { + uint64_t allocated = 0; + interval_vector_t a; + al2.allocate_l2(_2m, _2m, &allocated, &a); + ASSERT_TRUE(a.size() == 0); + } + std::cout << "End try in " << time(NULL) - t << " seconds" << std::endl; + } + + ASSERT_TRUE((capacity / _1m) * 0x1000 == al2.debug_get_free()); + + std::cout << "Done L2 Huge" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2_unaligned) +{ + { + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 3; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 3x512 MB + al2.init(capacity, 0x1000); + std::cout << "Init L2 Unaligned" << std::endl; + for (uint64_t i = 0; i < capacity; i += _1m / 2) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m / 2, _1m / 2, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 1); + ASSERT_TRUE(allocated4 == _1m / 2); + ASSERT_TRUE(a4[0].offset == i); + ASSERT_TRUE(a4[0].length == _1m / 2); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "allocU " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_TRUE(0 == al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 0); + } + } + { + TestAllocatorLevel02 al2; + uint64_t capacity = 500 * 512 * 4096; // 500x2 MB + al2.init(capacity, 0x1000); + std::cout << ("Init L2 Unaligned2\n"); + for (uint64_t i = 0; i < capacity; i += _1m / 2) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m / 2, _1m / 2, 
&allocated4, &a4); + ASSERT_TRUE(a4.size() == 1); + ASSERT_TRUE(allocated4 == _1m / 2); + ASSERT_TRUE(a4[0].offset == i); + ASSERT_TRUE(a4[0].length == _1m / 2); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "allocU2 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_TRUE(0 == al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 0); + } + } + + { + TestAllocatorLevel02 al2; + uint64_t capacity = 100 * 512 * 4096 + 127 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2 Unaligned2" << std::endl; + for (uint64_t i = 0; i < capacity; i += 0x1000) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 1); + ASSERT_TRUE(allocated4 == 0x1000); + ASSERT_TRUE(a4[0].offset == i); + ASSERT_TRUE(a4[0].length == 0x1000); + } + ASSERT_TRUE(0 == al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 0); + } + } + { + TestAllocatorLevel02 al2; + uint64_t capacity = 3 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2 Unaligned2" << std::endl; + for (uint64_t i = 0; i < capacity; i += 0x1000) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 1); + ASSERT_TRUE(allocated4 == 0x1000); + ASSERT_TRUE(a4[0].offset == i); + ASSERT_TRUE(a4[0].length == 0x1000); + } + ASSERT_TRUE(0 == al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_TRUE(a4.size() == 0); + } + } + + std::cout << "Done L2 Unaligned" << std::endl; +} -- 2.39.5
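
A quick worked example of the sizing math that _init() implies (not part of the patch; the figures below are derived from the constants in fastbmap_allocator_impl.h, assuming the default 4 KiB allocation unit). Each L1 entry summarizes one L0 slotset (8 slots x 64 bits = 512 allocation units), and each L2 bit summarizes one L1 slotset (8 slots x 32 two-bit entries = 256 L1 entries):

// Standalone sketch: granularities implied by _init(), assuming alloc_unit = 4 KiB.
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t bits_per_slot      = 64;                             // sizeof(slot_t) * 8
  const uint64_t slotset_width      = 8;                              // slots per set
  const uint64_t bits_per_slotset   = bits_per_slot * slotset_width;  // 512
  const uint64_t l1_entries_per_slot = bits_per_slot / 2;             // 2-bit L1 entries -> 32

  const uint64_t alloc_unit     = 4096;                               // == l0_granularity
  const uint64_t l1_granularity = alloc_unit * bits_per_slotset;      // 2 MiB per L1 entry
  const uint64_t l2_granularity =
    l1_granularity * l1_entries_per_slot * slotset_width;             // 512 MiB per L2 bit

  std::cout << "l1_granularity = " << l1_granularity << std::endl     // 2097152
            << "l2_granularity = " << l2_granularity << std::endl;    // 536870912
  return 0;
}

This is consistent with the capacities used in the unit tests above, e.g. num_l2_entries * 256 * 512 * 4096 works out to num_l2_entries * 512 MiB, and 4 * 512 L2 entries give the 1 TB capacity used in test_l2_huge.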
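
A hedged sketch of the consumer-facing flow: with this patch, Allocator::create(cct, "bitmap", ...) returns the new BitmapFastAllocator, so existing BlueStore callers keep using the generic Allocator interface unchanged. The snippet below is illustrative only and is not part of the patch; it assumes a valid CephContext* cct, a capacity/alloc_unit pair from the caller, and interval_set<uint64_t> as the release-set type (the template argument is assumed here):

// Illustrative only -- assumes a Ceph build environment and a valid CephContext* cct.
Allocator* alloc = Allocator::create(cct, "bitmap", capacity, alloc_unit);
alloc->init_add_free(0, capacity);           // whole space starts out free

PExtentVector extents;
int64_t got = alloc->allocate(1 << 20,       // want 1 MiB
                              alloc_unit,    // minimum fragment size
                              0,             // 0 = no max fragment size
                              0,             // 0 = no positional hint
                              &extents);
if (got < 0) {
  // -ENOSPC: nothing could be allocated
}

interval_set<uint64_t> to_release;           // element type assumed
for (auto& e : extents) {
  to_release.insert(e.offset, e.length);
}
alloc->release(to_release);
alloc->shutdown();

Note that allocate() may return less than the requested size when space is fragmented; only a fully failed allocation yields -ENOSPC, matching the implementation in BitmapFastAllocator::allocate() above.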